codeine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeine/__init__.py +15 -0
- codeine/constraints/banned.py +444 -0
- codeine/constraints/base.py +39 -0
- codeine/constraints/mutations.py +115 -0
- codeine/graph/base.py +267 -0
- codeine/graph/compile.py +489 -0
- codeine/graph/nodes.py +111 -0
- codeine/graph/view.py +781 -0
- codeine/motifs/restriction.py +105 -0
- codeine/motifs/validate.py +117 -0
- codeine/space/__init__.py +0 -0
- codeine/space/coding.py +490 -0
- codeine/space/mutation.py +512 -0
- codeine/translation/__init__.py +0 -0
- codeine/translation/data/__init__.py +0 -0
- codeine/translation/data/tables.json +2252 -0
- codeine/translation/data/weights.py +232 -0
- codeine/translation/tables.py +200 -0
- codeine/translation/weights.py +323 -0
- codeine/utils/__init__.py +0 -0
- codeine/utils/dict.py +23 -0
- codeine/utils/display.py +124 -0
- codeine/utils/sampling.py +90 -0
- codeine-0.1.0.dist-info/METADATA +162 -0
- codeine-0.1.0.dist-info/RECORD +28 -0
- codeine-0.1.0.dist-info/WHEEL +5 -0
- codeine-0.1.0.dist-info/licenses/LICENSE +21 -0
- codeine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common restriction site motifs, obtained from New England BioLabs here:
|
|
3
|
+
|
|
4
|
+
https://www.neb.com/en/tools-and-resources/selection-charts/alphabetized-list-of-recognition-specificities
|
|
5
|
+
|
|
6
|
+
Retrieved: 2026-06-09
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Tuple
|
|
11
|
+
|
|
12
|
+
# Translation table pairing each DNA base with its complement, in both cases.
# Characters outside ACGT/acgt are not in the table and pass through unchanged.
_COMPLEMENT = str.maketrans('ACGTacgt', 'TGCAtgca')


def reverse_complement(seq: str) -> str:
    """
    Return the reverse complement of a DNA sequence.

    Parameters
    ----------
    seq
        The DNA sequence. Letter case is preserved.

    Returns
    -------
    The sequence read backwards with every A/C/G/T swapped for its complement.
    """
    # Reversing and complementing commute, so reverse first and translate after.
    backwards = ''.join(reversed(seq))
    return backwards.translate(_COMPLEMENT)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RestrictionSite(Enum):
    """
    Recognition sequences of commonly used restriction enzymes.

    Each member's value is the forward (as written) DNA recognition sequence.
    Members can be given to ``CodingSpace`` as forbidden motifs instead of
    spelling the DNA out by hand; ``motifs`` supplies both strands.

    Examples
    --------
    >>> CodingSpace(
    ...     aa_seq,
    ...     forbidden_motifs=[
    ...         RestrictionSite.EcoRI,
    ...         RestrictionSite.BsaI,
    ...     ],
    ... )

    >>> RestrictionSite.EcoRI.forward
    'GAATTC'
    >>> RestrictionSite.EcoRI.reverse
    'GAATTC'
    """

    # BioBricks
    EcoRI = 'GAATTC'
    XbaI = 'TCTAGA'
    SpeI = 'ACTAGT'
    PstI = 'CTGCAG'

    # Cloning
    BamHI = 'GGATCC'
    HindIII = 'AAGCTT'
    XhoI = 'CTCGAG'
    SalI = 'GTCGAC'
    KpnI = 'GGTACC'
    SacI = 'GAGCTC'
    NcoI = 'CCATGG'
    NdeI = 'CATATG'
    NotI = 'GCGGCCGC'
    MluI = 'ACGCGT'
    AgeI = 'ACCGGT'
    AvrII = 'CCTAGG'
    BglII = 'AGATCT'

    # Golden Gate
    BsaI = 'GGTCTC'
    BsmBI = 'CGTCTC'
    BbsI = 'GAAGAC'
    SapI = 'GCTCTTC'

    def __repr__(self):
        # Compact form without the value, e.g. ``RestrictionSite.EcoRI``.
        return 'RestrictionSite.' + self.name

    def __str__(self):
        # ``motifs`` already collapses palindromes to one entry, so joining it
        # yields either "NAME (FWD)" or "NAME (FWD / REV)".
        return f"{self.name} ({' / '.join(self.motifs)})"

    @property
    def forward(self) -> str:
        """
        The recognition sequence as written (the member's value).
        """
        return self.value

    @property
    def reverse(self) -> str:
        """
        The reverse complement of the recognition sequence.
        """
        return reverse_complement(self.value)

    @property
    def motifs(self) -> Tuple[str, ...]:
        """
        Every distinct motif for this site: the forward sequence, plus the
        reverse complement when it differs. Palindromic sites give a 1-tuple.
        """
        fwd = self.forward
        rev = self.reverse
        return (fwd,) if fwd == rev else (fwd, rev)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
|
|
2
|
+
from typing import List, Optional, Sequence, Union
|
|
3
|
+
|
|
4
|
+
from codeine.motifs.restriction import RestrictionSite
|
|
5
|
+
|
|
6
|
+
# A single forbidden motif: a raw nucleotide string or a named restriction site.
ForbiddenMotif = Union[str, RestrictionSite]
# What callers may pass: one motif on its own, or a sequence of them.
ForbiddenMotifs = Union[ForbiddenMotif, Sequence[ForbiddenMotif]]


def expand_and_validate_forbidden_motifs(
    forbidden_motifs: ForbiddenMotifs,
    rna: bool
) -> List[str]:
    """
    Expand forbidden motifs into the concrete nucleotide sequences to ban.

    Parameters
    ----------
    forbidden_motifs
        A single motif or a sequence of motifs, each either a dna/rna string
        or a RestrictionSite.
    rna
        Whether to use RNA. When true every T becomes U, otherwise every U
        becomes T.

    Returns
    -------
    A sorted, de-duplicated list of upper-case forbidden nucleotide sequences.

    Raises
    ------
    ValueError
        If a string motif is empty, or a motif contains characters outside the
        alphabet of the selected molecule type.
    TypeError
        If a motif is neither a string nor a RestrictionSite.
    """
    # Accept a bare motif as shorthand for a one-element collection.
    if isinstance(forbidden_motifs, (str, RestrictionSite)):
        forbidden_motifs = [forbidden_motifs]

    alphabet = set('ACGU' if rna else 'ACGT')
    banned = set()

    for motif in forbidden_motifs:
        if isinstance(motif, RestrictionSite):
            # Restriction sites contribute both strands (one if palindromic).
            candidates = motif.motifs
        elif isinstance(motif, str):
            if not motif:
                raise ValueError('Forbidden motifs cannot be empty.')
            candidates = (motif,)
        else:
            raise TypeError('Forbidden motifs must be strings or codeine.RestrictionSite.')

        for candidate in candidates:
            # Canonicalise case and molecule type before checking the alphabet.
            candidate = candidate.upper()
            if rna:
                candidate = candidate.replace('T', 'U')
            else:
                candidate = candidate.replace('U', 'T')

            if not alphabet.issuperset(candidate):
                raise ValueError('Forbidden motifs must be nucleotide sequences.')

            banned.add(candidate)

    return sorted(banned)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def expand_and_validate_max_homopolymer(
    max_homopolymer: int,
    rna: bool = False
) -> List[str]:
    """
    Convert a max homopolymer constraint into a set of banned sequences.

    A run is too long as soon as it reaches ``max_homopolymer + 1`` identical
    nucleotides, so one run of that length is banned per nucleotide.

    Parameters
    ----------
    max_homopolymer
        The max allowed length of a homopolymer. Must be an integer of at
        least 1; booleans are rejected.
    rna
        Whether to use RNA (alphabet ACGU instead of ACGT).

    Returns
    -------
    A list of four forbidden nucleotide sequences, one per nucleotide, in
    alphabet order.

    Raises
    ------
    TypeError
        If ``max_homopolymer`` is not an integer (or is a bool).
    ValueError
        If ``max_homopolymer`` is smaller than 1.
    """
    # bool is a subclass of int, so without the explicit check ``True`` would
    # pass validation and be silently interpreted as a limit of 1.
    if isinstance(max_homopolymer, bool) or not isinstance(max_homopolymer, int):
        raise TypeError('max_homopolymer must be an integer.')

    if max_homopolymer < 1:
        raise ValueError('max_homopolymer must be at least 1.')

    nts = 'ACGU' if rna else 'ACGT'
    return [nt * (max_homopolymer + 1) for nt in nts]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def expand_and_validate_sequence_constraints(
    forbidden_motifs: Optional[ForbiddenMotifs] = None,
    max_homopolymer: Optional[int] = None,
    rna: bool = False,
):
    """
    Combine forbidden motifs and a max homopolymer limit into one list of
    banned sequences.

    Parameters
    ----------
    forbidden_motifs
        A motif or sequence of motifs (dna/rna strings or RestrictionSite
        objects), or None to skip this constraint.
    max_homopolymer
        The max allowed homopolymer, or None to skip this constraint.
    rna
        Whether to use RNA.

    Returns
    -------
    A sorted, de-duplicated list of forbidden nucleotide sequences. Empty when
    both constraints are None.
    """
    banned = set()

    # Each constraint is optional; only expand the ones actually supplied.
    if forbidden_motifs is not None:
        banned.update(expand_and_validate_forbidden_motifs(forbidden_motifs, rna=rna))

    if max_homopolymer is not None:
        banned.update(expand_and_validate_max_homopolymer(max_homopolymer, rna=rna))

    return sorted(banned)
|
|
File without changes
|
codeine/space/coding.py
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
import random
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Generator, List, Optional, Sequence, Tuple, Union, TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from codeine.space.mutation import MutationSpace
|
|
9
|
+
|
|
10
|
+
from codeine.graph.base import CodonGraph, CodonRestriction
|
|
11
|
+
from codeine.motifs.validate import expand_and_validate_sequence_constraints, ForbiddenMotifs
|
|
12
|
+
from codeine.motifs.restriction import RestrictionSite
|
|
13
|
+
from codeine.translation.tables import TranslationTable
|
|
14
|
+
from codeine.translation.weights import CodonWeights
|
|
15
|
+
from codeine.utils.display import format_forbidden_motifs, format_forbidden_motif,\
|
|
16
|
+
format_count, format_restrictions
|
|
17
|
+
from codeine.utils.sampling import Seedable
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CodingSpace:
    """
    Represents a space of valid coding sequences for a protein under constraints.

    The space wraps a view obtained from a ``CodonGraph`` (stored as
    ``self.view``); most methods and properties delegate to that view. The
    user-facing constraints are kept on ``self.forbidden_motifs`` and
    ``self.max_homopolymer`` and are pushed to the view as concrete banned
    sequences whenever they change.
    """
    def __init__(
        self,
        aa_seq: str,
        *,
        translation_table: Optional[TranslationTable] = None,
        rna: Optional[bool] = None,
        codon_restrictions: Optional[Dict[int, CodonRestriction]] = None,
        forbidden_motifs: Optional[ForbiddenMotifs] = None,
        max_homopolymer: Optional[int] = None,
        context_l: str = '',
        context_r: str = '',
        codon_weights: Optional[CodonWeights] = None,
        seed: Optional[Seedable] = None,
    ) -> None:
        """
        Parameters
        ----------
        aa_seq
            The amino acid sequence.
        translation_table
            The translation table to use. Leave blank to use standard table.
        rna
            Whether to use RNA. If false or blank, use DNA (unless a provided
            translation table or codon weights say otherwise).
        codon_restrictions
            Any codon restrictions in the format e.g. ``{4: 'TCC'}`` or ``{5: ['AGT', 'AGC']}``. Positions are 1-based.
        forbidden_motifs
            Forbidden motifs, either as strings or as ``codeine.RestrictionSite``.
        max_homopolymer
            The maximum allowed length of nucleotide homopolymer
        context_l
            The context sequence to the left of the coding sequence.
        context_r
            The context sequence to the right of the coding sequence.
        codon_weights
            The codon weights to use. Leave blank to sample uniformly.
        seed
            Seed used to initialise the random number generator for sampling.

        Raises
        ------
        ValueError
            If ``rna``, the translation table and the codon weights disagree
            on the molecule type.
        """

        translation_table, codon_weights = self._resolve_tables(translation_table, codon_weights, rna)

        graph = CodonGraph(
            aa_seq,
            codon_restrictions=codon_restrictions,
            translation_table=translation_table,
            weights=codon_weights,
            context_l=context_l,
            context_r=context_r,
        )

        view = graph.view(seed=seed)
        self.view = view

        # The view must exist before normalising: normalisation reads the
        # translation table through ``self.view``.
        self.forbidden_motifs = self._normalise_forbidden_motifs(forbidden_motifs)
        self.max_homopolymer = max_homopolymer

        self._update_forbidden_sequences()

    @classmethod
    def load(cls, path: Union[str, Path]) -> 'CodingSpace':
        """
        Load a coding space from disc.

        Parameters
        ----------
        path
            Location of a file previously written by ``save``.

        Returns
        -------
        The unpickled object.
        """
        # SECURITY: pickle can execute arbitrary code while loading. Only load
        # files that come from a trusted source.
        with Path(path).open('rb') as f:
            return pickle.load(f)

    def save(self, path: Union[str, Path]) -> None:
        """
        Save this coding space to disc (as a pickle).

        Parameters
        ----------
        path
            Location of the file to write.
        """
        with Path(path).open('wb') as f:
            pickle.dump(self, f)

    def __getitem__(self, index: Union[int, slice]) -> Union[str, List[str]]:
        """
        Return one or more valid sequences.

        Parameters
        ----------
        index
            Zero-based sequence index or slice.

        Returns
        -------
        str or List[str]
            The indexed sequence, or a list of sequences for a slice.
        """
        return self.view[index]

    def __iter__(self) -> Generator[str, None, None]:
        """
        Iterate over all valid sequences in this coding space.
        Be aware that "all valid sequences" can be astronomically many!

        Yields
        ------
        All valid sequences in the coding space, in order.
        """

        yield from self.view

    def __contains__(self, seq: str) -> bool:
        """
        Does the given seq exist in this space?

        Returns
        -------
        True if and only if this is a valid sequence in this space.
        """

        return seq in self.view

    def __repr__(self) -> str:
        """
        Build a multi-line summary: table, molecule type, amino acid sequence,
        any active constraints and pins, and the number of valid sequences.
        """
        molecule = 'RNA' if self.translation_table.rna else 'DNA'

        lines = [
            f'{type(self).__name__}',
            '',
            f'Translation table: {self.translation_table.table_id} ({self.translation_table.name})',
            f'Molecule type: {molecule}',
            '',
            f'Amino acid sequence ({len(self.aa_seq)} aa):',
            f'{self.aa_seq}',
            '',
        ]

        # Optional sections are only shown when the constraint is in use.
        if self.codon_restrictions:
            lines += [
                'Codon restrictions:',
                *format_restrictions(
                    self.codon_restrictions,
                    label='restricted positions',
                    max_lines=4,
                ),
                '',
            ]

        if self.forbidden_motifs:
            motifs = self.forbidden_motifs

            # A single motif may be stored bare; wrap it so it can be iterated.
            if isinstance(motifs, (str, RestrictionSite)):
                motifs = [motifs]

            lines += [
                'Forbidden motifs:',
                *format_forbidden_motifs(
                    [
                        format_forbidden_motif(motif, rna=self.translation_table.rna)
                        for motif in motifs
                    ],
                    max_lines=4,
                ),
                '',
            ]

        if self.max_homopolymer is not None:
            lines += [
                'Maximum homopolymer length:',
                f' {self.max_homopolymer}',
                '',
            ]

        if self.pinned_codons:
            lines += [
                'Temporary pins:',
                *format_restrictions(
                    self.pinned_codons,
                    label='pinned positions',
                    max_lines=4,
                ),
                '',
            ]

        lines.append(f'Num. valid coding sequences: {format_count(self.n_valid_sequences)}')

        return '\n'.join(lines)

    def sample(self, n: Optional[int] = None) -> Union[str, List[str]]:
        """
        Sample one or more variants from this coding space.

        Parameters
        ----------
        n
            Number of sequences to sample. If omitted, return a single sequence.

        Returns
        -------
        Whatever the underlying view's ``sample`` returns: a single sampled
        sequence when ``n`` is omitted, otherwise presumably a list of ``n``
        sequences (TODO confirm against the view implementation).
        """
        return self.view.sample(n=n)

    def enumerate(self) -> Generator[str, None, None]:
        """
        Generate all sequences in this space. If there are many (and often there are
        astronomically many), one would not expect to reach the 'end'. However for smaller
        sequence spaces, such as mutation spaces, it's quite possible to get there.

        Yields
        ------
        str
            A valid coding sequence.
        """
        yield from self.view.enumerate()

    def contains(self, seq: str) -> bool:
        """
        Check whether a coding sequence is contained in this coding space.

        Parameters
        ----------
        seq
            The sequence to check.

        Returns
        -------
        True if and only if the sequence is contained in this coding space.
        """
        return self.view.contains(seq)

    def mutants(
        self,
        cds: str,
        free_positions: Optional[Sequence[int]] = None,
        min_nts: Optional[int] = None,
        max_nts: Optional[int] = None,
        min_codons: Optional[int] = None,
        max_codons: Optional[int] = None,
    ) -> 'MutationSpace':
        """
        Return a space of mutants relative to a given coding sequence, i.e. a space derived
        from this one but which fixes the sequence on all but the specified positions.

        Parameters
        ----------
        cds
            The sequence to mutate. It is normalised with the translation table
            before use and must be contained in this coding space.
        free_positions
            The positions that are allowed to vary.
        min_nts
            The min nucleotide (Hamming) distance relative to the reference sequence.
        max_nts
            The max nucleotide (Hamming) distance relative to the reference sequence.
        min_codons
            The min number of changed codons relative to the reference sequence.
        max_codons
            The max number of changed codons relative to the reference sequence.

        Returns
        -------
        A ``MutationSpace`` built on top of this coding space.

        Raises
        ------
        ValueError
            If the normalised ``cds`` is not contained in this coding space.
        """
        cds = self.translation_table.normalise_sequence(cds)

        if not self.contains(cds):
            raise ValueError('CDS is not contained in this coding space.')

        # Imported here rather than at module level; the module-level import is
        # guarded by TYPE_CHECKING, presumably to avoid a circular import.
        from codeine.space.mutation import MutationSpace

        return MutationSpace(
            space=self,
            cds=cds,
            free_positions=free_positions,
            min_nts=min_nts,
            max_nts=max_nts,
            min_codons=min_codons,
            max_codons=max_codons,
        )

    def pin_codons(self, pinned_codons: Dict[int, str]) -> None:
        """
        Pin temporary codons in this coding space.

        Parameters
        ----------
        pinned_codons
            A dict specifying which codons to pin, by position.
        """
        self.view.pin_codons(pinned_codons)

    def unpin_codons(self, positions: Sequence[int]) -> None:
        """
        Remove temporary codon pins by position.

        Parameters
        ----------
        positions
            Positions to unpin.
        """
        self.view.unpin_codons(positions)

    def set_pinned_codons(self, pinned_codons: Dict[int, str]) -> None:
        """
        Replace all temporary codon pins on this coding space.

        Parameters
        ----------
        pinned_codons
            A dict specifying which codons to pin, by position.
        """
        self.view.set_pinned_codons(pinned_codons)

    def clear_pins(self) -> None:
        """
        Remove all temporary codon pins from this coding space.
        """
        self.view.clear_pins()

    def set_forbidden_motifs(self, forbidden_motifs: Optional[ForbiddenMotifs]) -> None:
        """
        Set the forbidden motifs for this coding space.

        Parameters
        ----------
        forbidden_motifs
            Motifs that should be forbidden in generated sequences, or None to
            forbid nothing.

        Raises
        ------
        Any error raised while normalising, validating or applying the motifs.
        In that case ``self.forbidden_motifs`` keeps its previous value.
        """
        normalised = self._normalise_forbidden_motifs(forbidden_motifs)
        previous = self.forbidden_motifs
        self.forbidden_motifs = normalised

        try:
            self._update_forbidden_sequences()
        except Exception:
            # Do not keep a rejected value around: it would make every later
            # constraint update fail on the same validation error.
            self.forbidden_motifs = previous
            raise

    def clear_forbidden_motifs(self) -> None:
        """
        Remove all forbidden motifs from this coding space.
        """
        self.set_forbidden_motifs(None)

    def set_max_homopolymer(self, max_homopolymer: Optional[int]) -> None:
        """
        Set the maximum allowed homopolymer length.

        Parameters
        ----------
        max_homopolymer
            The longest allowed repeated run of one nucleotide, or None for no limit.

        Raises
        ------
        Any error raised while validating or applying the limit. In that case
        ``self.max_homopolymer`` keeps its previous value.
        """
        previous = self.max_homopolymer
        self.max_homopolymer = max_homopolymer

        try:
            self._update_forbidden_sequences()
        except Exception:
            # Roll back so an invalid limit does not stay on the object and
            # break later updates of the other constraints.
            self.max_homopolymer = previous
            raise

    def clear_max_homopolymer(self) -> None:
        """
        Remove the maximum homopolymer constraint from this coding space.
        """
        self.set_max_homopolymer(None)

    @property
    def n_valid_sequences(self) -> int:
        """
        The number of valid sequences in this space.
        """
        return self.view.n_valid_sequences

    @property
    def aa_seq(self) -> str:
        """
        The amino acid sequence for this coding space.
        """
        return self.view.aa_seq

    @property
    def translation_table(self) -> TranslationTable:
        """
        The translation table being used in this space.
        """
        return self.view.translation_table

    @property
    def codon_weights(self) -> CodonWeights:
        """
        The codon weights being used in this space.
        """
        return self.view.codon_weights

    @property
    def codon_restrictions(self) -> Dict[int, CodonRestriction]:
        """
        The fixed codon restrictions from the underlying graph.
        """
        return self.view.codon_restrictions

    @property
    def context_l(self) -> str:
        """
        The left context sequence from the underlying graph.
        """
        return self.view.context_l

    @property
    def context_r(self) -> str:
        """
        The right context sequence from the underlying graph.
        """
        return self.view.context_r

    @property
    def pinned_codons(self) -> Dict[int, List[str]]:
        """
        Temporary codon pins currently applied to this coding space.
        """
        return self.view.pinned_codons

    def _update_forbidden_sequences(self) -> None:
        """
        Rebuild concrete forbidden sequences and apply them to the view.

        Reads ``self.forbidden_motifs`` and ``self.max_homopolymer``; the view
        is only touched after both have been expanded and validated.
        """
        forbidden_sequences = expand_and_validate_sequence_constraints(
            forbidden_motifs=self.forbidden_motifs,
            max_homopolymer=self.max_homopolymer,
            rna=self.translation_table.rna,
        )
        self.view.set_banned_sequences(forbidden_sequences)

    @staticmethod
    def _resolve_tables(
        translation_table: Optional[TranslationTable],
        codon_weights: Optional[CodonWeights],
        rna: Optional[bool],
    ) -> Tuple[TranslationTable, CodonWeights]:
        """
        Resolve user-submitted (or not) translation table, codon weights, and RNA flag.

        Returns
        -------
        The translation table and codon weights to use. A missing table
        defaults to table 1 with the resolved molecule type; missing weights
        default to uniform weights for the table.

        Raises
        ------
        ValueError
            If the molecule types of ``rna``, the table and the weights conflict.
        """

        if rna is None:
            # No explicit flag: infer the molecule type from whatever was given,
            # preferring the translation table, and default to DNA.
            if translation_table is not None and codon_weights is not None \
                    and translation_table.rna != codon_weights.rna:
                raise ValueError('Provided translation table and codon weights must have the same molecule type.')

            if translation_table is not None:
                rna = translation_table.rna
            elif codon_weights is not None:
                rna = codon_weights.rna
            else:
                rna = False

        else:
            # Explicit flag: everything that was provided must agree with it.
            if translation_table is not None and translation_table.rna != rna:
                raise ValueError('Value for rna is inconsistent with the provided translation table.')

            if codon_weights is not None and codon_weights.rna != rna:
                raise ValueError('Value for rna is inconsistent with the provided codon weights.')

        if translation_table is None:
            translation_table = TranslationTable(table_id=1, rna=rna)

        if codon_weights is None:
            codon_weights = CodonWeights.uniform(table=translation_table)

        return translation_table, codon_weights

    def _normalise_forbidden_motifs(
        self,
        forbidden_motifs: Optional[ForbiddenMotifs],
    ) -> Optional[ForbiddenMotifs]:
        """
        Normalise string forbidden motifs to the molecule type used by this coding
        space. RestrictionSite objects are left unchanged.

        Returns
        -------
        None for None, a single normalised motif for a single motif, and a
        list for any other iterable of motifs.
        """
        if forbidden_motifs is None:
            return None

        if isinstance(forbidden_motifs, str):
            return self.translation_table.normalise_sequence(forbidden_motifs)

        if isinstance(forbidden_motifs, RestrictionSite):
            return forbidden_motifs

        return [
            self.translation_table.normalise_sequence(motif)
            if isinstance(motif, str)
            else motif
            for motif in forbidden_motifs
        ]
|