codeine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,512 @@
1
+ from typing import Collection, Dict, FrozenSet, List, Generator, Optional, Set, Union
2
+
3
+ from codeine.constraints.mutations import MutationDistanceConstraint
4
+ from codeine.graph.base import CodonRestriction
5
+ from codeine.motifs.restriction import RestrictionSite
6
+ from codeine.space.coding import CodingSpace
7
+ from codeine.translation.tables import TranslationTable
8
+ from codeine.translation.weights import CodonWeights
9
+ from codeine.utils.display import format_forbidden_motifs, format_forbidden_motif,\
10
+ format_count, format_restrictions, format_positions
11
+
12
+
13
class MutationSpace:
    """
    Represents the subset of valid coding sequences reachable from a reference CDS
    by mutation under constraints.

    A ``MutationSpace`` is defined by:

    - A ``CodingSpace`` containing the global sequence constraints.
    - A reference CDS.
    - A set of codon positions that are free to mutate.

    Positions that are not free are temporarily considered frozen and
    will remain identical to the reference CDS. Freezing is implemented by
    pinning the reference codon on a private copy of the coding space's view,
    so the originating ``CodingSpace`` view is not the object being re-pinned.

    Codon positions are 1-based throughout this class.
    """

    def __init__(self,
                 space: CodingSpace,
                 cds: str,
                 *,
                 free_positions: Optional[Collection[int]] = None,
                 min_nts: Optional[int] = None,
                 max_nts: Optional[int] = None,
                 min_codons: Optional[int] = None,
                 max_codons: Optional[int] = None,
                 ):
        """
        Parameters
        ----------
        space
            The ``CodingSpace`` object to which this given sequence should belong.
        cds
            The parent/reference CDS.
        free_positions
            Which (1-based) codon positions are allowed to change?
            If omitted, every position is free.
        min_nts
            Minimum number of nucleotide differences from the reference CDS.
        max_nts
            Maximum number of nucleotide differences from the reference CDS.
        min_codons
            Minimum number of codon differences from the reference CDS.
        max_codons
            Maximum number of codon differences from the reference CDS.

        Raises
        ------
        ValueError
            If the CDS is not contained in the coding space, a position is out
            of range, a distance is negative, or a minimum exceeds its maximum.
        TypeError
            If a position or a distance is not an integer.
        """
        self.forbidden_motifs = space.forbidden_motifs
        self.max_homopolymer = space.max_homopolymer

        # Work on a copy of the view; remember the pins it came with so they
        # can be re-applied every time the frozen positions change.
        self.view = space.view.copy()
        self._base_pins = dict(space.view.pinned_codons)

        # Must be validated before any pins are derived from it.
        self.cds = self._validate_cds(cds)

        if free_positions is None:
            free_positions = self._all_positions()

        self._free_positions: Set[int] = set()
        self.set_free_positions(free_positions)

        self.min_nts: Optional[int] = None
        self.max_nts: Optional[int] = None
        self.min_codons: Optional[int] = None
        self.max_codons: Optional[int] = None

        self.set_distance_constraints(
            min_nts=min_nts,
            max_nts=max_nts,
            min_codons=min_codons,
            max_codons=max_codons,
        )

    def __getitem__(self, index: Union[int, slice]) -> Union[str, List[str]]:
        """
        Return one or more valid sequences.

        Parameters
        ----------
        index
            Zero-based sequence index or slice.

        Returns
        -------
        str or List[str]
            The indexed sequence, or a list of sequences for a slice.
        """
        return self.view[index]

    def __iter__(self) -> Generator[str, None, None]:
        """
        Iterate over all valid sequences in this mutation space.
        Be aware that "all valid sequences" can be astronomically many!

        Yields
        ------
        str
            All valid sequences in the mutation space, in the order produced
            by the underlying view.
        """
        yield from self.view

    def __contains__(self, seq: str) -> bool:
        """
        Does the given seq exist in this space?

        Returns
        -------
        bool
            True if and only if this is a valid sequence in this space.
        """
        return seq in self.view

    def __repr__(self) -> str:
        """
        Build a multi-line, human-readable summary of this mutation space.

        The number of valid variants is only reported when no distance
        constraints are active; otherwise the distance ranges are shown.
        """
        molecule = 'RNA' if self.translation_table.rna else 'DNA'

        lines = [
            f'{type(self).__name__}',
            '',
            f'Translation table: {self.translation_table.table_id} ({self.translation_table.name})',
            f'Molecule type: {molecule}',
            '',
            f'Amino acid sequence ({len(self.aa_seq)} aa):',
            f'{self.aa_seq}',
            '',
            'Reference CDS:',
            self.cds,
            '',
        ]

        if self.codon_restrictions:
            lines += [
                'Codon restrictions:',
                *format_restrictions(
                    self.codon_restrictions,
                    label='restricted positions',
                    max_lines=4,
                ),
                '',
            ]

        if self.forbidden_motifs:
            motifs = self.forbidden_motifs

            # A single motif may be stored bare; wrap it so it can be iterated.
            if isinstance(motifs, (str, RestrictionSite)):
                motifs = [motifs]

            lines += [
                'Forbidden motifs:',
                *format_forbidden_motifs(
                    [
                        format_forbidden_motif(
                            motif,
                            rna=self.translation_table.rna,
                        )
                        for motif in motifs
                    ],
                    max_lines=4,
                ),
                '',
            ]

        if self.max_homopolymer is not None:
            lines += [
                'Maximum homopolymer length:',
                f' {self.max_homopolymer}',
                '',
            ]

        if self._base_pins:
            lines += [
                'Inherited pins:',
                *format_restrictions(
                    self._base_pins,
                    label='pinned positions',
                    max_lines=4,
                ),
                '',
            ]

        lines += [
            'Free positions:',
            f' {format_positions(self.free_positions)}',
            '',
        ]

        if self.has_distance_constraints:
            lines += [
                'Mutation distance:',
                f' nts: {self._format_distance(self.min_nts, self.max_nts)}',
                f' codons: {self._format_distance(self.min_codons, self.max_codons)}',
                '',
            ]
        else:
            lines.append(
                f'Num. valid variants: {format_count(self.n_valid_variants)}'
            )

        return '\n'.join(lines)

    def sample(self, n: Optional[int] = None) -> Union[str, List[str]]:
        """
        Sample one or more variants from this mutation space.

        Parameters
        ----------
        n
            Number of sequences to sample. If omitted, return a single sequence.

        Returns
        -------
        str or List[str]
            A single sampled sequence when ``n`` is omitted; otherwise whatever
            ``view.sample(n=n)`` returns (expected to be a collection of
            sequences -- confirm against the view implementation).
        """
        return self.view.sample(n=n)

    def enumerate(self) -> Generator[str, None, None]:
        """
        Generate all sequences in this mutation space.

        Yields
        ------
        str
            A valid coding sequence.
        """
        yield from self.view.enumerate()

    def contains(self, seq: str) -> bool:
        """
        Check whether a coding sequence is contained in this mutation space.

        Parameters
        ----------
        seq
            The sequence to check

        Returns
        -------
        bool
            True if and only if the sequence is contained in this mutation space.
        """
        return self.view.contains(seq)

    def set_free_positions(self, positions: Collection[int]) -> None:
        """
        Replace the current set of free positions.

        Parameters
        ----------
        positions
            The (1-based) codon positions that may mutate; all others are frozen.

        Raises
        ------
        TypeError
            If a position is not an integer.
        ValueError
            If a position is outside ``1..len(aa_seq)``.
        """
        self._free_positions = self._validate_positions(positions)
        self._update_pins()

    def freeze_positions(self, positions: Collection[int]) -> None:
        """
        Freeze the given codon positions.

        Parameters
        ----------
        positions
            The (1-based) codon positions to fix to the reference CDS.
        """
        positions = self._validate_positions(positions)

        self._free_positions -= positions
        self._update_pins()

    def unfreeze_positions(self, positions: Collection[int]) -> None:
        """
        Unfreeze the given codon positions.

        Parameters
        ----------
        positions
            The (1-based) codon positions that should become free to mutate.
        """
        positions = self._validate_positions(positions)

        self._free_positions |= positions
        self._update_pins()

    def freeze_all(self) -> None:
        """
        Freeze all codon positions.
        """
        self._free_positions.clear()
        self._update_pins()

    def unfreeze_all(self) -> None:
        """
        Unfreeze all codon positions.
        """
        self._free_positions = self._all_positions()
        self._update_pins()

    def set_distance_constraints(self,
                                 min_nts: Optional[int] = None,
                                 max_nts: Optional[int] = None,
                                 min_codons: Optional[int] = None,
                                 max_codons: Optional[int] = None,
                                 ) -> None:
        """
        Set mutation distance constraints.

        Distances are measured from the reference CDS and can be either nucleotide (Hamming)
        distances, i.e. the number of nucleotides that are different from the reference CDS,
        or codon distances, i.e. the number of codons that are different.

        All four bounds are replaced at once: a bound that is omitted (``None``)
        is cleared, not left at its previous value.

        Raises
        ------
        TypeError
            If a bound is not an integer (booleans are rejected too).
        ValueError
            If a bound is negative or a minimum exceeds its maximum.
        """

        self._validate_distance('min_nts', min_nts)
        self._validate_distance('max_nts', max_nts)
        self._validate_distance('min_codons', min_codons)
        self._validate_distance('max_codons', max_codons)

        if min_nts is not None and max_nts is not None and min_nts > max_nts:
            raise ValueError('min_nts cannot be greater than max_nts.')

        if min_codons is not None and max_codons is not None and min_codons > max_codons:
            raise ValueError('min_codons cannot be greater than max_codons.')

        self.min_nts = min_nts
        self.max_nts = max_nts
        self.min_codons = min_codons
        self.max_codons = max_codons

        self._update_path_constraint()

    def clear_distance_constraints(self) -> None:
        """
        Remove all mutation distance constraints.
        """
        self.set_distance_constraints()

    @property
    def aa_seq(self) -> str:
        """
        The amino acid sequence for this mutation space.
        """
        return self.view.aa_seq

    @property
    def translation_table(self) -> TranslationTable:
        """
        The translation table from the underlying graph.
        """
        return self.view.translation_table

    @property
    def codon_weights(self) -> CodonWeights:
        """
        The codon weights from the underlying graph.
        """
        return self.view.codon_weights

    @property
    def codon_restrictions(self) -> Dict[int, CodonRestriction]:
        """
        The fixed codon restrictions from the underlying graph.
        """
        return self.view.codon_restrictions

    @property
    def context_l(self) -> str:
        """
        The left context sequence from the underlying graph.
        """
        return self.view.context_l

    @property
    def context_r(self) -> str:
        """
        The right context sequence from the underlying graph.
        """
        return self.view.context_r

    @property
    def pinned_codons(self) -> Dict[int, List[str]]:
        """
        Pins currently applied to the mutation view.

        This includes inherited pins and pins used internally to freeze positions.
        """
        return self.view.pinned_codons

    @property
    def n_valid_variants(self) -> int:
        """
        Number of valid variants under the current mutation constraints.
        """
        return self.view.n_valid_sequences

    @property
    def free_positions(self) -> FrozenSet[int]:
        """
        Codon positions that are currently free to mutate.

        Returned as an immutable snapshot so callers cannot bypass validation.
        """
        return frozenset(self._free_positions)

    @property
    def frozen_positions(self) -> FrozenSet[int]:
        """
        Codon positions that are currently fixed to the reference CDS.
        """
        return frozenset(self._all_positions() - self._free_positions)

    @property
    def has_distance_constraints(self) -> bool:
        """
        Whether this mutation space has mutation distance constraints.
        """
        distance_constraints = [self.min_nts, self.max_nts, self.min_codons, self.max_codons]
        return any(value is not None for value in distance_constraints)

    def _all_positions(self) -> Set[int]:
        """
        Return a fresh set holding every 1-based codon position of the sequence.
        """
        return set(range(1, len(self.view.aa_seq) + 1))

    def _update_path_constraint(self) -> None:
        """
        Apply the current distance constraints to the underlying view.

        With no bounds set, any existing path constraint is cleared instead.
        """
        if not self.has_distance_constraints:
            self.view.clear_path_constraint()
            return

        self.view.set_path_constraint(
            MutationDistanceConstraint(
                reference_cds=self.cds,
                min_nts=self.min_nts,
                max_nts=self.max_nts,
                min_codons=self.min_codons,
                max_codons=self.max_codons,
            )
        )

    def _validate_positions(self, positions: Collection[int]) -> Set[int]:
        """
        Validate a collection of codon positions.

        Parameters
        ----------
        positions
            Candidate 1-based codon positions; duplicates are collapsed.

        Returns
        -------
        Set[int]
            A new set of the positions, normalised to plain ``int``.

        Raises
        ------
        TypeError
            If a position is not an integer (e.g. ``1.5`` or ``'3'``).
        ValueError
            If a position is outside ``1..len(aa_seq)``.
        """
        # Local import: keeps this block self-contained without touching the
        # module's import section.
        from operator import index as as_index

        normalised: Set[int] = set()
        for pos in positions:
            try:
                # Accepts any integer-like object but rejects floats and
                # strings, which would otherwise slip through the range check
                # or fail with an opaque comparison error.
                normalised.add(as_index(pos))
            except TypeError:
                raise TypeError(
                    f'Codon positions must be integers, got {pos!r}.'
                ) from None

        n_codons = len(self.view.aa_seq)
        invalid = [pos for pos in normalised if pos < 1 or pos > n_codons]
        if invalid:
            raise ValueError(f'Invalid codon positions: {sorted(invalid)}')

        return normalised

    @staticmethod
    def _validate_distance(name: str, value: Optional[int]) -> None:
        """
        Validate a mutation distance value.

        Parameters
        ----------
        name
            Parameter name, used in the error message.
        value
            The bound to check; ``None`` means "no bound" and is always valid.

        Raises
        ------
        TypeError
            If the value is not an integer. ``bool`` is rejected explicitly,
            since it is a subclass of ``int`` and would otherwise be accepted.
        ValueError
            If the value is negative.
        """
        if value is None:
            return

        if isinstance(value, bool) or not isinstance(value, int):
            raise TypeError(f'{name} must be an integer.')

        if value < 0:
            raise ValueError(f'{name} must be non-negative.')

    @staticmethod
    def _format_distance(minimum: Optional[int], maximum: Optional[int]) -> str:
        """
        Format a distance range for display.

        Returns ``'any'`` when unbounded, the single value when both bounds
        coincide, and otherwise an ``up to``/``at least``/``min..max`` form.
        """
        if minimum is None and maximum is None:
            return 'any'

        if minimum == maximum:
            return str(minimum)

        if minimum is None:
            return f'up to {maximum}'

        if maximum is None:
            return f'at least {minimum}'

        return f'{minimum}..{maximum}'

    def _validate_cds(self, cds: str) -> str:
        """
        Check that the CDS belongs to the underlying space.

        Parameters
        ----------
        cds
            The inputted CDS.

        Returns
        -------
        str
            A normalised (upper-cased) and validated version of the inputted CDS.

        Raises
        ------
        ValueError
            If the upper-cased CDS is not contained in the view.
        """
        cds = cds.upper()

        if not self.view.contains(cds):
            raise ValueError('CDS is not contained in this coding space.')

        return cds

    def _codon_at_position(self, pos: int) -> str:
        """
        Get the codon of the reference CDS at the specified position.

        Parameters
        ----------
        pos
            The (1-based) position in the AA sequence.

        Returns
        -------
        str
            The three nucleotides of the reference CDS at that position.
        """
        return self.cds[3 * (pos - 1): 3 * pos]

    def _update_pins(self) -> None:
        """
        Update the pins on the underlying view from inherited and frozen pins.

        Frozen positions are pinned to their reference codon and take
        precedence over an inherited pin at the same position. The view is
        left untouched when the resulting pins equal its current ones.
        """
        frozen_pins = {pos: self._codon_at_position(pos) for pos in self.frozen_positions}
        pins = {**self._base_pins, **frozen_pins}

        if pins == self.view.pinned_codons:
            return

        self.view.set_pinned_codons(pins)
File without changes
File without changes