codeine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeine/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from codeine.space.coding import CodingSpace
2
+ from codeine.space.mutation import MutationSpace
3
+
4
+ from codeine.translation.tables import TranslationTable
5
+ from codeine.translation.weights import CodonWeights
6
+
7
+ from codeine.motifs.restriction import RestrictionSite
8
+
9
+ __all__ = [
10
+ 'CodingSpace',
11
+ 'CodonWeights',
12
+ 'MutationSpace',
13
+ 'RestrictionSite',
14
+ 'TranslationTable'
15
+ ]
@@ -0,0 +1,444 @@
1
+ from typing import Dict, FrozenSet, List, NamedTuple, Optional, Sequence, Tuple
2
+
3
+ from codeine.graph.base import CodonGraph
4
+ from codeine.graph.nodes import CodonNode
5
+
6
# One decision taken while walking the codon graph: (graph pos, choice).
Step = Tuple[int, str]


class SubPath(NamedTuple):
    """
    A concrete stretch of the codon graph that emits a given sequence.

    Fields
    ------
    sequence
        The sequence obtained by following the steps.
    steps
        The (graph pos, choice) decisions to follow, in order.
    offset
        Index inside the first step's choice at which the sequence starts.
    """
    sequence: str
    steps: Tuple[Step, ...]
    offset: int
18
+
19
+
20
# Progress marker for one watched path, as (path_ix, matched_length):
# how many bases of that path's banned sequence have been seen so far.
Watch = Tuple[int, int]

# A tracker state is the set of watches that are currently alive. It is
# recomputed every time a choice is made.
BannedTrackerState = FrozenSet[Watch]

# Integer handle under which a tracker state has been registered.
BannedTrackerStateId = int

# Outcome of a single precomputed transition:
#   None  -> the banned sequence has just been completed
#   Watch -> keep following the path from this watch
TransitionValue = Optional[Watch]


class BannedTrackerAdvanceResult(NamedTuple):
    """
    Outcome of advancing a registered banned-tracker state by one graph step.

    When banned is True the step completed a banned sequence, so the candidate
    path should be rejected. When it is False, state_id is the registered
    tracker state reached after the step.
    """
    banned: bool
    state_id: BannedTrackerStateId = 0


# Shared result instances: "nothing is being watched" and "step is banned".
CLEAR_ADVANCE_RESULT = BannedTrackerAdvanceResult(False, 0)
BANNED_ADVANCE_RESULT = BannedTrackerAdvanceResult(True, 0)
51
+
52
+
53
class BannedSequenceTracker:
    """
    Follows progress along the concrete graph subpaths that emit a banned
    sequence.

    Every SubPath in self.paths provides:

    sequence
        The banned sequence being tracked.

    steps
        The concrete graph emissions, as (pos, choice) pairs.

    offset
        Where the sequence begins inside the first choice.

    A tracker state is a frozenset of watches

        (path_ix, matched_length)

    where matched_length bases of paths[path_ix].sequence have already been
    matched. States are registered once and handed out as integer IDs, so
    callers only carry a small hashable value around.

    Two lookup tables are precomputed at construction time:

        starts:       step   -> outcomes of paths that begin with that step
        transitions:  choice -> watch -> banned (None) | next watch

    A watch with no entry for the emitted choice is simply dropped.
    """

    def __init__(self, graph: CodonGraph, banned_sequences: Sequence[str]) -> None:
        """
        Build the tracker and precompute its lookup tables.

        Parameters
        ----------
        graph
            The codon graph on which to operate.
        banned_sequences
            A collection of sequences that must not occur in generated paths.
            They are stored upper-cased.
        """
        self.graph = graph
        self.banned_sequences = tuple(sequence.upper() for sequence in banned_sequences)

        # The empty state (nothing being watched) is always registered as ID 0.
        self.initial_state: BannedTrackerState = frozenset()
        self.initial_state_id: int = 0

        self.states: List[BannedTrackerState] = [self.initial_state]
        self.state_ids: Dict[BannedTrackerState, BannedTrackerStateId] = {
            self.initial_state: self.initial_state_id,
        }

        self.advance_cache: Dict[Tuple[Step, BannedTrackerStateId], BannedTrackerAdvanceResult] = {}

        # Order matters: both tables are derived from self.paths.
        self.paths = self._find_banned_paths()
        self.starts = self._build_starts()
        self.transitions = self._build_transitions()

    @property
    def is_trivial(self) -> bool:
        """
        Whether the tracker has nothing to watch, i.e. no graph path can
        generate any of the banned sequences.

        Returns
        -------
        True if and only if the tracker is trivial.
        """
        return not self.paths

    def _find_banned_paths(self) -> Tuple[SubPath, ...]:
        """
        Collect every concrete graph path that can generate a banned sequence.

        Paths are gathered per banned sequence, in the order the sequences
        were given; the position of a path in the result is its path_ix.

        Returns
        -------
        Tuple[SubPath, ...]
            All graph subpaths capable of producing one of the banned sequences.
        """
        return tuple(
            path
            for sequence in self.banned_sequences
            for path in _find_matching_subpaths(self.graph, sequence)
        )

    def _build_starts(self) -> Dict[Step, Tuple[TransitionValue, ...]]:
        """
        Precompute what happens when a step that begins a banned path is taken.

        For each path, the part of its first choice from offset onwards is
        compared with the banned sequence. If that already covers the whole
        sequence the outcome is None (banned); otherwise it is the watch to
        start.

        Returns
        -------
        Dict[Step, Tuple[TransitionValue, ...]]
            Mapping from graph step to the outcomes of all paths starting there.
        """
        grouped: Dict[Step, List[TransitionValue]] = {}

        for path_ix, path in enumerate(self.paths):
            first_step = path.steps[0]
            _pos, first_choice = first_step

            target_length = len(path.sequence)
            matched_length = min(len(first_choice[path.offset:]), target_length)

            outcome: TransitionValue = None
            if matched_length < target_length:
                outcome = (path_ix, matched_length)

            if first_step not in grouped:
                grouped[first_step] = []
            grouped[first_step].append(outcome)

        return {step: tuple(outcomes) for step, outcomes in grouped.items()}

    def _build_transitions(self) -> Dict[str, Dict[Watch, TransitionValue]]:
        """
        Precompute how each live watch moves when a given choice is emitted.

        Each path is replayed from its second step onwards. A step whose choice
        covers the rest of the banned sequence maps its watch to None (banned);
        a step whose choice is a proper prefix of the rest maps its watch to the
        watch with the longer matched length.

        Returns
        -------
        Dict[str, Dict[Watch, TransitionValue]]
            Mapping from emitted graph choice to the watch transitions it causes.
        """
        table: Dict[str, Dict[Watch, TransitionValue]] = {}

        for path_ix, path in enumerate(self.paths):
            target = path.sequence
            matched_length = min(len(path.steps[0][1]) - path.offset, len(target))

            # Paths completed by their first step alone are handled by starts.
            if matched_length >= len(target):
                continue

            for _pos, choice in path.steps[1:]:
                watch = (path_ix, matched_length)
                remaining = target[matched_length:]

                if choice.startswith(remaining):
                    # This choice finishes the banned sequence.
                    table.setdefault(choice, {})[watch] = None
                    break

                if not remaining.startswith(choice):
                    # The choice disagrees with the sequence; stop replaying.
                    break

                matched_length += len(choice)
                table.setdefault(choice, {})[watch] = (path_ix, matched_length)

        return table

    def _get_or_register_state_id(
        self,
        state: BannedTrackerState,
    ) -> BannedTrackerStateId:
        """
        Look up the integer ID of a tracker state, registering it on first use.

        Parameters
        ----------
        state
            The concrete frozenset-of-watches tracker state.

        Returns
        -------
        BannedTrackerStateId
            Stable integer ID for the given tracker state.
        """
        if state in self.state_ids:
            return self.state_ids[state]

        # New IDs are indices into self.states, so the two stay in step.
        new_id = len(self.states)
        self.state_ids[state] = new_id
        self.states.append(state)
        return new_id

    def advance(
        self,
        step: Step,
        state_id: BannedTrackerStateId,
    ) -> BannedTrackerAdvanceResult:
        """
        Advance a registered tracker state by one graph step.

        Results are memoised per (step, state_id) in self.advance_cache.

        Parameters
        ----------
        step
            The graph step just taken, as (graph pos, choice).
        state_id
            Integer ID of the current banned-tracker state.

        Returns
        -------
        BannedTrackerAdvanceResult
            Whether the step completed a banned sequence, and otherwise the
            integer ID of the updated tracker state.
        """
        cache = self.advance_cache
        cache_key = (step, state_id)

        hit = cache.get(cache_key)
        if hit is not None:
            return hit

        current = self.states[state_id]
        started = self.starts.get(step)

        # Fast path: nothing is being watched and nothing starts here.
        if started is None and not current:
            cache[cache_key] = CLEAR_ADVANCE_RESULT
            return CLEAR_ADVANCE_RESULT

        _pos, choice = step
        moves = self.transitions.get(choice)
        upcoming = set()

        # Paths that begin with this very step.
        if started is not None:
            for fresh in started:
                if fresh is None:
                    cache[cache_key] = BANNED_ADVANCE_RESULT
                    return BANNED_ADVANCE_RESULT

                upcoming.add(fresh)

        # Watches carried over from the current state.
        if moves is not None:
            for watch in current:
                if watch not in moves:
                    # No transition for this choice: the watch dies.
                    continue

                advanced = moves[watch]
                if advanced is None:
                    cache[cache_key] = BANNED_ADVANCE_RESULT
                    return BANNED_ADVANCE_RESULT

                upcoming.add(advanced)

        if upcoming:
            result = BannedTrackerAdvanceResult(
                banned=False,
                state_id=self._get_or_register_state_id(frozenset(upcoming)),
            )
        else:
            result = CLEAR_ADVANCE_RESULT

        cache[cache_key] = result
        return result
312
+
313
+
314
+ def _find_matching_subpaths(
315
+ graph: CodonGraph,
316
+ sequence: str,
317
+ ) -> List[SubPath]:
318
+ """
319
+ Find all graph subpaths that can emit a given banned sequence.
320
+
321
+ Parameters
322
+ ----------
323
+ graph
324
+ The codon graph to search.
325
+ sequence
326
+ The banned sequence to search for.
327
+
328
+ Returns
329
+ -------
330
+ Matching subpaths as a list of SubPath objects.
331
+ """
332
+ sequence = sequence.upper()
333
+
334
+ if len(sequence) == 0:
335
+ raise ValueError('Sequence cannot be empty.')
336
+
337
+ matches: List[SubPath] = []
338
+ candidate_matches = []
339
+
340
+ def add_match(partial_path, offset):
341
+ steps = tuple(
342
+ (node.pos, choice)
343
+ for node, choice in partial_path
344
+ )
345
+
346
+ matches.append(
347
+ SubPath(
348
+ sequence=sequence,
349
+ steps=steps,
350
+ offset=offset,
351
+ )
352
+ )
353
+
354
+ # First, check which nodes we can start at.
355
+ for node in graph.nodes:
356
+ if node is graph.end_node:
357
+ continue
358
+
359
+ for choice, child in node.transitions.items():
360
+ for offset in range(len(choice)):
361
+ choice_subsequence = choice[offset:]
362
+
363
+ if choice_subsequence.startswith(sequence):
364
+ # Bingo!
365
+ add_match(((node, choice),), offset)
366
+
367
+ elif sequence.startswith(choice_subsequence):
368
+ # Maybe bingo! Maygo!
369
+ candidate_matches.append((
370
+ ((node, choice),),
371
+ offset,
372
+ len(choice_subsequence),
373
+ ))
374
+
375
+ def reinspect_candidate_matches(candidate_matches):
376
+ reinspect = []
377
+
378
+ for partial_path, offset, seen_length in candidate_matches:
379
+ previous_node, previous_choice = partial_path[-1]
380
+ node = previous_node.transitions[previous_choice]
381
+
382
+ remaining_sequence = sequence[seen_length:]
383
+
384
+ if remaining_sequence == '':
385
+ # Fantastic!
386
+ add_match(partial_path, offset)
387
+ continue
388
+
389
+ if node is graph.end_node:
390
+ continue
391
+
392
+ if isinstance(node, CodonNode):
393
+ choice_length = 3
394
+ else:
395
+ choice_length = len(next(iter(node.transitions)))
396
+
397
+ # Sneaky shortcut if we've crossed into the right context:
398
+ if isinstance(node, CodonNode):
399
+ pos = node.pos
400
+
401
+ remaining_sequence_length = len(sequence) - seen_length
402
+ remaining_coding_length = 3 * (len(graph.aa_seq) - pos + 1)
403
+
404
+ if remaining_sequence_length > remaining_coding_length:
405
+ sequence_end = sequence[seen_length + remaining_coding_length:]
406
+
407
+ if not graph.context_r.startswith(sequence_end):
408
+ continue
409
+
410
+ for choice, child in node.transitions.items():
411
+
412
+ if len(remaining_sequence) >= choice_length:
413
+
414
+ if remaining_sequence.startswith(choice):
415
+ # Keep going...
416
+ reinspect.append((
417
+ partial_path + ((node, choice),),
418
+ offset,
419
+ seen_length + choice_length,
420
+ ))
421
+
422
+ else:
423
+ # Hard luck this time.
424
+ continue
425
+
426
+ else:
427
+
428
+ if choice.startswith(remaining_sequence):
429
+ # Wahoo!
430
+ add_match(
431
+ partial_path + ((node, choice),),
432
+ offset,
433
+ )
434
+
435
+ else:
436
+ # Hard luck this time.
437
+ continue
438
+
439
+ return reinspect
440
+
441
+ while candidate_matches:
442
+ candidate_matches = reinspect_candidate_matches(candidate_matches)
443
+
444
+ return matches
@@ -0,0 +1,39 @@
1
+ from typing import Any, Optional
2
+
3
# Opaque per-constraint state; each constraint picks its own representation.
ConstraintState = Any


class PathConstraint:
    """
    Base class for constraints that are tracked while walking a codon graph.

    A constraint follows a sequence property that can be accumulated along a
    path: each step derives a new state from the previous state, the current
    graph position and the choice taken there.

    This base implementation is a no-op: it keeps an empty state, accepts every
    choice and is always satisfied.
    """

    @property
    def initial_state(self) -> ConstraintState:
        """
        State to start a walk with (an empty tuple here).
        """
        return tuple()

    def advance(
        self,
        state: Any,
        pos: int,
        choice: str,
    ) -> Optional[Any]:
        """
        Compute the state reached after taking one graph choice.

        Returning None means that the choice should be rejected. The base
        implementation hands the incoming state back unchanged.
        """
        return state

    def is_satisfied(self, state: ConstraintState) -> bool:
        """
        Tell whether the given state satisfies the constraint (always True here).
        """
        return True
@@ -0,0 +1,115 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple
3
+
4
+ from codeine.constraints.base import PathConstraint
5
+
6
# State carried along a walk, as (nt_diffs, codon_diffs). An entry is None
# when the corresponding distance is not being tracked.
MutationDistanceState = Tuple[Optional[int], Optional[int]]


@dataclass
class MutationDistanceConstraint(PathConstraint):
    """
    Restrict graph walks by their distance from a reference CDS.

    Nucleotide distance is the number of individual nucleotide changes. Codon
    distance is the number of codons that differ at all, however many of their
    nucleotides changed.
    """

    reference_cds: str
    min_nts: Optional[int] = None
    max_nts: Optional[int] = None
    min_codons: Optional[int] = None
    max_codons: Optional[int] = None

    def __post_init__(self) -> None:
        # Split the reference into consecutive 3-character chunks once, up front.
        cds = self.reference_cds
        self._ref_codons = tuple(
            cds[start:start + 3]
            for start in range(0, len(cds), 3)
        )

        # A distance is tracked as soon as either of its bounds is given.
        self._tracks_nts = not (self.min_nts is None and self.max_nts is None)
        self._tracks_codons = not (self.min_codons is None and self.max_codons is None)

        self._initial_state = (
            0 if self._tracks_nts else None,
            0 if self._tracks_codons else None,
        )

        # Positions are 1-based indices into the reference codons.
        self.first_pos = 1
        self.last_pos = len(self._ref_codons)

    @property
    def tracks_nts(self) -> bool:
        """
        Whether a nucleotide-distance bound was given.
        """
        return self._tracks_nts

    @property
    def tracks_codons(self) -> bool:
        """
        Whether a codon-distance bound was given.
        """
        return self._tracks_codons

    @property
    def initial_state(self) -> MutationDistanceState:
        """
        State to start a walk with: 0 for tracked distances, None otherwise.
        """
        return self._initial_state

    def advance(
        self,
        state: MutationDistanceState,
        pos: int,
        choice: str,
    ) -> Optional[MutationDistanceState]:
        """
        Update the running distances after one graph choice.

        Positions outside first_pos..last_pos leave the state unchanged. For
        positions inside that range, the first three characters of the choice
        are compared with the reference codon at the same position and the
        tracked totals are increased accordingly.

        Returns None as soon as a tracked total exceeds its maximum.
        """
        if not (self.first_pos <= pos <= self.last_pos):
            return state

        nt_total, codon_total = state
        reference = self._ref_codons[pos - 1]

        # Position-wise mismatches over the three codon positions.
        changed_nts = sum(reference[i] != choice[i] for i in range(3))

        if self._tracks_nts:
            nt_total += changed_nts

            if self.max_nts is not None and nt_total > self.max_nts:
                return None

        if self._tracks_codons:
            codon_total += 1 if changed_nts else 0

            if self.max_codons is not None and codon_total > self.max_codons:
                return None

        return nt_total, codon_total

    def is_satisfied(self, state: MutationDistanceState) -> bool:
        """
        Tell whether the given state reaches the configured minimum distances.
        """
        nt_total, codon_total = state

        if self.min_nts is not None and nt_total < self.min_nts:
            return False

        return self.min_codons is None or not (codon_total < self.min_codons)