PyPI - codeine - Versions diffs - 0.1.0__py3-none-any.whl - Mend

codeine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

codeine/__init__.py +15 -0
codeine/constraints/banned.py +444 -0
codeine/constraints/base.py +39 -0
codeine/constraints/mutations.py +115 -0
codeine/graph/base.py +267 -0
codeine/graph/compile.py +489 -0
codeine/graph/nodes.py +111 -0
codeine/graph/view.py +781 -0
codeine/motifs/restriction.py +105 -0
codeine/motifs/validate.py +117 -0
codeine/space/__init__.py +0 -0
codeine/space/coding.py +490 -0
codeine/space/mutation.py +512 -0
codeine/translation/__init__.py +0 -0
codeine/translation/data/__init__.py +0 -0
codeine/translation/data/tables.json +2252 -0
codeine/translation/data/weights.py +232 -0
codeine/translation/tables.py +200 -0
codeine/translation/weights.py +323 -0
codeine/utils/__init__.py +0 -0
codeine/utils/dict.py +23 -0
codeine/utils/display.py +124 -0
codeine/utils/sampling.py +90 -0
codeine-0.1.0.dist-info/METADATA +162 -0
codeine-0.1.0.dist-info/RECORD +28 -0
codeine-0.1.0.dist-info/WHEEL +5 -0
codeine-0.1.0.dist-info/licenses/LICENSE +21 -0
codeine-0.1.0.dist-info/top_level.txt +1 -0

codeine/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+from codeine.space.coding import CodingSpace
+from codeine.space.mutation import MutationSpace
+from codeine.translation.tables import TranslationTable
+from codeine.translation.weights import CodonWeights
+from codeine.motifs.restriction import RestrictionSite
+# Names exported by `from codeine import *`; each one is imported above.
+__all__ = [
+    'CodingSpace',
+    'CodonWeights',
+    'MutationSpace',
+    'RestrictionSite',
+    'TranslationTable'
+]

codeine/constraints/banned.py ADDED Viewed

@@ -0,0 +1,444 @@
+from typing import Dict, FrozenSet, List, NamedTuple, Optional, Sequence, Tuple
+from codeine.graph.base import CodonGraph
+from codeine.graph.nodes import CodonNode
+# A step is one decision taken in the codon graph: (graph pos, choice),
+# where choice is the string emitted by that decision.
+Step = Tuple[int, str]
+class SubPath(NamedTuple):
+    """
+    A concrete route through the codon graph that spells out a sequence.
+    Following `steps` in order emits a string; `sequence` begins `offset`
+    characters into the choice of the first step.
+    """
+    # The sequence spelled out by this subpath.
+    sequence: str
+    # The (pos, choice) steps, in the order they are taken.
+    steps: Tuple[Step, ...]
+    # Index within the first step's choice at which `sequence` starts.
+    offset: int
+# A watch (path_ix, matched_length) is the status of a single watched path:
+# path_ix indexes the tracker's paths, and matched_length is how many bases
+# of that path's sequence have been seen so far.
+Watch = Tuple[int, int]
+# The tracker state is the set of currently active watches. It is
+# recomputed every time a choice is made.
+BannedTrackerState = FrozenSet[Watch]
+# Integer ID for a registered banned tracker state.
+BannedTrackerStateId = int
+# Internal transition value:
+#   None   -> the banned sequence has been completed
+#   Watch  -> keep watching this path, with the updated matched length
+TransitionValue = Optional[Watch]
+class BannedTrackerAdvanceResult(NamedTuple):
+    """
+    Result of advancing a registered banned-tracker state by one graph step.
+    If banned is True, the step has completed a banned sequence and the
+    candidate path should be rejected; state_id carries no meaning in that
+    case. Otherwise, state_id is the registered ID of the tracker state
+    reached after taking the step.
+    """
+    # True when the step completed a banned sequence.
+    banned: bool
+    # ID of the resulting tracker state; defaults to 0.
+    state_id: BannedTrackerStateId = 0
+# Shared result instances that BannedSequenceTracker.advance returns instead
+# of building a new tuple each time. State ID 0 is the ID the tracker
+# registers for its empty initial state.
+CLEAR_ADVANCE_RESULT = BannedTrackerAdvanceResult(banned=False, state_id=0)
+BANNED_ADVANCE_RESULT = BannedTrackerAdvanceResult(banned=True, state_id=0)
+class BannedSequenceTracker:
+    """
+    Tracks progress along concrete banned graph subpaths.
+    A SubPath stores:
+        sequence
+            The banned sequence being tracked.
+        steps
+            Concrete graph emissions as (pos, choice) pairs.
+        offset
+            Where sequence starts inside the first choice.
+    Internally, a tracker state is a frozenset of watches:
+        (path_ix, matched_length)
+    meaning that matched_length bases of paths[path_ix].sequence have already
+    matched. States are registered and exposed to the compiler as integer IDs,
+    so traversal states remain compact and cheap to hash.
+    Transitions are precomputed as:
+        choice -> (path_ix, matched_length) -> banned | next watch | dead
+    """
+    def __init__(self, graph: CodonGraph, banned_sequences: Sequence[str]) -> None:
+        """
+        Constructor for the BannedSequenceTracker class.
+        Parameters
+        ----------
+        graph
+            The codon graph on which to operate.
+        banned_sequences
+            A collection of sequences that must not occur in generated paths.
+        """
+        self.graph = graph
+        self.banned_sequences = tuple(sequence.upper() for sequence in banned_sequences)
+        self.initial_state: BannedTrackerState = frozenset()
+        self.initial_state_id: int = 0
+        self.state_ids: Dict[BannedTrackerState, BannedTrackerStateId] = {self.initial_state: self.initial_state_id}
+        self.states: List[BannedTrackerState] = [self.initial_state]
+        self.advance_cache: Dict[Tuple[Step, BannedTrackerStateId], BannedTrackerAdvanceResult] = {}
+        self.paths = self._find_banned_paths()
+        self.starts = self._build_starts()
+        self.transitions = self._build_transitions()
+    @property
+    def is_trivial(self) -> bool:
+        """
+        Whether this tracker is trivial - i.e. there are no paths that would generate
+        a sequence containing a banned sequence.
+        Returns
+        -------
+        True if and only if the tracker is trivial.
+        """
+        return len(self.paths) == 0
+    def _find_banned_paths(self) -> Tuple[SubPath, ...]:
+        """
+        Find every concrete graph path that can generate a banned sequence.
+        Each returned SubPath records the emitted sequence, the graph steps
+        required to produce it, and the offset at which the banned sequence begins
+        within the first emitted choice.
+        Returns
+        -------
+        Tuple[SubPath, ...]
+            All graph subpaths capable of producing one of the banned sequences.
+        """
+        paths = []
+        for sequence in self.banned_sequences:
+            paths.extend(_find_matching_subpaths(self.graph, sequence))
+        return tuple(paths)
+    def _build_starts(self) -> Dict[Step, Tuple[TransitionValue, ...]]:
+        """
+        Build the initial watch transitions for each possible graph step.
+        The returned mapping records which watches should be created when a given
+        step is taken. If a banned sequence is completed immediately, the transition
+        value is None.
+        Returns
+        -------
+        Dict[Step, Tuple[TransitionValue, ...]]
+            Mapping from graph step to the watches that should be started after
+            taking that step.
+        """
+        starts: Dict[Step, List[TransitionValue]] = {}
+        for path_ix, path in enumerate(self.paths):
+            first_step = path.steps[0]
+            _pos, first_choice = first_step
+            emitted = first_choice[path.offset:]
+            matched_length = min(len(emitted), len(path.sequence))
+            if matched_length >= len(path.sequence):
+                result = None
+            else:
+                result = (path_ix, matched_length)
+            starts.setdefault(first_step, []).append(result)
+        return {
+            key: tuple(results)
+            for key, results in starts.items()
+        }
+    def _build_transitions(self) -> Dict[str, Dict[Watch, TransitionValue]]:
+        """
+        Build transitions between active watches.
+        For each emitted graph choice, records how every active watch should
+        advance. A transition value of None indicates that the banned sequence
+        has been completed.
+        Returns
+        -------
+        Dict[str, Dict[Watch, TransitionValue]]
+            Mapping from emitted graph choice to the corresponding watch
+            transitions.
+        """
+        transitions: Dict[str, Dict[Watch, TransitionValue]] = {}
+        for path_ix, path in enumerate(self.paths):
+            matched_length = min(
+                len(path.steps[0][1]) - path.offset,
+                len(path.sequence),
+            )
+            if matched_length >= len(path.sequence):
+                continue
+            for _pos, choice in path.steps[1:]:
+                watch = (path_ix, matched_length)
+                remaining = path.sequence[matched_length:]
+                if choice.startswith(remaining):
+                    transitions.setdefault(choice, {})[watch] = None
+                    break
+                if remaining.startswith(choice):
+                    matched_length += len(choice)
+                    transitions.setdefault(choice, {})[watch] = (path_ix, matched_length)
+                    continue
+                break
+        return transitions
+    def _get_or_register_state_id(
+            self,
+            state: BannedTrackerState,
+    ) -> BannedTrackerStateId:
+        """
+        Return the integer ID for a banned-tracker state, creating one if needed.
+        Parameters
+        ----------
+        state
+            The concrete frozenset-of-watches tracker state.
+        Returns
+        -------
+        BannedTrackerStateId
+            Stable integer ID for the given tracker state.
+        """
+        state_id = self.state_ids.get(state)
+        if state_id is None:
+            state_id = len(self.states)
+            self.state_ids[state] = state_id
+            self.states.append(state)
+        return state_id
+    def advance(
+            self,
+            step: Step,
+            state_id: BannedTrackerStateId,
+    ) -> BannedTrackerAdvanceResult:
+        """
+        Advance a registered tracker state after taking one graph step.
+        Parameters
+        ----------
+        step
+            The graph step just taken, as (graph pos, choice).
+        state_id
+            Integer ID of the current banned-tracker state.
+        Returns
+        -------
+        BannedTrackerAdvanceResult
+            Whether the step completed a banned sequence, and otherwise the
+            integer ID of the updated tracker state.
+        """
+        key = (step, state_id)
+        cached = self.advance_cache.get(key)
+        if cached is not None:
+            return cached
+        state = self.states[state_id]
+        starts = self.starts.get(step)
+        if starts is None and not state:
+            self.advance_cache[key] = CLEAR_ADVANCE_RESULT
+            return CLEAR_ADVANCE_RESULT
+        _pos, choice = step
+        transitions = self.transitions.get(choice)
+        next_state = set()
+        if starts is not None:
+            for watch in starts:
+                if watch is None:
+                    self.advance_cache[key] = BANNED_ADVANCE_RESULT
+                    return BANNED_ADVANCE_RESULT
+                next_state.add(watch)
+        if transitions is not None:
+            for watch in state:
+                next_watch = transitions.get(watch)
+                if next_watch is None:
+                    if watch in transitions:
+                        self.advance_cache[key] = BANNED_ADVANCE_RESULT
+                        return BANNED_ADVANCE_RESULT
+                    continue
+                next_state.add(next_watch)
+        if not next_state:
+            self.advance_cache[key] = CLEAR_ADVANCE_RESULT
+            return CLEAR_ADVANCE_RESULT
+        result = BannedTrackerAdvanceResult(
+            banned=False,
+            state_id=self._get_or_register_state_id(frozenset(next_state)),
+        )
+        self.advance_cache[key] = result
+        return result
+def _find_matching_subpaths(
+        graph: CodonGraph,
+        sequence: str,
+) -> List[SubPath]:
+    """
+    Find all graph subpaths that can emit a given banned sequence.
+    The search has two phases. First, every (node, choice, offset) combination
+    is tested as a possible starting point. Then the partial matches are
+    extended one graph choice at a time until each has either emitted the
+    whole sequence or been discarded.
+    Parameters
+    ----------
+    graph
+        The codon graph to search.
+    sequence
+        The banned sequence to search for. It is upper-cased before matching.
+    Returns
+    -------
+    Matching subpaths as a list of SubPath objects.
+    Raises
+    ------
+    ValueError
+        If sequence is empty.
+    """
+    sequence = sequence.upper()
+    if len(sequence) == 0:
+        raise ValueError('Sequence cannot be empty.')
+    matches: List[SubPath] = []
+    # Partial matches, each stored as (partial_path, offset, seen_length),
+    # where partial_path is a tuple of (node, choice) pairs.
+    candidate_matches = []
+    def add_match(partial_path, offset):
+        # Record a completed match, converting (node, choice) pairs into
+        # (node.pos, choice) steps.
+        steps = tuple(
+            (node.pos, choice)
+            for node, choice in partial_path
+        )
+        matches.append(
+            SubPath(
+                sequence=sequence,
+                steps=steps,
+                offset=offset,
+            )
+        )
+    # First, check which nodes we can start at.
+    for node in graph.nodes:
+        if node is graph.end_node:
+            continue
+        for choice, child in node.transitions.items():
+            # Try every position inside this choice as the start of the sequence.
+            for offset in range(len(choice)):
+                choice_subsequence = choice[offset:]
+                if choice_subsequence.startswith(sequence):
+                    # The whole sequence fits inside this single choice.
+                    add_match(((node, choice),), offset)
+                elif sequence.startswith(choice_subsequence):
+                    # The tail of this choice is a proper prefix of the
+                    # sequence: keep it as a candidate to extend later.
+                    candidate_matches.append((
+                        ((node, choice),),
+                        offset,
+                        len(choice_subsequence),
+                    ))
+    def reinspect_candidate_matches(candidate_matches):
+        # Extend every candidate by one graph choice; return the candidates
+        # that are still incomplete after that extension.
+        reinspect = []
+        for partial_path, offset, seen_length in candidate_matches:
+            previous_node, previous_choice = partial_path[-1]
+            # The node reached by taking the last choice of the partial path.
+            node = previous_node.transitions[previous_choice]
+            remaining_sequence = sequence[seen_length:]
+            if remaining_sequence == '':
+                # The previous extension consumed exactly the rest of the sequence.
+                add_match(partial_path, offset)
+                continue
+            if node is graph.end_node:
+                continue
+            if isinstance(node, CodonNode):
+                choice_length = 3
+            else:
+                # NOTE(review): takes the length of the first choice only, so
+                # this presumably assumes all choices leaving a non-codon
+                # node have the same length - confirm.
+                choice_length = len(next(iter(node.transitions)))
+            # Sneaky shortcut if we've crossed into the right context:
+            # NOTE(review): this appears to discard a candidate when the part
+            # of the sequence that would extend past the remaining coding
+            # positions is not a prefix of graph.context_r. The arithmetic
+            # looks like it treats node.pos as 1-based - confirm.
+            if isinstance(node, CodonNode):
+                pos = node.pos
+                remaining_sequence_length = len(sequence) - seen_length
+                remaining_coding_length = 3 * (len(graph.aa_seq) - pos + 1)
+                if remaining_sequence_length > remaining_coding_length:
+                    sequence_end = sequence[seen_length + remaining_coding_length:]
+                    if not graph.context_r.startswith(sequence_end):
+                        continue
+            for choice, child in node.transitions.items():
+                if len(remaining_sequence) >= choice_length:
+                    # The choice cannot overshoot the sequence: it must match
+                    # the next part of it in full.
+                    if remaining_sequence.startswith(choice):
+                        # Still matching: queue for another round.
+                        reinspect.append((
+                            partial_path + ((node, choice),),
+                            offset,
+                            seen_length + choice_length,
+                        ))
+                    else:
+                        # This choice does not continue the sequence.
+                        continue
+                else:
+                    # The remaining sequence is shorter than one choice, so it
+                    # must be a prefix of the choice for the match to complete.
+                    if choice.startswith(remaining_sequence):
+                        # The sequence ends inside this choice.
+                        add_match(
+                            partial_path + ((node, choice),),
+                            offset,
+                        )
+                    else:
+                        # This choice does not complete the sequence.
+                        continue
+        return reinspect
+    # Keep extending until no incomplete candidates are left.
+    while candidate_matches:
+        candidate_matches = reinspect_candidate_matches(candidate_matches)
+    return matches

codeine/constraints/base.py ADDED Viewed

@@ -0,0 +1,39 @@
+from typing import Any, Optional
+ConstraintState = Any
+class PathConstraint:
+    """
+    Base class for tracking constraints applied while walking a codon graph.
+    Designed to track sequence properties that can be calculated by accumulating
+    calculations along a path length.
+    The idea is to update a state based on the previous state, current node, and choice.
+    This base implementation constrains nothing: the state starts as an empty
+    tuple, advance never rejects a choice, and every state is satisfied.
+    """
+    @property
+    def initial_state(self) -> ConstraintState:
+        """
+        Initial constraint-tracking state.
+        Returns
+        -------
+        An empty tuple.
+        """
+        return ()
+    def advance(
+        self,
+        state: Any,
+        pos: int,
+        choice: str,
+    ) -> Optional[Any]:
+        """
+        Advance the constraint state after taking one graph choice.
+        Return None if this choice should be rejected.
+        Parameters
+        ----------
+        state
+            The constraint state before the choice.
+        pos
+            Graph position at which the choice is made.
+        choice
+            The choice taken at that position.
+        Returns
+        -------
+        The state after the choice. This base implementation returns the
+        given state unchanged and never returns None.
+        """
+        return state
+    def is_satisfied(self, state: ConstraintState) -> bool:
+        """
+        Return whether this constraint is satisfied by the current state.
+        This base implementation always returns True.
+        """
+        return True

codeine/constraints/mutations.py ADDED Viewed

@@ -0,0 +1,115 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple
+from codeine.constraints.base import PathConstraint
+# nt_diffs, codon_diffs
+MutationDistanceState = Tuple[Optional[int], Optional[int]]
+@dataclass
+class MutationDistanceConstraint(PathConstraint):
+    """
+    Constrain graph walks by distance from a reference CDS.
+    Nucleotide distance counts individual nucleotide changes. Codon distance
+    counts codons that differ, regardless of how many nucleotides changed.
+    """
+    reference_cds: str
+    min_nts: Optional[int] = None
+    max_nts: Optional[int] = None
+    min_codons: Optional[int] = None
+    max_codons: Optional[int] = None
+    def __post_init__(self) -> None:
+        # Store the reference codons once, on init.
+        ref_codons = [self.reference_cds[i:i + 3] for i in range(0, len(self.reference_cds), 3)]
+        self._ref_codons = tuple(ref_codons)
+        self._tracks_nts = self.min_nts is not None or self.max_nts is not None
+        self._tracks_codons = self.min_codons is not None or self.max_codons is not None
+        self._initial_state = (
+            0 if self._tracks_nts else None,
+            0 if self._tracks_codons else None,
+        )
+        self.first_pos = 1
+        self.last_pos = len(self._ref_codons)
+    @property
+    def tracks_nts(self) -> bool:
+        """
+        Whether nucleotide differences are constrained.
+        """
+        return self._tracks_nts
+    @property
+    def tracks_codons(self) -> bool:
+        """
+        Whether codon differences are constrained.
+        """
+        return self._tracks_codons
+    @property
+    def initial_state(self) -> MutationDistanceState:
+        """
+        Initial mutation-distance state.
+        """
+        return self._initial_state
+    def advance(
+        self,
+        state: MutationDistanceState,
+        pos: int,
+        choice: str,
+    ) -> Optional[MutationDistanceState]:
+        """
+        Advance mutation-distance tracking by one graph choice. The state updates on each
+        advance by adding the number of nt/codon differences given each next choice.
+        Non-codon nodes do not affect distance. Codon nodes add the distance
+        between the chosen codon and the reference codon at the same position.
+        """
+        if pos < self.first_pos or pos > self.last_pos:
+            return state
+        nt_diffs, codon_diffs = state
+        ref_codon = self._ref_codons[pos - 1]
+        nt_diff = (
+            (ref_codon[0] != choice[0])
+            + (ref_codon[1] != choice[1])
+            + (ref_codon[2] != choice[2])
+        )
+        codon_diff = int(nt_diff != 0)
+        if self._tracks_nts:
+            nt_diffs += nt_diff
+            if self.max_nts is not None and nt_diffs > self.max_nts:
+                return None
+        if self._tracks_codons:
+            codon_diffs += codon_diff
+            if self.max_codons is not None and codon_diffs > self.max_codons:
+                return None
+        return nt_diffs, codon_diffs
+    def is_satisfied(self, state: MutationDistanceState) -> bool:
+        """
+        Check whether a given state satisfies minimum distances.
+        """
+        nt_diffs, codon_diffs = state
+        if self.min_nts is not None and nt_diffs < self.min_nts:
+            return False
+        if self.min_codons is not None and codon_diffs < self.min_codons:
+            return False
+        return True