geney-1.4.40-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- geney/__init__.py +25 -0
- geney/engines.py +307 -0
- geney/oncosplice.py +411 -0
- geney/pipelines.py +97 -0
- geney/samples.py +3 -0
- geney/splice_graph.py +371 -0
- geney/splicing_table.py +142 -0
- geney/transcripts.py +68 -0
- geney/utils.py +254 -0
- geney/variants.py +389 -0
- geney-1.4.40.dist-info/METADATA +32 -0
- geney-1.4.40.dist-info/RECORD +14 -0
- geney-1.4.40.dist-info/WHEEL +5 -0
- geney-1.4.40.dist-info/top_level.txt +1 -0
geney/splice_graph.py
ADDED
# oncosplice/splice_graph.py
from __future__ import annotations

from collections import defaultdict
from typing import Dict, Generator, List, Tuple

import numpy as np
import pandas as pd
from pandas import Series

from .utils import short_hash_of_list  # type: ignore


class SpliceSimulator:
    """
    Builds a splice-site graph from a splicing DataFrame and enumerates isoform paths.
    """

    def __init__(self, splicing_df: pd.DataFrame, transcript, max_distance: int, feature: str = "event"):
        self.full_df = splicing_df
        self.feature = feature
        self.rev = transcript.rev
        self.transcript_start = transcript.transcript_start
        self.transcript_end = transcript.transcript_end
        self.donors = transcript.donors
        self.acceptors = transcript.acceptors
        self.transcript = transcript
        self.max_distance = max_distance

        self.set_donor_nodes()
        self.set_acceptor_nodes()

    def _compute_splice_df(self, site_type: str) -> pd.DataFrame:
        feature_col = f"{self.feature}_prob"
        df = getattr(self.full_df, site_type + "s").copy()
        site_set = getattr(self, site_type + "s")

        missing = set(site_set) - set(df.index)
        if missing:
            df = pd.concat([df, pd.DataFrame(index=list(missing))], axis=0)
            df.loc[list(missing), ["annotated", "ref_prob", feature_col]] = [True, 1, 1]

        if "annotated" not in df.columns:
            df["annotated"] = False
        else:
            df["annotated"] = df["annotated"].where(df["annotated"].notna(), False).astype(bool)

        df.sort_index(ascending=not self.rev, inplace=True)

        MIN_INCREASE_RATIO = 0.2

        df["discovered_delta"] = np.where(
            ~df["annotated"],
            (df[feature_col] - df["ref_prob"]),
            np.nan,
        )
        df["discovered_delta"] = df["discovered_delta"].where(
            df["discovered_delta"] >= MIN_INCREASE_RATIO, 0
        )

        with np.errstate(divide="ignore", invalid="ignore"):
            df["deleted_delta"] = np.where(
                (df["ref_prob"] > 0) & df["annotated"],
                (df[feature_col] - df["ref_prob"]) / df["ref_prob"],
                0,
            )
        df["deleted_delta"] = df["deleted_delta"].clip(upper=0)

        df["P"] = df["annotated"].astype(float) + df["discovered_delta"] + df["deleted_delta"]
        return df

    @property
    def donor_df(self) -> pd.DataFrame:
        return self._compute_splice_df("donor")

    @property
    def acceptor_df(self) -> pd.DataFrame:
        return self._compute_splice_df("acceptor")

    def report(self, pos):
        metadata = self.find_splice_site_proximity(pos)
        metadata["donor_events"] = self.donor_df[
            (self.donor_df.deleted_delta.abs() > 0.2)
            | (self.donor_df.discovered_delta.abs() > 0.2)
        ].reset_index().to_json()
        metadata["acceptor_events"] = self.acceptor_df[
            (self.acceptor_df.deleted_delta.abs() > 0.2)
            | (self.acceptor_df.discovered_delta.abs() > 0.2)
        ].reset_index().to_json()
        metadata["missplicing"] = self.max_splicing_delta("event_prob")
        return metadata

    def max_splicing_delta(self, event: str) -> float:
        all_diffs = []
        for site_type in ["donors", "acceptors"]:
            df = self.full_df[site_type]
            diffs = (df[event] - df["ref_prob"]).tolist()
            all_diffs.extend(diffs)
        return max(all_diffs, key=abs)

    def set_donor_nodes(self) -> None:
        donors = self.donor_df.P
        donor_list = list(donors[donors > 0].round(2).items())
        donor_list.append((self.transcript_end, 1))
        self.donor_nodes = sorted(
            donor_list, key=lambda x: int(x[0]), reverse=bool(self.rev)
        )

    def set_acceptor_nodes(self) -> None:
        acceptors = self.acceptor_df.P
        acceptor_list = list(acceptors[acceptors > 0].round(2).items())
        acceptor_list.insert(0, (self.transcript_start, 1.0))
        self.acceptor_nodes = sorted(
            acceptor_list, key=lambda x: int(x[0]), reverse=bool(self.rev)
        )

    def generate_graph(self) -> Dict[Tuple[int, str], List[Tuple[int, str, float]]]:
        adjacency_list: Dict[Tuple[int, str], List[Tuple[int, str, float]]] = defaultdict(list)

        # donor -> acceptor
        for d_pos, d_prob in self.donor_nodes:
            running_prob = 1.0
            for a_pos, a_prob in self.acceptor_nodes:
                correct_orientation = ((a_pos > d_pos and not self.rev) or (a_pos < d_pos and self.rev))
                distance_valid = abs(a_pos - d_pos) <= self.max_distance
                if not (correct_orientation and distance_valid):
                    continue

                if not self.rev:
                    in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if d_pos < a < a_pos)
                    in_between_donors = sum(1 for d, _ in self.donor_nodes if d_pos < d < a_pos)
                else:
                    in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if a_pos < a < d_pos)
                    in_between_donors = sum(1 for d, _ in self.donor_nodes if a_pos < d < d_pos)

                if in_between_donors == 0 or in_between_acceptors == 0:
                    adjacency_list[(d_pos, "donor")].append((a_pos, "acceptor", a_prob))
                    running_prob -= a_prob
                else:
                    if running_prob > 0:
                        adjacency_list[(d_pos, "donor")].append(
                            (a_pos, "acceptor", a_prob * running_prob)
                        )
                        running_prob -= a_prob
                    else:
                        break

        # acceptor -> donor
        for a_pos, a_prob in self.acceptor_nodes:
            running_prob = 1.0
            for d_pos, d_prob in self.donor_nodes:
                correct_orientation = ((d_pos > a_pos and not self.rev) or (d_pos < a_pos and self.rev))
                distance_valid = abs(d_pos - a_pos) <= self.max_distance
                if not (correct_orientation and distance_valid):
                    continue

                if not self.rev:
                    in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if a_pos < a < d_pos)
                    in_between_donors = sum(1 for d, _ in self.donor_nodes if a_pos < d < d_pos)
                else:
                    in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if d_pos < a < a_pos)
                    in_between_donors = sum(1 for d, _ in self.donor_nodes if d_pos < d < a_pos)

                tag = "donor" if d_pos != self.transcript_end else "transcript_end"
                if in_between_acceptors == 0:
                    adjacency_list[(a_pos, "acceptor")].append((d_pos, tag, d_prob))
                    running_prob -= d_prob
                else:
                    if running_prob > 0:
                        adjacency_list[(a_pos, "acceptor")].append(
                            (d_pos, tag, d_prob * running_prob)
                        )
                        running_prob -= d_prob
                    else:
                        break

        # transcript_start -> donors
        running_prob = 1.0
        for d_pos, d_prob in self.donor_nodes:
            correct_orientation = (
                (d_pos > self.transcript_start and not self.rev)
                or (d_pos < self.transcript_start and self.rev)
            )
            distance_valid = abs(d_pos - self.transcript_start) <= self.max_distance
            if correct_orientation and distance_valid:
                adjacency_list[(self.transcript_start, "transcript_start")].append(
                    (d_pos, "donor", d_prob)
                )
                running_prob -= d_prob
                if running_prob <= 0:
                    break

        # normalize outgoing edges
        for key, next_nodes in adjacency_list.items():
            total_prob = sum(prob for (_, _, prob) in next_nodes)
            if total_prob > 0:
                adjacency_list[key] = [
                    (pos, typ, round(prob / total_prob, 3))
                    for pos, typ, prob in next_nodes
                ]
        return adjacency_list

    def find_all_paths(
        self,
        graph: Dict[Tuple[int, str], List[Tuple[int, str, float]]],
        start: Tuple[int, str],
        end: Tuple[int, str],
        path: List[Tuple[int, str]] | None = None,
        probability: float = 1.0,
    ) -> Generator[Tuple[List[Tuple[int, str]], float], None, None]:
        if path is None:
            path = [start]
        else:
            path = path + [start]

        if start == end:
            yield path, probability
            return
        if start not in graph:
            return

        for next_pos, tag, prob in graph[start]:
            yield from self.find_all_paths(
                graph,
                (next_pos, tag),
                end,
                path,
                probability * prob,
            )

    def get_viable_paths(self) -> List[Tuple[List[Tuple[int, str]], float]]:
        graph = self.generate_graph()
        start_node = (self.transcript_start, "transcript_start")
        end_node = (self.transcript_end, "transcript_end")
        paths = list(self.find_all_paths(graph, start_node, end_node))
        paths.sort(key=lambda x: x[1], reverse=True)
        return paths

    def get_viable_transcripts(self, metadata: bool = False):
        graph = self.generate_graph()
        start_node = (self.transcript_start, "transcript_start")
        end_node = (self.transcript_end, "transcript_end")
        paths = list(self.find_all_paths(graph, start_node, end_node))
        paths.sort(key=lambda x: x[1], reverse=True)

        for path, prob in paths:
            donors = [pos for pos, typ in path if typ == "donor"]
            acceptors = [pos for pos, typ in path if typ == "acceptor"]

            t = self.transcript.clone()
            t.donors = [d for d in donors if d != t.transcript_end]
            t.acceptors = [a for a in acceptors if a != t.transcript_start]
            t.path_weight = prob
            t.path_hash = short_hash_of_list(tuple(donors + acceptors))
            t.generate_mature_mrna().generate_protein()
            if metadata:
                md = pd.concat(
                    [
                        self.compare_splicing_to_reference(t),
                        pd.Series(
                            {
                                "isoform_prevalence": t.path_weight,
                                "isoform_id": t.path_hash,
                            }
                        ),
                    ]
                )
                yield t, md
            else:
                yield t

    def find_splice_site_proximity(self, pos: int) -> Series:
        def result(region, index, start, end):
            return pd.Series(
                {
                    "region": region,
                    "index": index + 1,
                    "5'_dist": abs(pos - min(start, end)),
                    "3'_dist": abs(pos - max(start, end)),
                }
            )

        if not hasattr(self.transcript, "exons") or not hasattr(self.transcript, "introns"):
            return pd.Series(
                {"region": None, "index": None, "5'_dist": np.inf, "3'_dist": np.inf}
            )

        for i, (start, end) in enumerate(self.transcript.exons):
            if min(start, end) <= pos <= max(start, end):
                return result("exon", i, start, end)

        for i, (start, end) in enumerate(self.transcript.introns):
            if min(start, end) <= pos <= max(start, end):
                return result("intron", i, start, end)

        return pd.Series(
            {"region": None, "index": None, "5'_dist": np.inf, "3'_dist": np.inf}
        )

    def define_missplicing_events(self, var) -> Tuple[str, str, str, str, str]:
        ref = self.transcript
        ref_introns, ref_exons = getattr(ref, "introns", []), getattr(ref, "exons", [])
        var_introns, var_exons = getattr(var, "introns", []), getattr(var, "exons", [])

        num_ref_exons = len(ref_exons)
        num_ref_introns = len(ref_introns)

        pes, pir, es, ne, ir = [], [], [], [], []

        for exon_count, (t1, t2) in enumerate(ref_exons):
            for (s1, s2) in var_exons:
                if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
                    ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))
                ):
                    pes.append(
                        f"Exon {exon_count+1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}"
                    )

        for intron_count, (t1, t2) in enumerate(ref_introns):
            for (s1, s2) in var_introns:
                if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
                    ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))
                ):
                    pir.append(
                        f"Intron {intron_count+1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}"
                    )

        for exon_count, (t1, t2) in enumerate(ref_exons):
            if t1 not in var.acceptors and t2 not in var.donors:
                es.append(
                    f"Exon {exon_count+1}/{num_ref_exons} skipped: {(t1, t2)}"
                )

        for (s1, s2) in var_exons:
            if s1 not in ref.acceptors and s2 not in ref.donors:
                ne.append(f"Novel Exon: {(s1, s2)}")

        for intron_count, (t1, t2) in enumerate(ref_introns):
            if t1 not in var.donors and t2 not in var.acceptors:
                ir.append(
                    f"Intron {intron_count+1}/{num_ref_introns} retained: {(t1, t2)}"
                )

        return ",".join(pes), ",".join(pir), ",".join(es), ",".join(ne), ",".join(ir)

    def summarize_missplicing_event(self, pes, pir, es, ne, ir) -> str:
        event = []
        if pes:
            event.append("PES")
        if es:
            event.append("ES")
        if pir:
            event.append("PIR")
        if ir:
            event.append("IR")
        if ne:
            event.append("NE")
        return ",".join(event) if event else "-"

    def compare_splicing_to_reference(self, transcript_variant) -> Series:
        pes, pir, es, ne, ir = self.define_missplicing_events(transcript_variant)
        return pd.Series(
            {
                "pes": pes,
                "pir": pir,
                "es": es,
                "ne": ne,
                "ir": ir,
                "summary": self.summarize_missplicing_event(pes, pir, es, ne, ir),
            }
        )
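For orientation, here is a minimal sketch of how `SpliceSimulator` might be driven. The `transcript` and `splicing_df` arguments are hypothetical stand-ins: a transcript object exposing the attributes the constructor reads (`rev`, `transcript_start`, `transcript_end`, `donors`, `acceptors`, `clone()`) and a splicing table shaped like the multi-index output of `adjoin_splicing_outcomes` in `splicing_table.py`. Only the `SpliceSimulator` calls themselves come from the code above.

# Hypothetical driver; `transcript` and `splicing_df` are assumed inputs,
# not a documented geney API.
from geney.splice_graph import SpliceSimulator

def enumerate_isoforms(transcript, splicing_df, max_distance=500_000):
    sim = SpliceSimulator(splicing_df, transcript, max_distance, feature="event")
    # get_viable_paths() yields (path, weight) pairs sorted by weight; each
    # path is a list of (position, node_type) tuples from transcript_start
    # to transcript_end, and weight is the product of normalized edge weights.
    for path, weight in sim.get_viable_paths():
        donors = [pos for pos, typ in path if typ == "donor"]
        acceptors = [pos for pos, typ in path if typ == "acceptor"]
        print(f"weight={weight:.3f} donors={donors} acceptors={acceptors}")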
geney/splicing_table.py
ADDED
# oncosplice/splicing_table.py
from __future__ import annotations

from typing import Dict, Optional

import numpy as np
import pandas as pd

from .engines import run_splicing_engine


def predict_splicing(s, position: int, engine: str = 'spliceai',
                     context: int = 7500) -> pd.DataFrame:
    """
    Predict splicing probabilities around a given position using the specified engine.

    Args:
        s: Sequence object to predict on (must expose .seq, .index, .rev,
            .name, and .clone(start, end)).
        position (int): The genomic position to center the prediction window on.
        engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
        context (int): Number of bases of context on each side of the position
            (default: 7500).

    Returns:
        pd.DataFrame: A DataFrame indexed by position containing:
            - donor_prob: Probability of being a donor splice site
            - acceptor_prob: Probability of being an acceptor splice site
            - nucleotides: The nucleotide at that position

    Raises:
        ValueError: If the position is outside the sequence bounds, if no
            sequence data is found around the position, or if an unsupported
            engine is provided.
    """
    # Validate position is within sequence bounds
    if position < s.index.min() or position > s.index.max():
        raise ValueError(f"Position {position} is outside sequence bounds [{s.index.min()}, {s.index.max()}]")

    # Retrieve extended context (includes flanks) around the position.
    target = s.clone(position - context, position + context)

    # Check if the target clone resulted in an empty sequence
    if len(target.seq) == 0:
        raise ValueError(f"No sequence data found around position {position} with context {context}")

    seq, indices = target.seq, target.index

    # Validate that the indices array is not empty
    if len(indices) == 0:
        raise ValueError(f"No indices found in sequence around position {position}")

    # Find the relative position within the context window; pad with 'N's and
    # extrapolated indices wherever the window runs past the sequence ends.
    rel_pos = np.abs(indices - position).argmin()
    left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
    if left_missing > 0 or right_missing > 0:
        step = -1 if s.rev else 1

        if left_missing > 0:
            left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
        else:
            left_pad = np.array([], dtype=indices.dtype)

        if right_missing > 0:
            right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
        else:
            right_pad = np.array([], dtype=indices.dtype)

        seq = 'N' * left_missing + seq + 'N' * right_missing
        indices = np.concatenate([left_pad, indices, right_pad])

    # Run the splicing prediction engine (defined in .engines)
    donor_probs, acceptor_probs = run_splicing_engine(seq=seq, engine=engine)
    # Trim off the fixed 5000-nt flanks before returning results.
    seq = seq[5000:-5000]
    indices = indices[5000:-5000]
    df = pd.DataFrame({
        'position': indices,
        'donor_prob': donor_probs,
        'acceptor_prob': acceptor_probs,
        'nucleotides': list(seq)
    }).set_index('position').round(3)

    df.attrs['name'] = s.name
    return df


def adjoin_splicing_outcomes(
    splicing_predictions: Dict[str, pd.DataFrame],
    transcript: Optional[object] = None,
) -> pd.DataFrame:
    """
    Combine splicing predictions for multiple mutations into a multi-index DataFrame.

    splicing_predictions: {label -> DF with 'donor_prob','acceptor_prob','nucleotides'}
    transcript: optional transcript (must have .acceptors, .donors, .rev)
    """
    if not splicing_predictions:
        raise ValueError("splicing_predictions cannot be empty")

    dfs = []
    for label, df in splicing_predictions.items():
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"Expected DataFrame for '{label}', got {type(df).__name__}")

        required_cols = ["donor_prob", "acceptor_prob", "nucleotides"]
        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            raise ValueError(
                f"DataFrame for '{label}' missing required columns: {missing}"
            )

        var_df = df.rename(
            columns={
                "donor_prob": ("donors", f"{label}_prob"),
                "acceptor_prob": ("acceptors", f"{label}_prob"),
                "nucleotides": ("nts", f"{label}"),
            }
        )
        dfs.append(var_df)

    try:
        full_df = pd.concat(dfs, axis=1)
    except Exception as e:
        raise ValueError(f"Failed to concatenate DataFrames: {e}") from e

    if not isinstance(full_df.columns, pd.MultiIndex):
        full_df.columns = pd.MultiIndex.from_tuples(full_df.columns)

    if transcript is not None:
        full_df[("acceptors", "annotated")] = full_df.apply(
            lambda row: row.name in transcript.acceptors, axis=1
        )
        full_df[("donors", "annotated")] = full_df.apply(
            lambda row: row.name in transcript.donors, axis=1
        )
        full_df.sort_index(axis=1, level=0, inplace=True)
        full_df.sort_index(ascending=not transcript.rev, inplace=True)
    else:
        full_df.sort_index(axis=1, level=0, inplace=True)

    return full_df
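A rough sketch of how these two functions compose: labeling the unmutated prediction 'ref' is what makes the rename above produce the `ref_prob` columns that `SpliceSimulator` later consumes. The `ref_seq`/`mut_seq` arguments are hypothetical stand-ins for the package's SeqMat-like sequence type.

# Hypothetical composition of predict_splicing and adjoin_splicing_outcomes;
# `ref_seq` and `mut_seq` stand in for the package's sequence objects.
from geney.splicing_table import adjoin_splicing_outcomes, predict_splicing

def score_variant(ref_seq, mut_seq, transcript, position):
    predictions = {
        "ref": predict_splicing(ref_seq, position, engine="spliceai"),
        "event": predict_splicing(mut_seq, position, engine="spliceai"),
    }
    # Columns become a MultiIndex such as ('donors', 'ref_prob'),
    # ('donors', 'event_prob'), ('nts', 'ref'); passing a transcript also
    # adds ('donors'|'acceptors', 'annotated') flags per position.
    return adjoin_splicing_outcomes(predictions, transcript)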
geney/transcripts.py
ADDED
# oncosplice/transcripts.py
from __future__ import annotations

from typing import Dict, Iterable, Tuple

from .splicing_table import adjoin_splicing_outcomes, predict_splicing


class TranscriptLibrary:
    """
    Holds a reference transcript and mutated variants derived from a MutationalEvent.

    _transcripts: {'ref': ref_transcript, 'event': all_mutations, 'mut1': first mutation, ...}
    """

    def __init__(self, reference_transcript, mutations: Iterable[Tuple[int, str, str]]):
        self.ref = reference_transcript.clone()
        self.event = reference_transcript.clone()
        self._transcripts: Dict[str, object] = {"ref": self.ref, "event": self.event}

        # Materialize the iterable so it can be sized and iterated safely.
        mutations = list(mutations)
        for i, (pos, ref, alt) in enumerate(mutations):
            self.event.pre_mrna.apply_mutations((pos, ref, alt))
            if len(mutations) > 1:
                t = reference_transcript.clone()
                t.pre_mrna.apply_mutations((pos, ref, alt))
                name = f"mut{i+1}"
                self._transcripts[name] = t
                setattr(self, name, t)

    def predict_splicing(self, pos, engine: str = "spliceai", inplace: bool = False):
        """
        Run splicing predictions for all transcripts at a genomic position and
        combine them into a single multi-index DataFrame, stored in
        self.splicing_results. Returns self when inplace=True, otherwise the
        combined DataFrame.
        """
        splicing_predictions = {
            k: predict_splicing(t.pre_mrna, pos, engine=engine)
            for k, t in self._transcripts.items()
        }
        self.splicing_results = adjoin_splicing_outcomes(splicing_predictions, self.ref)
        if inplace:
            return self

        return self.splicing_results

    def get_event_columns(self, event_name: str, sites=("donors", "acceptors")):
        """
        Extract selected columns for a given event label ('event', 'mut1', etc.).
        Returns a DataFrame subset of self.splicing_results.
        """
        if not hasattr(self, "splicing_results"):
            raise ValueError("You must run predict_splicing() first.")

        metrics = (f"{event_name}_prob", "ref_prob", "annotated")
        cols = [(site, metric) for site in sites for metric in metrics]
        return self.splicing_results.loc[:, cols]

    def __getitem__(self, key):
        return self._transcripts[key]

    def __iter__(self):
        return iter(self._transcripts.items())
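Tying the three modules together, one plausible end-to-end flow is sketched below. The `reference_transcript` object (with `.clone()`, a `.pre_mrna`, and the splice-site attributes `SpliceSimulator` reads) is a hypothetical input; only the `TranscriptLibrary` and `SpliceSimulator` calls come from the code in this diff.

# Hypothetical end-to-end flow; reference_transcript is an assumed input,
# not constructed by any API shown here.
from geney.splice_graph import SpliceSimulator
from geney.transcripts import TranscriptLibrary

def analyze_event(reference_transcript, mutations, engine="spliceai"):
    # mutations: list of (position, ref, alt) triples
    library = TranscriptLibrary(reference_transcript, mutations)
    # Multi-index table with ref_prob/event_prob columns per splice site.
    results = library.predict_splicing(mutations[0][0], engine=engine)
    sim = SpliceSimulator(results, library.ref, max_distance=500_000)
    for isoform, md in sim.get_viable_transcripts(metadata=True):
        print(md["isoform_id"], md["summary"], md["isoform_prevalence"])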