PyPI - geney - Versions diffs - 1.4.40__py3-none-any.whl → 1.4.41__py3-none-any.whl - Mend

geney 1.4.40__py3-none-any.whl → 1.4.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

geney/__init__.py +18 -5
geney/engines.py +313 -204
geney/pipelines.py +88 -46
geney/splice_graph.py +213 -7
geney/transcripts.py +1 -1
{geney-1.4.40.dist-info → geney-1.4.41.dist-info}/METADATA +2 -1
geney-1.4.41.dist-info/RECORD +11 -0
geney/samples.py +0 -3
geney/splicing_table.py +0 -142
geney/utils.py +0 -254
geney-1.4.40.dist-info/RECORD +0 -14
{geney-1.4.40.dist-info → geney-1.4.41.dist-info}/WHEEL +0 -0
{geney-1.4.40.dist-info → geney-1.4.41.dist-info}/top_level.txt +0 -0

geney/pipelines.py CHANGED Viewed

@@ -4,36 +4,24 @@ from __future__ import annotations
 from datetime import datetime
 import pandas as pd
-from seqmat import Gene  # external dependency
+from seqmat import Gene
 from .splice_graph import SpliceSimulator
 from .transcripts import TranscriptLibrary
 from .variants import MutationalEvent
-from .oncosplice import Oncosplice  # your existing oncosplice core
+from .oncosplice import Oncosplice
-def max_splicing_delta(mut_id, transcript_id=None, splicing_engine='spliceai', organism='hg38'):
-    print("we are here")
-    m = MutationalEvent(mut_id)
-    assert m.compatible(), 'Mutations in event are incompatible'
-    reference_transcript = Gene.from_file(
-        m.gene, organism=organism).transcript(transcript_id).generate_pre_mrna().generate_mature_mrna().generate_protein()
-    tl = TranscriptLibrary(reference_transcript, m)
-    splicing_results = tl.predict_splicing(m.position, engine=splicing_engine, inplace=True).get_event_columns('event')
-    ss = SpliceSimulator(splicing_results, tl.event, feature='event', max_distance=100_000_000)
-    return ss.max_splicing_delta('event_prob')
-def oncosplice_pipeline_single_transcript(
+def oncosplice_pipeline(
     mut_id: str,
     transcript_id: str | None = None,
     splicing_engine: str = "spliceai",
     organism: str = "hg38",
 ) -> pd.DataFrame:
     """
-    High-level pipeline:
-      mutation event -> transcript -> splicing -> splice graph -> isoforms -> oncosplice scores
+    Run the full oncosplice pipeline for a mutation.
+    Returns DataFrame with all viable isoforms and their oncosplice scores.
     """
     m = MutationalEvent(mut_id)
     assert m.compatible(), "Mutations in event are incompatible"
@@ -47,7 +35,6 @@ def oncosplice_pipeline_single_transcript(
     )
     tl = TranscriptLibrary(reference_transcript, m)
     central_pos = m.central_position
     tl.predict_splicing(central_pos, engine=splicing_engine, inplace=True)
@@ -57,18 +44,16 @@ def oncosplice_pipeline_single_transcript(
         splicing_results, tl.event, feature="event", max_distance=100_000_000
     )
-    base_report = pd.Series(
-        {
-            "mut_id": mut_id,
-            "gene": m.gene,
-            "transcript_id": reference_transcript.transcript_id,
-            "primary_transcript": reference_transcript.primary_transcript,
-            "splicing_engine": splicing_engine,
-            "central_position": central_pos,
-            "mutation_count": len(m.positions),
-            "time_of_execution": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-        }
-    )
+    base_report = pd.Series({
+        "mut_id": mut_id,
+        "gene": m.gene,
+        "transcript_id": reference_transcript.transcript_id,
+        "primary_transcript": reference_transcript.primary_transcript,
+        "splicing_engine": splicing_engine,
+        "central_position": central_pos,
+        "mutation_count": len(m.positions),
+        "time_of_execution": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    })
     ss_metadata = ss.report(central_pos)
     rows = []
@@ -79,19 +64,76 @@ def oncosplice_pipeline_single_transcript(
             reference_transcript.cons_vector,
         )
         rows.append(
-            pd.concat(
-                [
-                    base_report,
-                    ss_metadata,
-                    isoform_metadata,
-                    pd.Series(
-                        {
-                            "reference_mrna": reference_transcript.mature_mrna.seq,
-                            "variant_mrna": variant_transcript.mature_mrna.seq,
-                        }
-                    ),
-                    onco.get_analysis_series(),
-                ]
-            )
+            pd.concat([
+                base_report,
+                ss_metadata,
+                isoform_metadata,
+                pd.Series({
+                    "reference_mrna": reference_transcript.mature_mrna.seq,
+                    "variant_mrna": variant_transcript.mature_mrna.seq,
+                }),
+                onco.get_analysis_series(),
+            ])
         )
-    return pd.DataFrame(rows)
+    return pd.DataFrame(rows)
+def oncosplice_top_isoform(
+    mut_id: str,
+    transcript_id: str | None = None,
+    splicing_engine: str = "spliceai",
+    organism: str = "hg38",
+) -> pd.Series | None:
+    """
+    Get the most likely non-reference isoform for a mutation.
+    Returns Series with full oncosplice analysis, or None if no missplicing detected.
+    """
+    df = oncosplice_pipeline(mut_id, transcript_id, splicing_engine, organism)
+    if df.empty:
+        return None
+    variants = df[df["summary"] != "-"]
+    if variants.empty:
+        return None
+    return variants.iloc[0]
+def max_splicing_delta(
+    mut_id: str,
+    transcript_id: str | None = None,
+    splicing_engine: str = "spliceai",
+    organism: str = "hg38",
+) -> float:
+    """
+    Get the maximum splice site probability change for a mutation.
+    """
+    m = MutationalEvent(mut_id)
+    assert m.compatible(), "Mutations in event are incompatible"
+    reference_transcript = (
+        Gene.from_file(m.gene, organism=organism)
+        .transcript(transcript_id)
+        .generate_pre_mrna()
+        .generate_mature_mrna()
+        .generate_protein()
+    )
+    tl = TranscriptLibrary(reference_transcript, m)
+    splicing_results = tl.predict_splicing(
+        m.central_position, engine=splicing_engine, inplace=True
+    ).get_event_columns("event")
+    ss = SpliceSimulator(
+        splicing_results, tl.event, feature="event", max_distance=100_000_000
+    )
+    return ss.max_splicing_delta("event_prob")
+# Keep old name for backwards compatibility
+oncosplice_pipeline_single_transcript = oncosplice_pipeline

geney/splice_graph.py CHANGED Viewed

@@ -2,13 +2,18 @@
 from __future__ import annotations
 from collections import defaultdict
-from typing import Any, Dict, Generator, List, Tuple
+import hashlib
+from typing import Dict, Generator, List, Tuple
 import numpy as np
 import pandas as pd
 from pandas import Series
-from .utils import short_hash_of_list  # type: ignore
+def _short_hash(items: Tuple) -> str:
+    """Generate a short hash string from a tuple."""
+    encoded = repr(items).encode('utf-8')
+    return hashlib.sha256(encoded).hexdigest()[:8]
 class SpliceSimulator:
@@ -90,6 +95,65 @@ class SpliceSimulator:
         metadata["missplicing"] = self.max_splicing_delta("event_prob")
         return metadata
+    def summarize_events(self, threshold: float = 0.2) -> str:
+        """
+        Generate human-readable summary of splice site changes.
+        Returns text describing discovered and deleted donors/acceptors.
+        Format: "D(position) ref_prob -> event_prob" or "A(position) ref_prob -> event_prob"
+        """
+        feature_col = f"{self.feature}_prob"
+        lines = []
+        # Process donors
+        donor_df = self.donor_df
+        discovered_donors = donor_df[donor_df["discovered_delta"].abs() >= threshold]
+        deleted_donors = donor_df[donor_df["deleted_delta"].abs() >= threshold]
+        if len(discovered_donors) > 0 or len(deleted_donors) > 0:
+            lines.append("=== DONORS ===")
+            if len(discovered_donors) > 0:
+                lines.append("Discovered:")
+                for pos, row in discovered_donors.iterrows():
+                    ref = row.get("ref_prob", 0)
+                    evt = row.get(feature_col, row.get("event_prob", 0))
+                    lines.append(f"  D({pos}) {ref:.2f} -> {evt:.2f} [+{evt-ref:.2f}]")
+            if len(deleted_donors) > 0:
+                lines.append("Deleted:")
+                for pos, row in deleted_donors.iterrows():
+                    ref = row.get("ref_prob", 0)
+                    evt = row.get(feature_col, row.get("event_prob", 0))
+                    lines.append(f"  D({pos}) {ref:.2f} -> {evt:.2f} [{evt-ref:.2f}]")
+        # Process acceptors
+        acceptor_df = self.acceptor_df
+        discovered_acceptors = acceptor_df[acceptor_df["discovered_delta"].abs() >= threshold]
+        deleted_acceptors = acceptor_df[acceptor_df["deleted_delta"].abs() >= threshold]
+        if len(discovered_acceptors) > 0 or len(deleted_acceptors) > 0:
+            lines.append("=== ACCEPTORS ===")
+            if len(discovered_acceptors) > 0:
+                lines.append("Discovered:")
+                for pos, row in discovered_acceptors.iterrows():
+                    ref = row.get("ref_prob", 0)
+                    evt = row.get(feature_col, row.get("event_prob", 0))
+                    lines.append(f"  A({pos}) {ref:.2f} -> {evt:.2f} [+{evt-ref:.2f}]")
+            if len(deleted_acceptors) > 0:
+                lines.append("Deleted:")
+                for pos, row in deleted_acceptors.iterrows():
+                    ref = row.get("ref_prob", 0)
+                    evt = row.get(feature_col, row.get("event_prob", 0))
+                    lines.append(f"  A({pos}) {ref:.2f} -> {evt:.2f} [{evt-ref:.2f}]")
+        if not lines:
+            return "No significant splice site changes detected."
+        return "\n".join(lines)
     def max_splicing_delta(self, event: str) -> float:
         all_diffs = []
         for site_type in ["donors", "acceptors"]:
@@ -236,6 +300,110 @@ class SpliceSimulator:
         paths.sort(key=lambda x: x[1], reverse=True)
         return paths
+    def isoforms_df(self) -> pd.DataFrame:
+        """
+        Return a DataFrame of all viable isoforms with probabilities and missplicing descriptions.
+        Columns:
+            - isoform_id: unique hash of the splice path
+            - probability: probability/prevalence of this isoform
+            - splicing_changes: short missplicing event codes (ES, IR, PES, PIR, NE, or "-" for canonical)
+            - exon_skipping: full exon skipping details
+            - partial_exon_skipping: partial exon skipping (truncation) details
+            - intron_retention: full intron retention details
+            - partial_intron_retention: partial intron retention details
+            - novel_exon: novel/cryptic exon details
+        """
+        rows = []
+        for t, md in self.get_viable_transcripts(metadata=True):
+            rows.append({
+                "isoform_id": md.get("isoform_id", ""),
+                "probability": md.get("isoform_prevalence", 0.0),
+                "splicing_changes": md.get("summary", "-"),
+                "exon_skipping": md.get("es", ""),
+                "partial_exon_skipping": md.get("pes", ""),
+                "intron_retention": md.get("ir", ""),
+                "partial_intron_retention": md.get("pir", ""),
+                "novel_exon": md.get("ne", ""),
+            })
+        if not rows:
+            return pd.DataFrame()
+        return pd.DataFrame(rows)
+    def _is_implausible_ir_path(self, var_transcript) -> bool:
+        """
+        Check if this transcript has intron retention that is implausible
+        because nearby cryptic splice sites compensate for the lost original sites.
+        Returns True if the path should be filtered out.
+        Key insight: If the variant uses ANY splice site near a reference intron
+        boundary, the intron is being spliced (possibly at a shifted position).
+        True IR only occurs when NO splice sites are used near BOTH boundaries.
+        """
+        ref_introns = getattr(self.transcript, "introns", [])
+        if not ref_introns:
+            return False
+        TOLERANCE = 500  # bp - consider splice sites within this distance as "covering" the boundary
+        MIN_TOTAL_PROB = 0.5  # if total prob >= this, cryptic sites could compensate
+        var_donors = set(var_transcript.donors)
+        var_acceptors = set(var_transcript.acceptors)
+        donor_df = self.donor_df
+        acceptor_df = self.acceptor_df
+        for t1, t2 in ref_introns:
+            # Determine which end is donor and which is acceptor based on strand
+            if not self.rev:
+                donor_pos, acceptor_pos = t1, t2  # + strand
+            else:
+                donor_pos, acceptor_pos = t2, t1  # - strand
+            # Check if variant uses ANY donor near the reference donor position
+            donor_used = any(
+                abs(d - donor_pos) <= TOLERANCE
+                for d in var_donors
+            )
+            # Check if variant uses ANY acceptor near the reference acceptor position
+            acceptor_used = any(
+                abs(a - acceptor_pos) <= TOLERANCE
+                for a in var_acceptors
+            )
+            # If both boundaries are used (possibly at shifted positions),
+            # the intron is being spliced out - NOT retained
+            if donor_used and acceptor_used:
+                continue
+            # At least one boundary is NOT used - this path has potential IR
+            # Check if cryptic sites with high probability exist but aren't being used
+            # (which would make this IR path implausible)
+            nearby_donors = donor_df.loc[
+                (donor_df.index >= donor_pos - TOLERANCE) &
+                (donor_df.index <= donor_pos + TOLERANCE)
+            ]
+            total_donor_prob = nearby_donors["P"].sum() if len(nearby_donors) > 0 else 0
+            nearby_acceptors = acceptor_df.loc[
+                (acceptor_df.index >= acceptor_pos - TOLERANCE) &
+                (acceptor_df.index <= acceptor_pos + TOLERANCE)
+            ]
+            total_acceptor_prob = nearby_acceptors["P"].sum() if len(nearby_acceptors) > 0 else 0
+            # If both boundaries have high probability cryptic sites available,
+            # but this path doesn't use them, the IR is implausible
+            if total_donor_prob >= MIN_TOTAL_PROB and total_acceptor_prob >= MIN_TOTAL_PROB:
+                return True
+        return False
     def get_viable_transcripts(self, metadata: bool = False):
         graph = self.generate_graph()
         start_node = (self.transcript_start, "transcript_start")
@@ -251,8 +419,13 @@ class SpliceSimulator:
             t.donors = [d for d in donors if d != t.transcript_end]
             t.acceptors = [a for a in acceptors if a != t.transcript_start]
             t.path_weight = prob
-            t.path_hash = short_hash_of_list(tuple(donors + acceptors))
+            t.path_hash = _short_hash(tuple(donors + acceptors))
             t.generate_mature_mrna().generate_protein()
+            # Filter out implausible IR paths (where cryptic sites compensate)
+            if self._is_implausible_ir_path(t):
+                continue
             if metadata:
                 md = pd.concat(
                     [
@@ -306,7 +479,9 @@ class SpliceSimulator:
         num_ref_introns = len(ref_introns)
         pes, pir, es, ne, ir = [], [], [], [], []
+        pir_intron_indices = set()  # Track which introns have PIR
+        # Partial exon skipping (exon truncation)
         for exon_count, (t1, t2) in enumerate(ref_exons):
             for (s1, s2) in var_exons:
                 if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
@@ -316,30 +491,61 @@ class SpliceSimulator:
                         f"Exon {exon_count+1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}"
                     )
+        # Partial intron retention (one boundary preserved, other shifted)
         for intron_count, (t1, t2) in enumerate(ref_introns):
             for (s1, s2) in var_introns:
+                # Check if one boundary matches and the intron is shorter (partial retention)
                 if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
                     ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))
                 ):
                     pir.append(
                         f"Intron {intron_count+1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}"
                     )
+                    pir_intron_indices.add(intron_count)
+        # Exon skipping (both boundaries missing)
         for exon_count, (t1, t2) in enumerate(ref_exons):
             if t1 not in var.acceptors and t2 not in var.donors:
                 es.append(
                     f"Exon {exon_count+1}/{num_ref_exons} skipped: {(t1, t2)}"
                 )
+        # Novel exon (boundaries not in reference)
         for (s1, s2) in var_exons:
             if s1 not in ref.acceptors and s2 not in ref.donors:
                 ne.append(f"Novel Exon: {(s1, s2)}")
+        # Full intron retention - only if NOT already partial retention
+        # AND no splice sites are being used near the intron boundaries
+        TOLERANCE = 500  # bp - consider splice sites within this distance as "covering" the boundary
         for intron_count, (t1, t2) in enumerate(ref_introns):
-            if t1 not in var.donors and t2 not in var.acceptors:
-                ir.append(
-                    f"Intron {intron_count+1}/{num_ref_introns} retained: {(t1, t2)}"
-                )
+            if intron_count in pir_intron_indices:
+                continue  # Already classified as PIR
+            # Check if the intron is preserved exactly in variant
+            intron_preserved = any(s1 == t1 and s2 == t2 for s1, s2 in var_introns)
+            if intron_preserved:
+                continue  # Intron is properly spliced
+            # Determine donor/acceptor positions based on strand
+            if not ref.rev:
+                donor_pos, acceptor_pos = t1, t2  # + strand
+            else:
+                donor_pos, acceptor_pos = t2, t1  # - strand
+            # Check if variant uses ANY splice site near each boundary
+            # If so, the intron is being spliced (at shifted positions), not retained
+            donor_used = any(abs(d - donor_pos) <= TOLERANCE for d in var.donors)
+            acceptor_used = any(abs(a - acceptor_pos) <= TOLERANCE for a in var.acceptors)
+            if donor_used and acceptor_used:
+                continue  # Intron is being spliced at shifted positions, not retained
+            # If we get here, the intron is truly retained
+            ir.append(
+                f"Intron {intron_count+1}/{num_ref_introns} retained: {(t1, t2)}"
+            )
         return ",".join(pes), ",".join(pir), ",".join(es), ",".join(ne), ",".join(ir)

geney/transcripts.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 from typing import Dict, Iterable, List, Tuple, Optional
-from .splicing_table import adjoin_splicing_outcomes, predict_splicing
+from .engines import adjoin_splicing_outcomes, predict_splicing
 class TranscriptLibrary:

{geney-1.4.40.dist-info → geney-1.4.41.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: geney
-Version: 1.4.40
+Version: 1.4.41
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn
@@ -22,6 +22,7 @@ Requires-Dist: tensorflow>=2.8.0
 Requires-Dist: keras>=2.8.0
 Requires-Dist: torch
 Requires-Dist: seqmat
+Requires-Dist: h5py
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier

geney-1.4.41.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+geney/__init__.py,sha256=nkhniqCNWJzrb7xHgTDFEXSvRVdggb9ZCJ7ih7HEYq8,966
+geney/engines.py,sha256=9_oNsoluJsjdLC3cyWttjHF3cuQoy65FWgS4r7ehzek,14296
+geney/oncosplice.py,sha256=eGQQl9ftmoFENMYBWoJtenKWmzyxR9N1of5cZst_bHQ,18014
+geney/pipelines.py,sha256=gsy-gmHIi260SC5MKQ9IBSE0wko8Tvd7IC3wj083mPQ,3996
+geney/splice_graph.py,sha256=PANtLUAQiz578NZwxVlTSgboetnToHnQSkYpT0zbi_w,23931
+geney/transcripts.py,sha256=BBgyeqF4jeIiHaD_bXxgOTXz19kdUgjcPVo4ClpcSUg,2594
+geney/variants.py,sha256=vjbiBH-duZ4TJZyXwXbQ_VmJxCFafjeDwLNTZg3ubSc,11832
+geney-1.4.41.dist-info/METADATA,sha256=zuzWKIEeHSaFr08eRUjq3ZSiloOepcCD_QRG5ifS8j0,972
+geney-1.4.41.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+geney-1.4.41.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.4.41.dist-info/RECORD,,

geney/samples.py DELETED Viewed

@@ -1,3 +0,0 @@
-mut_id = 'KRAS:12:25227343:G:T'
-epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'

geney/splicing_table.py DELETED Viewed

@@ -1,142 +0,0 @@
-# oncosplice/splicing_table.py
-from __future__ import annotations
-from typing import Dict, Optional, Union
-import numpy as np
-import pandas as pd
-from .engines import run_splicing_engine
-def predict_splicing(s, position: int, engine: str = 'spliceai', context: int = 7500,
-                    ) -> Union['SeqMat', pd.DataFrame]:
-    """
-    Predict splicing probabilities at a given position using the specified engine.
-    Args:
-        position (int): The genomic position to predict splicing probabilities for.
-        engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
-        context (int): The length of the target central region (default: 7500).
-        format (str): Output format for the splicing engine results.
-    Returns:
-        pd.DataFrame: A DataFrame containing:
-            - position: The genomic position
-            - donor_prob: Probability of being a donor splice site
-            - acceptor_prob: Probability of being an acceptor splice site
-            - nucleotides: The nucleotide sequence at that position
-    Raises:
-        ValueError: If an unsupported engine is provided.
-        IndexError: If the position is not found in the sequence.
-    """
-    # Validate position is within sequence bounds
-    if position < s.index.min() or position > s.index.max():
-        raise ValueError(f"Position {position} is outside sequence bounds [{s.index.min()}, {s.index.max()}]")
-    # Retrieve extended context (includes flanks) around the position.
-    target = s.clone(position - context, position + context)
-    # Check if target clone resulted in empty sequence
-    if len(target.seq) == 0:
-        raise ValueError(f"No sequence data found around position {position} with context {context}")
-    seq, indices = target.seq, target.index
-    # Validate indices array is not empty
-    if len(indices) == 0:
-        raise ValueError(f"No indices found in sequence around position {position}")
-    # Find relative position within the context window
-    rel_pos = np.abs(indices - position).argmin()
-    left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
-    # print(left_missing, right_missing)
-    if left_missing > 0 or right_missing > 0:
-        step = -1 if s.rev else 1
-        if left_missing > 0:
-            left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
-        else:
-            left_pad = np.array([], dtype=indices.dtype)
-        if right_missing > 0:
-            right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
-        else:
-            right_pad = np.array([], dtype=indices.dtype)
-        seq = 'N' * left_missing + seq + 'N' * right_missing
-        indices = np.concatenate([left_pad, indices, right_pad])
-    # Run the splicing prediction engine (function assumed to be defined externally)
-    donor_probs, acceptor_probs = run_splicing_engine(seq=seq, engine=engine)
-    # Trim off the fixed flanks before returning results.
-    seq = seq[5000:-5000]
-    indices = indices[5000:-5000]
-    df = pd.DataFrame({
-        'position': indices,
-        'donor_prob': donor_probs,
-        'acceptor_prob': acceptor_probs,
-        'nucleotides': list(seq)
-    }).set_index('position').round(3)
-    df.attrs['name'] = s.name
-    return df
-def adjoin_splicing_outcomes(
-    splicing_predictions: Dict[str, pd.DataFrame],
-    transcript: Optional[object] = None,
-) -> pd.DataFrame:
-    """
-    Combine splicing predictions for multiple mutations into a multi-index DataFrame.
-    splicing_predictions: {label -> DF with 'donor_prob','acceptor_prob','nucleotides'}
-    transcript: optional transcript (must have .acceptors, .donors, .rev)
-    """
-    if not splicing_predictions:
-        raise ValueError("splicing_predictions cannot be empty")
-    dfs = []
-    for label, df in splicing_predictions.items():
-        if not isinstance(df, pd.DataFrame):
-            raise TypeError(f"Expected DataFrame for '{label}', got {type(df).__name__}")
-        required_cols = ["donor_prob", "acceptor_prob", "nucleotides"]
-        missing = [c for c in required_cols if c not in df.columns]
-        if missing:
-            raise ValueError(
-                f"DataFrame for '{label}' missing required columns: {missing}"
-            )
-        var_df = df.rename(
-            columns={
-                "donor_prob": ("donors", f"{label}_prob"),
-                "acceptor_prob": ("acceptors", f"{label}_prob"),
-                "nucleotides": ("nts", f"{label}"),
-            }
-        )
-        dfs.append(var_df)
-    try:
-        full_df = pd.concat(dfs, axis=1)
-    except Exception as e:
-        raise ValueError(f"Failed to concatenate DataFrames: {e}") from e
-    if not isinstance(full_df.columns, pd.MultiIndex):
-        full_df.columns = pd.MultiIndex.from_tuples(full_df.columns)
-    if transcript is not None:
-        full_df[("acceptors", "annotated")] = full_df.apply(
-            lambda row: row.name in transcript.acceptors, axis=1
-        )
-        full_df[("donors", "annotated")] = full_df.apply(
-            lambda row: row.name in transcript.donors, axis=1
-        )
-        full_df.sort_index(axis=1, level=0, inplace=True)
-        full_df.sort_index(ascending=not transcript.rev, inplace=True)
-    else:
-        full_df.sort_index(axis=1, level=0, inplace=True)
-    return full_df

geney 1.4.40__py3-none-any.whl → 1.4.41__py3-none-any.whl

geney 1.4.40__py3-none-any.whl → 1.4.41__py3-none-any.whl