PyPI - LZGraphs - Versions diffs - 2.0.0__tar.gz → 2.1.1__tar.gz - Mend

LZGraphs 2.0.0.tar.gz → 2.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{lzgraphs-2.0.0 → lzgraphs-2.1.1}/CHANGELOG.md RENAMED Viewed

@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Custom exceptions module with comprehensive exception hierarchy for better error handling
-- Information-theoretic metrics module (`LZGraphs.Metrics.entropy`)
+- Information-theoretic metrics module (`LZGraphs.metrics.entropy`)
   - `node_entropy()` - Shannon entropy of node probability distribution
   - `edge_entropy()` - Shannon entropy of edge transition probabilities
   - `graph_entropy()` - Combined graph entropy measure

{lzgraphs-2.0.0 → lzgraphs-2.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: LZGraphs
-Version: 2.0.0
+Version: 2.1.1
 Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
 Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
 Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>

{lzgraphs-2.0.0 → lzgraphs-2.1.1}/src/LZGraphs/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "2.0.0"
+__version__ = "2.1.1"
 # =============================================================================
 # Graph classes
@@ -44,6 +44,13 @@ from .metrics.entropy import (
     cross_entropy,
     kl_divergence,
     mutual_information_genes,
+    transition_predictability,
+    graph_compression_ratio,
+    repertoire_compressibility_index,
+    transition_kl_divergence,
+    transition_jsd,
+    transition_mutual_information_profile,
+    path_entropy_rate,
 )
 # =============================================================================
@@ -56,6 +63,11 @@ from .metrics.saturation import NodeEdgeSaturationProbe
 # =============================================================================
 from .metrics.convenience import compare_repertoires
+# =============================================================================
+# Metrics - PGen Distribution
+# =============================================================================
+from .metrics.pgen_distribution import LZPgenDistribution, compare_lzpgen_distributions
 # =============================================================================
 # Utilities
 # =============================================================================
@@ -145,10 +157,20 @@ __all__ = [
     'cross_entropy',
     'kl_divergence',
     'mutual_information_genes',
+    'transition_predictability',
+    'graph_compression_ratio',
+    'repertoire_compressibility_index',
+    'transition_kl_divergence',
+    'transition_jsd',
+    'transition_mutual_information_profile',
+    'path_entropy_rate',
     # Saturation
     'NodeEdgeSaturationProbe',
     # Convenience
     'compare_repertoires',
+    # PGen distribution
+    'LZPgenDistribution',
+    'compare_lzpgen_distributions',
     # Utilities
     'generate_kmer_dictionary',
     'lempel_ziv_decomposition',

{lzgraphs-2.0.0 → lzgraphs-2.1.1}/src/LZGraphs/graphs/amino_acid_positional.py RENAMED Viewed

@@ -1,5 +1,4 @@
 import logging
-import re
 import time
 from typing import List, Tuple, Union, Optional, Generator
@@ -10,7 +9,7 @@ from tqdm.auto import tqdm
 from .lz_graph_base import LZGraphBase
 from ..utilities.decomposition import lempel_ziv_decomposition
-from ..utilities.misc import window, choice
+from ..utilities.misc import window
 from ..exceptions import (
     EmptyDataError,
     MissingColumnError,
@@ -144,18 +143,10 @@ class AAPLZGraph(LZGraphBase):
         self._normalize_edge_weights()
         self.verbose_driver(3, verbose)
-        # If gene data is available, normalize gene weights in parallel
-        if self.genetic:
-            self._batch_gene_weight_normalization(verbose=verbose)
-            self.verbose_driver(4, verbose)
         # Additional map derivations
         self.edges_list = None
-        self._derive_terminal_state_map()
-        self.verbose_driver(7, verbose)
         self._derive_stop_probability_data()
-        self.verbose_driver(8, verbose)
-        self.verbose_driver(5, verbose)
+        self.verbose_driver(9, verbose)
         # Optionally compute the PGEN for each sequence
         if calculate_trainset_pgen:
@@ -306,8 +297,8 @@ class AAPLZGraph(LZGraphBase):
         """
         Given a sub-pattern that might look like "ABC_10", extract only the amino acids ("ABC").
         """
-        match = re.search(r'[A-Z]+', base)
-        return match.group(0) if match else ""
+        idx = base.rfind('_')
+        return base[:idx] if idx > 0 else base
     def _decomposed_sequence_generator(
         self,
@@ -416,16 +407,16 @@ class AAPLZGraph(LZGraphBase):
                 val = np.finfo(float).eps if use_epsilon else 0.0
                 return (val, val)
-            e_data = self.graph[step1][step2]
+            ed = self.graph[step1][step2]['data']
             # If these genes aren't on the edge, it's effectively 0
-            if v not in e_data or j not in e_data:
+            if not ed.has_gene(v) or not ed.has_gene(j):
                 if verbose:
                     logger.warning(f"Edge {step1}->{step2} missing {v} or {j}.")
                 val = np.finfo(float).eps if use_epsilon else 0.0
                 return (val, val)
-            proba_v *= e_data[v]
-            proba_j *= e_data[j]
+            proba_v *= ed.v_probability(v)
+            proba_j *= ed.j_probability(j)
         return proba_v, proba_j
@@ -484,19 +475,18 @@ class AAPLZGraph(LZGraphBase):
                     logger.warning(f"Current state {current_state} not in graph.")
                     break
-                edge_info = pd.DataFrame(dict(self.graph[current_state]))
+                # Get edges that have both selected V and J genes
+                edges = self.outgoing_edges(current_state)
                 # Apply blacklist if present
                 if (current_state, selected_v, selected_j) in self.genetic_walks_black_list:
                     blacklisted = self.genetic_walks_black_list[(current_state, selected_v, selected_j)]
-                    edge_info = edge_info.drop(columns=blacklisted, errors="ignore")
+                    edges = {nb: ed for nb, ed in edges.items() if nb not in blacklisted}
-                # Check for presence of selected V/J genes
-                # We'll consider edges that contain both selected_v and selected_j
-                # in the attribute keys
-                sub_df = edge_info.T[[selected_v, selected_j]].dropna(how="any") if \
-                    {selected_v, selected_j}.issubset(edge_info.index) else pd.DataFrame()
+                # Filter to edges containing both V and J genes
+                valid_edges = {nb: ed for nb, ed in edges.items()
+                               if ed.has_gene(selected_v) and ed.has_gene(selected_j)}
-                if sub_df.empty:
+                if not valid_edges:
                     # No valid edges
                     if len(walk) > 2:
                         prev_state = walk[-2]
@@ -510,10 +500,11 @@ class AAPLZGraph(LZGraphBase):
                         selected_v, selected_j = self._select_random_vj_genes(vj_init)
                     continue
-                # Weighted choice among these edges
-                w = edge_info.loc["weight", sub_df.index]
-                w /= w.sum()
-                if w.empty:
+                # Weighted choice among valid edges
+                nbs = list(valid_edges.keys())
+                weights = np.array([valid_edges[nb].weight for nb in nbs])
+                w_sum = weights.sum()
+                if w_sum == 0:
                     # Again, no valid edges
                     if len(walk) > 2:
                         prev_state = walk[-2]
@@ -527,7 +518,8 @@ class AAPLZGraph(LZGraphBase):
                         selected_v, selected_j = self._select_random_vj_genes(vj_init)
                     continue
-                current_state = np.random.choice(w.index, p=w.values)
+                weights /= w_sum
+                current_state = np.random.choice(nbs, p=weights)
                 walk.append(current_state)
             results.append((walk, selected_v, selected_j))
@@ -594,7 +586,8 @@ class AAPLZGraph(LZGraphBase):
         to_drop = []
         for src, dst, attrs in self.edges_list:
-            if (v not in attrs) or (j not in attrs):
+            ed = attrs.get('data')
+            if ed is None or not (ed.has_gene(v) and ed.has_gene(j)):
                 to_drop.append((src, dst))
         G = self.graph.copy()
@@ -632,15 +625,14 @@ class AAPLZGraph(LZGraphBase):
             self.genetic_walks_black_list = {}
         while current_state not in final_states:
-            edge_info = pd.DataFrame(dict(G[current_state]))
+            # Get outgoing edges from the gene subgraph
+            edges = {nb: G[current_state][nb]['data'] for nb in G[current_state]}
             # Apply blacklist
             if (selected_v, selected_j, current_state) in self.genetic_walks_black_list:
-                edge_info = edge_info.drop(
-                    columns=self.genetic_walks_black_list[(selected_v, selected_j, current_state)],
-                    errors="ignore"
-                )
+                blacklisted = self.genetic_walks_black_list[(selected_v, selected_j, current_state)]
+                edges = {nb: ed for nb, ed in edges.items() if nb not in blacklisted}
-            if edge_info.shape[1] == 0:
+            if not edges:
                 if len(walk) > 1:
                     prev_state = walk[-2]
                     blacklisted_cols = self.genetic_walks_black_list.get((selected_v, selected_j, prev_state), [])
@@ -649,14 +641,13 @@ class AAPLZGraph(LZGraphBase):
                     walk.pop()
                     current_state = walk[-1]
                 else:
-                    # Stuck at the start
                     break
                 continue
-            sub_df = edge_info.T[[selected_v, selected_j]].dropna(how="any") if \
-                {selected_v, selected_j}.issubset(edge_info.index) else pd.DataFrame()
-            if sub_df.empty:
-                # No valid edges — backtrack or break
+            # Filter to edges containing both V and J genes
+            valid_edges = {nb: ed for nb, ed in edges.items()
+                           if ed.has_gene(selected_v) and ed.has_gene(selected_j)}
+            if not valid_edges:
                 if len(walk) > 1:
                     prev_state = walk[-2]
                     blacklisted_cols = self.genetic_walks_black_list.get((selected_v, selected_j, prev_state), [])
@@ -668,9 +659,10 @@ class AAPLZGraph(LZGraphBase):
                     break
                 continue
-            w = edge_info.loc["weight", sub_df.index]
-            w /= w.sum()
-            next_state = np.random.choice(w.index, p=w.values)
+            nbs = list(valid_edges.keys())
+            weights = np.array([valid_edges[nb].weight for nb in nbs])
+            weights /= weights.sum()
+            next_state = np.random.choice(nbs, p=weights)
             walk.append(next_state)
             current_state = next_state

lzgraphs-2.1.1/src/LZGraphs/graphs/edge_data.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""
+EdgeData: Encapsulates all data for a single directed edge in an LZGraph.
+Raw counts are the source of truth. Normalized probabilities are cached
+after calling normalize() and are read-only.
+"""
+from ..utilities.misc import _is_v_gene, _is_j_gene
+__all__ = ["EdgeData"]
+class EdgeData:
+    """Store all data for a single directed edge in an LZGraph.
+    Raw counts are the source of truth. The transition probability is
+    cached by normalize() and exposed read-only through ``weight``; gene
+    probabilities are always derived on demand from the raw gene counts.
+    Attributes:
+        count (int): Raw transition count (source of truth).
+        v_genes (dict): {gene_name: raw_count} for V genes.
+        j_genes (dict): {gene_name: raw_count} for J genes.
+    """
+    __slots__ = ('count', '_weight', 'v_genes', 'j_genes')
+    def __init__(self):
+        """Create an empty edge: zero count, zero weight, no gene data."""
+        self.count = 0
+        self._weight = 0.0
+        self.v_genes = {}
+        self.j_genes = {}
+    @property
+    def weight(self):
+        """Cached transition probability P(B|A), set by normalize()."""
+        return self._weight
+    @property
+    def vsum(self):
+        """Total count of V gene observations on this edge."""
+        return sum(self.v_genes.values())
+    @property
+    def jsum(self):
+        """Total count of J gene observations on this edge."""
+        return sum(self.j_genes.values())
+    @property
+    def is_genetic(self):
+        """Whether this edge has any gene data."""
+        return bool(self.v_genes or self.j_genes)
+    def record(self, v_gene=None, j_gene=None):
+        """Record one traversal during graph construction.
+        The cached weight is not touched; call normalize() afterwards.
+        Args:
+            v_gene (str, optional): V gene to record.
+            j_gene (str, optional): J gene to record.
+        """
+        self.count += 1
+        if v_gene is not None:
+            self.v_genes[v_gene] = self.v_genes.get(v_gene, 0) + 1
+        if j_gene is not None:
+            self.j_genes[j_gene] = self.j_genes.get(j_gene, 0) + 1
+    def unrecord(self, v_gene=None, j_gene=None):
+        """Remove one traversal (for sequence removal).
+        The count never drops below zero, and a gene whose count reaches
+        zero is removed so has_gene() stops reporting it. Genes that are
+        not present on the edge are ignored.
+        Args:
+            v_gene (str, optional): V gene to decrement.
+            j_gene (str, optional): J gene to decrement.
+        """
+        self.count = max(0, self.count - 1)
+        if v_gene is not None and v_gene in self.v_genes:
+            self.v_genes[v_gene] -= 1
+            if self.v_genes[v_gene] <= 0:
+                del self.v_genes[v_gene]
+        if j_gene is not None and j_gene in self.j_genes:
+            self.j_genes[j_gene] -= 1
+            if self.j_genes[j_gene] <= 0:
+                del self.j_genes[j_gene]
+    def merge(self, other):
+        """Merge another EdgeData into this one (for graph union).
+        Only raw counts are added; the cached weight is left unchanged.
+        Args:
+            other (EdgeData): The edge data to merge in.
+        """
+        self.count += other.count
+        for g, c in other.v_genes.items():
+            self.v_genes[g] = self.v_genes.get(g, 0) + c
+        for g, c in other.j_genes.items():
+            self.j_genes[g] = self.j_genes.get(g, 0) + c
+    def normalize(self, node_frequency, alpha=0.0, n_successors=0):
+        """Compute and cache transition probability from raw count.
+        With alpha > 0 the weight is
+        (count + alpha) / (node_frequency + alpha * n_successors);
+        otherwise it is count / node_frequency. A non-positive
+        denominator yields a weight of 0.0.
+        Args:
+            node_frequency (int): Total outgoing count from source node.
+            alpha (float): Laplace smoothing parameter.
+            n_successors (int): Number of successors (for Laplace smoothing).
+        """
+        if alpha > 0:
+            denom = node_frequency + alpha * n_successors
+            self._weight = (self.count + alpha) / denom if denom > 0 else 0.0
+        elif node_frequency > 0:
+            self._weight = self.count / node_frequency
+        else:
+            self._weight = 0.0
+    def v_probability(self, gene):
+        """Return P(gene) among V genes on this edge (0.0 if absent)."""
+        vsum = self.vsum
+        return self.v_genes.get(gene, 0) / vsum if vsum > 0 else 0.0
+    def j_probability(self, gene):
+        """Return P(gene) among J genes on this edge (0.0 if absent)."""
+        jsum = self.jsum
+        return self.j_genes.get(gene, 0) / jsum if jsum > 0 else 0.0
+    def has_gene(self, gene):
+        """Check if a gene (V or J) is present on this edge."""
+        return gene in self.v_genes or gene in self.j_genes
+    def gene_dict(self):
+        """Return {gene: probability} dict for all genes on this edge.
+        V genes are normalized by the V total and J genes by the J total.
+        """
+        result = {}
+        vsum, jsum = self.vsum, self.jsum
+        for g, c in self.v_genes.items():
+            result[g] = c / vsum if vsum > 0 else 0.0
+        for g, c in self.j_genes.items():
+            result[g] = c / jsum if jsum > 0 else 0.0
+        return result
+    def to_legacy_dict(self):
+        """Convert to flat dict matching old edge attribute format.
+        'Vsum' / 'Jsum' and the per-gene entries are only emitted when the
+        corresponding gene table is non-empty.
+        Returns:
+            dict: {weight, count, Vsum, Jsum, gene_name: probability, ...}
+        """
+        d = {'weight': self._weight, 'count': self.count}
+        # Sum each gene table once. Going through v_probability() /
+        # j_probability() per gene would re-sum the table for every gene.
+        if self.v_genes:
+            vsum = self.vsum
+            d['Vsum'] = vsum
+            for g, c in self.v_genes.items():
+                d[g] = c / vsum if vsum > 0 else 0.0
+        if self.j_genes:
+            jsum = self.jsum
+            d['Jsum'] = jsum
+            for g, c in self.j_genes.items():
+                d[g] = c / jsum if jsum > 0 else 0.0
+        return d
+    @classmethod
+    def from_legacy_dict(cls, d, node_frequency=0):
+        """Reconstruct EdgeData from an old-format flat dict.
+        Used for loading old saves where edge data was stored as
+        {weight, Vsum, Jsum, gene_name: probability, ...}.
+        Raw counts are recovered by rounding probability * total, so they
+        are approximations of the original counts. Keys that neither
+        _is_v_gene nor _is_j_gene accepts are dropped, as are genes whose
+        'Vsum' / 'Jsum' total is missing or zero.
+        Args:
+            d (dict): Old-format edge attribute dictionary.
+            node_frequency (int): Per-node observed frequency for count recovery.
+        Returns:
+            EdgeData: Reconstructed edge data.
+        """
+        edge = cls()
+        edge._weight = d.get('weight', 0.0)
+        edge.count = d.get('count', 0)
+        # Old saves may lack 'count'; estimate it from the stored weight.
+        if edge.count == 0 and node_frequency > 0:
+            edge.count = int(round(edge._weight * node_frequency))
+        vsum = d.get('Vsum', 0)
+        jsum = d.get('Jsum', 0)
+        for key, val in d.items():
+            if key in ('weight', 'count', 'Vsum', 'Jsum'):
+                continue
+            if _is_v_gene(key) and vsum > 0:
+                edge.v_genes[key] = int(round(val * vsum))
+            elif _is_j_gene(key) and jsum > 0:
+                edge.j_genes[key] = int(round(val * jsum))
+        return edge
+    def __getstate__(self):
+        """Return the slot values as a tuple for pickling."""
+        return (self.count, self._weight, self.v_genes, self.j_genes)
+    def __setstate__(self, state):
+        """Restore the slot values from the tuple built by __getstate__."""
+        self.count, self._weight, self.v_genes, self.j_genes = state
+    def __eq__(self, other):
+        """Compare raw counts only; the cached weight is derived and ignored.
+        Defining __eq__ without __hash__ leaves instances unhashable.
+        """
+        if not isinstance(other, EdgeData):
+            return NotImplemented
+        return (self.count == other.count
+                and self.v_genes == other.v_genes
+                and self.j_genes == other.j_genes)
+    def __repr__(self):
+        return (f"EdgeData(count={self.count}, weight={self._weight:.4f}, "
+                f"v={len(self.v_genes)}, j={len(self.j_genes)})")

lzgraphs-2.1.1/src/LZGraphs/graphs/graph_operations.py ADDED Viewed

@@ -0,0 +1,115 @@
+import numpy as np
+import pandas as pd
+from .edge_data import EdgeData
+from ..exceptions import IncompatibleGraphsError
+__all__ = ['graph_union']
+def graph_union(graphA, graphB):
+    """Perform a union operation between two graphs.
+    graphA will be updated in-place to be the equivalent of the union
+    of both. The result is logically equal to constructing a graph from
+    the combined sequences of two separate repertoires.
+    Since EdgeData stores raw counts as the source of truth, the union
+    simply merges counts and then recalculates all derived probabilities.
+    Args:
+        graphA (LZGraph): An LZGraph (will be modified in-place).
+        graphB (LZGraph): An LZGraph of the same class as graphA.
+    Returns:
+        LZGraph: graphA, updated with the union of both graphs.
+    Raises:
+        IncompatibleGraphsError: If neither graph is an instance of the
+            other's class.
+    """
+    if not isinstance(graphA, type(graphB)) and not isinstance(graphB, type(graphA)):
+        raise IncompatibleGraphsError(
+            type1=type(graphA).__name__,
+            type2=type(graphB).__name__,
+            message="Both graphs must be of the same type for union operation."
+        )
+    # Snapshot each graph's total initial-state count BEFORE step 2 folds
+    # graphB's counts into graphA. Step 4 uses these totals as the weights
+    # of the gene-distribution average; reading graphA's total after the
+    # merge would count graphB twice.
+    nA = graphA.initial_states.sum()
+    nB = graphB.initial_states.sum()
+    # 1. Merge edges (raw counts)
+    for a, b in graphB.graph.edges:
+        ed_b = graphB.graph[a][b]['data']
+        if graphA.graph.has_edge(a, b):
+            graphA.graph[a][b]['data'].merge(ed_b)
+        else:
+            # Ensure nodes exist
+            if a not in graphA.graph:
+                graphA.graph.add_node(a)
+            if b not in graphA.graph:
+                graphA.graph.add_node(b)
+            # Copy B's counts into a fresh EdgeData so A never shares
+            # mutable gene dicts with B
+            ed_new = EdgeData()
+            ed_new.merge(ed_b)
+            graphA.graph.add_edge(a, b, data=ed_new)
+    # Also add any nodes from B that have no edges
+    for node in graphB.graph.nodes:
+        if node not in graphA.graph:
+            graphA.graph.add_node(node)
+    # 2. Merge sequence-level counts
+    # (per_node_observed_frequency is recomputed in recalculate())
+    graphA.initial_states = graphA.initial_states.combine(
+        graphB.initial_states, lambda x, y: x + y, fill_value=0
+    )
+    graphA.terminal_states = graphA.terminal_states.combine(
+        graphB.terminal_states, lambda x, y: x + y, fill_value=0
+    )
+    graphA.n_subpatterns += graphB.n_subpatterns
+    graphA.n_transitions += graphB.n_transitions
+    # 3. Merge lengths
+    if hasattr(graphB, 'lengths'):
+        for length, count in graphB.lengths.items():
+            graphA.lengths[length] = graphA.lengths.get(length, 0) + count
+    # 4. Merge gene-level data (if genetic)
+    if graphA.genetic and graphB.genetic:
+        # Weighted average of marginal gene distributions, weighted by the
+        # pre-merge totals captured above
+        nTotal = nA + nB
+        if nTotal > 0:
+            graphA.marginal_vgenes = (
+                graphA.marginal_vgenes.combine(graphB.marginal_vgenes,
+                    lambda x, y: x * nA / nTotal + y * nB / nTotal, fill_value=0)
+            )
+            graphA.marginal_jgenes = (
+                graphA.marginal_jgenes.combine(graphB.marginal_jgenes,
+                    lambda x, y: x * nA / nTotal + y * nB / nTotal, fill_value=0)
+            )
+            graphA.vj_probabilities = (
+                graphA.vj_probabilities.combine(graphB.vj_probabilities,
+                    lambda x, y: x * nA / nTotal + y * nB / nTotal, fill_value=0)
+            )
+        # Merge length_distribution counts
+        if hasattr(graphA, 'length_distribution') and hasattr(graphB, 'length_distribution'):
+            graphA.length_distribution = graphA.length_distribution.combine(
+                graphB.length_distribution, lambda x, y: x + y, fill_value=0
+            )
+        # Merge observed gene sets
+        if hasattr(graphB, 'observed_vgenes'):
+            graphA.observed_vgenes = list(
+                set(graphA.observed_vgenes) | set(graphB.observed_vgenes)
+            )
+        if hasattr(graphB, 'observed_jgenes'):
+            graphA.observed_jgenes = list(
+                set(graphA.observed_jgenes) | set(graphB.observed_jgenes)
+            )
+    # 5. Recalculate ALL derived state from raw counts
+    graphA.recalculate()
+    # Clear cached edges list
+    if hasattr(graphA, 'edges_list'):
+        graphA.edges_list = None
+    return graphA

LZGraphs 2.0.0__tar.gz → 2.1.1__tar.gz

LZGraphs 2.0.0.tar.gz → 2.1.1.tar.gz