PyPI - LZGraphs - Versions diffs - 2.2.0__tar.gz → 2.3.0__tar.gz - Mend

LZGraphs 2.2.0__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67)

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: LZGraphs
-Version: 2.2.0
+Version: 2.3.0
 Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
 Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
 Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>

lzgraphs-2.3.0/setup.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""
+Build script for optional C extensions.
+The _fast_walk extension accelerates LZGraph.simulate() by ~50-100x.
+If compilation fails (no C compiler), the package still installs and
+falls back to the pure-Python implementation automatically.
+"""
+import os
+import sys
+from setuptools import setup, Extension
+# Ensure setuptools can resolve the dynamic version (attr = "LZGraphs.__version__")
+# when running in an isolated build environment where src/ isn't on sys.path.
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))
+ext_modules = [
+    Extension(
+        "LZGraphs._fast_walk",
+        sources=[os.path.join("src", "LZGraphs", "_fast_walk.c")],
+        # No external library dependencies — pure C + Python.h
+    ),
+]
+def run_setup(extensions):
+    setup(ext_modules=extensions)
+try:
+    run_setup(ext_modules)
+except Exception:
+    print(
+        "\n"
+        "WARNING: Failed to compile C extension _fast_walk.\n"
+        "         LZGraphs will use the pure-Python fallback for simulate().\n"
+        "         This is fine — the package works without it, just slower.\n"
+        "\n"
+    )
+    run_setup([])

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "2.2.0"
+__version__ = "2.3.0"
 # =============================================================================
 # Graph classes

lzgraphs-2.3.0/src/LZGraphs/_fast_walk.c ADDED Viewed

@@ -0,0 +1,321 @@
+/*
+ * _fast_walk.c — CPython C extension for fast Markov chain random walks.
+ *
+ * Implements the full simulate() loop in C including string assembly,
+ * for ~100-200x speedup over the original pure-Python implementation.
+ * Uses xoshiro256++ for fast, high-quality RNG.
+ *
+ * The extension is optional: if it fails to compile (no C compiler),
+ * LZGraphs falls back to the pure-Python bisect-based implementation.
+ */
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <stdint.h>
+#include <string.h>
+/* ========================================================================
+ * xoshiro256++ RNG — public domain by David Blackman and Sebastiano Vigna
+ * ======================================================================== */
+static inline uint64_t rotl(const uint64_t x, int k) {
+    return (x << k) | (x >> (64 - k));
+}
+typedef struct {
+    uint64_t s[4];
+} xoshiro256_state;
+static inline uint64_t xoshiro256pp_next(xoshiro256_state *state) {
+    const uint64_t result = rotl(state->s[0] + state->s[3], 23) + state->s[0];
+    const uint64_t t = state->s[1] << 17;
+    state->s[2] ^= state->s[0];
+    state->s[3] ^= state->s[1];
+    state->s[1] ^= state->s[2];
+    state->s[0] ^= state->s[3];
+    state->s[2] ^= t;
+    state->s[3] = rotl(state->s[3], 45);
+    return result;
+}
+static inline double xoshiro256pp_double(xoshiro256_state *state) {
+    return (double)(xoshiro256pp_next(state) >> 11) * 0x1.0p-53;
+}
+static inline uint64_t splitmix64(uint64_t *x) {
+    uint64_t z = (*x += 0x9e3779b97f4a7c15ULL);
+    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
+    z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
+    return z ^ (z >> 31);
+}
+static void seed_xoshiro256(xoshiro256_state *state, uint64_t seed) {
+    state->s[0] = splitmix64(&seed);
+    state->s[1] = splitmix64(&seed);
+    state->s[2] = splitmix64(&seed);
+    state->s[3] = splitmix64(&seed);
+}
+/* ========================================================================
+ * Binary search (bisect_left) on a double array
+ * ======================================================================== */
+static inline Py_ssize_t bisect_left_double(
+    const double *arr, Py_ssize_t n, double value
+) {
+    Py_ssize_t lo = 0, hi = n;
+    while (lo < hi) {
+        Py_ssize_t mid = lo + (hi - lo) / 2;
+        if (arr[mid] < value)
+            lo = mid + 1;
+        else
+            hi = mid;
+    }
+    return lo;
+}
+/* ========================================================================
+ * simulate_walks — full simulation with string assembly in C
+ *
+ * Args:
+ *   n_walks       : int
+ *   offsets       : intp array [n_nodes+1] (buffer)
+ *   neighbors     : intp array [total_edges] (buffer)
+ *   cumweights    : float64 array [total_edges] (buffer)
+ *   stop_probs    : float64 array [n_nodes] (buffer)
+ *   initial_ids   : intp array [n_initial] (buffer)
+ *   initial_cw    : float64 array [n_initial] (buffer)
+ *   seed          : uint64
+ *   clean_labels  : list[str] — label for each node ID
+ *   return_walks  : bool — if True, return (walk, seq) tuples
+ *   id_to_node    : list[str] — node names (only used if return_walks)
+ *
+ * Returns:
+ *   list[str]  or  list[tuple[list[str], str]]
+ * ======================================================================== */
+static PyObject* py_simulate_walks(PyObject *self, PyObject *args) {
+    int n_walks, return_walks;
+    Py_buffer offsets_buf, neighbors_buf, cumweights_buf;
+    Py_buffer stop_probs_buf, initial_ids_buf, initial_cw_buf;
+    unsigned long long seed;
+    PyObject *clean_labels;  /* Python list of str */
+    PyObject *id_to_node;    /* Python list of str */
+    PyObject *result_list = NULL;
+    if (!PyArg_ParseTuple(args, "iy*y*y*y*y*y*KOpO",
+            &n_walks,
+            &offsets_buf, &neighbors_buf, &cumweights_buf,
+            &stop_probs_buf, &initial_ids_buf, &initial_cw_buf,
+            &seed,
+            &clean_labels,
+            &return_walks,
+            &id_to_node))
+        return NULL;
+    const Py_ssize_t *offsets = (const Py_ssize_t *)offsets_buf.buf;
+    const Py_ssize_t *neighbors = (const Py_ssize_t *)neighbors_buf.buf;
+    const double *cumweights = (const double *)cumweights_buf.buf;
+    const double *stop_probs = (const double *)stop_probs_buf.buf;
+    const Py_ssize_t *initial_ids = (const Py_ssize_t *)initial_ids_buf.buf;
+    const double *initial_cw = (const double *)initial_cw_buf.buf;
+    const Py_ssize_t n_initial = initial_cw_buf.len / (Py_ssize_t)sizeof(double);
+    if (n_initial <= 0) {
+        PyErr_SetString(PyExc_ValueError,
+            "Cannot simulate: graph has no initial states.");
+        goto cleanup;
+    }
+    /* Pre-fetch label UTF-8 data for fast string assembly */
+    const Py_ssize_t n_labels = PyList_GET_SIZE(clean_labels);
+    const char **label_ptrs = (const char **)PyMem_Malloc(n_labels * sizeof(char *));
+    Py_ssize_t *label_lens = (Py_ssize_t *)PyMem_Malloc(n_labels * sizeof(Py_ssize_t));
+    if (!label_ptrs || !label_lens) {
+        PyMem_Free(label_ptrs);
+        PyMem_Free(label_lens);
+        PyErr_NoMemory();
+        goto cleanup;
+    }
+    for (Py_ssize_t i = 0; i < n_labels; i++) {
+        PyObject *s = PyList_GET_ITEM(clean_labels, i);
+        label_ptrs[i] = PyUnicode_AsUTF8AndSize(s, &label_lens[i]);
+        if (!label_ptrs[i]) {
+            PyMem_Free(label_ptrs);
+            PyMem_Free(label_lens);
+            goto cleanup;
+        }
+    }
+    xoshiro256_state rng;
+    seed_xoshiro256(&rng, (uint64_t)seed);
+    result_list = PyList_New(n_walks);
+    if (!result_list) {
+        PyMem_Free(label_ptrs);
+        PyMem_Free(label_lens);
+        goto cleanup;
+    }
+    /* Reusable walk buffer */
+    Py_ssize_t walk_cap = 64;
+    Py_ssize_t *walk_buf = (Py_ssize_t *)PyMem_Malloc(walk_cap * sizeof(Py_ssize_t));
+    /* Reusable string buffer */
+    Py_ssize_t str_cap = 256;
+    char *str_buf = (char *)PyMem_Malloc(str_cap);
+    if (!walk_buf || !str_buf) {
+        PyMem_Free(walk_buf);
+        PyMem_Free(str_buf);
+        PyMem_Free(label_ptrs);
+        PyMem_Free(label_lens);
+        Py_DECREF(result_list);
+        PyErr_NoMemory();
+        goto cleanup;
+    }
+    for (int i = 0; i < n_walks; i++) {
+        /* Pick initial state */
+        double r = xoshiro256pp_double(&rng);
+        Py_ssize_t init_idx = bisect_left_double(initial_cw, n_initial, r);
+        if (init_idx >= n_initial) init_idx = n_initial - 1;
+        Py_ssize_t current = initial_ids[init_idx];
+        Py_ssize_t walk_len = 0;
+        walk_buf[walk_len++] = current;
+        /* Build string incrementally */
+        Py_ssize_t str_len = 0;
+        Py_ssize_t llen = label_lens[current];
+        if (str_len + llen > str_cap) {
+            str_cap = (str_len + llen) * 2;
+            str_buf = (char *)PyMem_Realloc(str_buf, str_cap);
+            if (!str_buf) goto oom;
+        }
+        memcpy(str_buf + str_len, label_ptrs[current], llen);
+        str_len += llen;
+        while (1) {
+            double sp = stop_probs[current];
+            if (sp == sp) {
+                if (xoshiro256pp_double(&rng) < sp)
+                    break;
+            }
+            Py_ssize_t start = offsets[current];
+            Py_ssize_t end = offsets[current + 1];
+            if (start == end)
+                break;
+            r = xoshiro256pp_double(&rng);
+            Py_ssize_t idx = bisect_left_double(cumweights + start, end - start, r);
+            if (idx >= end - start) idx = end - start - 1;
+            current = neighbors[start + idx];
+            /* Grow walk buffer if needed */
+            if (walk_len >= walk_cap) {
+                walk_cap *= 2;
+                Py_ssize_t *new_buf = (Py_ssize_t *)PyMem_Realloc(walk_buf, walk_cap * sizeof(Py_ssize_t));
+                if (!new_buf) goto oom;
+                walk_buf = new_buf;
+            }
+            walk_buf[walk_len++] = current;
+            /* Append label to string buffer */
+            llen = label_lens[current];
+            if (str_len + llen > str_cap) {
+                str_cap = (str_len + llen) * 2;
+                char *new_str = (char *)PyMem_Realloc(str_buf, str_cap);
+                if (!new_str) goto oom;
+                str_buf = new_str;
+            }
+            memcpy(str_buf + str_len, label_ptrs[current], llen);
+            str_len += llen;
+        }
+        /* Create Python string from buffer */
+        PyObject *seq = PyUnicode_FromStringAndSize(str_buf, str_len);
+        if (!seq) goto oom;
+        if (return_walks) {
+            /* Build walk list of node name strings */
+            PyObject *walk = PyList_New(walk_len);
+            if (!walk) { Py_DECREF(seq); goto oom; }
+            for (Py_ssize_t j = 0; j < walk_len; j++) {
+                PyObject *node_name = PyList_GET_ITEM(id_to_node, walk_buf[j]);
+                Py_INCREF(node_name);
+                PyList_SET_ITEM(walk, j, node_name);
+            }
+            PyObject *tup = PyTuple_Pack(2, walk, seq);
+            Py_DECREF(walk);
+            Py_DECREF(seq);
+            if (!tup) goto oom;
+            PyList_SET_ITEM(result_list, i, tup);
+        } else {
+            PyList_SET_ITEM(result_list, i, seq);
+        }
+    }
+    PyMem_Free(walk_buf);
+    PyMem_Free(str_buf);
+    PyMem_Free(label_ptrs);
+    PyMem_Free(label_lens);
+    goto cleanup;
+oom:
+    PyMem_Free(walk_buf);
+    PyMem_Free(str_buf);
+    PyMem_Free(label_ptrs);
+    PyMem_Free(label_lens);
+    Py_XDECREF(result_list);
+    result_list = NULL;
+    if (!PyErr_Occurred())
+        PyErr_NoMemory();
+cleanup:
+    PyBuffer_Release(&offsets_buf);
+    PyBuffer_Release(&neighbors_buf);
+    PyBuffer_Release(&cumweights_buf);
+    PyBuffer_Release(&stop_probs_buf);
+    PyBuffer_Release(&initial_ids_buf);
+    PyBuffer_Release(&initial_cw_buf);
+    return result_list;
+}
+/* ========================================================================
+ * Module definition
+ * ======================================================================== */
+static PyMethodDef FastWalkMethods[] = {
+    {"simulate_walks", py_simulate_walks, METH_VARARGS,
+     "Run n random walks on a CSR-encoded graph with string assembly.\n\n"
+     "Args:\n"
+     "    n_walks (int): Number of walks.\n"
+     "    offsets (array): CSR row offsets [n_nodes+1], dtype=intp.\n"
+     "    neighbors (array): Flat neighbor IDs, dtype=intp.\n"
+     "    cumweights (array): Flat cumulative weights, dtype=float64.\n"
+     "    stop_probs (array): Per-node stop probability (NaN=none), dtype=float64.\n"
+     "    initial_ids (array): Initial state IDs, dtype=intp.\n"
+     "    initial_cumprobs (array): Cumulative initial probs, dtype=float64.\n"
+     "    seed (int): RNG seed (xoshiro256++).\n"
+     "    clean_labels (list[str]): Subpattern label for each node.\n"
+     "    return_walks (bool): If True, return (walk, seq) tuples.\n"
+     "    id_to_node (list[str]): Node names for walk output.\n\n"
+     "Returns:\n"
+     "    list[str] or list[tuple[list[str], str]]\n"},
+    {NULL, NULL, 0, NULL}
+};
+static struct PyModuleDef fast_walk_module = {
+    PyModuleDef_HEAD_INIT,
+    "_fast_walk",
+    "C-accelerated random walk simulation for LZGraphs.\n"
+    "Uses xoshiro256++ RNG for high-quality, fast random number generation.\n"
+    "This module is optional — LZGraphs falls back to pure Python if unavailable.",
+    -1,
+    FastWalkMethods
+};
+PyMODINIT_FUNC PyInit__fast_walk(void) {
+    return PyModule_Create(&fast_walk_module);
+}

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/graphs/amino_acid_positional.py RENAMED Viewed

@@ -141,8 +141,6 @@ class AAPLZGraph(LZGraphBase):
         self._log_step("Graph constructed.", verbose)
         # Normalize and derive probability dicts
-        self.length_counts = dict(self.lengths)
         total_terminal = sum(self.terminal_state_counts.values())
         self.length_probabilities = (
             {k: v / total_terminal for k, v in self.terminal_state_counts.items()}

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/graphs/graph_operations.py RENAMED Viewed

@@ -91,19 +91,10 @@ def graph_union(graphA, graphB):
             }
         # Merge length_distribution counts
-        if hasattr(graphA, 'length_counts') and hasattr(graphB, 'length_counts'):
-            for k, v in graphB.length_counts.items():
-                graphA.length_counts[k] = graphA.length_counts.get(k, 0) + v
-        # Merge observed gene sets
-        if hasattr(graphB, 'observed_v_genes'):
-            graphA.observed_v_genes = list(
-                set(graphA.observed_v_genes) | set(graphB.observed_v_genes)
-            )
-        if hasattr(graphB, 'observed_j_genes'):
-            graphA.observed_j_genes = list(
-                set(graphA.observed_j_genes) | set(graphB.observed_j_genes)
-            )
+        for k, v in graphB.lengths.items():
+            graphA.lengths[k] = graphA.lengths.get(k, 0) + v
+        # observed_v/j_genes are now derived from marginal_v/j_genes (already merged above)
     # 5. Recalculate ALL derived state from raw counts
     graphA.recalculate()

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/graphs/lz_graph_base.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import logging
 import re
 from abc import ABC, abstractmethod
+from bisect import bisect_left as _bisect_left
 from time import time
 import networkx as nx
@@ -12,6 +13,12 @@ from ..utilities.misc import choice, window
 # Shared constants
 from ..constants import _EPS, _LOG_EPS
+# Optional C extension for fast simulation
+try:
+    from .._fast_walk import simulate_walks as _c_simulate_walks
+except ImportError:
+    _c_simulate_walks = None
 # EdgeData
 from .edge_data import EdgeData
@@ -106,6 +113,19 @@ class LZGraphBase(
         # Topological order cache (built lazily, invalidated on structural changes)
         self._topo_order = None
+    # ------------------------------------------------------------------
+    # Derived properties (single source of truth)
+    # ------------------------------------------------------------------
+    @property
+    def length_counts(self):
+        """Alias for ``lengths`` — avoids storing the same dict twice."""
+        return self.lengths
+    @length_counts.setter
+    def length_counts(self, value):
+        self.lengths = value
     @staticmethod
     def _normalize_input(data, seq_column, abundances=None, v_genes=None, j_genes=None):
         """Convert flexible input to a standardised dict-of-lists.
@@ -242,7 +262,7 @@ class LZGraphBase(
         if self.has_gene_data and other.has_gene_data:
             aux += not _dicts_close(self.marginal_v_genes, other.marginal_v_genes, decimals=3)
             aux += not _dicts_close(self.vj_probabilities, other.vj_probabilities, decimals=3)
-            aux += not _dicts_close(self.length_counts, other.length_counts, decimals=3)
+            aux += not _dicts_close(self.lengths, other.lengths, decimals=3)
         return (aux == 0)
@@ -802,6 +822,10 @@ class LZGraphBase(
     def _build_walk_cache(self, seed=None):
         """Build pre-computed numpy arrays for fast random walks.
+        Uses CSR (Compressed Sparse Row) format for neighbor data so
+        the entire walk can be driven by flat numpy arrays and
+        ``searchsorted`` instead of per-step ``rng.choice()`` calls.
         Returns a dict with the cache data, stored as ``self._walk_cache``.
         """
         graph = self.graph
@@ -814,17 +838,42 @@ class LZGraphBase(
         # Pre-compute clean labels for all nodes
         clean_labels = np.array([self.extract_subpattern(name) for name in nodes], dtype=object)
-        # Per-node neighbor IDs and weights
-        neighbor_ids = [None] * n
-        neighbor_weights = [None] * n
+        # Build per-node neighbor/weight arrays.
+        # node_neighbors + node_weights: used by lzpgen_distribution
+        # node_cumweights: used by Python simulate fallback (bisect)
+        # CSR flat arrays are built only when the C extension is available.
+        node_neighbors = [None] * n   # list of numpy int arrays
+        node_weights = [None] * n     # list of numpy float arrays
+        node_cumweights = [None] * n  # list of Python float lists (for bisect)
         for i, name in enumerate(nodes):
             succs = list(graph.successors(name))
             if succs:
                 ids = np.array([node_to_id[s] for s in succs], dtype=np.intp)
+                node_neighbors[i] = ids
                 wts = np.array([graph[name][s]['data'].weight for s in succs], dtype=np.float64)
-                wts /= wts.sum()  # ensure normalization
-                neighbor_ids[i] = ids
-                neighbor_weights[i] = wts
+                wts /= wts.sum()
+                node_weights[i] = wts
+                cw = np.cumsum(wts)
+                cw[-1] = 1.0  # clamp for floating point safety
+                node_cumweights[i] = cw.tolist()
+        # Build flat CSR arrays only if C extension is available
+        if _c_simulate_walks is not None:
+            csr_offsets = np.empty(n + 1, dtype=np.intp)
+            csr_parts_nb = []
+            csr_parts_cw = []
+            offset = 0
+            for i in range(n):
+                csr_offsets[i] = offset
+                if node_neighbors[i] is not None:
+                    csr_parts_nb.append(node_neighbors[i])
+                    csr_parts_cw.append(np.array(node_cumweights[i], dtype=np.float64))
+                    offset += len(node_neighbors[i])
+            csr_offsets[n] = offset
+            csr_neighbors = np.concatenate(csr_parts_nb) if csr_parts_nb else np.empty(0, dtype=np.intp)
+            csr_cumweights = np.concatenate(csr_parts_cw) if csr_parts_cw else np.empty(0, dtype=np.float64)
+        else:
+            csr_offsets = csr_neighbors = csr_cumweights = None
         # Stop probabilities: NaN for non-terminal nodes
         stop_probs = np.full(n, np.nan, dtype=np.float64)
@@ -834,12 +883,19 @@ class LZGraphBase(
         # Initial state arrays
         init_states = list(self.initial_state_probabilities.keys())
+        if not init_states:
+            raise ValueError(
+                "Cannot simulate: graph has no initial states. "
+                "Ensure the graph was constructed with valid sequences."
+            )
         init_probs = np.array(
             [self.initial_state_probabilities[s] for s in init_states],
             dtype=np.float64,
         )
         init_probs = init_probs / init_probs.sum()  # ensure normalization
         initial_ids = np.array([node_to_id[s] for s in init_states], dtype=np.intp)
+        initial_cumprobs = np.cumsum(init_probs)
+        initial_cumprobs[-1] = 1.0  # clamp
         rng = np.random.default_rng(seed)
@@ -847,11 +903,16 @@ class LZGraphBase(
             'node_to_id': node_to_id,
             'id_to_node': id_to_node,
             'clean_labels': clean_labels,
-            'neighbor_ids': neighbor_ids,
-            'neighbor_weights': neighbor_weights,
+            'node_neighbors': node_neighbors,
+            'node_weights': node_weights,
+            'node_cumweights': node_cumweights,
+            'csr_offsets': csr_offsets,
+            'csr_neighbors': csr_neighbors,
+            'csr_cumweights': csr_cumweights,
             'stop_probs': stop_probs,
             'initial_ids': initial_ids,
             'initial_probs': init_probs,
+            'initial_cumprobs': initial_cumprobs,
             'rng': rng,
         }
         return self._walk_cache
@@ -879,46 +940,89 @@ class LZGraphBase(
             self._build_walk_cache(seed)
         cache = self._walk_cache
+        clean_labels = cache['clean_labels']
+        id_to_node = cache['id_to_node']
+        # ── C fast path ──────────────────────────────────────────────
+        if _c_simulate_walks is not None:
+            rng_seed = seed if seed is not None else int(cache['rng'].integers(0, 2**63))
+            return _c_simulate_walks(
+                n,
+                cache['csr_offsets'],
+                cache['csr_neighbors'],
+                cache['csr_cumweights'],
+                cache['stop_probs'],
+                cache['initial_ids'],
+                cache['initial_cumprobs'],
+                rng_seed,
+                list(clean_labels),
+                return_walks,
+                list(id_to_node),
+            )
+        # ── Python fallback ──────────────────────────────────────────
         rng = cache['rng']
         initial_ids = cache['initial_ids']
-        initial_probs = cache['initial_probs']
+        initial_cumprobs = cache['initial_cumprobs']
         stop_probs = cache['stop_probs']
-        neighbor_ids = cache['neighbor_ids']
-        neighbor_weights = cache['neighbor_weights']
-        clean_labels = cache['clean_labels']
-        id_to_node = cache['id_to_node']
+        node_neighbors = cache['node_neighbors']
+        node_cumweights = cache['node_cumweights']
+        # Pre-generate random numbers in bulk for throughput
+        buf_size = max(n * 25, 1024)
+        rand_buf = rng.random(buf_size)
+        rand_idx = 0
+        # Local references to avoid repeated global/attribute lookups
+        bisect = _bisect_left
+        init_cumprobs_list = initial_cumprobs.tolist()
         results = []
+        results_append = results.append
         for _ in range(n):
-            # Pick initial state
-            current = rng.choice(initial_ids, p=initial_probs)
+            # Refill buffer if running low
+            if rand_idx + 50 > len(rand_buf):
+                rand_buf = rng.random(buf_size)
+                rand_idx = 0
+            # Pick initial state via bisect on cumulative probs
+            current = initial_ids[bisect(init_cumprobs_list, rand_buf[rand_idx])]
+            rand_idx += 1
             parts = [clean_labels[current]]
             walk_ids = [current] if return_walks else None
             while True:
                 # Check stop condition
                 stop_p = stop_probs[current]
-                if not np.isnan(stop_p):
-                    if rng.random() < stop_p:
+                if stop_p == stop_p:  # fast NaN check (NaN != NaN)
+                    if rand_buf[rand_idx] < stop_p:
+                        rand_idx += 1
                         break
+                    rand_idx += 1
                 # Check for dead-end (no outgoing edges)
-                nb_ids = neighbor_ids[current]
-                if nb_ids is None:
+                nb = node_neighbors[current]
+                if nb is None:
                     break
-                # Take a step
-                current = rng.choice(nb_ids, p=neighbor_weights[current])
+                # Take a step via bisect on per-node cumulative weights
+                current = nb[bisect(node_cumweights[current], rand_buf[rand_idx])]
+                rand_idx += 1
                 parts.append(clean_labels[current])
                 if return_walks:
                     walk_ids.append(current)
+                # Refill buffer if running low
+                if rand_idx + 50 > len(rand_buf):
+                    rand_buf = rng.random(buf_size)
+                    rand_idx = 0
             sequence = ''.join(parts)
             if return_walks:
                 walk = [id_to_node[wid] for wid in walk_ids]
-                results.append((walk, sequence))
+                results_append((walk, sequence))
             else:
-                results.append(sequence)
+                results_append(sequence)
         return results

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/graphs/nucleotide_double_positional.py RENAMED Viewed

@@ -126,8 +126,6 @@ class NDPLZGraph(LZGraphBase):
         self._log_step("Graph constructed.", verbose)
         # Normalize and derive probability dicts
-        self.length_counts = dict(self.lengths)
         total_terminal = sum(self.terminal_state_counts.values())
         self.length_probabilities = (
             {k: v / total_terminal for k, v in self.terminal_state_counts.items()}

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/gene_logic.py RENAMED Viewed

@@ -17,6 +17,28 @@ class GeneLogicMixin:
           with probability distribution `weights`.
     """
+    @property
+    def observed_v_genes(self):
+        """Unique V genes — derived from ``marginal_v_genes`` keys."""
+        mg = getattr(self, 'marginal_v_genes', None)
+        return list(mg.keys()) if mg else []
+    @observed_v_genes.setter
+    def observed_v_genes(self, value):
+        # Accept sets from old pickles / JSON deserialization (no-op storage)
+        pass
+    @property
+    def observed_j_genes(self):
+        """Unique J genes — derived from ``marginal_j_genes`` keys."""
+        mg = getattr(self, 'marginal_j_genes', None)
+        return list(mg.keys()) if mg else []
+    @observed_j_genes.setter
+    def observed_j_genes(self, value):
+        # Accept sets from old pickles / JSON deserialization (no-op storage)
+        pass
     def _raise_genetic_mode_error(self):
         """
         Raise an error if genetic mode is off but a genetic function is called.
@@ -39,10 +61,6 @@ class GeneLogicMixin:
         v_list = data['v_genes']
         j_list = data['j_genes']
-        # Unique sets of V and J
-        self.observed_v_genes = list(set(v_list))
-        self.observed_j_genes = list(set(j_list))
         # Marginal distributions (normalized) — stored as plain dicts
         n = len(v_list)
         v_counts = {}

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/lzpgen_distribution.py RENAMED Viewed

@@ -56,8 +56,8 @@ class LZPgenDistributionMixin:
         initial_ids = cache['initial_ids']
         initial_probs = cache['initial_probs']
         stop_probs = cache['stop_probs']
-        neighbor_ids = cache['neighbor_ids']
-        neighbor_weights = cache['neighbor_weights']
+        node_neighbors = cache['node_neighbors']
+        node_weights = cache['node_weights']
         # Pre-compute log values for zero per-step overhead
         eps = _EPS
@@ -71,9 +71,9 @@ class LZPgenDistributionMixin:
         neighbor_log_weights = [None] * n_nodes
         for i in range(n_nodes):
-            if neighbor_weights[i] is not None:
+            if node_weights[i] is not None:
                 neighbor_log_weights[i] = np.log(
-                    np.maximum(neighbor_weights[i], eps)
+                    np.maximum(node_weights[i], eps)
                 )
         log_probs = np.empty(n, dtype=np.float64)
@@ -94,16 +94,16 @@ class LZPgenDistributionMixin:
                         break
                 # Dead-end check
-                nb_ids = neighbor_ids[current]
-                if nb_ids is None:
+                nb = node_neighbors[current]
+                if nb is None:
                     log_p += np.log(eps)
                     break
                 # Take a step
-                n_nb = len(nb_ids)
-                step_idx = rng.choice(n_nb, p=neighbor_weights[current])
+                n_nb = len(nb)
+                step_idx = rng.choice(n_nb, p=node_weights[current])
                 log_p += neighbor_log_weights[current][step_idx]
-                current = nb_ids[step_idx]
+                current = nb[step_idx]
             log_probs[seq_idx] = log_p

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/serialization.py RENAMED Viewed

@@ -32,7 +32,7 @@ class SerializationMixin:
         'subpattern_individual_probability': 'node_probability',
         'per_node_observed_frequency': 'node_outgoing_counts',
         'length_distribution_proba': 'length_probabilities',
-        'length_distribution': 'length_counts',
+        'length_distribution': 'lengths',
         'n_subpatterns': 'num_subpatterns',
         'n_transitions': 'num_transitions',
         'marginal_vgenes': 'marginal_v_genes',
@@ -55,6 +55,20 @@ class SerializationMixin:
         'j_call': 'J',
     }
+    # Transient attributes excluded from pickle (rebuilt on demand)
+    _TRANSIENT_ATTRS = frozenset({
+        '_walk_cache',
+        '_topo_order',
+        '_edges_cache',
+        'constructor_start_time',
+        'constructor_end_time',
+    })
+    def __getstate__(self):
+        """Exclude transient caches from pickle to reduce file size."""
+        return {k: v for k, v in self.__dict__.items()
+                if k not in self._TRANSIENT_ATTRS}
     def __setstate__(self, state):
         """Restore instance from pickle, migrating old attribute names and pandas types."""
         # Migrate old attribute names to new names
@@ -62,6 +76,17 @@ class SerializationMixin:
             if old_name in state and new_name not in state:
                 state[new_name] = state.pop(old_name)
+        # length_counts is now a property aliasing lengths — migrate old pickles
+        if 'length_counts' in state:
+            if 'lengths' not in state:
+                state['lengths'] = state.pop('length_counts')
+            else:
+                del state['length_counts']
+        # observed_v/j_genes are now properties — remove stored values
+        state.pop('observed_v_genes', None)
+        state.pop('observed_j_genes', None)
         # Migrate 'wsif/sep' key inside terminal_state_data dicts
         tsd = state.get('terminal_state_data')
         if tsd is not None and isinstance(tsd, dict):
@@ -80,7 +105,7 @@ class SerializationMixin:
         for attr in ('initial_state_counts', 'terminal_state_counts',
                       'initial_state_probabilities', 'length_probabilities',
                       'marginal_v_genes', 'marginal_j_genes', 'vj_probabilities',
-                      'length_counts'):
+                      'lengths'):
             val = getattr(self, attr, None)
             if val is not None and hasattr(val, 'to_dict'):
                 setattr(self, attr, val.to_dict())
@@ -324,12 +349,7 @@ class SerializationMixin:
                 data['marginal_j_genes'] = _to_dict(self.marginal_j_genes)
             if hasattr(self, 'vj_probabilities'):
                 data['vj_probabilities'] = _to_dict(self.vj_probabilities)
-            if hasattr(self, 'length_counts'):
-                data['length_counts'] = _to_dict(self.length_counts)
-            if hasattr(self, 'observed_v_genes'):
-                data['observed_v_genes'] = list(self.observed_v_genes)
-            if hasattr(self, 'observed_j_genes'):
-                data['observed_j_genes'] = list(self.observed_j_genes)
+            data['length_counts'] = _to_dict(self.lengths)
         # Terminal state data
         if hasattr(self, 'terminal_state_data'):
@@ -421,7 +441,9 @@ class SerializationMixin:
         instance.terminal_state_counts = _to_plain_dict(
             data.get('terminal_state_counts', data.get('terminal_states'))
         )
-        instance.lengths = data.get('lengths', {})
+        instance.lengths = data.get('lengths',
+                                    data.get('length_counts',
+                                             data.get('length_distribution', {})))
         instance.vj_combination_graphs = {}
         instance.num_neighbours = {}
         instance.node_outgoing_counts = data.get('node_outgoing_counts',
@@ -451,15 +473,6 @@ class SerializationMixin:
                 instance.marginal_j_genes = _to_plain_dict(mg_j)
             if 'vj_probabilities' in data:
                 instance.vj_probabilities = _to_plain_dict(data['vj_probabilities'])
-            lc = data.get('length_counts', data.get('length_distribution'))
-            if lc is not None:
-                instance.length_counts = _to_plain_dict(lc)
-            ov = data.get('observed_v_genes', data.get('observed_vgenes'))
-            if ov is not None:
-                instance.observed_v_genes = set(ov)
-            oj = data.get('observed_j_genes', data.get('observed_jgenes'))
-            if oj is not None:
-                instance.observed_j_genes = set(oj)
         # Restore terminal state data
         if 'terminal_state_data' in data:

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: LZGraphs
-Version: 2.2.0
+Version: 2.3.0
 Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
 Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
 Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/SOURCES.txt RENAMED Viewed

@@ -6,7 +6,9 @@ README.md
 pyproject.toml
 requirements.txt
 setup.cfg
+setup.py
 src/LZGraphs/__init__.py
+src/LZGraphs/_fast_walk.c
 src/LZGraphs/constants.py
 src/LZGraphs/py.typed
 src/LZGraphs.egg-info/PKG-INFO

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_lzpgen_distribution.py RENAMED Viewed

@@ -58,8 +58,13 @@ class TestLZPgenDistributionBasic:
         result = aap_lzgraph.lzpgen_distribution(n=0, seed=42)
         assert len(result) == 0
-    def test_consistent_with_walk_log_probability(self, aap_lzgraph):
+    def test_consistent_with_walk_log_probability(self, aap_lzgraph, monkeypatch):
         """Values should match walk_log_probability on the same walks."""
+        # Force Python fallback so simulate() and lzpgen_distribution()
+        # use the same numpy RNG and produce identical walks for the same seed.
+        import LZGraphs.graphs.lz_graph_base as _base
+        monkeypatch.setattr(_base, '_c_simulate_walks', None)
         walks_and_seqs = aap_lzgraph.simulate(20, seed=42, return_walks=True)
         dist = aap_lzgraph.lzpgen_distribution(n=20, seed=42)

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/CHANGELOG.md RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/CONTRIBUTING.md RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/LICENSE RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/MANIFEST.in RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/README.md RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/pyproject.toml RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/requirements.txt RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/setup.cfg RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/bag_of_words/__init__.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/bag_of_words/bow_encoder.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/constants.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/exceptions/__init__.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/graphs/__init__.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/graphs/edge_data.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/graphs/naive.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/metrics/__init__.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/metrics/convenience.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/metrics/diversity.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/metrics/entropy.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/metrics/pgen_distribution.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/metrics/saturation.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/__init__.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/bayesian_posterior.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/gene_prediction.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/graph_topology.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/random_walk.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/mixins/walk_analysis.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/py.typed RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/utilities/__init__.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/utilities/decomposition.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/utilities/helpers.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/utilities/misc.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/visualization/__init__.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs/visualization/visualize.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/requires.txt RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/top_level.txt RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_aap_lzgraph.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_abundance.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_analytical_distribution.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_base_class_methods.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_bow_encoder.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_diversity_theory.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_flexible_input.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_graph_operations.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_metrics.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_naive_lzgraph.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_ndp_lzgraph.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_new_features.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_pgen_fixes.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_serialization.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_simulate.py RENAMED Viewed

File without changes

{lzgraphs-2.2.0 → lzgraphs-2.3.0}/tests/test_utilities.py RENAMED Viewed

File without changes

LZGraphs 2.2.0__tar.gz → 2.3.0__tar.gz

LZGraphs 2.2.0__tar.gz → 2.3.0__tar.gz