PyPI - polyads - Versions diffs - 0.0.1__py3-none-any.whl - Mend

polyads 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

polyads/__init__.py +4 -0
polyads/binary_search.py +56 -0
polyads/data.py +54 -0
polyads/fitter.py +189 -0
polyads/losses.py +77 -0
polyads/model.py +217 -0
polyads/polyad_eval.py +226 -0
polyads/polyad_generation.py +191 -0
polyads/polyad_utils.py +27 -0
polyads/utils.py +25 -0
polyads/validations.py +73 -0
polyads-0.0.1.dist-info/METADATA +927 -0
polyads-0.0.1.dist-info/RECORD +15 -0
polyads-0.0.1.dist-info/WHEEL +4 -0
polyads-0.0.1.dist-info/licenses/LICENSE +661 -0

polyads/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# Re-export the two public entry points at package level.
+from .model import PolyadEstimator
+from .data import generate_data
+# Names exposed by `from polyads import *`.
+__all__ = ["PolyadEstimator", "generate_data"]

polyads/binary_search.py ADDED Viewed

@@ -0,0 +1,56 @@
+import numpy as np
+from numba import jit
+@jit(nopython=True)
+def _binary_search_edge_value(keys, values, query):
+    """
+    Look up the value stored for a key in a lexicographically sorted key table.
+    Parameters
+    ----------
+    keys : np.ndarray
+        Edge keys, one per row (shape: [n_edges, D]). The bisection assumes the
+        rows are in ascending lexicographic order.
+    values : np.ndarray
+        Edge values aligned with the rows of `keys` (shape: [n_edges]).
+    query : np.ndarray
+        Key to look for (shape: [D]).
+    Returns
+    -------
+    int
+        `values[k]` for a row `k` of `keys` equal to `query`, or 0 if no row matches.
+    """
+    n_rows, width = keys.shape
+    left = 0
+    right = n_rows - 1
+    while left <= right:
+        middle = (left + right) // 2
+        row = keys[middle]
+        # Does `row` sort strictly before `query`? The first component that
+        # differs decides; if none differs the answer stays False.
+        row_precedes = False
+        for j in range(width):
+            if row[j] < query[j]:
+                row_precedes = True
+                break
+            if row[j] > query[j]:
+                break
+        if row_precedes:
+            # Any match can only be in the upper half.
+            left = middle + 1
+        else:
+            # An exact hit requires every component to agree.
+            matches = True
+            for j in range(width):
+                if row[j] != query[j]:
+                    matches = False
+                    break
+            if matches:
+                return values[middle]
+            # `row` sorts after `query`: continue in the lower half.
+            right = middle - 1
+    return 0

polyads/data.py ADDED Viewed

@@ -0,0 +1,54 @@
+import numpy as np
+import pandas as pd
+def generate_data(seed, n_ds, c, shape, beta, groups=None, return_full = False):
+    """
+    Generate synthetic count data for the polyad model.
+    Parameters
+    ----------
+    seed : int or None
+        Seed handed to `np.random.default_rng`.
+    n_ds : sequence of int
+        Size of each of the D index dimensions; the outcome array has shape `n_ds`.
+    c : float
+        Constant added to the linear predictor.
+    shape : float
+        Shape of the gamma mixing distribution applied to the Poisson mean.
+        `np.inf` disables the mixing (the mean is `exp(linpred)` itself).
+    beta : array-like or scalar
+        Coefficients of the p features; a scalar is treated as a length-1 vector.
+    groups : list of list of int, optional
+        Each entry lists the dimensions over which one fixed effect varies. The
+        effect is drawn from N(0, 0.25**2) and broadcast over the other dimensions.
+        The order of the indices inside an entry is irrelevant. By default there is
+        one entry per dimension d, containing every dimension except d.
+    return_full : bool, optional
+        Selects the return value, see below (default: False).
+    Returns
+    -------
+    tuple
+        `(df, X)` when `return_full` is False: `df` has one row per cell with a
+        positive count, columns `i_0 ... i_{D-1}` for the cell indices and `Y` for
+        the count; `X` is the feature array of shape `(*n_ds, p)`.
+        `(Y, X, linpred, thetas)` when `return_full` is True: the dense counts, the
+        features, the linear predictor including the fixed effects, and the list of
+        fixed-effect arrays (one per group).
+    """
+    rng = np.random.default_rng(seed)
+    D = len(n_ds)
+    # Promote a scalar beta to a length-1 vector so that p is always defined
+    beta = np.atleast_1d(np.asarray(beta))
+    p = beta.size
+    X = rng.normal(size=(*n_ds, p))
+    # Linear predictor before fixed effects
+    linpred = c + np.tensordot(X, beta, axes=([-1],[0]))
+    # Default groups: one fixed effect per dimension, varying over all the others
+    if groups is None:
+        groups = [[d_p for d_p in range(D) if d_p != d] for d in range(D)]
+    # Add the fixed effects
+    thetas = []
+    for g in groups:
+        # The broadcast below lays the axes out in ascending dimension order, so
+        # the draw must use the same order; otherwise an unsorted group such as
+        # [1, 0] yields an array whose axes do not line up with linpred.
+        dims = sorted(g)
+        theta_g = rng.normal(scale = .25, size = tuple( n_ds[d] for d in dims ))
+        thetas.append(theta_g)
+        linpred += theta_g[tuple(slice(None) if d in dims else np.newaxis for d in range(D))]
+    if shape == np.inf:
+        lam = np.exp(linpred)
+    else:
+        # Gamma with mean exp(linpred) and the requested shape
+        lam = rng.gamma(shape, np.exp(linpred)/shape)
+    Y = rng.poisson(lam)
+    if return_full:
+        return Y, X, linpred, thetas
+    # Sparse representation: keep only the cells with a positive count
+    keys = np.where(Y > 0)
+    df = pd.DataFrame({f"i_{i}": keys[i] for i in range(D)})
+    df["Y"] = Y[Y>0]
+    return df, X

polyads/fitter.py ADDED Viewed

@@ -0,0 +1,189 @@
+import warnings
+# Silence numba's pending-deprecation warnings when that warning class can be imported.
+try:
+    from numba.core.errors import NumbaPendingDeprecationWarning
+    warnings.filterwarnings("ignore", category=NumbaPendingDeprecationWarning)
+except ImportError:
+    pass
+# NOTE(review): the three filters below carry no module restriction, so importing this
+# module hides every UserWarning, DeprecationWarning and FutureWarning in the process,
+# including those raised by unrelated code.
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+import numpy as np
+import time
+from .polyad_eval import (
+    _compute_polyad_features, _evaluate_polyad_losses,
+    _evaluate_loss, _compute_polyad_pairwise_covariance
+)
+from .polyad_generation import _find_active_polyads, _compute_polyads_permutations
+from .utils import get_bar_description, _get_keys_values
+# Main fitting routine for PolyadEstimator
+def _fit_polyad_estimator(
+    beta: np.ndarray,
+    df: "pandas.DataFrame",
+    eval_X: np.ndarray,
+    eval_kwargs: dict = None,
+    max_iter: int = 100,
+    tol: float = 1e-4,
+    max_step: float = 1.0,
+    use_tqdm: bool = False,
+    loss: str = "poisson_multiclass",
+    max_n_polyads: int = int(1e7),
+    variance_threshold: float = 1.0,
+) -> dict:
+    """
+    Fit the polyad model with Newton steps of capped norm and estimate the parameter covariance.
+    Parameters
+    ----------
+    beta : np.ndarray
+        Initial parameter vector. It is copied, so the caller's array is left untouched.
+    df : pandas.DataFrame
+        Observed data. The number of columns minus one is used as the polyad dimension,
+        and `df.values` is split into primary indices, edge indices and edge values.
+    eval_X : np.ndarray
+        Feature source (matrix or feature function), forwarded unchanged to
+        `_compute_polyad_features`.
+    eval_kwargs : dict, optional
+        Forwarded unchanged as `kwargs` to `_compute_polyad_features` (default: None).
+    max_iter : int, optional
+        Maximum number of optimization iterations; must be at least 1 (default: 100).
+    tol : float, optional
+        Convergence is declared when the norm of the unclipped update falls below
+        this value (default: 1e-4).
+    max_step : float, optional
+        Updates with a larger norm are rescaled to this norm (default: 1.0).
+    use_tqdm : bool, optional
+        Whether to display a progress bar using tqdm (default: False).
+    loss : str, optional
+        Loss function name, forwarded to `_evaluate_polyad_losses`. The estimator class
+        accepts 'poisson_binary', 'poisson_multiclass' and 'poisson_binary_balanced'
+        (default: 'poisson_multiclass').
+    max_n_polyads : int, optional
+        Maximum number of polyads to consider; converted with `int()` (default: 1e7).
+    variance_threshold : float, optional
+        Threshold for variance approximation (default: 1.0). Ranges from 0 (never) to 1 (always).
+    Returns
+    -------
+    dict
+        Dictionary containing the results of the optimization:
+        - 'beta': Final parameter vector
+        - 'converged': Whether the last update norm was below `tol`
+        - 'loss': Loss value
+        - 'score': Gradient
+        - 'hessian': Hessian matrix
+        - 'det': Determinant of the Hessian
+        - 'iterations': Number of iterations performed
+        - 'var': Estimated parameter covariance matrix (all inf if the Hessian is singular)
+        - 'n_edges': Number of rows of `df`
+        - 'n_polyads': Number of polyads used
+        - 'n_pairs': Number of polyad pairs counted for the variance (0 if not computed)
+        - 'time': Total runtime in seconds
+        'loss', 'score', 'hessian' and 'det' are evaluated at the parameters held before
+        the final update was applied. When no polyad is found, the numeric entries are
+        inf, 'converged' is False and 'iterations' is 0.
+    Raises
+    ------
+    ValueError
+        If `max_iter` is smaller than 1.
+    """
+    if max_iter < 1:
+        # Without at least one iteration there is nothing to report back.
+        raise ValueError("max_iter must be at least 1")
+    # Private float copy: the in-place updates below must not modify the caller's
+    # array, and an integer start vector must be able to take fractional steps.
+    beta = np.array(beta, dtype=float)
+    num_pos_edges = len(df)
+    progress_bar = None
+    if use_tqdm:
+        from tqdm import tqdm
+        progress_bar = tqdm(total=max_iter, desc=f"Gathering polyads over {num_pos_edges}² pairs of edges")
+    try:
+        ts = time.time()
+        D: int = len(df.columns) - 1  # Polyad dimension
+        p: int = beta.size
+        primary_indices, edge_indices, edge_values = _get_keys_values(df.values)
+        # Find all active polyads in the data
+        xis, Y_xis, m_xis, M_xis = _find_active_polyads(
+            D, primary_indices, edge_indices, edge_values, int(max_n_polyads))
+        # Compute feature matrix for all polyads
+        X_xis = _compute_polyad_features(
+            D, p, xis, eval_X, kwargs=eval_kwargs)
+        num_polyads: int = m_xis.size
+        if num_polyads == 0:
+            # Nothing to optimize: report a non-converged, uninformative fit
+            return {
+                "beta": beta,
+                "converged": False,
+                "loss": np.inf,
+                "score": np.full(p, np.inf),
+                "hessian": np.full((p, p), np.inf),
+                "det": np.inf,
+                "iterations": 0,
+                "var": np.full((p, p), np.inf),
+                "n_edges": num_pos_edges,
+                "n_polyads": 0,
+                "n_pairs": 0,
+                "time": time.time() - ts
+            }
+        if progress_bar is not None:
+            progress_bar.set_description(get_bar_description(beta, num_polyads))
+        iteration: int = 0
+        while True:
+            iteration += 1
+            # Evaluate loss, gradient, and Hessian for current parameters
+            log_likelihoods, expectations, variances = _evaluate_polyad_losses(
+                D, Y_xis, X_xis, m_xis, M_xis, beta, loss)
+            loss_val, gradient, hessian = _evaluate_loss(p, X_xis, log_likelihoods, expectations, variances)
+            hessian_det = np.linalg.det(hessian)
+            gradient_norm = np.linalg.norm(gradient)
+            # Newton direction when the Hessian is usable, plain gradient otherwise
+            if hessian_det > 1e-8:
+                update_direction = np.linalg.solve(hessian, gradient)
+            else:
+                update_direction = gradient
+            update_norm = np.linalg.norm(update_direction)
+            # Limit step size if necessary
+            if update_norm > max_step:
+                update_direction = update_direction/update_norm * max_step
+            beta -= update_direction
+            if progress_bar is not None:
+                progress_bar.update(1)
+                progress_bar.set_description(get_bar_description(beta, num_polyads, grad_norm=gradient_norm, det=hessian_det))
+            # Stop on convergence, on the iteration budget, or on a singular Hessian.
+            # `>=` rather than `==` so a non-integer max_iter still terminates here.
+            if update_norm < tol or iteration >= max_iter or hessian_det <= 1e-8:
+                break
+        var = np.full((p, p), np.inf)
+        n_pairs = 0
+        if hessian_det > 1e-8:
+            if progress_bar is not None:
+                progress_bar.set_description(f"Finding all permutations of the {num_polyads} active polyads")
+            # Sort the permutations so that rows sharing their first D columns are contiguous
+            xi_permutations = _compute_polyads_permutations(D, xis)
+            xi_permutations = xi_permutations[np.lexsort([xi_permutations[:,i] for i in range(D)])]
+            # Exclusive end index of each run of identical first-D columns
+            permutation_group_ends = np.append(
+                1 + np.where(np.sum(np.abs(xi_permutations[1:,:D] - xi_permutations[:-1,:D]), axis=1) > 0)[0],
+                xi_permutations.shape[0])
+            # A run of size s contributes s * (s + 1) / 2 pairs
+            group_sizes = np.diff(permutation_group_ends, prepend=0)
+            n_pairs = (group_sizes * (group_sizes + 1) // 2).sum()
+            if progress_bar is not None:
+                progress_bar.set_description(f"Looping over {num_polyads} polyads to evaluate the variance")
+            covariance_matrix = _compute_polyad_pairwise_covariance(
+                D, p, num_polyads,
+                xi_permutations, permutation_group_ends,
+                expectations, X_xis,
+                variance_threshold = variance_threshold
+            )
+            # Sandwich form: H^-1 * covariance * H^-T
+            inv_hessian = np.linalg.inv(hessian)
+            var = inv_hessian @ covariance_matrix @ inv_hessian.T
+        return {
+            "beta": beta,
+            "converged": bool(update_norm < tol),
+            "loss": loss_val,
+            "score": gradient,
+            "hessian": hessian,
+            "det": hessian_det,
+            "iterations": iteration,
+            "var": var,
+            "n_edges": num_pos_edges,
+            "n_polyads": num_polyads,
+            "n_pairs": n_pairs,
+            "time": time.time() - ts
+        }
+    finally:
+        # Release the progress bar on every exit path, including exceptions
+        if progress_bar is not None:
+            progress_bar.close()

polyads/losses.py ADDED Viewed

@@ -0,0 +1,77 @@
+import numpy as np
+from numba import jit
+@jit(nopython=True)
+def _compute_polyad_loss(
+    Y_xis: np.ndarray,
+    m_xis: int,
+    M_xis: int,
+    signs: np.ndarray,
+    c: float,
+    loss: str
+) -> tuple:
+    """
+    Evaluate the loss, expectation and variance terms for a single polyad.
+    The polyad is treated as a distribution over `m_xis + M_xis + 1` states
+    (indexed 0 .. m_xis + M_xis); the observed data correspond to state `m_xis`.
+    Parameters
+    ----------
+    Y_xis : np.ndarray
+        Array of observed counts for each configuration of the polyad.
+    m_xis : int
+        Index of the observed state. Presumably the minimum count among
+        positive-sign configurations (as the previous docstring stated) - confirm
+        against `_find_active_polyads`.
+    M_xis : int
+        Number of states above the observed one. Presumably the minimum count among
+        negative-sign configurations - confirm against `_find_active_polyads`.
+    signs : np.ndarray
+        Array of +1/-1 signs for each configuration.
+    c : float
+        Linear predictor (dot product of features and parameters).
+    loss : str
+        Loss function name. 'poisson_multiclass' and 'poisson_binary_balanced' are
+        tested explicitly; any other value (e.g. 'poisson_binary') takes the
+        fixed mid-point split.
+    Returns
+    -------
+    tuple
+        (loss, expectation, variance) for the polyad.
+        For 'poisson_multiclass': minus the log-probability of the observed state,
+        the mean state index minus `m_xis`, and the variance of the state index.
+        Otherwise: minus the log-probability of the side of the split that contains
+        the observed state, and the differences (full distribution minus that side)
+        of the mean and of the variance of the state index.
+    """
+    # Counts at state 0; at state i the counts are Y_0 + i * signs, so state
+    # m_xis reproduces the observed Y_xis.
+    Y_0 = Y_xis - signs * m_xis
+    # State indices 0 .. m_xis + M_xis
+    nl = np.arange(m_xis + M_xis + 1)
+    # Unnormalised log-weights of the states, built by a one-step recurrence
+    W = np.empty(m_xis + M_xis + 1)
+    W[0] = 0
+    for i in range(m_xis + M_xis):
+        W[i+1] = W[i] - np.sum( signs * np.log(Y_0 + i * signs + (signs+1)/2) ) + c
+    # Shift by the maximum before exponentiating to avoid overflow
+    W -= np.max(W)
+    ps = np.exp(W)
+    if loss == "poisson_multiclass":
+        # log-sum-exp minus the observed state's log-weight
+        err = np.log(ps.sum()) - W[m_xis]
+        ps /= ps.sum()
+        exp = (nl * ps).sum()
+        var = ((nl**2) * ps).sum() - exp**2
+        return err, exp - m_xis, var
+    else:
+        # Quantities of the full distribution (suffix _t)
+        err_t = np.log(ps.sum())
+        ps_t = ps/ps.sum()
+        exp_t = (nl * ps_t).sum()
+        var_t = ((nl**2) * ps_t).sum() - exp_t**2
+        if loss == "poisson_binary_balanced":
+            # Smallest cut whose lower side holds at least half of the probability
+            # mass, capped at m_xis + M_xis
+            cut_m = 1
+            ps_under_cut = ps_t[0]
+            while ps_under_cut < .5 and cut_m < m_xis + M_xis:
+                ps_under_cut += ps_t[cut_m]
+                cut_m += 1
+        else:
+            # Fixed split at the middle of the state range
+            cut_m = (m_xis + M_xis + 1)//2
+        # Quantities restricted to the side containing the observed state (suffix _s)
+        if m_xis >= cut_m:
+            err_s = np.log(ps[cut_m:].sum())
+            ps_s = ps[cut_m:]/ps[cut_m:].sum()
+            exp_s = (nl[cut_m:] * ps_s).sum()
+            var_s = ((nl[cut_m:]**2) * ps_s).sum() - exp_s**2
+        else:
+            err_s = np.log(ps[:cut_m].sum())
+            ps_s = ps[:cut_m]/ps[:cut_m].sum()
+            exp_s = (nl[:cut_m] * ps_s).sum()
+            var_s = ((nl[:cut_m]**2) * ps_s).sum() - exp_s**2
+        return err_t - err_s, exp_t - exp_s, var_t - var_s

polyads/model.py ADDED Viewed

@@ -0,0 +1,217 @@
+import numpy as np
+from scipy.stats import norm
+import pandas as pd
+from .validations import _validate_fit_inputs
+from .fitter import _fit_polyad_estimator
+class PolyadEstimator:
+    """
+    Scikit-learn-like estimator for polyad-based statistical modeling.
+    """
+    def __init__(
+        self,
+        max_iter: int = 100,
+        tol: float = 1e-4,
+        max_step: float = 1.0,
+        loss: str = "poisson_multiclass",
+        max_n_polyads: int = int(1e8),
+        variance_threshold: float = 0.0,
+        use_tqdm: bool = False
+    ) -> None:
+        """
+        Store the fitting options and reset every fitted attribute to None.
+        Parameters
+        ----------
+        max_iter : int, optional
+            Maximum number of optimization iterations (default: 100).
+        tol : float, optional
+            Convergence tolerance for the optimization (default: 1e-4).
+        max_step : float, optional
+            Maximum step size for parameter updates (default: 1.0).
+        loss : str, optional
+            Loss function name. Input validation accepts 'poisson_binary',
+            'poisson_multiclass' and 'poisson_binary_balanced'.
+            (default: 'poisson_multiclass')
+        max_n_polyads : int, optional
+            Maximum number of polyads to generate/use; stored as `int(max_n_polyads)`
+            (default: 1e8).
+        variance_threshold : float, optional
+            Threshold for variance approximation (default: 0.0). Ranges from 0 (never) to 1 (always).
+        use_tqdm : bool, optional
+            Whether to display a progress bar during fitting (default: False).
+        Returns
+        -------
+        None
+        """
+        # Fitting options, kept as given except for the integer coercion of max_n_polyads
+        self.max_iter, self.tol, self.max_step = max_iter, tol, max_step
+        self.loss = loss
+        self.max_n_polyads = int(max_n_polyads)
+        self.variance_threshold, self.use_tqdm = variance_threshold, use_tqdm
+        # Results filled in by fit(); None means "not fitted yet"
+        for fitted_name in (
+            "beta_", "n_edges_", "n_polyads_", "n_pairs_",
+            "loss_", "score_", "hessian_", "det_",
+            "var_", "converged_", "iterations_", "time_",
+        ):
+            setattr(self, fitted_name, None)
+    def _validate_input(self, df, beta_init, eval_X, X, loss, indices, values):
+        """
+        Check the arguments of `fit` by delegating to `_validate_fit_inputs`.
+        The list of accepted loss names is supplied here; the helper's return value
+        is handed back unchanged (`fit` unpacks it as `(df, beta_init)`).
+        """
+        accepted_losses = ["poisson_binary", "poisson_multiclass", "poisson_binary_balanced"]
+        validated = _validate_fit_inputs(
+            df, beta_init, eval_X, X, loss, accepted_losses, indices, values
+        )
+        return validated
+    @staticmethod
+    def default_eval_X(matrix: np.ndarray) -> 'callable':
+        """
+        Build an eval_X function that reads features from a dense array.
+        The returned function converts its `key` argument to a tuple and uses it to
+        index `matrix`, so a key shorter than the array's rank yields a sub-array.
+        """
+        def eval_X(key):
+            index = tuple(key)
+            return matrix[index]
+        return eval_X
+    def fit(
+        self,
+        df: pd.DataFrame,
+        indices: list,
+        values: str,
+        beta_init: np.ndarray,
+        eval_X: 'callable | None' = None,
+        eval_kwargs: 'dict | None' = None,
+        X: 'np.ndarray | None' = None,
+        loss: str = None,
+    ) -> 'PolyadEstimator':
+        """
+        Fit the polyad model to data and store the results on the estimator.
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            DataFrame with columns representing the observed data (typically columns: t, i, j, v).
+        indices : list
+            Passed to input validation together with `df`; presumably the names of
+            the index columns of `df` (confirm against `_validate_fit_inputs`).
+        values : str
+            Passed to input validation together with `df`; presumably the name of
+            the value column of `df` (confirm against `_validate_fit_inputs`).
+        beta_init : np.ndarray or list or float
+            Initial guess for the parameter vector (shape: (n_features,) or scalar).
+        eval_X : callable or None, optional
+            Feature extraction function. If None and X is provided, uses default_eval_X(X).
+            Should accept a key (tuple or array) and return a feature vector.
+        eval_kwargs : dict or None, optional
+            Additional keyword arguments to pass to eval_X. Replaced by an empty dict
+            only when it is None and the default eval_X built from `X` is used.
+        X : np.ndarray or None, optional
+            3D or 4D numpy array of features. If provided and eval_X is None, uses default_eval_X(X).
+        loss : str or None, optional
+            Loss name for this call only; None falls back to `self.loss`.
+        Returns
+        -------
+        self : PolyadEstimator
+            The fitted estimator instance (self).
+        Raises
+        ------
+        ValueError
+            If neither `eval_X` nor `X` is given.
+        """
+        if eval_X is None:
+            # Without a feature function we need a dense matrix to build one from
+            if X is None:
+                raise ValueError("You must provide either eval_X or X (matrix)!")
+            eval_X = self.default_eval_X(X)
+            if eval_kwargs is None:
+                eval_kwargs = {}
+        # A per-call loss takes precedence over the one chosen at construction
+        loss_name = self.loss if loss is None else loss
+        # Validate all inputs before fitting
+        df, beta_init = self._validate_input(df, beta_init, eval_X, X, loss_name, indices, values)
+        fit_options = dict(
+            eval_kwargs=eval_kwargs,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            max_step=self.max_step,
+            use_tqdm=self.use_tqdm,
+            loss=loss_name,
+            max_n_polyads=self.max_n_polyads,
+            variance_threshold=self.variance_threshold,
+        )
+        result = _fit_polyad_estimator(beta_init, df, eval_X, **fit_options)
+        # Each result entry is stored under the same name with a trailing underscore
+        for key in (
+            "beta", "n_edges", "n_polyads", "n_pairs",
+            "loss", "score", "hessian", "det",
+            "var", "converged", "iterations", "time",
+        ):
+            setattr(self, key + "_", result[key])
+        return self
+    def summary(self, alpha: float = 0.05) -> None:
+        """
+        Display a regression summary for a fitted PolyadEstimator, similar to statsmodels/pystats.
+        Shows coefficient, std err, z, p-value, and confidence intervals.
+        """
+        if self.beta_ is None or self.var_ is None:
+            print("Model is not fitted or variance is not available.")
+            return
+        beta = np.asarray(self.beta_)
+        var = np.asarray(self.var_)
+        se = np.sqrt(np.diag(var))
+        z = beta / se
+        p = 2 * (1 - norm.cdf(np.abs(z)))
+        ci_low = beta + norm.ppf(alpha/2) * se
+        ci_upp = beta + norm.ppf(1 - alpha/2) * se
+        df = pd.DataFrame({
+            'coef': beta,
+            'std err': se,
+            'z': z,
+            'P>|z|': p,
+            f'[{100*alpha/2:.1f}%': ci_low,
+            f'{100*(1-alpha/2):.1f}%]': ci_upp
+        })
+        print("="*65)
+        print("\t"*3 + "Polyad Fit Results")
+        print("="*65)
+        if self.n_polyads_ == 0:
+            print(f"No polyads found. No optimization step was taken.")
+        else:
+            print(f"Converged: {self.converged_}")
+            print(f"Iterations: {self.iterations_}")
+            print(f"Time: {self.time_:.2f} seconds")
+            print(f"Number of Positive Edges: {self.n_edges_}")
+            print(f"Number of Active Polyads: {self.n_polyads_}")
+            if self.n_polyads_ == self.max_n_polyads:
+                print("(Maximum number of polyads reached. Results may be unreliable.)")
+            # Provide statistics when the model reaches a singular Hessian
+            if self.det_ <= 1e-8 and self.n_polyads_ > 0:
+                print("="*65)
+                print(df)
+                print("="*65)
+                print("The optimization procedure encountered a singular Hessian,")
+                print("its eigenvalues and eigenvectors are:")
+                eigvals, eigvecs = np.linalg.eigh(self.hessian_)
+                eigvals *= (np.abs(eigvals) > 1e-8)
+                eigvecs *= (np.abs(eigvecs) > 1e-8)
+                df = pd.DataFrame({
+                    "eigenvalue": eigvals,
+                    "eigenvector": [eigvecs[i] for i in range(eigvecs.shape[0])],
+                })
+                df.sort_values(by="eigenvalue", inplace=True)
+                print(df)
+                print("A singular Hessian may be associated with collinear variables.")
+                print("It can also be associated with ill-defined models, e.g.,")
+                print("one feature do not vary and is absorbed by the fixed effects.")
+                print("The eigenvectors of zero may explain both cases.")
+            else:
+                print(f"Number of Pairs of Active Polyads Sharing Edges: {self.n_pairs_}")
+                print("="*65)
+                print(df)