hNMF 0.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hnmf/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from hnmf.model import *
2
+ from hnmf.helpers import *
hnmf/helpers.py ADDED
@@ -0,0 +1,542 @@
1
+ import logging
2
+ import warnings
3
+ from collections.abc import Callable
4
+ from typing import TYPE_CHECKING, Literal, TypeAlias
5
+
6
+ import numpy as np
7
+ import numpy.typing as npt
8
+ from numpy.linalg import matrix_rank, norm, svd
9
+ from numpy.random import mtrand
10
+ from scipy import sparse as sp
11
+ from sklearn.decomposition import non_negative_factorization
12
+
13
+ if TYPE_CHECKING:
14
+ import networkx as nx
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ AnlsAlgorithm: TypeAlias = Callable[
19
+ [
20
+ npt.NDArray[np.float64],
21
+ npt.NDArray[np.float64],
22
+ npt.NDArray[np.float64],
23
+ npt.DTypeLike,
24
+ ],
25
+ npt.NDArray[np.float64],
26
+ ]
27
+
28
+
29
+ def anls_entry_rank2_precompute(
30
+ left: npt.NDArray[np.float64],
31
+ right: npt.NDArray[np.float64],
32
+ H: npt.NDArray[np.float64],
33
+ dtype: npt.DTypeLike,
34
+ ) -> npt.NDArray[np.float64]:
35
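+ # Closed-form solve of the 2x2 normal equations `left @ h = right[i]` for every row of `right`,
+ # with nonnegativity enforced by falling back to the best single-column solution (`solve_either`)
+ # whenever the unconstrained 2x2 solution contains a negative entry.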
+ eps = 1e-6
36
+ n = right.shape[0]
37
+
38
+ solve_either = np.zeros((n, 2), dtype=dtype)
39
+ solve_either[:, 0] = right[:, 0] / left[0, 0]
40
+ solve_either[:, 1] = right[:, 0] / left[1, 1]
41
+ cosine_either = solve_either * np.sqrt(np.array([left[0, 0], left[1, 1]]))
42
+ choose_first = cosine_either[:, 0] >= cosine_either[:, 1]
43
+ solve_either[choose_first, 1] = 0
44
+ solve_either[np.logical_not(choose_first), 0] = 0
45
+
46
+ if np.abs(left[0, 0]) < eps and np.abs(left[0, 1]) < eps:
47
+ logger.error(
48
+ "Error: The 2x2 matrix is close to singular or the input data matrix has tiny values",
49
+ )
50
+ else:
51
+ if np.abs(left[0, 0]) >= np.abs(left[0, 1]):
52
+ t = left[1, 0] / left[0, 0]
53
+ a2 = left[0, 0] + t * left[1, 0]
54
+ b2 = left[0, 1] + t * left[1, 1]
55
+ d2 = left[1, 1] - t * left[0, 1]
56
+ if np.abs(d2 / a2) < eps:
57
+ logger.error("Error: The 2x2 matrix is close to singular")
58
+
59
+ e2 = right[:, 0] + t * right[:, 1]
60
+ f2 = right[:, 1] - t * right[:, 0]
61
+ else:
62
+ ct = left[0, 0] / left[1, 0]
63
+ a2 = left[1, 0] + ct * left[0, 0]
64
+ b2 = left[1, 1] + ct * left[0, 1]
65
+ d2 = -left[0, 1] + ct * left[1, 1]
66
+ if np.abs(d2 / a2) < eps:
67
+ logger.error("Error: The 2x2 matrix is close to singular")
68
+
69
+ e2 = right[:, 1] + ct * right[:, 0]
70
+ f2 = -right[:, 0] + ct * right[:, 1]
71
+
72
+ H[:, 1] = f2 * (1 / d2)
73
+ H[:, 0] = (e2 - b2 * H[:, 1]) * (1 / a2)
74
+
75
+ use_either = np.logical_not(np.all(H > 0, axis=1))
76
+ H[use_either, :] = solve_either[use_either, :]
77
+
78
+ return H
79
+
80
+
81
+ def trial_split_sklearn(
82
+ min_priority: float,
83
+ X: npt.NDArray,
84
+ subset: npt.NDArray[np.int64],
85
+ W_parent: npt.NDArray[np.float64],
86
+ random_state: np.random.RandomState,
87
+ trial_allowance: int,
88
+ unbalanced: float,
89
+ dtype: npt.DTypeLike,
90
+ tol: float,
91
+ maxiter: int,
92
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar"],
93
+ alpha_W: float,
94
+ alpha_H: float | Literal["same"],
95
+ ):
96
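+ # Outlier-aware splitting (Algorithm 3 in the reference paper): try a rank-2 split of `subset`;
+ # if the split is highly unbalanced and the small side cannot itself be split well (its priority
+ # falls below min_priority), treat the small side as outliers, drop those features, and retry,
+ # up to trial_allowance times. If every trial fails, restore the original subset and mark the
+ # node as unsplittable (priority -2).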
+ m: int = X.shape[0]
97
+ trial = 0
98
+ subset_backup = subset
99
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
100
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
101
+ priority_one = -2.0
102
+ while trial < trial_allowance:
103
+ cluster_subset, W_buffer_one, H_buffer_one, priority_one = split_once_sklearn(
104
+ X=X,
105
+ subset=subset,
106
+ W_parent=W_parent,
107
+ random_state=random_state,
108
+ dtype=dtype,
109
+ tol=tol,
110
+ maxiter=maxiter,
111
+ init=init,
112
+ alpha_W=alpha_W,
113
+ alpha_H=alpha_H,
114
+ )
115
+ if priority_one < 0:
116
+ break
117
+
118
+ unique_cluster_subset = np.unique(cluster_subset)
119
+ if len(unique_cluster_subset) != 2:
120
+ logger.error("Invalid number of unique sub-clusters!")
121
+
122
+ length_cluster1 = len(np.where(cluster_subset == unique_cluster_subset[0])[0])
123
+ length_cluster2 = len(np.where(cluster_subset == unique_cluster_subset[1])[0])
124
+ if min(length_cluster1, length_cluster2) < unbalanced * len(cluster_subset):
125
+ logger.debug(
126
+ f"Below imbalanced threshold: {unbalanced * len(cluster_subset)}",
127
+ )
128
+ idx_small = np.argmin(np.array([length_cluster1, length_cluster2]))
129
+ subset_small = np.where(cluster_subset == unique_cluster_subset[idx_small])[
130
+ 0
131
+ ]
132
+ subset_small = subset[subset_small]
133
+ _, _, _, priority_one_small = split_once_sklearn(
134
+ X=X,
135
+ subset=subset_small,
136
+ W_parent=W_buffer_one[:, idx_small],
137
+ random_state=random_state,
138
+ dtype=dtype,
139
+ tol=tol,
140
+ maxiter=maxiter,
141
+ init=init,
142
+ alpha_W=0.0,
143
+ alpha_H=0.0,
144
+ )
145
+ if priority_one_small < min_priority:
146
+ trial += 1
147
+ if trial < trial_allowance:
148
+ logger.debug(f"Dropped {len(subset_small)} features...")
149
+ subset = np.setdiff1d(subset, subset_small)
150
+ else:
151
+ break
152
+ else:
153
+ break
154
+
155
+ if trial == trial_allowance:
156
+ logger.debug(
157
+ f"Reached trial allowance, recycled {len(subset_backup) - len(subset)} features",
158
+ )
159
+ subset = subset_backup
160
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
161
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
162
+ priority_one = -2
163
+
164
+ return subset, W_buffer_one, H_buffer_one, priority_one
165
+
166
+
167
+ def split_once_sklearn(
168
+ X: npt.NDArray,
169
+ subset: npt.NDArray[np.int64],
170
+ W_parent: npt.NDArray[np.float64],
171
+ random_state: mtrand.RandomState,
172
+ dtype: npt.DTypeLike,
173
+ tol: float,
174
+ maxiter: int,
175
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar", "custom"],
176
+ alpha_W: float,
177
+ alpha_H: float | Literal["same"],
178
+ ) -> tuple[
179
+ npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], float
180
+ ]:
181
+ m = X.shape[0]
182
+ if len(subset) <= 3:
183
+ cluster_subset = np.ones(len(subset), dtype=dtype)
184
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
185
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
186
+ priority_one = -1
187
+ else:
188
+ term_subset = np.flatnonzero(np.sum(X[:, subset], axis=1))
189
+ X_subset = X[term_subset, :][:, subset]
190
+ W = random_state.rand(len(term_subset), 2)
191
+ H = random_state.rand(2, len(subset))
192
+ W, H, _n_iter = non_negative_factorization(
193
+ X=X_subset,
194
+ W=W,
195
+ H=H,
196
+ n_components=2,
197
+ init=init,
198
+ update_H=True,
199
+ solver="cd",
200
+ beta_loss=2,
201
+ tol=tol,
202
+ max_iter=maxiter,
203
+ alpha_W=alpha_W,
204
+ alpha_H=alpha_H,
205
+ l1_ratio=0.0,
206
+ random_state=random_state,
207
+ verbose=0,
208
+ shuffle=False,
209
+ )
210
+ cluster_subset = np.argmax(H, axis=0)
211
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
212
+ W_buffer_one[term_subset, :] = W
213
+ H_buffer_one = H
214
+ if len(np.unique(cluster_subset)) > 1:
215
+ priority_one = compute_priority(W_parent, W_buffer_one, dtype=dtype)
216
+ else:
217
+ priority_one = -1
218
+ return cluster_subset, W_buffer_one, H_buffer_one, priority_one
219
+
220
+
221
+ def trial_split(
222
+ min_priority: float,
223
+ X: npt.NDArray[np.float64],
224
+ subset: npt.NDArray[np.int64],
225
+ W_parent: npt.NDArray[np.float64],
226
+ random_state: np.random.RandomState,
227
+ trial_allowance: int,
228
+ unbalanced: float,
229
+ dtype: npt.DTypeLike,
230
+ anls_alg: AnlsAlgorithm,
231
+ vec_norm: float,
232
+ normW: bool,
233
+ tol: float,
234
+ maxiter: int,
235
+ ) -> tuple[npt.NDArray[np.int64], npt.NDArray, npt.NDArray, float]:
236
+ m = X.shape[0]
237
+ trial = 0
238
+ subset_backup = subset
239
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
240
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
241
+ priority_one = -2.0
242
+ while trial < trial_allowance:
243
+ cluster_subset, W_buffer_one, H_buffer_one, priority_one = split_once(
244
+ X=X,
245
+ subset=subset,
246
+ W_parent=W_parent,
247
+ random_state=random_state,
248
+ dtype=dtype,
249
+ anls_alg=anls_alg,
250
+ vec_norm=vec_norm,
251
+ normW=normW,
252
+ tol=tol,
253
+ maxiter=maxiter,
254
+ )
255
+ if priority_one < 0:
256
+ break
257
+
258
+ unique_cluster_subset = np.unique(cluster_subset)
259
+ if len(unique_cluster_subset) != 2:
260
+ logger.warning("Invalid number of unique sub-clusters!")
261
+
262
+ length_cluster1 = len(np.where(cluster_subset == unique_cluster_subset[0])[0])
263
+ length_cluster2 = len(np.where(cluster_subset == unique_cluster_subset[1])[0])
264
+ if min(length_cluster1, length_cluster2) < unbalanced * len(cluster_subset):
265
+ idx_small = np.argmin(np.array([length_cluster1, length_cluster2]))
266
+ subset_small = np.where(cluster_subset == unique_cluster_subset[idx_small])[
267
+ 0
268
+ ]
269
+ subset_small = subset[subset_small]
270
+ _, _, _, priority_one_small = split_once(
271
+ X=X,
272
+ subset=subset_small,
273
+ W_parent=W_buffer_one[:, idx_small],
274
+ random_state=random_state,
275
+ dtype=dtype,
276
+ anls_alg=anls_alg,
277
+ vec_norm=vec_norm,
278
+ normW=normW,
279
+ maxiter=maxiter,
280
+ tol=tol,
281
+ )
282
+ if priority_one_small < min_priority:
283
+ trial += 1
284
+ if trial < trial_allowance:
285
+ logger.info(f"Dropped {len(subset_small)} documents...")
286
+ subset = np.setdiff1d(subset, subset_small)
287
+ else:
288
+ break
289
+ else:
290
+ break
291
+
292
+ if trial == trial_allowance:
293
+ logger.info(f"Recycled {len(subset_backup) - len(subset)} documents...")
294
+ subset = subset_backup
295
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
296
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
297
+ priority_one = -2
298
+
299
+ return subset, W_buffer_one, H_buffer_one, priority_one
300
+
301
+
302
+ def split_once(
303
+ X: npt.NDArray[np.float64],
304
+ subset: npt.NDArray[np.int64],
305
+ W_parent: npt.NDArray[np.float64],
306
+ random_state: mtrand.RandomState,
307
+ dtype: npt.DTypeLike,
308
+ anls_alg: AnlsAlgorithm,
309
+ vec_norm: float,
310
+ normW: bool,
311
+ tol: float,
312
+ maxiter: int,
313
+ ) -> tuple[npt.NDArray[np.float64], npt.NDArray, npt.NDArray, float]:
314
+ m = X.shape[0]
315
+ if len(subset) <= 3:
316
+ cluster_subset = np.ones(len(subset), dtype=dtype)
317
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
318
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
319
+ priority_one = -1
320
+ else:
321
+ term_subset = np.where(np.sum(X[:, subset], axis=1) != 0)[0]
322
+ X_subset = X[term_subset, :][:, subset]
323
+ W = random_state.rand(len(term_subset), 2)
324
+ H = random_state.rand(2, len(subset))
325
+ W, H = nmfsh_comb_rank2(
326
+ X_subset,
327
+ W,
328
+ H,
329
+ anls_alg=anls_alg,
330
+ vec_norm=vec_norm,
331
+ normW=normW,
332
+ tol=tol,
333
+ maxiter=maxiter,
334
+ dtype=dtype,
335
+ )
336
+ cluster_subset = np.argmax(H, axis=0)
337
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
338
+ W_buffer_one[term_subset, :] = W
339
+ H_buffer_one = H
340
+ if len(np.unique(cluster_subset)) > 1:
341
+ priority_one = compute_priority(W_parent, W_buffer_one, dtype=dtype)
342
+ else:
343
+ priority_one = -1
344
+
345
+ return cluster_subset, W_buffer_one, H_buffer_one, priority_one
346
+
347
+
348
+ def compute_priority(
349
+ W_parent: npt.NDArray[np.float64],
350
+ W_child: npt.NDArray[np.float64],
351
+ dtype: npt.DTypeLike,
352
+ ) -> float:
353
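+ # Split priority of a node: the product of two modified NDCG scores comparing how each child
+ # column of W_child ranks the rows of X against the ranking given by W_parent; used to decide
+ # which leaf node should be split next.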
+ n = len(W_parent)
354
+ idx_parent = np.argsort(W_parent)[::-1]
355
+ sorted_parent = W_parent[idx_parent]
356
+ idx_child1 = np.argsort(W_child[:, 0])[::-1]
357
+ idx_child2 = np.argsort(W_child[:, 1])[::-1]
358
+
359
+ n_part = len(np.where(W_parent != 0)[0])
360
+ if n_part <= 1:
361
+ priority = -3
362
+ else:
363
+ weight = np.log(np.arange(n, 0, -1))
364
+ first_zero = np.where(sorted_parent == 0)[0]
365
+ if len(first_zero) > 0:
366
+ weight[first_zero[0] :] = 1
367
+
368
+ weight_part = np.zeros(n, dtype=dtype)
369
+ weight_part[:n_part] = np.log(np.arange(n_part, 0, -1))
370
+ idx1 = np.argsort(idx_child1)
371
+ idx2 = np.argsort(idx_child2)
372
+ max_pos = np.maximum(idx1, idx2)
373
+ discount = np.log(n - max_pos[idx_parent] + 1)
374
+ discount[discount == 0] = np.log(2)
375
+ weight /= discount
376
+ weight_part /= discount
377
+
378
+ ndcg1 = NDCG_part(idx_parent, idx_child1, weight, weight_part)
379
+ ndcg2 = NDCG_part(idx_parent, idx_child2, weight, weight_part)
380
+ priority = ndcg1 * ndcg2
381
+
382
+ return priority
383
+
384
+
385
+ def NDCG_part(
386
+ ground: npt.NDArray[np.int64],
387
+ test: npt.NDArray[np.int64],
388
+ weight: npt.NDArray,
389
+ weight_part: npt.NDArray,
390
+ ) -> float:
391
+ seq_idx = np.argsort(ground)
392
+ weight_part = weight_part[seq_idx]
393
+
394
+ n = len(test)
395
+ uncum_score = weight_part[test]
396
+ uncum_score[2:] /= np.log2(np.arange(2, n))
397
+ cum_score = np.sum(uncum_score)
398
+
399
+ ideal_score = np.sort(weight)[::-1]
400
+ ideal_score[2:] /= np.log2(np.arange(2, n))
401
+ cum_ideal_score = np.sum(ideal_score)
402
+
403
+ score = cum_score / cum_ideal_score
404
+ return score
405
+
406
+
407
+ def nmfsh_comb_rank2(
408
+ A: npt.NDArray,
409
+ Winit: npt.NDArray,
410
+ Hinit: npt.NDArray,
411
+ anls_alg: AnlsAlgorithm,
412
+ vec_norm: float,
413
+ normW: bool,
414
+ tol: float,
415
+ maxiter: int,
416
+ dtype: npt.DTypeLike,
417
+ ) -> tuple[npt.NDArray, npt.NDArray]:
418
+ """"""
419
+ eps = 1e-6
420
+ shape: tuple[int, int] = A.shape
421
+ m, n = shape
422
+ W, H = Winit, Hinit
423
+
424
+ if W.shape[1] != 2:
425
+ warnings.warn(
426
+ f"Error: Wrong size of W! Expected shape of (n, 2) but received W of shape ({W.shape[0]}, {W.shape[1]})",
427
+ stacklevel=2,
428
+ )
429
+
430
+ if H.shape[0] != 2:
431
+ warnings.warn(
432
+ f"Error: Wrong size of H! Expected shape of (2, n) but received H of shape ({H.shape[0]}, {H.shape[1]})",
433
+ stacklevel=2,
434
+ )
435
+
436
+ left = H.dot(H.T)
437
+ right = A.dot(H.T)
438
+ for iter_ in range(maxiter):
439
+ if matrix_rank(left) < 2:
440
+ W = np.zeros((m, 2), dtype=dtype)
441
+ H = np.zeros((2, n), dtype=dtype)
442
+ if sp.issparse(A):
443
+ U, _S, V = svd(A.toarray(), full_matrices=False) # type: ignore[attr-defined] # A can be sparse
444
+ else:
445
+ U, _S, V = svd(A, full_matrices=False)
446
+ U, V = U[:, 0], V[0, :]
447
+ if sum(U) < 0:
448
+ U, V = -U, -V
449
+
450
+ W[:, 0] = U
451
+ H[0, :] = V
452
+
453
+ return W, H
454
+
455
+ W = anls_alg(left, right, W, dtype)
456
+ norms_W = norm(W, axis=0)
457
+ if np.min(norms_W) < eps:
458
+ logger.warning("Error: Some column of W is essentially zero")
459
+
460
+ W *= 1.0 / norms_W
461
+ left = W.T.dot(W)
462
+ right = A.T.dot(W)
463
+ if matrix_rank(left) < 2:
464
+ W = np.zeros((m, 2), dtype=dtype)
465
+ H = np.zeros((2, n), dtype=dtype)
466
+ if sp.issparse(A):
467
+ U, _S, V = svd(A.toarray(), full_matrices=False) # type: ignore[attr-defined] # A can be sparse
468
+ else:
469
+ U, _S, V = svd(A, full_matrices=False)
470
+ U, V = U[:, 0], V[0, :]
471
+ if sum(U) < 0:
472
+ U, V = -U, -V
473
+
474
+ W[:, 0] = U
475
+ H[0, :] = V
476
+
477
+ return W, H
478
+
479
+ H = anls_alg(left, right, H.T, dtype).T
480
+ gradH = left.dot(H) - right.T
481
+ left = H.dot(H.T)
482
+ right = A.dot(H.T)
483
+ gradW = W.dot(left) - right
484
+ # initgrad (the projected gradient norm from the first iteration) is set below and kept
+ # thereafter as the reference value for the relative stopping test
485
+ if iter_ == 0:
486
+ gradW_square = np.sum(np.power(gradW[np.logical_or(gradW <= 0, W > 0)], 2))
487
+ gradH_square = np.sum(np.power(gradH[np.logical_or(gradH <= 0, H > 0)], 2))
488
+ initgrad = np.sqrt(gradW_square + gradH_square)
489
+ continue
490
+ gradW_square = np.sum(np.power(gradW[np.logical_or(gradW <= 0, W > 0)], 2))
491
+ gradH_square = np.sum(np.power(gradH[np.logical_or(gradH <= 0, H > 0)], 2))
492
+ projnorm = np.sqrt(gradW_square + gradH_square)
493
+
494
+ if projnorm < tol * initgrad:
495
+ break
496
+
497
+ if vec_norm != 0:
498
+ if normW:
499
+ norms = np.power(np.sum(np.power(W, vec_norm), axis=0), 1 / vec_norm)
500
+ W /= norms
501
+ H *= norms[:, None]
502
+ else:
503
+ norms = np.power(np.sum(np.power(H, vec_norm), axis=1), 1 / vec_norm)
504
+ W *= norms[None, :]
505
+ H /= norms
506
+
507
+ return W, H
508
+
509
+
510
+ def tree_to_nx(tree: npt.NDArray, weights: npt.NDArray | None = None) -> "nx.DiGraph":
511
+ import networkx as nx
512
+
513
+ g = nx.DiGraph()
514
+ g.add_node("Root", name="Root", is_word=False, id="Root")
515
+ for parent_node, row in enumerate(tree, start=0):
516
+ # Here the ith row refers to the ith node as a parent
517
+ parent_id = str(int(parent_node))
518
+ parent_idx = int(parent_node)
519
+ parent_name = f"Node {parent_id}"
520
+ if row.sum() > 0:
521
+ for child in row:
522
+ child_id = str(int(child))
523
+ child_idx = int(child)
524
+ child_name = f"Node {child_id}"
525
+
526
+ if parent_idx not in g.nodes:
527
+ g.add_node(
528
+ parent_idx,
529
+ is_word=False,
530
+ name=parent_name,
531
+ id=parent_id,
532
+ )
533
+ if child_idx not in g.nodes:
534
+ g.add_node(child_idx, is_word=False, name=child_name, id=child_id)
535
+ g.add_edge(parent_idx, child_idx)
536
+ if weights is not None:
537
+ child_weight = weights[child_idx]
538
+ g.nodes[child_idx]["weight"] = child_weight
539
+
540
+ g.add_edge("Root", 0)
541
+ g.add_edge("Root", 1)
542
+ return g
hnmf/model.py ADDED
@@ -0,0 +1,672 @@
1
+ import logging
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from operator import itemgetter
5
+ from typing import Any, Literal, Self
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+ from sklearn.base import BaseEstimator
10
+ from sklearn.decomposition import NMF
11
+
12
+ from hnmf.helpers import (
13
+ trial_split_sklearn,
14
+ )
15
+ from hnmf.progress_tree import ProgressTree
16
+
17
+
18
+ @dataclass(frozen=True, slots=True)
19
+ class DiscriminatedSample:
20
+ sample: Any
21
+ node: int
22
+ node_value: float
23
+ others_value: float
24
+
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class HierarchicalNMF(BaseEstimator):
30
+ k: int
31
+ unbalanced: float
32
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar"]
33
+ solver: Literal["cd", "mu"]
34
+ beta_loss: Literal["FRO", 0, "KL", 1, "IS", 2]
35
+ alpha_W: float
36
+ alpha_H: Literal["same"] | float
37
+ random_state: np.random.RandomState
38
+ trial_allowance: int
39
+ tol: float
40
+ maxiter: int
41
+ dtype: npt.DTypeLike
42
+ n_samples_: int | None
43
+ n_features_: int | None
44
+ n_nodes_: int
45
+ n_leaves_: int
46
+ tree_: npt.NDArray | None
47
+ splits_: npt.NDArray | None
48
+ is_leaf_: npt.NDArray | None
49
+ clusters_: npt.NDArray | None
50
+ Ws_: npt.NDArray | None
51
+ Hs_: npt.NDArray | None
52
+ W_buffer_: npt.NDArray | None
53
+ H_buffer_: npt.NDArray | None
54
+ priorities_: npt.NDArray | None
55
+ id2sample_: dict[int, str] | None
56
+ id2feature_: dict[int, str] | None
57
+ feature2id_: dict[str, int] | None
58
+
59
+ def __init__(
60
+ self,
61
+ k: int,
62
+ unbalanced: float = 0.1,
63
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar"] = None,
64
+ solver: Literal["cd", "mu"] = "cd",
65
+ beta_loss: Literal["FRO", 0, "KL", 1, "IS", 2] = 0,
66
+ alpha_W: float = 0.0,
67
+ alpha_H: Literal["same"] | float = "same",
68
+ random_state: int = 42,
69
+ trial_allowance: int = 100,
70
+ tol: float = 1e-6,
71
+ maxiter: int = 10000,
72
+ dtype: npt.DTypeLike = np.float64,
73
+ ):
74
+ self.k = k
75
+ self.unbalanced = unbalanced
76
+ self.init = init
77
+ self.solver = solver
78
+ self.beta_loss = beta_loss
79
+ self.alpha_W = alpha_W
80
+ self.alpha_H = alpha_H
81
+ self.random_state = np.random.RandomState(seed=random_state)
82
+ self.trial_allowance = trial_allowance
83
+ self.tol = tol
84
+ self.maxiter = maxiter
85
+ self.dtype = dtype
86
+
87
+ self.n_samples_ = None
88
+ self.n_features_ = None
89
+ self.n_nodes_ = 0
90
+ self.n_leaves_ = 0
91
+ self.tree_ = None
92
+ self.splits_ = None
93
+ self.is_leaf_ = None
94
+ self.clusters_ = None
95
+ self.Ws_ = None
96
+ self.Hs_ = None
97
+ self.W_buffer_ = None
98
+ self.H_buffer_ = None
99
+ self.priorities_ = None
100
+ self.id2sample_ = None
101
+ self.id2feature_ = None
102
+ self.feature2id_ = None
103
+
104
+ """
105
+ Implements Hierarchical rank-2 NMF
106
+
107
+ Parameters
108
+ ----------
109
+
110
+ k: int
111
+ The number of desired leaf nodes
112
+ unbalanced : float
113
+ A threshold to determine if one of the two clusters is an outlier set. A smaller value means more tolerance for
114
+ imbalance between two clusters. See parameter beta in Algorithm 3 in the reference paper.
115
+ init : InitMethod
116
+ The initialization method used to initially fill W and H
117
+ solver : NMFSolver
118
+ The solver used to minimize the distance function
119
+ beta_loss : BetaLoss
120
+ Beta divergence to be minimized
121
+ alpha_W : float, defaults to 0.0
122
+ Constant that multiplies the regularization terms of W. Set it to zero (default) to have no regularization on W.
123
+ See `sklearn.decomposition.NMF`_
124
+ alpha_H: float or 'same', defaults to 'same'
125
+ Constant that multiplies the regularization terms of H. Set it to zero to have no regularization on H. If 'same'
126
+ (default), it takes the same value as alpha_W.
127
+ See `sklearn.decomposition.NMF`_
128
+ random_state : int
129
+ random seed
130
+ trial_allowance : int
131
+ Number of trials allowed for removing outliers and splitting a node again. See parameter T in Algorithm 3 in
132
+ the reference paper.
133
+ tol : float
134
+ Tolerance parameter for stopping criterion in each run of NMF.
135
+ maxiter : int
136
+ Maximum number of iteration times in each run of NMF
137
+ dtype : npt.DTypeLike
138
+ Dtype used for numpy arrays
139
+
140
+
141
+ Attributes
142
+ ----------
143
+ tree_ : np.ndarray
144
+ A (2*(k-1))-by-2 matrix that encodes the tree structure. The two entries in the i-th row are the numberings of
145
+ the two children of the node with numbering i (rows of nodes that are never split remain zero). The root node
146
+ is not numbered and is therefore NOT included in ``tree_``; its two children always have numberings 0 and 1.
147
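+ For example, with ``k = 3`` the first split produces nodes 0 and 1; if node 0 is then split
+ into nodes 2 and 3, ``tree_`` has shape (4, 2) with row 0 equal to ``[2, 3]`` and all other
+ rows still zero.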
+
148
+ splits_ :
149
+ An array of length k-1. It keeps track of the numberings of the nodes being split from the 1st split to the
150
+ (k-1)-th split. (The first entry is always 0.)
151
+
152
+ is_leaf_ :
153
+ An array of length 2*(k-1). A "1" at index ``i`` means that the node with numbering ``i`` is a leaf node in the final
154
+ tree generated, and "0" indicates non-leaf nodes in the final tree.
155
+
156
+ clusters_ :
157
+ Array with shape(n_nodes, n_features). A "1" at index ``i`` means that the sample with numbering ``c`` was
158
+ included in this nodes subset
159
+
160
+
161
+ Hs_ :
162
+ Array with shape (n_nodes, n_features)
163
+
164
+ Ws_ :
165
+ Array with shape (n_nodes, n_samples)
166
+
167
+ Notes
168
+ -----
169
+
170
+ ``W`` refers to the decomposed matrix. scikit-learn equivalent of::
171
+
172
+ W = model.fit_transform(X)
173
+
174
+ ``H`` refers to the factorization matrix. scikit-learn equivalent of::
175
+
176
+ model.components_
177
+
178
+
179
+ Adapted from [rank-2]_
180
+
181
+ """
182
+
183
+ def _init_fit(
184
+ self, X: npt.NDArray, term_subset: npt.NDArray
185
+ ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
186
+ if not self.n_samples_:
187
+ raise ValueError("n_samples_ not set before _init_fit called")
188
+
189
+ nmf = NMF(
190
+ n_components=2,
191
+ random_state=self.random_state,
192
+ tol=self.tol,
193
+ max_iter=self.maxiter,
194
+ init=self.init,
195
+ )
196
+
197
+ if len(term_subset) == self.n_samples_:
198
+ W = nmf.fit_transform(X)
199
+ H = nmf.components_
200
+ return W, H
201
+
202
+ W_tmp = nmf.fit_transform(X[term_subset, :])
203
+ H = nmf.components_
204
+ W = np.zeros((self.n_samples_, 2), dtype=self.dtype)
205
+ W[term_subset, :] = W_tmp
206
+
207
+ return W, H
208
+
209
+ def fit(self, X: npt.NDArray) -> Self:
210
+ """
211
+ Fit `HierarchicalNMF` to data
212
+ """
213
+ shape: tuple[int, int] = X.shape
214
+ n_samples, n_features = shape
215
+ self.n_samples_ = n_samples
216
+ self.n_features_ = n_features
217
+
218
+ # TODO Expect different sized ranks
219
+ clusters: list[npt.NDArray[np.int64] | None] = [None] * (2 * (self.k - 1))
220
+ Ws = [None] * (2 * (self.k - 1))
221
+ Hs = [None] * (2 * (self.k - 1))
222
+ W_buffer = [None] * (2 * (self.k - 1))
223
+ H_buffer = [None] * (2 * (self.k - 1))
224
+ priorities = np.zeros(2 * (self.k - 1), dtype=self.dtype)
225
+ is_leaf = np.zeros(2 * (self.k - 1), dtype=np.bool_) # No leaves at start
226
+ tree = np.zeros((2, 2 * (self.k - 1)), dtype=np.int64)
227
+ splits = -np.ones(self.k - 1, dtype=np.int64)
228
+
229
+ # Where X has at least one non-zero
230
+ term_subset = np.flatnonzero(np.sum(X, axis=1))
231
+
232
+ W, H = self._init_fit(X, term_subset)
233
+
234
+ result_used = 0
235
+
236
+ with ProgressTree() as pt:
237
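+ # Greedy tree growth over k - 1 splits: on the first pass the root (all features) is split in
+ # two; afterwards, the leaf with the highest split priority is split into two children using
+ # the rank-2 factors buffered for it, and trial_split_sklearn is run on each new child to
+ # compute the buffers and priority used when that child is considered for splitting later.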
+ for i in range(self.k - 1):
238
+ if i == 0:
239
+ split_node = 0
240
+ new_nodes = [0, 1]
241
+ min_priority = 1e40
242
+ split_subset = np.arange(n_features)
243
+ else:
244
+ leaves = np.where(is_leaf == 1)[0]
245
+ temp_priority = priorities[leaves]
246
+
247
+ if len(np.where(temp_priority > 0)[0]) > 0:
248
+ min_priority = np.min(temp_priority[temp_priority > 0])
249
+ split_node = np.argmax(temp_priority)
250
+ else: # There are no more candidates stop early
251
+ min_priority = -1
252
+ split_node = 0
253
+
254
+ if temp_priority[split_node] < 0 or min_priority == -1:
255
+ logger.warning(
256
+ f"Cannot generate all {self.k} leaf clusters, stopping at {i} leaf clusters"
257
+ )
258
+
259
+ Ws = [i for i in Ws if i is not None]
260
+ W_buffer = [i for i in W_buffer if i is not None]
261
+
262
+ Hs = [i for i in Hs if i is not None]
263
+ H_buffer = [i for i in H_buffer if i is not None]
264
+
265
+ # Resize attributes
266
+ tree = tree[:, :result_used]
267
+ splits = splits[:result_used]
268
+ is_leaf = is_leaf[:result_used]
269
+ clusters = clusters[:result_used]
270
+ priorities = priorities[:result_used]
271
+
272
+ self.tree_ = tree.T
273
+ self.splits_ = splits
274
+ self.is_leaf_ = is_leaf
275
+ self.n_nodes_ = self.is_leaf_.shape[0]
276
+ self.n_leaves_ = int(np.count_nonzero(self.is_leaf_))
277
+ self.clusters_ = self._stack_clusters(clusters)
278
+ self.Ws_ = np.array(Ws)
279
+ self.Hs_ = np.array(Hs)
280
+ self.W_buffer_ = np.array(W_buffer)
281
+ self.H_buffer_ = self._stack_H_buffer(H_buffer)
282
+ self.priorities_ = priorities
283
+ return self
284
+
285
+ split_node = leaves[split_node] # Attempt to split this node
286
+ is_leaf[split_node] = 0
287
+ W = W_buffer[split_node]
288
+ H = H_buffer[split_node]
289
+
290
+ # Find which features are clustered on this node
291
+ split_subset = clusters[split_node]
292
+ new_nodes = [result_used, result_used + 1]
293
+ tree[:, split_node] = new_nodes
294
+
295
+ result_used += 2
296
+ # For each row find where it is more greatly represented
297
+ cluster_subset = np.argmax(H, axis=0)
298
+
299
+ subset_0 = np.flatnonzero(cluster_subset == 0)
300
+ subset_1 = np.flatnonzero(cluster_subset == 1)
301
+ ls0 = len(subset_0)
302
+ ls1 = len(subset_1)
303
+
304
+ if i == 0:
305
+ pt.add_branch("Root", new_nodes[0], ls0)
306
+ pt.add_branch("Root", new_nodes[1], ls1)
307
+ else:
308
+ pt.add_branch(split_node, new_nodes[0], ls0)
309
+ pt.add_branch(split_node, new_nodes[1], ls1)
310
+
311
+ clusters[new_nodes[0]] = split_subset[subset_0]
312
+ clusters[new_nodes[1]] = split_subset[subset_1]
313
+ Ws[new_nodes[0]] = W[:, 0]
314
+ Ws[new_nodes[1]] = W[:, 1]
315
+
316
+ # These will not have shape of (2, n_features) because they are fitting a subset
317
+ # Create zero filled array of shape (2, n_features)
318
+ h_temp = np.zeros(shape=(2, self.n_features_), dtype=self.dtype)
319
+ # Which features are present in H
320
+
321
+ h_temp[0, split_subset] = H[0]
322
+ h_temp[1, split_subset] = H[1]
323
+
324
+ Hs[new_nodes[0]] = h_temp[0]
325
+ Hs[new_nodes[1]] = h_temp[1]
326
+
327
+ splits[i] = split_node
328
+ is_leaf[new_nodes] = 1
329
+
330
+ subset = clusters[new_nodes[0]]
331
+ (
332
+ subset,
333
+ W_buffer_one,
334
+ H_buffer_one,
335
+ priority_one,
336
+ ) = trial_split_sklearn(
337
+ min_priority=min_priority,
338
+ X=X,
339
+ subset=subset,
340
+ W_parent=W[:, 0],
341
+ random_state=self.random_state,
342
+ trial_allowance=self.trial_allowance,
343
+ unbalanced=self.unbalanced,
344
+ dtype=self.dtype,
345
+ tol=self.tol,
346
+ maxiter=self.maxiter,
347
+ init=self.init,
348
+ alpha_W=self.alpha_W,
349
+ alpha_H=self.alpha_H,
350
+ )
351
+ clusters[new_nodes[0]] = subset
352
+ W_buffer[new_nodes[0]] = W_buffer_one
353
+ H_buffer[new_nodes[0]] = H_buffer_one
354
+ priorities[new_nodes[0]] = priority_one
355
+
356
+ subset = clusters[new_nodes[1]]
357
+ (
358
+ subset,
359
+ W_buffer_one,
360
+ H_buffer_one,
361
+ priority_one,
362
+ ) = trial_split_sklearn(
363
+ min_priority=min_priority,
364
+ X=X,
365
+ subset=subset,
366
+ W_parent=W[:, 1],
367
+ random_state=self.random_state,
368
+ trial_allowance=self.trial_allowance,
369
+ unbalanced=self.unbalanced,
370
+ dtype=self.dtype,
371
+ tol=self.tol,
372
+ maxiter=self.maxiter,
373
+ init=self.init,
374
+ alpha_W=self.alpha_W,
375
+ alpha_H=self.alpha_H,
376
+ )
377
+ clusters[new_nodes[1]] = subset
378
+ W_buffer[new_nodes[1]] = W_buffer_one
379
+ H_buffer[new_nodes[1]] = H_buffer_one
380
+ priorities[new_nodes[1]] = priority_one
381
+ self.tree_ = tree.T
382
+ self.splits_ = splits
383
+ self.is_leaf_ = is_leaf
384
+ self.clusters_ = self._stack_clusters(clusters)
385
+ self.Ws_ = np.array(Ws)
386
+ self.Hs_ = np.array(Hs)
387
+ self.W_buffer_ = np.array(W_buffer)
388
+ self.H_buffer_ = self._stack_H_buffer(H_buffer)
389
+ self.priorities_ = priorities
390
+ self.n_nodes_ = self.is_leaf_.shape[0]
391
+ self.n_leaves_ = int(np.count_nonzero(self.is_leaf_))
392
+ return self
393
+
394
+ def _stack_clusters(self, clusters: list[npt.NDArray | None]) -> npt.NDArray:
395
+ if not self.n_features_:
396
+ raise ValueError("n_features_ not set before _stack_clusters called")
397
+ result = np.zeros((len(clusters), self.n_features_), dtype=np.int64)
398
+ for i, cluster in enumerate(clusters):
399
+ result[i, cluster] = 1
400
+ return result
401
+
402
+ def _stack_H_buffer(self, buffer: list) -> npt.NDArray:
403
+ if self.n_features_ is None:
404
+ raise ValueError("n_features_ not set before _stack_H_buffer called")
405
+ if self.clusters_ is None:
406
+ raise ValueError("clusters_ not set before _stack_H_buffer called")
407
+
408
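+ # Each buffered H only covers the features in that node's cluster; scatter its two rows back
+ # into a zero-filled array of full width n_features_ so every node's buffer has the same shape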
+ result = np.zeros((len(buffer), 2, self.n_features_), dtype=self.dtype)
409
+ for i, buff in enumerate(buffer):
410
+ cluster_nz_idx = np.argwhere(self.clusters_[i]).flatten()
411
+ result[i, 0, cluster_nz_idx] = buff[0, :]
412
+ result[i, 1, cluster_nz_idx] = buff[1, :]
413
+ return result
414
+
415
+ def top_features_in_node(self, node: int, n: int = 10) -> list[tuple]:
416
+ """
417
+ For a given node, return the top n features and values
418
+ """
419
+
420
+ if self.Hs_ is None:
421
+ raise ValueError("Model not fitted, Hs_ is None")
422
+
423
+ node_i = self.Hs_[node]
424
+ ranks = node_i.argsort()[::-1][:n]
425
+ return [(i, node_i[i]) for i in ranks if node_i[i] > 0]
426
+
427
+ def top_nodes_in_feature(
428
+ self,
429
+ feature_idx: int | str,
430
+ n: int = 10,
431
+ leaves_only: bool = True,
432
+ ) -> list[tuple]:
433
+ """
434
+ Returns the top nodes for a specified feature
435
+ """
436
+ if self.Hs_ is None:
437
+ raise ValueError("Model not fitted, Hs_ is None")
438
+
439
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
440
+ node_weights = self.Hs_.T[feature_idx]
441
+ ranks = node_weights.argsort()[::-1]
442
+ if leaves_only:
443
+ ranks = ranks[np.isin(ranks, node_leaf_idx)]
444
+
445
+ ranks = ranks[:n]
446
+
447
+ return [(i, node_weights[i]) for i in ranks if node_weights[i] > 0]
448
+
449
+ def top_nodes_in_samples(self, n: int = 10, leaves_only: bool = True):
450
+ """
451
+ Returns the top nodes for each sample.
452
+ """
453
+
454
+ if self.Ws_ is None or self.n_nodes_ is None:
455
+ raise ValueError("Model not fitted, Ws_ is None")
456
+
457
+ # Idx of leaves
458
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
459
+ # Keep map of enumerated -> actual cluster
460
+ if leaves_only:
461
+ node_map = dict(enumerate(node_leaf_idx))
462
+ else:
463
+ node_map = dict(enumerate(range(self.n_nodes_)))
464
+
465
+ # A dictionary of {sample : [top_nodes]}
466
+
467
+ output = {}
468
+
469
+ # Ws_ is shape n_nodes, n_samples
470
+ # Transpose weights so it has samples as rows, nodes as columns
471
+
472
+ weights = self.Ws_[node_leaf_idx].T if leaves_only else self.Ws_.T
473
+
474
+ # argsort each row, reverse for descending order, and keep the top n entries per row
475
+ sample_tops = weights.argsort()[:, ::-1][:, :n]
476
+
477
+ # Create an array with samples as rows, top n weights as columns
478
+ sample_top_weights = np.take_along_axis(weights, sample_tops, axis=1)
479
+
480
+ for sample_idx, (node_ids, node_weights) in enumerate(
481
+ zip(sample_tops, sample_top_weights, strict=True)
482
+ ):
483
+ tops = [
484
+ (node_map[node_id], weight)
485
+ for node_id, weight in zip(node_ids, node_weights, strict=True)
486
+ if weight > 0
487
+ ]
488
+ tops.sort(key=itemgetter(1), reverse=True)
489
+ output[sample_idx] = tops
490
+
491
+ return output
492
+
493
+ def top_samples_in_nodes(self, n: int = 10, leaves_only: bool = True):
494
+ """
495
+ Returns the top samples for each node
496
+ """
497
+
498
+ if self.Ws_ is None:
499
+ raise ValueError("Model not fitted, Ws_ is None")
500
+
501
+ # Idx of leaves
502
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
503
+
504
+ # A dictionary of {nodes : [sample]}
505
+
506
+ output = {}
507
+
508
+ # Ws_ is shape n_nodes, n_samples
509
+
510
+ weights = self.Ws_
511
+
512
+ # argsort each row, reverse for descending order, and keep the top n entries per row
513
+ node_tops = weights.argsort()[:, ::-1][:, :n]
514
+
515
+ # Create an array with nodes as rows, top n sample weights as columns
516
+ node_top_weights = np.take_along_axis(weights, node_tops, axis=1)
517
+
518
+ for node_idx, (sample_ids, sample_weights) in enumerate(
519
+ zip(node_tops, node_top_weights, strict=True)
520
+ ):
521
+ if leaves_only and node_idx not in node_leaf_idx:
522
+ continue
523
+ tops = [
524
+ (sample_id, weight)
525
+ for sample_id, weight in zip(sample_ids, sample_weights, strict=True)
526
+ if weight > 0
527
+ ]
528
+ tops.sort(key=itemgetter(1), reverse=True)
529
+ # Decode samples if available
530
+
531
+ output[node_idx] = tops
532
+
533
+ return output
534
+
535
+ def top_discriminative_samples_in_node(
536
+ self,
537
+ node: int,
538
+ n: int = 10,
539
+ sign: Literal["positive", "negative", "abs"] = "abs",
540
+ ) -> "list[DiscriminatedSample]":
541
+ """
542
+ Computes most discriminative samples (node vs rest)
543
+
544
+ Parameters
545
+ ----------
546
+ node
547
+ n
548
+ The number of samples to return
549
+ sign
550
+ One of `['positive', 'negative', 'abs']`.
551
+
552
+ Returns
553
+ --------
554
+ list of ``DiscriminatedSample`` with fields::
555
+
556
+ sample: Any
557
+ node: int
558
+ node_value: float
559
+ others_value: float
560
+
561
+ """
562
+
563
+ if self.Ws_ is None:
564
+ raise ValueError("Model not fitted, Ws_ is None")
565
+ if sign not in ("positive", "negative", "abs"):
566
+ raise ValueError("Sign must be one of 'positive', 'negative' or 'abs'")
567
+
568
+ # Masks
569
+ member_mask = np.array(node, dtype=np.int64)
570
+ non_member_mask = np.array(
571
+ [x for x in np.arange(0, self.n_nodes_) if x != node]
572
+ )
573
+
574
+ member_values = self.Ws_[member_mask].ravel()
575
+ other_means = self.Ws_[non_member_mask].mean(axis=0)
576
+
577
+ diffs = (
578
+ np.abs(member_values - other_means)
579
+ if sign == "positive"
580
+ else member_values - other_means
581
+ if sign == "positive"
582
+ else other_means - member_values
583
+ )
584
+
585
+ diff_tops = diffs.argsort()[::-1][:n]
586
+
587
+ return [
588
+ DiscriminatedSample(
589
+ sample=diff,
590
+ node=node,
591
+ node_value=member_values[diff],
592
+ others_value=other_means[diff],
593
+ )
594
+ for diff in diff_tops
595
+ ]
596
+
597
+ def cluster_features(
598
+ self,
599
+ leaves_only: bool = True,
600
+ include_outliers: bool = True,
601
+ ) -> dict[int, list[int]]:
602
+ """
603
+ Returns the features assigned as a cluster to nodes
604
+
605
+ Parameters
606
+ ----------
607
+ leaves_only
608
+ Whether to return only leaf nodes
609
+ include_outliers
610
+ If True, features without a node assignment are returned under the key -1
611
+
612
+ """
613
+
614
+ if self.clusters_ is None:
615
+ raise ValueError("Model not fitted, clusters_ is None")
616
+
617
+ output = defaultdict(list)
618
+
619
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
620
+
621
+ clusters = self.clusters_[node_leaf_idx] if leaves_only else self.clusters_
622
+
623
+ assignments = np.argwhere(clusters)
624
+
625
+ for cluster_idx, feature_idx in assignments:
626
+ output[cluster_idx].append(feature_idx)
627
+
628
+ if include_outliers:
629
+ outliers = np.where(clusters.sum(axis=0) == 0)[0]
630
+ output[-1] = list(outliers)
631
+
632
+ return dict(output)
633
+
634
+ def cluster_assignments(
635
+ self,
636
+ leaves_only: bool = True,
637
+ include_outliers: bool = True,
638
+ ) -> dict[int, set[int]]:
639
+ """
640
+ Returns a mapping of features and their assigned cluster(s)
641
+
642
+ Parameters
643
+ ----------
644
+ leaves_only
645
+ Whether to return only leaf nodes
646
+ include_outliers
647
+ If True, include feature_idx keys that are not assigned a cluster.
648
+
649
+ """
650
+
651
+ if self.clusters_ is None:
652
+ raise ValueError("Model not fitted, clusters_ is None")
653
+
654
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
655
+
656
+ clusters = self.clusters_
657
+ output = defaultdict(set)
658
+ assignments = np.argwhere(clusters)
659
+ if leaves_only:
660
+ assignments = assignments[
661
+ np.where(np.isin(assignments[:, 0], node_leaf_idx))[0]
662
+ ]
663
+
664
+ for cluster_idx, feature_idx in assignments:
665
+ output[cluster_idx].add(feature_idx)
666
+
667
+ if include_outliers:
668
+ outliers = np.where(clusters.sum(axis=0) == 0)[0]
669
+ for outlier in outliers:
670
+ output[outlier] = set()
671
+
672
+ return dict(output)
hnmf/progress_tree.py ADDED
@@ -0,0 +1,54 @@
1
+
2
+ from rich.live import Live
3
+ from rich.tree import Tree
4
+
5
+
6
+ class ProgressTree:
7
+
8
+ tree: Tree | None
9
+ live: Live | None
10
+ branches: dict[str | int, Tree]
11
+
12
+
13
+ def __init__(self):
14
+ self.live = None
15
+ self.tree = None
16
+ self.branches = {}
17
+
18
+ def __enter__(self):
19
+ self.tree = Tree("", guide_style="bold blue", hide_root=True)
20
+ self._get_or_create_branch("Root", None, None)
21
+ self.live = Live(self.tree)
22
+ self.live.start()
23
+ return self
24
+
25
+ def __exit__(self, exc_type, exc_val, exc_tb):
26
+ if self.live:
27
+ self.live.stop()
28
+
29
+ def _get_or_create_branch(
30
+ self, k: str | int, source: Tree | None, desc: int | None,
31
+ ) -> "Tree":
32
+ branch = self.branches.get(k, None)
33
+ if branch:
34
+ return branch
35
+ display_name = f"[green]{k}" if not desc else f"[green]{k}:({desc})"
36
+ if self.tree is None:
37
+ raise RuntimeError("ProgressTree context not entered.")
38
+ branch = source.add(display_name) if source else self.tree.add(display_name)
39
+ self.branches[k] = branch
40
+ return branch
41
+
42
+ def add_branch(
43
+ self,
44
+ source: str | int,
45
+ target: int | str,
46
+ desc: int | None,
47
+ ):
48
+
49
+ if self.tree is None or self.live is None:
50
+ raise RuntimeError("ProgressTree context not entered.")
51
+
52
+ source_branch = self._get_or_create_branch(source, None, None)
53
+ self._get_or_create_branch(target, source_branch, desc)
54
+ self.live.update(self.tree)
hnmf-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: hNMF
3
+ Version: 0.3.0
4
+ Summary: Hierarchical NMF
5
+ Project-URL: Homepage, https://github.com/estasney/hNMF
6
+ Author-email: Eric Stasney <estasney@users.noreply.github.com>
7
+ License: MIT
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Python: >=3.12
12
+ Requires-Dist: networkx>=2.3
13
+ Requires-Dist: numpy>=1.24.4
14
+ Requires-Dist: rich>=14.2.0
15
+ Requires-Dist: scikit-learn>=1.3.2
16
+ Requires-Dist: scipy>=1.10.1
17
+ Description-Content-Type: text/markdown
18
+
19
+ # hierarchical-nmf-python
20
+ * A fork of https://github.com/rudvlf0413/hierarchical-nmf-python
21
+ * with a familiar scikit-learn interface
22
+
23
+ ## Installation
24
+ ```bash
25
+ pip install hnmf
26
+ ```
27
+
28
+ ## Usage
29
+ ### 20 Newsgroups
30
+
31
+ ```python
32
+ from sklearn.datasets import fetch_20newsgroups
33
+ from sklearn.feature_extraction.text import TfidfVectorizer
34
+ from hnmf import HierarchicalNMF
35
+
36
+ n_features = 1000
37
+ n_leaves = 20
38
+
39
+ data, _ = fetch_20newsgroups(shuffle=True, random_state=1,
40
+ remove=('headers', 'footers', 'quotes'),
41
+ return_X_y=True)
42
+
43
+ # Use tf-idf features for NMF.
44
+ tfidf = TfidfVectorizer(max_df=0.95, min_df=2,
45
+ max_features=n_features,
46
+ stop_words='english')
47
+
48
+ X = tfidf.fit_transform(data)
49
+ id2feature = {i: token for i, token in enumerate(tfidf.get_feature_names_out())}
50
+
51
+ # hNMF
52
+ model = HierarchicalNMF(k=n_leaves)
53
+ model.fit(X)
54
+ model.cluster_features()
55
+ ```
56
+
57
+ ## Documentation
58
+
59
+ To build the documentation:
60
+ ```bash
61
+ mkdocs build
62
+ ```
63
+
64
+ To preview locally:
65
+ ```bash
66
+ mkdocs serve
67
+ ```
68
+
69
+ The documentation will be built to the `docs/` folder for GitHub Pages.
70
+
71
+ ## Reference
72
+ - Paper: [Fast rank-2 nonnegative matrix factorization for hierarchical document clustering](https://smallk.github.io/papers/hierNMF2.pdf)
73
+
74
+ - Originally adapted from MATLAB: https://github.com/dakuang/hiernmf2
hnmf-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
1
+ hnmf/__init__.py,sha256=hAyoIQt-2esjz3EtMVCxcyJmZZSXoaU8_I3Fqu0JrRM,52
2
+ hnmf/helpers.py,sha256=cZr69WdFgAbpmb8XjuuuUSC5maoYMjsEi02OEm5l5cU,17896
3
+ hnmf/model.py,sha256=bKwE-alQ8kx2Xf6NBhe7ZKZI_-ebamjNJM4erj-j7RA,22960
4
+ hnmf/progress_tree.py,sha256=NrkMPOgp2HfrhiooWjD-4t3zCN1tXtg3xcrQ3lMNW-w,1562
5
+ hnmf-0.3.0.dist-info/METADATA,sha256=MnN-F7VxfPsL6uzAzIAgaaE_ceFccFf61lRqxnIVKds,1913
6
+ hnmf-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
7
+ hnmf-0.3.0.dist-info/RECORD,,
hnmf-0.3.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any