hNMF 0.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hnmf/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from hnmf.model import *
2
+ from hnmf.helpers import *
hnmf/helpers.py ADDED
@@ -0,0 +1,542 @@
1
+ import logging
2
+ import warnings
3
+ from collections.abc import Callable
4
+ from typing import TYPE_CHECKING, Literal, TypeAlias
5
+
6
+ import numpy as np
7
+ import numpy.typing as npt
8
+ from numpy.linalg import matrix_rank, norm, svd
9
+ from numpy.random import mtrand
10
+ from scipy import sparse as sp
11
+ from sklearn.decomposition import non_negative_factorization
12
+
13
+ if TYPE_CHECKING:
14
+ import networkx as nx
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ AnlsAlgorithm: TypeAlias = Callable[
19
+ [
20
+ npt.NDArray[np.float64],
21
+ npt.NDArray[np.float64],
22
+ npt.NDArray[np.float64],
23
+ npt.DTypeLike,
24
+ ],
25
+ npt.NDArray[np.float64],
26
+ ]
27
+
28
+
29
+ def anls_entry_rank2_precompute(
30
+ left: npt.NDArray[np.float64],
31
+ right: npt.NDArray[np.float64],
32
+ H: npt.NDArray[np.float64],
33
+ dtype: npt.DTypeLike,
34
+ ) -> npt.NDArray[np.float64]:
35
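+ # Closed-form solve of the 2x2 normal equations `left @ h = right[i]` for every row of `right`,
+ # with nonnegativity enforced by falling back to the best single-column solution (`solve_either`)
+ # whenever the unconstrained 2x2 solution contains a negative entry.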
+ eps = 1e-6
36
+ n = right.shape[0]
37
+
38
+ solve_either = np.zeros((n, 2), dtype=dtype)
39
+ solve_either[:, 0] = right[:, 0] / left[0, 0]
40
+ solve_either[:, 1] = right[:, 0] / left[1, 1]
41
+ cosine_either = solve_either * np.sqrt(np.array([left[0, 0], left[1, 1]]))
42
+ choose_first = cosine_either[:, 0] >= cosine_either[:, 1]
43
+ solve_either[choose_first, 1] = 0
44
+ solve_either[np.logical_not(choose_first), 0] = 0
45
+
46
+ if np.abs(left[0, 0]) < eps and np.abs(left[0, 1]) < eps:
47
+ logger.error(
48
+ "Error: The 2x2 matrix is close to singular or the input data matrix has tiny values",
49
+ )
50
+ else:
51
+ if np.abs(left[0, 0]) >= np.abs(left[0, 1]):
52
+ t = left[1, 0] / left[0, 0]
53
+ a2 = left[0, 0] + t * left[1, 0]
54
+ b2 = left[0, 1] + t * left[1, 1]
55
+ d2 = left[1, 1] - t * left[0, 1]
56
+ if np.abs(d2 / a2) < eps:
57
+ logger.error("Error: The 2x2 matrix is close to singular")
58
+
59
+ e2 = right[:, 0] + t * right[:, 1]
60
+ f2 = right[:, 1] - t * right[:, 0]
61
+ else:
62
+ ct = left[0, 0] / left[1, 0]
63
+ a2 = left[1, 0] + ct * left[0, 0]
64
+ b2 = left[1, 1] + ct * left[0, 1]
65
+ d2 = -left[0, 1] + ct * left[1, 1]
66
+ if np.abs(d2 / a2) < eps:
67
+ logger.error("Error: The 2x2 matrix is close to singular")
68
+
69
+ e2 = right[:, 1] + ct * right[:, 0]
70
+ f2 = -right[:, 0] + ct * right[:, 1]
71
+
72
+ H[:, 1] = f2 * (1 / d2)
73
+ H[:, 0] = (e2 - b2 * H[:, 1]) * (1 / a2)
74
+
75
+ use_either = np.logical_not(np.all(H > 0, axis=1))
76
+ H[use_either, :] = solve_either[use_either, :]
77
+
78
+ return H
79
+
80
+
81
+ def trial_split_sklearn(
82
+ min_priority: float,
83
+ X: npt.NDArray,
84
+ subset: npt.NDArray[np.int64],
85
+ W_parent: npt.NDArray[np.float64],
86
+ random_state: np.random.RandomState,
87
+ trial_allowance: int,
88
+ unbalanced: float,
89
+ dtype: npt.DTypeLike,
90
+ tol: float,
91
+ maxiter: int,
92
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar"],
93
+ alpha_W: float,
94
+ alpha_H: float | Literal["same"],
95
+ ):
96
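+ # Outlier-aware splitting (Algorithm 3 in the reference paper): try a rank-2 split of `subset`;
+ # if the split is highly unbalanced and the small side cannot itself be split well (its priority
+ # falls below min_priority), treat the small side as outliers, drop those features, and retry,
+ # up to trial_allowance times. If every trial fails, restore the original subset and mark the
+ # node as unsplittable (priority -2).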
+ m: int = X.shape[0]
97
+ trial = 0
98
+ subset_backup = subset
99
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
100
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
101
+ priority_one = -2.0
102
+ while trial < trial_allowance:
103
+ cluster_subset, W_buffer_one, H_buffer_one, priority_one = split_once_sklearn(
104
+ X=X,
105
+ subset=subset,
106
+ W_parent=W_parent,
107
+ random_state=random_state,
108
+ dtype=dtype,
109
+ tol=tol,
110
+ maxiter=maxiter,
111
+ init=init,
112
+ alpha_W=alpha_W,
113
+ alpha_H=alpha_H,
114
+ )
115
+ if priority_one < 0:
116
+ break
117
+
118
+ unique_cluster_subset = np.unique(cluster_subset)
119
+ if len(unique_cluster_subset) != 2:
120
+ logger.error("Invalid number of unique sub-clusters!")
121
+
122
+ length_cluster1 = len(np.where(cluster_subset == unique_cluster_subset[0])[0])
123
+ length_cluster2 = len(np.where(cluster_subset == unique_cluster_subset[1])[0])
124
+ if min(length_cluster1, length_cluster2) < unbalanced * len(cluster_subset):
125
+ logger.debug(
126
+ f"Below imbalanced threshold: {unbalanced * len(cluster_subset)}",
127
+ )
128
+ idx_small = np.argmin(np.array([length_cluster1, length_cluster2]))
129
+ subset_small = np.where(cluster_subset == unique_cluster_subset[idx_small])[
130
+ 0
131
+ ]
132
+ subset_small = subset[subset_small]
133
+ _, _, _, priority_one_small = split_once_sklearn(
134
+ X=X,
135
+ subset=subset_small,
136
+ W_parent=W_buffer_one[:, idx_small],
137
+ random_state=random_state,
138
+ dtype=dtype,
139
+ tol=tol,
140
+ maxiter=maxiter,
141
+ init=init,
142
+ alpha_W=0.0,
143
+ alpha_H=0.0,
144
+ )
145
+ if priority_one_small < min_priority:
146
+ trial += 1
147
+ if trial < trial_allowance:
148
+ logger.debug(f"Dropped {len(subset_small)} features...")
149
+ subset = np.setdiff1d(subset, subset_small)
150
+ else:
151
+ break
152
+ else:
153
+ break
154
+
155
+ if trial == trial_allowance:
156
+ logger.debug(
157
+ f"Reached trial allowance, recycled {len(subset_backup) - len(subset)} features",
158
+ )
159
+ subset = subset_backup
160
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
161
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
162
+ priority_one = -2
163
+
164
+ return subset, W_buffer_one, H_buffer_one, priority_one
165
+
166
+
167
+ def split_once_sklearn(
168
+ X: npt.NDArray,
169
+ subset: npt.NDArray[np.int64],
170
+ W_parent: npt.NDArray[np.float64],
171
+ random_state: mtrand.RandomState,
172
+ dtype: npt.DTypeLike,
173
+ tol: float,
174
+ maxiter: int,
175
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar", "custom"],
176
+ alpha_W: float,
177
+ alpha_H: float | Literal["same"],
178
+ ) -> tuple[
179
+ npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], float
180
+ ]:
181
+ m = X.shape[0]
182
+ if len(subset) <= 3:
183
+ cluster_subset = np.ones(len(subset), dtype=dtype)
184
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
185
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
186
+ priority_one = -1
187
+ else:
188
+ term_subset = np.flatnonzero(np.sum(X[:, subset], axis=1))
189
+ X_subset = X[term_subset, :][:, subset]
190
+ W = random_state.rand(len(term_subset), 2)
191
+ H = random_state.rand(2, len(subset))
192
+ W, H, _n_iter = non_negative_factorization(
193
+ X=X_subset,
194
+ W=W,
195
+ H=H,
196
+ n_components=2,
197
+ init=init,
198
+ update_H=True,
199
+ solver="cd",
200
+ beta_loss=2,
201
+ tol=tol,
202
+ max_iter=maxiter,
203
+ alpha_W=alpha_W,
204
+ alpha_H=alpha_H,
205
+ l1_ratio=0.0,
206
+ random_state=random_state,
207
+ verbose=0,
208
+ shuffle=False,
209
+ )
210
+ cluster_subset = np.argmax(H, axis=0)
211
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
212
+ W_buffer_one[term_subset, :] = W
213
+ H_buffer_one = H
214
+ if len(np.unique(cluster_subset)) > 1:
215
+ priority_one = compute_priority(W_parent, W_buffer_one, dtype=dtype)
216
+ else:
217
+ priority_one = -1
218
+ return cluster_subset, W_buffer_one, H_buffer_one, priority_one
219
+
220
+
221
+ def trial_split(
222
+ min_priority: float,
223
+ X: npt.NDArray[np.float64],
224
+ subset: npt.NDArray[np.int64],
225
+ W_parent: npt.NDArray[np.float64],
226
+ random_state: np.random.RandomState,
227
+ trial_allowance: int,
228
+ unbalanced: float,
229
+ dtype: npt.DTypeLike,
230
+ anls_alg: AnlsAlgorithm,
231
+ vec_norm: float,
232
+ normW: bool,
233
+ tol: float,
234
+ maxiter: int,
235
+ ) -> tuple[npt.NDArray[np.int64], npt.NDArray, npt.NDArray, float]:
236
+ m = X.shape[0]
237
+ trial = 0
238
+ subset_backup = subset
239
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
240
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
241
+ priority_one = -2.0
242
+ while trial < trial_allowance:
243
+ cluster_subset, W_buffer_one, H_buffer_one, priority_one = split_once(
244
+ X=X,
245
+ subset=subset,
246
+ W_parent=W_parent,
247
+ random_state=random_state,
248
+ dtype=dtype,
249
+ anls_alg=anls_alg,
250
+ vec_norm=vec_norm,
251
+ normW=normW,
252
+ tol=tol,
253
+ maxiter=maxiter,
254
+ )
255
+ if priority_one < 0:
256
+ break
257
+
258
+ unique_cluster_subset = np.unique(cluster_subset)
259
+ if len(unique_cluster_subset) != 2:
260
+ logger.warning("Invalid number of unique sub-clusters!")
261
+
262
+ length_cluster1 = len(np.where(cluster_subset == unique_cluster_subset[0])[0])
263
+ length_cluster2 = len(np.where(cluster_subset == unique_cluster_subset[1])[0])
264
+ if min(length_cluster1, length_cluster2) < unbalanced * len(cluster_subset):
265
+ idx_small = np.argmin(np.array([length_cluster1, length_cluster2]))
266
+ subset_small = np.where(cluster_subset == unique_cluster_subset[idx_small])[
267
+ 0
268
+ ]
269
+ subset_small = subset[subset_small]
270
+ _, _, _, priority_one_small = split_once(
271
+ X=X,
272
+ subset=subset_small,
273
+ W_parent=W_buffer_one[:, idx_small],
274
+ random_state=random_state,
275
+ dtype=dtype,
276
+ anls_alg=anls_alg,
277
+ vec_norm=vec_norm,
278
+ normW=normW,
279
+ maxiter=maxiter,
280
+ tol=tol,
281
+ )
282
+ if priority_one_small < min_priority:
283
+ trial += 1
284
+ if trial < trial_allowance:
285
+ logger.info(f"Dropped {len(subset_small)} documents...")
286
+ subset = np.setdiff1d(subset, subset_small)
287
+ else:
288
+ break
289
+ else:
290
+ break
291
+
292
+ if trial == trial_allowance:
293
+ logger.info(f"Recycled {len(subset_backup) - len(subset)} documents...")
294
+ subset = subset_backup
295
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
296
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
297
+ priority_one = -2
298
+
299
+ return subset, W_buffer_one, H_buffer_one, priority_one
300
+
301
+
302
+ def split_once(
303
+ X: npt.NDArray[np.float64],
304
+ subset: npt.NDArray[np.int64],
305
+ W_parent: npt.NDArray[np.float64],
306
+ random_state: mtrand.RandomState,
307
+ dtype: npt.DTypeLike,
308
+ anls_alg: AnlsAlgorithm,
309
+ vec_norm: float,
310
+ normW: bool,
311
+ tol: float,
312
+ maxiter: int,
313
+ ) -> tuple[npt.NDArray[np.float64], npt.NDArray, npt.NDArray, float]:
314
+ m = X.shape[0]
315
+ if len(subset) <= 3:
316
+ cluster_subset = np.ones(len(subset), dtype=dtype)
317
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
318
+ H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
319
+ priority_one = -1
320
+ else:
321
+ term_subset = np.where(np.sum(X[:, subset], axis=1) != 0)[0]
322
+ X_subset = X[term_subset, :][:, subset]
323
+ W = random_state.rand(len(term_subset), 2)
324
+ H = random_state.rand(2, len(subset))
325
+ W, H = nmfsh_comb_rank2(
326
+ X_subset,
327
+ W,
328
+ H,
329
+ anls_alg=anls_alg,
330
+ vec_norm=vec_norm,
331
+ normW=normW,
332
+ tol=tol,
333
+ maxiter=maxiter,
334
+ dtype=dtype,
335
+ )
336
+ cluster_subset = np.argmax(H, axis=0)
337
+ W_buffer_one = np.zeros((m, 2), dtype=dtype)
338
+ W_buffer_one[term_subset, :] = W
339
+ H_buffer_one = H
340
+ if len(np.unique(cluster_subset)) > 1:
341
+ priority_one = compute_priority(W_parent, W_buffer_one, dtype=dtype)
342
+ else:
343
+ priority_one = -1
344
+
345
+ return cluster_subset, W_buffer_one, H_buffer_one, priority_one
346
+
347
+
348
+ def compute_priority(
349
+ W_parent: npt.NDArray[np.float64],
350
+ W_child: npt.NDArray[np.float64],
351
+ dtype: npt.DTypeLike,
352
+ ) -> float:
353
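+ # Split priority of a node: the product of two modified NDCG scores comparing how each child
+ # column of W_child ranks the rows of X against the ranking given by W_parent; used to decide
+ # which leaf node should be split next.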
+ n = len(W_parent)
354
+ idx_parent = np.argsort(W_parent)[::-1]
355
+ sorted_parent = W_parent[idx_parent]
356
+ idx_child1 = np.argsort(W_child[:, 0])[::-1]
357
+ idx_child2 = np.argsort(W_child[:, 1])[::-1]
358
+
359
+ n_part = len(np.where(W_parent != 0)[0])
360
+ if n_part <= 1:
361
+ priority = -3
362
+ else:
363
+ weight = np.log(np.arange(n, 0, -1))
364
+ first_zero = np.where(sorted_parent == 0)[0]
365
+ if len(first_zero) > 0:
366
+ weight[first_zero[0] :] = 1
367
+
368
+ weight_part = np.zeros(n, dtype=dtype)
369
+ weight_part[:n_part] = np.log(np.arange(n_part, 0, -1))
370
+ idx1 = np.argsort(idx_child1)
371
+ idx2 = np.argsort(idx_child2)
372
+ max_pos = np.maximum(idx1, idx2)
373
+ discount = np.log(n - max_pos[idx_parent] + 1)
374
+ discount[discount == 0] = np.log(2)
375
+ weight /= discount
376
+ weight_part /= discount
377
+
378
+ ndcg1 = NDCG_part(idx_parent, idx_child1, weight, weight_part)
379
+ ndcg2 = NDCG_part(idx_parent, idx_child2, weight, weight_part)
380
+ priority = ndcg1 * ndcg2
381
+
382
+ return priority
383
+
384
+
385
+ def NDCG_part(
386
+ ground: npt.NDArray[np.int64],
387
+ test: npt.NDArray[np.int64],
388
+ weight: npt.NDArray,
389
+ weight_part: npt.NDArray,
390
+ ) -> float:
391
+ seq_idx = np.argsort(ground)
392
+ weight_part = weight_part[seq_idx]
393
+
394
+ n = len(test)
395
+ uncum_score = weight_part[test]
396
+ uncum_score[2:] /= np.log2(np.arange(2, n))
397
+ cum_score = np.sum(uncum_score)
398
+
399
+ ideal_score = np.sort(weight)[::-1]
400
+ ideal_score[2:] /= np.log2(np.arange(2, n))
401
+ cum_ideal_score = np.sum(ideal_score)
402
+
403
+ score = cum_score / cum_ideal_score
404
+ return score
405
+
406
+
407
+ def nmfsh_comb_rank2(
408
+ A: npt.NDArray,
409
+ Winit: npt.NDArray,
410
+ Hinit: npt.NDArray,
411
+ anls_alg: AnlsAlgorithm,
412
+ vec_norm: float,
413
+ normW: bool,
414
+ tol: float,
415
+ maxiter: int,
416
+ dtype: npt.DTypeLike,
417
+ ) -> tuple[npt.NDArray, npt.NDArray]:
418
+ """"""
419
+ eps = 1e-6
420
+ shape: tuple[int, int] = A.shape
421
+ m, n = shape
422
+ W, H = Winit, Hinit
423
+
424
+ if W.shape[1] != 2:
425
+ warnings.warn(
426
+ f"Error: Wrong size of W! Expected shape of (n, 2) but received W of shape ({W.shape[0]}, {W.shape[1]})",
427
+ stacklevel=2,
428
+ )
429
+
430
+ if H.shape[0] != 2:
431
+ warnings.warn(
432
+ f"Error: Wrong size of H! Expected shape of (2, n) but received H of shape ({H.shape[0]}, {H.shape[1]})",
433
+ stacklevel=2,
434
+ )
435
+
436
+ left = H.dot(H.T)
437
+ right = A.dot(H.T)
438
+ for iter_ in range(maxiter):
439
+ if matrix_rank(left) < 2:
440
+ W = np.zeros((m, 2), dtype=dtype)
441
+ H = np.zeros((2, n), dtype=dtype)
442
+ if sp.issparse(A):
443
+ U, _S, V = svd(A.toarray(), full_matrices=False) # type: ignore[attr-defined] # A can be sparse
444
+ else:
445
+ U, _S, V = svd(A, full_matrices=False)
446
+ U, V = U[:, 0], V[0, :]
447
+ if sum(U) < 0:
448
+ U, V = -U, -V
449
+
450
+ W[:, 0] = U
451
+ H[0, :] = V
452
+
453
+ return W, H
454
+
455
+ W = anls_alg(left, right, W, dtype)
456
+ norms_W = norm(W, axis=0)
457
+ if np.min(norms_W) < eps:
458
+ logger.warning("Error: Some column of W is essentially zero")
459
+
460
+ W *= 1.0 / norms_W
461
+ left = W.T.dot(W)
462
+ right = A.T.dot(W)
463
+ if matrix_rank(left) < 2:
464
+ W = np.zeros((m, 2), dtype=dtype)
465
+ H = np.zeros((2, n), dtype=dtype)
466
+ if sp.issparse(A):
467
+ U, _S, V = svd(A.toarray(), full_matrices=False) # type: ignore[attr-defined] # A can be sparse
468
+ else:
469
+ U, _S, V = svd(A, full_matrices=False)
470
+ U, V = U[:, 0], V[0, :]
471
+ if sum(U) < 0:
472
+ U, V = -U, -V
473
+
474
+ W[:, 0] = U
475
+ H[0, :] = V
476
+
477
+ return W, H
478
+
479
+ H = anls_alg(left, right, H.T, dtype).T
480
+ gradH = left.dot(H) - right.T
481
+ left = H.dot(H.T)
482
+ right = A.dot(H.T)
483
+ gradW = W.dot(left) - right
484
+ # initgrad (the projected gradient norm from the first iteration) is set below and kept
+ # thereafter as the reference value for the relative stopping test
485
+ if iter_ == 0:
486
+ gradW_square = np.sum(np.power(gradW[np.logical_or(gradW <= 0, W > 0)], 2))
487
+ gradH_square = np.sum(np.power(gradH[np.logical_or(gradH <= 0, H > 0)], 2))
488
+ initgrad = np.sqrt(gradW_square + gradH_square)
489
+ continue
490
+ gradW_square = np.sum(np.power(gradW[np.logical_or(gradW <= 0, W > 0)], 2))
491
+ gradH_square = np.sum(np.power(gradH[np.logical_or(gradH <= 0, H > 0)], 2))
492
+ projnorm = np.sqrt(gradW_square + gradH_square)
493
+
494
+ if projnorm < tol * initgrad:
495
+ break
496
+
497
+ if vec_norm != 0:
498
+ if normW:
499
+ norms = np.power(np.sum(np.power(W, vec_norm), axis=0), 1 / vec_norm)
500
+ W /= norms
501
+ H *= norms[:, None]
502
+ else:
503
+ norms = np.power(np.sum(np.power(H, vec_norm), axis=1), 1 / vec_norm)
504
+ W *= norms[None, :]
505
+ H /= norms
506
+
507
+ return W, H
508
+
509
+
510
+ def tree_to_nx(tree: npt.NDArray, weights: npt.NDArray | None = None) -> "nx.DiGraph":
511
+ import networkx as nx
512
+
513
+ g = nx.DiGraph()
514
+ g.add_node("Root", name="Root", is_word=False, id="Root")
515
+ for parent_node, row in enumerate(tree, start=0):
516
+ # Here the ith row refers to the ith node as a parent
517
+ parent_id = str(int(parent_node))
518
+ parent_idx = int(parent_node)
519
+ parent_name = f"Node {parent_id}"
520
+ if row.sum() > 0:
521
+ for child in row:
522
+ child_id = str(int(child))
523
+ child_idx = int(child)
524
+ child_name = f"Node {child_id}"
525
+
526
+ if parent_idx not in g.nodes:
527
+ g.add_node(
528
+ parent_idx,
529
+ is_word=False,
530
+ name=parent_name,
531
+ id=parent_id,
532
+ )
533
+ if child_idx not in g.nodes:
534
+ g.add_node(child_idx, is_word=False, name=child_name, id=child_id)
535
+ g.add_edge(parent_idx, child_idx)
536
+ if weights is not None:
537
+ child_weight = weights[child_idx]
538
+ g.nodes[child_idx]["weight"] = child_weight
539
+
540
+ g.add_edge("Root", 0)
541
+ g.add_edge("Root", 1)
542
+ return g
hnmf/model.py ADDED
@@ -0,0 +1,672 @@
1
+ import logging
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from operator import itemgetter
5
+ from typing import Any, Literal, Self
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+ from sklearn.base import BaseEstimator
10
+ from sklearn.decomposition import NMF
11
+
12
+ from hnmf.helpers import (
13
+ trial_split_sklearn,
14
+ )
15
+ from hnmf.progress_tree import ProgressTree
16
+
17
+
18
+ @dataclass(frozen=True, slots=True)
19
+ class DiscriminatedSample:
20
+ sample: Any
21
+ node: int
22
+ node_value: float
23
+ others_value: float
24
+
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class HierarchicalNMF(BaseEstimator):
30
+ k: int
31
+ unbalanced: float
32
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar"]
33
+ solver: Literal["cd", "mu"]
34
+ beta_loss: Literal["FRO", 0, "KL", 1, "IS", 2]
35
+ alpha_W: float
36
+ alpha_H: Literal["same"] | float
37
+ random_state: np.random.RandomState
38
+ trial_allowance: int
39
+ tol: float
40
+ maxiter: int
41
+ dtype: npt.DTypeLike
42
+ n_samples_: int | None
43
+ n_features_: int | None
44
+ n_nodes_: int
45
+ n_leaves_: int
46
+ tree_: npt.NDArray | None
47
+ splits_: npt.NDArray | None
48
+ is_leaf_: npt.NDArray | None
49
+ clusters_: npt.NDArray | None
50
+ Ws_: npt.NDArray | None
51
+ Hs_: npt.NDArray | None
52
+ W_buffer_: npt.NDArray | None
53
+ H_buffer_: npt.NDArray | None
54
+ priorities_: npt.NDArray | None
55
+ id2sample_: dict[int, str] | None
56
+ id2feature_: dict[int, str] | None
57
+ feature2id_: dict[str, int] | None
58
+
59
+ def __init__(
60
+ self,
61
+ k: int,
62
+ unbalanced: float = 0.1,
63
+ init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar"] = None,
64
+ solver: Literal["cd", "mu"] = "cd",
65
+ beta_loss: Literal["FRO", 0, "KL", 1, "IS", 2] = 0,
66
+ alpha_W: float = 0.0,
67
+ alpha_H: Literal["same"] | float = "same",
68
+ random_state: int = 42,
69
+ trial_allowance: int = 100,
70
+ tol: float = 1e-6,
71
+ maxiter: int = 10000,
72
+ dtype: npt.DTypeLike = np.float64,
73
+ ):
74
+ self.k = k
75
+ self.unbalanced = unbalanced
76
+ self.init = init
77
+ self.solver = solver
78
+ self.beta_loss = beta_loss
79
+ self.alpha_W = alpha_W
80
+ self.alpha_H = alpha_H
81
+ self.random_state = np.random.RandomState(seed=random_state)
82
+ self.trial_allowance = trial_allowance
83
+ self.tol = tol
84
+ self.maxiter = maxiter
85
+ self.dtype = dtype
86
+
87
+ self.n_samples_ = None
88
+ self.n_features_ = None
89
+ self.n_nodes_ = 0
90
+ self.n_leaves_ = 0
91
+ self.tree_ = None
92
+ self.splits_ = None
93
+ self.is_leaf_ = None
94
+ self.clusters_ = None
95
+ self.Ws_ = None
96
+ self.Hs_ = None
97
+ self.W_buffer_ = None
98
+ self.H_buffer_ = None
99
+ self.priorities_ = None
100
+ self.id2sample_ = None
101
+ self.id2feature_ = None
102
+ self.feature2id_ = None
103
+
104
+ """
105
+ Implements Hierarchical rank-2 NMF
106
+
107
+ Parameters
108
+ ----------
109
+
110
+ k: int
111
+ The number of desired leaf nodes
112
+ unbalanced : float
113
+ A threshold to determine if one of the two clusters is an outlier set. A smaller value means more tolerance for
114
+ imbalance between two clusters. See parameter beta in Algorithm 3 in the reference paper.
115
+ init : InitMethod
116
+ The initialization method used to initially fill W and H
117
+ solver : NMFSolver
118
+ The solver used to minimize the distance function
119
+ beta_loss : BetaLoss
120
+ Beta divergence to be minimized
121
+ alpha_W : float, defaults to 0.0
122
+ Constant that multiplies the regularization terms of W. Set it to zero (default) to have no regularization on W.
123
+ See `sklearn.decomposition.NMF`_
124
+ alpha_H: float or 'same', defaults to 'same'
125
+ Constant that multiplies the regularization terms of H. Set it to zero to have no regularization on H. If 'same'
126
+ (default), it takes the same value as alpha_W.
127
+ See `sklearn.decomposition.NMF`_
128
+ random_state : int
129
+ random seed
130
+ trial_allowance : int
131
+ Number of trials allowed for removing outliers and splitting a node again. See parameter T in Algorithm 3 in
132
+ the reference paper.
133
+ tol : float
134
+ Tolerance parameter for stopping criterion in each run of NMF.
135
+ maxiter : int
136
+ Maximum number of iteration times in each run of NMF
137
+ dtype : npt.DTypeLike
138
+ Dtype used for numpy arrays
139
+
140
+
141
+ Attributes
142
+ ----------
143
+ tree_ : np.ndarray
144
+ A (2*(k-1))-by-2 matrix that encodes the tree structure. The two entries in the i-th row are the numberings of
145
+ the two children of the node with numbering i (rows of nodes that are never split remain zero). The root node
146
+ is not numbered and is therefore NOT included in ``tree_``; its two children always have numberings 0 and 1.
147
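+ For example, with ``k = 3`` the first split produces nodes 0 and 1; if node 0 is then split
+ into nodes 2 and 3, ``tree_`` has shape (4, 2) with row 0 equal to ``[2, 3]`` and all other
+ rows still zero.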
+
148
+ splits_ :
149
+ An array of length k-1. It keeps track of the numberings of the nodes being split from the 1st split to the
150
+ (k-1)-th split. (The first entry is always 0.)
151
+
152
+ is_leaf_ :
153
+ An array of length 2*(k-1). A "1" at index ``i`` means that the node with numbering ``i`` is a leaf node in the final
154
+ tree generated, and "0" indicates non-leaf nodes in the final tree.
155
+
156
+ clusters_ :
157
+ Array with shape(n_nodes, n_features). A "1" at index ``i`` means that the sample with numbering ``c`` was
158
+ included in this nodes subset
159
+
160
+
161
+ Hs_ :
162
+ Array with shape (n_nodes, n_features)
163
+
164
+ Ws_ :
165
+ Array with shape (n_nodes, n_samples)
166
+
167
+ Notes
168
+ -----
169
+
170
+ ``W`` refers to the decomposed matrix. scikit-learn equivalent of::
171
+
172
+ W = model.fit_transform(X)
173
+
174
+ ``H`` refers to the factorization matrix. scikit-learn equivalent of::
175
+
176
+ model.components_
177
+
178
+
179
+ Adapted from [rank-2]_
180
+
181
+ """
182
+
183
+ def _init_fit(
184
+ self, X: npt.NDArray, term_subset: npt.NDArray
185
+ ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
186
+ if not self.n_samples_:
187
+ raise ValueError("n_samples_ not set before _init_fit called")
188
+
189
+ nmf = NMF(
190
+ n_components=2,
191
+ random_state=self.random_state,
192
+ tol=self.tol,
193
+ max_iter=self.maxiter,
194
+ init=self.init,
195
+ )
196
+
197
+ if len(term_subset) == self.n_samples_:
198
+ W = nmf.fit_transform(X)
199
+ H = nmf.components_
200
+ return W, H
201
+
202
+ W_tmp = nmf.fit_transform(X[term_subset, :])
203
+ H = nmf.components_
204
+ W = np.zeros((self.n_samples_, 2), dtype=self.dtype)
205
+ W[term_subset, :] = W_tmp
206
+
207
+ return W, H
208
+
209
+ def fit(self, X: npt.NDArray) -> Self:
210
+ """
211
+ Fit `HierarchicalNMF` to data
212
+ """
213
+ shape: tuple[int, int] = X.shape
214
+ n_samples, n_features = shape
215
+ self.n_samples_ = n_samples
216
+ self.n_features_ = n_features
217
+
218
+ # TODO Expect different sized ranks
219
+ clusters: list[npt.NDArray[np.int64] | None] = [None] * (2 * (self.k - 1))
220
+ Ws = [None] * (2 * (self.k - 1))
221
+ Hs = [None] * (2 * (self.k - 1))
222
+ W_buffer = [None] * (2 * (self.k - 1))
223
+ H_buffer = [None] * (2 * (self.k - 1))
224
+ priorities = np.zeros(2 * (self.k - 1), dtype=self.dtype)
225
+ is_leaf = np.zeros(2 * (self.k - 1), dtype=np.bool_) # No leaves at start
226
+ tree = np.zeros((2, 2 * (self.k - 1)), dtype=np.int64)
227
+ splits = -np.ones(self.k - 1, dtype=np.int64)
228
+
229
+ # Where X has at least one non-zero
230
+ term_subset = np.flatnonzero(np.sum(X, axis=1))
231
+
232
+ W, H = self._init_fit(X, term_subset)
233
+
234
+ result_used = 0
235
+
236
+ with ProgressTree() as pt:
237
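+ # Greedy tree growth over k - 1 splits: on the first pass the root (all features) is split in
+ # two; afterwards, the leaf with the highest split priority is split into two children using
+ # the rank-2 factors buffered for it, and trial_split_sklearn is run on each new child to
+ # compute the buffers and priority used when that child is considered for splitting later.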
+ for i in range(self.k - 1):
238
+ if i == 0:
239
+ split_node = 0
240
+ new_nodes = [0, 1]
241
+ min_priority = 1e40
242
+ split_subset = np.arange(n_features)
243
+ else:
244
+ leaves = np.where(is_leaf == 1)[0]
245
+ temp_priority = priorities[leaves]
246
+
247
+ if len(np.where(temp_priority > 0)[0]) > 0:
248
+ min_priority = np.min(temp_priority[temp_priority > 0])
249
+ split_node = np.argmax(temp_priority)
250
+ else: # There are no more candidates stop early
251
+ min_priority = -1
252
+ split_node = 0
253
+
254
+ if temp_priority[split_node] < 0 or min_priority == -1:
255
+ logger.warning(
256
+ f"Cannot generate all {self.k} leaf clusters, stopping at {i} leaf clusters"
257
+ )
258
+
259
+ Ws = [i for i in Ws if i is not None]
260
+ W_buffer = [i for i in W_buffer if i is not None]
261
+
262
+ Hs = [i for i in Hs if i is not None]
263
+ H_buffer = [i for i in H_buffer if i is not None]
264
+
265
+ # Resize attributes
266
+ tree = tree[:, :result_used]
267
+ splits = splits[:result_used]
268
+ is_leaf = is_leaf[:result_used]
269
+ clusters = clusters[:result_used]
270
+ priorities = priorities[:result_used]
271
+
272
+ self.tree_ = tree.T
273
+ self.splits_ = splits
274
+ self.is_leaf_ = is_leaf
275
+ self.n_nodes_ = self.is_leaf_.shape[0]
276
+ self.n_leaves_ = int(np.count_nonzero(self.is_leaf_))
277
+ self.clusters_ = self._stack_clusters(clusters)
278
+ self.Ws_ = np.array(Ws)
279
+ self.Hs_ = np.array(Hs)
280
+ self.W_buffer_ = np.array(W_buffer)
281
+ self.H_buffer_ = self._stack_H_buffer(H_buffer)
282
+ self.priorities_ = priorities
283
+ return self
284
+
285
+ split_node = leaves[split_node] # Attempt to split this node
286
+ is_leaf[split_node] = 0
287
+ W = W_buffer[split_node]
288
+ H = H_buffer[split_node]
289
+
290
+ # Find which features are clustered on this node
291
+ split_subset = clusters[split_node]
292
+ new_nodes = [result_used, result_used + 1]
293
+ tree[:, split_node] = new_nodes
294
+
295
+ result_used += 2
296
+ # For each row find where it is more greatly represented
297
+ cluster_subset = np.argmax(H, axis=0)
298
+
299
+ subset_0 = np.flatnonzero(cluster_subset == 0)
300
+ subset_1 = np.flatnonzero(cluster_subset == 1)
301
+ ls0 = len(subset_0)
302
+ ls1 = len(subset_1)
303
+
304
+ if i == 0:
305
+ pt.add_branch("Root", new_nodes[0], ls0)
306
+ pt.add_branch("Root", new_nodes[1], ls1)
307
+ else:
308
+ pt.add_branch(split_node, new_nodes[0], ls0)
309
+ pt.add_branch(split_node, new_nodes[1], ls1)
310
+
311
+ clusters[new_nodes[0]] = split_subset[subset_0]
312
+ clusters[new_nodes[1]] = split_subset[subset_1]
313
+ Ws[new_nodes[0]] = W[:, 0]
314
+ Ws[new_nodes[1]] = W[:, 1]
315
+
316
+ # These will not have shape of (2, n_features) because they are fitting a subset
317
+ # Create zero filled array of shape (2, n_features)
318
+ h_temp = np.zeros(shape=(2, self.n_features_), dtype=self.dtype)
319
+ # Which features are present in H
320
+
321
+ h_temp[0, split_subset] = H[0]
322
+ h_temp[1, split_subset] = H[1]
323
+
324
+ Hs[new_nodes[0]] = h_temp[0]
325
+ Hs[new_nodes[1]] = h_temp[1]
326
+
327
+ splits[i] = split_node
328
+ is_leaf[new_nodes] = 1
329
+
330
+ subset = clusters[new_nodes[0]]
331
+ (
332
+ subset,
333
+ W_buffer_one,
334
+ H_buffer_one,
335
+ priority_one,
336
+ ) = trial_split_sklearn(
337
+ min_priority=min_priority,
338
+ X=X,
339
+ subset=subset,
340
+ W_parent=W[:, 0],
341
+ random_state=self.random_state,
342
+ trial_allowance=self.trial_allowance,
343
+ unbalanced=self.unbalanced,
344
+ dtype=self.dtype,
345
+ tol=self.tol,
346
+ maxiter=self.maxiter,
347
+ init=self.init,
348
+ alpha_W=self.alpha_W,
349
+ alpha_H=self.alpha_H,
350
+ )
351
+ clusters[new_nodes[0]] = subset
352
+ W_buffer[new_nodes[0]] = W_buffer_one
353
+ H_buffer[new_nodes[0]] = H_buffer_one
354
+ priorities[new_nodes[0]] = priority_one
355
+
356
+ subset = clusters[new_nodes[1]]
357
+ (
358
+ subset,
359
+ W_buffer_one,
360
+ H_buffer_one,
361
+ priority_one,
362
+ ) = trial_split_sklearn(
363
+ min_priority=min_priority,
364
+ X=X,
365
+ subset=subset,
366
+ W_parent=W[:, 1],
367
+ random_state=self.random_state,
368
+ trial_allowance=self.trial_allowance,
369
+ unbalanced=self.unbalanced,
370
+ dtype=self.dtype,
371
+ tol=self.tol,
372
+ maxiter=self.maxiter,
373
+ init=self.init,
374
+ alpha_W=self.alpha_W,
375
+ alpha_H=self.alpha_H,
376
+ )
377
+ clusters[new_nodes[1]] = subset
378
+ W_buffer[new_nodes[1]] = W_buffer_one
379
+ H_buffer[new_nodes[1]] = H_buffer_one
380
+ priorities[new_nodes[1]] = priority_one
381
+ self.tree_ = tree.T
382
+ self.splits_ = splits
383
+ self.is_leaf_ = is_leaf
384
+ self.clusters_ = self._stack_clusters(clusters)
385
+ self.Ws_ = np.array(Ws)
386
+ self.Hs_ = np.array(Hs)
387
+ self.W_buffer_ = np.array(W_buffer)
388
+ self.H_buffer_ = self._stack_H_buffer(H_buffer)
389
+ self.priorities_ = priorities
390
+ self.n_nodes_ = self.is_leaf_.shape[0]
391
+ self.n_leaves_ = int(np.count_nonzero(self.is_leaf_))
392
+ return self
393
+
394
+ def _stack_clusters(self, clusters: list[npt.NDArray | None]) -> npt.NDArray:
395
+ if not self.n_features_:
396
+ raise ValueError("n_features_ not set before _stack_clusters called")
397
+ result = np.zeros((len(clusters), self.n_features_), dtype=np.int64)
398
+ for i, cluster in enumerate(clusters):
399
+ result[i, cluster] = 1
400
+ return result
401
+
402
+ def _stack_H_buffer(self, buffer: list) -> npt.NDArray:
403
+ if self.n_features_ is None:
404
+ raise ValueError("n_features_ not set before _stack_H_buffer called")
405
+ if self.clusters_ is None:
406
+ raise ValueError("clusters_ not set before _stack_H_buffer called")
407
+
408
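+ # Each buffered H only covers the features in that node's cluster; scatter its two rows back
+ # into a zero-filled array of full width n_features_ so every node's buffer has the same shape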
+ result = np.zeros((len(buffer), 2, self.n_features_), dtype=self.dtype)
409
+ for i, buff in enumerate(buffer):
410
+ cluster_nz_idx = np.argwhere(self.clusters_[i]).flatten()
411
+ result[i, 0, cluster_nz_idx] = buff[0, :]
412
+ result[i, 1, cluster_nz_idx] = buff[1, :]
413
+ return result
414
+
415
+ def top_features_in_node(self, node: int, n: int = 10) -> list[tuple]:
416
+ """
417
+ For a given node, return the top n features and values
418
+ """
419
+
420
+ if self.Hs_ is None:
421
+ raise ValueError("Model not fitted, Hs_ is None")
422
+
423
+ node_i = self.Hs_[node]
424
+ ranks = node_i.argsort()[::-1][:n]
425
+ return [(i, node_i[i]) for i in ranks if node_i[i] > 0]
426
+
427
+ def top_nodes_in_feature(
428
+ self,
429
+ feature_idx: int | str,
430
+ n: int = 10,
431
+ leaves_only: bool = True,
432
+ ) -> list[tuple]:
433
+ """
434
+ Returns the top nodes for a specified feature
435
+ """
436
+ if self.Hs_ is None:
437
+ raise ValueError("Model not fitted, Hs_ is None")
438
+
439
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
440
+ node_weights = self.Hs_.T[feature_idx]
441
+ ranks = node_weights.argsort()[::-1]
442
+ if leaves_only:
443
+ ranks = ranks[np.isin(ranks, node_leaf_idx)]
444
+
445
+ ranks = ranks[:n]
446
+
447
+ return [(i, node_weights[i]) for i in ranks if node_weights[i] > 0]
448
+
449
+ def top_nodes_in_samples(self, n: int = 10, leaves_only: bool = True):
450
+ """
451
+ Returns the top nodes for each sample.
452
+ """
453
+
454
+ if self.Ws_ is None or self.n_nodes_ is None:
455
+ raise ValueError("Model not fitted, Ws_ is None")
456
+
457
+ # Idx of leaves
458
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
459
+ # Keep map of enumerated -> actual cluster
460
+ if leaves_only:
461
+ node_map = dict(enumerate(node_leaf_idx))
462
+ else:
463
+ node_map = dict(enumerate(range(self.n_nodes_)))
464
+
465
+ # A dictionary of {sample : [top_nodes]}
466
+
467
+ output = {}
468
+
469
+ # Ws_ is shape n_nodes, n_samples
470
+ # Transpose weights so it has samples as rows, nodes as columns
471
+
472
+ weights = self.Ws_[node_leaf_idx].T if leaves_only else self.Ws_.T
473
+
474
+ # argsort each row, reverse for descending order, and keep the top n entries per row
475
+ sample_tops = weights.argsort()[:, ::-1][:, :n]
476
+
477
+ # Create an array with samples as rows, top n weights as columns
478
+ sample_top_weights = np.take_along_axis(weights, sample_tops, axis=1)
479
+
480
+ for sample_idx, (node_ids, node_weights) in enumerate(
481
+ zip(sample_tops, sample_top_weights, strict=True)
482
+ ):
483
+ tops = [
484
+ (node_map[node_id], weight)
485
+ for node_id, weight in zip(node_ids, node_weights, strict=True)
486
+ if weight > 0
487
+ ]
488
+ tops.sort(key=itemgetter(1), reverse=True)
489
+ output[sample_idx] = tops
490
+
491
+ return output
492
+
493
+ def top_samples_in_nodes(self, n: int = 10, leaves_only: bool = True):
494
+ """
495
+ Returns the top samples for each node
496
+ """
497
+
498
+ if self.Ws_ is None:
499
+ raise ValueError("Model not fitted, Ws_ is None")
500
+
501
+ # Idx of leaves
502
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
503
+
504
+ # A dictionary of {nodes : [sample]}
505
+
506
+ output = {}
507
+
508
+ # Ws_ is shape n_nodes, n_samples
509
+
510
+ weights = self.Ws_
511
+
512
+ # argsort each row, reverse for descending order, and keep the top n entries per row
513
+ node_tops = weights.argsort()[:, ::-1][:, :n]
514
+
515
+ # Create an array with nodes as rows, top n sample weights as columns
516
+ node_top_weights = np.take_along_axis(weights, node_tops, axis=1)
517
+
518
+ for node_idx, (sample_ids, sample_weights) in enumerate(
519
+ zip(node_tops, node_top_weights, strict=True)
520
+ ):
521
+ if leaves_only and node_idx not in node_leaf_idx:
522
+ continue
523
+ tops = [
524
+ (sample_id, weight)
525
+ for sample_id, weight in zip(sample_ids, sample_weights, strict=True)
526
+ if weight > 0
527
+ ]
528
+ tops.sort(key=itemgetter(1), reverse=True)
529
+ # Decode samples if available
530
+
531
+ output[node_idx] = tops
532
+
533
+ return output
534
+
535
+ def top_discriminative_samples_in_node(
536
+ self,
537
+ node: int,
538
+ n: int = 10,
539
+ sign: Literal["positive", "negative", "abs"] = "abs",
540
+ ) -> "list[DiscriminatedSample]":
541
+ """
542
+ Computes most discriminative samples (node vs rest)
543
+
544
+ Parameters
545
+ ----------
546
+ node
547
+ n
548
+ The number of samples to return
549
+ sign
550
+ One of `['positive', 'negative', 'abs']`.
551
+
552
+ Returns
553
+ --------
554
+ list of ``DiscriminatedSample`` with fields::
555
+
556
+ sample: Any
557
+ node: int
558
+ node_value: float
559
+ others_value: float
560
+
561
+ """
562
+
563
+ if self.Ws_ is None:
564
+ raise ValueError("Model not fitted, Ws_ is None")
565
+ if sign not in ("positive", "negative", "abs"):
566
+ raise ValueError("Sign must be one of 'positive', 'negative' or 'abs'")
567
+
568
+ # Masks
569
+ member_mask = np.array(node, dtype=np.int64)
570
+ non_member_mask = np.array(
571
+ [x for x in np.arange(0, self.n_nodes_) if x != node]
572
+ )
573
+
574
+ member_values = self.Ws_[member_mask].ravel()
575
+ other_means = self.Ws_[non_member_mask].mean(axis=0)
576
+
577
+ diffs = (
578
+ np.abs(member_values - other_means)
579
+ if sign == "positive"
580
+ else member_values - other_means
581
+ if sign == "positive"
582
+ else other_means - member_values
583
+ )
584
+
585
+ diff_tops = diffs.argsort()[::-1][:n]
586
+
587
+ return [
588
+ DiscriminatedSample(
589
+ sample=diff,
590
+ node=node,
591
+ node_value=member_values[diff],
592
+ others_value=other_means[diff],
593
+ )
594
+ for diff in diff_tops
595
+ ]
596
+
597
+ def cluster_features(
598
+ self,
599
+ leaves_only: bool = True,
600
+ include_outliers: bool = True,
601
+ ) -> dict[int, list[int]]:
602
+ """
603
+ Returns the features assigned as a cluster to nodes
604
+
605
+ Parameters
606
+ ----------
607
+ leaves_only
608
+ Whether to return only leaf nodes
609
+ include_outliers
610
+ If True, features without a node assignment are returned under the key -1
611
+
612
+ """
613
+
614
+ if self.clusters_ is None:
615
+ raise ValueError("Model not fitted, clusters_ is None")
616
+
617
+ output = defaultdict(list)
618
+
619
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
620
+
621
+ clusters = self.clusters_[node_leaf_idx] if leaves_only else self.clusters_
622
+
623
+ assignments = np.argwhere(clusters)
624
+
625
+ for cluster_idx, feature_idx in assignments:
626
+ output[cluster_idx].append(feature_idx)
627
+
628
+ if include_outliers:
629
+ outliers = np.where(clusters.sum(axis=0) == 0)[0]
630
+ output[-1] = list(outliers)
631
+
632
+ return dict(output)
633
+
634
+ def cluster_assignments(
635
+ self,
636
+ leaves_only: bool = True,
637
+ include_outliers: bool = True,
638
+ ) -> dict[int, set[int]]:
639
+ """
640
+ Returns a mapping of features and their assigned cluster(s)
641
+
642
+ Parameters
643
+ ----------
644
+ leaves_only
645
+ Whether to return only leaf nodes
646
+ include_outliers
647
+ If True, include feature_idx keys that are not assigned a cluster.
648
+
649
+ """
650
+
651
+ if self.clusters_ is None:
652
+ raise ValueError("Model not fitted, clusters_ is None")
653
+
654
+ node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
655
+
656
+ clusters = self.clusters_
657
+ output = defaultdict(set)
658
+ assignments = np.argwhere(clusters)
659
+ if leaves_only:
660
+ assignments = assignments[
661
+ np.where(np.isin(assignments[:, 0], node_leaf_idx))[0]
662
+ ]
663
+
664
+ for cluster_idx, feature_idx in assignments:
665
+ output[cluster_idx].add(feature_idx)
666
+
667
+ if include_outliers:
668
+ outliers = np.where(clusters.sum(axis=0) == 0)[0]
669
+ for outlier in outliers:
670
+ output[outlier] = set()
671
+
672
+ return dict(output)
hnmf/progress_tree.py ADDED
@@ -0,0 +1,54 @@
1
+
2
+ from rich.live import Live
3
+ from rich.tree import Tree
4
+
5
+
6
+ class ProgressTree:
7
+
8
+ tree: Tree | None
9
+ live: Live | None
10
+ branches: dict[str | int, Tree]
11
+
12
+
13
+ def __init__(self):
14
+ self.live = None
15
+ self.tree = None
16
+ self.branches = {}
17
+
18
+ def __enter__(self):
19
+ self.tree = Tree("", guide_style="bold blue", hide_root=True)
20
+ self._get_or_create_branch("Root", None, None)
21
+ self.live = Live(self.tree)
22
+ self.live.start()
23
+ return self
24
+
25
+ def __exit__(self, exc_type, exc_val, exc_tb):
26
+ if self.live:
27
+ self.live.stop()
28
+
29
+ def _get_or_create_branch(
30
+ self, k: str | int, source: Tree | None, desc: int | None,
31
+ ) -> "Tree":
32
+ branch = self.branches.get(k, None)
33
+ if branch:
34
+ return branch
35
+ display_name = f"[green]{k}" if not desc else f"[green]{k}:({desc})"
36
+ if self.tree is None:
37
+ raise RuntimeError("ProgressTree context not entered.")
38
+ branch = source.add(display_name) if source else self.tree.add(display_name)
39
+ self.branches[k] = branch
40
+ return branch
41
+
42
+ def add_branch(
43
+ self,
44
+ source: str | int,
45
+ target: int | str,
46
+ desc: int | None,
47
+ ):
48
+
49
+ if self.tree is None or self.live is None:
50
+ raise RuntimeError("ProgressTree context not entered.")
51
+
52
+ source_branch = self._get_or_create_branch(source, None, None)
53
+ self._get_or_create_branch(target, source_branch, desc)
54
+ self.live.update(self.tree)
hnmf-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: hNMF
3
+ Version: 0.3.0
4
+ Summary: Hierarchical NMF
5
+ Project-URL: Homepage, https://github.com/estasney/hNMF
6
+ Author-email: Eric Stasney <estasney@users.noreply.github.com>
7
+ License: MIT
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Python: >=3.12
12
+ Requires-Dist: networkx>=2.3
13
+ Requires-Dist: numpy>=1.24.4
14
+ Requires-Dist: rich>=14.2.0
15
+ Requires-Dist: scikit-learn>=1.3.2
16
+ Requires-Dist: scipy>=1.10.1
17
+ Description-Content-Type: text/markdown
18
+
19
+ # hierarchical-nmf-python
20
+ * A fork of https://github.com/rudvlf0413/hierarchical-nmf-python
21
+ * with a familiar scikit-learn interface
22
+
23
+ ## Installation
24
+ ```bash
25
+ pip install hnmf
26
+ ```
27
+
28
+ ## Usage
29
+ ### 20 Newsgroups
30
+
31
+ ```python
32
+ from sklearn.datasets import fetch_20newsgroups
33
+ from sklearn.feature_extraction.text import TfidfVectorizer
34
+ from hnmf import HierarchicalNMF
35
+
36
+ n_features = 1000
37
+ n_leaves = 20
38
+
39
+ data, _ = fetch_20newsgroups(shuffle=True, random_state=1,
40
+ remove=('headers', 'footers', 'quotes'),
41
+ return_X_y=True)
42
+
43
+ # Use tf-idf features for NMF.
44
+ tfidf = TfidfVectorizer(max_df=0.95, min_df=2,
45
+ max_features=n_features,
46
+ stop_words='english')
47
+
48
+ X = tfidf.fit_transform(data)
49
+ id2feature = {i: token for i, token in enumerate(tfidf.get_feature_names_out())}
50
+
51
+ # hNMF
52
+ model = HierarchicalNMF(k=n_leaves)
53
+ model.fit(X)
54
+ model.cluster_features()
55
+ ```
56
+
57
+ ## Documentation
58
+
59
+ To build the documentation:
60
+ ```bash
61
+ mkdocs build
62
+ ```
63
+
64
+ To preview locally:
65
+ ```bash
66
+ mkdocs serve
67
+ ```
68
+
69
+ The documentation will be built to the `docs/` folder for GitHub Pages.
70
+
71
+ ## Reference
72
+ - Paper: [Fast rank-2 nonnegative matrix factorization for hierarchical document clustering](https://smallk.github.io/papers/hierNMF2.pdf)
73
+
74
+ - Originally adapted from MATLAB: https://github.com/dakuang/hiernmf2
hnmf-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
1
+ hnmf/__init__.py,sha256=hAyoIQt-2esjz3EtMVCxcyJmZZSXoaU8_I3Fqu0JrRM,52
2
+ hnmf/helpers.py,sha256=cZr69WdFgAbpmb8XjuuuUSC5maoYMjsEi02OEm5l5cU,17896
3
+ hnmf/model.py,sha256=bKwE-alQ8kx2Xf6NBhe7ZKZI_-ebamjNJM4erj-j7RA,22960
4
+ hnmf/progress_tree.py,sha256=NrkMPOgp2HfrhiooWjD-4t3zCN1tXtg3xcrQ3lMNW-w,1562
5
+ hnmf-0.3.0.dist-info/METADATA,sha256=MnN-F7VxfPsL6uzAzIAgaaE_ceFccFf61lRqxnIVKds,1913
6
+ hnmf-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
7
+ hnmf-0.3.0.dist-info/RECORD,,
hnmf-0.3.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any