datatypical 0.7.0__py3-none-any.whl

datatypical.py ADDED
@@ -0,0 +1,3417 @@
1
+ """
2
+ DataTypical v0.7 --- Dual-Perspective Significance with Shapley Explanations
3
+ ===========================================================================
4
+
5
+ Framework combining geometric and influence-based measures of sample significance.
6
+
7
+ Key Innovation:
8
+ - Actual significance: Samples that ARE archetypal/prototypical/stereotypical (geometric)
9
+ - Formative instances: Samples that MAKE the dataset archetypal/prototypical/stereotypical (Shapley)
10
+ - Local explanations: WHY each sample is significant (feature attributions)
11
+
12
+ Two complementary perspectives:
13
+ 1. LOCAL: "This sample IS significant because features X, Y contribute most"
14
+ 2. GLOBAL: "This sample CREATES significance by defining the distribution and boundary"
15
+
16
+ What's new in v0.7:
17
+ - shapley_mode parameter (True/False)
18
+ - When True: computes explanations + formative instances
19
+ - Dual rankings: *_rank (actual) + *_shapley_rank (formative)
20
+ - Novel value functions: convex hull, coverage, extremeness
21
+ - Parallel Shapley computation with Option A (accurate v0.4 explanations)
22
+
23
+ All v0.6 features retained:
24
+ - Local explanations via get_shapley_explanations()
25
+ - Global explanations to identify formative instances
26
+
27
+ All v0.5 features retained:
28
+ - Tabular/Text/Graph support
29
+ - Label column preservation
30
+ - Graph topology features
31
+
32
+ All v0.4 features retained:
33
+ - User-configurable stereotypes
34
+
35
+ Sections:
36
+ [A] Exceptions & Globals
37
+ [B] Thread Control
38
+ [C] Helpers (sparse/dense math)
39
+ [D] Facility-Location (CELF, deterministic)
40
+ [E] Shapley Significance Engine (NEW in v0.6)
41
+ [F] DataTypical API
42
+ [G] Graph Topology Features
43
+ [H] Stereotype Computation
44
+ """
45
+
46
+ from __future__ import annotations
47
+
48
+ from dataclasses import dataclass, field, fields as dc_fields
49
+ from typing import Iterable, List, Optional, Dict, Tuple, Union, Callable
50
+
51
+ import heapq
52
+ import math
53
+ import gc
54
+ import warnings
55
+ import hashlib
56
+ import numpy as np
57
+ import pandas as pd
58
+ from sklearn.decomposition import NMF
59
+ from sklearn.feature_extraction.text import TfidfVectorizer
60
+ from sklearn.preprocessing import MinMaxScaler
61
+ from threadpoolctl import threadpool_limits
62
+ from joblib import Parallel, delayed
63
+ from sklearn.exceptions import ConvergenceWarning
64
+
65
+ try:
66
+ from numba import jit, prange
67
+ NUMBA_AVAILABLE = True
68
+ except ImportError:
69
+ NUMBA_AVAILABLE = False
70
+ # Dummy jit decorator and prange fallback if numba is not available
71
+ def jit(*args, **kwargs):
72
+ def decorator(func):
73
+ return func
74
+ return decorator
+ prange = range  # fallback so functions written with prange still run without numba
75
+
76
+ try:
77
+ import scipy.sparse as sp
78
+ except Exception:
79
+ sp = None
80
+ ArrayLike = Union[np.ndarray, "sp.spmatrix"]
81
+
82
+ try:
83
+ from scipy.spatial import ConvexHull
84
+ from scipy.spatial.distance import cdist
85
+ except Exception:
86
+ ConvexHull = None
87
+ cdist = None
88
+
89
+ try:
90
+ from py_pcha import PCHA
91
+ except ImportError:
92
+ PCHA = None
93
+
94
+ try:
95
+ import faiss
96
+ FAISS_AVAILABLE = True
97
+ except ImportError:
98
+ FAISS_AVAILABLE = False
99
+
100
+ # ============================================================
101
+ # [A] Exceptions & Globals
102
+ # ============================================================
103
+ class DataTypicalError(Exception):
104
+ pass
105
+
106
+ class MemoryBudgetError(DataTypicalError):
107
+ pass
108
+
109
+ class ConfigError(DataTypicalError):
110
+ pass
111
+
112
+ def _seed_everything(seed: int) -> None:
113
+ np.random.seed(seed)
114
+
115
+
116
+ # ============================================================
117
+ # [B] Thread Control
118
+ # ============================================================
119
+ class _ThreadControl:
120
+ def __init__(self, deterministic: bool = True):
121
+ self.deterministic = deterministic
122
+ self._ctx = None
123
+ self.effective_limit = None
124
+
125
+ def __enter__(self):
126
+ if self.deterministic:
127
+ self._ctx = threadpool_limits(limits=1)
128
+ self.effective_limit = 1
129
+ else:
130
+ self._ctx = threadpool_limits(limits=None)
131
+ self.effective_limit = None
132
+ self._ctx.__enter__()
133
+ return self
134
+
135
+ def __exit__(self, exc_type, exc, tb):
136
+ if self._ctx is not None:
137
+ self._ctx.__exit__(exc_type, exc, tb)
138
+
139
+
140
+ # ============================================================
141
+ # [C] Helpers (sparse/dense math)
142
+ # ============================================================
143
+ def _cleanup_memory(*arrays, force_gc: bool = False) -> None:
144
+ """
145
+ Explicitly delete arrays and optionally force garbage collection.
146
+
147
+ MEMORY NOTE: deleting the local references here does not by itself free the
+ caller's arrays; callers must also drop their own references. The optional
+ gc.collect() call is what reclaims large temporaries promptly.
149
+ """
150
+ for arr in arrays:
151
+ if arr is not None:
152
+ del arr
153
+
154
+ if force_gc:
155
+ gc.collect()
156
+
157
+
158
+ def _l2_normalize_rows_dense(X: np.ndarray) -> np.ndarray:
159
+ norms = np.linalg.norm(X, axis=1, keepdims=True)
160
+ norms[norms == 0.0] = 1.0
161
+ return X / norms
162
+
163
+
164
+
165
+ def _sparse_l2_normalize_rows(X: "sp.spmatrix") -> "sp.spmatrix":
166
+ if sp is None:
167
+ raise ImportError("scipy is required for sparse operations.")
168
+ if not sp.isspmatrix_csr(X):
169
+ X = X.tocsr(copy=False)
170
+ sq = X.multiply(X).sum(axis=1)
171
+ norms = np.sqrt(np.maximum(np.asarray(sq).ravel(), 0.0))
172
+ norms[norms == 0.0] = 1.0
173
+ D = sp.diags(1.0 / norms)
174
+ return D @ X
175
+
176
+
177
+ def _sparse_minmax_0_1_nonneg(M: "sp.spmatrix") -> "sp.spmatrix":
178
+ if sp is None:
179
+ raise ImportError("scipy is required for sparse operations.")
180
+ if not sp.isspmatrix(M):
181
+ raise TypeError("Expected a scipy.sparse matrix.")
182
+ A = M.tocsc(copy=False)
183
+ # CRITICAL: Must use .toarray() to convert sparse matrix to dense
184
+ col_max = A.max(axis=0).toarray().ravel()
185
+ col_max[col_max == 0.0] = 1.0
186
+ return (A @ sp.diags(1.0 / col_max)).tocsr()
187
+
188
+
189
+ def _chunk_len(n_left: int, n_right: int, bytes_per: int, max_memory_mb: int) -> int:
190
+ if max_memory_mb <= 0:
191
+ raise MemoryBudgetError("max_memory_mb must be positive")
192
+ max_bytes = max_memory_mb * 1024 * 1024
193
+ return max(1, min(n_right, int(max_bytes // max(8, n_left * bytes_per))))
194
+
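# [Editor's note] Worked example of the chunking arithmetic above (not in the original
# source): with max_memory_mb=2048, n_left=10_000 and bytes_per=8,
# max_bytes = 2048 * 1024 * 1024 = 2_147_483_648, n_left * bytes_per = 80_000, and the
# chunk length is min(n_right, 2_147_483_648 // 80_000) = min(n_right, 26_843) columns,
# keeping each (n_left x chunk) scratch block within roughly the 2 GB budget.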
195
+
196
+ def _ensure_dtype(X: np.ndarray, dtype: str = 'float32') -> np.ndarray:
197
+ """
198
+ Ensure array has specified dtype, converting if necessary.
199
+
200
+ MEMORY OPTIMIZED: Default to float32 (4 bytes) instead of float64 (8 bytes).
201
+ """
202
+ target_dtype = np.float32 if dtype == 'float32' else np.float64
203
+
204
+ if X.dtype != target_dtype:
205
+ return X.astype(target_dtype, copy=False)
206
+ return X
207
+
208
+
209
+ def _euclidean_min_to_set_dense(
210
+ X: np.ndarray, Y: np.ndarray, max_memory_mb: int = 2048
211
+ ) -> np.ndarray:
212
+ """
213
+ Compute minimum Euclidean distance from each row of X to any row in Y.
214
+
215
+ OPTIMIZED: Uses Numba JIT for 2-3× speedup and better memory efficiency.
216
+ """
217
+ n, d = X.shape
218
+ m = Y.shape[0]
219
+
220
+ # For small problems, use JIT-compiled direct computation
221
+ if n * m < 100000:
222
+ return _euclidean_min_jit(X, Y)
223
+
224
+ # For large problems, use chunked computation with JIT
225
+ best = np.full(n, np.inf, dtype=np.float64)
226
+ block = _chunk_len(n, m, bytes_per=8, max_memory_mb=max_memory_mb)
227
+
228
+ # Pre-compute X squared norms once
229
+ x2 = np.sum(X * X, axis=1)
230
+
231
+ for s in range(0, m, block):
232
+ e = min(m, s + block)
233
+ YY = Y[s:e]
234
+
235
+ # Use JIT-compiled function for this chunk
236
+ chunk_dists = _euclidean_chunk_jit(X, YY, x2)
237
+ best = np.minimum(best, chunk_dists)
238
+
239
+ return best
240
+
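# [Editor's note] A small usage sketch for the helper above (not in the original source):
# minimum distance from every sample to an already-selected prototype set.
#
#     import numpy as np
#     X = np.random.default_rng(0).normal(size=(1000, 16))
#     Y = X[:20]                                   # e.g. rows already chosen as prototypes
#     d_min = _euclidean_min_to_set_dense(X, Y)    # shape (1000,): d_min[i] = min_j ||X[i] - Y[j]||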
241
+
242
+ @jit(nopython=True, parallel=True, cache=True, fastmath=True)
243
+ def _euclidean_min_jit(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
244
+ """
245
+ JIT-compiled minimum Euclidean distance computation.
246
+
247
+ Uses parallel loops for multi-core acceleration.
248
+ """
249
+ n = X.shape[0]
250
+ m = Y.shape[0]
251
+ d = X.shape[1]
252
+
253
+ min_dists = np.empty(n, dtype=np.float64)
254
+
255
+ # Parallel loop over X samples (explicit with prange)
256
+ for i in prange(n): # Changed from range to prange
257
+ min_dist = np.inf
258
+
259
+ for j in range(m):
260
+ dist_sq = 0.0
261
+ for k in range(d):
262
+ diff = X[i, k] - Y[j, k]
263
+ dist_sq += diff * diff
264
+
265
+ if dist_sq < min_dist:
266
+ min_dist = dist_sq
267
+
268
+ min_dists[i] = np.sqrt(max(min_dist, 0.0))
269
+
270
+ return min_dists
271
+
272
+
273
+ @jit(nopython=True, cache=True, fastmath=True)
274
+ def _euclidean_chunk_jit(
275
+ X: np.ndarray,
276
+ Y_chunk: np.ndarray,
277
+ x2: np.ndarray
278
+ ) -> np.ndarray:
279
+ """
280
+ JIT-compiled chunked distance computation using pre-computed norms.
281
+
282
+ Computes: sqrt(||x||² + ||y||² - 2⟨x,y⟩) efficiently.
283
+ """
284
+ n = X.shape[0]
285
+ m = Y_chunk.shape[0]
286
+ d = X.shape[1]
287
+
288
+ min_dists = np.empty(n, dtype=np.float64)
289
+
290
+ # Pre-compute Y chunk squared norms
291
+ y2 = np.empty(m, dtype=np.float64)
292
+ for j in range(m):
293
+ y2_val = 0.0
294
+ for k in range(d):
295
+ y2_val += Y_chunk[j, k] * Y_chunk[j, k]
296
+ y2[j] = y2_val
297
+
298
+ # Loop over X samples (serial; this kernel is not compiled with parallel=True)
299
+ for i in range(n):
300
+ min_dist_sq = np.inf
301
+
302
+ for j in range(m):
303
+ # Compute dot product
304
+ dot = 0.0
305
+ for k in range(d):
306
+ dot += X[i, k] * Y_chunk[j, k]
307
+
308
+ # Distance squared using pre-computed norms
309
+ dist_sq = x2[i] + y2[j] - 2.0 * dot
310
+
311
+ if dist_sq < min_dist_sq:
312
+ min_dist_sq = dist_sq
313
+
314
+ min_dists[i] = np.sqrt(max(min_dist_sq, 0.0))
315
+
316
+ return min_dists
317
+
318
+ @jit(nopython=True, cache=True, fastmath=True)
319
+ def _pairwise_euclidean_jit(X: np.ndarray) -> np.ndarray:
320
+ """
321
+ JIT-compiled pairwise Euclidean distance matrix.
322
+
323
+ Returns the full symmetric distance matrix.
324
+ """
325
+ n = X.shape[0]
326
+ d = X.shape[1]
327
+
328
+ # Compute full distance matrix (symmetric)
329
+ dists = np.zeros((n, n), dtype=np.float64)
330
+
331
+ # Outer loop over unique pairs (serial); the symmetric matrix is filled both ways
332
+ for i in range(n):
333
+ for j in range(i + 1, n):
334
+ dist_sq = 0.0
335
+ for k in range(d):
336
+ diff = X[i, k] - X[j, k]
337
+ dist_sq += diff * diff
338
+
339
+ dist = np.sqrt(max(dist_sq, 0.0))
340
+ dists[i, j] = dist
341
+ dists[j, i] = dist # Symmetric
342
+
343
+ return dists
344
+
345
+
346
+ @jit(nopython=True, cache=True, fastmath=True)
347
+ def _cosine_similarity_jit(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
348
+ """
349
+ JIT-compiled cosine similarity between L2-normalized vectors.
350
+
351
+ For L2-normalized data, this is just the dot product.
352
+ """
353
+ n = X.shape[0]
354
+ m = Y.shape[0]
355
+ d = X.shape[1]
356
+
357
+ sims = np.empty((n, m), dtype=np.float64)
358
+
359
+ # Loop over X samples (serial; kernel not compiled with parallel=True)
360
+ for i in range(n):
361
+ for j in range(m):
362
+ dot = 0.0
363
+ for k in range(d):
364
+ dot += X[i, k] * Y[j, k]
365
+ sims[i, j] = max(dot, 0.0) # Clip negative similarities
366
+
367
+ return sims
368
+
369
+
370
+ # ============================================================
371
+ # [D] Facility-Location (CELF, deterministic)
372
+ # ============================================================
373
+ @dataclass
374
+ class FacilityLocationSelector:
375
+ def __init__(self, n_prototypes=10, deterministic=True, speed_mode=False, verbose=False):
376
+ self.n_prototypes = int(n_prototypes)
377
+ self.deterministic = bool(deterministic)
378
+ self.speed_mode = bool(speed_mode)
379
+ self.verbose = bool(verbose)
380
+
381
+ def select(self, X_l2, weights=None, forbidden=None):
382
+ """
383
+ Deterministic CELF for facility-location with:
384
+ • content-based tie-breaking (perm-invariant),
385
+ • optional client weights (e.g., density),
386
+ • optional forbidden candidate set (still count as clients).
387
+ Expects rows to be L2-normalized. Works with dense or sparse input.
388
+ Returns: (selected_indices, marginal_gains)
389
+ """
390
+ import numpy as np, heapq, hashlib
391
+
392
+ # --- dense float64 view
393
+ if sp is not None and sp.isspmatrix(X_l2):
394
+ X = X_l2.toarray().astype(np.float64, copy=False)
395
+ else:
396
+ X = np.asarray(X_l2, dtype=np.float64)
397
+ n = X.shape[0]
398
+ if n == 0:
399
+ return np.array([], dtype=int), np.array([], dtype=float)
400
+
401
+ # --- client weights (normalize to mean 1 for scale stability)
402
+ if weights is None:
403
+ w = np.ones(n, dtype=np.float64)
404
+ else:
405
+ w = np.asarray(weights, dtype=np.float64).ravel()
406
+ m = float(w.mean())
407
+ w = w / m if m > 0 else np.ones_like(w)
408
+
409
+ # --- forbidden candidates (excluded from selection, included as clients)
410
+ forb = np.zeros(n, dtype=bool)
411
+ if forbidden is not None:
412
+ forb_idx = np.asarray(list(forbidden), dtype=int)
413
+ forb_idx = forb_idx[(forb_idx >= 0) & (forb_idx < n)]
414
+ forb[forb_idx] = True
415
+
416
+ # --- target number of prototypes (cap to available candidates)
417
+ k_req = int(getattr(self, "n_prototypes", min(10, n)))
418
+ available = n - int(forb.sum())
419
+ k = max(0, min(k_req, available))
420
+ if k == 0:
421
+ return np.array([], dtype=int), np.array([], dtype=float)
422
+
423
+ # --- permutation-invariant tie-breaker: hash of row content
424
+ def row_key(i: int) -> int:
425
+ h = hashlib.blake2b(X[i].tobytes(), digest_size=8)
426
+ return int.from_bytes(h.digest(), "big", signed=False)
427
+ keys = np.fromiter((row_key(i) for i in range(n)), dtype=np.uint64, count=n)
428
+
429
+ # --- CELF init
430
+ best = np.zeros(n, dtype=np.float64) # current best similarity per client
431
+ last_eval = np.full(n, -1, dtype=np.int64) # last #selected when gain was computed
432
+ last_gain = np.zeros(n, dtype=np.float64)
433
+
434
+ # Initial exact gains: g0[c] = sum_i w_i * max(0, <x_i, x_c>)
435
+ g0 = np.zeros(n, dtype=np.float64)
436
+ # block multiply to limit memory
437
+ target_bytes = 256 * 1024 * 1024 # 256MB scratch
438
+ item = np.dtype(np.float64).itemsize
439
+ max_b = max(1, int(target_bytes // max(1, n * item)))
440
+ bsz = max(1, min(n, max_b))
441
+ XT = X.T
442
+ for s in range(0, n, bsz):
443
+ e = min(n, s + bsz)
444
+ S = X[s:e] @ XT # (e-s, n)
445
+ np.maximum(S, 0.0, out=S)
446
+ g0 += (w[s:e, None] * S).sum(axis=0, dtype=np.float64)
447
+
448
+ last_gain[:] = g0
449
+ last_eval[:] = 0
450
+
451
+ # heap items: (-gain_estimate, key, idx) – ties broken by content key
452
+ heap = [(-float(g0[c]), int(keys[c]), int(c)) for c in range(n) if not forb[c]]
453
+ heapq.heapify(heap)
454
+
455
+ selected: list[int] = []
456
+ gains: list[float] = []
457
+ it = 0
458
+ while len(selected) < k and heap:
459
+ neg_g_est, _, c = heapq.heappop(heap)
460
+ if last_eval[c] == it:
461
+ # accept candidate
462
+ selected.append(c)
463
+ gains.append(float(last_gain[c]))
464
+ s = X @ X[c]
465
+ np.maximum(s, 0.0, out=s)
466
+ np.maximum(best, s, out=best)
467
+ it += 1
468
+ continue
469
+ # refresh exact marginal gain vs current 'best'
470
+ s = X @ X[c]
471
+ improv = s - best
472
+ improv[improv < 0.0] = 0.0
473
+ g_exact = float((w * improv).sum(dtype=np.float64))
474
+ last_gain[c] = g_exact
475
+ last_eval[c] = it
476
+ heapq.heappush(heap, (-g_exact, int(keys[c]), int(c)))
477
+
478
+ return np.asarray(selected, dtype=int), np.asarray(gains, dtype=float)
479
+
480
+
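# [Editor's note] The select() defined below redefines the method above; Python keeps only the
# last definition of a name in a class body, so only this second implementation is used at
# runtime and the earlier one is effectively unused reference code.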
481
+ def select(self, X_l2, weights=None, forbidden=None):
482
+ """
483
+ Select prototypes using lazy CELF with optional FAISS acceleration.
484
+
485
+ OPTIMIZED: Uses FAISS for datasets with n > 1,000 samples for massive speedup.
486
+ MEMORY OPTIMIZED: Explicit cleanup of similarity matrix after use.
487
+ """
488
+ import numpy as np
489
+
490
+ if sp is not None and sp.isspmatrix(X_l2):
491
+ X = X_l2.toarray().astype(np.float64, copy=False)
492
+ else:
493
+ X = np.asarray(X_l2, dtype=np.float64)
494
+ n = X.shape[0]
495
+ if n == 0:
496
+ return np.array([], dtype=int), np.array([], dtype=float)
497
+
498
+ # Normalize weights
499
+ if weights is None:
500
+ w = np.ones(n, dtype=np.float64)
501
+ else:
502
+ w = np.asarray(weights, dtype=np.float64).ravel()
503
+ m = float(w.mean())
504
+ w = w / m if m > 0 else np.ones_like(w)
505
+
506
+ # OPTIMIZED: Use FAISS for large datasets if available
507
+ use_faiss = FAISS_AVAILABLE and n > 1000 and not self.speed_mode
508
+
509
+ if use_faiss:
510
+ if self.verbose:
511
+ print(f" Using FAISS acceleration for n={n}")
512
+ result = self._select_with_faiss(X, w, forbidden)
513
+ # MEMORY CLEANUP: Free X copy before returning
514
+ _cleanup_memory(X, force_gc=True)
515
+ return result
516
+
517
+ # Otherwise use the cached similarity matrix approach
518
+ import heapq, hashlib
519
+
520
+ # Handle forbidden indices
521
+ forb = np.zeros(n, dtype=bool)
522
+ if forbidden is not None:
523
+ forb_idx = np.asarray(list(forbidden), dtype=int)
524
+ forb_idx = forb_idx[(forb_idx >= 0) & (forb_idx < n)]
525
+ forb[forb_idx] = True
526
+
527
+ k_req = int(getattr(self, "n_prototypes", min(10, n)))
528
+ available = n - int(forb.sum())
529
+ k = max(0, min(k_req, available))
530
+ if k == 0:
531
+ return np.array([], dtype=int), np.array([], dtype=float)
532
+
533
+ # Pre-compute similarity matrix
534
+ XT = X.T
535
+ S = X @ XT
536
+ np.maximum(S, 0.0, out=S)
537
+
538
+ # MEMORY CLEANUP: Free XT after similarity computation
539
+ _cleanup_memory(XT)
540
+
541
+ # Pre-compute weighted candidate similarities
542
+ S_weighted = w[None, :] * S
543
+ candidate_sims = S_weighted.sum(axis=1)
544
+
545
+ # MEMORY CLEANUP: Free S_weighted after computing candidate_sims
546
+ _cleanup_memory(S_weighted)
547
+
548
+ # Generate deterministic keys
549
+ def row_key(i: int) -> int:
550
+ h = hashlib.blake2b(X[i].tobytes(), digest_size=8)
551
+ return int.from_bytes(h.digest(), "big", signed=False)
552
+ keys = np.fromiter((row_key(i) for i in range(n)), dtype=np.uint64, count=n)
553
+
554
+ # CELF state tracking
555
+ best = np.zeros(n, dtype=np.float64)
556
+ last_eval = np.full(n, -1, dtype=np.int64)
557
+ last_gain = candidate_sims.copy()
558
+ last_eval[:] = 0
559
+
560
+ # Initialize heap
561
+ heap = [(-float(candidate_sims[c]), int(keys[c]), int(c))
562
+ for c in range(n) if not forb[c]]
563
+ heapq.heapify(heap)
564
+
565
+ selected = []
566
+ gains = []
567
+ it = 0
568
+
569
+ while len(selected) < k and heap:
570
+ neg_g_est, _, c = heapq.heappop(heap)
571
+
572
+ if last_eval[c] == it:
573
+ selected.append(c)
574
+ gains.append(float(last_gain[c]))
575
+ s_c = S[c, :]
576
+ np.maximum(best, s_c, out=best)
577
+ it += 1
578
+ continue
579
+
580
+ # Lazy evaluation
581
+ s_c = S[c, :]
582
+ improv = s_c - best
583
+ improv[improv < 0.0] = 0.0
584
+ g_exact = float((w * improv).sum(dtype=np.float64))
585
+
586
+ last_gain[c] = g_exact
587
+ last_eval[c] = it
588
+ heapq.heappush(heap, (-g_exact, int(keys[c]), int(c)))
589
+
590
+ # MEMORY CLEANUP: Free large arrays before returning
591
+ _cleanup_memory(S, X, best, last_gain, candidate_sims, force_gc=True)
592
+
593
+ return np.asarray(selected, dtype=int), np.asarray(gains, dtype=float)
594
+
595
+
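# [Editor's note] A minimal usage sketch for the selector above (not part of the source):
# rows must be L2-normalized before calling select(); the returned indices are prototypes.
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=(500, 32))
#     X_l2 = _l2_normalize_rows_dense(X)
#     selector = FacilityLocationSelector(n_prototypes=10, deterministic=True, verbose=False)
#     proto_idx, gains = selector.select(X_l2)      # optional: weights=density, forbidden={...}
#     prototypes = X[proto_idx]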
596
+ # ============================================================
597
+ # [E] Shapley Significance Engine (NEW in v0.6)
598
+ # ============================================================
599
+
600
+ class ShapleyEarlyStopping:
601
+ """Early stopping for Shapley convergence using relative change."""
602
+
603
+ def __init__(self, patience: int = 10, tolerance: float = 0.01):
604
+ self.patience = patience
605
+ self.tolerance = tolerance
606
+ self.history = []
607
+ self.stable_count = 0
608
+
609
+ def update(self, shapley_estimates: np.ndarray, n_perms: int) -> Tuple[bool, Dict]:
610
+ if n_perms < 20:
611
+ return False, {'converged': False, 'n_permutations': n_perms}
612
+
613
+ self.history.append(shapley_estimates.copy())
614
+
615
+ if len(self.history) < 2:
616
+ return False, {'converged': False, 'n_permutations': n_perms}
617
+
618
+ old = self.history[-2]
619
+ new = self.history[-1]
620
+
621
+ denom = np.abs(old) + 1e-12
622
+ rel_change = np.abs(new - old) / denom
623
+ max_rel_change = np.max(rel_change)
624
+ mean_rel_change = np.mean(rel_change)
625
+
626
+ if mean_rel_change < self.tolerance:
627
+ self.stable_count += 1
628
+ else:
629
+ self.stable_count = 0
630
+
631
+ should_stop = self.stable_count >= self.patience
632
+
633
+ info = {
634
+ 'converged': should_stop,
635
+ 'n_permutations': n_perms,
636
+ 'mean_rel_change': float(mean_rel_change),
637
+ 'max_rel_change': float(max_rel_change),
638
+ 'stable_iterations': self.stable_count
639
+ }
640
+
641
+ return should_stop, info
642
+
643
+
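# [Editor's note] Summary of the stopping rule above (a restatement, not new behaviour):
#   rel_change = |phi_new - phi_old| / (|phi_old| + 1e-12)        (elementwise)
# and sampling stops once mean(rel_change) < tolerance for `patience` consecutive batches,
# with convergence never declared before 20 permutations have been drawn.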
644
+ @jit(nopython=True, cache=True, fastmath=True)
645
+ def _compute_marginals_jit(
646
+ perm: np.ndarray,
647
+ values: np.ndarray,
648
+ n_samples: int,
649
+ n_features: int
650
+ ) -> np.ndarray:
651
+ """
652
+ JIT-compiled function to compute Shapley marginal contributions.
653
+
654
+ This is the performance-critical inner loop - compiled to machine code by Numba.
655
+
656
+ Parameters
657
+ ----------
658
+ perm : array of sample indices in permutation order
659
+ values : array of value function results for each coalition size
660
+ n_samples : number of samples
661
+ n_features : number of features
662
+
663
+ Returns
664
+ -------
665
+ shapley_contrib : (n_samples, n_features) array of marginal contributions
666
+ """
667
+ shapley_contrib = np.zeros((n_samples, n_features), dtype=np.float64)
668
+
669
+ for j in range(n_samples):
670
+ sample_idx = perm[j]
671
+ marginal = values[j+1] - values[j]
672
+
673
+ # Broadcast marginal across all features
674
+ for f in range(n_features):
675
+ shapley_contrib[sample_idx, f] = marginal / n_features
676
+
677
+ return shapley_contrib
678
+
679
+
680
+ @jit(nopython=True, cache=True, fastmath=True)
681
+ def _compute_feature_marginals_jit(
682
+ perm: np.ndarray,
683
+ values: np.ndarray,
684
+ n_features: int
685
+ ) -> np.ndarray:
686
+ """
687
+ JIT-compiled function to compute feature-level Shapley marginal contributions.
688
+
689
+ Parameters
690
+ ----------
691
+ perm : array of feature indices in permutation order
692
+ values : array of value function results for each feature coalition size
693
+ n_features : number of features
694
+
695
+ Returns
696
+ -------
697
+ shapley_contrib : (n_features,) array of marginal contributions
698
+ """
699
+ shapley_contrib = np.zeros(n_features, dtype=np.float64)
700
+
701
+ for j in range(n_features):
702
+ feat_idx = perm[j]
703
+ marginal = values[j+1] - values[j]
704
+ shapley_contrib[feat_idx] = marginal
705
+
706
+ return shapley_contrib
707
+
708
+
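# [Editor's note] The class below implements the standard Monte-Carlo permutation estimator
# of Shapley values (a summary, not part of the original source). For a value function v and
# sampled permutations pi_1..pi_M,
#   phi_i  ~=  (1/M) * sum_m [ v(S_m(i) U {i}) - v(S_m(i)) ]
# where S_m(i) is the set of samples preceding i in permutation pi_m. Each permutation is
# handled by _process_single_permutation(), and batches accumulate until the early-stopping
# rule above reports convergence.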
709
+ class ShapleySignificanceEngine:
710
+ """
711
+ Compute Shapley values for dual-perspective significance analysis.
712
+
713
+ Supports two modes:
714
+ 1. Explanations: Why is this sample archetypal/prototypical/stereotypical?
715
+ 2. Formative: Which samples create the archetypal/prototypical/stereotypical structure?
716
+ """
717
+
718
+ def __init__(
719
+ self,
720
+ n_permutations: int = 100,
721
+ random_state: int = 42,
722
+ n_jobs: int = -1,
723
+ early_stopping_patience: int = 10,
724
+ early_stopping_tolerance: float = 0.01,
725
+ verbose: bool = False
726
+ ):
727
+ self.n_permutations = n_permutations
728
+ self.random_state = random_state
729
+ self.n_jobs = n_jobs
730
+ self.early_stopping_patience = early_stopping_patience
731
+ self.early_stopping_tolerance = early_stopping_tolerance
732
+ self.verbose = verbose
733
+ self.rng = np.random.RandomState(random_state)
734
+
735
+ def compute_shapley_values(
736
+ self,
737
+ X: np.ndarray,
738
+ value_function: Callable,
739
+ value_function_name: str = "unknown",
740
+ context: Optional[Dict] = None
741
+ ) -> Tuple[np.ndarray, Dict]:
742
+ """
743
+ Compute Shapley values using specified value function.
744
+
745
+ OPTIMIZED: Uses shared memory for parallel processing to avoid data copying.
746
+ MEMORY OPTIMIZED: Cleanup batch results immediately after accumulation.
747
+ """
748
+ n_samples, n_features = X.shape
749
+
750
+ if self.verbose:
751
+ print(f"\n Computing {value_function_name}...")
752
+ print(f" Samples: {n_samples}, Features: {n_features}")
753
+ print(f" Max permutations: {self.n_permutations}")
754
+
755
+ early_stop = ShapleyEarlyStopping(
756
+ patience=self.early_stopping_patience,
757
+ tolerance=self.early_stopping_tolerance
758
+ )
759
+
760
+ shapley_sum = np.zeros((n_samples, n_features), dtype=np.float64)
761
+ n_perms_used = 0
762
+
763
+ batch_size = max(1, self.n_permutations // 10)
764
+ info = {'converged': False, 'mean_rel_change': 0.0}
765
+
766
+ # OPTIMIZED: Decide parallelization strategy based on data size
767
+ use_parallel = self.n_jobs != 1 and n_samples >= 20
768
+
769
+ # OPTIMIZED: For small datasets or single-threaded, use direct computation
770
+ if not use_parallel:
771
+ for batch_start in range(0, self.n_permutations, batch_size):
772
+ batch_end = min(batch_start + batch_size, self.n_permutations)
773
+ batch_perms = [self.rng.permutation(n_samples) for _ in range(batch_end - batch_start)]
774
+
775
+ for perm in batch_perms:
776
+ shapley_contrib = self._process_single_permutation(perm, X, value_function, context)
777
+ shapley_sum += shapley_contrib
778
+ n_perms_used += 1
779
+
780
+ # MEMORY CLEANUP: Free batch permutations immediately
781
+ _cleanup_memory(batch_perms)
782
+
783
+ current_estimate = shapley_sum / n_perms_used
784
+ should_stop, info = early_stop.update(current_estimate, n_perms_used)
785
+
786
+ if should_stop and n_perms_used >= 50:
787
+ if self.verbose:
788
+ print(f" Early stop at {n_perms_used} perms (change: {info['mean_rel_change']:.6f})")
789
+ break
790
+ else:
791
+ # OPTIMIZED: Use threading backend for better memory sharing
792
+ for batch_start in range(0, self.n_permutations, batch_size):
793
+ batch_end = min(batch_start + batch_size, self.n_permutations)
794
+ batch_perms = [self.rng.permutation(n_samples) for _ in range(batch_end - batch_start)]
795
+
796
+ # Use threading backend for shared memory access
797
+ batch_results = Parallel(
798
+ n_jobs=self.n_jobs,
799
+ backend='threading',
800
+ verbose=0
801
+ )(
802
+ delayed(self._process_single_permutation)(perm, X, value_function, context)
803
+ for perm in batch_perms
804
+ )
805
+
806
+ # Accumulate results efficiently
807
+ for shapley_contrib in batch_results:
808
+ shapley_sum += shapley_contrib
809
+ n_perms_used += 1
810
+
811
+ # MEMORY CLEANUP: Free batch results and permutations immediately
812
+ _cleanup_memory(batch_results, batch_perms)
813
+
814
+ current_estimate = shapley_sum / n_perms_used
815
+ should_stop, info = early_stop.update(current_estimate, n_perms_used)
816
+
817
+ if should_stop and n_perms_used >= 50:
818
+ if self.verbose:
819
+ print(f" Early stop at {n_perms_used} perms (change: {info['mean_rel_change']:.6f})")
820
+ break
821
+
822
+ Phi = shapley_sum / n_perms_used
823
+
824
+ # Verify additivity
825
+ all_indices = np.arange(n_samples)
826
+ if context is not None:
827
+ total_actual = value_function(X, all_indices, context)
828
+ else:
829
+ total_actual = value_function(X, all_indices)
830
+
831
+ total_from_shapley = np.sum(Phi)
832
+ additivity_error = abs(total_from_shapley - total_actual) / (abs(total_actual) + 1e-12)
833
+
834
+ info = {
835
+ 'n_permutations_used': n_perms_used,
836
+ 'converged': info.get('converged', False) if n_perms_used < self.n_permutations else True,
837
+ 'mean_rel_change': info.get('mean_rel_change', 0.0),
838
+ 'additivity_error': float(additivity_error),
839
+ 'total_shapley': float(total_from_shapley),
840
+ 'total_actual': float(total_actual)
841
+ }
842
+
843
+ if self.verbose:
844
+ print(f" ✓ {n_perms_used} perms, additivity error: {additivity_error:.6f}")
845
+
846
+ # MEMORY CLEANUP: Free shapley_sum before returning Phi (they're different objects)
847
+ _cleanup_memory(shapley_sum)
848
+
849
+ return Phi, info
850
+
851
+
852
+ def compute_feature_shapley_values(
853
+ self,
854
+ X: np.ndarray,
855
+ value_function: Callable,
856
+ value_function_name: str = "unknown",
857
+ context: Optional[Dict] = None
858
+ ) -> Tuple[np.ndarray, Dict]:
859
+ """
860
+ Compute feature-level Shapley values for each sample.
861
+
862
+ OPTIMIZED: Uses threading backend for better memory sharing.
863
+ """
864
+ n_samples, n_features = X.shape
865
+
866
+ if self.verbose:
867
+ print(f"\n Computing feature-level {value_function_name}...")
868
+ print(f" Samples: {n_samples}, Features: {n_features}")
869
+ print(f" Max permutations: {self.n_permutations}")
870
+
871
+ early_stop = ShapleyEarlyStopping(
872
+ patience=self.early_stopping_patience,
873
+ tolerance=self.early_stopping_tolerance
874
+ )
875
+
876
+ shapley_sum = np.zeros((n_samples, n_features), dtype=np.float64)
877
+ n_perms_used = 0
878
+
879
+ batch_size = max(1, self.n_permutations // 10)
880
+ info = {'converged': False, 'mean_rel_change': 0.0}
881
+
882
+ # OPTIMIZED: Decide parallelization strategy
883
+ use_parallel = self.n_jobs != 1 and n_features >= 10
884
+
885
+ for batch_start in range(0, self.n_permutations, batch_size):
886
+ batch_end = min(batch_start + batch_size, self.n_permutations)
887
+
888
+ # Generate feature permutations for this batch
889
+ batch_perms = [self.rng.permutation(n_features) for _ in range(batch_end - batch_start)]
890
+
891
+ # Process each sample
892
+ for sample_idx in range(n_samples):
893
+ if use_parallel:
894
+ # OPTIMIZED: Threading backend for memory sharing
895
+ batch_results = Parallel(
896
+ n_jobs=self.n_jobs,
897
+ backend='threading',
898
+ verbose=0
899
+ )(
900
+ delayed(self._process_feature_permutation)(
901
+ sample_idx, perm, X, value_function, value_function_name, context
902
+ )
903
+ for perm in batch_perms
904
+ )
905
+ else:
906
+ # Direct computation for small problems
907
+ batch_results = [
908
+ self._process_feature_permutation(
909
+ sample_idx, perm, X, value_function, value_function_name, context
910
+ )
911
+ for perm in batch_perms
912
+ ]
913
+
914
+ for shapley_contrib in batch_results:
915
+ shapley_sum[sample_idx, :] += shapley_contrib
916
+
917
+ n_perms_used += len(batch_perms)
918
+
919
+ current_estimate = shapley_sum / n_perms_used
920
+ should_stop, info = early_stop.update(current_estimate, n_perms_used)
921
+
922
+ if should_stop and n_perms_used >= 50:
923
+ if self.verbose:
924
+ print(f" Early stop at {n_perms_used} perms (change: {info['mean_rel_change']:.6f})")
925
+ break
926
+
927
+ Phi = shapley_sum / n_perms_used
928
+
929
+ # Compute additivity error
930
+ total_errors = []
931
+ for sample_idx in range(n_samples):
932
+ shapley_total = np.sum(Phi[sample_idx, :])
933
+ if context is not None:
934
+ actual_value = value_function(X[sample_idx:sample_idx+1, :], np.array([sample_idx]), context)
935
+ else:
936
+ actual_value = value_function(X[sample_idx:sample_idx+1, :], np.array([sample_idx]))
937
+
938
+ error = abs(shapley_total - actual_value) / (abs(actual_value) + 1e-12)
939
+ total_errors.append(error)
940
+
941
+ additivity_error = np.mean(total_errors)
942
+
943
+ info_out = {
944
+ 'n_permutations_used': n_perms_used,
945
+ 'converged': info.get('converged', False) if n_perms_used < self.n_permutations else True,
946
+ 'mean_rel_change': info.get('mean_rel_change', 0.0),
947
+ 'additivity_error': float(additivity_error)
948
+ }
949
+
950
+ if self.verbose:
951
+ print(f" {n_perms_used} perms, mean additivity error: {additivity_error:.6f}")
952
+
953
+ return Phi, info_out
954
+
955
+
956
+ def _process_single_permutation(
957
+ self,
958
+ perm: np.ndarray,
959
+ X: np.ndarray,
960
+ value_function: Callable,
961
+ context: Optional[Dict]
962
+ ) -> np.ndarray:
963
+ """
964
+ Process one permutation to compute marginal contributions.
965
+
966
+ OPTIMIZED: Delegates to JIT-compiled helper for massive speedup.
967
+ """
968
+ n_samples, n_features = X.shape
969
+ shapley_contrib = np.zeros((n_samples, n_features), dtype=np.float64)
970
+
971
+ # Compute all value function calls first (can't JIT this part due to callable)
972
+ values = np.zeros(n_samples + 1, dtype=np.float64)
973
+ values[0] = 0.0
974
+
975
+ for j in range(n_samples):
976
+ subset_indices = perm[:j+1]
977
+ X_subset = X[subset_indices]
978
+
979
+ if context is not None:
980
+ values[j+1] = value_function(X_subset, subset_indices, context)
981
+ else:
982
+ values[j+1] = value_function(X_subset, subset_indices)
983
+
984
+ # Now use JIT-compiled function to compute marginal contributions
985
+ shapley_contrib = _compute_marginals_jit(perm, values, n_samples, n_features)
986
+
987
+ return shapley_contrib
988
+
989
+ def _process_feature_permutation(
990
+ self,
991
+ sample_idx: int,
992
+ perm: np.ndarray,
993
+ X: np.ndarray,
994
+ value_function: Callable,
995
+ metric_name: str,
996
+ context: Optional[Dict] = None
997
+ ) -> np.ndarray:
998
+ """
999
+ Process one feature permutation for a single sample to compute per-feature contributions.
1000
+
1001
+ OPTIMIZED: Uses JIT-compiled helper for faster computation.
1002
+ """
1003
+ n_features = X.shape[1]
1004
+ shapley_contrib = np.zeros(n_features, dtype=np.float64)
1005
+
1006
+ # Compute all value function calls first (can't JIT this part)
1007
+ values = np.zeros(n_features + 1, dtype=np.float64)
1008
+ values[0] = 0.0
1009
+
1010
+ for j in range(n_features):
1011
+ feature_subset = perm[:j+1]
1012
+ X_sample_subset = X[sample_idx:sample_idx+1, :][:, feature_subset]
1013
+
1014
+ if context is not None:
1015
+ values[j+1] = value_function(X_sample_subset, np.array([sample_idx]), context)
1016
+ else:
1017
+ values[j+1] = value_function(X_sample_subset, np.array([sample_idx]))
1018
+
1019
+ # Use JIT-compiled function to compute marginals
1020
+ shapley_contrib = _compute_feature_marginals_jit(perm, values, n_features)
1021
+
1022
+ return shapley_contrib
1023
+
1024
+ # ============================================================
1025
+ # Value Functions for Formative Instance Discovery
1026
+ # ============================================================
1027
+
1028
+ def formative_archetypal_convex_hull(
1029
+ X_subset: np.ndarray,
1030
+ indices: np.ndarray,
1031
+ context: Optional[Dict] = None
1032
+ ) -> float:
1033
+ """
1034
+ Archetypal formative value function: Convex hull volume.
1035
+
1036
+ Samples that expand the convex hull boundary are formative archetypes.
1037
+
1038
+ SAFE: Falls back to range-based metric in high dimensions to avoid segfaults.
1039
+ """
1040
+ if len(X_subset) < 3:
1041
+ return 0.0
1042
+
1043
+ n_samples, n_features = X_subset.shape
1044
+
1045
+ # CRITICAL FIX: ConvexHull segfaults in high dimensions (>20D)
1046
+ # Always use safe fallback for high-dimensional data
1047
+ if n_features > 20 or ConvexHull is None or n_samples < n_features + 1:
1048
+ # Safe fallback: Feature range coverage (no segfault risk)
1049
+ ranges = X_subset.max(axis=0) - X_subset.min(axis=0)
1050
+ return float(np.prod(ranges + 1e-10)) # Product of ranges (volume proxy)
1051
+
1052
+ # Low dimensions: Try ConvexHull with safety wrapper
1053
+ try:
1054
+ # Ensure data is float64 for numerical stability
1055
+ X_clean = np.asarray(X_subset, dtype=np.float64)
1056
+
1057
+ # Remove duplicate points (causes ConvexHull to fail)
1058
+ X_unique = np.unique(X_clean, axis=0)
1059
+
1060
+ if len(X_unique) < n_features + 1:
1061
+ # Not enough unique points for hull in this dimension
1062
+ ranges = X_unique.max(axis=0) - X_unique.min(axis=0)
1063
+ return float(np.prod(ranges + 1e-10))
1064
+
1065
+ hull = ConvexHull(X_unique)
1066
+ return float(hull.volume)
1067
+
1068
+ except Exception:
1069
+ # ConvexHull failed - use safe fallback
1070
+ ranges = X_subset.max(axis=0) - X_subset.min(axis=0)
1071
+ return float(np.prod(ranges + 1e-10))
1072
+
1073
+
1074
+ def formative_prototypical_coverage(
1075
+ X_subset: np.ndarray,
1076
+ indices: np.ndarray,
1077
+ context: Optional[Dict] = None
1078
+ ) -> float:
1079
+ """
1080
+ Prototypical formative value function: Coverage/representativeness.
1081
+
1082
+ Samples that maximize pairwise similarity coverage are formative prototypes.
1083
+ """
1084
+ if len(X_subset) < 2:
1085
+ return 0.0
1086
+
1087
+ # L2 normalize
1088
+ norms = np.linalg.norm(X_subset, axis=1, keepdims=True)
1089
+ norms[norms == 0.0] = 1.0
1090
+ X_l2 = X_subset / norms
1091
+
1092
+ # Pairwise cosine similarities
1093
+ similarities = X_l2 @ X_l2.T
1094
+ np.fill_diagonal(similarities, 0)
1095
+
1096
+ if similarities.size == 0:
1097
+ return 0.0
1098
+
1099
+ max_sims = np.max(similarities, axis=1) if similarities.shape[0] > 0 else np.array([0.0])
1100
+ return float(np.mean(max_sims))
1101
+
1102
+
1103
+ def formative_stereotypical_extremeness(
1104
+ X_subset: np.ndarray,
1105
+ indices: np.ndarray,
1106
+ context: Dict
1107
+ ) -> float:
1108
+ """
1109
+ Stereotypical formative value function: Extremeness from median.
1110
+
1111
+ Samples that pull the distribution toward the target are formative stereotypes.
1112
+ """
1113
+ if len(X_subset) == 0:
1114
+ return 0.0
1115
+
1116
+ target_values = context['target_values']
1117
+ target = context['target']
1118
+ median = context.get('median', np.median(target_values))
1119
+
1120
+ subset_vals = target_values[indices]
1121
+
1122
+ if target == 'max':
1123
+ # How far above median?
1124
+ extremeness = np.mean(np.maximum(subset_vals - median, 0))
1125
+ elif target == 'min':
1126
+ # How far below median?
1127
+ extremeness = np.mean(np.maximum(median - subset_vals, 0))
1128
+ else:
1129
+ # How much closer to target than median?
1130
+ target_val = float(target)
1131
+ median_dist = abs(median - target_val)
1132
+ subset_dist = np.mean(np.abs(subset_vals - target_val))
1133
+ extremeness = median_dist - subset_dist
1134
+
1135
+ return float(extremeness)
1136
+
1137
+
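# [Editor's note] A sketch (not part of the source) of how these value functions plug into the
# engine above; `X` is the preprocessed sample matrix and `y` stands in for the stereotype
# source column.
#
#     engine = ShapleySignificanceEngine(n_permutations=50, random_state=42, verbose=True)
#     phi_proto, info = engine.compute_shapley_values(
#         X, formative_prototypical_coverage, value_function_name="prototypical coverage")
#     ctx = {"target_values": y, "target": "max", "median": float(np.median(y))}
#     phi_stereo, _ = engine.compute_shapley_values(
#         X, formative_stereotypical_extremeness, value_function_name="stereotypical", context=ctx)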
1138
+ # ============================================================
1139
+ # [F] DataTypical API
1140
+ # ============================================================
1141
+ @dataclass
1142
+ class DataTypical:
1143
+ # ---- Core Config ----
1144
+ nmf_rank: int = 8
1145
+ n_prototypes: int = 20
1146
+ scale: str = "minmax"
1147
+ distance_metric: str = "euclidean"
1148
+ similarity_metric: str = "cosine"
1149
+ deterministic: bool = True
1150
+ n_jobs: int = -1
1151
+ max_iter_nmf: int = 400
1152
+ tol_nmf: float = 1e-4
1153
+ feature_weights: Optional[np.ndarray] = None
1154
+ speed_mode: bool = False
1155
+ dtype: str = "float32"
1156
+ random_state: int = 42
1157
+ max_memory_mb: int = 2048
1158
+ return_ranks_only: bool = False
1159
+ auto_n_prototypes: Optional[str] = None
1160
+ verbose: bool = False
1161
+ max_missing_frac: float = 1.0
1162
+
1163
+ # ---- Stereotype Configuration (NEW in v0.4) ----
1164
+ stereotype_column: Optional[str] = None
1165
+ stereotype_target: Union[str, float] = "max"
1166
+ label_columns: Optional[List[str]] = None
1167
+ stereotype_keywords: Optional[List[str]] = None
1168
+ graph_topology_features: Optional[List[str]] = None
1169
+
1170
+ # ---- Data Type Configuration (NEW in v0.5) ----
1171
+ data_type: Optional[str] = None
1172
+
1173
+ # ---- Shapley Configuration (NEW in v0.6) ----
1174
+ shapley_mode: bool = False
1175
+ shapley_n_permutations: int = 100
1176
+ shapley_top_n: Optional[Union[int, float]] = None # CHANGED: Now supports float
1177
+ shapley_early_stopping_patience: int = 10
1178
+ shapley_early_stopping_tolerance: float = 0.01
1179
+ shapley_compute_formative: Optional[bool] = None # NEW in v0.7: None = auto from fast_mode
1180
+
1181
+ # ---- Performance Mode (NEW in v0.7) ----
1182
+ fast_mode: bool = False
1183
+ archetypal_method: Optional[str] = None
1184
+
1185
+ # ---- Artifacts ----
1186
+ W_: Optional[np.ndarray] = field(default=None, init=False)
1187
+ H_: Optional[np.ndarray] = field(default=None, init=False)
1188
+ reconstruction_error_: Optional[float] = field(default=None, init=False)
1189
+
1190
+ n_archetypes_: Optional[int] = field(default=None, init=False)
1191
+ prototype_indices_: Optional[np.ndarray] = field(default=None, init=False)
1192
+ prototype_rows_: Optional[np.ndarray] = field(default=None, init=False)
1193
+ marginal_gains_: Optional[np.ndarray] = field(default=None, init=False)
1194
+ assignments_: Optional[np.ndarray] = field(default=None, init=False)
1195
+ coverage_: Optional[np.ndarray] = field(default=None, init=False)
1196
+ knee_: Optional[int] = field(default=None, init=False)
1197
+
1198
+ scaler_: Optional[MinMaxScaler] = field(default=None, init=False)
1199
+ vectorizer_: Optional[TfidfVectorizer] = field(default=None, init=False)
1200
+ nmf_model_: Optional[NMF] = field(default=None, init=False)
1201
+
1202
+ settings_: Dict = field(default_factory=dict, init=False)
1203
+ ideals_: Dict[str, np.ndarray] = field(default_factory=dict, init=False)
1204
+ dropped_columns_: List[str] = field(default_factory=list, init=False)
1205
+ missingness_: Dict[str, float] = field(default_factory=dict, init=False)
1206
+ train_index_: Optional[pd.Index] = field(default=None, init=False)
1207
+
1208
+ # Feature selection for tables (numeric-only)
1209
+ feature_columns_: Optional[List[str]] = field(default=None, init=False)
1210
+ impute_median_: Optional[np.ndarray] = field(default=None, init=False)
1211
+ keep_mask_: Optional[np.ndarray] = field(default=None, init=False)
1212
+
1213
+ # NEW in v0.4: Stereotype artifacts
1214
+ _df_original_fit: Optional[pd.DataFrame] = field(default=None, init=False)
1215
+ label_df_: Optional[pd.DataFrame] = field(default=None, init=False)
1216
+ text_metadata_: Optional[pd.DataFrame] = field(default=None, init=False)
1217
+ stereotype_keyword_scores_: Optional[np.ndarray] = field(default=None, init=False)
1218
+ graph_topology_df_: Optional[pd.DataFrame] = field(default=None, init=False)
1219
+
1220
+ # Data type detection (NEW in v0.5)
1221
+ _detected_data_type: Optional[str] = field(default=None, init=False)
1222
+
1223
+ # ---- Shapley Artifacts (NEW in v0.6) ----
1224
+ Phi_archetypal_explanations_: Optional[np.ndarray] = field(default=None, init=False)
1225
+ Phi_prototypical_explanations_: Optional[np.ndarray] = field(default=None, init=False)
1226
+ Phi_stereotypical_explanations_: Optional[np.ndarray] = field(default=None, init=False)
1227
+
1228
+ Phi_archetypal_formative_: Optional[np.ndarray] = field(default=None, init=False)
1229
+ Phi_prototypical_formative_: Optional[np.ndarray] = field(default=None, init=False)
1230
+ Phi_stereotypical_formative_: Optional[np.ndarray] = field(default=None, init=False)
1231
+
1232
+ shapley_info_: Dict = field(default_factory=dict, init=False)
1233
+ _stereotype_source_fit_: Optional[pd.Series] = field(default=None, init=False)
1234
+
1235
+
1236
+ # --------------------------
1237
+ # Auto-Detection and Routing
1238
+ # --------------------------
1239
+
1240
+ def _auto_detect_data_type(self, X, **kwargs) -> str:
1241
+ """
1242
+ Auto-detect data type based on input format.
1243
+
1244
+ Priority:
1245
+ 1. Graph: If edges/edge_index parameter present
1246
+ 2. Text: If X is list/tuple of strings
1247
+ 3. Tabular: If X is DataFrame or array
1248
+
1249
+ Parameters
1250
+ ----------
1251
+ X : various
1252
+ Input data
1253
+ **kwargs
1254
+ Additional parameters (checked for edges/edge_index)
1255
+
1256
+ Returns
1257
+ -------
1258
+ data_type : str
1259
+ One of 'graph', 'text', 'tabular'
1260
+
1261
+ Raises
1262
+ ------
1263
+ ValueError
1264
+ If data type cannot be determined
1265
+ """
1266
+ # Priority 1: Graph (edges parameter indicates graph data)
1267
+ if 'edges' in kwargs or 'edge_index' in kwargs:
1268
+ return 'graph'
1269
+
1270
+ # Priority 2: Text (list/tuple of strings)
1271
+ if isinstance(X, (list, tuple)):
1272
+ if len(X) > 0 and isinstance(X[0], str):
1273
+ return 'text'
1274
+
1275
+ # Priority 3: Tabular (DataFrame or array)
1276
+ if isinstance(X, (pd.DataFrame, np.ndarray)):
1277
+ return 'tabular'
1278
+
1279
+ # Cannot determine
1280
+ raise ValueError(
1281
+ f"Cannot auto-detect data type from input of type {type(X)}. "
1282
+ f"Supported formats: DataFrame/array (tabular), list of strings (text), "
1283
+ f"or provide edges parameter (graph). "
1284
+ f"Alternatively, specify data_type='tabular'/'text'/'graph' explicitly."
1285
+ )
1286
+
1287
+ def _validate_data_type(self, detected: str) -> str:
1288
+ """
1289
+ Validate and resolve data_type configuration.
1290
+
1291
+ If data_type is specified in config, validate it matches expected values.
1292
+ Otherwise use auto-detected type.
1293
+
1294
+ Parameters
1295
+ ----------
1296
+ detected : str
1297
+ Auto-detected data type
1298
+
1299
+ Returns
1300
+ -------
1301
+ data_type : str
1302
+ Final data type to use
1303
+
1304
+ Raises
1305
+ ------
1306
+ ValueError
1307
+ If configured data_type is invalid
1308
+ """
1309
+ if self.data_type is not None:
1310
+ # Manual override provided
1311
+ valid_types = {'tabular', 'text', 'graph'}
1312
+ if self.data_type not in valid_types:
1313
+ raise ValueError(
1314
+ f"Invalid data_type='{self.data_type}'. "
1315
+ f"Must be one of {valid_types} or None (auto-detect)."
1316
+ )
1317
+ if self.verbose:
1318
+ if detected != self.data_type:
1319
+ print(f"Using configured data_type='{self.data_type}' "
1320
+ f"(auto-detected: '{detected}')")
1321
+ return self.data_type
1322
+ else:
1323
+ # Use auto-detected
1324
+ if self.verbose:
1325
+ print(f"Auto-detected data_type: '{detected}'")
1326
+ return detected
1327
+
1328
+ def _apply_fast_mode_defaults(self) -> None:
1329
+ """
1330
+ Apply fast_mode preset defaults if parameters not explicitly set.
1331
+
1332
+ fast_mode=True: Exploration (NMF + explanations only + subsample)
1333
+ fast_mode=False: Publication (AA + formative + full dataset)
1334
+
1335
+ Users can override any individual parameter by setting it explicitly.
1336
+ """
1337
+ if self.fast_mode:
1338
+ # Fast mode defaults (exploration)
1339
+ if self.archetypal_method is None:
1340
+ self.archetypal_method = 'nmf'
1341
+
1342
+ # Reduce Shapley permutations for speed
1343
+ if self.shapley_n_permutations == 100: # Default value, not overridden
1344
+ self.shapley_n_permutations = 30
1345
+
1346
+ # Subsample explanations to top 50%
1347
+ if self.shapley_top_n is None:
1348
+ self.shapley_top_n = 0.5 # 50% of instances
1349
+
1350
+ # Skip formative in fast mode (explanations only)
1351
+ if self.shapley_compute_formative is None:
1352
+ self.shapley_compute_formative = False
1353
+
1354
+ else:
1355
+ # Publication mode defaults (rigorous)
1356
+ if self.archetypal_method is None:
1357
+ self.archetypal_method = 'aa' # True archetypal analysis
1358
+
1359
+ # Keep shapley_n_permutations=100 (default)
1360
+ # Keep shapley_top_n=None (compute for all instances)
1361
+ # Compute formative in publication mode
1362
+ if self.shapley_compute_formative is None:
1363
+ self.shapley_compute_formative = True
1364
+
1365
+ # Validate archetypal_method
1366
+ if self.archetypal_method not in ['nmf', 'aa']:
1367
+ raise ValueError(
1368
+ f"archetypal_method must be 'nmf' or 'aa', got '{self.archetypal_method}'"
1369
+ )
1370
+
1371
+ if self.verbose:
1372
+ mode_name = "Fast" if self.fast_mode else "Publication"
1373
+ print(f"\n{mode_name} mode defaults:")
1374
+ print(f" archetypal_method: {self.archetypal_method}")
1375
+ print(f" shapley_n_permutations: {self.shapley_n_permutations}")
1376
+ print(f" shapley_top_n: {self.shapley_top_n if self.shapley_top_n else 'all instances'}")
1377
+ print(f" shapley_compute_formative: {self.shapley_compute_formative}")
1378
+
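# [Editor's note] Illustration of the two presets above (not part of the source); any field can
# still be overridden explicitly by the user.
#
#     dt_explore = DataTypical(fast_mode=True,  shapley_mode=True)   # nmf, 30 perms, top 50%, no formative
#     dt_publish = DataTypical(fast_mode=False, shapley_mode=True)   # aa, 100 perms, all instances, formative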
1379
+ # --------------------------
1380
+ # Unified Interface
1381
+ # --------------------------
1382
+ def fit(self, X: Union[pd.DataFrame, np.ndarray, List[str]], **kwargs):
1383
+ """
1384
+ Fit DataTypical on input data (auto-detects format).
1385
+
1386
+ Automatically detects whether input is tabular, text, or graph data
1387
+ based on format and parameters. Can be overridden with data_type parameter.
1388
+
1389
+ Parameters
1390
+ ----------
1391
+ X : DataFrame, array, or list of strings
1392
+ Input data:
1393
+ - Tabular: DataFrame or 2D array
1394
+ - Text: List of string documents
1395
+ - Graph: Node features (with edges parameter)
1396
+
1397
+ **kwargs : optional
1398
+ Additional parameters for specific data types:
1399
+
1400
+ For text:
1401
+ vectorizer : str, default 'tfidf'
1402
+ text_metadata : pd.DataFrame, optional
1403
+
1404
+ For graph:
1405
+ edges : np.ndarray (required for graph detection)
1406
+ Edge list as (2, n_edges) or (n_edges, 2)
1407
+ edge_index : np.ndarray (alias for edges)
1408
+ compute_topology : bool, default True
1409
+
1410
+ Returns
1411
+ -------
1412
+ self : DataTypical
1413
+ Fitted estimator
1414
+
1415
+ Examples
1416
+ --------
1417
+ >>> # Tabular (auto-detected)
1418
+ >>> dt = DataTypical()
1419
+ >>> dt.fit(dataframe)
1420
+
1421
+ >>> # Text (auto-detected from list of strings)
1422
+ >>> dt = DataTypical(stereotype_keywords=['protein'])
1423
+ >>> dt.fit(corpus)
1424
+
1425
+ >>> # Graph (auto-detected from edges parameter)
1426
+ >>> dt = DataTypical(graph_topology_features=['degree'])
1427
+ >>> dt.fit(node_features, edges=edge_list)
1428
+
1429
+ >>> # Manual override
1430
+ >>> dt = DataTypical(data_type='tabular')
1431
+ >>> dt.fit(data)
1432
+ """
1433
+ # Apply fast_mode defaults (if not already applied)
1434
+ if not hasattr(self, '_fast_mode_applied'):
1435
+ self._apply_fast_mode_defaults()
1436
+ self._fast_mode_applied = True
1437
+
1438
+ # Auto-detect data type
1439
+ detected = self._auto_detect_data_type(X, **kwargs)
1440
+
1441
+ # Validate and resolve final type
1442
+ final_type = self._validate_data_type(detected)
1443
+ self._detected_data_type = final_type
1444
+
1445
+ # Route to appropriate internal method
1446
+ if final_type == 'tabular':
1447
+ return self._fit_tabular(X)
1448
+ elif final_type == 'text':
1449
+ vectorizer = kwargs.get('vectorizer', 'tfidf')
1450
+ text_metadata = kwargs.get('text_metadata', None)
1451
+ return self._fit_text(X, vectorizer, text_metadata)
1452
+ elif final_type == 'graph':
1453
+ edges = kwargs.get('edges', kwargs.get('edge_index', None))
1454
+ compute_topology = kwargs.get('compute_topology', True)
1455
+ return self._fit_graph(X, edges, compute_topology)
1456
+ else:
1457
+ raise RuntimeError(f"Unknown data type: {final_type}")
1458
+
1459
+ def transform(self, X: Union[pd.DataFrame, np.ndarray, List[str]], **kwargs):
1460
+ """
1461
+ Transform data using fitted model (uses detected format from fit).
1462
+
1463
+ Parameters
1464
+ ----------
1465
+ X : DataFrame, array, or list of strings
1466
+ Input data (same format as used in fit)
1467
+ **kwargs : optional
1468
+ Additional parameters (same as fit)
1469
+
1470
+ Returns
1471
+ -------
1472
+ results : pd.DataFrame
1473
+ Significance rankings and diagnostics
1474
+
1475
+ Examples
1476
+ --------
1477
+ >>> dt = DataTypical()
1478
+ >>> dt.fit(train_data)
1479
+ >>> results = dt.transform(test_data)
1480
+ """
1481
+ if self._detected_data_type is None:
1482
+ raise RuntimeError("Model not fitted. Call fit() first.")
1483
+
1484
+ return_ranks_only = kwargs.get('return_ranks_only', self.return_ranks_only)
1485
+
1486
+ if self._detected_data_type == 'tabular':
1487
+ return self._transform_tabular(X, return_ranks_only)
1488
+ elif self._detected_data_type == 'text':
1489
+ return self._transform_text(X, return_ranks_only)
1490
+ elif self._detected_data_type == 'graph':
1491
+ # Graph transform needs to recompute topology if edges provided
1492
+ edges = kwargs.get('edges', kwargs.get('edge_index', None))
1493
+ return self._transform_graph(X, edges, return_ranks_only)
1494
+ else:
1495
+ raise RuntimeError(f"Unknown detected type: {self._detected_data_type}")
1496
+
1497
+ def fit_transform(
1498
+ self,
1499
+ X: Union[pd.DataFrame, np.ndarray, List[str]],
1500
+ return_ranks_only: Optional[bool] = None,
1501
+ **kwargs
1502
+ ) -> pd.DataFrame:
1503
+ """
1504
+ Fit and transform in one step (auto-detects format).
1505
+
1506
+ Parameters
1507
+ ----------
1508
+ X : DataFrame, array, or list of strings
1509
+ Input data
1510
+ return_ranks_only : bool, optional
1511
+ If True, return only rank columns
1512
+ **kwargs : optional
1513
+ Additional parameters (see fit() for details)
1514
+
1515
+ Returns
1516
+ -------
1517
+ results : pd.DataFrame
1518
+ Significance rankings and diagnostics
1519
+
1520
+ Examples
1521
+ --------
1522
+ >>> # Tabular
1523
+ >>> dt = DataTypical()
1524
+ >>> results = dt.fit_transform(data)
1525
+
1526
+ >>> # Text
1527
+ >>> dt = DataTypical(stereotype_keywords=['keyword'])
1528
+ >>> results = dt.fit_transform(corpus)
1529
+
1530
+ >>> # Graph
1531
+ >>> dt = DataTypical(graph_topology_features=['degree'])
1532
+ >>> results = dt.fit_transform(node_features, edges=edges)
1533
+ """
1534
+ self.fit(X, **kwargs)
1535
+ if return_ranks_only is not None:
1536
+ kwargs['return_ranks_only'] = return_ranks_only
1537
+ return self.transform(X, **kwargs)
1538
+
1539
+ # --------------------------
1540
+ # Internal Methods (Type-Specific)
1541
+ # --------------------------
1542
+ def _fit_tabular(self, X: Union[pd.DataFrame, np.ndarray]):
1543
+ """Internal method for fitting tabular data."""
1544
+ self._validate_stereotype_config()
1545
+ df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X))
1546
+ self.train_index_ = df.index.copy()
1547
+ with _ThreadControl(self.deterministic and not self.speed_mode) as tc:
1548
+ _seed_everything(self.random_state)
1549
+ X_scaled, X_l2 = self._preprocess_table_fit(df)
1550
+ self._fit_components(X_scaled, X_l2, df.index)
1551
+
1552
+ # Store stereotype source for Shapley
1553
+ if self.stereotype_column is not None and self.shapley_mode:
1554
+ self._stereotype_source_fit_ = self._get_stereotype_source_table(df)
1555
+
1556
+ # NEW: Shapley analysis
1557
+ if self.shapley_mode:
1558
+ if self.verbose:
1559
+ print("\n" + "="*70)
1560
+ print("SHAPLEY DUAL-PERSPECTIVE ANALYSIS")
1561
+ print("="*70)
1562
+ self._fit_shapley_dual_perspective(X_scaled, X_l2, df.index)
1563
+ self._record_settings(tc)
1564
+ return self
1565
+
1566
+ def _fit_text(
1567
+ self,
1568
+ corpus: Union[List[str], Iterable[str]],
1569
+ vectorizer: str = "tfidf",
1570
+ text_metadata: Optional[pd.DataFrame] = None
1571
+ ):
1572
+ """Internal method for fitting text data."""
1573
+ self._validate_stereotype_config()
1574
+ with _ThreadControl(self.deterministic and not self.speed_mode) as tc:
1575
+ _seed_everything(self.random_state)
1576
+ X_scaled, X_l2 = self._preprocess_text_fit(corpus, vectorizer, text_metadata)
1577
+ idx = pd.RangeIndex(X_scaled.shape[0])
1578
+ self.train_index_ = idx
1579
+ self._fit_components(X_scaled, X_l2, idx)
1580
+ self._record_settings(tc)
1581
+ return self
1582
+
1583
+ def _fit_graph(
1584
+ self,
1585
+ node_features: Union[pd.DataFrame, np.ndarray],
1586
+ edges: Optional[np.ndarray] = None,
1587
+ compute_topology: bool = True
1588
+ ):
1589
+ """Internal method for fitting graph data."""
1590
+ # Convert to DataFrame
1591
+ if isinstance(node_features, pd.DataFrame):
1592
+ df = node_features.copy()
1593
+ else:
1594
+ df = pd.DataFrame(node_features)
1595
+
1596
+ n_nodes = len(df)
1597
+
1598
+ # Compute topology features if edges provided
1599
+ self.graph_topology_df_ = None
1600
+ if edges is not None and compute_topology:
1601
+ topology_df = self._compute_graph_topology_features(edges, n_nodes)
1602
+ self.graph_topology_df_ = topology_df
1603
+
1604
+ # Append to node features
1605
+ for col in topology_df.columns:
1606
+ if col not in df.columns:
1607
+ df[col] = topology_df[col].values
1608
+ else:
1609
+ warnings.warn(f"Topology feature '{col}' already exists, skipping")
1610
+
1611
+ # Delegate to tabular processing
1612
+ return self._fit_tabular(df)
1613
+
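# [Editor's note] A sketch (not part of the source) of the graph entry point wired up above:
# node features plus an edge list routed through fit(); the edge-array shapes follow the
# fit() docstring ((2, n_edges) or (n_edges, 2)).
#
#     import numpy as np
#     node_features = np.random.default_rng(0).normal(size=(100, 8))
#     edges = np.array([[0, 1], [1, 2], [2, 3]])                     # (n_edges, 2)
#     dt = DataTypical(graph_topology_features=['degree'])
#     results = dt.fit_transform(node_features, edges=edges)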
1614
+ def _transform_tabular(
1615
+ self,
1616
+ X: Union[pd.DataFrame, np.ndarray],
1617
+ return_ranks_only: bool = False
1618
+ ) -> pd.DataFrame:
1619
+ """Internal method for transforming tabular data."""
1620
+ df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X))
1621
+ with _ThreadControl(self.deterministic and not self.speed_mode):
1622
+ X_scaled, X_l2 = self._preprocess_table_transform(df)
1623
+
1624
+ # Get stereotype source for transform
1625
+ stereotype_source = None
1626
+ if self.stereotype_column is not None:
1627
+ stereotype_source = self._get_stereotype_source_table(df)
1628
+
1629
+ ranks = self._score_with_fitted(X_scaled, X_l2, df.index, stereotype_source)
1630
+
1631
+ # Add Shapley rankings (including None columns if formative skipped)
1632
+ if self.shapley_mode:
1633
+ shapley_ranks = self._compute_shapley_formative_ranks()
1634
+ ranks = pd.concat([ranks, shapley_ranks], axis=1)
1635
+
1636
+ if return_ranks_only:
1637
+ return ranks
1638
+ out = df.copy()
1639
+ for col in ranks.columns:
1640
+ out[col] = ranks[col]
1641
+ return out
1642
+
1643
+ def _transform_text(
1644
+ self,
1645
+ corpus: Union[List[str], Iterable[str]],
1646
+ return_ranks_only: bool = False
1647
+ ) -> pd.DataFrame:
1648
+ """Internal method for transforming text data."""
1649
+ with _ThreadControl(self.deterministic and not self.speed_mode):
1650
+ X_scaled, X_l2 = self._preprocess_text_transform(corpus)
1651
+ idx = pd.RangeIndex(X_scaled.shape[0])
1652
+
1653
+ # Get stereotype source (priority: metadata column > keywords > None)
1654
+ stereotype_source = None
1655
+
1656
+ # Priority 1: Metadata column (from fit_text)
1657
+ if self.stereotype_column is not None and self.text_metadata_ is not None:
1658
+ if self.stereotype_column in self.text_metadata_.columns:
1659
+ stereotype_source = self.text_metadata_[self.stereotype_column]
1660
+
1661
+ # Priority 2: Keyword scores (recompute on new corpus)
1662
+ elif self.stereotype_keywords is not None:
1663
+ corpus_list = list(corpus)
1664
+ X_tfidf = self.vectorizer_.transform(corpus_list)
1665
+ keyword_scores = self._compute_keyword_scores(
1666
+ X_tfidf, corpus_list, self.stereotype_keywords
1667
+ )
1668
+ stereotype_source = pd.Series(keyword_scores)
1669
+
1670
+ ranks = self._score_with_fitted(X_scaled, X_l2, idx, stereotype_source)
1671
+
1672
+ # Add Shapley rankings (including None columns if formative skipped)
1673
+ if self.shapley_mode:
1674
+ shapley_ranks = self._compute_shapley_formative_ranks()
1675
+ ranks = pd.concat([ranks, shapley_ranks], axis=1)
1676
+
1677
+ return ranks
1678
+
1679
+ def _transform_graph(
1680
+ self,
1681
+ node_features: Union[pd.DataFrame, np.ndarray],
1682
+ edges: Optional[np.ndarray] = None,
1683
+ return_ranks_only: bool = False
1684
+ ) -> pd.DataFrame:
1685
+ """Internal method for transforming graph data."""
1686
+ # Convert to DataFrame
1687
+ if isinstance(node_features, pd.DataFrame):
1688
+ df = node_features.copy()
1689
+ else:
1690
+ df = pd.DataFrame(node_features)
1691
+
1692
+ n_nodes = len(df)
1693
+
1694
+ # Recompute topology features if edges provided and model was trained with them
1695
+ if edges is not None and self.graph_topology_df_ is not None:
1696
+ topology_df = self._compute_graph_topology_features(edges, n_nodes)
1697
+
1698
+ # Append to node features
1699
+ for col in topology_df.columns:
1700
+ if col not in df.columns:
1701
+ df[col] = topology_df[col].values
1702
+
1703
+ # Delegate to tabular transform (which handles Shapley ranks)
1704
+ return self._transform_tabular(df, return_ranks_only)
1705
+
1706
+ # ============================================================
1707
+ # Shapley Dual-Perspective Methods (NEW in v0.6)
1708
+ # ============================================================
1709
+
1710
+ def _fit_shapley_dual_perspective(
1711
+ self,
1712
+ X_scaled: ArrayLike,
1713
+ X_l2: ArrayLike,
1714
+ index: pd.Index
1715
+ ) -> None:
1716
+ """
1717
+ Fit Shapley analysis with dual perspective:
1718
+ 1. Explanations: Why is each sample significant? (always computed)
1719
+ 2. Formative: Which samples create structure? (optional)
1720
+
1721
+ MEMORY OPTIMIZED: X_dense is freed after the Shapley computation.
1722
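+
+ Illustrative sketch (df is any numeric DataFrame; only column names that this
+ class itself adds are assumed): the *_rank columns score samples that ARE
+ significant, while the *_shapley_rank columns score samples that MAKE the
+ dataset significant.
+
+ >>> dt = DataTypical(shapley_mode=True)
+ >>> out = dt.fit_transform(df)
+ >>> out[['archetypal_rank', 'archetypal_shapley_rank']].head()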
+ """
1723
+ X_dense = X_scaled.toarray() if (sp is not None and sp.isspmatrix(X_scaled)) \
1724
+ else np.asarray(X_scaled, dtype=np.float64)
1725
+ n_samples, n_features = X_dense.shape
1726
+
1727
+ # Determine if we compute formative
1728
+ compute_formative = self.shapley_compute_formative if self.shapley_compute_formative is not None else True
1729
+
1730
+ # SUBSAMPLE LOGIC: Only for explanations
1731
+ subsample_indices_explanations = None
1732
+
1733
+ if self.shapley_top_n is not None:
1734
+ # Support both fraction and absolute count
1735
+ if isinstance(self.shapley_top_n, float) and 0 < self.shapley_top_n < 1:
1736
+ n_subsample = max(1, int(self.shapley_top_n * n_samples))
1737
+ else:
1738
+ n_subsample = int(self.shapley_top_n)
1739
+
1740
+ if n_subsample < n_samples:
1741
+ if self.verbose:
1742
+ print(f"\n[Subsampling] Selecting top {n_subsample} samples per metric")
1743
+ if compute_formative:
1744
+ print(" Formative: Full dataset (required for structure)")
1745
+ else:
1746
+ print(" Formative: SKIPPED (fast_mode)")
1747
+
1748
+ # Get correct stereotype source for ranking
1749
+ stereotype_source = self._stereotype_source_fit_ if hasattr(self, '_stereotype_source_fit_') else None
1750
+ temp_results = self._score_with_fitted(X_scaled, X_l2, index, stereotype_source)
1751
+
1752
+ # Get top n_subsample for each metric separately
1753
+ top_arch = set(temp_results.nlargest(n_subsample, 'archetypal_rank').index)
1754
+ top_proto = set(temp_results.nlargest(n_subsample, 'prototypical_rank').index)
1755
+ top_stereo = set(temp_results.nlargest(n_subsample, 'stereotypical_rank').index)
1756
+
1757
+ # Union of all top samples - NO TRIMMING!
1758
+ # This ensures all top-N samples from each metric have Shapley values
1759
+ top_indices_union = top_arch | top_proto | top_stereo
1760
+
1761
+ if self.verbose:
1762
+ print(f" Top {n_subsample} archetypal samples: {len(top_arch)}")
1763
+ print(f" Top {n_subsample} prototypical samples: {len(top_proto)}")
1764
+ print(f" Top {n_subsample} stereotypical samples: {len(top_stereo)}")
1765
+ print(f" Union: {len(top_indices_union)} unique samples")
1766
+ print(f" (Computing Shapley for all union samples - ensures no empty plots)")
1767
+
1768
+ # Identify core samples (appear in multiple metric top-N lists)
1769
+ # These get full permutations; secondary samples get reduced permutations
1770
+ sample_counts = {}
1771
+ for idx in top_indices_union:
1772
+ count = sum([idx in top_arch, idx in top_proto, idx in top_stereo])
1773
+ sample_counts[idx] = count
1774
+
1775
+ # Core = samples in 2+ metrics (most important)
1776
+ core_samples_df_idx = [idx for idx, cnt in sample_counts.items() if cnt >= 2]
1777
+ core_positions = sorted([index.get_loc(idx) for idx in core_samples_df_idx])
1778
+ self._union_core_samples = np.array(core_positions)
1779
+
1780
+ if self.verbose:
1781
+ print(f" Core samples (in 2+ metrics): {len(core_samples_df_idx)}")
1782
+ print(f" Secondary samples (in 1 metric): {len(top_indices_union) - len(core_samples_df_idx)}")
1783
+
1784
+ # Convert to positional indices (deterministic order via sorting)
1785
+ top_positions = sorted([index.get_loc(idx) for idx in top_indices_union])
1786
+ subsample_indices_explanations = np.array(top_positions)
1787
+
1788
+ # MEMORY CLEANUP
1789
+ _cleanup_memory(temp_results)
1790
+
1791
+ # Initialize Shapley engine
1792
+ engine = ShapleySignificanceEngine(
1793
+ n_permutations=self.shapley_n_permutations,
1794
+ random_state=self.random_state,
1795
+ n_jobs=self.n_jobs,
1796
+ early_stopping_patience=self.shapley_early_stopping_patience,
1797
+ early_stopping_tolerance=self.shapley_early_stopping_tolerance,
1798
+ verbose=self.verbose
1799
+ )
1800
+
1801
+ # PERSPECTIVE 1: Formative Instances (optional)
1802
+ if compute_formative:
1803
+ if self.verbose:
1804
+ print("\n[1] Computing Formative Instances (global perspective)...")
1805
+ print(" Using FULL dataset (required to measure structure)")
1806
+
1807
+ # Formative archetypal (convex hull)
1808
+ self.Phi_archetypal_formative_, self.shapley_info_['archetypal_formative'] = \
1809
+ engine.compute_shapley_values(
1810
+ X_dense, # Always full dataset
1811
+ formative_archetypal_convex_hull,
1812
+ "Archetypal Formative (Convex Hull)"
1813
+ )
1814
+
1815
+ # Formative prototypical (coverage)
1816
+ self.Phi_prototypical_formative_, self.shapley_info_['prototypical_formative'] = \
1817
+ engine.compute_shapley_values(
1818
+ X_dense,
1819
+ formative_prototypical_coverage,
1820
+ "Prototypical Formative (Coverage)"
1821
+ )
1822
+
1823
+ # Formative stereotypical (extremeness)
1824
+ if self.stereotype_column is not None and hasattr(self, '_stereotype_source_fit_'):
1825
+ target_values = self._stereotype_source_fit_.to_numpy(dtype=np.float64)
1826
+ context = {
1827
+ 'target_values': target_values,
1828
+ 'target': self.stereotype_target,
1829
+ 'median': np.median(target_values)
1830
+ }
1831
+
1832
+ self.Phi_stereotypical_formative_, self.shapley_info_['stereotypical_formative'] = \
1833
+ engine.compute_shapley_values(
1834
+ X_dense,
1835
+ formative_stereotypical_extremeness,
1836
+ "Stereotypical Formative (Extremeness)",
1837
+ context
1838
+ )
1839
+ else:
1840
+ self.Phi_stereotypical_formative_ = None
1841
+ else:
1842
+ # Skip formative computation
1843
+ if self.verbose:
1844
+ print("\n[1] Skipping Formative Instances (fast_mode)")
1845
+
1846
+ self.Phi_archetypal_formative_ = None
1847
+ self.Phi_prototypical_formative_ = None
1848
+ self.Phi_stereotypical_formative_ = None
1849
+
1850
+ # PERSPECTIVE 2: Explanations (always computed, optionally subsampled)
1851
+ if self.verbose:
1852
+ print("\n[2] Computing Local Explanations (why is each sample significant)...")
1853
+ if subsample_indices_explanations is not None:
1854
+ print(f" Computing for {len(subsample_indices_explanations)} samples (union of top-N per metric)")
1855
+ else:
1856
+ print(f" Computing for all {n_samples} instances")
1857
+
1858
+ self._fit_shapley_explanations(
1859
+ X_dense, X_l2, index, engine,
1860
+ subsample_indices_explanations
1861
+ )
1862
+
1863
+ # MEMORY CLEANUP: Free X_dense copy (original X_scaled still needed)
1864
+ _cleanup_memory(X_dense, force_gc=True)
1865
+
1866
+ if self.verbose:
1867
+ print("\n" + "="*70)
1868
+ if compute_formative:
1869
+ print("✓ Shapley Dual-Perspective Analysis Complete")
1870
+ else:
1871
+ print("✓ Shapley Explanations Complete (formative skipped)")
1872
+ print("="*70)
1873
+
1874
+
1875
+ def _fit_shapley_explanations(
1876
+ self,
1877
+ X_dense: np.ndarray,
1878
+ X_l2: ArrayLike,
1879
+ index: pd.Index,
1880
+ engine: ShapleySignificanceEngine,
1881
+ subsample_indices: Optional[np.ndarray] = None
1882
+ ) -> None:
1883
+ """
1884
+ Compute Shapley explanations with optional subsampling.
1885
+
1886
+ OPTIMIZED: Two-tier permutation strategy for union samples.
1887
+ """
1888
+
1889
+ n_samples, n_features = X_dense.shape
1890
+
1891
+ # Determine which samples to compute for
1892
+ if subsample_indices is not None:
1893
+ samples_to_compute = subsample_indices
1894
+
1895
+ # OPTIMIZATION: Two-tier permutation strategy
1896
+ # If we have union samples, use full permutations only for "core" samples
1897
+ # Core = samples that appear in multiple metric top-N lists
1898
+ if hasattr(self, '_union_core_samples'):
1899
+ core_samples = self._union_core_samples
1900
+ secondary_samples = np.setdiff1d(samples_to_compute, core_samples)
1901
+
1902
+ if self.verbose and len(secondary_samples) > 0:
1903
+ print(f" Two-tier permutation strategy:")
1904
+ print(f" Core samples ({len(core_samples)}): {engine.n_permutations} permutations")
1905
+ print(f" Secondary samples ({len(secondary_samples)}): {engine.n_permutations // 2} permutations")
1906
+ else:
1907
+ core_samples = samples_to_compute
1908
+ secondary_samples = np.array([])
1909
+ else:
1910
+ samples_to_compute = np.arange(n_samples)
1911
+ core_samples = samples_to_compute
1912
+ secondary_samples = np.array([])
1913
+
1914
+ # Initialize full-size arrays (zeros for non-computed samples)
1915
+ self.Phi_archetypal_explanations_ = np.zeros((n_samples, n_features), dtype=np.float64)
1916
+ self.Phi_prototypical_explanations_ = np.zeros((n_samples, n_features), dtype=np.float64)
1917
+ self.Phi_stereotypical_explanations_ = np.zeros((n_samples, n_features), dtype=np.float64)
1918
+
1919
+ # Value functions for explanations
1920
+ def explain_archetypal_features(X_subset, indices, ctx):
1921
+ """Archetypal score for single sample with feature subset."""
1922
+ if len(X_subset) == 0 or X_subset.shape[1] == 0:
1923
+ return 0.0
1924
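+ # 1 - 2*min(x, 1-x) equals |2x - 1| on [0, 1]: features near the extremes contribute ~1, mid-range features ~0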
+ dist_to_boundary = np.minimum(X_subset, 1.0 - X_subset)
1925
+ archetypal_contribution = np.mean(1.0 - 2.0 * dist_to_boundary, axis=1)
1926
+ return float(np.mean(archetypal_contribution))
1927
+
1928
+ def explain_prototypical_features(X_subset, indices, ctx):
1929
+ """Prototypical score for single sample with feature subset."""
1930
+ if len(X_subset) == 0 or X_subset.shape[1] == 0:
1931
+ return 0.0
1932
+ return float(np.mean(np.var(X_subset, axis=1)))
1933
+
1934
+ context = {'sample_mode': 'features'}
1935
+
1936
+ # COMPUTE CORE SAMPLES (full permutations)
1937
+ if len(core_samples) > 0:
1938
+ if self.verbose:
1939
+ print(f" Computing archetypal explanations (core: {len(core_samples)} samples)...")
1940
+
1941
+ X_core = X_dense[core_samples, :]
1942
+ Phi_arch_core, info_arch = engine.compute_feature_shapley_values(
1943
+ X_core,
1944
+ explain_archetypal_features,
1945
+ "Archetypal Explanations (Core)",
1946
+ context
1947
+ )
1948
+ self.Phi_archetypal_explanations_[core_samples, :] = Phi_arch_core
1949
+ self.shapley_info_['archetypal_explanations'] = info_arch
1950
+
1951
+ if self.verbose:
1952
+ print(f" Computing prototypical explanations (core: {len(core_samples)} samples)...")
1953
+
1954
+ Phi_proto_core, info_proto = engine.compute_feature_shapley_values(
1955
+ X_core,
1956
+ explain_prototypical_features,
1957
+ "Prototypical Explanations (Core)",
1958
+ context
1959
+ )
1960
+ self.Phi_prototypical_explanations_[core_samples, :] = Phi_proto_core
1961
+ self.shapley_info_['prototypical_explanations'] = info_proto
1962
+
1963
+ # Stereotypical explanations (if applicable)
1964
+ if self.stereotype_column is not None:
1965
+ def explain_stereotypical_features(X_subset, indices, ctx):
1966
+ if len(X_subset) == 0 or X_subset.shape[1] == 0:
1967
+ return 0.0
1968
+ if ctx.get('target_values') is None:
1969
+ return 0.0
1970
+
1971
+ sample_idx = indices[0]
1972
+ target_value = ctx['target_values'][sample_idx]
1973
+ target = ctx['stereotype_target']
1974
+
1975
+ if isinstance(target, str):
1976
+ median = ctx.get('median', np.median(ctx['target_values']))
1977
+ if target == 'max':
1978
+ distance = max(0, target_value - median)
1979
+ elif target == 'min':
1980
+ distance = max(0, median - target_value)
1981
+ else:
1982
+ distance = 0.0
1983
+ else:
1984
+ distance = -abs(target_value - target)
1985
+
1986
+ feature_contrib = float(np.mean(np.abs(X_subset)))
1987
+ return distance * feature_contrib
1988
+
1989
+ if self.verbose:
1990
+ print(f" Computing stereotypical explanations (core: {len(core_samples)} samples)...")
1991
+
1992
+ context['stereotype_target'] = self.stereotype_target
1993
+ context['target_values'] = self._stereotype_source_fit_.to_numpy(dtype=np.float64) if hasattr(self, '_stereotype_source_fit_') else None
1994
+ context['median'] = np.median(context['target_values']) if context['target_values'] is not None else 0.0
1995
+
1996
+ Phi_stereo_core, info_stereo = engine.compute_feature_shapley_values(
1997
+ X_core,
1998
+ explain_stereotypical_features,
1999
+ "Stereotypical Explanations (Core)",
2000
+ context
2001
+ )
2002
+ self.Phi_stereotypical_explanations_[core_samples, :] = Phi_stereo_core
2003
+ self.shapley_info_['stereotypical_explanations'] = info_stereo
2004
+
2005
+ # MEMORY CLEANUP
2006
+ _cleanup_memory(X_core)
2007
+
2008
+ # COMPUTE SECONDARY SAMPLES (reduced permutations for speed)
2009
+ if len(secondary_samples) > 0:
2010
+ # Temporarily reduce permutations
2011
+ original_n_perms = engine.n_permutations
2012
+ engine.n_permutations = max(10, original_n_perms // 2)
2013
+
2014
+ if self.verbose:
2015
+ print(f" Computing explanations (secondary: {len(secondary_samples)} samples, {engine.n_permutations} perms)...")
2016
+
2017
+ X_secondary = X_dense[secondary_samples, :]
2018
+
2019
+ # Archetypal
2020
+ Phi_arch_sec, _ = engine.compute_feature_shapley_values(
2021
+ X_secondary, explain_archetypal_features,
2022
+ "Archetypal Explanations (Secondary)", context
2023
+ )
2024
+ self.Phi_archetypal_explanations_[secondary_samples, :] = Phi_arch_sec
2025
+
2026
+ # Prototypical
2027
+ Phi_proto_sec, _ = engine.compute_feature_shapley_values(
2028
+ X_secondary, explain_prototypical_features,
2029
+ "Prototypical Explanations (Secondary)", context
2030
+ )
2031
+ self.Phi_prototypical_explanations_[secondary_samples, :] = Phi_proto_sec
2032
+
2033
+ # Stereotypical
2034
+ if self.stereotype_column is not None:
2035
+ Phi_stereo_sec, _ = engine.compute_feature_shapley_values(
2036
+ X_secondary, explain_stereotypical_features,
2037
+ "Stereotypical Explanations (Secondary)", context
2038
+ )
2039
+ self.Phi_stereotypical_explanations_[secondary_samples, :] = Phi_stereo_sec
2040
+
2041
+ # Restore original permutations
2042
+ engine.n_permutations = original_n_perms
2043
+
2044
+ # MEMORY CLEANUP
2045
+ _cleanup_memory(X_secondary)
2046
+ else:
2047
+ self.Phi_stereotypical_explanations_ = None if self.stereotype_column is None else self.Phi_stereotypical_explanations_
2048
+
2049
+ def _v04_archetypal_value(
2050
+ self,
2051
+ X_subset: np.ndarray,
2052
+ indices: np.ndarray,
2053
+ context: Dict
2054
+ ) -> float:
2055
+ """Value function: Mean archetypal rank from v0.4 NMF method (Option A)."""
2056
+ if len(X_subset) < context['nmf_rank']:
2057
+ return 0.0
2058
+
2059
+ try:
2060
+ nmf = NMF(
2061
+ n_components=min(context['nmf_rank'], len(X_subset)-1),
2062
+ init='random',
2063
+ random_state=context['random_state'],
2064
+ max_iter=100,
2065
+ tol=0.01
2066
+ )
2067
+
2068
+ X_nn = X_subset - X_subset.min() + 1e-6
2069
+ W_subset = nmf.fit_transform(X_nn)
2070
+ W_norm = W_subset / (W_subset.sum(axis=1, keepdims=True) + 1e-12)
2071
+ arch_scores = np.max(W_norm, axis=1)
2072
+
2073
+ return float(np.mean(arch_scores))
2074
+ except Exception:
2075
+ return float(np.mean(np.ptp(X_subset, axis=0)))
2076
+
2077
+ def _v04_prototypical_value(
2078
+ self,
2079
+ X_subset: np.ndarray,
2080
+ indices: np.ndarray,
2081
+ context: Dict
2082
+ ) -> float:
2083
+ """Value function: Coverage from v0.4 facility location."""
2084
+ if len(X_subset) < 2:
2085
+ return 0.0
2086
+
2087
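+ # Coverage proxy: mean over the coalition of each sample's best cosine similarity to another member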
+ norms = np.linalg.norm(X_subset, axis=1, keepdims=True)
2088
+ norms[norms == 0.0] = 1.0
2089
+ X_l2 = X_subset / norms
2090
+
2091
+ sims = X_l2 @ X_l2.T
2092
+ np.fill_diagonal(sims, 0)
2093
+
2094
+ max_sims = np.max(sims, axis=1) if sims.shape[0] > 0 else np.array([0.0])
2095
+ return float(np.mean(max_sims))
2096
+
2097
+ def _v04_stereotypical_value(
2098
+ self,
2099
+ X_subset: np.ndarray,
2100
+ indices: np.ndarray,
2101
+ context: Dict
2102
+ ) -> float:
2103
+ """Value function: Target alignment from v0.4 stereotype targeting."""
2104
+ if context.get('target_values') is None:
2105
+ s = np.max(np.abs(X_subset - 0.5), axis=1) * 2.0
2106
+ return float(np.mean(s))
2107
+
2108
+ target_vals = context['target_values'][indices]
2109
+ target = context['stereotype_target']
2110
+
2111
+ if target == 'max':
2112
+ return float(np.mean(target_vals))
2113
+ elif target == 'min':
2114
+ return float(-np.mean(target_vals))
2115
+ else:
2116
+ return float(-np.mean(np.abs(target_vals - float(target))))
2117
+
2118
+ def _compute_shapley_formative_ranks(self) -> pd.DataFrame:
2119
+ """Compute formative instance rankings from Shapley values."""
2120
+
2121
+ # Check if formative was computed
2122
+ if self.Phi_archetypal_formative_ is None:
2123
+ # Return None columns if formative wasn't computed
2124
+ n_samples = len(self.train_index_)
2125
+ return pd.DataFrame({
2126
+ 'archetypal_shapley_rank': [None] * n_samples,
2127
+ 'prototypical_shapley_rank': [None] * n_samples,
2128
+ 'stereotypical_shapley_rank': [None] * n_samples,
2129
+ }, index=self.train_index_)
2130
+
2131
+ # Formative was computed - proceed normally
2132
+ n_samples = self.Phi_archetypal_formative_.shape[0]
2133
+
2134
+ arch_formative = self.Phi_archetypal_formative_.sum(axis=1)
2135
+ proto_formative = self.Phi_prototypical_formative_.sum(axis=1)
2136
+
2137
+ if self.Phi_stereotypical_formative_ is not None:
2138
+ stereo_formative = self.Phi_stereotypical_formative_.sum(axis=1)
2139
+ else:
2140
+ stereo_formative = np.zeros(n_samples)
2141
+
2142
+ def normalize(ranks):
2143
+ r_min, r_max = ranks.min(), ranks.max()
2144
+ if (r_max - r_min) > 1e-12:
2145
+ return (ranks - r_min) / (r_max - r_min)
2146
+ else:
2147
+ return np.ones_like(ranks) * 0.5
2148
+
2149
+ return pd.DataFrame({
2150
+ 'archetypal_shapley_rank': np.round(normalize(arch_formative), 10),
2151
+ 'prototypical_shapley_rank': np.round(normalize(proto_formative), 10),
2152
+ 'stereotypical_shapley_rank': np.round(normalize(stereo_formative), 10),
2153
+ }, index=self.train_index_)
2154
+
2155
+
2156
+ def get_shapley_explanations(self, sample_idx: int) -> Dict[str, np.ndarray]:
2157
+ """Get Shapley feature attributions explaining why sample is archetypal/prototypical/stereotypical."""
2158
+ if not self.shapley_mode:
2159
+ raise RuntimeError("Shapley mode not enabled. Set shapley_mode=True when fitting.")
2160
+
2161
+ if self.Phi_archetypal_explanations_ is None:
2162
+ raise RuntimeError("Shapley explanations not computed. Call fit() first.")
2163
+
2164
+ # Convert DataFrame index to positional index
2165
+ if hasattr(self, 'train_index_') and self.train_index_ is not None:
2166
+ try:
2167
+ pos_idx = self.train_index_.get_loc(sample_idx)
2168
+ except KeyError:
2169
+ raise ValueError(f"Sample index {sample_idx} not found in training data")
2170
+ else:
2171
+ # Assume sample_idx is already positional
2172
+ pos_idx = sample_idx
2173
+
2174
+ explanations = {}
2175
+
2176
+ if self.Phi_archetypal_explanations_ is not None:
2177
+ explanations['archetypal'] = self.Phi_archetypal_explanations_[pos_idx]
2178
+
2179
+ if self.Phi_prototypical_explanations_ is not None:
2180
+ explanations['prototypical'] = self.Phi_prototypical_explanations_[pos_idx]
2181
+
2182
+ if self.Phi_stereotypical_explanations_ is not None:
2183
+ explanations['stereotypical'] = self.Phi_stereotypical_explanations_[pos_idx]
2184
+
2185
+ return explanations
2186
+
2187
+ def get_formative_attributions(self, sample_idx: int) -> Dict[str, np.ndarray]:
2188
+ """Get Shapley feature attributions showing how sample creates archetypal/prototypical/stereotypical structure."""
2189
+ if not self.shapley_mode:
2190
+ raise RuntimeError("Shapley mode not enabled. Set shapley_mode=True when fitting.")
2191
+
2192
+ if self.Phi_archetypal_formative_ is None:
2193
+ raise RuntimeError(
2194
+ "Formative instances not computed. "
2195
+ "This occurs when fast_mode=True (formative skipped for speed). "
2196
+ "Use fast_mode=False to compute formative instances."
2197
+ )
2198
+
2199
+ # Convert DataFrame index to positional index
2200
+ if hasattr(self, 'train_index_') and self.train_index_ is not None:
2201
+ try:
2202
+ pos_idx = self.train_index_.get_loc(sample_idx)
2203
+ except KeyError:
2204
+ raise ValueError(f"Sample index {sample_idx} not found in training data")
2205
+ else:
2206
+ # Assume sample_idx is already positional
2207
+ pos_idx = sample_idx
2208
+
2209
+ attributions = {}
2210
+
2211
+ if self.Phi_archetypal_formative_ is not None:
2212
+ attributions['archetypal'] = self.Phi_archetypal_formative_[pos_idx]
2213
+
2214
+ if self.Phi_prototypical_formative_ is not None:
2215
+ attributions['prototypical'] = self.Phi_prototypical_formative_[pos_idx]
2216
+
2217
+ if self.Phi_stereotypical_formative_ is not None:
2218
+ attributions['stereotypical'] = self.Phi_stereotypical_formative_[pos_idx]
2219
+
2220
+ return attributions
2221
+
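+ # Illustrative usage of the two accessors above (variable names are hypothetical):
+ #
+ #     expl = dt.get_shapley_explanations(results['archetypal_rank'].idxmax())
+ #     top_feature_idx = np.argsort(-np.abs(expl['archetypal']))[:5]
+ #     form = dt.get_formative_attributions(results['archetypal_shapley_rank'].idxmax())
+ #
+ # i.e. rank the per-feature attributions of the most archetypal row, then pull the
+ # attributions of the row that contributes most to the archetypal structure.
+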
2222
+ # --------------------------
2223
+ # Text (TF-IDF)
2224
+ # --------------------------
2225
+ def fit_text(
2226
+ self,
2227
+ corpus: Iterable[str],
2228
+ vectorizer: str = "tfidf",
2229
+ text_metadata: Optional[pd.DataFrame] = None
2230
+ ):
2231
+ """
2232
+ Fit on text corpus with optional metadata.
2233
+
2234
+ Parameters
2235
+ ----------
2236
+ corpus : Iterable[str]
2237
+ Text documents
2238
+ vectorizer : str
2239
+ Vectorization method (default: 'tfidf')
2240
+ text_metadata : pd.DataFrame, optional
2241
+ Document-level properties for stereotype computation
2242
+ Must have same number of rows as documents in corpus
2243
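+
+ Example (illustrative; 'num_citations' and its values are hypothetical, docs is
+ any list of three strings): documents whose metadata value is closest to the
+ maximum rank as most stereotypical.
+
+ >>> meta = pd.DataFrame({'num_citations': [3, 41, 7]})
+ >>> dt = DataTypical(stereotype_column='num_citations', stereotype_target='max')
+ >>> dt.fit_text(docs, text_metadata=meta)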
+ """
2244
+ self._validate_stereotype_config()
2245
+ with _ThreadControl(self.deterministic and not self.speed_mode) as tc:
2246
+ _seed_everything(self.random_state)
2247
+ X_scaled, X_l2 = self._preprocess_text_fit(corpus, vectorizer, text_metadata)
2248
+ idx = pd.RangeIndex(X_scaled.shape[0])
2249
+ self.train_index_ = idx
2250
+ self._fit_components(X_scaled, X_l2, idx)
2251
+ self._record_settings(tc)
2252
+ return self
2253
+
2254
+ def transform_text(self, corpus: Iterable[str]) -> pd.DataFrame:
2255
+ """Transform text corpus."""
2256
+ with _ThreadControl(self.deterministic and not self.speed_mode):
2257
+ X_scaled, X_l2 = self._preprocess_text_transform(corpus)
2258
+ idx = pd.RangeIndex(X_scaled.shape[0])
2259
+
2260
+ # Get stereotype source (priority: metadata column > keywords > None)
2261
+ stereotype_source = None
2262
+
2263
+ # Priority 1: Metadata column (from fit_text)
2264
+ if self.stereotype_column is not None and self.text_metadata_ is not None:
2265
+ if self.stereotype_column in self.text_metadata_.columns:
2266
+ stereotype_source = self.text_metadata_[self.stereotype_column]
2267
+
2268
+ # Priority 2: Keyword scores (recompute on new corpus)
2269
+ elif self.stereotype_keywords is not None:
2270
+ corpus_list = list(corpus)
2271
+ X_tfidf = self.vectorizer_.transform(corpus_list)
2272
+ keyword_scores = self._compute_keyword_scores(
2273
+ X_tfidf, corpus_list, self.stereotype_keywords
2274
+ )
2275
+ stereotype_source = pd.Series(keyword_scores)
2276
+
2277
+ return self._score_with_fitted(X_scaled, X_l2, idx, stereotype_source)
2278
+
2279
+ def fit_transform_text(
2280
+ self,
2281
+ corpus: Iterable[str],
2282
+ vectorizer: str = "tfidf",
2283
+ text_metadata: Optional[pd.DataFrame] = None
2284
+ ) -> pd.DataFrame:
2285
+ """Fit and transform text in one step."""
2286
+ self.fit_text(corpus, vectorizer=vectorizer, text_metadata=text_metadata)
2287
+ return self.transform_text(corpus)
2288
+
2289
+ # --------------------------
2290
+ # Signals / Graphs (numeric)
2291
+ # --------------------------
2292
+ def fit_transform_signals(self, X_signal: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
2293
+ self.fit(X_signal)
2294
+ return self.transform(X_signal, return_ranks_only=True)
2295
+
2296
+ def fit_transform_graph(
2297
+ self,
2298
+ node_features: Union[pd.DataFrame, np.ndarray],
2299
+ edges: Optional[np.ndarray] = None,
2300
+ edge_index: Optional[np.ndarray] = None,
2301
+ compute_topology: bool = True
2302
+ ) -> pd.DataFrame:
2303
+ """
2304
+ Fit and transform graph data.
2305
+
2306
+ Parameters
2307
+ ----------
2308
+ node_features : DataFrame or array
2309
+ Node feature matrix (n_nodes, n_features)
2310
+ edges : np.ndarray, optional
2311
+ Edge list as (2, n_edges) or (n_edges, 2)
2312
+ Alias: edge_index
2313
+ compute_topology : bool
2314
+ Whether to compute and append topology features
2315
+
2316
+ Returns
2317
+ -------
2318
+ results : pd.DataFrame
2319
+ Rankings with topology features if computed
2320
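+
+ Example (illustrative shapes and feature names; topology features require
+ networkx to be installed):
+
+ >>> node_features = pd.DataFrame({'x1': [0.1, 0.7, 0.3], 'x2': [1.0, 0.2, 0.5]})
+ >>> edges = np.array([[0, 1], [1, 2]]) # (n_edges, 2); a (2, n_edges) edge_index also works
+ >>> dt = DataTypical(graph_topology_features=['degree', 'clustering'])
+ >>> results = dt.fit_transform_graph(node_features, edges=edges)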
+ """
2321
+ # Handle edge_index alias
2322
+ if edges is None and edge_index is not None:
2323
+ edges = edge_index
2324
+
2325
+ # Convert to DataFrame
2326
+ if isinstance(node_features, pd.DataFrame):
2327
+ df = node_features.copy()
2328
+ else:
2329
+ df = pd.DataFrame(node_features)
2330
+
2331
+ n_nodes = len(df)
2332
+
2333
+ # Compute topology features if edges provided
2334
+ self.graph_topology_df_ = None
2335
+ if edges is not None and compute_topology:
2336
+ topology_df = self._compute_graph_topology_features(edges, n_nodes)
2337
+ self.graph_topology_df_ = topology_df
2338
+
2339
+ # Append to node features
2340
+ for col in topology_df.columns:
2341
+ if col not in df.columns:
2342
+ df[col] = topology_df[col].values
2343
+ else:
2344
+ warnings.warn(f"Topology feature '{col}' already exists, skipping")
2345
+
2346
+ # Standard tabular processing
2347
+ self.fit(df)
2348
+
2349
+ # Use standard transform which preserves label columns
2350
+ results = self.transform(df, return_ranks_only=False)
2351
+
2352
+ return results
2353
+
2354
+ # --------------------------
2355
+ # Ideals (legacy stereotypes)
2356
+ # --------------------------
2357
+ def register_ideal(self, name: str, ideal_vector: Union[np.ndarray, List[float]]) -> None:
2358
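+ """Register a named ideal vector for legacy stereotype scoring; its length must match the fitted feature dimension."""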
+ v = np.asarray(ideal_vector, dtype=np.float64).ravel()
2359
+ if self.scaler_ is None:
2360
+ raise RuntimeError("Call fit/fit_text before registering ideals.")
2361
+ d = self.H_.shape[1] if self.H_ is not None else self.scaler_.n_features_in_
2362
+ if v.shape[0] != d:
2363
+ raise ValueError(f"Ideal has dim {v.shape[0]} but data has {d} features.")
2364
+ self.ideals_[name] = v.copy()
2365
+
2366
+ # --------------------------
2367
+ # Config / sklearn interop
2368
+ # --------------------------
2369
+ def to_config(self) -> Dict:
2370
+ cfg = {k: getattr(self, k) for k in [
2371
+ "nmf_rank","n_prototypes","scale","distance_metric","similarity_metric",
2372
+ "deterministic","n_jobs","max_iter_nmf","tol_nmf","speed_mode","dtype",
2373
+ "random_state","max_memory_mb","return_ranks_only","auto_n_prototypes",
2374
+ "verbose","max_missing_frac",
2375
+ "stereotype_column","stereotype_target","label_columns",
2376
+ "stereotype_keywords","graph_topology_features"
2377
+ ]}
2378
+ cfg["version"] = "0.4"
2379
+ return cfg
2380
+
2381
+ @classmethod
2382
+ def from_config(cls, cfg: Dict) -> "DataTypical":
2383
+ try:
2384
+ return cls(**{k: v for k, v in cfg.items() if k in {f.name for f in dc_fields(cls)}})
2385
+ except TypeError as e:
2386
+ raise ConfigError(str(e))
2387
+
2388
+ def get_params(self, deep: bool = True) -> Dict:
2389
+ return {f.name: getattr(self, f.name) for f in dc_fields(self) if f.init}
2390
+
2391
+ def set_params(self, **params):
2392
+ for k, v in params.items():
2393
+ if not hasattr(self, k):
2394
+ raise ValueError(f"Unknown parameter {k}")
2395
+ setattr(self, k, v)
2396
+ return self
2397
+
2398
+ # ============================================================
2399
+ # [F] Graph Topology Features (NEW in v0.4)
2400
+ # ============================================================
2401
+ def _compute_graph_topology_features(
2402
+ self,
2403
+ edge_index: np.ndarray,
2404
+ n_nodes: int,
2405
+ feature_names: Optional[List[str]] = None
2406
+ ) -> pd.DataFrame:
2407
+ """
2408
+ Compute graph topology features.
2409
+
2410
+ Parameters
2411
+ ----------
2412
+ edge_index : np.ndarray
2413
+ Edge list (2, n_edges) or (n_edges, 2)
2414
+ n_nodes : int
2415
+ Number of nodes
2416
+ feature_names : List[str], optional
2417
+ Which topology features to compute. Supported: 'degree', 'clustering', 'pagerank',
+ 'triangles', 'betweenness', 'closeness', 'eigenvector'. Defaults to graph_topology_features or ['degree', 'clustering'].
2418
+
2419
+ Returns
2420
+ -------
2421
+ topology_df : pd.DataFrame
2422
+ Computed topology features (n_nodes, n_features)
2423
+ """
2424
+ try:
2425
+ import networkx as nx
2426
+ except ImportError:
2427
+ raise ImportError(
2428
+ "NetworkX is required for graph topology features. "
2429
+ "Install with: pip install networkx"
2430
+ )
2431
+
2432
+ # Convert edge_index to NetworkX graph
2433
+ if edge_index.shape[0] == 2:
2434
+ edges = edge_index.T # (n_edges, 2)
2435
+ else:
2436
+ edges = edge_index
2437
+
2438
+ G = nx.Graph()
2439
+ G.add_nodes_from(range(n_nodes))
2440
+ G.add_edges_from(edges)
2441
+
2442
+ # Determine which features to compute
2443
+ if feature_names is None:
2444
+ feature_names = self.graph_topology_features or ['degree', 'clustering']
2445
+
2446
+ topology_data = {}
2447
+
2448
+ for feat_name in feature_names:
2449
+ if feat_name == 'degree':
2450
+ degree_dict = dict(G.degree())
2451
+ topology_data['degree'] = [degree_dict.get(i, 0) for i in range(n_nodes)]
2452
+
2453
+ elif feat_name == 'clustering':
2454
+ clust_dict = nx.clustering(G)
2455
+ topology_data['clustering'] = [clust_dict.get(i, 0.0) for i in range(n_nodes)]
2456
+
2457
+ elif feat_name == 'pagerank':
2458
+ pr_dict = nx.pagerank(G, max_iter=100)
2459
+ topology_data['pagerank'] = [pr_dict.get(i, 0.0) for i in range(n_nodes)]
2460
+
2461
+ elif feat_name == 'triangles':
2462
+ tri_dict = nx.triangles(G)
2463
+ topology_data['triangles'] = [tri_dict.get(i, 0) for i in range(n_nodes)]
2464
+
2465
+ elif feat_name == 'betweenness':
2466
+ bet_dict = nx.betweenness_centrality(G)
2467
+ topology_data['betweenness'] = [bet_dict.get(i, 0.0) for i in range(n_nodes)]
2468
+
2469
+ elif feat_name == 'closeness':
2470
+ close_dict = nx.closeness_centrality(G)
2471
+ topology_data['closeness'] = [close_dict.get(i, 0.0) for i in range(n_nodes)]
2472
+
2473
+ elif feat_name == 'eigenvector':
2474
+ try:
2475
+ eigen_dict = nx.eigenvector_centrality(G, max_iter=100)
2476
+ topology_data['eigenvector'] = [eigen_dict.get(i, 0.0) for i in range(n_nodes)]
2477
+ except Exception:
2478
+ warnings.warn("Eigenvector centrality failed, using zeros")
2479
+ topology_data['eigenvector'] = [0.0] * n_nodes
2480
+
2481
+ else:
2482
+ warnings.warn(f"Unknown topology feature: {feat_name}")
2483
+
2484
+ return pd.DataFrame(topology_data, index=range(n_nodes))
2485
+
2486
+ # ============================================================
2487
+ # [G] Stereotype Computation (NEW in v0.4)
2488
+ # ============================================================
2489
+ def _validate_stereotype_config(self):
2490
+ """Validate stereotype configuration at fit time."""
2491
+
2492
+ # Check conflicting specifications
2493
+ if self.stereotype_column is not None and self.stereotype_keywords is not None:
2494
+ raise ConfigError(
2495
+ "Cannot specify both stereotype_column and stereotype_keywords. "
2496
+ "Use stereotype_column for metadata or stereotype_keywords for text relevance."
2497
+ )
2498
+
2499
+ # Validate target
2500
+ if isinstance(self.stereotype_target, str):
2501
+ if self.stereotype_target not in ['min', 'max']:
2502
+ raise ConfigError(
2503
+ f"stereotype_target must be 'min', 'max', or numeric value, "
2504
+ f"got: '{self.stereotype_target}'"
2505
+ )
2506
+
2507
+ def _compute_stereotypical_rank(
2508
+ self,
2509
+ X_scaled: ArrayLike,
2510
+ index: pd.Index,
2511
+ stereotype_source: Optional[pd.Series] = None
2512
+ ) -> np.ndarray:
2513
+ """
2514
+ Compute stereotypical ranking based on configuration.
2515
+
2516
+ Parameters
2517
+ ----------
2518
+ X_scaled : ArrayLike
2519
+ Scaled feature matrix (for fallback to extremeness)
2520
+ index : pd.Index
2521
+ Row index
2522
+ stereotype_source : pd.Series, optional
2523
+ Pre-computed values to rank against (from df_original, metadata, or topology)
2524
+
2525
+ Returns
2526
+ -------
2527
+ stereotype_rank : np.ndarray
2528
+ Scores in [0, 1] where 1 = closest to stereotype target
2529
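+
+ Worked example (illustrative): values [1, 4, 10] with stereotype_target='max' give
+ target=10, distances=[9, 6, 0], max_dist=9, so ranks = 1 - d/max_dist = [0.0, 0.33, 1.0].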
+ """
2530
+ if stereotype_source is None:
2531
+ # BACKWARD COMPATIBLE: use extremeness
2532
+ X_dense = X_scaled.toarray() if (sp is not None and sp.isspmatrix(X_scaled)) else X_scaled
2533
+ s = np.max(np.abs(X_dense - 0.5), axis=1) * 2.0
2534
+ s_min, s_max = float(s.min()), float(s.max())
2535
+ if (s_max - s_min) > 1e-12:
2536
+ return (s - s_min) / (s_max - s_min)
2537
+ else:
2538
+ return np.zeros_like(s)
2539
+
2540
+ # USER-DIRECTED: Rank toward specific target
2541
+ values = stereotype_source.to_numpy(dtype=np.float64)
2542
+
2543
+ # Handle NaN values
2544
+ valid_mask = ~np.isnan(values)
2545
+ if not np.any(valid_mask):
2546
+ warnings.warn("All stereotype values are NaN, using zeros")
2547
+ return np.zeros(len(values))
2548
+
2549
+ # Compute target value
2550
+ if isinstance(self.stereotype_target, str):
2551
+ if self.stereotype_target == "min":
2552
+ target = np.nanmin(values)
2553
+ elif self.stereotype_target == "max":
2554
+ target = np.nanmax(values)
2555
+ else:
2556
+ raise ValueError(
2557
+ f"stereotype_target must be 'min', 'max', or numeric value, "
2558
+ f"got '{self.stereotype_target}'"
2559
+ )
2560
+ else:
2561
+ target = float(self.stereotype_target)
2562
+
2563
+ # Rank by distance to target (inverted: 1 = closest, 0 = furthest)
2564
+ distances = np.abs(values - target)
2565
+ max_dist = np.nanmax(distances)
2566
+
2567
+ if max_dist > 1e-12:
2568
+ stereotype_rank = 1.0 - (distances / max_dist)
2569
+ else:
2570
+ # All values identical or at target
2571
+ stereotype_rank = np.ones_like(distances, dtype=np.float64)
2572
+
2573
+ # Handle NaN entries
2574
+ stereotype_rank[~valid_mask] = 0.0
2575
+
2576
+ return np.clip(stereotype_rank, 0.0, 1.0)
2577
+
2578
+
2579
+ def _get_stereotype_source_table(self, df: pd.DataFrame) -> Optional[pd.Series]:
2580
+ """Extract stereotype values from tabular data."""
2581
+ if self.stereotype_column is None:
2582
+ return None
2583
+
2584
+ # Check if column exists in df (features or labels)
2585
+ if self.stereotype_column not in df.columns:
2586
+ raise ValueError(
2587
+ f"stereotype_column '{self.stereotype_column}' not found. "
2588
+ f"Available columns: {list(df.columns)}"
2589
+ )
2590
+
2591
+ return df[self.stereotype_column]
2592
+
2593
+ def _get_stereotype_source_text(self) -> Optional[pd.Series]:
2594
+ """Extract stereotype values from text metadata or keywords."""
2595
+
2596
+ # Priority 1: User-specified column from metadata
2597
+ if self.stereotype_column is not None:
2598
+ if self.text_metadata_ is None:
2599
+ raise ValueError(
2600
+ "stereotype_column specified but no text_metadata provided. "
2601
+ "Pass text_metadata to fit_text() or use stereotype_keywords."
2602
+ )
2603
+
2604
+ if self.stereotype_column not in self.text_metadata_.columns:
2605
+ raise ValueError(
2606
+ f"stereotype_column '{self.stereotype_column}' not found in text_metadata. "
2607
+ f"Available columns: {list(self.text_metadata_.columns)}"
2608
+ )
2609
+
2610
+ return self.text_metadata_[self.stereotype_column]
2611
+
2612
+ # Priority 2: Keyword-based scores
2613
+ if self.stereotype_keyword_scores_ is not None:
2614
+ return pd.Series(self.stereotype_keyword_scores_)
2615
+
2616
+ # No stereotype specified
2617
+ return None
2618
+
2619
+ def _compute_keyword_scores(
2620
+ self,
2621
+ X_tfidf: "sp.spmatrix",
2622
+ corpus: List[str],
2623
+ keywords: List[str]
2624
+ ) -> np.ndarray:
2625
+ """
2626
+ Compute relevance scores for documents based on keyword TF-IDF sum.
2627
+
2628
+ Parameters
2629
+ ----------
2630
+ X_tfidf : sparse matrix
2631
+ TF-IDF matrix (n_docs, n_vocab)
2632
+ corpus : List[str]
2633
+ Original documents (for fallback if keywords not in vocab)
2634
+ keywords : List[str]
2635
+ Keywords to compute relevance for
2636
+
2637
+ Returns
2638
+ -------
2639
+ scores : np.ndarray
2640
+ Relevance score per document (n_docs,)
2641
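+
+ Worked example (illustrative): if the keywords map to TF-IDF columns 3 and 7, each
+ document's score is X_tfidf[i, 3] + X_tfidf[i, 7], so documents with more keyword
+ TF-IDF mass rank as more stereotypical.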
+ """
2642
+ vocab = self.vectorizer_.vocabulary_
2643
+
2644
+ # Find indices of keywords in vocabulary
2645
+ keyword_indices = []
2646
+ missing_keywords = []
2647
+
2648
+ for kw in keywords:
2649
+ if kw in vocab:
2650
+ keyword_indices.append(vocab[kw])
2651
+ else:
2652
+ missing_keywords.append(kw)
2653
+
2654
+ if missing_keywords:
2655
+ warnings.warn(
2656
+ f"Keywords not found in vocabulary: {missing_keywords}. "
2657
+ f"These will be ignored in stereotype computation."
2658
+ )
2659
+
2660
+ if not keyword_indices:
2661
+ warnings.warn(
2662
+ "No stereotype keywords found in vocabulary. "
2663
+ "Using zero scores (equivalent to no stereotype)."
2664
+ )
2665
+ return np.zeros(X_tfidf.shape[0])
2666
+
2667
+ # Sum TF-IDF scores for keyword columns
2668
+ keyword_indices = np.array(keyword_indices)
2669
+ X_keywords = X_tfidf[:, keyword_indices]
2670
+ scores = np.asarray(X_keywords.sum(axis=1)).ravel()
2671
+
2672
+ return scores
2673
+
2674
+ # ============================================================
2675
+ # Internals - Tables (numeric-only features)
2676
+ # ============================================================
2677
+ def _select_numeric_features(self, df: pd.DataFrame) -> pd.DataFrame:
2678
+ """Pick numeric feature columns; auto-exclude ID-like columns."""
2679
+ feat_df = df.select_dtypes(include=[np.number]).copy()
2680
+ if feat_df.shape[1] == 0:
2681
+ raise DataTypicalError("No numeric feature columns found for tabular processing.")
2682
+
2683
+ n = len(feat_df)
2684
+ to_drop = set()
2685
+ for col in feat_df.columns:
2686
+ name = str(col).lower()
2687
+ if name == "id" or name.endswith("_id") or name.startswith("id_"):
2688
+ to_drop.add(col)
2689
+ continue
2690
+
2691
+ s = feat_df[col]
2692
+ nunique = s.nunique(dropna=True)
2693
+
2694
+ # near-unique numerics behave like row IDs
2695
+ if nunique >= 0.8 * n:
2696
+ to_drop.add(col)
2697
+ continue
2698
+
2699
+ # strict monotone sequence (typical of indices)
2700
+ if n > 1:
2701
+ diffs = np.diff(s.values).astype(float)
2702
+ if (diffs > 0).all() or (diffs < 0).all():
2703
+ to_drop.add(col)
2704
+ continue
2705
+
2706
+ feature_cols = [c for c in feat_df.columns if c not in to_drop]
2707
+ if not feature_cols:
2708
+ # fall back to all numeric if we dropped everything
2709
+ feature_cols = list(feat_df.columns)
2710
+ if self.verbose:
2711
+ warnings.warn("All numeric columns looked like IDs; using them anyway.")
2712
+ return feat_df[feature_cols]
2713
+
2714
+ def _preprocess_table_fit(self, df: pd.DataFrame) -> Tuple[np.ndarray, ArrayLike]:
2715
+ # Store original df for stereotype computation
2716
+ self._df_original_fit = df.copy()
2717
+
2718
+ # Separate label columns (preserve but don't use in NMF)
2719
+ if self.label_columns is not None:
2720
+ label_cols_present = [c for c in self.label_columns if c in df.columns]
2721
+ missing_labels = [c for c in self.label_columns if c not in df.columns]
2722
+
2723
+ if missing_labels:
2724
+ warnings.warn(f"Label columns not found: {missing_labels}")
2725
+
2726
+ if label_cols_present:
2727
+ self.label_df_ = df[label_cols_present].copy()
2728
+ df_for_features = df.drop(columns=label_cols_present)
2729
+ else:
2730
+ self.label_df_ = None
2731
+ df_for_features = df
2732
+ else:
2733
+ self.label_df_ = None
2734
+ df_for_features = df
2735
+
2736
+ # Pick numeric features only
2737
+ feat_df = self._select_numeric_features(df_for_features)
2738
+ self.feature_columns_ = list(feat_df.columns)
2739
+
2740
+ X = feat_df.to_numpy(dtype=self.dtype, copy=True)
2741
+
2742
+ # Missingness report on features
2743
+ miss_frac = np.mean(pd.isna(feat_df), axis=0).to_numpy()
2744
+ self.missingness_ = {name: float(frac) for name, frac in zip(self.feature_columns_, miss_frac)}
2745
+ worst = np.max(miss_frac) if miss_frac.size else 0.0
2746
+ if worst > self.max_missing_frac:
2747
+ raise DataTypicalError(
2748
+ f"Missingness too high (max frac={worst:.3f} > threshold={self.max_missing_frac})."
2749
+ )
2750
+
2751
+ # Deterministic imputer: per-feature median
2752
+ med = np.nanmedian(X, axis=0)
2753
+ inds = np.where(np.isnan(X))
2754
+ X[inds] = np.take(med, inds[1])
2755
+ self.impute_median_ = med
2756
+
2757
+ # Scale to [0,1]
2758
+ self.scaler_ = MinMaxScaler(copy=True, clip=True)
2759
+ X_scaled_full = self.scaler_.fit_transform(X).astype(self.dtype, copy=False)
2760
+
2761
+ # Drop constant columns
2762
+ var = X_scaled_full.var(axis=0)
2763
+ keep_mask = var > 0.0
2764
+ self.keep_mask_ = keep_mask
2765
+ if not np.all(keep_mask):
2766
+ self.dropped_columns_ = [c for c, k in zip(self.feature_columns_, keep_mask) if not k]
2767
+ if self.verbose:
2768
+ warnings.warn(f"Dropped constant feature columns: {self.dropped_columns_}")
2769
+ X_scaled = X_scaled_full[:, keep_mask]
2770
+
2771
+ # Optional feature weights (length must match number of original numeric features)
2772
+ if self.feature_weights is not None:
2773
+ w = np.asarray(self.feature_weights, dtype=np.float64).ravel()
2774
+ if w.shape[0] != len(self.feature_columns_):
2775
+ warnings.warn("feature_weights length mismatch – ignoring weights.")
2776
+ else:
2777
+ X_scaled = (X_scaled * w[keep_mask]).astype(self.dtype, copy=False)
2778
+
2779
+ # L2 copy
2780
+ X_l2 = _l2_normalize_rows_dense(X_scaled.astype(np.float64))
2781
+ return X_scaled, X_l2
2782
+
2783
+ def _preprocess_table_transform(self, df: pd.DataFrame) -> Tuple[np.ndarray, ArrayLike]:
2784
+ if any(v is None for v in (self.feature_columns_, self.impute_median_, self.keep_mask_, self.scaler_)):
2785
+ raise RuntimeError("Model not fitted.")
2786
+ # Align by feature column NAMES (order enforced from training)
2787
+ missing = [c for c in self.feature_columns_ if c not in df.columns]
2788
+ if missing:
2789
+ raise DataTypicalError(f"Missing required feature columns at transform: {missing}")
2790
+ feat_df = df[self.feature_columns_]
2791
+ # Ensure numeric
2792
+ if not all(np.issubdtype(t, np.number) for t in feat_df.dtypes):
2793
+ raise DataTypicalError("Non-numeric values present in feature columns at transform.")
2794
+ X = feat_df.to_numpy(dtype=self.dtype, copy=True)
2795
+
2796
+ # Impute with training medians
2797
+ inds = np.where(np.isnan(X))
2798
+ if inds[0].size:
2799
+ X[inds] = np.take(self.impute_median_, inds[1])
2800
+
2801
+ # Scale using fitted scaler; then drop constants via keep_mask_
2802
+ X_scaled_full = self.scaler_.transform(X).astype(self.dtype, copy=False)
2803
+ X_scaled = X_scaled_full[:, self.keep_mask_]
2804
+
2805
+ # Optional weights
2806
+ if self.feature_weights is not None and len(self.feature_columns_) == np.asarray(self.feature_weights).ravel().shape[0]:
2807
+ X_scaled = (X_scaled * np.asarray(self.feature_weights, dtype=np.float64).ravel()[self.keep_mask_]).astype(self.dtype, copy=False)
2808
+
2809
+ X_l2 = _l2_normalize_rows_dense(X_scaled.astype(np.float64))
2810
+ return X_scaled, X_l2
2811
+
2812
+ # ============================================================
2813
+ # Internals - Text
2814
+ # ============================================================
2815
+ def _preprocess_text_fit(
2816
+ self,
2817
+ corpus: Iterable[str],
2818
+ vectorizer: str,
2819
+ text_metadata: Optional[pd.DataFrame] = None
2820
+ ) -> Tuple[ArrayLike, ArrayLike]:
2821
+ """
2822
+ Preprocess text with optional metadata for stereotypes.
2823
+
2824
+ Parameters
2825
+ ----------
2826
+ corpus : Iterable[str]
2827
+ Text documents
2828
+ vectorizer : str
2829
+ Vectorization method
2830
+ text_metadata : pd.DataFrame, optional
2831
+ External document-level properties (e.g., relevance scores, timestamps)
2832
+ """
2833
+ if vectorizer != "tfidf":
2834
+ raise NotImplementedError("Only TF-IDF supported in v0.4.")
2835
+ if sp is None:
2836
+ raise ImportError("scipy is required for text path.")
2837
+
2838
+ corpus_list = list(corpus)
2839
+ n_docs = len(corpus_list)
2840
+
2841
+ # Store metadata if provided
2842
+ if text_metadata is not None:
2843
+ if len(text_metadata) != n_docs:
2844
+ raise ValueError(
2845
+ f"text_metadata length ({len(text_metadata)}) must match "
2846
+ f"corpus length ({n_docs})"
2847
+ )
2848
+ self.text_metadata_ = text_metadata.copy()
2849
+ else:
2850
+ self.text_metadata_ = None
2851
+
2852
+ # Fit TF-IDF vectorizer
2853
+ self.vectorizer_ = TfidfVectorizer()
2854
+ X_tfidf = self.vectorizer_.fit_transform(corpus_list)
2855
+
2856
+ # Compute keyword-based stereotype if specified
2857
+ if self.stereotype_keywords is not None:
2858
+ self.stereotype_keyword_scores_ = self._compute_keyword_scores(
2859
+ X_tfidf, corpus_list, self.stereotype_keywords
2860
+ )
2861
+ else:
2862
+ self.stereotype_keyword_scores_ = None
2863
+
2864
+ X_scaled_sp = _sparse_minmax_0_1_nonneg(X_tfidf)
2865
+ X_l2 = _sparse_l2_normalize_rows(X_scaled_sp)
2866
+
2867
+ return X_scaled_sp, X_l2
2868
+
2869
+ def _preprocess_text_transform(self, corpus: Iterable[str]) -> Tuple[ArrayLike, ArrayLike]:
2870
+ if self.vectorizer_ is None:
2871
+ raise RuntimeError("Call fit_text first.")
2872
+ if sp is None:
2873
+ raise ImportError("scipy is required for text path.")
2874
+ X_tfidf = self.vectorizer_.transform(list(corpus))
2875
+ X_scaled_sp = _sparse_minmax_0_1_nonneg(X_tfidf)
2876
+ X_l2 = _sparse_l2_normalize_rows(X_scaled_sp)
2877
+ return X_scaled_sp, X_l2
2878
+
2879
+
2880
+ # ============================================================
2881
+ # Archetypal Analysis Methods (NEW in v0.7)
2882
+ # ============================================================
2883
+
2884
+ def _fit_archetypal_aa(self, X_scaled: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
2885
+ """
2886
+ True archetypal analysis with PCHA (primary) and ConvexHull (fallback).
2887
+
2888
+ MEMORY OPTIMIZED: Respects configured dtype while preserving input dtype when needed.
2889
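+
+ Sketch of the factorization (under PCHA, X.T ≈ XC @ S, so with W = S.T and H = XC.T
+ the data satisfies X ≈ W @ H): W is (n_samples, k) archetype weights and H is
+ (k, n_features) archetype coordinates, matching the shape checks below.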
+ """
2890
+ n_samples, n_features = X_scaled.shape
2891
+
2892
+ # OPTIMIZED: Use configured dtype, but respect input if it's float64
2893
+ input_dtype = X_scaled.dtype
2894
+ if input_dtype == np.float64:
2895
+ target_dtype = np.float64 # Preserve float64 if input is float64
2896
+ elif self.dtype == 'float32':
2897
+ target_dtype = np.float32
2898
+ else:
2899
+ target_dtype = np.float64
2900
+
2901
+ # Determine effective k
2902
+ k_max = min(n_samples, n_features)
2903
+ k_eff = min(self.nmf_rank, k_max)
2904
+
2905
+ # Try PCHA first (stable in high dimensions)
2906
+ if PCHA is not None and k_eff >= 2:
2907
+ try:
2908
+ # PCHA requires float64 internally
2909
+ X_T = X_scaled.astype(np.float64).T.copy()
2910
+ X_min = X_T.min()
2911
+ if X_min < 0:
2912
+ X_T = X_T - X_min + 1e-10
2913
+
2914
+ if self.verbose:
2915
+ print(f" Computing {k_eff} archetypes using PCHA (stable in {n_features}D)...")
2916
+
2917
+ XC, S, C, SSE, varexpl = PCHA(X_T, noc=k_eff, delta=0.0)
2918
+
2919
+ if self.verbose:
2920
+ print(f" PCHA converged, variance explained: {varexpl:.2%}")
2921
+
2922
+ # Convert to ndarray (PCHA returns matrix objects)
2923
+ W = np.asarray(S.T, dtype=target_dtype)
2924
+ H = np.asarray(XC.T, dtype=target_dtype)
2925
+
2926
+ # Validate dimensions with detailed error messages
2927
+ if W.shape != (n_samples, k_eff):
2928
+ raise ValueError(f"PCHA W shape error: got {W.shape}, expected ({n_samples}, {k_eff})")
2929
+ if H.shape != (k_eff, n_features):
2930
+ raise ValueError(f"PCHA H shape error: got {H.shape}, expected ({k_eff}, {n_features})")
2931
+
2932
+ self.nmf_model_ = None
2933
+ self.reconstruction_error_ = float(SSE)
2934
+ self.n_archetypes_ = k_eff
2935
+ return W, H
2936
+
2937
+ except Exception as e:
2938
+ if self.verbose:
2939
+ print(f" PCHA failed ({e}), trying ConvexHull")
2940
+
2941
+ # Try ConvexHull fallback (low dimensions only)
2942
+ if ConvexHull is not None and cdist is not None and n_features <= 20:
2943
+ try:
2944
+ # ConvexHull needs float64
2945
+ X_hull = X_scaled.astype(np.float64)
2946
+ hull = ConvexHull(X_hull)
2947
+ boundary_indices = np.unique(hull.simplices.ravel())
2948
+ n_archetypes = len(boundary_indices)
2949
+
2950
+ if self.verbose:
2951
+ print(f" Found {n_archetypes} archetypes on convex hull")
2952
+
2953
+ W = np.zeros((n_samples, n_archetypes), dtype=target_dtype)
2954
+ for i in range(n_samples):
2955
+ point = X_hull[i:i+1]
2956
+ boundary_points = X_hull[boundary_indices]
2957
+ distances = cdist(point, boundary_points).ravel()
2958
+ weights = 1.0 / (distances + 1e-6)
2959
+ W[i, :] = weights / weights.sum()
2960
+
2961
+ H = np.asarray(X_scaled[boundary_indices], dtype=target_dtype)
2962
+ self.nmf_model_ = None
2963
+ self.reconstruction_error_ = None
2964
+ self.n_archetypes_ = n_archetypes
2965
+ return W, H
2966
+ except Exception as e:
2967
+ if self.verbose:
2968
+ print(f" ConvexHull failed ({e}), using NMF")
2969
+
2970
+ # Final fallback: NMF
2971
+ if self.verbose:
2972
+ print(f" Using NMF fallback")
2973
+ return self._fit_archetypal_nmf(X_scaled)
2974
+
2975
+
2976
+ def _fit_archetypal_nmf(self, X_scaled: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
2977
+ """
2978
+ Fast NMF-based approximation of archetypes.
2979
+
2980
+ MEMORY OPTIMIZED: Respects configured dtype.
2981
+ """
2982
+ # Ensure non-negative (NMF requirement)
2983
+ X_nonneg = np.maximum(X_scaled.astype(np.float64), 0)
2984
+
2985
+ # Determine effective rank
2986
+ k_eff = min(self.nmf_rank, X_nonneg.shape[0], X_nonneg.shape[1])
2987
+
2988
+ # OPTIMIZED: Determine target dtype
2989
+ input_dtype = X_scaled.dtype
2990
+ if input_dtype == np.float64:
2991
+ target_dtype = np.float64
2992
+ elif self.dtype == 'float32':
2993
+ target_dtype = np.float32
2994
+ else:
2995
+ target_dtype = np.float64
2996
+
2997
+ if self.verbose:
2998
+ print(f"\nFitting archetypes: NMF (k={k_eff})")
2999
+
3000
+ # Fit NMF with convergence warning suppressed
3001
+ with warnings.catch_warnings():
3002
+ warnings.filterwarnings('ignore', category=ConvergenceWarning)
3003
+ nmf = NMF(
3004
+ n_components=k_eff,
3005
+ init='nndsvd',
3006
+ max_iter=self.max_iter_nmf,
3007
+ tol=self.tol_nmf,
3008
+ random_state=self.random_state
3009
+ )
3010
+ W = nmf.fit_transform(X_nonneg)
3011
+ H = nmf.components_
3012
+
3013
+ # Store model and metadata
3014
+ self.nmf_model_ = nmf
3015
+ self.reconstruction_error_ = float(nmf.reconstruction_err_)
3016
+ self.n_archetypes_ = k_eff
3017
+
3018
+ # OPTIMIZED: Ensure output matches target dtype
3019
+ W = W.astype(target_dtype, copy=False)
3020
+ H = H.astype(target_dtype, copy=False)
3021
+
3022
+ return W, H
3023
+
3024
+ # ============================================================
3025
+ # Internals - Fit Components (NMF + Prototypes)
3026
+ # ============================================================
3027
+ def _fit_components(self, X_scaled: ArrayLike, X_l2: ArrayLike, index: pd.Index) -> None:
3028
+ """
3029
+ Fit archetypal and prototypical components.
3030
+
3031
+ MEMORY OPTIMIZED: Explicit cleanup of large temporaries.
3032
+
3033
+ Parameters
3034
+ ----------
3035
+ X_scaled : ArrayLike
3036
+ Scaled feature matrix [0, 1]
3037
+ X_l2 : ArrayLike
3038
+ L2-normalized feature matrix
3039
+ index : pd.Index
3040
+ Sample index
3041
+ """
3042
+ # ---- ARCHETYPAL ANALYSIS (NMF or AA)
3043
+ X_euc = X_scaled.toarray().astype(np.float64, copy=False) \
3044
+ if (sp is not None and sp.isspmatrix(X_scaled)) else np.asarray(X_scaled, dtype=np.float64)
3045
+
3046
+ if self.verbose:
3047
+ method_name = "Archetypal Analysis (PCHA+ConvexHull)" if self.archetypal_method == 'aa' else "NMF Approximation"
3048
+ print(f"\nFitting archetypal: {method_name}")
3049
+
3050
+ # Call appropriate method
3051
+ if self.archetypal_method == 'aa':
3052
+ W, H = self._fit_archetypal_aa(X_euc)
3053
+ else: # 'nmf'
3054
+ W, H = self._fit_archetypal_nmf(X_euc)
3055
+
3056
+ # Store with validation and correct dtype
3057
+ input_dtype = X_euc.dtype
3058
+ if input_dtype == np.float64:
3059
+ target_dtype = np.float64
3060
+ elif self.dtype == 'float32':
3061
+ target_dtype = np.float32
3062
+ else:
3063
+ target_dtype = np.float64
3064
+
3065
+ self.W_ = W.astype(target_dtype, copy=False)
3066
+ self.H_ = H.astype(target_dtype, copy=False)
3067
+ self.n_archetypes_ = self.H_.shape[0]
3068
+
3069
+ # Final validation
3070
+ n_samples, n_features = X_euc.shape
3071
+ assert self.W_.shape == (n_samples, self.n_archetypes_), \
3072
+ f"W_ dimension mismatch: {self.W_.shape} vs ({n_samples}, {self.n_archetypes_})"
3073
+ assert self.H_.shape == (self.n_archetypes_, n_features), \
3074
+ f"H_ dimension mismatch: {self.H_.shape} vs ({self.n_archetypes_}, {n_features})"
3075
+
3076
+ # MEMORY CLEANUP: Free W, H temporaries (we've stored them in self.W_, self.H_)
3077
+ _cleanup_memory(W, H)
3078
+
3079
+ if self.verbose:
3080
+ print(f" Stored: W ={self.W_.shape}, H ={self.H_.shape}, n_archetypes ={self.n_archetypes_}")
3081
+
3082
+ # ---- Prepare scaled dense & L2 copies
3083
+ X_euc = X_scaled.toarray().astype(np.float64, copy=False) \
3084
+ if (sp is not None and sp.isspmatrix(X_scaled)) else np.asarray(X_scaled, dtype=np.float64)
3085
+ Xl2 = X_l2.toarray().astype(np.float64, copy=False) \
3086
+ if (sp is not None and sp.isspmatrix(X_l2)) else np.asarray(X_l2, dtype=np.float64)
3087
+ n = X_euc.shape[0]
3088
+
3089
+ # ---- Helper: archetypal "cornerness" score
3090
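+ # Cornerness = 1 - scaled distance to the nearest {0,1} corner, measured in the two highest-variance features that span both edges of [0, 1] (falling back to the two highest-variance features overall)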
+ def _corner_scores(Xe: np.ndarray) -> np.ndarray:
3091
+ eps = 1e-12
3092
+ col_min, col_max = Xe.min(axis=0), Xe.max(axis=0)
3093
+ hits_edge = (col_min <= eps) & (col_max >= 1.0 - eps)
3094
+ idxs = np.where(hits_edge)[0]
3095
+ if idxs.size >= 2:
3096
+ var = Xe[:, idxs].var(axis=0)
3097
+ take = idxs[np.argsort(-var)[:2]]
3098
+ else:
3099
+ var = Xe.var(axis=0)
3100
+ take = np.argsort(-var)[:2] if Xe.shape[1] >= 2 else np.array([0])
3101
+ X2 = Xe[:, take] if take.size else Xe[:, :1]
3102
+ m = np.minimum(X2, 1.0 - X2)
3103
+ dmin = np.sqrt(np.sum(m * m, axis=1))
3104
+ denom = math.sqrt(X2.shape[1]) if X2.shape[1] >= 1 else 1.0
3105
+ return 1.0 - np.clip(dmin / denom, 0.0, 1.0)
3106
+
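+ # Worked example of the cornerness score above (illustrative only): for the
+ # two selected columns, a sample scores 1 - (distance to the nearest corner of
+ # the unit square) / sqrt(2), so exact corners score 1.0 and the centre scores
+ # 0.5 (the minimum attainable inside [0, 1]^2):
+ #
+ #     import numpy as np
+ #     X2 = np.array([[0.0, 1.0],   # exactly at a corner  -> 1.0
+ #                    [0.5, 0.5],   # centre of the square -> 0.5
+ #                    [0.1, 0.9]])  # near a corner        -> 0.9
+ #     m = np.minimum(X2, 1.0 - X2)
+ #     dmin = np.sqrt((m * m).sum(axis=1))
+ #     score = 1.0 - np.clip(dmin / np.sqrt(X2.shape[1]), 0.0, 1.0)
+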
3107
+ # ---- Helper: kNN density (cosine)
3108
+ def _knn_density_cosine(Xl2_arr: np.ndarray, k: int = 10, clip_neg: bool = True) -> np.ndarray:
3109
+ S = Xl2_arr @ Xl2_arr.T
3110
+ if clip_neg:
3111
+ S[S < 0.0] = 0.0
3112
+ np.fill_diagonal(S, 0.0)
3113
+ k = max(1, min(k, max(1, n - 1)))
3114
+ topk = np.partition(S, -k, axis=1)[:, -k:]
3115
+ dens = topk.mean(axis=1)
3116
+ m = dens.mean()
3117
+ return dens / m if m > 0 else np.ones_like(dens)
3118
+
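+ # Illustrative sketch of the density estimate above: the mean cosine
+ # similarity to the k most similar rows, normalised by the global mean
+ # (values near 1.0 mean "about average density"):
+ #
+ #     import numpy as np
+ #     rng = np.random.default_rng(0)
+ #     X = rng.normal(size=(50, 5))
+ #     Xl2 = X / np.linalg.norm(X, axis=1, keepdims=True)
+ #     S = Xl2 @ Xl2.T
+ #     S[S < 0.0] = 0.0
+ #     np.fill_diagonal(S, 0.0)
+ #     topk = np.partition(S, -10, axis=1)[:, -10:]   # 10 largest sims per row
+ #     dens = topk.mean(axis=1)
+ #     dens = dens / dens.mean()
+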
3119
+ # ---- Build forbidden set from top archetypal (if enabled)
3120
+ disallow_overlap = bool(getattr(self, "disallow_overlap", False))
3121
+ overlap_alpha = float(getattr(self, "overlap_alpha", 0.0))
3122
+ forbidden = set()
3123
+ if disallow_overlap and overlap_alpha > 0.0:
3124
+ corner = _corner_scores(X_euc)
3125
+ m = max(1, min(n - 1, int(math.ceil(overlap_alpha * n))))
3126
+ order = np.argsort(-corner)
3127
+ forbidden = set(order[:m])
3128
+
3129
+ # ---- Compute kNN density for prototype selection
3130
+ dens = _knn_density_cosine(Xl2, k=int(getattr(self, "density_k", 10)), clip_neg=bool(getattr(self, "density_clip_neg", True)))
3131
+
3132
+ # ---- Prototypes via CELF with optional density weighting
3133
+ if self.verbose:
3134
+ print(f"\nFitting prototypes: Facility Location (k={self.n_prototypes})")
3135
+
3136
+ # Determine if density weighting is enabled
3137
+ density_weighted_fl = bool(getattr(self, "density_weighted_fl", False))
3138
+ density_k = int(getattr(self, "density_k", 10))
3139
+ density_clip_neg = bool(getattr(self, "density_clip_neg", True))
3140
+ weights = dens if density_weighted_fl else None
3141
+
3142
+ # Run facility location selector (it handles similarity matrix internally)
3143
+ selector = FacilityLocationSelector(
3144
+ n_prototypes=self.n_prototypes,
3145
+ deterministic=self.deterministic,
3146
+ speed_mode=self.speed_mode,
3147
+ verbose=self.verbose
3148
+ )
3149
+ P_idx, mg = selector.select(Xl2, weights=weights, forbidden=forbidden)
3150
+
3151
+ # Optional auto-k (Kneedle)
3152
+ knee = None
3153
+ if self.auto_n_prototypes == "kneedle" and mg.size >= 2:
3154
+ knee = self._kneedle(mg)
3155
+ if knee is not None and knee > 0:
3156
+ P_idx = P_idx[:knee]
3157
+ mg = mg[:knee]
3158
+
3159
+ self.prototype_indices_ = P_idx
3160
+ self.prototype_rows_ = index.to_numpy()[P_idx]
3161
+ self.marginal_gains_ = mg
3162
+ self.knee_ = knee
3163
+
3164
+ # Refine the knee from the curvature of the marginal gains (this overrides the kneedle estimate stored above)
3165
+ if len(mg) > 2:
3166
+ diffs = np.diff(mg)
3167
+ if len(diffs) > 1:
3168
+ diffs2 = np.diff(diffs)
3169
+ self.knee_ = int(np.argmax(np.abs(diffs2)) + 1)
3170
+ else:
3171
+ self.knee_ = 1
3172
+ else:
3173
+ self.knee_ = len(mg)
3174
+
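+ # Illustrative usage sketch of the prototype-selection step above, using the
+ # FacilityLocationSelector exported by this module (keyword values are
+ # examples; `select` returns prototype indices and marginal gains, exactly as
+ # consumed above):
+ #
+ #     import numpy as np
+ #     rng = np.random.default_rng(0)
+ #     X = rng.random((200, 6))
+ #     Xl2 = X / np.linalg.norm(X, axis=1, keepdims=True)
+ #     selector = FacilityLocationSelector(n_prototypes=10, deterministic=True,
+ #                                         speed_mode=False, verbose=False)
+ #     P_idx, mg = selector.select(Xl2, weights=None, forbidden=set())
+ #     diffs2 = np.diff(np.diff(mg))                   # curvature of gains
+ #     knee = int(np.argmax(np.abs(diffs2)) + 1) if len(mg) > 2 else len(mg)
+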
3175
+ # Training-time assignments & coverage
3176
+ best_cos, proto_label = self._assignments_cosine(Xl2, P_idx)
3177
+ self.assignments_ = proto_label
3178
+ self.coverage_ = best_cos
3179
+
3180
+ if self.verbose:
3181
+ print(f" Selected {len(P_idx)} prototypes, knee at {self.knee_}")
3182
+
3183
+ # ---- Stereotypes (verbose output)
3184
+ if self.verbose:
3185
+ if self.stereotype_column is not None:
3186
+ target_str = f"'{self.stereotype_target}'" if isinstance(self.stereotype_target, str) else f"{self.stereotype_target}"
3187
+ print(f"\nStereotypical configuration:")
3188
+ print(f" Target column: '{self.stereotype_column}'")
3189
+ print(f" Target value: {target_str}")
3190
+
3191
+ # Show target distribution if we have the data
3192
+ if hasattr(self, '_df_original_fit') and self.stereotype_column in self._df_original_fit.columns:
3193
+ stereo_vals = self._df_original_fit[self.stereotype_column]
3194
+ print(f" Column range: [{stereo_vals.min():.2f}, {stereo_vals.max():.2f}]")
3195
+
3196
+ if isinstance(self.stereotype_target, str):
3197
+ if self.stereotype_target == 'max':
3198
+ print(f" Targeting samples with maximum {self.stereotype_column}")
3199
+ elif self.stereotype_target == 'min':
3200
+ print(f" Targeting samples with minimum {self.stereotype_column}")
3201
+ else:
3202
+ distance_to_target = abs(stereo_vals - self.stereotype_target).mean()
3203
+ print(f" Mean distance to target: {distance_to_target:.2f}")
3204
+ else:
3205
+ print(f"\nStereotypical: Not configured (using feature extremeness)")
3206
+
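+ # Illustration only: the actual stereotype scoring lives in
+ # _compute_stereotypical_rank elsewhere in this module. One plausible way the
+ # 'max' / 'min' / numeric-target options described above can be turned into a
+ # [0, 1] score (not the package's exact formula):
+ #
+ #     import numpy as np
+ #     vals = np.array([18.0, 35.0, 52.0, 70.0])   # e.g. an 'age' column
+ #     target = 40.0                               # or 'max' / 'min'
+ #     if target == 'max':
+ #         score = (vals - vals.min()) / (vals.max() - vals.min() + 1e-12)
+ #     elif target == 'min':
+ #         score = (vals.max() - vals) / (vals.max() - vals.min() + 1e-12)
+ #     else:
+ #         d = np.abs(vals - float(target))
+ #         score = 1.0 - d / (d.max() + 1e-12)     # closest to target -> 1.0
+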
3207
+ # ============================================================
3208
+ # Internals - Scoring with fitted artifacts
3209
+ # ============================================================
3210
+ def _score_with_fitted(
3211
+ self,
3212
+ X_scaled: ArrayLike,
3213
+ X_l2: ArrayLike,
3214
+ index: pd.Index,
3215
+ stereotype_source: Optional[pd.Series] = None
3216
+ ) -> pd.DataFrame:
3217
+ """
3218
+ Score data with fitted artifacts.
3219
+
3220
+ CRITICAL: This method must handle dimension matching correctly for transform.
3221
+ MEMORY OPTIMIZED: Cleans up large temporaries during transform.
3222
+ """
3223
+ if (self.W_ is None or self.H_ is None) or self.prototype_indices_ is None:
3224
+ raise RuntimeError("Call fit first")
3225
+
3226
+ # Validate stored dimensions
3227
+ n_archetypes = self.n_archetypes_
3228
+ n_features_model = self.H_.shape[1]
3229
+
3230
+ # ---- Archetypal projections
3231
+ X_for_transform = X_scaled.astype(np.float64) if (sp is not None and sp.isspmatrix(X_scaled)) \
3232
+ else np.asarray(X_scaled, dtype=np.float64)
3233
+
3234
+ n_samples_transform = X_for_transform.shape[0]
3235
+ n_features_transform = X_for_transform.shape[1]
3236
+
3237
+ # CRITICAL VALIDATION
3238
+ if n_features_transform != n_features_model:
3239
+ raise ValueError(
3240
+ f"Feature dimension mismatch: transform data has {n_features_transform} features, "
3241
+ f"but model was trained with {n_features_model} features"
3242
+ )
3243
+
3244
+ if self.nmf_model_ is not None:
3245
+ # NMF method: use fitted model to transform
3246
+ W = self.nmf_model_.transform(X_for_transform)
3247
+ else:
3248
+ # AA method: compute weights from H using least squares
3249
+ H = self.H_
3250
+
3251
+ # Validate H dimensions before computation
3252
+ assert H.shape == (n_archetypes, n_features_model), \
3253
+ f"H dimension error: {H.shape} vs ({n_archetypes}, {n_features_model})"
3254
+
3255
+ HHT = H @ H.T
3256
+ assert HHT.shape == (n_archetypes, n_archetypes), \
3257
+ f"HHT dimension error: {HHT.shape} vs ({n_archetypes}, {n_archetypes})"
3258
+
3259
+ # Regularized inverse
3260
+ HHT_inv = np.linalg.pinv(HHT + 1e-6 * np.eye(HHT.shape[0]))
3261
+
3262
+ # Matrix multiplication with dimension checking
3263
+ W = X_for_transform @ H.T @ HHT_inv
3264
+
3265
+ # MEMORY CLEANUP: Free intermediate matrices
3266
+ _cleanup_memory(HHT, HHT_inv)
3267
+
3268
+ # Ensure non-negative
3269
+ W = np.maximum(W, 0)
3270
+
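+ # In matrix form, the projection above is the ridge-regularised least-squares
+ # solution
+ #
+ #     W = X @ H.T @ inv(H @ H.T + eps * I),   eps = 1e-6,
+ #
+ # i.e. the minimiser of ||X - W @ H||_F^2 + eps * ||W||_F^2, followed by
+ # clipping W >= 0 so the loadings stay interpretable as archetype weights.
+ # Quick numerical check (illustrative):
+ #
+ #     H = np.eye(2); X = np.array([[0.3, 0.7]])
+ #     X @ H.T @ np.linalg.pinv(H @ H.T + 1e-6 * np.eye(2))
+ #     # -> approximately [[0.3, 0.7]] (the ridge term shrinks it very slightly)
+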
3271
+ # Validate W dimensions
3272
+ assert W.shape == (n_samples_transform, n_archetypes), \
3273
+ f"W dimension error: {W.shape} vs ({n_samples_transform}, {n_archetypes})"
3274
+
3275
+ # Normalize W
3276
+ W_row_sum = W.sum(axis=1, keepdims=True)
3277
+ W_row_sum[W_row_sum == 0.0] = 1.0
3278
+ W_norm = W / W_row_sum
3279
+ arch_wmax = W_norm.max(axis=1)
3280
+
3281
+ # MEMORY CLEANUP: Free W_norm after extracting needed values
3282
+ _cleanup_memory(W_norm, W_row_sum)
3283
+
3284
+ # Distances to archetypes
3285
+ X_dense = X_for_transform.toarray() if (sp is not None and sp.isspmatrix(X_for_transform)) \
3286
+ else np.asarray(X_for_transform)
3287
+
3288
+ dists_c = np.sqrt(np.maximum(
3289
+ ((X_dense[:, None, :] - self.H_[None, :, :]) ** 2).sum(axis=2),
3290
+ 0.0
3291
+ ))
3292
+ arch_d_min = dists_c.min(axis=1)
3293
+
3294
+ # MEMORY CLEANUP: Free distance matrix after extracting needed values
3295
+ _cleanup_memory(dists_c)
3296
+
3297
+ # ---- Prototypes: cosine assignment
3298
+ P_idx = self.prototype_indices_
3299
+ best_cos, proto_label = self._assignments_cosine(X_l2, P_idx)
3300
+
3301
+ # Euclidean distance to prototypes
3302
+ X_euc = X_scaled.toarray().astype(np.float64, copy=False) \
3303
+ if (sp is not None and sp.isspmatrix(X_scaled)) else np.asarray(X_scaled, dtype=np.float64)
3304
+ P_mat = X_euc[P_idx] if P_idx.max() < len(X_euc) else self.W_[P_idx]
3305
+
3306
+ best_euc = _euclidean_min_to_set_dense(X_euc, P_mat, max_memory_mb=self.max_memory_mb)
3307
+
3308
+ # MEMORY CLEANUP: Free P_mat after distance computation
3309
+ _cleanup_memory(P_mat)
3310
+
3311
+ norm95 = np.percentile(best_euc, 95) or 1.0
3312
+ proto_d_norm95 = np.clip(best_euc / norm95, 0.0, 1.0)
3313
+
3314
+ # ---- Compute ranks
3315
+ # Archetypal rank
3316
+ eps = 1e-12
3317
+ col_min = X_euc.min(axis=0)
3318
+ col_max = X_euc.max(axis=0)
3319
+ hits_edge = (col_min <= eps) & (col_max >= 1.0 - eps)
3320
+ idxs = np.where(hits_edge)[0]
3321
+ if idxs.size >= 2:
3322
+ var = X_euc[:, idxs].var(axis=0)
3323
+ take = idxs[np.argsort(-var)[:2]]
3324
+ else:
3325
+ var = X_euc.var(axis=0)
3326
+ take = np.argsort(-var)[:2] if X_euc.shape[1] >= 2 else np.array([0])
3327
+ X2 = X_euc[:, take] if take.size else X_euc[:, :1]
3328
+ m = np.minimum(X2, 1.0 - X2)
3329
+ dmin = np.sqrt(np.sum(m * m, axis=1))
3330
+ denom = math.sqrt(X2.shape[1]) if X2.shape[1] >= 1 else 1.0
3331
+ corner_score = 1.0 - np.clip(dmin / denom, 0.0, 1.0)
3332
+
3333
+ archetypal_score = arch_wmax * 0.7 + corner_score * 0.3
3334
+
3335
+ # MEMORY CLEANUP: Free intermediate arrays
3336
+ _cleanup_memory(X2, col_min, col_max, corner_score)
3337
+
3338
+ # Prototypical rank
3339
+ prototypical_score = (1.0 - proto_d_norm95) * 0.5 + best_cos * 0.5
3340
+
3341
+ # Stereotypical rank
3342
+ stereotypical_scores = self._compute_stereotypical_rank(X_scaled, index, stereotype_source)
3343
+
3344
+ # ---- Build output DataFrame (only keep rank columns)
3345
+ out = pd.DataFrame(
3346
+ {
3347
+ "archetypal_rank": np.round(archetypal_score, 10),
3348
+ "prototypical_rank": np.round(prototypical_score, 10),
3349
+ "stereotypical_rank": np.round(stereotypical_scores, 10),
3350
+ },
3351
+ index=index,
3352
+ )
3353
+
3354
+ # MEMORY CLEANUP: Force GC before returning (transform often called repeatedly)
3355
+ _cleanup_memory(X_dense, X_euc, W, force_gc=True)
3356
+
3357
+ return out
3358
+
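+ # Illustrative sketch of how the three output columns above are composed from
+ # the intermediate quantities (weights exactly as written in
+ # _score_with_fitted; the arrays here are dummies just to make the blend
+ # explicit):
+ #
+ #     import numpy as np
+ #     rng = np.random.default_rng(0)
+ #     arch_wmax, corner_score, proto_d_norm95, best_cos = rng.random((4, 5))
+ #     archetypal_rank   = 0.7 * arch_wmax + 0.3 * corner_score
+ #     prototypical_rank = 0.5 * (1.0 - proto_d_norm95) + 0.5 * best_cos
+ #     # stereotypical_rank comes from _compute_stereotypical_rank
+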
3359
+ # ------------------------------------------------------------
3360
+ def _assignments_cosine(
3361
+ self,
3362
+ X_l2: ArrayLike,
3363
+ prototype_indices: np.ndarray
3364
+ ) -> Tuple[np.ndarray, np.ndarray]:
3365
+ """
3366
+ Compute cosine similarity assignments to prototypes.
3367
+
3368
+ OPTIMIZED: Uses JIT-compiled function for 2-3× speedup.
3369
+ """
3370
+ # Convert to dense if needed
3371
+ Xl2_dense = X_l2.toarray() if (sp is not None and sp.isspmatrix(X_l2)) else np.asarray(X_l2, dtype=np.float64)
3372
+ P_l2 = Xl2_dense[prototype_indices]
3373
+
3374
+ n_samples = Xl2_dense.shape[0]
3375
+ n_protos = len(prototype_indices)
3376
+
3377
+ # OPTIMIZED: Use JIT for small to medium datasets
3378
+ if n_samples * n_protos < 1000000:
3379
+ sims = _cosine_similarity_jit(Xl2_dense, P_l2)
3380
+ else:
3381
+ # For very large datasets, use numpy (better for huge matrices)
3382
+ sims = Xl2_dense @ P_l2.T
3383
+ np.maximum(sims, 0.0, out=sims)
3384
+
3385
+ best_idx = sims.argmax(axis=1).astype(int)
3386
+ best_sim = sims[np.arange(len(sims)), best_idx]
3387
+
3388
+ return best_sim, best_idx
3389
+
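+ # Illustrative sketch of the assignment step above: with L2-normalised rows,
+ # the dot product is the cosine similarity, and each sample is labelled with
+ # its most similar prototype:
+ #
+ #     import numpy as np
+ #     rng = np.random.default_rng(0)
+ #     X = rng.normal(size=(20, 4))
+ #     Xl2 = X / np.linalg.norm(X, axis=1, keepdims=True)
+ #     proto_idx = np.array([3, 7, 11])
+ #     sims = Xl2 @ Xl2[proto_idx].T                    # (20, 3)
+ #     best_idx = sims.argmax(axis=1)                   # nearest prototype
+ #     best_sim = sims[np.arange(len(sims)), best_idx]  # its cosine similarity
+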
3390
+ def _kneedle(self, gains: np.ndarray) -> Optional[int]:
3391
+ U = np.cumsum(gains)
3392
+ if U[-1] == 0.0:
3393
+ return None
3394
+ U_norm = U / U[-1]
3395
+ k = gains.size
3396
+ x = np.linspace(1 / k, 1.0, k)
3397
+ diff = U_norm - x
3398
+ return int(np.argmax(diff)) + 1
3399
+
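+ # Worked example of the kneedle rule above: normalise the cumulative gains,
+ # compare against the diagonal, and keep everything up to the point of
+ # maximum deviation:
+ #
+ #     import numpy as np
+ #     gains = np.array([5.0, 3.0, 1.0, 0.5, 0.25])
+ #     U = np.cumsum(gains) / np.cumsum(gains)[-1]       # ~[0.51, 0.82, 0.92, 0.97, 1.0]
+ #     x = np.linspace(1 / len(gains), 1.0, len(gains))  #  [0.2, 0.4, 0.6, 0.8, 1.0]
+ #     knee = int(np.argmax(U - x)) + 1                  # -> 2 prototypes kept
+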
3400
+ def _record_settings(self, tc: _ThreadControl):
3401
+ self.settings_ = {
3402
+ "deterministic": bool(self.deterministic),
3403
+ "speed_mode": bool(self.speed_mode),
3404
+ "thread_limit": tc.effective_limit,
3405
+ "random_state": int(self.random_state),
3406
+ "dtype": str(self.dtype),
3407
+ "max_memory_mb": int(self.max_memory_mb),
3408
+ }
3409
+
3410
+
3411
+ __all__ = [
3412
+ "DataTypical",
3413
+ "FacilityLocationSelector",
3414
+ "DataTypicalError",
3415
+ "ConfigError",
3416
+ "MemoryBudgetError",
3417
+ ]
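+
+ # ----------------------------------------------------------------------------
+ # Illustrative end-to-end usage sketch (kept as a comment so the module's
+ # import behaviour is unchanged). It assumes fit/transform accept a pandas
+ # DataFrame, as the index handling above suggests, and uses only constructor
+ # parameters referenced in this file; see the class definition for the full
+ # signature and defaults:
+ #
+ #     import numpy as np
+ #     import pandas as pd
+ #     from datatypical import DataTypical
+ #
+ #     rng = np.random.default_rng(0)
+ #     df = pd.DataFrame(rng.random((200, 5)), columns=list("abcde"))
+ #     dt = DataTypical(n_prototypes=10, archetypal_method='nmf',
+ #                      random_state=42, verbose=False)
+ #     dt.fit(df)
+ #     scores = dt.transform(df)
+ #     # scores has archetypal_rank, prototypical_rank, stereotypical_rank columns
+ #     top = scores.sort_values('prototypical_rank', ascending=False).head(5)
+ # ----------------------------------------------------------------------------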