datatypical 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datatypical-0.7.0.dist-info/METADATA +302 -0
- datatypical-0.7.0.dist-info/RECORD +7 -0
- datatypical-0.7.0.dist-info/WHEEL +5 -0
- datatypical-0.7.0.dist-info/licenses/LICENSE +21 -0
- datatypical-0.7.0.dist-info/top_level.txt +2 -0
- datatypical.py +3417 -0
- datatypical_viz.py +912 -0
datatypical.py
ADDED
@@ -0,0 +1,3417 @@
"""
DataTypical v0.7 --- Dual-Perspective Significance with Shapley Explanations
===========================================================================

Revolutionary framework combining geometric and influence-based significance.

Key Innovation:
- Actual significance: Samples that ARE archetypal/prototypical/stereotypical (geometric)
- Formative instances: Samples that MAKE the dataset archetypal/prototypical/stereotypical (Shapley)
- Local explanations: WHY each sample is significant (feature attributions)

Two complementary perspectives:
1. LOCAL: "This sample IS significant because features X, Y contribute most"
2. GLOBAL: "This sample CREATES significance by defining the distribution and boundary"

What's new in v0.7:
- shapley_mode parameter (True/False)
- When True: computes explanations + formative instances
- Dual rankings: *_rank (actual) + *_shapley_rank (formative)
- Novel value functions: convex hull, coverage, extremeness
- Parallel Shapley computation with Option A (accurate v0.4 explanations)

All v0.6 features retained:
- Local explanations via get_shapley_explanations()
- Global explanations to identify formative instances

All v0.5 features retained:
- Tabular/Text/Graph support
- Label column preservation
- Graph topology features

All v0.4 features retained:
- User-configurable stereotypes

Sections:
[A] Exceptions & Globals
[B] Thread Control
[C] Helpers (sparse/dense math)
[D] Facility-Location (CELF, deterministic)
[E] Shapley Significance Engine (NEW in v0.6)
[F] DataTypical API
[G] Graph Topology Features
[H] Stereotype Computation
"""

from __future__ import annotations

from dataclasses import dataclass, field, fields as dc_fields
from typing import Iterable, List, Optional, Dict, Tuple, Union, Callable

import heapq
import math
import gc
import warnings
import hashlib
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from threadpoolctl import threadpool_limits
from joblib import Parallel, delayed
from sklearn.exceptions import ConvergenceWarning

try:
    from numba import jit, prange
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    # Dummy decorator if numba not available
    def jit(*args, **kwargs):
        def decorator(func):
            return func
        return decorator
    prange = range  # plain-range fallback so kernels written with prange still run

try:
    import scipy.sparse as sp
except Exception:
    sp = None
ArrayLike = Union[np.ndarray, "sp.spmatrix"]

try:
    from scipy.spatial import ConvexHull
    from scipy.spatial.distance import cdist
except Exception:
    ConvexHull = None
    cdist = None

try:
    from py_pcha import PCHA
except ImportError:
    PCHA = None

try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False

# ============================================================
# [A] Exceptions & Globals
# ============================================================
class DataTypicalError(Exception):
    pass

class MemoryBudgetError(DataTypicalError):
    pass

class ConfigError(DataTypicalError):
    pass

def _seed_everything(seed: int) -> None:
    np.random.seed(seed)


# ============================================================
# [B] Thread Control
# ============================================================
class _ThreadControl:
    def __init__(self, deterministic: bool = True):
        self.deterministic = deterministic
        self._ctx = None
        self.effective_limit = None

    def __enter__(self):
        if self.deterministic:
            self._ctx = threadpool_limits(limits=1)
            self.effective_limit = 1
        else:
            self._ctx = threadpool_limits(limits=None)
            self.effective_limit = None
        self._ctx.__enter__()
        return self

    def __exit__(self, exc_type, exc, tb):
        if self._ctx is not None:
            self._ctx.__exit__(exc_type, exc, tb)


# ============================================================
# [C] Helpers (sparse/dense math)
# ============================================================
def _cleanup_memory(*arrays, force_gc: bool = False) -> None:
    """
    Explicitly delete arrays and optionally force garbage collection.

    MEMORY OPTIMIZED: Python's GC doesn't always free memory immediately.
    This forces cleanup of large temporaries to reduce peak memory usage.
    """
    for arr in arrays:
        if arr is not None:
            del arr

    if force_gc:
        gc.collect()
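
# Note: `del arr` above only drops the local loop reference; any references the
# caller still holds keep those arrays alive, so the practical effect of this
# helper is the optional gc.collect() plus documenting intent at call sites.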


def _l2_normalize_rows_dense(X: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return X / norms



def _sparse_l2_normalize_rows(X: "sp.spmatrix") -> "sp.spmatrix":
    if sp is None:
        raise ImportError("scipy is required for sparse operations.")
    if not sp.isspmatrix_csr(X):
        X = X.tocsr(copy=False)
    sq = X.multiply(X).sum(axis=1)
    norms = np.sqrt(np.maximum(np.asarray(sq).ravel(), 0.0))
    norms[norms == 0.0] = 1.0
    D = sp.diags(1.0 / norms)
    return D @ X


def _sparse_minmax_0_1_nonneg(M: "sp.spmatrix") -> "sp.spmatrix":
    if sp is None:
        raise ImportError("scipy is required for sparse operations.")
    if not sp.isspmatrix(M):
        raise TypeError("Expected a scipy.sparse matrix.")
    A = M.tocsc(copy=False)
    # CRITICAL: Must use .toarray() to convert sparse matrix to dense
    col_max = A.max(axis=0).toarray().ravel()
    col_max[col_max == 0.0] = 1.0
    return (A @ sp.diags(1.0 / col_max)).tocsr()


def _chunk_len(n_left: int, n_right: int, bytes_per: int, max_memory_mb: int) -> int:
    if max_memory_mb <= 0:
        raise MemoryBudgetError("max_memory_mb must be positive")
    max_bytes = max_memory_mb * 1024 * 1024
    return max(1, min(n_right, int(max_bytes // max(8, n_left * bytes_per))))
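

# Illustrative sketch (the `_example_chunk_len_budget` helper below is assumed,
# not part of this module): the chunk length is the number of right-hand rows
# whose pairwise buffer against all n_left rows fits in max_memory_mb. For
# n_left=10_000 rows, bytes_per=8 and a 2048 MB budget:
#   max_bytes = 2048 * 1024 * 1024 = 2_147_483_648
#   chunk     = min(n_right, max_bytes // (10_000 * 8)) = min(n_right, 26_843)
def _example_chunk_len_budget() -> int:
    # A full 10k x 50k float64 distance block would need ~4 GB; _chunk_len caps
    # the right-hand block so each scratch buffer stays inside the 2048 MB budget.
    return _chunk_len(n_left=10_000, n_right=50_000, bytes_per=8, max_memory_mb=2048)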


def _ensure_dtype(X: np.ndarray, dtype: str = 'float32') -> np.ndarray:
    """
    Ensure array has specified dtype, converting if necessary.

    MEMORY OPTIMIZED: Default to float32 (4 bytes) instead of float64 (8 bytes).
    """
    target_dtype = np.float32 if dtype == 'float32' else np.float64

    if X.dtype != target_dtype:
        return X.astype(target_dtype, copy=False)
    return X


def _euclidean_min_to_set_dense(
    X: np.ndarray, Y: np.ndarray, max_memory_mb: int = 2048
) -> np.ndarray:
    """
    Compute minimum Euclidean distance from each row of X to any row in Y.

    OPTIMIZED: Uses Numba JIT for 2-3× speedup and better memory efficiency.
    """
    n, d = X.shape
    m = Y.shape[0]

    # For small problems, use JIT-compiled direct computation
    if n * m < 100000:
        return _euclidean_min_jit(X, Y)

    # For large problems, use chunked computation with JIT
    best = np.full(n, np.inf, dtype=np.float64)
    block = _chunk_len(n, m, bytes_per=8, max_memory_mb=max_memory_mb)

    # Pre-compute X squared norms once
    x2 = np.sum(X * X, axis=1)

    for s in range(0, m, block):
        e = min(m, s + block)
        YY = Y[s:e]

        # Use JIT-compiled function for this chunk
        chunk_dists = _euclidean_chunk_jit(X, YY, x2)
        best = np.minimum(best, chunk_dists)

    return best


@jit(nopython=True, parallel=True, cache=True, fastmath=True)
def _euclidean_min_jit(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    JIT-compiled minimum Euclidean distance computation.

    Uses parallel loops for multi-core acceleration.
    """
    n = X.shape[0]
    m = Y.shape[0]
    d = X.shape[1]

    min_dists = np.empty(n, dtype=np.float64)

    # Parallel loop over X samples (explicit with prange)
    for i in prange(n):  # Changed from range to prange
        min_dist = np.inf

        for j in range(m):
            dist_sq = 0.0
            for k in range(d):
                diff = X[i, k] - Y[j, k]
                dist_sq += diff * diff

            if dist_sq < min_dist:
                min_dist = dist_sq

        min_dists[i] = np.sqrt(max(min_dist, 0.0))

    return min_dists


@jit(nopython=True, cache=True, fastmath=True)
def _euclidean_chunk_jit(
    X: np.ndarray,
    Y_chunk: np.ndarray,
    x2: np.ndarray
) -> np.ndarray:
    """
    JIT-compiled chunked distance computation using pre-computed norms.

    Computes: sqrt(||x||² + ||y||² - 2⟨x,y⟩) efficiently.
    """
    n = X.shape[0]
    m = Y_chunk.shape[0]
    d = X.shape[1]

    min_dists = np.empty(n, dtype=np.float64)

    # Pre-compute Y chunk squared norms
    y2 = np.empty(m, dtype=np.float64)
    for j in range(m):
        y2_val = 0.0
        for k in range(d):
            y2_val += Y_chunk[j, k] * Y_chunk[j, k]
        y2[j] = y2_val

    # Loop over X samples (serial: this kernel is compiled without parallel=True)
    for i in range(n):
        min_dist_sq = np.inf

        for j in range(m):
            # Compute dot product
            dot = 0.0
            for k in range(d):
                dot += X[i, k] * Y_chunk[j, k]

            # Distance squared using pre-computed norms
            dist_sq = x2[i] + y2[j] - 2.0 * dot

            if dist_sq < min_dist_sq:
                min_dist_sq = dist_sq

        min_dists[i] = np.sqrt(max(min_dist_sq, 0.0))

    return min_dists
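

# Illustrative sketch (assumed helper, not part of the module): the chunked
# kernel above relies on the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x, y>.
# This checks the JIT path against a direct NumPy computation on a tiny input.
def _example_check_chunked_distances() -> bool:
    rng = np.random.RandomState(0)
    X = rng.rand(8, 3)
    Y = rng.rand(5, 3)
    x2 = np.sum(X * X, axis=1)
    fast = _euclidean_chunk_jit(X, Y, x2)
    # Brute-force reference: min over Y of the row-wise Euclidean distance.
    direct = np.min(np.linalg.norm(X[:, None, :] - Y[None, :, :], axis=2), axis=1)
    return bool(np.allclose(fast, direct, atol=1e-8))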

@jit(nopython=True, cache=True, fastmath=True)
def _pairwise_euclidean_jit(X: np.ndarray) -> np.ndarray:
    """
    JIT-compiled pairwise Euclidean distance matrix.

    Returns the full symmetric matrix (each pair is computed once and mirrored).
    """
    n = X.shape[0]
    d = X.shape[1]

    # Compute full distance matrix (symmetric)
    dists = np.zeros((n, n), dtype=np.float64)

    # Outer loop over samples (serial: this kernel is compiled without parallel=True)
    for i in range(n):
        for j in range(i + 1, n):
            dist_sq = 0.0
            for k in range(d):
                diff = X[i, k] - X[j, k]
                dist_sq += diff * diff

            dist = np.sqrt(max(dist_sq, 0.0))
            dists[i, j] = dist
            dists[j, i] = dist  # Symmetric

    return dists


@jit(nopython=True, cache=True, fastmath=True)
def _cosine_similarity_jit(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    JIT-compiled cosine similarity between L2-normalized vectors.

    For L2-normalized data, this is just the dot product.
    """
    n = X.shape[0]
    m = Y.shape[0]
    d = X.shape[1]

    sims = np.empty((n, m), dtype=np.float64)

    # Loop over X samples
    for i in range(n):
        for j in range(m):
            dot = 0.0
            for k in range(d):
                dot += X[i, k] * Y[j, k]
            sims[i, j] = max(dot, 0.0)  # Clip negative similarities

    return sims


# ============================================================
# [D] Facility-Location (CELF, deterministic)
# ============================================================
@dataclass
class FacilityLocationSelector:
    def __init__(self, n_prototypes=10, deterministic=True, speed_mode=False, verbose=False):
        self.n_prototypes = int(n_prototypes)
        self.deterministic = bool(deterministic)
        self.speed_mode = bool(speed_mode)
        self.verbose = bool(verbose)

    def select(self, X_l2, weights=None, forbidden=None):
        """
        Deterministic CELF for facility-location with:
          • content-based tie-breaking (perm-invariant),
          • optional client weights (e.g., density),
          • optional forbidden candidate set (still count as clients).
        Expects rows to be L2-normalized. Works with dense or sparse input.
        Returns: (selected_indices, marginal_gains)
        """
        import numpy as np, heapq, hashlib

        # --- dense float64 view
        if sp is not None and sp.isspmatrix(X_l2):
            X = X_l2.toarray().astype(np.float64, copy=False)
        else:
            X = np.asarray(X_l2, dtype=np.float64)
        n = X.shape[0]
        if n == 0:
            return np.array([], dtype=int), np.array([], dtype=float)

        # --- client weights (normalize to mean 1 for scale stability)
        if weights is None:
            w = np.ones(n, dtype=np.float64)
        else:
            w = np.asarray(weights, dtype=np.float64).ravel()
            m = float(w.mean())
            w = w / m if m > 0 else np.ones_like(w)

        # --- forbidden candidates (excluded from selection, included as clients)
        forb = np.zeros(n, dtype=bool)
        if forbidden is not None:
            forb_idx = np.asarray(list(forbidden), dtype=int)
            forb_idx = forb_idx[(forb_idx >= 0) & (forb_idx < n)]
            forb[forb_idx] = True

        # --- target number of prototypes (cap to available candidates)
        k_req = int(getattr(self, "n_prototypes", min(10, n)))
        available = n - int(forb.sum())
        k = max(0, min(k_req, available))
        if k == 0:
            return np.array([], dtype=int), np.array([], dtype=float)

        # --- permutation-invariant tie-breaker: hash of row content
        def row_key(i: int) -> int:
            h = hashlib.blake2b(X[i].tobytes(), digest_size=8)
            return int.from_bytes(h.digest(), "big", signed=False)
        keys = np.fromiter((row_key(i) for i in range(n)), dtype=np.uint64, count=n)

        # --- CELF init
        best = np.zeros(n, dtype=np.float64)  # current best similarity per client
        last_eval = np.full(n, -1, dtype=np.int64)  # last #selected when gain was computed
        last_gain = np.zeros(n, dtype=np.float64)

        # Initial exact gains: g0[c] = sum_i w_i * max(0, <x_i, x_c>)
        g0 = np.zeros(n, dtype=np.float64)
        # block multiply to limit memory
        target_bytes = 256 * 1024 * 1024  # 256MB scratch
        item = np.dtype(np.float64).itemsize
        max_b = max(1, int(target_bytes // max(1, n * item)))
        bsz = max(1, min(n, max_b))
        XT = X.T
        for s in range(0, n, bsz):
            e = min(n, s + bsz)
            S = X[s:e] @ XT  # (e-s, n)
            np.maximum(S, 0.0, out=S)
            g0 += (w[s:e, None] * S).sum(axis=0, dtype=np.float64)

        last_gain[:] = g0
        last_eval[:] = 0

        # heap items: (-gain_estimate, key, idx) – ties broken by content key
        heap = [(-float(g0[c]), int(keys[c]), int(c)) for c in range(n) if not forb[c]]
        heapq.heapify(heap)

        selected: list[int] = []
        gains: list[float] = []
        it = 0
        while len(selected) < k and heap:
            neg_g_est, _, c = heapq.heappop(heap)
            if last_eval[c] == it:
                # accept candidate
                selected.append(c)
                gains.append(float(last_gain[c]))
                s = X @ X[c]
                np.maximum(s, 0.0, out=s)
                np.maximum(best, s, out=best)
                it += 1
                continue
            # refresh exact marginal gain vs current 'best'
            s = X @ X[c]
            improv = s - best
            improv[improv < 0.0] = 0.0
            g_exact = float((w * improv).sum(dtype=np.float64))
            last_gain[c] = g_exact
            last_eval[c] = it
            heapq.heappush(heap, (-g_exact, int(keys[c]), int(c)))

        return np.asarray(selected, dtype=int), np.asarray(gains, dtype=float)


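    # Note: the second `select` definition below replaces the CELF implementation
    # defined above within the same class body; Python keeps only the last binding
    # of a method name, so the FAISS-aware variant is the one that runs.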
    def select(self, X_l2, weights=None, forbidden=None):
        """
        Select prototypes using lazy CELF with optional FAISS acceleration.

        OPTIMIZED: Uses FAISS for datasets with n > 1,000 samples for massive speedup.
        MEMORY OPTIMIZED: Explicit cleanup of similarity matrix after use.
        """
        import numpy as np

        if sp is not None and sp.isspmatrix(X_l2):
            X = X_l2.toarray().astype(np.float64, copy=False)
        else:
            X = np.asarray(X_l2, dtype=np.float64)
        n = X.shape[0]
        if n == 0:
            return np.array([], dtype=int), np.array([], dtype=float)

        # Normalize weights
        if weights is None:
            w = np.ones(n, dtype=np.float64)
        else:
            w = np.asarray(weights, dtype=np.float64).ravel()
            m = float(w.mean())
            w = w / m if m > 0 else np.ones_like(w)

        # OPTIMIZED: Use FAISS for large datasets if available
        use_faiss = FAISS_AVAILABLE and n > 1000 and not self.speed_mode

        if use_faiss:
            if self.verbose:
                print(f"  Using FAISS acceleration for n={n}")
            result = self._select_with_faiss(X, w, forbidden)
            # MEMORY CLEANUP: Free X copy before returning
            _cleanup_memory(X, force_gc=True)
            return result

        # Otherwise use the cached similarity matrix approach
        import heapq, hashlib

        # Handle forbidden indices
        forb = np.zeros(n, dtype=bool)
        if forbidden is not None:
            forb_idx = np.asarray(list(forbidden), dtype=int)
            forb_idx = forb_idx[(forb_idx >= 0) & (forb_idx < n)]
            forb[forb_idx] = True

        k_req = int(getattr(self, "n_prototypes", min(10, n)))
        available = n - int(forb.sum())
        k = max(0, min(k_req, available))
        if k == 0:
            return np.array([], dtype=int), np.array([], dtype=float)

        # Pre-compute similarity matrix
        XT = X.T
        S = X @ XT
        np.maximum(S, 0.0, out=S)

        # MEMORY CLEANUP: Free XT after similarity computation
        _cleanup_memory(XT)

        # Pre-compute weighted candidate similarities
        S_weighted = w[None, :] * S
        candidate_sims = S_weighted.sum(axis=1)

        # MEMORY CLEANUP: Free S_weighted after computing candidate_sims
        _cleanup_memory(S_weighted)

        # Generate deterministic keys
        def row_key(i: int) -> int:
            h = hashlib.blake2b(X[i].tobytes(), digest_size=8)
            return int.from_bytes(h.digest(), "big", signed=False)
        keys = np.fromiter((row_key(i) for i in range(n)), dtype=np.uint64, count=n)

        # CELF state tracking
        best = np.zeros(n, dtype=np.float64)
        last_eval = np.full(n, -1, dtype=np.int64)
        last_gain = candidate_sims.copy()
        last_eval[:] = 0

        # Initialize heap
        heap = [(-float(candidate_sims[c]), int(keys[c]), int(c))
                for c in range(n) if not forb[c]]
        heapq.heapify(heap)

        selected = []
        gains = []
        it = 0

        while len(selected) < k and heap:
            neg_g_est, _, c = heapq.heappop(heap)

            if last_eval[c] == it:
                selected.append(c)
                gains.append(float(last_gain[c]))
                s_c = S[c, :]
                np.maximum(best, s_c, out=best)
                it += 1
                continue

            # Lazy evaluation
            s_c = S[c, :]
            improv = s_c - best
            improv[improv < 0.0] = 0.0
            g_exact = float((w * improv).sum(dtype=np.float64))

            last_gain[c] = g_exact
            last_eval[c] = it
            heapq.heappush(heap, (-g_exact, int(keys[c]), int(c)))

        # MEMORY CLEANUP: Free large arrays before returning
        _cleanup_memory(S, X, best, last_gain, candidate_sims, force_gc=True)

        return np.asarray(selected, dtype=int), np.asarray(gains, dtype=float)
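

# Illustrative sketch (assumed example, not part of the module): a minimal run
# of the lazy-greedy facility-location selector above on random data. Rows are
# L2-normalized first, as the selector's docstring requires; with n < 1000 the
# similarity-matrix path is used, so no FAISS install is needed.
def _example_facility_location_selection():
    rng = np.random.RandomState(0)
    X = _l2_normalize_rows_dense(rng.rand(200, 6))
    selector = FacilityLocationSelector(n_prototypes=5, deterministic=True)
    # `selected` holds the row indices of the chosen prototypes; `gains` holds
    # the marginal facility-location gain recorded when each was accepted.
    selected, gains = selector.select(X)
    return selected, gains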


# ============================================================
# [E] Shapley Significance Engine (NEW in v0.6)
# ============================================================

class ShapleyEarlyStopping:
    """Early stopping for Shapley convergence using relative change."""

    def __init__(self, patience: int = 10, tolerance: float = 0.01):
        self.patience = patience
        self.tolerance = tolerance
        self.history = []
        self.stable_count = 0

    def update(self, shapley_estimates: np.ndarray, n_perms: int) -> Tuple[bool, Dict]:
        if n_perms < 20:
            return False, {'converged': False, 'n_permutations': n_perms}

        self.history.append(shapley_estimates.copy())

        if len(self.history) < 2:
            return False, {'converged': False, 'n_permutations': n_perms}

        old = self.history[-2]
        new = self.history[-1]

        denom = np.abs(old) + 1e-12
        rel_change = np.abs(new - old) / denom
        max_rel_change = np.max(rel_change)
        mean_rel_change = np.mean(rel_change)

        if mean_rel_change < self.tolerance:
            self.stable_count += 1
        else:
            self.stable_count = 0

        should_stop = self.stable_count >= self.patience

        info = {
            'converged': should_stop,
            'n_permutations': n_perms,
            'mean_rel_change': float(mean_rel_change),
            'max_rel_change': float(max_rel_change),
            'stable_iterations': self.stable_count
        }

        return should_stop, info
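

# Illustrative sketch (assumed example, not part of the module): the stopping
# rule above declares convergence once the mean relative change between
# successive Shapley estimates stays below `tolerance` for `patience`
# consecutive updates, and never before 20 permutations have been seen.
def _example_early_stopping_behavior() -> bool:
    stopper = ShapleyEarlyStopping(patience=3, tolerance=0.01)
    estimate = np.ones(10)
    stopped = False
    for n_perms in range(20, 40):
        # Feed a nearly constant estimate, so the relative change is ~1e-6.
        stopped, info = stopper.update(estimate + 1e-6 * n_perms, n_perms)
        if stopped:
            break
    return stopped  # True after `patience` stable updates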


@jit(nopython=True, cache=True, fastmath=True)
def _compute_marginals_jit(
    perm: np.ndarray,
    values: np.ndarray,
    n_samples: int,
    n_features: int
) -> np.ndarray:
    """
    JIT-compiled function to compute Shapley marginal contributions.

    This is the performance-critical inner loop - compiled to machine code by Numba.

    Parameters
    ----------
    perm : array of sample indices in permutation order
    values : array of value function results for each coalition size
    n_samples : number of samples
    n_features : number of features

    Returns
    -------
    shapley_contrib : (n_samples, n_features) array of marginal contributions
    """
    shapley_contrib = np.zeros((n_samples, n_features), dtype=np.float64)

    for j in range(n_samples):
        sample_idx = perm[j]
        marginal = values[j+1] - values[j]

        # Broadcast marginal across all features
        for f in range(n_features):
            shapley_contrib[sample_idx, f] = marginal / n_features

    return shapley_contrib


@jit(nopython=True, cache=True, fastmath=True)
def _compute_feature_marginals_jit(
    perm: np.ndarray,
    values: np.ndarray,
    n_features: int
) -> np.ndarray:
    """
    JIT-compiled function to compute feature-level Shapley marginal contributions.

    Parameters
    ----------
    perm : array of feature indices in permutation order
    values : array of value function results for each feature coalition size
    n_features : number of features

    Returns
    -------
    shapley_contrib : (n_features,) array of marginal contributions
    """
    shapley_contrib = np.zeros(n_features, dtype=np.float64)

    for j in range(n_features):
        feat_idx = perm[j]
        marginal = values[j+1] - values[j]
        shapley_contrib[feat_idx] = marginal

    return shapley_contrib
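

# Illustrative sketch (assumed example, not part of the module): for one
# permutation, each player's marginal contribution is v(first j+1 players)
# minus v(first j players). With cumulative values [0, 1, 3, 6] over the
# permutation (2, 0, 1), player 2 gets 1, player 0 gets 2, player 1 gets 3.
def _example_feature_marginals() -> np.ndarray:
    perm = np.array([2, 0, 1], dtype=np.int64)
    values = np.array([0.0, 1.0, 3.0, 6.0])
    # Expected result: array([2.0, 3.0, 1.0]) indexed by player/feature id.
    return _compute_feature_marginals_jit(perm, values, 3)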


class ShapleySignificanceEngine:
    """
    Compute Shapley values for dual-perspective significance analysis.

    Supports two modes:
    1. Explanations: Why is this sample archetypal/prototypical/stereotypical?
    2. Formative: Which samples create the archetypal/prototypical/stereotypical structure?
    """

    def __init__(
        self,
        n_permutations: int = 100,
        random_state: int = 42,
        n_jobs: int = -1,
        early_stopping_patience: int = 10,
        early_stopping_tolerance: float = 0.01,
        verbose: bool = False
    ):
        self.n_permutations = n_permutations
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_tolerance = early_stopping_tolerance
        self.verbose = verbose
        self.rng = np.random.RandomState(random_state)

    def compute_shapley_values(
        self,
        X: np.ndarray,
        value_function: Callable,
        value_function_name: str = "unknown",
        context: Optional[Dict] = None
    ) -> Tuple[np.ndarray, Dict]:
        """
        Compute Shapley values using specified value function.

        OPTIMIZED: Uses shared memory for parallel processing to avoid data copying.
        MEMORY OPTIMIZED: Cleanup batch results immediately after accumulation.
        """
        n_samples, n_features = X.shape

        if self.verbose:
            print(f"\n  Computing {value_function_name}...")
            print(f"    Samples: {n_samples}, Features: {n_features}")
            print(f"    Max permutations: {self.n_permutations}")

        early_stop = ShapleyEarlyStopping(
            patience=self.early_stopping_patience,
            tolerance=self.early_stopping_tolerance
        )

        shapley_sum = np.zeros((n_samples, n_features), dtype=np.float64)
        n_perms_used = 0

        batch_size = max(1, self.n_permutations // 10)
        info = {'converged': False, 'mean_rel_change': 0.0}

        # OPTIMIZED: Decide parallelization strategy based on data size
        use_parallel = self.n_jobs != 1 and n_samples >= 20

        # OPTIMIZED: For small datasets or single-threaded, use direct computation
        if not use_parallel:
            for batch_start in range(0, self.n_permutations, batch_size):
                batch_end = min(batch_start + batch_size, self.n_permutations)
                batch_perms = [self.rng.permutation(n_samples) for _ in range(batch_end - batch_start)]

                for perm in batch_perms:
                    shapley_contrib = self._process_single_permutation(perm, X, value_function, context)
                    shapley_sum += shapley_contrib
                    n_perms_used += 1

                # MEMORY CLEANUP: Free batch permutations immediately
                _cleanup_memory(batch_perms)

                current_estimate = shapley_sum / n_perms_used
                should_stop, info = early_stop.update(current_estimate, n_perms_used)

                if should_stop and n_perms_used >= 50:
                    if self.verbose:
                        print(f"    Early stop at {n_perms_used} perms (change: {info['mean_rel_change']:.6f})")
                    break
        else:
            # OPTIMIZED: Use threading backend for better memory sharing
            for batch_start in range(0, self.n_permutations, batch_size):
                batch_end = min(batch_start + batch_size, self.n_permutations)
                batch_perms = [self.rng.permutation(n_samples) for _ in range(batch_end - batch_start)]

                # Use threading backend for shared memory access
                batch_results = Parallel(
                    n_jobs=self.n_jobs,
                    backend='threading',
                    verbose=0
                )(
                    delayed(self._process_single_permutation)(perm, X, value_function, context)
                    for perm in batch_perms
                )

                # Accumulate results efficiently
                for shapley_contrib in batch_results:
                    shapley_sum += shapley_contrib
                    n_perms_used += 1

                # MEMORY CLEANUP: Free batch results and permutations immediately
                _cleanup_memory(batch_results, batch_perms)

                current_estimate = shapley_sum / n_perms_used
                should_stop, info = early_stop.update(current_estimate, n_perms_used)

                if should_stop and n_perms_used >= 50:
                    if self.verbose:
                        print(f"    Early stop at {n_perms_used} perms (change: {info['mean_rel_change']:.6f})")
                    break

        Phi = shapley_sum / n_perms_used

        # Verify additivity
        all_indices = np.arange(n_samples)
        if context is not None:
            total_actual = value_function(X, all_indices, context)
        else:
            total_actual = value_function(X, all_indices)

        total_from_shapley = np.sum(Phi)
        additivity_error = abs(total_from_shapley - total_actual) / (abs(total_actual) + 1e-12)

        info = {
            'n_permutations_used': n_perms_used,
            'converged': info.get('converged', False) if n_perms_used < self.n_permutations else True,
            'mean_rel_change': info.get('mean_rel_change', 0.0),
            'additivity_error': float(additivity_error),
            'total_shapley': float(total_from_shapley),
            'total_actual': float(total_actual)
        }

        if self.verbose:
            print(f"    ✓ {n_perms_used} perms, additivity error: {additivity_error:.6f}")

        # MEMORY CLEANUP: Free shapley_sum before returning Phi (they're different objects)
        _cleanup_memory(shapley_sum)

        return Phi, info


    def compute_feature_shapley_values(
        self,
        X: np.ndarray,
        value_function: Callable,
        value_function_name: str = "unknown",
        context: Optional[Dict] = None
    ) -> Tuple[np.ndarray, Dict]:
        """
        Compute feature-level Shapley values for each sample.

        OPTIMIZED: Uses threading backend for better memory sharing.
        """
        n_samples, n_features = X.shape

        if self.verbose:
            print(f"\n  Computing feature-level {value_function_name}...")
            print(f"    Samples: {n_samples}, Features: {n_features}")
            print(f"    Max permutations: {self.n_permutations}")

        early_stop = ShapleyEarlyStopping(
            patience=self.early_stopping_patience,
            tolerance=self.early_stopping_tolerance
        )

        shapley_sum = np.zeros((n_samples, n_features), dtype=np.float64)
        n_perms_used = 0

        batch_size = max(1, self.n_permutations // 10)
        info = {'converged': False, 'mean_rel_change': 0.0}

        # OPTIMIZED: Decide parallelization strategy
        use_parallel = self.n_jobs != 1 and n_features >= 10

        for batch_start in range(0, self.n_permutations, batch_size):
            batch_end = min(batch_start + batch_size, self.n_permutations)

            # Generate feature permutations for this batch
            batch_perms = [self.rng.permutation(n_features) for _ in range(batch_end - batch_start)]

            # Process each sample
            for sample_idx in range(n_samples):
                if use_parallel:
                    # OPTIMIZED: Threading backend for memory sharing
                    batch_results = Parallel(
                        n_jobs=self.n_jobs,
                        backend='threading',
                        verbose=0
                    )(
                        delayed(self._process_feature_permutation)(
                            sample_idx, perm, X, value_function, value_function_name, context
                        )
                        for perm in batch_perms
                    )
                else:
                    # Direct computation for small problems
                    batch_results = [
                        self._process_feature_permutation(
                            sample_idx, perm, X, value_function, value_function_name, context
                        )
                        for perm in batch_perms
                    ]

                for shapley_contrib in batch_results:
                    shapley_sum[sample_idx, :] += shapley_contrib

            n_perms_used += len(batch_perms)

            current_estimate = shapley_sum / n_perms_used
            should_stop, info = early_stop.update(current_estimate, n_perms_used)

            if should_stop and n_perms_used >= 50:
                if self.verbose:
                    print(f"    Early stop at {n_perms_used} perms (change: {info['mean_rel_change']:.6f})")
                break

        Phi = shapley_sum / n_perms_used

        # Compute additivity error
        total_errors = []
        for sample_idx in range(n_samples):
            shapley_total = np.sum(Phi[sample_idx, :])
            if context is not None:
                actual_value = value_function(X[sample_idx:sample_idx+1, :], np.array([sample_idx]), context)
            else:
                actual_value = value_function(X[sample_idx:sample_idx+1, :], np.array([sample_idx]))

            error = abs(shapley_total - actual_value) / (abs(actual_value) + 1e-12)
            total_errors.append(error)

        additivity_error = np.mean(total_errors)

        info_out = {
            'n_permutations_used': n_perms_used,
            'converged': info.get('converged', False) if n_perms_used < self.n_permutations else True,
            'mean_rel_change': info.get('mean_rel_change', 0.0),
            'additivity_error': float(additivity_error)
        }

        if self.verbose:
            print(f"    {n_perms_used} perms, mean additivity error: {additivity_error:.6f}")

        return Phi, info_out


    def _process_single_permutation(
        self,
        perm: np.ndarray,
        X: np.ndarray,
        value_function: Callable,
        context: Optional[Dict]
    ) -> np.ndarray:
        """
        Process one permutation to compute marginal contributions.

        OPTIMIZED: Delegates to JIT-compiled helper for massive speedup.
        """
        n_samples, n_features = X.shape
        shapley_contrib = np.zeros((n_samples, n_features), dtype=np.float64)

        # Compute all value function calls first (can't JIT this part due to callable)
        values = np.zeros(n_samples + 1, dtype=np.float64)
        values[0] = 0.0

        for j in range(n_samples):
            subset_indices = perm[:j+1]
            X_subset = X[subset_indices]

            if context is not None:
                values[j+1] = value_function(X_subset, subset_indices, context)
            else:
                values[j+1] = value_function(X_subset, subset_indices)

        # Now use JIT-compiled function to compute marginal contributions
        shapley_contrib = _compute_marginals_jit(perm, values, n_samples, n_features)

        return shapley_contrib

    def _process_feature_permutation(
        self,
        sample_idx: int,
        perm: np.ndarray,
        X: np.ndarray,
        value_function: Callable,
        metric_name: str,
        context: Optional[Dict] = None
    ) -> np.ndarray:
        """
        Process one feature permutation for a single sample to compute per-feature contributions.

        OPTIMIZED: Uses JIT-compiled helper for faster computation.
        """
        n_features = X.shape[1]
        shapley_contrib = np.zeros(n_features, dtype=np.float64)

        # Compute all value function calls first (can't JIT this part)
        values = np.zeros(n_features + 1, dtype=np.float64)
        values[0] = 0.0

        for j in range(n_features):
            feature_subset = perm[:j+1]
            X_sample_subset = X[sample_idx:sample_idx+1, :][:, feature_subset]

            if context is not None:
                values[j+1] = value_function(X_sample_subset, np.array([sample_idx]), context)
            else:
                values[j+1] = value_function(X_sample_subset, np.array([sample_idx]))

        # Use JIT-compiled function to compute marginals
        shapley_contrib = _compute_feature_marginals_jit(perm, values, n_features)

        return shapley_contrib

# ============================================================
# Value Functions for Formative Instance Discovery
# ============================================================

def formative_archetypal_convex_hull(
    X_subset: np.ndarray,
    indices: np.ndarray,
    context: Optional[Dict] = None
) -> float:
    """
    Archetypal formative value function: Convex hull volume.

    Samples that expand the convex hull boundary are formative archetypes.

    SAFE: Falls back to range-based metric in high dimensions to avoid segfaults.
    """
    if len(X_subset) < 3:
        return 0.0

    n_samples, n_features = X_subset.shape

    # CRITICAL FIX: ConvexHull segfaults in high dimensions (>20D)
    # Always use safe fallback for high-dimensional data
    if n_features > 20 or ConvexHull is None or n_samples < n_features + 1:
        # Safe fallback: Feature range coverage (no segfault risk)
        ranges = X_subset.max(axis=0) - X_subset.min(axis=0)
        return float(np.prod(ranges + 1e-10))  # Product of ranges (volume proxy)

    # Low dimensions: Try ConvexHull with safety wrapper
    try:
        # Ensure data is float64 for numerical stability
        X_clean = np.asarray(X_subset, dtype=np.float64)

        # Remove duplicate points (causes ConvexHull to fail)
        X_unique = np.unique(X_clean, axis=0)

        if len(X_unique) < n_features + 1:
            # Not enough unique points for hull in this dimension
            ranges = X_unique.max(axis=0) - X_unique.min(axis=0)
            return float(np.prod(ranges + 1e-10))

        hull = ConvexHull(X_unique)
        return float(hull.volume)

    except Exception:
        # ConvexHull failed - use safe fallback
        ranges = X_subset.max(axis=0) - X_subset.min(axis=0)
        return float(np.prod(ranges + 1e-10))
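

# Illustrative sketch (assumed example, not part of the module): on the four
# corners of the unit square the exact hull area is 1.0, and the range-product
# fallback used above for high dimensions or degenerate subsets gives
# essentially the same value (1 + eps per axis).
def _example_hull_value_on_unit_square() -> float:
    corners = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
    return formative_archetypal_convex_hull(corners, np.arange(4))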


def formative_prototypical_coverage(
    X_subset: np.ndarray,
    indices: np.ndarray,
    context: Optional[Dict] = None
) -> float:
    """
    Prototypical formative value function: Coverage/representativeness.

    Samples that maximize pairwise similarity coverage are formative prototypes.
    """
    if len(X_subset) < 2:
        return 0.0

    # L2 normalize
    norms = np.linalg.norm(X_subset, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    X_l2 = X_subset / norms

    # Pairwise cosine similarities
    similarities = X_l2 @ X_l2.T
    np.fill_diagonal(similarities, 0)

    if similarities.size == 0:
        return 0.0

    max_sims = np.max(similarities, axis=1) if similarities.shape[0] > 0 else np.array([0.0])
    return float(np.mean(max_sims))


def formative_stereotypical_extremeness(
    X_subset: np.ndarray,
    indices: np.ndarray,
    context: Dict
) -> float:
    """
    Stereotypical formative value function: Extremeness from median.

    Samples that pull the distribution toward the target are formative stereotypes.
    """
    if len(X_subset) == 0:
        return 0.0

    target_values = context['target_values']
    target = context['target']
    median = context.get('median', np.median(target_values))

    subset_vals = target_values[indices]

    if target == 'max':
        # How far above median?
        extremeness = np.mean(np.maximum(subset_vals - median, 0))
    elif target == 'min':
        # How far below median?
        extremeness = np.mean(np.maximum(median - subset_vals, 0))
    else:
        # How much closer to target than median?
        target_val = float(target)
        median_dist = abs(median - target_val)
        subset_dist = np.mean(np.abs(subset_vals - target_val))
        extremeness = median_dist - subset_dist

    return float(extremeness)
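

# Illustrative sketch (assumed example, not part of the module): estimating
# formative "prototypical" instances by running the permutation-sampling engine
# above with the coverage value function. Row sums of Phi approximate each
# sample's Shapley contribution to overall coverage.
def _example_formative_coverage_shapley():
    rng = np.random.RandomState(0)
    X = rng.rand(15, 4)
    engine = ShapleySignificanceEngine(n_permutations=20, random_state=0, n_jobs=1, verbose=False)
    Phi, info = engine.compute_shapley_values(
        X,
        value_function=formative_prototypical_coverage,
        value_function_name="prototypical_coverage",
    )
    formative_scores = Phi.sum(axis=1)          # one score per sample
    ranking = np.argsort(-formative_scores)     # most formative first
    return ranking, info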


# ============================================================
# [F] DataTypical API
# ============================================================
@dataclass
class DataTypical:
    # ---- Core Config ----
    nmf_rank: int = 8
    n_prototypes: int = 20
    scale: str = "minmax"
    distance_metric: str = "euclidean"
    similarity_metric: str = "cosine"
    deterministic: bool = True
    n_jobs: int = -1
    max_iter_nmf: int = 400
    tol_nmf: float = 1e-4
    feature_weights: Optional[np.ndarray] = None
    speed_mode: bool = False
    dtype: str = "float32"
    random_state: int = 42
    max_memory_mb: int = 2048
    return_ranks_only: bool = False
    auto_n_prototypes: Optional[str] = None
    verbose: bool = False
    max_missing_frac: float = 1.0

    # ---- Stereotype Configuration (NEW in v0.4) ----
    stereotype_column: Optional[str] = None
    stereotype_target: Union[str, float] = "max"
    label_columns: Optional[List[str]] = None
    stereotype_keywords: Optional[List[str]] = None
    graph_topology_features: Optional[List[str]] = None

    # ---- Data Type Configuration (NEW in v0.5) ----
    data_type: Optional[str] = None

    # ---- Shapley Configuration (NEW in v0.6) ----
    shapley_mode: bool = False
    shapley_n_permutations: int = 100
    shapley_top_n: Optional[Union[int, float]] = None  # CHANGED: Now supports float
    shapley_early_stopping_patience: int = 10
    shapley_early_stopping_tolerance: float = 0.01
    shapley_compute_formative: Optional[bool] = None  # NEW in v0.7: None = auto from fast_mode

    # ---- Performance Mode (NEW in v0.7) ----
    fast_mode: bool = False
    archetypal_method: Optional[str] = None

    # ---- Artifacts ----
    W_: Optional[np.ndarray] = field(default=None, init=False)
    H_: Optional[np.ndarray] = field(default=None, init=False)
    reconstruction_error_: Optional[float] = field(default=None, init=False)

    n_archetypes_: Optional[int] = field(default=None, init=False)
    prototype_indices_: Optional[np.ndarray] = field(default=None, init=False)
    prototype_rows_: Optional[np.ndarray] = field(default=None, init=False)
    marginal_gains_: Optional[np.ndarray] = field(default=None, init=False)
    assignments_: Optional[np.ndarray] = field(default=None, init=False)
    coverage_: Optional[np.ndarray] = field(default=None, init=False)
    knee_: Optional[int] = field(default=None, init=False)

    scaler_: Optional[MinMaxScaler] = field(default=None, init=False)
    vectorizer_: Optional[TfidfVectorizer] = field(default=None, init=False)
    nmf_model_: Optional[NMF] = field(default=None, init=False)

    settings_: Dict = field(default_factory=dict, init=False)
    ideals_: Dict[str, np.ndarray] = field(default_factory=dict, init=False)
    dropped_columns_: List[str] = field(default_factory=list, init=False)
    missingness_: Dict[str, float] = field(default_factory=dict, init=False)
    train_index_: Optional[pd.Index] = field(default=None, init=False)

    # Feature selection for tables (numeric-only)
    feature_columns_: Optional[List[str]] = field(default=None, init=False)
    impute_median_: Optional[np.ndarray] = field(default=None, init=False)
    keep_mask_: Optional[np.ndarray] = field(default=None, init=False)

    # NEW in v0.4: Stereotype artifacts
    _df_original_fit: Optional[pd.DataFrame] = field(default=None, init=False)
    label_df_: Optional[pd.DataFrame] = field(default=None, init=False)
    text_metadata_: Optional[pd.DataFrame] = field(default=None, init=False)
    stereotype_keyword_scores_: Optional[np.ndarray] = field(default=None, init=False)
    graph_topology_df_: Optional[pd.DataFrame] = field(default=None, init=False)

    # Data type detection (NEW in v0.5)
    _detected_data_type: Optional[str] = field(default=None, init=False)

    # ---- Shapley Artifacts (NEW in v0.6) ----
    Phi_archetypal_explanations_: Optional[np.ndarray] = field(default=None, init=False)
    Phi_prototypical_explanations_: Optional[np.ndarray] = field(default=None, init=False)
    Phi_stereotypical_explanations_: Optional[np.ndarray] = field(default=None, init=False)

    Phi_archetypal_formative_: Optional[np.ndarray] = field(default=None, init=False)
    Phi_prototypical_formative_: Optional[np.ndarray] = field(default=None, init=False)
    Phi_stereotypical_formative_: Optional[np.ndarray] = field(default=None, init=False)

    shapley_info_: Dict = field(default_factory=dict, init=False)
    _stereotype_source_fit_: Optional[pd.Series] = field(default=None, init=False)


    # --------------------------
    # Auto-Detection and Routing
    # --------------------------

    def _auto_detect_data_type(self, X, **kwargs) -> str:
        """
        Auto-detect data type based on input format.

        Priority:
        1. Graph: If edges/edge_index parameter present
        2. Text: If X is list/tuple of strings
        3. Tabular: If X is DataFrame or array

        Parameters
        ----------
        X : various
            Input data
        **kwargs
            Additional parameters (checked for edges/edge_index)

        Returns
        -------
        data_type : str
            One of 'graph', 'text', 'tabular'

        Raises
        ------
        ValueError
            If data type cannot be determined
        """
        # Priority 1: Graph (edges parameter indicates graph data)
        if 'edges' in kwargs or 'edge_index' in kwargs:
            return 'graph'

        # Priority 2: Text (list/tuple of strings)
        if isinstance(X, (list, tuple)):
            if len(X) > 0 and isinstance(X[0], str):
                return 'text'

        # Priority 3: Tabular (DataFrame or array)
        if isinstance(X, (pd.DataFrame, np.ndarray)):
            return 'tabular'

        # Cannot determine
        raise ValueError(
            f"Cannot auto-detect data type from input of type {type(X)}. "
            f"Supported formats: DataFrame/array (tabular), list of strings (text), "
            f"or provide edges parameter (graph). "
            f"Alternatively, specify data_type='tabular'/'text'/'graph' explicitly."
        )

    def _validate_data_type(self, detected: str) -> str:
        """
        Validate and resolve data_type configuration.

        If data_type is specified in config, validate it matches expected values.
        Otherwise use auto-detected type.

        Parameters
        ----------
        detected : str
            Auto-detected data type

        Returns
        -------
        data_type : str
            Final data type to use

        Raises
        ------
        ValueError
            If configured data_type is invalid
        """
        if self.data_type is not None:
            # Manual override provided
            valid_types = {'tabular', 'text', 'graph'}
            if self.data_type not in valid_types:
                raise ValueError(
                    f"Invalid data_type='{self.data_type}'. "
                    f"Must be one of {valid_types} or None (auto-detect)."
                )
            if self.verbose:
                if detected != self.data_type:
                    print(f"Using configured data_type='{self.data_type}' "
                          f"(auto-detected: '{detected}')")
            return self.data_type
        else:
            # Use auto-detected
            if self.verbose:
                print(f"Auto-detected data_type: '{detected}'")
            return detected

    def _apply_fast_mode_defaults(self) -> None:
        """
        Apply fast_mode preset defaults if parameters not explicitly set.

        fast_mode=True: Exploration (NMF + explanations only + subsample)
        fast_mode=False: Publication (AA + formative + full dataset)

        Users can override any individual parameter by setting it explicitly.
        """
        if self.fast_mode:
            # Fast mode defaults (exploration)
            if self.archetypal_method is None:
                self.archetypal_method = 'nmf'

            # Reduce Shapley permutations for speed
            if self.shapley_n_permutations == 100:  # Default value, not overridden
                self.shapley_n_permutations = 30

            # Subsample explanations to top 50%
            if self.shapley_top_n is None:
                self.shapley_top_n = 0.5  # 50% of instances

            # Skip formative in fast mode (explanations only)
            if self.shapley_compute_formative is None:
                self.shapley_compute_formative = False

        else:
            # Publication mode defaults (rigorous)
            if self.archetypal_method is None:
                self.archetypal_method = 'aa'  # True archetypal analysis

            # Keep shapley_n_permutations=100 (default)
            # Keep shapley_top_n=None (compute for all instances)
            # Compute formative in publication mode
            if self.shapley_compute_formative is None:
                self.shapley_compute_formative = True

        # Validate archetypal_method
        if self.archetypal_method not in ['nmf', 'aa']:
            raise ValueError(
                f"archetypal_method must be 'nmf' or 'aa', got '{self.archetypal_method}'"
            )

        if self.verbose:
            mode_name = "Fast" if self.fast_mode else "Publication"
            print(f"\n{mode_name} mode defaults:")
            print(f"  archetypal_method: {self.archetypal_method}")
            print(f"  shapley_n_permutations: {self.shapley_n_permutations}")
            print(f"  shapley_top_n: {self.shapley_top_n if self.shapley_top_n else 'all instances'}")
            print(f"  shapley_compute_formative: {self.shapley_compute_formative}")
|
|
1378
|
+
|
|
1379
|
+
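    # --- Illustrative sketch (not part of the package) -----------------------
    # The two presets above only fill in parameters the user left unset. A
    # minimal, hypothetical comparison (assumes a numeric DataFrame `df`;
    # the variable names are made up for illustration):
    #
    #   >>> explore = DataTypical(fast_mode=True, shapley_mode=True)
    #   >>> _ = explore.fit_transform(df)   # NMF, 30 permutations, top-50% explanations
    #   >>> publish = DataTypical(fast_mode=False, shapley_mode=True)
    #   >>> _ = publish.fit_transform(df)   # AA, 100 permutations, formative instances
    #
    # An explicitly set value (e.g. shapley_n_permutations=50) is left
    # untouched by either preset.
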
    # --------------------------
    # Unified Interface
    # --------------------------
    def fit(self, X: Union[pd.DataFrame, np.ndarray, List[str]], **kwargs):
        """
        Fit DataTypical on input data (auto-detects format).

        Automatically detects whether input is tabular, text, or graph data
        based on format and parameters. Can be overridden with data_type parameter.

        Parameters
        ----------
        X : DataFrame, array, or list of strings
            Input data:
            - Tabular: DataFrame or 2D array
            - Text: List of string documents
            - Graph: Node features (with edges parameter)

        **kwargs : optional
            Additional parameters for specific data types:

            For text:
                vectorizer : str, default 'tfidf'
                text_metadata : pd.DataFrame, optional

            For graph:
                edges : np.ndarray (required for graph detection)
                    Edge list as (2, n_edges) or (n_edges, 2)
                edge_index : np.ndarray (alias for edges)
                compute_topology : bool, default True

        Returns
        -------
        self : DataTypical
            Fitted estimator

        Examples
        --------
        >>> # Tabular (auto-detected)
        >>> dt = DataTypical()
        >>> dt.fit(dataframe)

        >>> # Text (auto-detected from list of strings)
        >>> dt = DataTypical(stereotype_keywords=['protein'])
        >>> dt.fit(corpus)

        >>> # Graph (auto-detected from edges parameter)
        >>> dt = DataTypical(graph_topology_features=['degree'])
        >>> dt.fit(node_features, edges=edge_list)

        >>> # Manual override
        >>> dt = DataTypical(data_type='tabular')
        >>> dt.fit(data)
        """
        # Apply fast_mode defaults (if not already applied)
        if not hasattr(self, '_fast_mode_applied'):
            self._apply_fast_mode_defaults()
            self._fast_mode_applied = True

        # Auto-detect data type
        detected = self._auto_detect_data_type(X, **kwargs)

        # Validate and resolve final type
        final_type = self._validate_data_type(detected)
        self._detected_data_type = final_type

        # Route to appropriate internal method
        if final_type == 'tabular':
            return self._fit_tabular(X)
        elif final_type == 'text':
            vectorizer = kwargs.get('vectorizer', 'tfidf')
            text_metadata = kwargs.get('text_metadata', None)
            return self._fit_text(X, vectorizer, text_metadata)
        elif final_type == 'graph':
            edges = kwargs.get('edges', kwargs.get('edge_index', None))
            compute_topology = kwargs.get('compute_topology', True)
            return self._fit_graph(X, edges, compute_topology)
        else:
            raise RuntimeError(f"Unknown data type: {final_type}")

    def transform(self, X: Union[pd.DataFrame, np.ndarray, List[str]], **kwargs):
        """
        Transform data using fitted model (uses detected format from fit).

        Parameters
        ----------
        X : DataFrame, array, or list of strings
            Input data (same format as used in fit)
        **kwargs : optional
            Additional parameters (same as fit)

        Returns
        -------
        results : pd.DataFrame
            Significance rankings and diagnostics

        Examples
        --------
        >>> dt = DataTypical()
        >>> dt.fit(train_data)
        >>> results = dt.transform(test_data)
        """
        if self._detected_data_type is None:
            raise RuntimeError("Model not fitted. Call fit() first.")

        return_ranks_only = kwargs.get('return_ranks_only', self.return_ranks_only)

        if self._detected_data_type == 'tabular':
            return self._transform_tabular(X, return_ranks_only)
        elif self._detected_data_type == 'text':
            return self._transform_text(X, return_ranks_only)
        elif self._detected_data_type == 'graph':
            # Graph transform needs to recompute topology if edges provided
            edges = kwargs.get('edges', kwargs.get('edge_index', None))
            return self._transform_graph(X, edges, return_ranks_only)
        else:
            raise RuntimeError(f"Unknown detected type: {self._detected_data_type}")

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray, List[str]],
        return_ranks_only: Optional[bool] = None,
        **kwargs
    ) -> pd.DataFrame:
        """
        Fit and transform in one step (auto-detects format).

        Parameters
        ----------
        X : DataFrame, array, or list of strings
            Input data
        return_ranks_only : bool, optional
            If True, return only rank columns
        **kwargs : optional
            Additional parameters (see fit() for details)

        Returns
        -------
        results : pd.DataFrame
            Significance rankings and diagnostics

        Examples
        --------
        >>> # Tabular
        >>> dt = DataTypical()
        >>> results = dt.fit_transform(data)

        >>> # Text
        >>> dt = DataTypical(stereotype_keywords=['keyword'])
        >>> results = dt.fit_transform(corpus)

        >>> # Graph
        >>> dt = DataTypical(graph_topology_features=['degree'])
        >>> results = dt.fit_transform(node_features, edges=edges)
        """
        self.fit(X, **kwargs)
        if return_ranks_only is not None:
            kwargs['return_ranks_only'] = return_ranks_only
        return self.transform(X, **kwargs)

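    # --- Illustrative sketch (not part of the package) -----------------------
    # A minimal end-to-end run with the dual perspective enabled, assuming a
    # numeric DataFrame `df` with a 'toxicity' column (hypothetical names):
    #
    #   >>> dt = DataTypical(shapley_mode=True,
    #   ...                  stereotype_column='toxicity',
    #   ...                  stereotype_target='max')
    #   >>> results = dt.fit_transform(df, return_ranks_only=True)
    #   >>> results.nlargest(5, 'archetypal_rank')          # samples that ARE archetypal
    #   >>> results.nlargest(5, 'archetypal_shapley_rank')  # samples that MAKE the data archetypal
    #
    # The *_rank columns come from the geometric scoring; the *_shapley_rank
    # columns come from the formative-instance Shapley analysis below.
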
    # --------------------------
    # Internal Methods (Type-Specific)
    # --------------------------
    def _fit_tabular(self, X: Union[pd.DataFrame, np.ndarray]):
        """Internal method for fitting tabular data."""
        self._validate_stereotype_config()
        df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X))
        self.train_index_ = df.index.copy()
        with _ThreadControl(self.deterministic and not self.speed_mode) as tc:
            _seed_everything(self.random_state)
            X_scaled, X_l2 = self._preprocess_table_fit(df)
            self._fit_components(X_scaled, X_l2, df.index)

            # Store stereotype source for Shapley
            if self.stereotype_column is not None and self.shapley_mode:
                self._stereotype_source_fit_ = self._get_stereotype_source_table(df)

            # NEW: Shapley analysis
            if self.shapley_mode:
                if self.verbose:
                    print("\n" + "="*70)
                    print("SHAPLEY DUAL-PERSPECTIVE ANALYSIS")
                    print("="*70)
                self._fit_shapley_dual_perspective(X_scaled, X_l2, df.index)
            self._record_settings(tc)
        return self

    def _fit_text(
        self,
        corpus: Union[List[str], Iterable[str]],
        vectorizer: str = "tfidf",
        text_metadata: Optional[pd.DataFrame] = None
    ):
        """Internal method for fitting text data."""
        self._validate_stereotype_config()
        with _ThreadControl(self.deterministic and not self.speed_mode) as tc:
            _seed_everything(self.random_state)
            X_scaled, X_l2 = self._preprocess_text_fit(corpus, vectorizer, text_metadata)
            idx = pd.RangeIndex(X_scaled.shape[0])
            self.train_index_ = idx
            self._fit_components(X_scaled, X_l2, idx)
            self._record_settings(tc)
        return self

    def _fit_graph(
        self,
        node_features: Union[pd.DataFrame, np.ndarray],
        edges: Optional[np.ndarray] = None,
        compute_topology: bool = True
    ):
        """Internal method for fitting graph data."""
        # Convert to DataFrame
        if isinstance(node_features, pd.DataFrame):
            df = node_features.copy()
        else:
            df = pd.DataFrame(node_features)

        n_nodes = len(df)

        # Compute topology features if edges provided
        self.graph_topology_df_ = None
        if edges is not None and compute_topology:
            topology_df = self._compute_graph_topology_features(edges, n_nodes)
            self.graph_topology_df_ = topology_df

            # Append to node features
            for col in topology_df.columns:
                if col not in df.columns:
                    df[col] = topology_df[col].values
                else:
                    warnings.warn(f"Topology feature '{col}' already exists, skipping")

        # Delegate to tabular processing
        return self._fit_tabular(df)

    def _transform_tabular(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        return_ranks_only: bool = False
    ) -> pd.DataFrame:
        """Internal method for transforming tabular data."""
        df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X))
        with _ThreadControl(self.deterministic and not self.speed_mode):
            X_scaled, X_l2 = self._preprocess_table_transform(df)

            # Get stereotype source for transform
            stereotype_source = None
            if self.stereotype_column is not None:
                stereotype_source = self._get_stereotype_source_table(df)

            ranks = self._score_with_fitted(X_scaled, X_l2, df.index, stereotype_source)

            # Add Shapley rankings (including None columns if formative skipped)
            if self.shapley_mode:
                shapley_ranks = self._compute_shapley_formative_ranks()
                ranks = pd.concat([ranks, shapley_ranks], axis=1)

            if return_ranks_only:
                return ranks
            out = df.copy()
            for col in ranks.columns:
                out[col] = ranks[col]
            return out

    def _transform_text(
        self,
        corpus: Union[List[str], Iterable[str]],
        return_ranks_only: bool = False
    ) -> pd.DataFrame:
        """Internal method for transforming text data."""
        with _ThreadControl(self.deterministic and not self.speed_mode):
            X_scaled, X_l2 = self._preprocess_text_transform(corpus)
            idx = pd.RangeIndex(X_scaled.shape[0])

            # Get stereotype source (priority: metadata column > keywords > None)
            stereotype_source = None

            # Priority 1: Metadata column (from fit_text)
            if self.stereotype_column is not None and self.text_metadata_ is not None:
                if self.stereotype_column in self.text_metadata_.columns:
                    stereotype_source = self.text_metadata_[self.stereotype_column]

            # Priority 2: Keyword scores (recompute on new corpus)
            elif self.stereotype_keywords is not None:
                corpus_list = list(corpus)
                X_tfidf = self.vectorizer_.transform(corpus_list)
                keyword_scores = self._compute_keyword_scores(
                    X_tfidf, corpus_list, self.stereotype_keywords
                )
                stereotype_source = pd.Series(keyword_scores)

            ranks = self._score_with_fitted(X_scaled, X_l2, idx, stereotype_source)

            # Add Shapley rankings (including None columns if formative skipped)
            if self.shapley_mode:
                shapley_ranks = self._compute_shapley_formative_ranks()
                ranks = pd.concat([ranks, shapley_ranks], axis=1)

            return ranks

    def _transform_graph(
        self,
        node_features: Union[pd.DataFrame, np.ndarray],
        edges: Optional[np.ndarray] = None,
        return_ranks_only: bool = False
    ) -> pd.DataFrame:
        """Internal method for transforming graph data."""
        # Convert to DataFrame
        if isinstance(node_features, pd.DataFrame):
            df = node_features.copy()
        else:
            df = pd.DataFrame(node_features)

        n_nodes = len(df)

        # Recompute topology features if edges provided and model was trained with them
        if edges is not None and self.graph_topology_df_ is not None:
            topology_df = self._compute_graph_topology_features(edges, n_nodes)

            # Append to node features
            for col in topology_df.columns:
                if col not in df.columns:
                    df[col] = topology_df[col].values

        # Delegate to tabular transform (which handles Shapley ranks)
        return self._transform_tabular(df, return_ranks_only)

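    # --- Illustrative sketch (not part of the package) -----------------------
    # How the graph path augments node features before delegating to the
    # tabular path; a tiny hypothetical example (4 nodes, undirected edges):
    #
    #   >>> import numpy as np, pandas as pd
    #   >>> node_features = pd.DataFrame({'x0': [0.1, 0.9, 0.5, 0.3]})
    #   >>> edges = np.array([[0, 1], [1, 2], [2, 3]])   # (n_edges, 2)
    #   >>> dt = DataTypical(graph_topology_features=['degree', 'clustering'])
    #   >>> out = dt.fit_transform(node_features, edges=edges)
    #
    # Internally 'degree' and 'clustering' columns are appended to the node
    # features (unless columns of those names already exist), and the combined
    # table is ranked exactly like any other tabular input.
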
    # ============================================================
    # Shapley Dual-Perspective Methods (NEW in v0.6)
    # ============================================================

    def _fit_shapley_dual_perspective(
        self,
        X_scaled: ArrayLike,
        X_l2: ArrayLike,
        index: pd.Index
    ) -> None:
        """
        Fit Shapley analysis with dual perspective:
        1. Explanations: Why is each sample significant? (always computed)
        2. Formative: Which samples create structure? (optional)

        MEMORY OPTIMIZED: Cleanup X_dense after Shapley computation.
        """
        X_dense = X_scaled.toarray() if (sp is not None and sp.isspmatrix(X_scaled)) \
            else np.asarray(X_scaled, dtype=np.float64)
        n_samples, n_features = X_dense.shape

        # Determine if we compute formative
        compute_formative = self.shapley_compute_formative if self.shapley_compute_formative is not None else True

        # SUBSAMPLE LOGIC: Only for explanations
        subsample_indices_explanations = None

        if self.shapley_top_n is not None:
            # Support both fraction and absolute count
            if isinstance(self.shapley_top_n, float) and 0 < self.shapley_top_n < 1:
                n_subsample = max(1, int(self.shapley_top_n * n_samples))
            else:
                n_subsample = int(self.shapley_top_n)

            if n_subsample < n_samples:
                if self.verbose:
                    print(f"\n[Subsampling] Selecting top {n_subsample} samples per metric")
                    if compute_formative:
                        print("  Formative: Full dataset (required for structure)")
                    else:
                        print("  Formative: SKIPPED (fast_mode)")

                # Get correct stereotype source for ranking
                stereotype_source = self._stereotype_source_fit_ if hasattr(self, '_stereotype_source_fit_') else None
                temp_results = self._score_with_fitted(X_scaled, X_l2, index, stereotype_source)

                # Get top n_subsample for each metric separately
                top_arch = set(temp_results.nlargest(n_subsample, 'archetypal_rank').index)
                top_proto = set(temp_results.nlargest(n_subsample, 'prototypical_rank').index)
                top_stereo = set(temp_results.nlargest(n_subsample, 'stereotypical_rank').index)

                # Union of all top samples - NO TRIMMING!
                # This ensures all top-N samples from each metric have Shapley values
                top_indices_union = top_arch | top_proto | top_stereo

                if self.verbose:
                    print(f"  Top {n_subsample} archetypal samples: {len(top_arch)}")
                    print(f"  Top {n_subsample} prototypical samples: {len(top_proto)}")
                    print(f"  Top {n_subsample} stereotypical samples: {len(top_stereo)}")
                    print(f"  Union: {len(top_indices_union)} unique samples")
                    print(f"  (Computing Shapley for all union samples - ensures no empty plots)")

                # Identify core samples (appear in multiple metric top-N lists)
                # These get full permutations; secondary samples get reduced permutations
                sample_counts = {}
                for idx in top_indices_union:
                    count = sum([idx in top_arch, idx in top_proto, idx in top_stereo])
                    sample_counts[idx] = count

                # Core = samples in 2+ metrics (most important)
                core_samples_df_idx = [idx for idx, cnt in sample_counts.items() if cnt >= 2]
                core_positions = sorted([index.get_loc(idx) for idx in core_samples_df_idx])
                self._union_core_samples = np.array(core_positions)

                if self.verbose:
                    print(f"  Core samples (in 2+ metrics): {len(core_samples_df_idx)}")
                    print(f"  Secondary samples (in 1 metric): {len(top_indices_union) - len(core_samples_df_idx)}")

                # Convert to positional indices (deterministic order via sorting)
                top_positions = sorted([index.get_loc(idx) for idx in top_indices_union])
                subsample_indices_explanations = np.array(top_positions)

                # MEMORY CLEANUP
                _cleanup_memory(temp_results)

        # Initialize Shapley engine
        engine = ShapleySignificanceEngine(
            n_permutations=self.shapley_n_permutations,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            early_stopping_patience=self.shapley_early_stopping_patience,
            early_stopping_tolerance=self.shapley_early_stopping_tolerance,
            verbose=self.verbose
        )

        # PERSPECTIVE 1: Formative Instances (optional)
        if compute_formative:
            if self.verbose:
                print("\n[1] Computing Formative Instances (global perspective)...")
                print("    Using FULL dataset (required to measure structure)")

            # Formative archetypal (convex hull)
            self.Phi_archetypal_formative_, self.shapley_info_['archetypal_formative'] = \
                engine.compute_shapley_values(
                    X_dense,  # Always full dataset
                    formative_archetypal_convex_hull,
                    "Archetypal Formative (Convex Hull)"
                )

            # Formative prototypical (coverage)
            self.Phi_prototypical_formative_, self.shapley_info_['prototypical_formative'] = \
                engine.compute_shapley_values(
                    X_dense,
                    formative_prototypical_coverage,
                    "Prototypical Formative (Coverage)"
                )

            # Formative stereotypical (extremeness)
            if self.stereotype_column is not None and hasattr(self, '_stereotype_source_fit_'):
                target_values = self._stereotype_source_fit_.to_numpy(dtype=np.float64)
                context = {
                    'target_values': target_values,
                    'target': self.stereotype_target,
                    'median': np.median(target_values)
                }

                self.Phi_stereotypical_formative_, self.shapley_info_['stereotypical_formative'] = \
                    engine.compute_shapley_values(
                        X_dense,
                        formative_stereotypical_extremeness,
                        "Stereotypical Formative (Extremeness)",
                        context
                    )
            else:
                self.Phi_stereotypical_formative_ = None
        else:
            # Skip formative computation
            if self.verbose:
                print("\n[1] Skipping Formative Instances (fast_mode)")

            self.Phi_archetypal_formative_ = None
            self.Phi_prototypical_formative_ = None
            self.Phi_stereotypical_formative_ = None

        # PERSPECTIVE 2: Explanations (always computed, optionally subsampled)
        if self.verbose:
            print("\n[2] Computing Local Explanations (why is each sample significant)...")
            if subsample_indices_explanations is not None:
                print(f"    Computing for {len(subsample_indices_explanations)} samples (union of top-N per metric)")
            else:
                print(f"    Computing for all {n_samples} instances")

        self._fit_shapley_explanations(
            X_dense, X_l2, index, engine,
            subsample_indices_explanations
        )

        # MEMORY CLEANUP: Free X_dense copy (original X_scaled still needed)
        _cleanup_memory(X_dense, force_gc=True)

        if self.verbose:
            print("\n" + "="*70)
            if compute_formative:
                print("✓ Shapley Dual-Perspective Analysis Complete")
            else:
                print("✓ Shapley Explanations Complete (formative skipped)")
            print("="*70)

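    # --- Illustrative sketch (not part of the package) -----------------------
    # The subsampling above keeps the union of the per-metric top-N index sets
    # and marks rows appearing in 2+ sets as "core". A standalone pandas
    # equivalent, assuming a hypothetical `temp_results` frame with the three
    # rank columns and n_subsample = 3:
    #
    #   >>> cols = ('archetypal_rank', 'prototypical_rank', 'stereotypical_rank')
    #   >>> tops = [set(temp_results.nlargest(3, c).index) for c in cols]
    #   >>> union = set.union(*tops)                        # rows that get explanations
    #   >>> core = {i for i in union if sum(i in t for t in tops) >= 2}
    #   >>> secondary = union - core                        # half the permutation budget
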
    def _fit_shapley_explanations(
        self,
        X_dense: np.ndarray,
        X_l2: ArrayLike,
        index: pd.Index,
        engine: ShapleySignificanceEngine,
        subsample_indices: Optional[np.ndarray] = None
    ) -> None:
        """
        Compute Shapley explanations with optional subsampling.

        OPTIMIZED: Two-tier permutation strategy for union samples.
        """

        n_samples, n_features = X_dense.shape

        # Determine which samples to compute for
        if subsample_indices is not None:
            samples_to_compute = subsample_indices

            # OPTIMIZATION: Two-tier permutation strategy
            # If we have union samples, use full permutations only for "core" samples
            # Core = samples that appear in multiple metric top-N lists
            if hasattr(self, '_union_core_samples'):
                core_samples = self._union_core_samples
                secondary_samples = np.setdiff1d(samples_to_compute, core_samples)

                if self.verbose and len(secondary_samples) > 0:
                    print(f"  Two-tier permutation strategy:")
                    print(f"    Core samples ({len(core_samples)}): {engine.n_permutations} permutations")
                    print(f"    Secondary samples ({len(secondary_samples)}): {engine.n_permutations // 2} permutations")
            else:
                core_samples = samples_to_compute
                secondary_samples = np.array([])
        else:
            samples_to_compute = np.arange(n_samples)
            core_samples = samples_to_compute
            secondary_samples = np.array([])

        # Initialize full-size arrays (zeros for non-computed samples)
        self.Phi_archetypal_explanations_ = np.zeros((n_samples, n_features), dtype=np.float64)
        self.Phi_prototypical_explanations_ = np.zeros((n_samples, n_features), dtype=np.float64)
        self.Phi_stereotypical_explanations_ = np.zeros((n_samples, n_features), dtype=np.float64)

        # Value functions for explanations
        def explain_archetypal_features(X_subset, indices, ctx):
            """Archetypal score for single sample with feature subset."""
            if len(X_subset) == 0 or X_subset.shape[1] == 0:
                return 0.0
            dist_to_boundary = np.minimum(X_subset, 1.0 - X_subset)
            archetypal_contribution = np.mean(1.0 - 2.0 * dist_to_boundary, axis=1)
            return float(np.mean(archetypal_contribution))

        def explain_prototypical_features(X_subset, indices, ctx):
            """Prototypical score for single sample with feature subset."""
            if len(X_subset) == 0 or X_subset.shape[1] == 0:
                return 0.0
            return float(np.mean(np.var(X_subset, axis=1)))

        context = {'sample_mode': 'features'}

        # COMPUTE CORE SAMPLES (full permutations)
        if len(core_samples) > 0:
            if self.verbose:
                print(f"  Computing archetypal explanations (core: {len(core_samples)} samples)...")

            X_core = X_dense[core_samples, :]
            Phi_arch_core, info_arch = engine.compute_feature_shapley_values(
                X_core,
                explain_archetypal_features,
                "Archetypal Explanations (Core)",
                context
            )
            self.Phi_archetypal_explanations_[core_samples, :] = Phi_arch_core
            self.shapley_info_['archetypal_explanations'] = info_arch

            if self.verbose:
                print(f"  Computing prototypical explanations (core: {len(core_samples)} samples)...")

            Phi_proto_core, info_proto = engine.compute_feature_shapley_values(
                X_core,
                explain_prototypical_features,
                "Prototypical Explanations (Core)",
                context
            )
            self.Phi_prototypical_explanations_[core_samples, :] = Phi_proto_core
            self.shapley_info_['prototypical_explanations'] = info_proto

            # Stereotypical explanations (if applicable)
            if self.stereotype_column is not None:
                def explain_stereotypical_features(X_subset, indices, ctx):
                    if len(X_subset) == 0 or X_subset.shape[1] == 0:
                        return 0.0
                    if ctx.get('target_values') is None:
                        return 0.0

                    sample_idx = indices[0]
                    target_value = ctx['target_values'][sample_idx]
                    target = ctx['stereotype_target']

                    if isinstance(target, str):
                        median = ctx.get('median', np.median(ctx['target_values']))
                        if target == 'max':
                            distance = max(0, target_value - median)
                        elif target == 'min':
                            distance = max(0, median - target_value)
                        else:
                            distance = 0.0
                    else:
                        distance = -abs(target_value - target)

                    feature_contrib = float(np.mean(np.abs(X_subset)))
                    return distance * feature_contrib

                if self.verbose:
                    print(f"  Computing stereotypical explanations (core: {len(core_samples)} samples)...")

                context['stereotype_target'] = self.stereotype_target
                context['target_values'] = self._stereotype_source_fit_.to_numpy(dtype=np.float64) if hasattr(self, '_stereotype_source_fit_') else None
                context['median'] = np.median(context['target_values']) if context['target_values'] is not None else 0.0

                Phi_stereo_core, info_stereo = engine.compute_feature_shapley_values(
                    X_core,
                    explain_stereotypical_features,
                    "Stereotypical Explanations (Core)",
                    context
                )
                self.Phi_stereotypical_explanations_[core_samples, :] = Phi_stereo_core
                self.shapley_info_['stereotypical_explanations'] = info_stereo

            # MEMORY CLEANUP
            _cleanup_memory(X_core)

        # COMPUTE SECONDARY SAMPLES (reduced permutations for speed)
        if len(secondary_samples) > 0:
            # Temporarily reduce permutations
            original_n_perms = engine.n_permutations
            engine.n_permutations = max(10, original_n_perms // 2)

            if self.verbose:
                print(f"  Computing explanations (secondary: {len(secondary_samples)} samples, {engine.n_permutations} perms)...")

            X_secondary = X_dense[secondary_samples, :]

            # Archetypal
            Phi_arch_sec, _ = engine.compute_feature_shapley_values(
                X_secondary, explain_archetypal_features,
                "Archetypal Explanations (Secondary)", context
            )
            self.Phi_archetypal_explanations_[secondary_samples, :] = Phi_arch_sec

            # Prototypical
            Phi_proto_sec, _ = engine.compute_feature_shapley_values(
                X_secondary, explain_prototypical_features,
                "Prototypical Explanations (Secondary)", context
            )
            self.Phi_prototypical_explanations_[secondary_samples, :] = Phi_proto_sec

            # Stereotypical
            if self.stereotype_column is not None:
                Phi_stereo_sec, _ = engine.compute_feature_shapley_values(
                    X_secondary, explain_stereotypical_features,
                    "Stereotypical Explanations (Secondary)", context
                )
                self.Phi_stereotypical_explanations_[secondary_samples, :] = Phi_stereo_sec

            # Restore original permutations
            engine.n_permutations = original_n_perms

            # MEMORY CLEANUP
            _cleanup_memory(X_secondary)
        else:
            self.Phi_stereotypical_explanations_ = None if self.stereotype_column is None else self.Phi_stereotypical_explanations_

+
def _v04_archetypal_value(
|
|
2050
|
+
self,
|
|
2051
|
+
X_subset: np.ndarray,
|
|
2052
|
+
indices: np.ndarray,
|
|
2053
|
+
context: Dict
|
|
2054
|
+
) -> float:
|
|
2055
|
+
"""Value function: Mean archetypal rank from v0.4 NMF method (Option A)."""
|
|
2056
|
+
if len(X_subset) < context['nmf_rank']:
|
|
2057
|
+
return 0.0
|
|
2058
|
+
|
|
2059
|
+
try:
|
|
2060
|
+
nmf = NMF(
|
|
2061
|
+
n_components=min(context['nmf_rank'], len(X_subset)-1),
|
|
2062
|
+
init='random',
|
|
2063
|
+
random_state=context['random_state'],
|
|
2064
|
+
max_iter=100,
|
|
2065
|
+
tol=0.01
|
|
2066
|
+
)
|
|
2067
|
+
|
|
2068
|
+
X_nn = X_subset - X_subset.min() + 1e-6
|
|
2069
|
+
W_subset = nmf.fit_transform(X_nn)
|
|
2070
|
+
W_norm = W_subset / (W_subset.sum(axis=1, keepdims=True) + 1e-12)
|
|
2071
|
+
arch_scores = np.max(W_norm, axis=1)
|
|
2072
|
+
|
|
2073
|
+
return float(np.mean(arch_scores))
|
|
2074
|
+
except:
|
|
2075
|
+
return float(np.mean(np.ptp(X_subset, axis=0)))
|
|
2076
|
+
|
|
2077
|
+
def _v04_prototypical_value(
|
|
2078
|
+
self,
|
|
2079
|
+
X_subset: np.ndarray,
|
|
2080
|
+
indices: np.ndarray,
|
|
2081
|
+
context: Dict
|
|
2082
|
+
) -> float:
|
|
2083
|
+
"""Value function: Coverage from v0.4 facility location."""
|
|
2084
|
+
if len(X_subset) < 2:
|
|
2085
|
+
return 0.0
|
|
2086
|
+
|
|
2087
|
+
norms = np.linalg.norm(X_subset, axis=1, keepdims=True)
|
|
2088
|
+
norms[norms == 0.0] = 1.0
|
|
2089
|
+
X_l2 = X_subset / norms
|
|
2090
|
+
|
|
2091
|
+
sims = X_l2 @ X_l2.T
|
|
2092
|
+
np.fill_diagonal(sims, 0)
|
|
2093
|
+
|
|
2094
|
+
max_sims = np.max(sims, axis=1) if sims.shape[0] > 0 else np.array([0.0])
|
|
2095
|
+
return float(np.mean(max_sims))
|
|
2096
|
+
|
|
2097
|
+
def _v04_stereotypical_value(
|
|
2098
|
+
self,
|
|
2099
|
+
X_subset: np.ndarray,
|
|
2100
|
+
indices: np.ndarray,
|
|
2101
|
+
context: Dict
|
|
2102
|
+
) -> float:
|
|
2103
|
+
"""Value function: Target alignment from v0.4 stereotype targeting."""
|
|
2104
|
+
if context.get('target_values') is None:
|
|
2105
|
+
s = np.max(np.abs(X_subset - 0.5), axis=1) * 2.0
|
|
2106
|
+
return float(np.mean(s))
|
|
2107
|
+
|
|
2108
|
+
target_vals = context['target_values'][indices]
|
|
2109
|
+
target = context['stereotype_target']
|
|
2110
|
+
|
|
2111
|
+
if target == 'max':
|
|
2112
|
+
return float(np.mean(target_vals))
|
|
2113
|
+
elif target == 'min':
|
|
2114
|
+
return float(-np.mean(target_vals))
|
|
2115
|
+
else:
|
|
2116
|
+
return float(-np.mean(np.abs(target_vals - float(target))))
|
|
2117
|
+
|
|
2118
|
+
def _compute_shapley_formative_ranks(self) -> pd.DataFrame:
|
|
2119
|
+
"""Compute formative instance rankings from Shapley values."""
|
|
2120
|
+
|
|
2121
|
+
# Check if formative was computed
|
|
2122
|
+
if self.Phi_archetypal_formative_ is None:
|
|
2123
|
+
# Return None columns if formative wasn't computed
|
|
2124
|
+
n_samples = len(self.train_index_)
|
|
2125
|
+
return pd.DataFrame({
|
|
2126
|
+
'archetypal_shapley_rank': [None] * n_samples,
|
|
2127
|
+
'prototypical_shapley_rank': [None] * n_samples,
|
|
2128
|
+
'stereotypical_shapley_rank': [None] * n_samples,
|
|
2129
|
+
}, index=self.train_index_)
|
|
2130
|
+
|
|
2131
|
+
# Formative was computed - proceed normally
|
|
2132
|
+
n_samples = self.Phi_archetypal_formative_.shape[0]
|
|
2133
|
+
|
|
2134
|
+
arch_formative = self.Phi_archetypal_formative_.sum(axis=1)
|
|
2135
|
+
proto_formative = self.Phi_prototypical_formative_.sum(axis=1)
|
|
2136
|
+
|
|
2137
|
+
if self.Phi_stereotypical_formative_ is not None:
|
|
2138
|
+
stereo_formative = self.Phi_stereotypical_formative_.sum(axis=1)
|
|
2139
|
+
else:
|
|
2140
|
+
stereo_formative = np.zeros(n_samples)
|
|
2141
|
+
|
|
2142
|
+
def normalize(ranks):
|
|
2143
|
+
r_min, r_max = ranks.min(), ranks.max()
|
|
2144
|
+
if (r_max - r_min) > 1e-12:
|
|
2145
|
+
return (ranks - r_min) / (r_max - r_min)
|
|
2146
|
+
else:
|
|
2147
|
+
return np.ones_like(ranks) * 0.5
|
|
2148
|
+
|
|
2149
|
+
return pd.DataFrame({
|
|
2150
|
+
'archetypal_shapley_rank': np.round(normalize(arch_formative), 10),
|
|
2151
|
+
'prototypical_shapley_rank': np.round(normalize(proto_formative), 10),
|
|
2152
|
+
'stereotypical_shapley_rank': np.round(normalize(stereo_formative), 10),
|
|
2153
|
+
}, index=self.train_index_)
|
|
2154
|
+
|
|
2155
|
+
|
|
2156
|
+
def get_shapley_explanations(self, sample_idx: int) -> Dict[str, np.ndarray]:
|
|
2157
|
+
"""Get Shapley feature attributions explaining why sample is archetypal/prototypical/stereotypical."""
|
|
2158
|
+
if not self.shapley_mode:
|
|
2159
|
+
raise RuntimeError("Shapley mode not enabled. Set shapley_mode=True when fitting.")
|
|
2160
|
+
|
|
2161
|
+
if self.Phi_archetypal_explanations_ is None:
|
|
2162
|
+
raise RuntimeError("Shapley explanations not computed. Call fit() first.")
|
|
2163
|
+
|
|
2164
|
+
# Convert DataFrame index to positional index
|
|
2165
|
+
if hasattr(self, 'train_index_') and self.train_index_ is not None:
|
|
2166
|
+
try:
|
|
2167
|
+
pos_idx = self.train_index_.get_loc(sample_idx)
|
|
2168
|
+
except KeyError:
|
|
2169
|
+
raise ValueError(f"Sample index {sample_idx} not found in training data")
|
|
2170
|
+
else:
|
|
2171
|
+
# Assume sample_idx is already positional
|
|
2172
|
+
pos_idx = sample_idx
|
|
2173
|
+
|
|
2174
|
+
explanations = {}
|
|
2175
|
+
|
|
2176
|
+
if self.Phi_archetypal_explanations_ is not None:
|
|
2177
|
+
explanations['archetypal'] = self.Phi_archetypal_explanations_[pos_idx]
|
|
2178
|
+
|
|
2179
|
+
if self.Phi_prototypical_explanations_ is not None:
|
|
2180
|
+
explanations['prototypical'] = self.Phi_prototypical_explanations_[pos_idx]
|
|
2181
|
+
|
|
2182
|
+
if self.Phi_stereotypical_explanations_ is not None:
|
|
2183
|
+
explanations['stereotypical'] = self.Phi_stereotypical_explanations_[pos_idx]
|
|
2184
|
+
|
|
2185
|
+
return explanations
|
|
2186
|
+
|
|
2187
|
+
def get_formative_attributions(self, sample_idx: int) -> Dict[str, np.ndarray]:
|
|
2188
|
+
"""Get Shapley feature attributions showing how sample creates archetypal/prototypical/stereotypical structure."""
|
|
2189
|
+
if not self.shapley_mode:
|
|
2190
|
+
raise RuntimeError("Shapley mode not enabled. Set shapley_mode=True when fitting.")
|
|
2191
|
+
|
|
2192
|
+
if self.Phi_archetypal_formative_ is None:
|
|
2193
|
+
raise RuntimeError(
|
|
2194
|
+
"Formative instances not computed. "
|
|
2195
|
+
"This occurs when fast_mode=True (formative skipped for speed). "
|
|
2196
|
+
"Use fast_mode=False to compute formative instances."
|
|
2197
|
+
)
|
|
2198
|
+
|
|
2199
|
+
# Convert DataFrame index to positional index
|
|
2200
|
+
if hasattr(self, 'train_index_') and self.train_index_ is not None:
|
|
2201
|
+
try:
|
|
2202
|
+
pos_idx = self.train_index_.get_loc(sample_idx)
|
|
2203
|
+
except KeyError:
|
|
2204
|
+
raise ValueError(f"Sample index {sample_idx} not found in training data")
|
|
2205
|
+
else:
|
|
2206
|
+
# Assume sample_idx is already positional
|
|
2207
|
+
pos_idx = sample_idx
|
|
2208
|
+
|
|
2209
|
+
attributions = {}
|
|
2210
|
+
|
|
2211
|
+
if self.Phi_archetypal_formative_ is not None:
|
|
2212
|
+
attributions['archetypal'] = self.Phi_archetypal_formative_[pos_idx]
|
|
2213
|
+
|
|
2214
|
+
if self.Phi_prototypical_formative_ is not None:
|
|
2215
|
+
attributions['prototypical'] = self.Phi_prototypical_formative_[pos_idx]
|
|
2216
|
+
|
|
2217
|
+
if self.Phi_stereotypical_formative_ is not None:
|
|
2218
|
+
attributions['stereotypical'] = self.Phi_stereotypical_formative_[pos_idx]
|
|
2219
|
+
|
|
2220
|
+
return attributions
|
|
2221
|
+
|
|
2222
|
+
# --------------------------
|
|
2223
|
+
# Text (TF-IDF)
|
|
2224
|
+
# --------------------------
|
|
2225
|
+
def fit_text(
|
|
2226
|
+
self,
|
|
2227
|
+
corpus: Iterable[str],
|
|
2228
|
+
vectorizer: str = "tfidf",
|
|
2229
|
+
text_metadata: Optional[pd.DataFrame] = None
|
|
2230
|
+
):
|
|
2231
|
+
"""
|
|
2232
|
+
Fit on text corpus with optional metadata.
|
|
2233
|
+
|
|
2234
|
+
Parameters
|
|
2235
|
+
----------
|
|
2236
|
+
corpus : Iterable[str]
|
|
2237
|
+
Text documents
|
|
2238
|
+
vectorizer : str
|
|
2239
|
+
Vectorization method (default: 'tfidf')
|
|
2240
|
+
text_metadata : pd.DataFrame, optional
|
|
2241
|
+
Document-level properties for stereotype computation
|
|
2242
|
+
Must have same number of rows as documents in corpus
|
|
2243
|
+
"""
|
|
2244
|
+
self._validate_stereotype_config()
|
|
2245
|
+
with _ThreadControl(self.deterministic and not self.speed_mode) as tc:
|
|
2246
|
+
_seed_everything(self.random_state)
|
|
2247
|
+
X_scaled, X_l2 = self._preprocess_text_fit(corpus, vectorizer, text_metadata)
|
|
2248
|
+
idx = pd.RangeIndex(X_scaled.shape[0])
|
|
2249
|
+
self.train_index_ = idx
|
|
2250
|
+
self._fit_components(X_scaled, X_l2, idx)
|
|
2251
|
+
self._record_settings(tc)
|
|
2252
|
+
return self
|
|
2253
|
+
|
|
2254
|
+
def transform_text(self, corpus: Iterable[str]) -> pd.DataFrame:
|
|
2255
|
+
"""Transform text corpus."""
|
|
2256
|
+
with _ThreadControl(self.deterministic and not self.speed_mode):
|
|
2257
|
+
X_scaled, X_l2 = self._preprocess_text_transform(corpus)
|
|
2258
|
+
idx = pd.RangeIndex(X_scaled.shape[0])
|
|
2259
|
+
|
|
2260
|
+
# Get stereotype source (priority: metadata column > keywords > None)
|
|
2261
|
+
stereotype_source = None
|
|
2262
|
+
|
|
2263
|
+
# Priority 1: Metadata column (from fit_text)
|
|
2264
|
+
if self.stereotype_column is not None and self.text_metadata_ is not None:
|
|
2265
|
+
if self.stereotype_column in self.text_metadata_.columns:
|
|
2266
|
+
stereotype_source = self.text_metadata_[self.stereotype_column]
|
|
2267
|
+
|
|
2268
|
+
# Priority 2: Keyword scores (recompute on new corpus)
|
|
2269
|
+
elif self.stereotype_keywords is not None:
|
|
2270
|
+
corpus_list = list(corpus)
|
|
2271
|
+
X_tfidf = self.vectorizer_.transform(corpus_list)
|
|
2272
|
+
keyword_scores = self._compute_keyword_scores(
|
|
2273
|
+
X_tfidf, corpus_list, self.stereotype_keywords
|
|
2274
|
+
)
|
|
2275
|
+
stereotype_source = pd.Series(keyword_scores)
|
|
2276
|
+
|
|
2277
|
+
return self._score_with_fitted(X_scaled, X_l2, idx, stereotype_source)
|
|
2278
|
+
|
|
2279
|
+
def fit_transform_text(
|
|
2280
|
+
self,
|
|
2281
|
+
corpus: Iterable[str],
|
|
2282
|
+
vectorizer: str = "tfidf",
|
|
2283
|
+
text_metadata: Optional[pd.DataFrame] = None
|
|
2284
|
+
) -> pd.DataFrame:
|
|
2285
|
+
"""Fit and transform text in one step."""
|
|
2286
|
+
self.fit_text(corpus, vectorizer=vectorizer, text_metadata=text_metadata)
|
|
2287
|
+
return self.transform_text(corpus)
|
|
2288
|
+
|
|
2289
|
+
# --------------------------
|
|
2290
|
+
# Signals / Graphs (numeric)
|
|
2291
|
+
# --------------------------
|
|
2292
|
+
def fit_transform_signals(self, X_signal: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
|
|
2293
|
+
self.fit(X_signal)
|
|
2294
|
+
return self.transform(X_signal, return_ranks_only=True)
|
|
2295
|
+
|
|
2296
|
+
def fit_transform_graph(
|
|
2297
|
+
self,
|
|
2298
|
+
node_features: Union[pd.DataFrame, np.ndarray],
|
|
2299
|
+
edges: Optional[np.ndarray] = None,
|
|
2300
|
+
edge_index: Optional[np.ndarray] = None,
|
|
2301
|
+
compute_topology: bool = True
|
|
2302
|
+
) -> pd.DataFrame:
|
|
2303
|
+
"""
|
|
2304
|
+
Fit and transform graph data.
|
|
2305
|
+
|
|
2306
|
+
Parameters
|
|
2307
|
+
----------
|
|
2308
|
+
node_features : DataFrame or array
|
|
2309
|
+
Node feature matrix (n_nodes, n_features)
|
|
2310
|
+
edges : np.ndarray, optional
|
|
2311
|
+
Edge list as (2, n_edges) or (n_edges, 2)
|
|
2312
|
+
Alias: edge_index
|
|
2313
|
+
compute_topology : bool
|
|
2314
|
+
Whether to compute and append topology features
|
|
2315
|
+
|
|
2316
|
+
Returns
|
|
2317
|
+
-------
|
|
2318
|
+
results : pd.DataFrame
|
|
2319
|
+
Rankings with topology features if computed
|
|
2320
|
+
"""
|
|
2321
|
+
# Handle edge_index alias
|
|
2322
|
+
if edges is None and edge_index is not None:
|
|
2323
|
+
edges = edge_index
|
|
2324
|
+
|
|
2325
|
+
# Convert to DataFrame
|
|
2326
|
+
if isinstance(node_features, pd.DataFrame):
|
|
2327
|
+
df = node_features.copy()
|
|
2328
|
+
else:
|
|
2329
|
+
df = pd.DataFrame(node_features)
|
|
2330
|
+
|
|
2331
|
+
n_nodes = len(df)
|
|
2332
|
+
|
|
2333
|
+
# Compute topology features if edges provided
|
|
2334
|
+
self.graph_topology_df_ = None
|
|
2335
|
+
if edges is not None and compute_topology:
|
|
2336
|
+
topology_df = self._compute_graph_topology_features(edges, n_nodes)
|
|
2337
|
+
self.graph_topology_df_ = topology_df
|
|
2338
|
+
|
|
2339
|
+
# Append to node features
|
|
2340
|
+
for col in topology_df.columns:
|
|
2341
|
+
if col not in df.columns:
|
|
2342
|
+
df[col] = topology_df[col].values
|
|
2343
|
+
else:
|
|
2344
|
+
warnings.warn(f"Topology feature '{col}' already exists, skipping")
|
|
2345
|
+
|
|
2346
|
+
# Standard tabular processing
|
|
2347
|
+
self.fit(df)
|
|
2348
|
+
|
|
2349
|
+
# Use standard transform which preserves label columns
|
|
2350
|
+
results = self.transform(df, return_ranks_only=False)
|
|
2351
|
+
|
|
2352
|
+
return results
|
|
2353
|
+
|
|
2354
|
+
# --------------------------
|
|
2355
|
+
# Ideals (legacy stereotypes)
|
|
2356
|
+
# --------------------------
|
|
2357
|
+
def register_ideal(self, name: str, ideal_vector: Union[np.ndarray, List[float]]) -> None:
|
|
2358
|
+
v = np.asarray(ideal_vector, dtype=np.float64).ravel()
|
|
2359
|
+
if self.scaler_ is None:
|
|
2360
|
+
raise RuntimeError("Call fit/fit_text before registering ideals.")
|
|
2361
|
+
d = self.H_.shape[1] if self.H_ is not None else self.scaler_.n_features_in_
|
|
2362
|
+
if v.shape[0] != d:
|
|
2363
|
+
raise ValueError(f"Ideal has dim {v.shape[0]} but data has {d} features.")
|
|
2364
|
+
self.ideals_[name] = v.copy()
|
|
2365
|
+
|
|
2366
|
+
# --------------------------
|
|
2367
|
+
# Config / sklearn interop
|
|
2368
|
+
# --------------------------
|
|
2369
|
+
def to_config(self) -> Dict:
|
|
2370
|
+
cfg = {k: getattr(self, k) for k in [
|
|
2371
|
+
"nmf_rank","n_prototypes","scale","distance_metric","similarity_metric",
|
|
2372
|
+
"deterministic","n_jobs","max_iter_nmf","tol_nmf","speed_mode","dtype",
|
|
2373
|
+
"random_state","max_memory_mb","return_ranks_only","auto_n_prototypes",
|
|
2374
|
+
"verbose","max_missing_frac",
|
|
2375
|
+
"stereotype_column","stereotype_target","label_columns",
|
|
2376
|
+
"stereotype_keywords","graph_topology_features"
|
|
2377
|
+
]}
|
|
2378
|
+
cfg["version"] = "0.4"
|
|
2379
|
+
return cfg
|
|
2380
|
+
|
|
2381
|
+
@classmethod
|
|
2382
|
+
def from_config(cls, cfg: Dict) -> "DataTypical":
|
|
2383
|
+
try:
|
|
2384
|
+
return cls(**{k: v for k, v in cfg.items() if k in {f.name for f in dc_fields(cls)}})
|
|
2385
|
+
except TypeError as e:
|
|
2386
|
+
raise ConfigError(str(e))
|
|
2387
|
+
|
|
2388
|
+
def get_params(self, deep: bool = True) -> Dict:
|
|
2389
|
+
return {f.name: getattr(self, f.name) for f in dc_fields(self) if f.init}
|
|
2390
|
+
|
|
2391
|
+
def set_params(self, **params):
|
|
2392
|
+
for k, v in params.items():
|
|
2393
|
+
if not hasattr(self, k):
|
|
2394
|
+
raise ValueError(f"Unknown parameter {k}")
|
|
2395
|
+
setattr(self, k, v)
|
|
2396
|
+
return self
|
|
2397
|
+
|
|
2398
|
+
# ============================================================
|
|
2399
|
+
# [F] Graph Topology Features (NEW in v0.4)
|
|
2400
|
+
# ============================================================
|
|
2401
|
+
def _compute_graph_topology_features(
|
|
2402
|
+
self,
|
|
2403
|
+
edge_index: np.ndarray,
|
|
2404
|
+
n_nodes: int,
|
|
2405
|
+
feature_names: Optional[List[str]] = None
|
|
2406
|
+
) -> pd.DataFrame:
|
|
2407
|
+
"""
|
|
2408
|
+
Compute graph topology features.
|
|
2409
|
+
|
|
2410
|
+
Parameters
|
|
2411
|
+
----------
|
|
2412
|
+
edge_index : np.ndarray
|
|
2413
|
+
Edge list (2, n_edges) or (n_edges, 2)
|
|
2414
|
+
n_nodes : int
|
|
2415
|
+
Number of nodes
|
|
2416
|
+
feature_names : List[str], optional
|
|
2417
|
+
Which topology features to compute
|
|
2418
|
+
|
|
2419
|
+
Returns
|
|
2420
|
+
-------
|
|
2421
|
+
topology_df : pd.DataFrame
|
|
2422
|
+
Computed topology features (n_nodes, n_features)
|
|
2423
|
+
"""
|
|
2424
|
+
try:
|
|
2425
|
+
import networkx as nx
|
|
2426
|
+
except ImportError:
|
|
2427
|
+
raise ImportError(
|
|
2428
|
+
"NetworkX is required for graph topology features. "
|
|
2429
|
+
"Install with: pip install networkx"
|
|
2430
|
+
)
|
|
2431
|
+
|
|
2432
|
+
# Convert edge_index to NetworkX graph
|
|
2433
|
+
if edge_index.shape[0] == 2:
|
|
2434
|
+
edges = edge_index.T # (n_edges, 2)
|
|
2435
|
+
else:
|
|
2436
|
+
edges = edge_index
|
|
2437
|
+
|
|
2438
|
+
G = nx.Graph()
|
|
2439
|
+
G.add_nodes_from(range(n_nodes))
|
|
2440
|
+
G.add_edges_from(edges)
|
|
2441
|
+
|
|
2442
|
+
# Determine which features to compute
|
|
2443
|
+
if feature_names is None:
|
|
2444
|
+
feature_names = self.graph_topology_features or ['degree', 'clustering']
|
|
2445
|
+
|
|
2446
|
+
topology_data = {}
|
|
2447
|
+
|
|
2448
|
+
for feat_name in feature_names:
|
|
2449
|
+
if feat_name == 'degree':
|
|
2450
|
+
degree_dict = dict(G.degree())
|
|
2451
|
+
topology_data['degree'] = [degree_dict.get(i, 0) for i in range(n_nodes)]
|
|
2452
|
+
|
|
2453
|
+
elif feat_name == 'clustering':
|
|
2454
|
+
clust_dict = nx.clustering(G)
|
|
2455
|
+
topology_data['clustering'] = [clust_dict.get(i, 0.0) for i in range(n_nodes)]
|
|
2456
|
+
|
|
2457
|
+
elif feat_name == 'pagerank':
|
|
2458
|
+
pr_dict = nx.pagerank(G, max_iter=100)
|
|
2459
|
+
topology_data['pagerank'] = [pr_dict.get(i, 0.0) for i in range(n_nodes)]
|
|
2460
|
+
|
|
2461
|
+
elif feat_name == 'triangles':
|
|
2462
|
+
tri_dict = nx.triangles(G)
|
|
2463
|
+
topology_data['triangles'] = [tri_dict.get(i, 0) for i in range(n_nodes)]
|
|
2464
|
+
|
|
2465
|
+
elif feat_name == 'betweenness':
|
|
2466
|
+
bet_dict = nx.betweenness_centrality(G)
|
|
2467
|
+
topology_data['betweenness'] = [bet_dict.get(i, 0.0) for i in range(n_nodes)]
|
|
2468
|
+
|
|
2469
|
+
elif feat_name == 'closeness':
|
|
2470
|
+
close_dict = nx.closeness_centrality(G)
|
|
2471
|
+
topology_data['closeness'] = [close_dict.get(i, 0.0) for i in range(n_nodes)]
|
|
2472
|
+
|
|
2473
|
+
elif feat_name == 'eigenvector':
|
|
2474
|
+
try:
|
|
2475
|
+
eigen_dict = nx.eigenvector_centrality(G, max_iter=100)
|
|
2476
|
+
topology_data['eigenvector'] = [eigen_dict.get(i, 0.0) for i in range(n_nodes)]
|
|
2477
|
+
except:
|
|
2478
|
+
warnings.warn("Eigenvector centrality failed, using zeros")
|
|
2479
|
+
topology_data['eigenvector'] = [0.0] * n_nodes
|
|
2480
|
+
|
|
2481
|
+
else:
|
|
2482
|
+
warnings.warn(f"Unknown topology feature: {feat_name}")
|
|
2483
|
+
|
|
2484
|
+
return pd.DataFrame(topology_data, index=range(n_nodes))
|
|
2485
|
+
|
|
2486
|
+
# ============================================================
|
|
2487
|
+
# [G] Stereotype Computation (NEW in v0.4)
|
|
2488
|
+
# ============================================================
|
|
2489
|
+
def _validate_stereotype_config(self):
|
|
2490
|
+
"""Validate stereotype configuration at fit time."""
|
|
2491
|
+
|
|
2492
|
+
# Check conflicting specifications
|
|
2493
|
+
if self.stereotype_column is not None and self.stereotype_keywords is not None:
|
|
2494
|
+
raise ConfigError(
|
|
2495
|
+
"Cannot specify both stereotype_column and stereotype_keywords. "
|
|
2496
|
+
"Use stereotype_column for metadata or stereotype_keywords for text relevance."
|
|
2497
|
+
)
|
|
2498
|
+
|
|
2499
|
+
# Validate target
|
|
2500
|
+
if isinstance(self.stereotype_target, str):
|
|
2501
|
+
if self.stereotype_target not in ['min', 'max']:
|
|
2502
|
+
raise ConfigError(
|
|
2503
|
+
f"stereotype_target must be 'min', 'max', or numeric value, "
|
|
2504
|
+
f"got: '{self.stereotype_target}'"
|
|
2505
|
+
)
|
|
2506
|
+
|
|
2507
|
+
def _compute_stereotypical_rank(
|
|
2508
|
+
self,
|
|
2509
|
+
X_scaled: ArrayLike,
|
|
2510
|
+
index: pd.Index,
|
|
2511
|
+
stereotype_source: Optional[pd.Series] = None
|
|
2512
|
+
) -> np.ndarray:
|
|
2513
|
+
"""
|
|
2514
|
+
Compute stereotypical ranking based on configuration.
|
|
2515
|
+
|
|
2516
|
+
Parameters
|
|
2517
|
+
----------
|
|
2518
|
+
X_scaled : ArrayLike
|
|
2519
|
+
Scaled feature matrix (for fallback to extremeness)
|
|
2520
|
+
index : pd.Index
|
|
2521
|
+
Row index
|
|
2522
|
+
stereotype_source : pd.Series, optional
|
|
2523
|
+
Pre-computed values to rank against (from df_original, metadata, or topology)
|
|
2524
|
+
|
|
2525
|
+
Returns
|
|
2526
|
+
-------
|
|
2527
|
+
stereotype_rank : np.ndarray
|
|
2528
|
+
Scores in [0, 1] where 1 = closest to stereotype target
|
|
2529
|
+
"""
|
|
2530
|
+
if stereotype_source is None:
|
|
2531
|
+
# BACKWARD COMPATIBLE: use extremeness
|
|
2532
|
+
X_dense = X_scaled.toarray() if (sp is not None and sp.isspmatrix(X_scaled)) else X_scaled
|
|
2533
|
+
s = np.max(np.abs(X_dense - 0.5), axis=1) * 2.0
|
|
2534
|
+
s_min, s_max = float(s.min()), float(s.max())
|
|
2535
|
+
if (s_max - s_min) > 1e-12:
|
|
2536
|
+
return (s - s_min) / (s_max - s_min)
|
|
2537
|
+
else:
|
|
2538
|
+
return np.zeros_like(s)
|
|
2539
|
+
|
|
2540
|
+
# USER-DIRECTED: Rank toward specific target
|
|
2541
|
+
values = stereotype_source.to_numpy(dtype=np.float64)
|
|
2542
|
+
|
|
2543
|
+
# Handle NaN values
|
|
2544
|
+
valid_mask = ~np.isnan(values)
|
|
2545
|
+
if not np.any(valid_mask):
|
|
2546
|
+
warnings.warn("All stereotype values are NaN, using zeros")
|
|
2547
|
+
return np.zeros(len(values))
|
|
2548
|
+
|
|
2549
|
+
# Compute target value
|
|
2550
|
+
if isinstance(self.stereotype_target, str):
|
|
2551
|
+
if self.stereotype_target == "min":
|
|
2552
|
+
target = np.nanmin(values)
|
|
2553
|
+
elif self.stereotype_target == "max":
|
|
2554
|
+
target = np.nanmax(values)
|
|
2555
|
+
else:
|
|
2556
|
+
raise ValueError(
|
|
2557
|
+
f"stereotype_target must be 'min', 'max', or numeric value, "
|
|
2558
|
+
f"got '{self.stereotype_target}'"
|
|
2559
|
+
)
|
|
2560
|
+
else:
|
|
2561
|
+
target = float(self.stereotype_target)
|
|
2562
|
+
|
|
2563
|
+
# Rank by distance to target (inverted: 1 = closest, 0 = furthest)
|
|
2564
|
+
distances = np.abs(values - target)
|
|
2565
|
+
max_dist = np.nanmax(distances)
|
|
2566
|
+
|
|
2567
|
+
if max_dist > 1e-12:
|
|
2568
|
+
stereotype_rank = 1.0 - (distances / max_dist)
|
|
2569
|
+
else:
|
|
2570
|
+
# All values identical or at target
|
|
2571
|
+
stereotype_rank = np.ones_like(distances, dtype=np.float64)
|
|
2572
|
+
|
|
2573
|
+
# Handle NaN entries
|
|
2574
|
+
stereotype_rank[~valid_mask] = 0.0
|
|
2575
|
+
|
|
2576
|
+
return np.clip(stereotype_rank, 0.0, 1.0)
|
|
2577
|
+
|
|
2578
|
+
|
|
2579
|
+
def _get_stereotype_source_table(self, df: pd.DataFrame) -> Optional[pd.Series]:
|
|
2580
|
+
"""Extract stereotype values from tabular data."""
|
|
2581
|
+
if self.stereotype_column is None:
|
|
2582
|
+
return None
|
|
2583
|
+
|
|
2584
|
+
# Check if column exists in df (features or labels)
|
|
2585
|
+
if self.stereotype_column not in df.columns:
|
|
2586
|
+
raise ValueError(
|
|
2587
|
+
f"stereotype_column '{self.stereotype_column}' not found. "
|
|
2588
|
+
f"Available columns: {list(df.columns)}"
|
|
2589
|
+
)
|
|
2590
|
+
|
|
2591
|
+
return df[self.stereotype_column]
|
|
2592
|
+
|
|
2593
|
+
def _get_stereotype_source_text(self) -> Optional[pd.Series]:
|
|
2594
|
+
"""Extract stereotype values from text metadata or keywords."""
|
|
2595
|
+
|
|
2596
|
+
# Priority 1: User-specified column from metadata
|
|
2597
|
+
if self.stereotype_column is not None:
|
|
2598
|
+
if self.text_metadata_ is None:
|
|
2599
|
+
raise ValueError(
|
|
2600
|
+
"stereotype_column specified but no text_metadata provided. "
|
|
2601
|
+
"Pass text_metadata to fit_text() or use stereotype_keywords."
|
|
2602
|
+
)
|
|
2603
|
+
|
|
2604
|
+
if self.stereotype_column not in self.text_metadata_.columns:
|
|
2605
|
+
raise ValueError(
|
|
2606
|
+
f"stereotype_column '{self.stereotype_column}' not found in text_metadata. "
|
|
2607
|
+
f"Available columns: {list(self.text_metadata_.columns)}"
|
|
2608
|
+
)
|
|
2609
|
+
|
|
2610
|
+
return self.text_metadata_[self.stereotype_column]
|
|
2611
|
+
|
|
2612
|
+
# Priority 2: Keyword-based scores
|
|
2613
|
+
if self.stereotype_keyword_scores_ is not None:
|
|
2614
|
+
return pd.Series(self.stereotype_keyword_scores_)
|
|
2615
|
+
|
|
2616
|
+
# No stereotype specified
|
|
2617
|
+
return None
|
|
2618
|
+
|
|
2619
|
+
def _compute_keyword_scores(
|
|
2620
|
+
self,
|
|
2621
|
+
X_tfidf: "sp.spmatrix",
|
|
2622
|
+
corpus: List[str],
|
|
2623
|
+
keywords: List[str]
|
|
2624
|
+
) -> np.ndarray:
|
|
2625
|
+
"""
|
|
2626
|
+
Compute relevance scores for documents based on keyword TF-IDF sum.
|
|
2627
|
+
|
|
2628
|
+
Parameters
|
|
2629
|
+
----------
|
|
2630
|
+
X_tfidf : sparse matrix
|
|
2631
|
+
TF-IDF matrix (n_docs, n_vocab)
|
|
2632
|
+
corpus : List[str]
|
|
2633
|
+
Original documents (for fallback if keywords not in vocab)
|
|
2634
|
+
keywords : List[str]
|
|
2635
|
+
Keywords to compute relevance for
|
|
2636
|
+
|
|
2637
|
+
Returns
|
|
2638
|
+
-------
|
|
2639
|
+
scores : np.ndarray
|
|
2640
|
+
Relevance score per document (n_docs,)
|
|
2641
|
+
"""
|
|
2642
|
+
vocab = self.vectorizer_.vocabulary_
|
|
2643
|
+
|
|
2644
|
+
# Find indices of keywords in vocabulary
|
|
2645
|
+
keyword_indices = []
|
|
2646
|
+
missing_keywords = []
|
|
2647
|
+
|
|
2648
|
+
for kw in keywords:
|
|
2649
|
+
if kw in vocab:
|
|
2650
|
+
keyword_indices.append(vocab[kw])
|
|
2651
|
+
else:
|
|
2652
|
+
missing_keywords.append(kw)
|
|
2653
|
+
|
|
2654
|
+
if missing_keywords:
|
|
2655
|
+
warnings.warn(
|
|
2656
|
+
f"Keywords not found in vocabulary: {missing_keywords}. "
|
|
2657
|
+
f"These will be ignored in stereotype computation."
|
|
2658
|
+
)
|
|
2659
|
+
|
|
2660
|
+
if not keyword_indices:
|
|
2661
|
+
warnings.warn(
|
|
2662
|
+
"No stereotype keywords found in vocabulary. "
|
|
2663
|
+
"Using zero scores (equivalent to no stereotype)."
|
|
2664
|
+
)
|
|
2665
|
+
return np.zeros(X_tfidf.shape[0])
|
|
2666
|
+
|
|
2667
|
+
# Sum TF-IDF scores for keyword columns
|
|
2668
|
+
keyword_indices = np.array(keyword_indices)
|
|
2669
|
+
X_keywords = X_tfidf[:, keyword_indices]
|
|
2670
|
+
scores = np.asarray(X_keywords.sum(axis=1)).ravel()
|
|
2671
|
+
|
|
2672
|
+
return scores
|
|
2673
|
+
|
|
2674
|
+
    # ============================================================
    # Internals - Tables (numeric-only features)
    # ============================================================
    def _select_numeric_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pick numeric feature columns; auto-exclude ID-like columns."""
        feat_df = df.select_dtypes(include=[np.number]).copy()
        if feat_df.shape[1] == 0:
            raise DataTypicalError("No numeric feature columns found for tabular processing.")

        n = len(feat_df)
        to_drop = set()
        for col in feat_df.columns:
            name = str(col).lower()
            if name == "id" or name.endswith("_id") or name.startswith("id_"):
                to_drop.add(col)
                continue

            s = feat_df[col]
            nunique = s.nunique(dropna=True)

            # near-unique numerics behave like row IDs
            if nunique >= 0.8 * n:
                to_drop.add(col)
                continue

            # strict monotone sequence (typical of indices)
            if n > 1:
                diffs = np.diff(s.values).astype(float)
                if (diffs > 0).all() or (diffs < 0).all():
                    to_drop.add(col)
                    continue

        feature_cols = [c for c in feat_df.columns if c not in to_drop]
        if not feature_cols:
            # fall back to all numeric if we dropped everything
            feature_cols = list(feat_df.columns)
            if self.verbose:
                warnings.warn("All numeric columns looked like IDs; using them anyway.")
        return feat_df[feature_cols]

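    # Quick illustration of the ID-exclusion heuristics above (assumed inputs,
    # not a test shipped with the package): a literal "id" name, a near-unique
    # column (>= 80% distinct values), and a strictly monotone column are all
    # dropped, while an ordinary measurement column survives.
    #
    #     df = pd.DataFrame({
    #         "id":      [1, 2, 3, 4, 5],              # dropped: name rule
    #         "row_num": [10, 20, 30, 40, 50],         # dropped: near-unique / monotone
    #         "height":  [1.6, 1.8, 1.6, 1.7, 1.8],    # kept
    #     })
    #     # _select_numeric_features(df) would keep only "height"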
    def _preprocess_table_fit(self, df: pd.DataFrame) -> Tuple[np.ndarray, ArrayLike]:
        # Store original df for stereotype computation
        self._df_original_fit = df.copy()

        # Separate label columns (preserve but don't use in NMF)
        if self.label_columns is not None:
            label_cols_present = [c for c in self.label_columns if c in df.columns]
            missing_labels = [c for c in self.label_columns if c not in df.columns]

            if missing_labels:
                warnings.warn(f"Label columns not found: {missing_labels}")

            if label_cols_present:
                self.label_df_ = df[label_cols_present].copy()
                df_for_features = df.drop(columns=label_cols_present)
            else:
                self.label_df_ = None
                df_for_features = df
        else:
            self.label_df_ = None
            df_for_features = df

        # Pick numeric features only
        feat_df = self._select_numeric_features(df_for_features)
        self.feature_columns_ = list(feat_df.columns)

        X = feat_df.to_numpy(dtype=self.dtype, copy=True)

        # Missingness report on features
        miss_frac = np.mean(pd.isna(feat_df), axis=0).to_numpy()
        self.missingness_ = {name: float(frac) for name, frac in zip(self.feature_columns_, miss_frac)}
        worst = np.max(miss_frac) if miss_frac.size else 0.0
        if worst > self.max_missing_frac:
            raise DataTypicalError(
                f"Missingness too high (max frac={worst:.3f} > threshold={self.max_missing_frac})."
            )

        # Deterministic imputer: per-feature median
        med = np.nanmedian(X, axis=0)
        inds = np.where(np.isnan(X))
        X[inds] = np.take(med, inds[1])
        self.impute_median_ = med

        # Scale to [0,1]
        self.scaler_ = MinMaxScaler(copy=True, clip=True)
        X_scaled_full = self.scaler_.fit_transform(X).astype(self.dtype, copy=False)

        # Drop constant columns
        var = X_scaled_full.var(axis=0)
        keep_mask = var > 0.0
        self.keep_mask_ = keep_mask
        if not np.all(keep_mask):
            self.dropped_columns_ = [c for c, k in zip(self.feature_columns_, keep_mask) if not k]
            if self.verbose:
                warnings.warn(f"Dropped constant feature columns: {self.dropped_columns_}")
        X_scaled = X_scaled_full[:, keep_mask]

        # Optional feature weights (length must match number of original numeric features)
        if self.feature_weights is not None:
            w = np.asarray(self.feature_weights, dtype=np.float64).ravel()
            if w.shape[0] != len(self.feature_columns_):
                warnings.warn("feature_weights length mismatch – ignoring weights.")
            else:
                X_scaled = (X_scaled * w[keep_mask]).astype(self.dtype, copy=False)

        # L2 copy
        X_l2 = _l2_normalize_rows_dense(X_scaled.astype(np.float64))
        return X_scaled, X_l2

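    # Minimal sketch of the preprocessing pipeline above (median imputation,
    # [0, 1] scaling, constant-column removal), written against NumPy and
    # scikit-learn only; the variable names here are illustrative.
    #
    #     from sklearn.preprocessing import MinMaxScaler
    #
    #     X = np.array([[1.0, 5.0, 7.0],
    #                   [np.nan, 6.0, 7.0],
    #                   [3.0, 9.0, 7.0]])
    #     med = np.nanmedian(X, axis=0)             # [2.0, 6.0, 7.0]
    #     r, c = np.where(np.isnan(X))
    #     X[r, c] = np.take(med, c)                 # fill NaN with the column median
    #     X01 = MinMaxScaler(clip=True).fit_transform(X)
    #     keep = X01.var(axis=0) > 0.0              # third column is constant -> dropped
    #     X01 = X01[:, keep]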
    def _preprocess_table_transform(self, df: pd.DataFrame) -> Tuple[np.ndarray, ArrayLike]:
        if any(v is None for v in (self.feature_columns_, self.impute_median_, self.keep_mask_, self.scaler_)):
            raise RuntimeError("Model not fitted.")
        # Align by feature column NAMES (order enforced from training)
        missing = [c for c in self.feature_columns_ if c not in df.columns]
        if missing:
            raise DataTypicalError(f"Missing required feature columns at transform: {missing}")
        feat_df = df[self.feature_columns_]
        # Ensure numeric
        if not all(np.issubdtype(t, np.number) for t in feat_df.dtypes):
            raise DataTypicalError("Non-numeric values present in feature columns at transform.")
        X = feat_df.to_numpy(dtype=self.dtype, copy=True)

        # Impute with training medians
        inds = np.where(np.isnan(X))
        if inds[0].size:
            X[inds] = np.take(self.impute_median_, inds[1])

        # Scale using fitted scaler; then drop constants via keep_mask_
        X_scaled_full = self.scaler_.transform(X).astype(self.dtype, copy=False)
        X_scaled = X_scaled_full[:, self.keep_mask_]

        # Optional weights
        if self.feature_weights is not None and len(self.feature_columns_) == self.feature_weights.shape[0]:
            X_scaled = (X_scaled * np.asarray(self.feature_weights)[self.keep_mask_]).astype(self.dtype, copy=False)

        X_l2 = _l2_normalize_rows_dense(X_scaled.astype(np.float64))
        return X_scaled, X_l2

    # ============================================================
    # Internals - Text
    # ============================================================
    def _preprocess_text_fit(
        self,
        corpus: Iterable[str],
        vectorizer: str,
        text_metadata: Optional[pd.DataFrame] = None
    ) -> Tuple[ArrayLike, ArrayLike]:
        """
        Preprocess text with optional metadata for stereotypes.

        Parameters
        ----------
        corpus : Iterable[str]
            Text documents
        vectorizer : str
            Vectorization method
        text_metadata : pd.DataFrame, optional
            External document-level properties (e.g., relevance scores, timestamps)
        """
        if vectorizer != "tfidf":
            raise NotImplementedError("Only TF-IDF is currently supported.")
        if sp is None:
            raise ImportError("scipy is required for text path.")

        corpus_list = list(corpus)
        n_docs = len(corpus_list)

        # Store metadata if provided
        if text_metadata is not None:
            if len(text_metadata) != n_docs:
                raise ValueError(
                    f"text_metadata length ({len(text_metadata)}) must match "
                    f"corpus length ({n_docs})"
                )
            self.text_metadata_ = text_metadata.copy()
        else:
            self.text_metadata_ = None

        # Fit TF-IDF vectorizer
        self.vectorizer_ = TfidfVectorizer()
        X_tfidf = self.vectorizer_.fit_transform(corpus_list)

        # Compute keyword-based stereotype if specified
        if self.stereotype_keywords is not None:
            self.stereotype_keyword_scores_ = self._compute_keyword_scores(
                X_tfidf, corpus_list, self.stereotype_keywords
            )
        else:
            self.stereotype_keyword_scores_ = None

        X_scaled_sp = _sparse_minmax_0_1_nonneg(X_tfidf)
        X_l2 = _sparse_l2_normalize_rows(X_scaled_sp)

        return X_scaled_sp, X_l2

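    # Sketch of the fit/transform asymmetry used above and below (illustrative):
    # the vectorizer is fitted once and then reused, so transform-time documents
    # are projected onto the training vocabulary and unseen terms are ignored.
    #
    #     from sklearn.feature_extraction.text import TfidfVectorizer
    #
    #     vec = TfidfVectorizer()
    #     X_train = vec.fit_transform(["red apples", "green pears"])
    #     X_new = vec.transform(["red dragonfruit"])   # "dragonfruit" not in vocab
    #     # X_new has the same number of columns as X_train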
    def _preprocess_text_transform(self, corpus: Iterable[str]) -> Tuple[ArrayLike, ArrayLike]:
        if self.vectorizer_ is None:
            raise RuntimeError("Call fit_text first.")
        if sp is None:
            raise ImportError("scipy is required for text path.")
        X_tfidf = self.vectorizer_.transform(list(corpus))
        X_scaled_sp = _sparse_minmax_0_1_nonneg(X_tfidf)
        X_l2 = _sparse_l2_normalize_rows(X_scaled_sp)
        return X_scaled_sp, X_l2


    # ============================================================
    # Archetypal Analysis Methods (NEW in v0.7)
    # ============================================================

    def _fit_archetypal_aa(self, X_scaled: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        True archetypal analysis with PCHA (primary) and ConvexHull (fallback).

        MEMORY OPTIMIZED: Respects configured dtype while preserving input dtype when needed.
        """
        n_samples, n_features = X_scaled.shape

        # OPTIMIZED: Use configured dtype, but respect input if it's float64
        input_dtype = X_scaled.dtype
        if input_dtype == np.float64:
            target_dtype = np.float64  # Preserve float64 if input is float64
        elif self.dtype == 'float32':
            target_dtype = np.float32
        else:
            target_dtype = np.float64

        # Determine effective k
        k_max = min(n_samples, n_features)
        k_eff = min(self.nmf_rank, k_max)

        # Try PCHA first (stable in high dimensions)
        if PCHA is not None and k_eff >= 2:
            try:
                # PCHA requires float64 internally
                X_T = X_scaled.astype(np.float64).T.copy()
                X_min = X_T.min()
                if X_min < 0:
                    X_T = X_T - X_min + 1e-10

                if self.verbose:
                    print(f"  Computing {k_eff} archetypes using PCHA (stable in {n_features}D)...")

                XC, S, C, SSE, varexpl = PCHA(X_T, noc=k_eff, delta=0.0)

                if self.verbose:
                    print(f"  PCHA converged, variance explained: {varexpl:.2%}")

                # Convert to ndarray (PCHA returns matrix objects)
                W = np.asarray(S.T, dtype=target_dtype)
                H = np.asarray(XC.T, dtype=target_dtype)

                # Validate dimensions with detailed error messages
                if W.shape != (n_samples, k_eff):
                    raise ValueError(f"PCHA W shape error: got {W.shape}, expected ({n_samples}, {k_eff})")
                if H.shape != (k_eff, n_features):
                    raise ValueError(f"PCHA H shape error: got {H.shape}, expected ({k_eff}, {n_features})")

                self.nmf_model_ = None
                self.reconstruction_error_ = float(SSE)
                self.n_archetypes_ = k_eff
                return W, H

            except Exception as e:
                if self.verbose:
                    print(f"  PCHA failed ({e}), trying ConvexHull")

        # Try ConvexHull fallback (low dimensions only)
        if ConvexHull is not None and cdist is not None and n_features <= 20:
            try:
                # ConvexHull needs float64
                X_hull = X_scaled.astype(np.float64)
                hull = ConvexHull(X_hull)
                boundary_indices = np.unique(hull.simplices.ravel())
                n_archetypes = len(boundary_indices)

                if self.verbose:
                    print(f"  Found {n_archetypes} archetypes on convex hull")

                W = np.zeros((n_samples, n_archetypes), dtype=target_dtype)
                for i in range(n_samples):
                    point = X_hull[i:i+1]
                    boundary_points = X_hull[boundary_indices]
                    distances = cdist(point, boundary_points).ravel()
                    weights = 1.0 / (distances + 1e-6)
                    W[i, :] = weights / weights.sum()

                H = np.asarray(X_scaled[boundary_indices], dtype=target_dtype)
                self.nmf_model_ = None
                self.reconstruction_error_ = None
                self.n_archetypes_ = n_archetypes
                return W, H
            except Exception as e:
                if self.verbose:
                    print(f"  ConvexHull failed ({e}), using NMF")

        # Final fallback: NMF
        if self.verbose:
            print(f"  Using NMF fallback")
        return self._fit_archetypal_nmf(X_scaled)

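    # Hedged sketch of the primary PCHA path above, assuming the optional
    # py_pcha package is where the module-level PCHA symbol comes from.
    # PCHA factors the (features x samples) matrix into archetype coordinates
    # XC and per-sample mixture weights S, mirroring the call made above.
    #
    #     # from py_pcha import PCHA          # optional dependency (assumed)
    #
    #     X = np.random.rand(200, 5)          # (samples, features) in [0, 1]
    #     XC, S, C, SSE, varexpl = PCHA(X.T, noc=4, delta=0.0)
    #     W = np.asarray(S.T)                 # (200, 4) archetype weights per sample
    #     H = np.asarray(XC.T)                # (4, 5) archetype coordinates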
    def _fit_archetypal_nmf(self, X_scaled: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Fast NMF-based approximation of archetypes.

        MEMORY OPTIMIZED: Respects configured dtype.
        """
        # Ensure non-negative (NMF requirement)
        X_nonneg = np.maximum(X_scaled.astype(np.float64), 0)

        # Determine effective rank
        k_eff = min(self.nmf_rank, X_nonneg.shape[0], X_nonneg.shape[1])

        # OPTIMIZED: Determine target dtype
        input_dtype = X_scaled.dtype
        if input_dtype == np.float64:
            target_dtype = np.float64
        elif self.dtype == 'float32':
            target_dtype = np.float32
        else:
            target_dtype = np.float64

        if self.verbose:
            print(f"\nFitting archetypes: NMF (k={k_eff})")

        # Fit NMF with convergence warning suppressed
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=ConvergenceWarning)
            nmf = NMF(
                n_components=k_eff,
                init='nndsvd',
                max_iter=self.max_iter_nmf,
                tol=self.tol_nmf,
                random_state=self.random_state
            )
            W = nmf.fit_transform(X_nonneg)
            H = nmf.components_

        # Store model and metadata
        self.nmf_model_ = nmf
        self.reconstruction_error_ = float(nmf.reconstruction_err_)
        self.n_archetypes_ = k_eff

        # OPTIMIZED: Ensure output matches target dtype
        W = W.astype(target_dtype, copy=False)
        H = H.astype(target_dtype, copy=False)

        return W, H

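    # Minimal standalone sketch of the NMF approximation above (illustrative;
    # X is assumed non-negative and scaled to [0, 1]): rows of H act as
    # approximate archetypes and rows of W as per-sample loadings.
    #
    #     from sklearn.decomposition import NMF
    #
    #     X = np.random.rand(100, 8)
    #     nmf = NMF(n_components=4, init='nndsvd', max_iter=500, random_state=0)
    #     W = nmf.fit_transform(X)          # (100, 4) loadings
    #     H = nmf.components_               # (4, 8) archetype-like components
    #     err = nmf.reconstruction_err_     # Frobenius reconstruction error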
    # ============================================================
    # Internals - Fit Components (NMF + Prototypes)
    # ============================================================
    def _fit_components(self, X_scaled: ArrayLike, X_l2: ArrayLike, index: pd.Index) -> None:
        """
        Fit archetypal and prototypical components.

        MEMORY OPTIMIZED: Explicit cleanup of large temporaries.

        Parameters
        ----------
        X_scaled : ArrayLike
            Scaled feature matrix [0, 1]
        X_l2 : ArrayLike
            L2-normalized feature matrix
        index : pd.Index
            Sample index
        """
        # ---- ARCHETYPAL ANALYSIS (NMF or AA)
        X_euc = X_scaled.toarray().astype(np.float64, copy=False) \
            if (sp is not None and sp.isspmatrix(X_scaled)) else np.asarray(X_scaled, dtype=np.float64)

        if self.verbose:
            method_name = "Archetypal Analysis (PCHA+ConvexHull)" if self.archetypal_method == 'aa' else "NMF Approximation"
            print(f"\nFitting archetypal: {method_name}")

        # Call appropriate method
        if self.archetypal_method == 'aa':
            W, H = self._fit_archetypal_aa(X_euc)
        else:  # 'nmf'
            W, H = self._fit_archetypal_nmf(X_euc)

        # Store with validation and correct dtype
        input_dtype = X_euc.dtype
        if input_dtype == np.float64:
            target_dtype = np.float64
        elif self.dtype == 'float32':
            target_dtype = np.float32
        else:
            target_dtype = np.float64

        self.W_ = W.astype(target_dtype, copy=False)
        self.H_ = H.astype(target_dtype, copy=False)
        self.n_archetypes_ = self.H_.shape[0]

        # Final validation
        n_samples, n_features = X_euc.shape
        assert self.W_.shape == (n_samples, self.n_archetypes_), \
            f"W_ dimension mismatch: {self.W_.shape} vs ({n_samples}, {self.n_archetypes_})"
        assert self.H_.shape == (self.n_archetypes_, n_features), \
            f"H_ dimension mismatch: {self.H_.shape} vs ({self.n_archetypes_}, {n_features})"

        # MEMORY CLEANUP: Free W, H temporaries (we've stored them in self.W_, self.H_)
        _cleanup_memory(W, H)

        if self.verbose:
            print(f"  Stored: W={self.W_.shape}, H={self.H_.shape}, n_archetypes={self.n_archetypes_}")

        # ---- Prepare scaled dense & L2 copies
        X_euc = X_scaled.toarray().astype(np.float64, copy=False) \
            if (sp is not None and sp.isspmatrix(X_scaled)) else np.asarray(X_scaled, dtype=np.float64)
        Xl2 = X_l2.toarray().astype(np.float64, copy=False) \
            if (sp is not None and sp.isspmatrix(X_l2)) else np.asarray(X_l2, dtype=np.float64)
        n = X_euc.shape[0]

        # ---- Helper: archetypal "cornerness" score
        def _corner_scores(Xe: np.ndarray) -> np.ndarray:
            eps = 1e-12
            col_min, col_max = Xe.min(axis=0), Xe.max(axis=0)
            hits_edge = (col_min <= eps) & (col_max >= 1.0 - eps)
            idxs = np.where(hits_edge)[0]
            if idxs.size >= 2:
                var = Xe[:, idxs].var(axis=0)
                take = idxs[np.argsort(-var)[:2]]
            else:
                var = Xe.var(axis=0)
                take = np.argsort(-var)[:2] if Xe.shape[1] >= 2 else np.array([0])
            X2 = Xe[:, take] if take.size else Xe[:, :1]
            m = np.minimum(X2, 1.0 - X2)
            dmin = np.sqrt(np.sum(m * m, axis=1))
            denom = math.sqrt(X2.shape[1]) if X2.shape[1] >= 1 else 1.0
            return 1.0 - np.clip(dmin / denom, 0.0, 1.0)

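        # Worked example of the cornerness score above (illustrative values):
        # on the two selected [0, 1] columns, a sample's distance to the
        # nearest corner of the unit square is dmin = ||min(x, 1 - x)||_2,
        # normalized by sqrt(2); a sample at (0.0, 1.0) scores 1.0 and a
        # sample at the center (0.5, 0.5) scores 0.5.
        #
        #     x = np.array([[0.0, 1.0], [0.5, 0.5]])
        #     m = np.minimum(x, 1.0 - x)                    # [[0, 0], [0.5, 0.5]]
        #     score = 1.0 - np.sqrt((m * m).sum(axis=1)) / np.sqrt(2.0)
        #     # -> [1.0, 0.5]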
        # ---- Helper: kNN density (cosine)
        def _knn_density_cosine(Xl2_arr: np.ndarray, k: int = 10, clip_neg: bool = True) -> np.ndarray:
            S = Xl2_arr @ Xl2_arr.T
            if clip_neg:
                S[S < 0.0] = 0.0
            np.fill_diagonal(S, 0.0)
            k = max(1, min(k, max(1, n - 1)))
            topk = np.partition(S, -k, axis=1)[:, -k:]
            dens = topk.mean(axis=1)
            m = dens.mean()
            return dens / m if m > 0 else np.ones_like(dens)

        # ---- Build forbidden set from top archetypal (if enabled)
        disallow_overlap = bool(getattr(self, "disallow_overlap", False))
        overlap_alpha = float(getattr(self, "overlap_alpha", 0.0))
        forbidden = set()
        if disallow_overlap and overlap_alpha > 0.0:
            corner = _corner_scores(X_euc)
            m = max(1, min(n - 1, int(math.ceil(overlap_alpha * n))))
            order = np.argsort(-corner)
            forbidden = set(order[:m])

        # Determine if density weighting is enabled
        density_weighted_fl = bool(getattr(self, "density_weighted_fl", False))
        density_k = int(getattr(self, "density_k", 10))
        density_clip_neg = bool(getattr(self, "density_clip_neg", True))

        # ---- Compute kNN density for prototype selection
        dens = _knn_density_cosine(Xl2, k=density_k, clip_neg=density_clip_neg)
        weights = dens if density_weighted_fl else None

        # ---- Prototypes via CELF with optional density weighting
        if self.verbose:
            print(f"\nFitting prototypes: Facility Location (k={self.n_prototypes})")

        # Run facility location selector (it handles similarity matrix internally)
        selector = FacilityLocationSelector(
            n_prototypes=self.n_prototypes,
            deterministic=self.deterministic,
            speed_mode=self.speed_mode,
            verbose=self.verbose
        )
        P_idx, mg = selector.select(Xl2, weights=weights, forbidden=forbidden)

        # Optional auto-k (Kneedle)
        knee = None
        if self.auto_n_prototypes == "kneedle" and mg.size >= 2:
            knee = self._kneedle(mg)
            if knee is not None and knee > 0:
                P_idx = P_idx[:knee]
                mg = mg[:knee]

        self.prototype_indices_ = P_idx
        self.prototype_rows_ = index.to_numpy()[P_idx]
        self.marginal_gains_ = mg
        self.knee_ = knee

        # Detect knee in marginal gains
        if len(mg) > 2:
            diffs = np.diff(mg)
            if len(diffs) > 1:
                diffs2 = np.diff(diffs)
                self.knee_ = int(np.argmax(np.abs(diffs2)) + 1)
            else:
                self.knee_ = 1
        else:
            self.knee_ = len(mg)

        # Training-time assignments & coverage
        best_cos, proto_label = self._assignments_cosine(Xl2, P_idx)
        self.assignments_ = proto_label
        self.coverage_ = best_cos

        if self.verbose:
            print(f"  Selected {len(P_idx)} prototypes, knee at {self.knee_}")

        # ---- Stereotypes (verbose output)
        if self.verbose:
            if self.stereotype_column is not None:
                target_str = f"'{self.stereotype_target}'" if isinstance(self.stereotype_target, str) else f"{self.stereotype_target}"
                print(f"\nStereotypical configuration:")
                print(f"  Target column: '{self.stereotype_column}'")
                print(f"  Target value: {target_str}")

                # Show target distribution if we have the data
                if hasattr(self, '_df_original_fit') and self.stereotype_column in self._df_original_fit.columns:
                    stereo_vals = self._df_original_fit[self.stereotype_column]
                    print(f"  Column range: [{stereo_vals.min():.2f}, {stereo_vals.max():.2f}]")

                    if isinstance(self.stereotype_target, str):
                        if self.stereotype_target == 'max':
                            print(f"  Targeting samples with maximum {self.stereotype_column}")
                        elif self.stereotype_target == 'min':
                            print(f"  Targeting samples with minimum {self.stereotype_column}")
                    else:
                        distance_to_target = abs(stereo_vals - self.stereotype_target).mean()
                        print(f"  Mean distance to target: {distance_to_target:.2f}")
            else:
                print(f"\nStereotypical: Not configured (using feature extremeness)")

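    # Worked example of the second-derivative knee detection used in
    # _fit_components above (illustrative numbers): for marginal gains
    # [5.0, 3.0, 1.0, 0.8, 0.7], the first differences are [-2.0, -2.0, -0.2, -0.1],
    # the second differences are [0.0, 1.8, 0.1], and argmax(|diff2|) + 1 = 2,
    # i.e. the gain curve flattens after the second prototype.
    #
    #     mg = np.array([5.0, 3.0, 1.0, 0.8, 0.7])
    #     knee = int(np.argmax(np.abs(np.diff(np.diff(mg)))) + 1)   # -> 2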
    # ============================================================
    # Internals - Scoring with fitted artifacts
    # ============================================================
    def _score_with_fitted(
        self,
        X_scaled: ArrayLike,
        X_l2: ArrayLike,
        index: pd.Index,
        stereotype_source: Optional[pd.Series] = None
    ) -> pd.DataFrame:
        """
        Score data with fitted artifacts.

        CRITICAL: This method must handle dimension matching correctly for transform.
        MEMORY OPTIMIZED: Cleanup large temporaries during transform.
        """
        if (self.W_ is None or self.H_ is None) or self.prototype_indices_ is None:
            raise RuntimeError("Call fit first")

        # Validate stored dimensions
        n_archetypes = self.n_archetypes_
        n_features_model = self.H_.shape[1]

        # ---- Archetypal projections
        X_for_transform = X_scaled.astype(np.float64) if (sp is not None and sp.isspmatrix(X_scaled)) \
            else np.asarray(X_scaled, dtype=np.float64)

        n_samples_transform = X_for_transform.shape[0]
        n_features_transform = X_for_transform.shape[1]

        # CRITICAL VALIDATION
        if n_features_transform != n_features_model:
            raise ValueError(
                f"Feature dimension mismatch: transform data has {n_features_transform} features, "
                f"but model was trained with {n_features_model} features"
            )

        if self.nmf_model_ is not None:
            # NMF method: use fitted model to transform
            W = self.nmf_model_.transform(X_for_transform)
        else:
            # AA method: compute weights from H using least squares
            H = self.H_

            # Validate H dimensions before computation
            assert H.shape == (n_archetypes, n_features_model), \
                f"H dimension error: {H.shape} vs ({n_archetypes}, {n_features_model})"

            HHT = H @ H.T
            assert HHT.shape == (n_archetypes, n_archetypes), \
                f"HHT dimension error: {HHT.shape} vs ({n_archetypes}, {n_archetypes})"

            # Regularized inverse
            HHT_inv = np.linalg.pinv(HHT + 1e-6 * np.eye(HHT.shape[0]))

            # Matrix multiplication with dimension checking
            W = X_for_transform @ H.T @ HHT_inv

            # MEMORY CLEANUP: Free intermediate matrices
            _cleanup_memory(HHT, HHT_inv)

        # Ensure non-negative
        W = np.maximum(W, 0)

        # Validate W dimensions
        assert W.shape == (n_samples_transform, n_archetypes), \
            f"W dimension error: {W.shape} vs ({n_samples_transform}, {n_archetypes})"

        # Normalize W
        W_row_sum = W.sum(axis=1, keepdims=True)
        W_row_sum[W_row_sum == 0.0] = 1.0
        W_norm = W / W_row_sum
        arch_wmax = W_norm.max(axis=1)

        # MEMORY CLEANUP: Free W_norm after extracting needed values
        _cleanup_memory(W_norm, W_row_sum)

        # Distances to archetypes
        X_dense = X_for_transform.toarray() if (sp is not None and sp.isspmatrix(X_for_transform)) \
            else np.asarray(X_for_transform)

        dists_c = np.sqrt(np.maximum(
            ((X_dense[:, None, :] - self.H_[None, :, :]) ** 2).sum(axis=2),
            0.0
        ))
        arch_d_min = dists_c.min(axis=1)

        # MEMORY CLEANUP: Free distance matrix after extracting needed values
        _cleanup_memory(dists_c)

        # ---- Prototypes: cosine assignment
        P_idx = self.prototype_indices_
        best_cos, proto_label = self._assignments_cosine(X_l2, P_idx)

        # Euclidean distance to prototypes
        X_euc = X_scaled.toarray().astype(np.float64, copy=False) \
            if (sp is not None and sp.isspmatrix(X_scaled)) else np.asarray(X_scaled, dtype=np.float64)
        P_mat = X_euc[P_idx] if P_idx.max() < len(X_euc) else self.W_[P_idx]

        best_euc = _euclidean_min_to_set_dense(X_euc, P_mat, max_memory_mb=self.max_memory_mb)

        # MEMORY CLEANUP: Free P_mat after distance computation
        _cleanup_memory(P_mat)

        norm95 = np.percentile(best_euc, 95) or 1.0
        proto_d_norm95 = np.clip(best_euc / norm95, 0.0, 1.0)

        # ---- Compute ranks
        # Archetypal rank
        eps = 1e-12
        col_min = X_euc.min(axis=0)
        col_max = X_euc.max(axis=0)
        hits_edge = (col_min <= eps) & (col_max >= 1.0 - eps)
        idxs = np.where(hits_edge)[0]
        if idxs.size >= 2:
            var = X_euc[:, idxs].var(axis=0)
            take = idxs[np.argsort(-var)[:2]]
        else:
            var = X_euc.var(axis=0)
            take = np.argsort(-var)[:2] if X_euc.shape[1] >= 2 else np.array([0])
        X2 = X_euc[:, take] if take.size else X_euc[:, :1]
        m = np.minimum(X2, 1.0 - X2)
        dmin = np.sqrt(np.sum(m * m, axis=1))
        denom = math.sqrt(X2.shape[1]) if X2.shape[1] >= 1 else 1.0
        corner_score = 1.0 - np.clip(dmin / denom, 0.0, 1.0)

        archetypal_score = arch_wmax * 0.7 + corner_score * 0.3

        # MEMORY CLEANUP: Free intermediate arrays
        _cleanup_memory(X2, col_min, col_max, corner_score)

        # Prototypical rank
        prototypical_score = (1.0 - proto_d_norm95) * 0.5 + best_cos * 0.5

        # Stereotypical rank
        stereotypical_scores = self._compute_stereotypical_rank(X_scaled, index, stereotype_source)

        # ---- Build output DataFrame (only keep rank columns)
        out = pd.DataFrame(
            {
                "archetypal_rank": np.round(archetypal_score, 10),
                "prototypical_rank": np.round(prototypical_score, 10),
                "stereotypical_rank": np.round(stereotypical_scores, 10),
            },
            index=index,
        )

        # MEMORY CLEANUP: Force GC before returning (transform often called repeatedly)
        _cleanup_memory(X_dense, X_euc, W, force_gc=True)

        return out

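    # Hedged sketch of the AA-branch projection above: for archetypes H (k x d),
    # new samples X (n x d) get mixture weights via the regularized
    # least-squares solution W = X H^T (H H^T + lambda I)^(-1), then clipped to
    # be non-negative and renormalized so each row sums to 1.
    #
    #     H = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])   # 3 archetypes in 2-D
    #     X = np.array([[0.5, 0.25]])
    #     lam = 1e-6
    #     W = X @ H.T @ np.linalg.pinv(H @ H.T + lam * np.eye(3))
    #     W = np.maximum(W, 0.0)
    #     W = W / W.sum(axis=1, keepdims=True)                  # per-row convex-ish weights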
    # ------------------------------------------------------------
    def _assignments_cosine(
        self,
        X_l2: ArrayLike,
        prototype_indices: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute cosine similarity assignments to prototypes.

        OPTIMIZED: Uses JIT-compiled function for 2-3× speedup.
        """
        # Convert to dense if needed
        Xl2_dense = X_l2.toarray() if (sp is not None and sp.isspmatrix(X_l2)) else np.asarray(X_l2, dtype=np.float64)
        P_l2 = Xl2_dense[prototype_indices]

        n_samples = Xl2_dense.shape[0]
        n_protos = len(prototype_indices)

        # OPTIMIZED: Use JIT for small to medium datasets
        if n_samples * n_protos < 1000000:
            sims = _cosine_similarity_jit(Xl2_dense, P_l2)
        else:
            # For very large datasets, use numpy (better for huge matrices)
            sims = Xl2_dense @ P_l2.T
            np.maximum(sims, 0.0, out=sims)

        best_idx = sims.argmax(axis=1).astype(int)
        best_sim = sims[np.arange(len(sims)), best_idx]

        return best_sim, best_idx

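    # Plain-NumPy sketch of the assignment step above (illustrative): with
    # L2-normalized rows, the dot product X @ P.T is the cosine similarity, and
    # each sample is assigned to its most similar prototype.
    #
    #     X = np.array([[1.0, 0.0], [0.6, 0.8]])          # already L2-normalized
    #     P = X[[0]]                                        # one prototype: row 0
    #     sims = X @ P.T                                    # [[1.0], [0.6]]
    #     best_idx = sims.argmax(axis=1)                    # [0, 0]
    #     best_sim = sims[np.arange(len(sims)), best_idx]   # [1.0, 0.6]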
    def _kneedle(self, gains: np.ndarray) -> Optional[int]:
        U = np.cumsum(gains)
        if U[-1] == 0.0:
            return None
        U_norm = U / U[-1]
        k = gains.size
        x = np.linspace(1 / k, 1.0, k)
        diff = U_norm - x
        return int(np.argmax(diff)) + 1

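    # Worked example of the Kneedle-style rule above (illustrative numbers):
    # for gains [5, 3, 1, 0.5, 0.4], the normalized cumulative curve is
    # [0.505, 0.808, 0.909, 0.960, 1.0], the reference line is
    # [0.2, 0.4, 0.6, 0.8, 1.0], and the gap [0.305, 0.408, 0.309, 0.160, 0.0]
    # peaks at the second prototype, so _kneedle returns 2.
    #
    #     gains = np.array([5.0, 3.0, 1.0, 0.5, 0.4])
    #     U = np.cumsum(gains) / gains.sum()
    #     knee = int(np.argmax(U - np.linspace(1 / 5, 1.0, 5))) + 1   # -> 2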
    def _record_settings(self, tc: _ThreadControl):
        self.settings_ = {
            "deterministic": bool(self.deterministic),
            "speed_mode": bool(self.speed_mode),
            "thread_limit": tc.effective_limit,
            "random_state": int(self.random_state),
            "dtype": str(self.dtype),
            "max_memory_mb": int(self.max_memory_mb),
        }


__all__ = [
    "DataTypical",
    "FacilityLocationSelector",
    "DataTypicalError",
    "ConfigError",
    "MemoryBudgetError",
]