structural-topic-model 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystm/__init__.py +31 -0
- pystm/_estep.py +159 -0
- pystm/_mnreg.py +221 -0
- pystm/_mstep.py +96 -0
- pystm/_spectral.py +151 -0
- pystm/_utils.py +41 -0
- pystm/diagnostics.py +168 -0
- pystm/effects.py +203 -0
- pystm/model_selection.py +166 -0
- pystm/stm.py +443 -0
- structural_topic_model-0.2.0.dist-info/METADATA +234 -0
- structural_topic_model-0.2.0.dist-info/RECORD +14 -0
- structural_topic_model-0.2.0.dist-info/WHEEL +4 -0
- structural_topic_model-0.2.0.dist-info/licenses/LICENSE +21 -0
pystm/diagnostics.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Model diagnostics (ports of semanticCoherence.R, exclusivity.R,
|
|
2
|
+
residuals.R and topicCorr.R)."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from scipy import stats
|
|
8
|
+
from scipy.sparse import csc_matrix, csr_matrix, issparse
|
|
9
|
+
from scipy.stats import rankdata
|
|
10
|
+
|
|
11
|
+
from ._utils import safelog
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _as_csr(X):
|
|
15
|
+
return csr_matrix(X) if not issparse(X) else X.tocsr()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _semcoh_one_beta(X, logbeta, M):
|
|
19
|
+
"""Semantic coherence per topic for one beta (semCoh1beta in R)."""
|
|
20
|
+
K = logbeta.shape[0]
|
|
21
|
+
top_words = np.argsort(-logbeta, axis=1)[:, :M]
|
|
22
|
+
wordlist, positions = np.unique(top_words, return_inverse=True)
|
|
23
|
+
labels = positions.reshape(K, M)
|
|
24
|
+
|
|
25
|
+
sub = csc_matrix(X[:, wordlist])
|
|
26
|
+
sub.data = np.minimum(sub.data, 1.0) # binarize
|
|
27
|
+
cross = (sub.T @ sub).toarray() # document co-occurrence counts
|
|
28
|
+
|
|
29
|
+
result = np.zeros(K)
|
|
30
|
+
for k in range(K):
|
|
31
|
+
idx = labels[k]
|
|
32
|
+
for a in range(M):
|
|
33
|
+
for b in range(M):
|
|
34
|
+
m_, l_ = idx[a], idx[b]
|
|
35
|
+
if m_ > l_:
|
|
36
|
+
result[k] += (np.log(0.01 + cross[m_, l_])
|
|
37
|
+
- np.log(cross[l_, l_] + 0.01))
|
|
38
|
+
return result
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def semantic_coherence(model, X, content=None, M=10):
    """Semantic coherence (Mimno et al. 2011) per topic.

    Higher is better; the metric checks that a topic's top ``M`` words
    co-occur within documents. For content covariate models pass the
    content labels; the score is then the document-weighted average over
    the aspect-specific betas, as in the R package.

    Parameters
    ----------
    model : fitted model
        Reads ``aspect_components_``, ``components_``, ``content_levels_``
        and ``n_components``.
    X : array-like or sparse matrix, one row per document.
    content : array-like, optional
        One content label per row of ``X``; required when the model has
        aspect-specific components.
    M : int, default=10
        Number of top words per topic.

    Returns
    -------
    ndarray with one coherence score per topic.

    Raises
    ------
    ValueError
        If the model has a content covariate and ``content`` is missing,
        has the wrong length, or contains labels that are not among
        ``model.content_levels_``.
    """
    X = _as_csr(X)
    if model.aspect_components_ is None:
        return _semcoh_one_beta(X, safelog(model.components_), M)
    if content is None:
        raise ValueError(
            "The model was fitted with a content covariate; pass the "
            "matching content labels."
        )
    levels = np.asarray(model.content_levels_)
    labels = np.asarray(content).ravel()
    if labels.shape[0] != X.shape[0]:
        raise ValueError(
            "content must contain exactly one label per row of X."
        )
    # np.searchsorted returns insertion points, so an unknown label would be
    # mapped to a neighbouring level (or to no level) without any error.
    if not np.isin(labels, levels).all():
        raise ValueError(
            "content contains labels that are not among the model's "
            "content levels."
        )
    # NOTE(review): assumes content_levels_ is sorted, as np.searchsorted
    # requires -- confirm against the fitting code.
    index = np.searchsorted(levels, labels)
    result = np.zeros(model.n_components)
    for a in range(len(levels)):
        subset = index == a
        if not subset.any():
            continue
        logbeta = safelog(model.aspect_components_[a])
        # Weight each aspect's score by the number of documents it covers.
        result += _semcoh_one_beta(X[subset], logbeta, M) * subset.sum()
    return result / X.shape[0]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def exclusivity(model, M=10, frexw=0.7):
    """Per-topic exclusivity score built from FREX (exclusivity in R).

    Each word's FREX value in a topic is the weighted harmonic mean of two
    within-topic ranks, both scaled by the vocabulary size: the rank of the
    word's share across topics (weight ``frexw``) and the rank of its
    probability in the topic (weight ``1 - frexw``). A topic's score is the
    sum of the FREX values of its ``M`` most probable words.

    Raises ``ValueError`` for content covariate models, for which the
    measure is not defined (matching the R package).
    """
    if model.aspect_components_ is not None:
        raise ValueError(
            "Exclusivity calculation is only designed for models without "
            "content covariates."
        )
    word_topic = model.components_.T  # (V, K)
    n_words, n_topics = word_topic.shape
    # Share of each word's total probability that falls in each topic.
    share = word_topic / word_topic.sum(axis=1, keepdims=True)

    exclusivity_rank = rankdata(share, axis=0) / n_words
    frequency_rank = rankdata(word_topic, axis=0) / n_words
    frex = 1.0 / (frexw / exclusivity_rank + (1 - frexw) / frequency_rank)

    top = np.argsort(-word_topic, axis=0)[:M]
    scores = np.empty(n_topics)
    for k in range(n_topics):
        scores[k] = frex[top[:, k], k].sum()
    return scores
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def check_residuals(model, X, content=None, tol=0.01):
    """Multinomial dispersion of the residuals (Taddy 2012).

    Under a correctly specified model the dispersion is 1; values above 1
    suggest the number of topics is too small.

    Parameters
    ----------
    model : fitted model
        Reads ``n_components``, ``theta_``, ``content_levels_`` and calls
        ``_beta_list()``.
    X : array-like or sparse matrix
        Must have as many rows as ``model.theta_``.
    content : array-like, optional
        One content label per document; required when the model has content
        levels.
    tol : float, default=0.01
        An expected count ``q * m`` above this threshold adds one to the
        count ``Nhat`` that enters the degrees of freedom.

    Returns
    -------
    dict with ``dispersion``, ``pvalue`` and ``df``.

    Raises
    ------
    ValueError
        If ``X`` does not match the fitted corpus, or if ``content`` is
        required but missing, has the wrong length, or contains labels that
        are not among ``model.content_levels_``.
    """
    X = _as_csr(X)
    n, V = X.shape
    K = model.n_components
    theta = model.theta_
    if theta.shape[0] != n:
        raise ValueError("X must be the corpus the model was fitted on.")

    beta = model._beta_list()
    if model.content_levels_ is not None:
        if content is None:
            raise ValueError(
                "The model was fitted with a content covariate; pass the "
                "matching content labels."
            )
        levels = np.asarray(model.content_levels_)
        labels = np.asarray(content).ravel()
        if labels.shape[0] != n:
            raise ValueError(
                "content must contain exactly one label per row of X."
            )
        # np.searchsorted returns insertion points; without this check an
        # unknown label would silently select a neighbouring level's beta.
        if not np.isin(labels, levels).all():
            raise ValueError(
                "content contains labels that are not among the model's "
                "content levels."
            )
        # NOTE(review): assumes content_levels_ is sorted, as
        # np.searchsorted requires -- confirm against the fitting code.
        index = np.searchsorted(levels, labels)
    else:
        index = np.zeros(n, dtype=np.int64)

    d = n * (K - 1) + K * (V - 1)
    D = 0.0
    Nhat = 0
    for i in range(n):
        row = X.getrow(i)
        q = theta[i] @ beta[index[i]]  # (V,)
        m = row.sum()
        Nhat += int((q * m > tol).sum())
        if m == 0:
            # An empty document makes every term below 0/0; skip it so one
            # such row does not turn the whole statistic into NaN.
            continue
        x = np.zeros(V)
        x[row.indices] = row.data
        denom = m * q * (1 - q)
        D += ((x**2 - 2 * x * q * m) / denom).sum() + (m * q / (1 - q)).sum()

    df = Nhat - V - d
    # Divide as a NumPy scalar so that df == 0 yields inf/nan instead of a
    # ZeroDivisionError when D is still a plain Python float.
    with np.errstate(divide="ignore", invalid="ignore"):
        dispersion = np.float64(D) / df
    pvalue = stats.chi2.sf(D, df) if df > 0 else np.nan
    return {"dispersion": dispersion, "pvalue": pvalue, "df": df}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class TopicCorrelations:
    """Container for the output of :func:`topic_corr` (class topicCorr in R).

    Attributes
    ----------
    cor : ndarray (K, K)
        Correlation matrix in which entries whose absolute value is below
        ``cutoff`` are replaced by zero.
    posadj : ndarray (K, K)
        Adjacency matrix marking positive correlations above the cutoff.
    poscor : ndarray (K, K)
        Correlations masked to the positive adjacency structure.
    """

    def __init__(self, cor, posadj, poscor):
        # The three matrices are stored exactly as they were passed in.
        self.cor, self.posadj, self.poscor = cor, posadj, poscor
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def topic_corr(model, cutoff=0.01):
    """Build the topic correlation graph from theta (topicCorr "simple").

    Correlates the columns of ``model.theta_`` and thresholds the result at
    ``cutoff``. The R package's "huge" method (semiparametric graphical
    model selection) depends on the huge package and is not implemented.

    Returns a :class:`TopicCorrelations` instance.
    """
    correlations = np.corrcoef(model.theta_, rowvar=False)
    is_positive_edge = correlations > cutoff
    is_strong = np.abs(correlations) > cutoff

    adjacency = is_positive_edge.astype(float)
    return TopicCorrelations(
        cor=np.where(is_strong, correlations, 0.0),
        posadj=adjacency,
        poscor=correlations * adjacency,
    )
|
pystm/effects.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""Covariate effect estimation (port of estimateEffect.R / thetaPosterior.R).
|
|
2
|
+
|
|
3
|
+
Regressions where topic proportions are the outcome, propagating the
|
|
4
|
+
measurement uncertainty of theta via the method of composition: draw
|
|
5
|
+
theta from the variational posterior, run the OLS, repeat, then pool.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import warnings
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from scipy import stats
|
|
14
|
+
from sklearn.utils import check_random_state
|
|
15
|
+
|
|
16
|
+
from ._utils import row_softmax
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _global_sigma(model):
|
|
20
|
+
"""Global approximation to the per-document posterior covariance.
|
|
21
|
+
|
|
22
|
+
Subtracts the contribution of deviations from the prior mean out of
|
|
23
|
+
the topic covariance, leaving the (average) variational covariance
|
|
24
|
+
(thetapost.global in R).
|
|
25
|
+
"""
|
|
26
|
+
lambda_ = model.eta_
|
|
27
|
+
mu = model.mu_
|
|
28
|
+
diff = lambda_ - (mu[None, :] if mu.ndim == 1 else mu)
|
|
29
|
+
covariance = (diff.T @ diff) / lambda_.shape[0]
|
|
30
|
+
sigma = model.sigma_ - covariance
|
|
31
|
+
# guard against indefiniteness from the subtraction
|
|
32
|
+
evals, evecs = np.linalg.eigh(sigma)
|
|
33
|
+
if evals[0] <= 0:
|
|
34
|
+
evals = np.maximum(evals, 1e-10)
|
|
35
|
+
sigma = (evecs * evals) @ evecs.T
|
|
36
|
+
return sigma
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _draw_theta(model, rng):
    """Sample one theta per document using the Global approximation.

    Standard normal noise is drawn from ``rng``, coloured with the Cholesky
    factor of :func:`_global_sigma` and added to ``model.eta_``. A zero
    column is appended before the rows are passed through ``row_softmax``.
    """
    factor = np.linalg.cholesky(_global_sigma(model))
    noise = rng.standard_normal(model.eta_.shape)
    sampled_eta = model.eta_ + noise @ factor.T
    zero_column = np.zeros(sampled_eta.shape[0])
    return row_softmax(np.column_stack([sampled_eta, zero_column]))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class _QRRegression:
|
|
51
|
+
"""OLS with a cached QR decomposition (qr.lm / summary.qr.lm in R)."""
|
|
52
|
+
|
|
53
|
+
def __init__(self, xmat, prior=None):
|
|
54
|
+
self.n_obs = xmat.shape[0]
|
|
55
|
+
p = xmat.shape[1]
|
|
56
|
+
if prior is not None:
|
|
57
|
+
if np.isscalar(prior):
|
|
58
|
+
prior = np.diag(np.full(p, float(prior)))
|
|
59
|
+
xmat = np.vstack([xmat, np.linalg.cholesky(prior).T])
|
|
60
|
+
if np.linalg.matrix_rank(xmat) < p:
|
|
61
|
+
warnings.warn(
|
|
62
|
+
"Covariate matrix is singular; adding a small ridge prior "
|
|
63
|
+
"(1e-5) for numerical stability.", stacklevel=3,
|
|
64
|
+
)
|
|
65
|
+
xmat = np.vstack([xmat, np.sqrt(1e-5) * np.eye(p)])
|
|
66
|
+
self.xmat = xmat
|
|
67
|
+
self.q, self.r = np.linalg.qr(xmat)
|
|
68
|
+
self.rinv = np.linalg.inv(self.r)
|
|
69
|
+
self.df_residual = xmat.shape[0] - p
|
|
70
|
+
|
|
71
|
+
def fit(self, y):
|
|
72
|
+
if y.shape[0] != self.xmat.shape[0]:
|
|
73
|
+
y = np.concatenate(
|
|
74
|
+
[y, np.zeros(self.xmat.shape[0] - y.shape[0])]
|
|
75
|
+
)
|
|
76
|
+
coef = self.rinv @ (self.q.T @ y)
|
|
77
|
+
resid = y - self.xmat @ coef
|
|
78
|
+
resvar = (resid @ resid) / self.df_residual
|
|
79
|
+
vcov = resvar * (self.rinv @ self.rinv.T)
|
|
80
|
+
return coef, vcov
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class EstimatedEffects:
    """Container returned by :func:`estimate_effect`.

    Attributes
    ----------
    parameters : dict
        For every topic index, the list of ``(coef, vcov)`` pairs obtained
        from the individual composition draws.
    topics : list of int
        0-based indices of the topics whose effects were estimated.
    n_obs : int
        Number of documents.
    n_params : int
        Number of regression coefficients (including the intercept).
    """

    def __init__(self, parameters, topics, n_obs, n_params):
        self.parameters, self.topics = parameters, topics
        self.n_obs, self.n_params = n_obs, n_params

    def summary(self, topics=None, nsim=500, random_state=None):
        """Pool the draws into coefficient tables (summary.estimateEffect).

        For each requested topic, ``nsim`` coefficient vectors are simulated
        from every stored ``(coef, vcov)`` pair and stacked. The column
        means and sample standard deviations of the stack give the estimates
        and standard errors. The p-values are two-sided t-tests with
        ``n_obs - n_params`` degrees of freedom.

        Returns a dict mapping topic index to a structured array with the
        fields ``estimate``, ``std_error``, ``t_value`` and ``p_value``,
        one row per regression coefficient. Raises ``ValueError`` for a
        topic that has no stored parameters.
        """
        rng = check_random_state(random_state)
        if topics is None:
            requested = self.topics
        else:
            requested = list(topics)

        columns = ("estimate", "std_error", "t_value", "p_value")
        dtype = [(name, float) for name in columns]
        resid_df = self.n_obs - self.n_params

        tables = {}
        for k in requested:
            if k not in self.parameters:
                raise ValueError(f"Topic {k} was not estimated.")
            draws = [
                rng.multivariate_normal(coef, vcov, size=nsim)
                for coef, vcov in self.parameters[k]
            ]
            sims = np.vstack(draws)
            pooled_mean = sims.mean(axis=0)
            pooled_sd = sims.std(axis=0, ddof=1)
            t_stat = pooled_mean / pooled_sd
            p_two_sided = 2 * stats.t.sf(np.abs(t_stat), resid_df)

            table = np.zeros(len(pooled_mean), dtype=dtype)
            values = (pooled_mean, pooled_sd, t_stat, p_two_sided)
            for name, column in zip(columns, values):
                table[name] = column
            tables[k] = table
        return tables
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def estimate_effect(model, prevalence, topics=None,
                    uncertainty="Global", nsims=25, prior=None,
                    random_state=None):
    """Regress topic proportions on covariates (estimateEffect in R).

    Parameters
    ----------
    model : fitted StructuralTopicModel
    prevalence : array-like of shape (n_samples, n_covariates)
        Design matrix of the regression. A column of ones is prepended
        unless the first column is already (numerically) all ones. It
        should normally contain (at least) the covariates used when
        fitting the model, with categorical variables already encoded
        numerically.
    topics : iterable of int, optional
        0-based indices of the topics to analyse (default: every topic).
    uncertainty : {"Global", "None"}, default="Global"
        "Global" redraws theta from the variational posterior with a
        covariance approximation shared by all documents; "None" regresses
        the MAP theta once, ignoring measurement uncertainty. (The R
        package's "Local" method is not implemented.)
    nsims : int, default=25
        Number of method-of-composition draws; only used for "Global".
    prior : float or ndarray, optional
        Ridge penalty (scalar or full precision matrix) added to the
        regression for numerical stability.
    random_state : forwarded to ``check_random_state``.

    Returns
    -------
    EstimatedEffects

    Raises
    ------
    ValueError
        If the model has no ``theta_``, ``uncertainty`` is not supported,
        a topic index is out of range, or ``prevalence`` has a different
        number of rows than ``model.theta_``.
    """
    if not hasattr(model, "theta_"):
        raise ValueError("model must be a fitted StructuralTopicModel.")
    if uncertainty not in ("Global", "None"):
        raise ValueError(
            "uncertainty must be 'Global' or 'None' ('Local' is not "
            "implemented; 'Global' is the recommended method)."
        )
    rng = check_random_state(random_state)
    n_docs, K = model.theta_.shape[0], model.theta_.shape[1]
    if topics is None:
        topics = list(range(K))
    else:
        topics = list(topics)
    for k in topics:
        if k < 0 or k >= K:
            raise ValueError(
                "topics must be 0-based indices below n_components."
            )

    design = np.asarray(prevalence, dtype=np.float64)
    if design.ndim == 1:
        design = design[:, None]
    if design.shape[0] != n_docs:
        raise ValueError(
            "prevalence has a different number of rows than the fitted "
            "documents."
        )
    has_intercept = np.allclose(design[:, 0], 1.0)
    if not has_intercept:
        design = np.column_stack([np.ones(design.shape[0]), design])

    reg = _QRRegression(design, prior=prior)
    use_map = uncertainty == "None"
    # Without measurement uncertainty a single regression on the MAP theta
    # is all there is to compute.
    n_draws = 1 if use_map else nsims

    parameters = {k: [] for k in topics}
    for _ in range(n_draws):
        theta = model.theta_ if use_map else _draw_theta(model, rng)
        for k in topics:
            parameters[k].append(reg.fit(theta[:, k]))

    return EstimatedEffects(parameters, topics,
                            n_obs=n_docs,
                            n_params=design.shape[1])
|
pystm/model_selection.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Choosing the number of topics (ports of searchK.R and heldout.R)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from scipy.sparse import csr_matrix, issparse, lil_matrix
|
|
7
|
+
from scipy.special import gammaln
|
|
8
|
+
from sklearn.utils import check_random_state
|
|
9
|
+
|
|
10
|
+
from ._utils import safelog
|
|
11
|
+
from .diagnostics import check_residuals, exclusivity, semantic_coherence
|
|
12
|
+
from .stm import StructuralTopicModel
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def make_heldout(X, N=None, proportion=0.5, random_state=None):
    """Hold out a fraction of tokens for document completion (make.heldout).

    Selects ``N`` documents and removes ``proportion`` of each one's tokens.
    Held-out tokens whose word disappears entirely from the training corpus
    are dropped (mirroring the R package's vocab remapping).

    Parameters
    ----------
    X : array-like or sparse matrix of token counts, one row per document.
    N : int, optional
        Number of documents to draw. Defaults to 10% of the documents, but
        never fewer than one.
    proportion : float, default=0.5
        Fraction of a selected document's tokens to hold out; at least one
        token is held out and at least one is left in the document.
    random_state : forwarded to ``check_random_state``.

    Returns
    -------
    dict with ``X_train`` (csr matrix), ``index`` (held-out document ids)
    and ``docs`` (list of ``(word_indices, counts)``).

    Raises
    ------
    ValueError
        If ``N`` is not between 1 and the number of documents, or if
        ``proportion`` is not strictly between 0 and 1.
    """
    X = csr_matrix(X) if not issparse(X) else X.tocsr()
    rng = check_random_state(random_state)
    n_docs, V = X.shape
    if N is None:
        # floor(10%) is zero for fewer than ten documents, which the range
        # check below would reject although the caller never chose N.
        N = max(1, int(np.floor(0.1 * n_docs)))
    if not 0 < N <= n_docs:
        raise ValueError("N must be between 1 and the number of documents.")
    if not 0 < proportion < 1:
        raise ValueError("proportion must be in (0, 1).")

    index = np.sort(rng.choice(n_docs, size=N, replace=False))
    # Coordinates of every removed count, collected so that all removals are
    # applied with one sparse subtraction instead of per-element updates.
    held_rows, held_cols, held_vals = [], [], []
    missing_docs = []
    kept_index = []
    for i in index:
        row = X.getrow(i)
        if row.indices.shape[0] < 2:
            continue  # too few distinct words to split (as in R)
        tokens = np.repeat(row.indices, row.data.astype(np.int64))
        nsamp = max(1, int(np.floor(proportion * tokens.shape[0])))
        nsamp = min(nsamp, tokens.shape[0] - 1)  # keep the doc non-empty
        held = rng.choice(tokens.shape[0], size=nsamp, replace=False)
        held_counts = np.bincount(tokens[held], minlength=V)
        words = np.flatnonzero(held_counts)
        held_rows.append(np.full(words.shape[0], i, dtype=np.int64))
        held_cols.append(words)
        held_vals.append(held_counts[words])
        missing_docs.append((words, held_counts[words].astype(np.float64)))
        kept_index.append(i)

    if kept_index:
        removed = csr_matrix(
            (
                np.concatenate(held_vals).astype(X.dtype),
                (np.concatenate(held_rows), np.concatenate(held_cols)),
            ),
            shape=X.shape,
        )
        X_train = csr_matrix(X - removed)
    else:
        X_train = csr_matrix(X.copy())
    X_train.eliminate_zeros()

    # drop held-out tokens of words that vanished from the training corpus
    train_wcounts = np.asarray(X_train.sum(axis=0)).ravel()
    final_docs, final_index = [], []
    for i, (words, counts) in zip(kept_index, missing_docs):
        keep = train_wcounts[words] > 0
        if keep.any():
            final_docs.append((words[keep], counts[keep]))
            final_index.append(i)
    return {
        "X_train": X_train,
        "index": np.asarray(final_index, dtype=np.int64),
        "docs": final_docs,
    }
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def eval_heldout(model, heldout, content=None):
    """Heldout log-likelihood by document completion (eval.heldout).

    Uses the fitted theta of each partially observed document to score
    its held-out tokens.

    Parameters
    ----------
    model : fitted model
        Reads ``theta_``, ``content_levels_`` and calls ``_beta_list()``.
    heldout : dict
        Must provide ``index`` (document ids) and ``docs`` (a list of
        ``(word_indices, counts)`` pairs), as returned by ``make_heldout``.
    content : array-like, optional
        One content label per fitted document; required when the model has
        content levels.

    Returns
    -------
    dict with ``expected_heldout`` (the mean over documents of the mean
    per-token log-probability, ignoring NaNs) and the per-document values
    ``doc_heldout``.

    Raises
    ------
    ValueError
        If ``content`` is required but missing, has a different length than
        ``model.theta_``, or contains labels that are not among
        ``model.content_levels_``.
    """
    beta = model._beta_list()
    n_fitted = model.theta_.shape[0]
    if model.content_levels_ is not None:
        if content is None:
            raise ValueError(
                "The model was fitted with a content covariate; pass the "
                "matching content labels."
            )
        levels = np.asarray(model.content_levels_)
        labels = np.asarray(content).ravel()
        if labels.shape[0] != n_fitted:
            raise ValueError(
                "content must contain exactly one label per fitted document."
            )
        # np.searchsorted returns insertion points; without this check an
        # unknown label would silently select a neighbouring level's beta.
        if not np.isin(labels, levels).all():
            raise ValueError(
                "content contains labels that are not among the model's "
                "content levels."
            )
        # NOTE(review): assumes content_levels_ is sorted, as
        # np.searchsorted requires -- confirm against the fitting code.
        index_all = np.searchsorted(levels, labels)
    else:
        index_all = np.zeros(n_fitted, dtype=np.int64)

    doc_scores = np.empty(len(heldout["docs"]))
    for j, (doc_id, (words, counts)) in enumerate(
            zip(heldout["index"], heldout["docs"])):
        logprobs = safelog(
            model.theta_[doc_id] @ beta[index_all[doc_id]][:, words]
        )
        # Repeat each word's log-probability once per held-out token so the
        # mean is taken over tokens rather than over distinct words.
        doc_scores[j] = np.repeat(logprobs, counts.astype(np.int64)).mean()
    return {
        "expected_heldout": float(np.nanmean(doc_scores)),
        "doc_heldout": doc_scores,
    }
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def search_k(X, K_values, *, prevalence=None, content=None, N=None,
             proportion=0.5, M=10, heldout_random_state=None,
             model_params=None, verbose=False):
    """Fit one model per candidate K and collect diagnostics (searchK).

    A single heldout split is created up front with ``make_heldout``; every
    model is fitted on, and evaluated against, that same split.

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
    K_values : iterable of int
        Topic numbers to evaluate.
    prevalence, content : optional
        Covariates forwarded to :meth:`StructuralTopicModel.fit`.
    N, proportion : heldout construction parameters (see make_heldout).
    M : int, default=10
        Number of top words for exclusivity / semantic coherence.
    heldout_random_state : passed to ``make_heldout`` as ``random_state``.
    model_params : dict, optional
        Extra keyword arguments for the StructuralTopicModel constructor.
    verbose : bool, default=False
        Print a progress line before each fit.

    Returns
    -------
    dict of arrays keyed by "K", "heldout", "residual", "bound",
    "lbound", "exclus", "semcoh", "em_its" (exclusivity and semantic
    coherence are omitted for content covariate models, as in R).
    """
    K_values = list(K_values)
    model_params = dict(model_params or {})
    heldout = make_heldout(X, N=N, proportion=proportion,
                           random_state=heldout_random_state)
    X_train = heldout["X_train"]
    without_content = content is None

    # Fix the key order up front; the two word-based diagnostics only exist
    # for models without a content covariate.
    columns = ["K", "heldout", "residual", "bound", "lbound"]
    if without_content:
        columns.extend(["exclus", "semcoh"])
    columns.append("em_its")
    results = {name: [] for name in columns}

    for K in K_values:
        if verbose:
            print(f"searchK: fitting K={K} ...")
        model = StructuralTopicModel(n_components=K, **model_params)
        model.fit(X_train, prevalence=prevalence, content=content)

        results["K"].append(K)
        heldout_eval = eval_heldout(model, heldout, content=content)
        results["heldout"].append(heldout_eval["expected_heldout"])
        residual_eval = check_residuals(model, X_train, content=content)
        results["residual"].append(residual_eval["dispersion"])

        best_bound = max(model.bound_)
        results["bound"].append(best_bound)
        results["lbound"].append(best_bound + gammaln(K + 1))

        if without_content:
            topic_exclusivity = exclusivity(model, M=M, frexw=0.7)
            results["exclus"].append(float(np.mean(topic_exclusivity)))
            topic_coherence = semantic_coherence(model, X_train, M=M)
            results["semcoh"].append(float(np.mean(topic_coherence)))
        results["em_its"].append(model.n_iter_)

    return {name: np.asarray(values) for name, values in results.items()}
|