structural-topic-model 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystm/__init__.py +31 -0
- pystm/_estep.py +159 -0
- pystm/_mnreg.py +221 -0
- pystm/_mstep.py +96 -0
- pystm/_spectral.py +151 -0
- pystm/_utils.py +41 -0
- pystm/diagnostics.py +168 -0
- pystm/effects.py +203 -0
- pystm/model_selection.py +166 -0
- pystm/stm.py +443 -0
- structural_topic_model-0.2.0.dist-info/METADATA +234 -0
- structural_topic_model-0.2.0.dist-info/RECORD +14 -0
- structural_topic_model-0.2.0.dist-info/WHEEL +4 -0
- structural_topic_model-0.2.0.dist-info/licenses/LICENSE +21 -0
pystm/stm.py
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
"""Structural Topic Model with a scikit-learn style API.
|
|
2
|
+
|
|
3
|
+
Port of the R ``stm`` package's core estimation routine (variational EM
|
|
4
|
+
for the logistic-normal topic model with prevalence covariates).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from scipy.sparse import csr_matrix, issparse
|
|
11
|
+
from scipy.stats import rankdata
|
|
12
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
13
|
+
from sklearn.utils import check_random_state
|
|
14
|
+
from sklearn.utils.validation import check_is_fitted
|
|
15
|
+
|
|
16
|
+
from ._estep import decompose_sigma, estep, optimize_document
|
|
17
|
+
from ._mnreg import mnreg
|
|
18
|
+
from ._mstep import opt_beta, opt_mu, opt_sigma
|
|
19
|
+
from ._spectral import spectral_init
|
|
20
|
+
from ._utils import row_softmax, safelog, to_doc_list
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class StructuralTopicModel(BaseEstimator, TransformerMixin):
|
|
24
|
+
"""Structural Topic Model (STM) with variational EM.
|
|
25
|
+
|
|
26
|
+
STM is a logistic-normal topic model in which document metadata
|
|
27
|
+
("prevalence covariates") shifts the prior mean of each document's
|
|
28
|
+
topic proportions. Without covariates the model reduces to the
|
|
29
|
+
Correlated Topic Model. The API follows
|
|
30
|
+
:class:`sklearn.decomposition.LatentDirichletAllocation`; covariates
|
|
31
|
+
are passed to :meth:`fit` / :meth:`transform` via the ``prevalence``
|
|
32
|
+
keyword.
|
|
33
|
+
|
|
34
|
+
Parameters
|
|
35
|
+
----------
|
|
36
|
+
n_components : int, default=10
|
|
37
|
+
Number of topics K (must be >= 2).
|
|
38
|
+
init : {"spectral", "random"}, default="spectral"
|
|
39
|
+
Initialization method. "spectral" is the deterministic anchor-word
|
|
40
|
+
algorithm of Arora et al. (2013), recommended by the stm authors.
|
|
41
|
+
max_iter : int, default=500
|
|
42
|
+
Maximum number of EM iterations.
|
|
43
|
+
tol : float, default=1e-5
|
|
44
|
+
EM stops when the relative change of the approximate evidence
|
|
45
|
+
lower bound falls below ``tol``.
|
|
46
|
+
sigma_prior : float, default=0.0
|
|
47
|
+
Weight in [0, 1] of regularization of the topic covariance matrix
|
|
48
|
+
towards its diagonal.
|
|
49
|
+
gamma_max_iter : int, default=1000
|
|
50
|
+
Iteration limit for the variational prevalence regression.
|
|
51
|
+
e_step_max_iter : int, default=500
|
|
52
|
+
BFGS iteration limit for each document's optimization.
|
|
53
|
+
max_vocab : int or None, default=10000
|
|
54
|
+
With spectral initialization, only this many most frequent terms
|
|
55
|
+
are used to build the gram matrix (None disables the cap).
|
|
56
|
+
content_interactions : bool, default=True
|
|
57
|
+
For content covariate models, whether to include topic-by-level
|
|
58
|
+
interaction deviations in addition to the main effects.
|
|
59
|
+
warm_start : bool, default=False
|
|
60
|
+
When True, repeated calls to :meth:`fit` continue EM from the
|
|
61
|
+
previous solution instead of re-initializing (the analogue of the
|
|
62
|
+
R package's ``model=`` restart argument). Each call runs up to
|
|
63
|
+
``max_iter`` further iterations and ``bound_`` keeps the full
|
|
64
|
+
history. ``n_components``, the vocabulary and the covariate
|
|
65
|
+
setup must stay the same; per-document state is reused only when
|
|
66
|
+
the corpus has the same number of documents.
|
|
67
|
+
random_state : int, RandomState or None, default=None
|
|
68
|
+
Only used for ``init="random"``; the spectral method is
|
|
69
|
+
deterministic.
|
|
70
|
+
verbose : int, default=0
|
|
71
|
+
If positive, print the bound after each EM iteration.
|
|
72
|
+
|
|
73
|
+
Attributes
|
|
74
|
+
----------
|
|
75
|
+
components_ : ndarray of shape (n_components, n_features)
|
|
76
|
+
Topic-word distributions (each row sums to 1). Note this differs
|
|
77
|
+
from sklearn's LDA, whose ``components_`` holds unnormalized
|
|
78
|
+
pseudo-counts. For content covariate models this is the
|
|
79
|
+
aspect-frequency-weighted average of ``aspect_components_``.
|
|
80
|
+
aspect_components_ : ndarray of shape (n_levels, n_components, n_features) or None
|
|
81
|
+
Per-content-level topic-word distributions (content models only).
|
|
82
|
+
kappa_ : dict or None
|
|
83
|
+
SAGE parameters of the content model: baseline log-probabilities
|
|
84
|
+
``m`` (n_features,) and sparse deviations ``params``.
|
|
85
|
+
content_levels_ : ndarray or None
|
|
86
|
+
Sorted unique content covariate levels seen at fit time.
|
|
87
|
+
gamma_ : ndarray of shape (1 + n_covariates, n_components - 1) or None
|
|
88
|
+
Prevalence regression coefficients (first row is the intercept).
|
|
89
|
+
None when no covariates were used.
|
|
90
|
+
mu_ : ndarray
|
|
91
|
+
Prior means of eta: shape (n_components - 1,) without covariates,
|
|
92
|
+
(n_samples, n_components - 1) with covariates.
|
|
93
|
+
sigma_ : ndarray of shape (n_components - 1, n_components - 1)
|
|
94
|
+
Topic covariance matrix.
|
|
95
|
+
theta_ : ndarray of shape (n_samples, n_components)
|
|
96
|
+
Topic proportions of the training documents.
|
|
97
|
+
eta_ : ndarray of shape (n_samples, n_components - 1)
|
|
98
|
+
Variational means of the logistic-normal parameters.
|
|
99
|
+
bound_ : list of float
|
|
100
|
+
Approximate evidence lower bound at each EM iteration.
|
|
101
|
+
n_iter_ : int
|
|
102
|
+
Number of EM iterations run.
|
|
103
|
+
converged_ : bool
|
|
104
|
+
Whether the bound converged before ``max_iter``.
|
|
105
|
+
|
|
106
|
+
References
|
|
107
|
+
----------
|
|
108
|
+
Roberts, M., Stewart, B., & Tingley, D. (2019). "stm: An R Package for
|
|
109
|
+
Structural Topic Models." Journal of Statistical Software 91(2).
|
|
110
|
+
"""
|
|
111
|
+
|
|
112
|
+
def __init__(self, n_components=10, *, init="spectral", max_iter=500,
|
|
113
|
+
tol=1e-5, sigma_prior=0.0, gamma_max_iter=1000,
|
|
114
|
+
e_step_max_iter=500, max_vocab=10000,
|
|
115
|
+
content_interactions=True, warm_start=False,
|
|
116
|
+
random_state=None, verbose=0):
|
|
117
|
+
self.n_components = n_components
|
|
118
|
+
self.init = init
|
|
119
|
+
self.max_iter = max_iter
|
|
120
|
+
self.tol = tol
|
|
121
|
+
self.sigma_prior = sigma_prior
|
|
122
|
+
self.gamma_max_iter = gamma_max_iter
|
|
123
|
+
self.e_step_max_iter = e_step_max_iter
|
|
124
|
+
self.max_vocab = max_vocab
|
|
125
|
+
self.content_interactions = content_interactions
|
|
126
|
+
self.warm_start = warm_start
|
|
127
|
+
self.random_state = random_state
|
|
128
|
+
self.verbose = verbose
|
|
129
|
+
|
|
130
|
+
# ------------------------------------------------------------------
|
|
131
|
+
# validation helpers
|
|
132
|
+
|
|
133
|
+
def _validate_inputs(self, X, prevalence, *, reset):
|
|
134
|
+
X = csr_matrix(X) if not issparse(X) else X.tocsr()
|
|
135
|
+
X = X.astype(np.float64)
|
|
136
|
+
if not reset and X.shape[1] != self.components_.shape[1]:
|
|
137
|
+
raise ValueError(
|
|
138
|
+
f"X has {X.shape[1]} features, but the model was fitted "
|
|
139
|
+
f"with {self.components_.shape[1]}."
|
|
140
|
+
)
|
|
141
|
+
doc_lens = np.asarray(X.sum(axis=1)).ravel()
|
|
142
|
+
if np.any(doc_lens == 0):
|
|
143
|
+
raise ValueError(
|
|
144
|
+
"X contains empty documents; remove them before fitting "
|
|
145
|
+
"(cf. prepDocuments in the R package)."
|
|
146
|
+
)
|
|
147
|
+
design = None
|
|
148
|
+
if prevalence is not None:
|
|
149
|
+
design = np.asarray(prevalence, dtype=np.float64)
|
|
150
|
+
if design.ndim == 1:
|
|
151
|
+
design = design[:, None]
|
|
152
|
+
if design.shape[0] != X.shape[0]:
|
|
153
|
+
raise ValueError(
|
|
154
|
+
"prevalence has a different number of rows than X."
|
|
155
|
+
)
|
|
156
|
+
if np.isnan(design).any():
|
|
157
|
+
raise ValueError("Missing values in prevalence covariates.")
|
|
158
|
+
# prepend an intercept unless one is already there
|
|
159
|
+
if not np.allclose(design[:, 0], 1.0):
|
|
160
|
+
design = np.column_stack([np.ones(design.shape[0]), design])
|
|
161
|
+
return X, design
|
|
162
|
+
|
|
163
|
+
def _encode_content(self, content, n_docs, *, reset):
|
|
164
|
+
"""Map content covariate labels to aspect indices (betaindex)."""
|
|
165
|
+
if content is None:
|
|
166
|
+
if not reset and getattr(self, "content_levels_", None) is not None:
|
|
167
|
+
raise ValueError(
|
|
168
|
+
"The model was fitted with a content covariate; pass "
|
|
169
|
+
"the matching content labels."
|
|
170
|
+
)
|
|
171
|
+
return None, np.zeros(n_docs, dtype=np.int64)
|
|
172
|
+
content = np.asarray(content).ravel()
|
|
173
|
+
if content.shape[0] != n_docs:
|
|
174
|
+
raise ValueError("content has a different number of rows than X.")
|
|
175
|
+
if reset:
|
|
176
|
+
levels = np.unique(content)
|
|
177
|
+
else:
|
|
178
|
+
if getattr(self, "content_levels_", None) is None:
|
|
179
|
+
raise ValueError(
|
|
180
|
+
"The model was fitted without a content covariate."
|
|
181
|
+
)
|
|
182
|
+
levels = self.content_levels_
|
|
183
|
+
unseen = ~np.isin(content, levels)
|
|
184
|
+
if unseen.any():
|
|
185
|
+
raise ValueError(
|
|
186
|
+
f"Unseen content levels: {np.unique(content[unseen])!r}"
|
|
187
|
+
)
|
|
188
|
+
index = np.searchsorted(levels, content)
|
|
189
|
+
return levels, index
|
|
190
|
+
|
|
191
|
+
# ------------------------------------------------------------------
|
|
192
|
+
# fitting
|
|
193
|
+
|
|
194
|
+
def fit(self, X, y=None, *, prevalence=None, content=None):
    """Fit the model to a document-term count matrix.

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
        Word counts per document.
    y : ignored
    prevalence : array-like of shape (n_samples, n_covariates), optional
        Prevalence covariate design matrix. An intercept column is
        added automatically. Categorical variables must be encoded
        numerically (e.g. one-hot) beforehand.
    content : array-like of shape (n_samples,), optional
        Content covariate: one categorical label per document. Each
        level gets its own topic-word distributions, estimated as
        sparse deviations from a shared baseline (SAGE-style, via
        distributed Poisson regression as in the R package's L1 mode).

    Returns
    -------
    self : StructuralTopicModel
        The fitted estimator.

    Raises
    ------
    ValueError
        If ``n_components``, ``sigma_prior`` or ``init`` are invalid, if
        the inputs fail validation, or if a warm start is attempted with
        a different ``n_components``.

    Notes
    -----
    With ``warm_start=True`` and a previous fit, EM continues from the
    stored parameters and ``bound_`` keeps accumulating.  The stored
    bound history takes part in the convergence test only when the
    corpus has the same number of documents as the previous one.
    """
    K = self.n_components
    if not (isinstance(K, (int, np.integer)) and K >= 2):
        raise ValueError("n_components must be an integer >= 2.")
    if not 0.0 <= self.sigma_prior <= 1.0:
        raise ValueError("sigma_prior must be between 0 and 1.")
    if self.init not in ("spectral", "random"):
        raise ValueError("init must be 'spectral' or 'random'.")

    warm = self.warm_start and hasattr(self, "components_")
    X, design = self._validate_inputs(X, prevalence, reset=not warm)
    docs = to_doc_list(X)
    N, V = X.shape

    if warm:
        # ---- continue from the previous solution (cf. the R
        # package's model= restart argument) ----
        if self.components_.shape[0] != K:
            raise ValueError(
                f"warm_start requires the same n_components as the "
                f"previous fit ({self.components_.shape[0]}), "
                f"got {K}."
            )
        levels, beta_index = self._encode_content(content, N,
                                                  reset=False)
        A = 1 if levels is None else len(levels)
        beta = [b.copy() for b in self._beta_list()]
        sigma = self.sigma_.copy()
        if (design is not None and self.gamma_ is not None
                and design.shape[1] == self.gamma_.shape[0]):
            gamma = self.gamma_.copy()
            mu = design @ gamma
        else:
            gamma = None
            mu = (self.mu_.copy() if self.mu_.ndim == 1
                  else np.zeros(K - 1))
        # per-document state is only reusable for the same corpus
        same_corpus_size = self.eta_.shape[0] == N
        lambda_ = (self.eta_.copy() if same_corpus_size
                   else np.zeros((N, K - 1)))
        kappa = self.kappa_
        bound_history = list(self.bound_)
        # A bound computed on a corpus of another size is not comparable
        # with the new one: a lower bound on the new corpus would look
        # like a negative relative change and stop EM after a single
        # iteration.  In that case only bounds from this call are compared.
        first_comparable = 0 if same_corpus_size else len(bound_history)
    else:
        levels, beta_index = self._encode_content(content, N,
                                                  reset=True)
        A = 1 if levels is None else len(levels)

        # ---- initialization (stm.init) ----
        if self.init == "spectral":
            init_beta = spectral_init(X, K, max_vocab=self.max_vocab)
        else:
            rng = check_random_state(self.random_state)
            b = rng.gamma(0.1, 1.0, size=(K, V))
            init_beta = b / b.sum(axis=1, keepdims=True)
        beta = [init_beta.copy() for _ in range(A)]
        mu = np.zeros(K - 1)
        sigma = np.diag(np.full(K - 1, 20.0))
        lambda_ = np.zeros((N, K - 1))
        gamma = None
        kappa = None
        bound_history = []
        first_comparable = 0

    wcounts = np.asarray(X.sum(axis=0)).ravel()

    # ---- EM loop (stm.control) ----
    converged = False
    for _ in range(self.max_iter):
        # like the R code, document-specific means are only available
        # once gamma has been estimated (i.e. from the second iteration)
        update_mu = gamma is not None
        sigma_ss, beta_ss, bound, lambda_ = estep(
            docs, beta_index, update_mu, beta, lambda_, mu, sigma,
            max_optim_iter=self.e_step_max_iter,
        )
        mu, gamma = opt_mu(lambda_, covar=design,
                           max_iter=self.gamma_max_iter)
        sigma = opt_sigma(sigma_ss, lambda_, mu, self.sigma_prior)
        if levels is None:
            beta = opt_beta(beta_ss)
        else:
            beta, kappa = mnreg(
                beta_ss, wcounts,
                interactions=self.content_interactions,
            )

        bound_history.append(float(bound.sum()))
        if self.verbose:
            print(f"Iteration {len(bound_history)}: "
                  f"bound = {bound_history[-1]:.2f}")
        if len(bound_history) - first_comparable > 1:
            old, new = bound_history[-2], bound_history[-1]
            if (new - old) / abs(old) < self.tol:
                converged = True
                if self.verbose:
                    print("Model converged.")
                break

    # ---- pack the results ----
    self.content_levels_ = levels
    if levels is None:
        self.components_ = beta[0]
        self.aspect_components_ = None
        self.kappa_ = None
    else:
        self.aspect_components_ = np.stack(beta)
        # corpus-level summary: aspect betas weighted by frequency
        weights = np.bincount(beta_index, minlength=A) / N
        self.components_ = np.tensordot(
            weights, self.aspect_components_, axes=1
        )
        self.kappa_ = kappa
    self.gamma_ = gamma
    self.mu_ = mu
    self.sigma_ = sigma
    self.eta_ = lambda_
    # the K-th logit is pinned at zero before the softmax
    full_eta = np.column_stack([lambda_, np.zeros(N)])
    self.theta_ = row_softmax(full_eta)
    self.bound_ = bound_history
    self.n_iter_ = len(bound_history)
    self.converged_ = converged
    self.n_features_in_ = V
    return self
|
|
332
|
+
|
|
333
|
+
def fit_transform(self, X, y=None, *, prevalence=None, content=None):
    """Fit on ``X`` and return ``theta_`` of the training documents.

    ``y`` is accepted for API compatibility and is not forwarded.
    """
    fitted = self.fit(X, prevalence=prevalence, content=content)
    return fitted.theta_
|
|
336
|
+
|
|
337
|
+
# ------------------------------------------------------------------
|
|
338
|
+
# inference on new documents
|
|
339
|
+
|
|
340
|
+
def _new_doc_priors(self, design):
|
|
341
|
+
"""Per-document prior means for held-out inference."""
|
|
342
|
+
if self.gamma_ is not None:
|
|
343
|
+
if design is None:
|
|
344
|
+
raise ValueError(
|
|
345
|
+
"The model was fitted with prevalence covariates; pass "
|
|
346
|
+
"the matching covariates."
|
|
347
|
+
)
|
|
348
|
+
if design.shape[1] != self.gamma_.shape[0]:
|
|
349
|
+
raise ValueError(
|
|
350
|
+
"prevalence has a different number of columns than "
|
|
351
|
+
"at fit time."
|
|
352
|
+
)
|
|
353
|
+
return design @ self.gamma_, True
|
|
354
|
+
mu = self.mu_ if self.mu_.ndim == 1 else self.mu_.mean(axis=0)
|
|
355
|
+
return mu, False
|
|
356
|
+
|
|
357
|
+
def _beta_list(self):
|
|
358
|
+
"""Topic-word distributions as a per-aspect list."""
|
|
359
|
+
if self.aspect_components_ is None:
|
|
360
|
+
return [self.components_]
|
|
361
|
+
return list(self.aspect_components_)
|
|
362
|
+
|
|
363
|
+
def transform(self, X, *, prevalence=None, content=None):
    """Infer topic proportions for (possibly new) documents.

    Runs one E-step with the fitted global parameters held fixed
    (cf. fitNewDocuments in the R package) and returns theta of shape
    (n_samples, n_components).

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
        Word counts per document.
    prevalence : array-like, optional
        Prevalence covariates; required when the model has ``gamma_``.
    content : array-like of shape (n_samples,), optional
        Content labels; required when the model was fitted with a
        content covariate.

    Returns
    -------
    theta : ndarray of shape (n_samples, n_topics)
    """
    check_is_fitted(self, "components_")
    X, design = self._validate_inputs(X, prevalence, reset=False)
    docs = to_doc_list(X)
    N = X.shape[0]
    # Size eta from the fitted model, not from the n_components
    # hyper-parameter, which may have been changed via set_params since fit.
    K = self.components_.shape[0]
    _, beta_index = self._encode_content(content, N, reset=False)
    beta = self._beta_list()
    mu, update_mu = self._new_doc_priors(design)

    siginv, sigmaentropy = decompose_sigma(self.sigma_)
    eta = np.zeros((N, K - 1))
    for i, (words, counts) in enumerate(docs):
        # restrict this document's aspect beta to the words it contains
        beta_d = np.ascontiguousarray(beta[beta_index[i]][:, words])
        mu_d = mu[i] if update_mu else mu
        _, eta[i] = optimize_document(
            eta[i], beta_d, counts, mu_d, siginv, sigmaentropy,
            max_optim_iter=self.e_step_max_iter,
        )
    # the K-th logit is pinned at zero before the softmax
    return row_softmax(np.column_stack([eta, np.zeros(N)]))
|
|
389
|
+
|
|
390
|
+
def score(self, X, y=None, *, prevalence=None, content=None):
    """Approximate evidence lower bound of ``X`` under the fitted model.

    Runs ``estep`` from a zero initialization of eta with the fitted
    global parameters and returns the sum of the bound array it reports
    as a float.  ``y`` is ignored; ``prevalence`` and ``content`` follow
    the same rules as in :meth:`transform`.
    """
    check_is_fitted(self, "components_")
    X, design = self._validate_inputs(X, prevalence, reset=False)
    docs = to_doc_list(X)
    N = X.shape[0]
    # Size eta from the fitted model, not from the n_components
    # hyper-parameter, which may have been changed via set_params since fit.
    K = self.components_.shape[0]
    _, beta_index = self._encode_content(content, N, reset=False)
    mu, update_mu = self._new_doc_priors(design)
    _, _, bound, _ = estep(
        docs, beta_index, update_mu,
        self._beta_list(), np.zeros((N, K - 1)), mu, self.sigma_,
        max_optim_iter=self.e_step_max_iter,
    )
    return float(bound.sum())
|
|
405
|
+
|
|
406
|
+
def perplexity(self, X, *, prevalence=None, content=None):
    """Per-token perplexity of ``X``, ``exp(-bound / n_tokens)``.

    The bound is the value returned by :meth:`score` (the
    logistic-normal ELBO), in the same spirit as
    :meth:`sklearn.decomposition.LatentDirichletAllocation.perplexity`.
    Values are therefore comparable between fits of this class on the
    same data; lower is better.
    """
    check_is_fitted(self, "components_")
    elbo = self.score(X, prevalence=prevalence, content=content)
    # re-validate only to obtain the count matrix for the token total
    counts, _ = self._validate_inputs(X, None, reset=False)
    return float(np.exp(-elbo / counts.sum()))
|
|
419
|
+
|
|
420
|
+
# ------------------------------------------------------------------
|
|
421
|
+
# interpretation helpers
|
|
422
|
+
|
|
423
|
+
def top_words(self, n_words=10, *, kind="prob", frex_weight=0.5):
    """Indices of the highest-scoring words of each topic (cf. labelTopics).

    With ``kind="prob"`` words are ordered by their within-topic
    log-probability.  With ``kind="frex"`` the score is the weighted
    harmonic mean of two per-topic rank scores, frequency and
    exclusivity, where ``frex_weight`` is the weight on the frequency
    term.  Returns an array of shape (n_components, n_words).

    Raises ValueError for any other ``kind``.
    """
    check_is_fitted(self, "components_")
    log_prob = safelog(self.components_)

    if kind == "prob":
        return np.argsort(-log_prob, axis=1)[:, :n_words]
    if kind != "frex":
        raise ValueError("kind must be 'prob' or 'frex'.")

    from scipy.special import logsumexp

    n_terms = log_prob.shape[1]
    # exclusivity: a word's log share of its probability mass across topics
    exclusivity = log_prob - logsumexp(log_prob, axis=0, keepdims=True)
    freq_score = np.apply_along_axis(rankdata, 1, log_prob) / n_terms
    excl_score = np.apply_along_axis(rankdata, 1, exclusivity) / n_terms
    frex = 1.0 / (frex_weight / freq_score + (1 - frex_weight) / excl_score)
    return np.argsort(-frex, axis=1)[:, :n_words]
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: structural-topic-model
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python implementation of the Structural Topic Model (STM), a port of the R stm package with a scikit-learn style API
|
|
5
|
+
Project-URL: Homepage, https://github.com/hirata-keisuke/pystm
|
|
6
|
+
Project-URL: Repository, https://github.com/hirata-keisuke/pystm
|
|
7
|
+
Project-URL: Issues, https://github.com/hirata-keisuke/pystm/issues
|
|
8
|
+
Author-email: hirata-keisuke <plainpeace39th@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: NLP,STM,structural topic model,text mining,topic model
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: numpy>=2.0
|
|
21
|
+
Requires-Dist: scikit-learn>=1.9.0
|
|
22
|
+
Requires-Dist: scipy>=1.14
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# pystm — Structural Topic Model in Python
|
|
26
|
+
|
|
27
|
+
R の [stm](https://github.com/bstewart/stm) パッケージ(Roberts, Stewart & Tingley)のコア推定アルゴリズムを Python に移植したものです。API は scikit-learn の `LatentDirichletAllocation` に倣っています。
|
|
28
|
+
|
|
29
|
+
## STM とは
|
|
30
|
+
|
|
31
|
+
STM はロジスティック正規トピックモデルで、文書のメタデータ(prevalence 共変量)が各文書のトピック比率の事前平均をシフトさせます。共変量なしの場合は Correlated Topic Model (CTM) に帰着します。推定は semi-collapsed 変分 EM で行います(R 版 `stm()` と同一のアルゴリズム)。
|
|
32
|
+
|
|
33
|
+
## 使い方
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
import numpy as np
|
|
37
|
+
from pystm import StructuralTopicModel
|
|
38
|
+
|
|
39
|
+
# X: (n_docs, n_vocab) の単語カウント行列(dense / scipy.sparse どちらも可)
|
|
40
|
+
# covar: (n_docs, n_covariates) の prevalence 共変量(切片は自動付与)
|
|
41
|
+
|
|
42
|
+
model = StructuralTopicModel(n_components=10, init="spectral")
|
|
43
|
+
model.fit(X, prevalence=covar)
|
|
44
|
+
|
|
45
|
+
model.theta_ # 学習文書のトピック比率 (n_docs, K)
|
|
46
|
+
model.components_ # トピック-単語分布 (K, V)。各行の和は1
|
|
47
|
+
model.gamma_ # prevalence 回帰係数 (1+P, K-1)。先頭行が切片
|
|
48
|
+
model.sigma_ # トピック共分散行列 (K-1, K-1)
|
|
49
|
+
|
|
50
|
+
# 新規文書の推論(fitNewDocuments 相当)
|
|
51
|
+
theta_new = model.transform(X_new, prevalence=covar_new)
|
|
52
|
+
|
|
53
|
+
# トピックの代表語(labelTopics 相当)
|
|
54
|
+
model.top_words(n_words=10) # 確率順
|
|
55
|
+
model.top_words(n_words=10, kind="frex") # FREX(頻度と排他性のバランス)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
共変量を渡さなければ CTM として推定されます:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
model = StructuralTopicModel(n_components=10).fit(X)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### content 共変量(SAGE / Distributed Multinomial Regression)
|
|
65
|
+
|
|
66
|
+
文書のカテゴリによってトピック内の語彙の使い方が変わるモデルです。各文書に1つのカテゴリラベルを渡します:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
model = StructuralTopicModel(n_components=10)
|
|
70
|
+
model.fit(X, prevalence=covar, content=party_labels) # 例: 政党ラベル
|
|
71
|
+
|
|
72
|
+
model.aspect_components_ # カテゴリ別トピック-語彙分布 (n_levels, K, V)
|
|
73
|
+
model.kappa_["params"] # ベースラインからのスパースな偏差(lasso 推定)
|
|
74
|
+
model.content_levels_ # カテゴリ水準
|
|
75
|
+
# transform / score にも同じ content を渡す
|
|
76
|
+
model.transform(X_new, prevalence=c_new, content=labels_new)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
R 版の `kappa.prior="L1"`(既定)に相当する Distributed Poisson 回帰で推定します。glmnet の代わりに、設計行列のインジケータ構造を利用して語彙方向に完全ベクトル化した IRLS+座標降下の Poisson lasso を実装しています(正則化パスと情報量規準による選択も R と同じ)。
|
|
80
|
+
|
|
81
|
+
### estimateEffect 相当: 共変量効果の推定
|
|
82
|
+
|
|
83
|
+
トピック比率を目的変数とする回帰を method of composition(変分事後分布からの θ サンプリング)で行い、測定不確実性込みの係数を返します:
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from pystm import estimate_effect
|
|
87
|
+
|
|
88
|
+
eff = estimate_effect(model, covar, uncertainty="Global", nsims=25)
|
|
89
|
+
tables = eff.summary() # {topic: 構造化配列(estimate/std_error/t_value/p_value)}
|
|
90
|
+
tables[0]["estimate"] # トピック0の回帰係数(先頭が切片)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
`uncertainty="Global"`(推奨・既定)と `"None"` をサポートします(R の `"Local"` は未実装)。
|
|
94
|
+
|
|
95
|
+
### searchK 相当: トピック数の選択
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from pystm import search_k
|
|
99
|
+
|
|
100
|
+
res = search_k(X, K_values=[5, 10, 15], prevalence=covar,
|
|
101
|
+
model_params={"max_iter": 100})
|
|
102
|
+
res["heldout"] # document completion による heldout 対数尤度
|
|
103
|
+
res["residual"] # Taddy (2012) の残差分散(1 に近いほど良い)
|
|
104
|
+
res["semcoh"] # 意味的一貫性 / res["exclus"]: 排他性
|
|
105
|
+
res["bound"], res["lbound"], res["em_its"]
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### その他の診断
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from pystm import topic_corr, semantic_coherence, exclusivity, check_residuals
|
|
112
|
+
|
|
113
|
+
tc = topic_corr(model, cutoff=0.01) # トピック相関グラフ(simple 法)
|
|
114
|
+
tc.posadj # 正相関の隣接行列
|
|
115
|
+
semantic_coherence(model, X, M=10) # トピックごとの意味的一貫性
|
|
116
|
+
exclusivity(model, M=10) # トピックごとの排他性(content モデル不可)
|
|
117
|
+
check_residuals(model, X) # 残差分散検定 {dispersion, pvalue, df}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## R 版との対応
|
|
121
|
+
|
|
122
|
+
| R | Python |
|
|
123
|
+
|---|---|
|
|
124
|
+
| `stm(docs, vocab, K, prevalence=~x, data=meta)` | `StructuralTopicModel(n_components=K).fit(X, prevalence=design)` |
|
|
125
|
+
| `init.type="Spectral"` (推奨・既定) | `init="spectral"` (既定) |
|
|
126
|
+
| `init.type="Random"` | `init="random"` |
|
|
127
|
+
| `gamma.prior="Pooled"` (既定) | 実装済み(共変量ありのとき自動) |
|
|
128
|
+
| `sigma.prior` | `sigma_prior` |
|
|
129
|
+
| `emtol` / `max.em.its` | `tol` / `max_iter` |
|
|
130
|
+
| `model=`(フィット済みモデルから再開) | `warm_start=True`(fit を繰り返し呼ぶ。`bound_` に履歴が蓄積) |
|
|
131
|
+
| `content=~group`(`kappa.prior="L1"`, 既定) | `fit(X, content=labels)` |
|
|
132
|
+
| `interactions` | `content_interactions` |
|
|
133
|
+
| `fitNewDocuments()` | `transform()` |
|
|
134
|
+
| `labelTopics()` | `top_words()` |
|
|
135
|
+
| `estimateEffect()` / `summary()` | `estimate_effect()` / `.summary()` |
|
|
136
|
+
| `searchK()` | `search_k()` |
|
|
137
|
+
| `make.heldout()` / `eval.heldout()` | `make_heldout()` / `eval_heldout()` |
|
|
138
|
+
| `topicCorr(method="simple")` | `topic_corr()` |
|
|
139
|
+
| `semanticCoherence()` / `exclusivity()` / `checkResiduals()` | `semantic_coherence()` / `exclusivity()` / `check_residuals()` |
|
|
140
|
+
| `$theta` / `$beta` / `$sigma` / `$mu$gamma` | `theta_` / `components_` / `sigma_` / `gamma_` |
|
|
141
|
+
| `$beta$logbeta`(content モデル) | `aspect_components_`(確率スケール) |
|
|
142
|
+
| `$beta$kappa` | `kappa_` |
|
|
143
|
+
|
|
144
|
+
### scikit-learn LDA との API 差分
|
|
145
|
+
|
|
146
|
+
- `perplexity(X)` を sklearn LDA と同様に提供(変分下界ベースの `exp(-bound/総トークン数)`。低いほど良い)。
|
|
147
|
+
- `warm_start=True` で sklearn 流の継続学習(R 版 `model=` 相当)。
|
|
148
|
+
- `components_` は正規化済みの確率分布(sklearn LDA は擬似カウント)。
|
|
149
|
+
- 共変量は `fit(X, prevalence=...)` / `transform(X, prevalence=...)` のキーワードで渡す。R の formula は使えないので、カテゴリ変数は事前に one-hot 等で数値化してください(`patsy` や `pandas.get_dummies` が便利)。
|
|
150
|
+
|
|
151
|
+
### 未実装
|
|
152
|
+
|
|
153
|
+
- `gamma.prior="L1"`(prevalence 側の glmnet 依存モード)
|
|
154
|
+
- `kappa.prior="Jeffreys"`(content の旧推定法。R 版でも後方互換のためだけに残されている)
|
|
155
|
+
- `fixedintercept=FALSE`(content モデルの切片推定)
|
|
156
|
+
- LDA(collapsed Gibbs)初期化、`ngroups` メモ化推論、`K=0`(Lee & Mimno)
|
|
157
|
+
- `estimateEffect()` の `uncertainty="Local"`、formula インターフェース(スプライン `s()` 等は事前に基底展開した行列を渡せば等価)
|
|
158
|
+
- `topicCorr(method="huge")`(huge パッケージ依存)、`selectModel()`、`permutationTest()`、プロット関数群
|
|
159
|
+
|
|
160
|
+
また、spectral 初期化の RecoverL2 は R 版既定の quadprog の代わりにペナルティ付き NNLS による厳密に近い解法を使います(指数勾配法 `recoverEG=TRUE` 相当も `pystm._spectral.recover_l2(solver="expgrad")` として利用可能)。
|
|
161
|
+
|
|
162
|
+
## 実装メモ(R 版からの移植で見つかった重要な点)
|
|
163
|
+
|
|
164
|
+
1. **`update.mu` の切り替えタイミング**: R 版(`stm.control.R`)では E-step に渡す事前平均の選択を `update.mu = !is.null(mu$gamma)` で判定している。つまり**初回 E-step は共有平均(ゼロベクトル)を使い、γ が推定された 2 回目以降に文書別の事前平均 Xγ に切り替わる**。「prevalence 共変量があるか」で判定すると、初回 E-step で形状不一致または誤った事前を使うバグになる(本実装も最初これを踏んだ)。
|
|
165
|
+
|
|
166
|
+
2. **RecoverL2 のソルバー選択**: R 版の既定は quadprog による厳密な単体制約付き QP(`recoverEG=FALSE`)。論文由来の指数勾配法(`recoverEG=TRUE`)は反復上限 500 では、**1つのトピックが支配的なコーパス(文書内でトピックが強く混ざる場合)に収束不足**となり、初期化品質が大きく劣化した(K=10 の合成データで cos 類似度 0.45 前後 vs NNLS で 0.97)。反復を 20,000 まで増やしても改善しなかったため、最適化の遅さではなく平坦な目的関数で実質停止していた。本実装はペナルティ付き NNLS(和=1 制約を重み付き行で課す)を既定とした。
|
|
167
|
+
|
|
168
|
+
3. **Random 初期化は局所解に落ちやすい**(R 版ドキュメントの記述どおり、seed によりトピック復元が大きく変わる)。動作確認・検証には決定的な Spectral 初期化を使うこと。
|
|
169
|
+
|
|
170
|
+
4. **gram 行列の検証方法**: スペクトル初期化の正しさは、合成データで経験 gram 行列が理論期待値 `β' E[θθ'] β`(行正規化後)と一致するかで切り分けられる(本実装では最大誤差 ~1e-3 で一致)。初期化品質が悪いときは実装バグではなく、コーパス側の共起信号の弱さ(θ の混合度)が原因のことがある。
|
|
171
|
+
|
|
172
|
+
5. **mnreg(content 共変量)の高速化**: Distributed Poisson 回帰の設計行列は「トピック主効果 / アスペクト主効果 / 交互作用」の3グループのインジケータ列で、**各グループ内の列は互いに素な行しか触らない**。そのためグループ単位の座標降下が1回の行列演算になり、さらに全語彙が同一の設計行列を共有するので V 方向にも完全ベクトル化できる。汎用の座標降下実装と比べ同一解で大幅に高速(さらに IRLS 上限 4 / スイープ上限 8 / tol 1e-4 に絞っても β の最大差は ~2e-5)。
|
|
173
|
+
|
|
174
|
+
6. **同梱 gadarianFit の前処理は現行 textProcessor と異なる**: パッケージ同梱の `gadarianFit`(2017年)の語彙は、現行 `textProcessor.R` の処理順(句読点除去→ストップワード除去、ダッシュ保存)では再現できない。旧版の処理順(**ストップワード除去が句読点除去より先**=アポストロフィ付きの "can't" 等がストップワードとして除去される、かつ**ダッシュ非保存**= "tax-payers"→"taxpayers")+ `lower.thresh=3` で215語が完全一致する([scripts/gadarian_prep.py](scripts/gadarian_prep.py) の `legacy_order=True`)。R 版の再現実験をする際はパッケージバージョンごとの前処理差に注意。
|
|
175
|
+
|
|
176
|
+
7. **E-step の数値ガードが実データでは必須**: R/C++ 原実装どおりの素朴な `exp(eta)` 計算は、実コーパス(短文・偏った β・大きめ K)で BFGS の直線探索が極端な点を踏んだときに inf/NaN を発生させ、Hessian の cholesky が落ちる。η のクリップ(±200)、log と除算の下限(1e-300)、非有限解のフォールバックを `_estep.py` に追加した(通常領域の値は不変、合成データ・gadarian 検証とも退行なし)。
|
|
177
|
+
|
|
178
|
+
8. **heldout 構築時の語彙消失**: トークンを訓練側から取り除くと、コーパス全体から消える語が生じうる。R 版 `make.heldout` は語彙を再番号付けして missing 側からも削除している。これを怠ると、その語の β が 0 になり heldout 対数尤度が -inf になる。本実装も missing 側から該当トークンを除外している。
|
|
179
|
+
|
|
180
|
+
## R 版との検証(gadarianFit)
|
|
181
|
+
|
|
182
|
+
R パッケージ同梱の `gadarianFit`(Roberts et al. 2014 AJPS の Gadarian & Albertson 移民調査データ、K=3、prevalence = treatment*pid_rep、N=341)を参照解として、本実装を数値レベルで検証済み。再現方法:
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
uv run python scripts/validate_gadarian.py # 11/11 チェック合格
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
| 検証項目 | 結果 |
|
|
189
|
+
|---|---|
|
|
190
|
+
| コーパス再現(textProcessor + prepDocuments の移植) | 語彙215語・単語カウントともR版と**完全一致** |
|
|
191
|
+
| R版パラメータでの E-step bound | pystm -13575.82 vs R -13575.91(R の1反復増分 0.103 の範囲内で一致) |
|
|
192
|
+
| R版パラメータでの文書別 θ | 相関 > 0.9999、最大差 0.02 |
|
|
193
|
+
| R版の解からの EM 継続(不動点チェック) | bound 単調増加・増分は収束閾値レベル(10反復で +1.86) |
|
|
194
|
+
| 独立フィット(Spectral 初期化)の bound | R比 -0.18%(R は確率的 LDA 初期化なので局所解の違いは想定内) |
|
|
195
|
+
| トピックの対応 | 3トピックとも cos 類似度 0.88 前後、上位語ほぼ一致(worri/immigr/border、job/tax/pay、peopl/countri/come) |
|
|
196
|
+
| treatment 効果 | 全トピックで符号一致。有意な正の効果は +0.215 vs R +0.219 とほぼ同値 |
|
|
197
|
+
|
|
198
|
+
注: R 版のフィット自体は LDA Gibbs 初期化(R の乱数)に依存するため完全一致は原理的に不可能。代わりに「R の解が本実装の EM の不動点になっているか」「bound 計算が R の報告値と一致するか」で実装の同一性を確認している。
|
|
199
|
+
|
|
200
|
+
## 開発
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
uv sync
|
|
204
|
+
uv run pytest tests/
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## 他プロジェクトからの利用(配布)
|
|
208
|
+
|
|
209
|
+
配布名は `structural-topic-model`、import 名は `pystm`。実行時依存は numpy / scipy / scikit-learn のみ
|
|
210
|
+
(janome / dash 等は application 用の dev 依存で、配布物には含まれない)。
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
# PyPI からインストール
|
|
214
|
+
pip install structural-topic-model
|
|
215
|
+
# または
|
|
216
|
+
uv add structural-topic-model
|
|
217
|
+
|
|
218
|
+
# ローカルパス参照(開発中)
|
|
219
|
+
uv add --editable /path/to/202606_StructuralTopicModel
|
|
220
|
+
|
|
221
|
+
# Git 経由
|
|
222
|
+
uv add git+<リポジトリURL>
|
|
223
|
+
|
|
224
|
+
# wheel をビルド
|
|
225
|
+
uv build # dist/structural_topic_model-x.y.z-py3-none-any.whl
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
配布名は `structural-topic-model`、import 名は `pystm` のまま維持しています
|
|
229
|
+
(PyPI の `pystm` は別の実装に取られているため)。
|
|
230
|
+
|
|
231
|
+
## 参考文献
|
|
232
|
+
|
|
233
|
+
- Roberts, M., Stewart, B., & Tingley, D. (2019). stm: An R Package for Structural Topic Models. *Journal of Statistical Software*, 91(2).
|
|
234
|
+
- Arora, S. et al. (2013). A Practical Algorithm for Topic Modeling with Provable Guarantees. *ICML*.
|