structural-topic-model 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pystm/stm.py ADDED
@@ -0,0 +1,443 @@
1
+ """Structural Topic Model with a scikit-learn style API.
2
+
3
+ Port of the R ``stm`` package's core estimation routine (variational EM
4
+ for the logistic-normal topic model with prevalence covariates).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ from scipy.sparse import csr_matrix, issparse
11
+ from scipy.stats import rankdata
12
+ from sklearn.base import BaseEstimator, TransformerMixin
13
+ from sklearn.utils import check_random_state
14
+ from sklearn.utils.validation import check_is_fitted
15
+
16
+ from ._estep import decompose_sigma, estep, optimize_document
17
+ from ._mnreg import mnreg
18
+ from ._mstep import opt_beta, opt_mu, opt_sigma
19
+ from ._spectral import spectral_init
20
+ from ._utils import row_softmax, safelog, to_doc_list
21
+
22
+
23
+ class StructuralTopicModel(BaseEstimator, TransformerMixin):
24
+ """Structural Topic Model (STM) with variational EM.
25
+
26
+ STM is a logistic-normal topic model in which document metadata
27
+ ("prevalence covariates") shifts the prior mean of each document's
28
+ topic proportions. Without covariates the model reduces to the
29
+ Correlated Topic Model. The API follows
30
+ :class:`sklearn.decomposition.LatentDirichletAllocation`; covariates
31
+ are passed to :meth:`fit` / :meth:`transform` via the ``prevalence``
32
+ keyword.
33
+
34
+ Parameters
35
+ ----------
36
+ n_components : int, default=10
37
+ Number of topics K (must be >= 2).
38
+ init : {"spectral", "random"}, default="spectral"
39
+ Initialization method. "spectral" is the deterministic anchor-word
40
+ algorithm of Arora et al. (2013), recommended by the stm authors.
41
+ max_iter : int, default=500
42
+ Maximum number of EM iterations.
43
+ tol : float, default=1e-5
44
+ EM stops when the relative change of the approximate evidence
45
+ lower bound falls below ``tol``.
46
+ sigma_prior : float, default=0.0
47
+ Weight in [0, 1] of regularization of the topic covariance matrix
48
+ towards its diagonal.
49
+ gamma_max_iter : int, default=1000
50
+ Iteration limit for the variational prevalence regression.
51
+ e_step_max_iter : int, default=500
52
+ BFGS iteration limit for each document's optimization.
53
+ max_vocab : int or None, default=10000
54
+ With spectral initialization, only this many most frequent terms
55
+ are used to build the gram matrix (None disables the cap).
56
+ content_interactions : bool, default=True
57
+ For content covariate models, whether to include topic-by-level
58
+ interaction deviations in addition to the main effects.
59
+ warm_start : bool, default=False
60
+ When True, repeated calls to :meth:`fit` continue EM from the
61
+ previous solution instead of re-initializing (the analogue of the
62
+ R package's ``model=`` restart argument). Each call runs up to
63
+ ``max_iter`` further iterations and ``bound_`` keeps the full
64
+ history. ``n_components``, the vocabulary and the covariate
65
+ setup must stay the same; per-document state is reused only when
66
+ the corpus has the same number of documents.
67
+ random_state : int, RandomState or None, default=None
68
+ Only used for ``init="random"``; the spectral method is
69
+ deterministic.
70
+ verbose : int, default=0
71
+ If positive, print the bound after each EM iteration.
72
+
73
+ Attributes
74
+ ----------
75
+ components_ : ndarray of shape (n_components, n_features)
76
+ Topic-word distributions (each row sums to 1). Note this differs
77
+ from sklearn's LDA, whose ``components_`` holds unnormalized
78
+ pseudo-counts. For content covariate models this is the
79
+ aspect-frequency-weighted average of ``aspect_components_``.
80
+ aspect_components_ : ndarray of shape (n_levels, n_components, n_features) or None
81
+ Per-content-level topic-word distributions (content models only).
82
+ kappa_ : dict or None
83
+ SAGE parameters of the content model: baseline log-probabilities
84
+ ``m`` (n_features,) and sparse deviations ``params``.
85
+ content_levels_ : ndarray or None
86
+ Sorted unique content covariate levels seen at fit time.
87
+ gamma_ : ndarray of shape (1 + n_covariates, n_components - 1) or None
88
+ Prevalence regression coefficients (first row is the intercept).
89
+ None when no covariates were used.
90
+ mu_ : ndarray
91
+ Prior means of eta: shape (n_components - 1,) without covariates,
92
+ (n_samples, n_components - 1) with covariates.
93
+ sigma_ : ndarray of shape (n_components - 1, n_components - 1)
94
+ Topic covariance matrix.
95
+ theta_ : ndarray of shape (n_samples, n_components)
96
+ Topic proportions of the training documents.
97
+ eta_ : ndarray of shape (n_samples, n_components - 1)
98
+ Variational means of the logistic-normal parameters.
99
+ bound_ : list of float
100
+ Approximate evidence lower bound at each EM iteration.
101
+ n_iter_ : int
102
+ Number of EM iterations run.
103
+ converged_ : bool
104
+ Whether the bound converged before ``max_iter``.
105
+
106
+ References
107
+ ----------
108
+ Roberts, M., Stewart, B., & Tingley, D. (2019). "stm: An R Package for
109
+ Structural Topic Models." Journal of Statistical Software 91(2).
110
+ """
111
+
112
+ def __init__(self, n_components=10, *, init="spectral", max_iter=500,
113
+ tol=1e-5, sigma_prior=0.0, gamma_max_iter=1000,
114
+ e_step_max_iter=500, max_vocab=10000,
115
+ content_interactions=True, warm_start=False,
116
+ random_state=None, verbose=0):
117
+ self.n_components = n_components
118
+ self.init = init
119
+ self.max_iter = max_iter
120
+ self.tol = tol
121
+ self.sigma_prior = sigma_prior
122
+ self.gamma_max_iter = gamma_max_iter
123
+ self.e_step_max_iter = e_step_max_iter
124
+ self.max_vocab = max_vocab
125
+ self.content_interactions = content_interactions
126
+ self.warm_start = warm_start
127
+ self.random_state = random_state
128
+ self.verbose = verbose
129
+
130
+ # ------------------------------------------------------------------
131
+ # validation helpers
132
+
133
+ def _validate_inputs(self, X, prevalence, *, reset):
134
+ X = csr_matrix(X) if not issparse(X) else X.tocsr()
135
+ X = X.astype(np.float64)
136
+ if not reset and X.shape[1] != self.components_.shape[1]:
137
+ raise ValueError(
138
+ f"X has {X.shape[1]} features, but the model was fitted "
139
+ f"with {self.components_.shape[1]}."
140
+ )
141
+ doc_lens = np.asarray(X.sum(axis=1)).ravel()
142
+ if np.any(doc_lens == 0):
143
+ raise ValueError(
144
+ "X contains empty documents; remove them before fitting "
145
+ "(cf. prepDocuments in the R package)."
146
+ )
147
+ design = None
148
+ if prevalence is not None:
149
+ design = np.asarray(prevalence, dtype=np.float64)
150
+ if design.ndim == 1:
151
+ design = design[:, None]
152
+ if design.shape[0] != X.shape[0]:
153
+ raise ValueError(
154
+ "prevalence has a different number of rows than X."
155
+ )
156
+ if np.isnan(design).any():
157
+ raise ValueError("Missing values in prevalence covariates.")
158
+ # prepend an intercept unless one is already there
159
+ if not np.allclose(design[:, 0], 1.0):
160
+ design = np.column_stack([np.ones(design.shape[0]), design])
161
+ return X, design
162
+
163
+ def _encode_content(self, content, n_docs, *, reset):
164
+ """Map content covariate labels to aspect indices (betaindex)."""
165
+ if content is None:
166
+ if not reset and getattr(self, "content_levels_", None) is not None:
167
+ raise ValueError(
168
+ "The model was fitted with a content covariate; pass "
169
+ "the matching content labels."
170
+ )
171
+ return None, np.zeros(n_docs, dtype=np.int64)
172
+ content = np.asarray(content).ravel()
173
+ if content.shape[0] != n_docs:
174
+ raise ValueError("content has a different number of rows than X.")
175
+ if reset:
176
+ levels = np.unique(content)
177
+ else:
178
+ if getattr(self, "content_levels_", None) is None:
179
+ raise ValueError(
180
+ "The model was fitted without a content covariate."
181
+ )
182
+ levels = self.content_levels_
183
+ unseen = ~np.isin(content, levels)
184
+ if unseen.any():
185
+ raise ValueError(
186
+ f"Unseen content levels: {np.unique(content[unseen])!r}"
187
+ )
188
+ index = np.searchsorted(levels, content)
189
+ return levels, index
190
+
191
+ # ------------------------------------------------------------------
192
+ # fitting
193
+
194
def fit(self, X, y=None, *, prevalence=None, content=None):
    """Fit the model to a document-term count matrix.

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
        Word counts per document.
    y : ignored
    prevalence : array-like of shape (n_samples, n_covariates), optional
        Prevalence covariate design matrix. An intercept column is
        added automatically. Categorical variables must be encoded
        numerically (e.g. one-hot) beforehand.
    content : array-like of shape (n_samples,), optional
        Content covariate: one categorical label per document. Each
        level gets its own topic-word distributions, estimated as
        sparse deviations from a shared baseline (SAGE-style, via
        distributed Poisson regression as in the R package's L1 mode).

    Returns
    -------
    self
        The fitted estimator.

    Raises
    ------
    ValueError
        If ``n_components``, ``sigma_prior`` or ``init`` is invalid, if
        the inputs fail validation, or if a warm start is attempted
        with a different ``n_components`` than the previous fit.
    """
    K = self.n_components
    if not (isinstance(K, (int, np.integer)) and K >= 2):
        raise ValueError("n_components must be an integer >= 2.")
    if not 0.0 <= self.sigma_prior <= 1.0:
        raise ValueError("sigma_prior must be between 0 and 1.")
    if self.init not in ("spectral", "random"):
        raise ValueError("init must be 'spectral' or 'random'.")

    # a warm start is only possible once an earlier fit stored components_
    warm = self.warm_start and hasattr(self, "components_")
    X, design = self._validate_inputs(X, prevalence, reset=not warm)
    docs = to_doc_list(X)
    N, V = X.shape

    if warm:
        # ---- continue from the previous solution (cf. the R
        # package's model= restart argument) ----
        if self.components_.shape[0] != K:
            raise ValueError(
                f"warm_start requires the same n_components as the "
                f"previous fit ({self.components_.shape[0]}), "
                f"got {K}."
            )
        levels, beta_index = self._encode_content(content, N,
                                                  reset=False)
        A = 1 if levels is None else len(levels)
        beta = [b.copy() for b in self._beta_list()]
        sigma = self.sigma_.copy()
        # reuse the regression coefficients only when the new design
        # has the column count they were estimated with
        if (design is not None and self.gamma_ is not None
                and design.shape[1] == self.gamma_.shape[0]):
            gamma = self.gamma_.copy()
            mu = design @ gamma
        else:
            gamma = None
            # a stored per-document (2-D) mean cannot be reused here,
            # so fall back to a shared zero mean
            mu = (self.mu_.copy() if self.mu_.ndim == 1
                  else np.zeros(K - 1))
        # per-document state is only reusable for the same corpus
        lambda_ = (self.eta_.copy() if self.eta_.shape[0] == N
                   else np.zeros((N, K - 1)))
        kappa = self.kappa_
        # keep appending to the bound history of the earlier fit(s)
        bound_history = list(self.bound_)
    else:
        levels, beta_index = self._encode_content(content, N,
                                                  reset=True)
        A = 1 if levels is None else len(levels)

        # ---- initialization (stm.init) ----
        if self.init == "spectral":
            init_beta = spectral_init(X, K, max_vocab=self.max_vocab)
        else:
            rng = check_random_state(self.random_state)
            # Gamma(0.1, 1) draws, normalized so each row sums to one
            b = rng.gamma(0.1, 1.0, size=(K, V))
            init_beta = b / b.sum(axis=1, keepdims=True)
        # every content level starts from the same topic-word matrix
        beta = [init_beta.copy() for _ in range(A)]
        mu = np.zeros(K - 1)
        # diagonal starting covariance with 20 on the diagonal
        sigma = np.diag(np.full(K - 1, 20.0))
        lambda_ = np.zeros((N, K - 1))
        gamma = None
        kappa = None
        bound_history = []

    # column sums of X: total count of each vocabulary term
    wcounts = np.asarray(X.sum(axis=0)).ravel()

    # ---- EM loop (stm.control) ----
    converged = False
    for _ in range(self.max_iter):
        # like the R code, document-specific means are only available
        # once gamma has been estimated (i.e. from the second iteration)
        update_mu = gamma is not None
        sigma_ss, beta_ss, bound, lambda_ = estep(
            docs, beta_index, update_mu, beta, lambda_, mu, sigma,
            max_optim_iter=self.e_step_max_iter,
        )
        # M-step: prior mean / regression, covariance, topic-word
        # distributions, in this order
        mu, gamma = opt_mu(lambda_, covar=design,
                           max_iter=self.gamma_max_iter)
        sigma = opt_sigma(sigma_ss, lambda_, mu, self.sigma_prior)
        if levels is None:
            beta = opt_beta(beta_ss)
        else:
            beta, kappa = mnreg(
                beta_ss, wcounts,
                interactions=self.content_interactions,
            )

        bound_history.append(float(bound.sum()))
        if self.verbose:
            print(f"Iteration {len(bound_history)}: "
                  f"bound = {bound_history[-1]:.2f}")
        if len(bound_history) > 1:
            old, new = bound_history[-2], bound_history[-1]
            # signed relative change: a decrease of the bound also
            # satisfies this test and stops the loop
            if (new - old) / abs(old) < self.tol:
                converged = True
                if self.verbose:
                    print("Model converged.")
                break

    # ---- pack the results ----
    self.content_levels_ = levels
    if levels is None:
        self.components_ = beta[0]
        self.aspect_components_ = None
        self.kappa_ = None
    else:
        self.aspect_components_ = np.stack(beta)
        # corpus-level summary: aspect betas weighted by frequency
        weights = np.bincount(beta_index, minlength=A) / N
        self.components_ = np.tensordot(
            weights, self.aspect_components_, axes=1
        )
        self.kappa_ = kappa
    self.gamma_ = gamma
    self.mu_ = mu
    self.sigma_ = sigma
    self.eta_ = lambda_
    # append a zero column for the K-th topic before the row softmax
    full_eta = np.column_stack([lambda_, np.zeros(N)])
    self.theta_ = row_softmax(full_eta)
    self.bound_ = bound_history
    self.n_iter_ = len(bound_history)
    self.converged_ = converged
    self.n_features_in_ = V
    return self
332
+
333
+ def fit_transform(self, X, y=None, *, prevalence=None, content=None):
334
+ """Fit the model and return the training documents' theta."""
335
+ return self.fit(X, prevalence=prevalence, content=content).theta_
336
+
337
+ # ------------------------------------------------------------------
338
+ # inference on new documents
339
+
340
+ def _new_doc_priors(self, design):
341
+ """Per-document prior means for held-out inference."""
342
+ if self.gamma_ is not None:
343
+ if design is None:
344
+ raise ValueError(
345
+ "The model was fitted with prevalence covariates; pass "
346
+ "the matching covariates."
347
+ )
348
+ if design.shape[1] != self.gamma_.shape[0]:
349
+ raise ValueError(
350
+ "prevalence has a different number of columns than "
351
+ "at fit time."
352
+ )
353
+ return design @ self.gamma_, True
354
+ mu = self.mu_ if self.mu_.ndim == 1 else self.mu_.mean(axis=0)
355
+ return mu, False
356
+
357
+ def _beta_list(self):
358
+ """Topic-word distributions as a per-aspect list."""
359
+ if self.aspect_components_ is None:
360
+ return [self.components_]
361
+ return list(self.aspect_components_)
362
+
363
def transform(self, X, *, prevalence=None, content=None):
    """Infer topic proportions for (possibly new) documents.

    Runs one E-step with the fitted global parameters held fixed
    (cf. fitNewDocuments in the R package) and returns theta of shape
    (n_samples, n_components).

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
        Word counts per document.
    prevalence : array-like, optional
        Prevalence covariates; required when the model was fitted with
        them.
    content : array-like of shape (n_samples,), optional
        Content labels; required when the model was fitted with a
        content covariate.

    Returns
    -------
    theta : ndarray of shape (n_samples, n_topics)
        Row-softmax of the optimized eta with a zero column appended.
    """
    check_is_fitted(self, "components_")
    X, design = self._validate_inputs(X, prevalence, reset=False)
    docs = to_doc_list(X)
    N = X.shape[0]
    # Take K from the fitted topic matrix, not from the hyper-parameter:
    # n_components may have been changed via set_params after fitting,
    # which would size the eta buffer inconsistently with beta/sigma.
    K = self.components_.shape[0]
    _, beta_index = self._encode_content(content, N, reset=False)
    beta = self._beta_list()
    mu, update_mu = self._new_doc_priors(design)

    siginv, sigmaentropy = decompose_sigma(self.sigma_)
    # every document starts its optimization from eta = 0
    eta = np.zeros((N, K - 1))
    for i, (words, counts) in enumerate(docs):
        # topic-word probabilities of this document's aspect, restricted
        # to the words that occur in the document
        beta_d = np.ascontiguousarray(beta[beta_index[i]][:, words])
        mu_d = mu[i] if update_mu else mu
        _, eta[i] = optimize_document(
            eta[i], beta_d, counts, mu_d, siginv, sigmaentropy,
            max_optim_iter=self.e_step_max_iter,
        )
    return row_softmax(np.column_stack([eta, np.zeros(N)]))
389
+
390
def score(self, X, y=None, *, prevalence=None, content=None):
    """Approximate evidence lower bound of ``X`` under the fitted model.

    Runs a single E-step from eta = 0 with the fitted topic-word
    distributions and covariance, and sums the per-document bounds.
    ``y`` is ignored. ``prevalence`` / ``content`` must be supplied when
    the model was fitted with them.

    Returns
    -------
    float
        Sum of the bound values returned by the E-step.
    """
    check_is_fitted(self, "components_")
    X, design = self._validate_inputs(X, prevalence, reset=False)
    docs = to_doc_list(X)
    N = X.shape[0]
    # Take K from the fitted topic matrix, not from the hyper-parameter:
    # n_components may have been changed via set_params after fitting.
    K = self.components_.shape[0]
    _, beta_index = self._encode_content(content, N, reset=False)
    mu, update_mu = self._new_doc_priors(design)
    _, _, bound, _ = estep(
        docs, beta_index, update_mu,
        self._beta_list(), np.zeros((N, K - 1)), mu, self.sigma_,
        max_optim_iter=self.e_step_max_iter,
    )
    return float(bound.sum())
405
+
406
def perplexity(self, X, *, prevalence=None, content=None):
    """Per-token perplexity of ``X``, ``exp(-bound / n_tokens)``.

    The bound is the value returned by :meth:`score` (the
    logistic-normal ELBO), as in
    :meth:`sklearn.decomposition.LatentDirichletAllocation.perplexity`,
    which is likewise based on a variational bound. Values are
    comparable between fits of this class on the same data; lower is
    better.
    """
    check_is_fitted(self, "components_")
    elbo = self.score(X, prevalence=prevalence, content=content)
    # re-validate without covariates only to get X in summable form
    counts, _ = self._validate_inputs(X, None, reset=False)
    total_tokens = counts.sum()
    return float(np.exp(-elbo / total_tokens))
419
+
420
+ # ------------------------------------------------------------------
421
+ # interpretation helpers
422
+
423
def top_words(self, n_words=10, *, kind="prob", frex_weight=0.5):
    """Indices of the top words per topic (cf. labelTopics).

    ``kind="prob"`` ranks by within-topic probability; ``kind="frex"``
    balances frequency and exclusivity with weight ``frex_weight``.
    Returns an array of shape (n_components, n_words).

    Raises
    ------
    ValueError
        If ``kind`` is unknown, ``n_words`` is negative, or (for
        ``kind="frex"``) ``frex_weight`` lies outside [0, 1].
    """
    check_is_fitted(self, "components_")
    if kind not in ("prob", "frex"):
        raise ValueError("kind must be 'prob' or 'frex'.")
    if n_words < 0:
        # a negative value would silently drop trailing columns in the
        # final slice instead of selecting the top words
        raise ValueError("n_words must be non-negative.")

    logbeta = safelog(self.components_)
    if kind == "prob":
        scores = logbeta
    else:
        if not 0.0 <= frex_weight <= 1.0:
            raise ValueError("frex_weight must be between 0 and 1.")
        from scipy.special import logsumexp

        n_terms = logbeta.shape[1]
        # exclusivity: a word's log-probability relative to its total
        # across topics
        excl = logbeta - logsumexp(logbeta, axis=0, keepdims=True)
        # within-topic ranks, scaled by the vocabulary size
        freq_rank = np.apply_along_axis(rankdata, 1, logbeta) / n_terms
        excl_rank = np.apply_along_axis(rankdata, 1, excl) / n_terms
        # weighted harmonic mean of the two scaled ranks
        scores = 1.0 / (frex_weight / freq_rank
                        + (1 - frex_weight) / excl_rank)
    return np.argsort(-scores, axis=1)[:, :n_words]
@@ -0,0 +1,234 @@
1
+ Metadata-Version: 2.4
2
+ Name: structural-topic-model
3
+ Version: 0.2.0
4
+ Summary: Python implementation of the Structural Topic Model (STM), a port of the R stm package with a scikit-learn style API
5
+ Project-URL: Homepage, https://github.com/hirata-keisuke/pystm
6
+ Project-URL: Repository, https://github.com/hirata-keisuke/pystm
7
+ Project-URL: Issues, https://github.com/hirata-keisuke/pystm/issues
8
+ Author-email: hirata-keisuke <plainpeace39th@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: NLP,STM,structural topic model,text mining,topic model
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: numpy>=2.0
21
+ Requires-Dist: scikit-learn>=1.9.0
22
+ Requires-Dist: scipy>=1.14
23
+ Description-Content-Type: text/markdown
24
+
25
+ # pystm — Structural Topic Model in Python
26
+
27
+ R の [stm](https://github.com/bstewart/stm) パッケージ(Roberts, Stewart & Tingley)のコア推定アルゴリズムを Python に移植したものです。API は scikit-learn の `LatentDirichletAllocation` に倣っています。
28
+
29
+ ## STM とは
30
+
31
+ STM はロジスティック正規トピックモデルで、文書のメタデータ(prevalence 共変量)が各文書のトピック比率の事前平均をシフトさせます。共変量なしの場合は Correlated Topic Model (CTM) に帰着します。推定は semi-collapsed 変分 EM で行います(R 版 `stm()` と同一のアルゴリズム)。
32
+
33
+ ## 使い方
34
+
35
+ ```python
36
+ import numpy as np
37
+ from pystm import StructuralTopicModel
38
+
39
+ # X: (n_docs, n_vocab) の単語カウント行列(dense / scipy.sparse どちらも可)
40
+ # covar: (n_docs, n_covariates) の prevalence 共変量(切片は自動付与)
41
+
42
+ model = StructuralTopicModel(n_components=10, init="spectral")
43
+ model.fit(X, prevalence=covar)
44
+
45
+ model.theta_ # 学習文書のトピック比率 (n_docs, K)
46
+ model.components_ # トピック-単語分布 (K, V)。各行の和は1
47
+ model.gamma_ # prevalence 回帰係数 (1+P, K-1)。先頭行が切片
48
+ model.sigma_ # トピック共分散行列 (K-1, K-1)
49
+
50
+ # 新規文書の推論(fitNewDocuments 相当)
51
+ theta_new = model.transform(X_new, prevalence=covar_new)
52
+
53
+ # トピックの代表語(labelTopics 相当)
54
+ model.top_words(n_words=10) # 確率順
55
+ model.top_words(n_words=10, kind="frex") # FREX(頻度と排他性のバランス)
56
+ ```
57
+
58
+ 共変量を渡さなければ CTM として推定されます:
59
+
60
+ ```python
61
+ model = StructuralTopicModel(n_components=10).fit(X)
62
+ ```
63
+
64
+ ### content 共変量(SAGE / Distributed Multinomial Regression)
65
+
66
+ 文書のカテゴリによってトピック内の語彙の使い方が変わるモデルです。各文書に1つのカテゴリラベルを渡します:
67
+
68
+ ```python
69
+ model = StructuralTopicModel(n_components=10)
70
+ model.fit(X, prevalence=covar, content=party_labels) # 例: 政党ラベル
71
+
72
+ model.aspect_components_ # カテゴリ別トピック-語彙分布 (n_levels, K, V)
73
+ model.kappa_["params"] # ベースラインからのスパースな偏差(lasso 推定)
74
+ model.content_levels_ # カテゴリ水準
75
+ # transform / score にも同じ content を渡す
76
+ model.transform(X_new, prevalence=c_new, content=labels_new)
77
+ ```
78
+
79
+ R 版の `kappa.prior="L1"`(既定)に相当する Distributed Poisson 回帰で推定します。glmnet の代わりに、設計行列のインジケータ構造を利用して語彙方向に完全ベクトル化した IRLS+座標降下の Poisson lasso を実装しています(正則化パスと情報量規準による選択も R と同じ)。
80
+
81
+ ### estimateEffect 相当: 共変量効果の推定
82
+
83
+ トピック比率を目的変数とする回帰を method of composition(変分事後分布からの θ サンプリング)で行い、測定不確実性込みの係数を返します:
84
+
85
+ ```python
86
+ from pystm import estimate_effect
87
+
88
+ eff = estimate_effect(model, covar, uncertainty="Global", nsims=25)
89
+ tables = eff.summary() # {topic: 構造化配列(estimate/std_error/t_value/p_value)}
90
+ tables[0]["estimate"] # トピック0の回帰係数(先頭が切片)
91
+ ```
92
+
93
+ `uncertainty="Global"`(推奨・既定)と `"None"` をサポートします(R の `"Local"` は未実装)。
94
+
95
+ ### searchK 相当: トピック数の選択
96
+
97
+ ```python
98
+ from pystm import search_k
99
+
100
+ res = search_k(X, K_values=[5, 10, 15], prevalence=covar,
101
+ model_params={"max_iter": 100})
102
+ res["heldout"] # document completion による heldout 対数尤度
103
+ res["residual"] # Taddy (2012) の残差分散(1 に近いほど良い)
104
+ res["semcoh"] # 意味的一貫性 / res["exclus"]: 排他性
105
+ res["bound"], res["lbound"], res["em_its"]
106
+ ```
107
+
108
+ ### その他の診断
109
+
110
+ ```python
111
+ from pystm import topic_corr, semantic_coherence, exclusivity, check_residuals
112
+
113
+ tc = topic_corr(model, cutoff=0.01) # トピック相関グラフ(simple 法)
114
+ tc.posadj # 正相関の隣接行列
115
+ semantic_coherence(model, X, M=10) # トピックごとの意味的一貫性
116
+ exclusivity(model, M=10) # トピックごとの排他性(content モデル不可)
117
+ check_residuals(model, X) # 残差分散検定 {dispersion, pvalue, df}
118
+ ```
119
+
120
+ ## R 版との対応
121
+
122
+ | R | Python |
123
+ |---|---|
124
+ | `stm(docs, vocab, K, prevalence=~x, data=meta)` | `StructuralTopicModel(n_components=K).fit(X, prevalence=design)` |
125
+ | `init.type="Spectral"` (推奨・既定) | `init="spectral"` (既定) |
126
+ | `init.type="Random"` | `init="random"` |
127
+ | `gamma.prior="Pooled"` (既定) | 実装済み(共変量ありのとき自動) |
128
+ | `sigma.prior` | `sigma_prior` |
129
+ | `emtol` / `max.em.its` | `tol` / `max_iter` |
130
+ | `model=`(フィット済みモデルから再開) | `warm_start=True`(fit を繰り返し呼ぶ。`bound_` に履歴が蓄積) |
131
+ | `content=~group`(`kappa.prior="L1"`, 既定) | `fit(X, content=labels)` |
132
+ | `interactions` | `content_interactions` |
133
+ | `fitNewDocuments()` | `transform()` |
134
+ | `labelTopics()` | `top_words()` |
135
+ | `estimateEffect()` / `summary()` | `estimate_effect()` / `.summary()` |
136
+ | `searchK()` | `search_k()` |
137
+ | `make.heldout()` / `eval.heldout()` | `make_heldout()` / `eval_heldout()` |
138
+ | `topicCorr(method="simple")` | `topic_corr()` |
139
+ | `semanticCoherence()` / `exclusivity()` / `checkResiduals()` | `semantic_coherence()` / `exclusivity()` / `check_residuals()` |
140
+ | `$theta` / `$beta` / `$sigma` / `$mu$gamma` | `theta_` / `components_` / `sigma_` / `gamma_` |
141
+ | `$beta$logbeta`(content モデル) | `aspect_components_`(確率スケール) |
142
+ | `$beta$kappa` | `kappa_` |
143
+
144
+ ### scikit-learn LDA との API 差分
145
+
146
+ - `perplexity(X)` を sklearn LDA と同様に提供(変分下界ベースの `exp(-bound/総トークン数)`。低いほど良い)。
147
+ - `warm_start=True` で sklearn 流の継続学習(R 版 `model=` 相当)。
148
+ - `components_` は正規化済みの確率分布(sklearn LDA は擬似カウント)。
149
+ - 共変量は `fit(X, prevalence=...)` / `transform(X, prevalence=...)` のキーワードで渡す。R の formula は使えないので、カテゴリ変数は事前に one-hot 等で数値化してください(`patsy` や `pandas.get_dummies` が便利)。
150
+
151
+ ### 未実装
152
+
153
+ - `gamma.prior="L1"`(prevalence 側の glmnet 依存モード)
154
+ - `kappa.prior="Jeffreys"`(content の旧推定法。R 版でも後方互換のためだけに残されている)
155
+ - `fixedintercept=FALSE`(content モデルの切片推定)
156
+ - LDA(collapsed Gibbs)初期化、`ngroups` メモ化推論、`K=0`(Lee & Mimno)
157
+ - `estimateEffect()` の `uncertainty="Local"`、formula インターフェース(スプライン `s()` 等は事前に基底展開した行列を渡せば等価)
158
+ - `topicCorr(method="huge")`(huge パッケージ依存)、`selectModel()`、`permutationTest()`、プロット関数群
159
+
160
+ また、spectral 初期化の RecoverL2 は R 版既定の quadprog の代わりにペナルティ付き NNLS による厳密に近い解法を使います(指数勾配法 `recoverEG=TRUE` 相当も `pystm._spectral.recover_l2(solver="expgrad")` として利用可能)。
161
+
162
+ ## 実装メモ(R 版からの移植で見つかった重要な点)
163
+
164
+ 1. **`update.mu` の切り替えタイミング**: R 版(`stm.control.R`)では E-step に渡す事前平均の選択を `update.mu = !is.null(mu$gamma)` で判定している。つまり**初回 E-step は共有平均(ゼロベクトル)を使い、γ が推定された 2 回目以降に文書別の事前平均 Xγ に切り替わる**。「prevalence 共変量があるか」で判定すると、初回 E-step で形状不一致または誤った事前を使うバグになる(本実装も最初これを踏んだ)。
165
+
166
+ 2. **RecoverL2 のソルバー選択**: R 版の既定は quadprog による厳密な単体制約付き QP(`recoverEG=FALSE`)。論文由来の指数勾配法(`recoverEG=TRUE`)は反復上限 500 では、**1つのトピックが支配的なコーパス(文書内でトピックが強く混ざる場合)に収束不足**となり、初期化品質が大きく劣化した(K=10 の合成データで cos 類似度 0.45 前後 vs NNLS で 0.97)。反復を 20,000 まで増やしても改善しなかったため、最適化の遅さではなく平坦な目的関数で実質停止していた。本実装はペナルティ付き NNLS(和=1 制約を重み付き行で課す)を既定とした。
167
+
168
+ 3. **Random 初期化は局所解に落ちやすい**(R 版ドキュメントの記述どおり、seed によりトピック復元が大きく変わる)。動作確認・検証には決定的な Spectral 初期化を使うこと。
169
+
170
+ 4. **gram 行列の検証方法**: スペクトル初期化の正しさは、合成データで経験 gram 行列が理論期待値 `β' E[θθ'] β`(行正規化後)と一致するかで切り分けられる(本実装では最大誤差 ~1e-3 で一致)。初期化品質が悪いときは実装バグではなく、コーパス側の共起信号の弱さ(θ の混合度)が原因のことがある。
171
+
172
+ 5. **mnreg(content 共変量)の高速化**: Distributed Poisson 回帰の設計行列は「トピック主効果 / アスペクト主効果 / 交互作用」の3グループのインジケータ列で、**各グループ内の列は互いに素な行しか触らない**。そのためグループ単位の座標降下が1回の行列演算になり、さらに全語彙が同一の設計行列を共有するので V 方向にも完全ベクトル化できる。汎用の座標降下実装と比べ同一解で大幅に高速(さらに IRLS 上限 4 / スイープ上限 8 / tol 1e-4 に絞っても β の最大差は ~2e-5)。
173
+
174
+ 6. **同梱 gadarianFit の前処理は現行 textProcessor と異なる**: パッケージ同梱の `gadarianFit`(2017年)の語彙は、現行 `textProcessor.R` の処理順(句読点除去→ストップワード除去、ダッシュ保存)では再現できない。旧版の処理順(**ストップワード除去が句読点除去より先**=アポストロフィ付きの "can't" 等がストップワードとして除去される、かつ**ダッシュ非保存**= "tax-payers"→"taxpayers")+ `lower.thresh=3` で215語が完全一致する([scripts/gadarian_prep.py](scripts/gadarian_prep.py) の `legacy_order=True`)。R 版の再現実験をする際はパッケージバージョンごとの前処理差に注意。
175
+
176
+ 7. **E-step の数値ガードが実データでは必須**: R/C++ 原実装どおりの素朴な `exp(eta)` 計算は、実コーパス(短文・偏った β・大きめ K)で BFGS の直線探索が極端な点を踏んだときに inf/NaN を発生させ、Hessian の cholesky が落ちる。η のクリップ(±200)、log と除算の下限(1e-300)、非有限解のフォールバックを `_estep.py` に追加した(通常領域の値は不変、合成データ・gadarian 検証とも退行なし)。
177
+
178
+ 8. **heldout 構築時の語彙消失**: トークンを訓練側から取り除くと、コーパス全体から消える語が生じうる。R 版 `make.heldout` は語彙を再番号付けして missing 側からも削除している。これを怠ると、その語の β が 0 になり heldout 対数尤度が -inf になる。本実装も missing 側から該当トークンを除外している。
179
+
180
+ ## R 版との検証(gadarianFit)
181
+
182
+ R パッケージ同梱の `gadarianFit`(Roberts et al. 2014 AJPS の Gadarian & Albertson 移民調査データ、K=3、prevalence = treatment*pid_rep、N=341)を参照解として、本実装を数値レベルで検証済み。再現方法:
183
+
184
+ ```bash
185
+ uv run python scripts/validate_gadarian.py # 11/11 チェック合格
186
+ ```
187
+
188
+ | 検証項目 | 結果 |
189
+ |---|---|
190
+ | コーパス再現(textProcessor + prepDocuments の移植) | 語彙215語・単語カウントともR版と**完全一致** |
191
+ | R版パラメータでの E-step bound | pystm -13575.82 vs R -13575.91(R の1反復増分 0.103 の範囲内で一致) |
192
+ | R版パラメータでの文書別 θ | 相関 > 0.9999、最大差 0.02 |
193
+ | R版の解からの EM 継続(不動点チェック) | bound 単調増加・増分は収束閾値レベル(10反復で +1.86) |
194
+ | 独立フィット(Spectral 初期化)の bound | R比 -0.18%(R は確率的 LDA 初期化なので局所解の違いは想定内) |
195
+ | トピックの対応 | 3トピックとも cos 類似度 0.88 前後、上位語ほぼ一致(worri/immigr/border、job/tax/pay、peopl/countri/come) |
196
+ | treatment 効果 | 全トピックで符号一致。有意な正の効果は +0.215 vs R +0.219 とほぼ同値 |
197
+
198
+ 注: R 版のフィット自体は LDA Gibbs 初期化(R の乱数)に依存するため完全一致は原理的に不可能。代わりに「R の解が本実装の EM の不動点になっているか」「bound 計算が R の報告値と一致するか」で実装の同一性を確認している。
199
+
200
+ ## 開発
201
+
202
+ ```bash
203
+ uv sync
204
+ uv run pytest tests/
205
+ ```
206
+
207
+ ## 他プロジェクトからの利用(配布)
208
+
209
+ import 名は `pystm`(配布名は `structural-topic-model`)。実行時依存は numpy / scipy / scikit-learn のみ
210
+ (janome / dash 等は application 用の dev 依存で、配布物には含まれない)。
211
+
212
+ ```bash
213
+ # PyPI からインストール
214
+ pip install structural-topic-model
215
+ # または
216
+ uv add structural-topic-model
217
+
218
+ # ローカルパス参照(開発中)
219
+ uv add --editable /path/to/202606_StructuralTopicModel
220
+
221
+ # Git 経由
222
+ uv add git+<リポジトリURL>
223
+
224
+ # wheel をビルド
225
+ uv build # dist/structural_topic_model-x.y.z-py3-none-any.whl
226
+ ```
227
+
228
+ 配布名は `structural-topic-model`、import 名は `pystm` のまま維持しています
229
+ (PyPI の `pystm` は別の実装に取られているため)。
230
+
231
+ ## 参考文献
232
+
233
+ - Roberts, M., Stewart, B., & Tingley, D. (2019). stm: An R Package for Structural Topic Models. *Journal of Statistical Software*, 91(2).
234
+ - Arora, S. et al. (2013). A Practical Algorithm for Topic Modeling with Provable Guarantees. *ICML*.