structural-topic-model 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pystm/diagnostics.py ADDED
@@ -0,0 +1,168 @@
1
+ """Model diagnostics (ports of semanticCoherence.R, exclusivity.R,
2
+ residuals.R and topicCorr.R)."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import numpy as np
7
+ from scipy import stats
8
+ from scipy.sparse import csc_matrix, csr_matrix, issparse
9
+ from scipy.stats import rankdata
10
+
11
+ from ._utils import safelog
12
+
13
+
14
+ def _as_csr(X):
15
+ return csr_matrix(X) if not issparse(X) else X.tocsr()
16
+
17
+
18
+ def _semcoh_one_beta(X, logbeta, M):
19
+ """Semantic coherence per topic for one beta (semCoh1beta in R)."""
20
+ K = logbeta.shape[0]
21
+ top_words = np.argsort(-logbeta, axis=1)[:, :M]
22
+ wordlist, positions = np.unique(top_words, return_inverse=True)
23
+ labels = positions.reshape(K, M)
24
+
25
+ sub = csc_matrix(X[:, wordlist])
26
+ sub.data = np.minimum(sub.data, 1.0) # binarize
27
+ cross = (sub.T @ sub).toarray() # document co-occurrence counts
28
+
29
+ result = np.zeros(K)
30
+ for k in range(K):
31
+ idx = labels[k]
32
+ for a in range(M):
33
+ for b in range(M):
34
+ m_, l_ = idx[a], idx[b]
35
+ if m_ > l_:
36
+ result[k] += (np.log(0.01 + cross[m_, l_])
37
+ - np.log(cross[l_, l_] + 0.01))
38
+ return result
39
+
40
+
41
def semantic_coherence(model, X, content=None, M=10):
    """Semantic coherence (Mimno et al. 2011) per topic.

    Measures how often each topic's top ``M`` words co-occur within
    documents; higher is better.

    Parameters
    ----------
    model : fitted model
        Read for ``aspect_components_``, ``components_``,
        ``content_levels_`` and ``n_components``.
    X : array-like or sparse matrix of shape (n_docs, V)
    content : array-like, optional
        Content labels, one per document. Required when the model has
        aspect-specific betas; each level is scored on its own documents
        and the scores are averaged weighted by document count, as in the
        R package.
    M : int, default=10

    Raises
    ------
    ValueError
        If the model has aspect-specific betas and ``content`` is None.
    """
    X = _as_csr(X)
    aspect_betas = model.aspect_components_
    if aspect_betas is None:
        return _semcoh_one_beta(X, safelog(model.components_), M)
    if content is None:
        raise ValueError(
            "The model was fitted with a content covariate; pass the "
            "matching content labels."
        )

    levels = model.content_levels_
    level_of_doc = np.searchsorted(levels, np.asarray(content).ravel())
    weighted = np.zeros(model.n_components)
    for a in range(len(levels)):
        members = level_of_doc == a
        n_members = members.sum()
        if n_members == 0:
            continue  # no documents carry this level
        level_score = _semcoh_one_beta(X[members], safelog(aspect_betas[a]), M)
        weighted += level_score * n_members
    return weighted / X.shape[0]
67
+
68
+
69
def exclusivity(model, M=10, frexw=0.7):
    """FREX-based exclusivity per topic (exclusivity in R).

    For each topic, sums the FREX scores of its ``M`` most probable
    words. FREX is the weighted harmonic mean of a word's within-topic
    rank by exclusivity (weight ``frexw``) and by frequency.

    Raises
    ------
    ValueError
        If the model has aspect-specific betas (content covariate), for
        which the score is not defined (matching the R package).
    """
    if model.aspect_components_ is not None:
        raise ValueError(
            "Exclusivity calculation is only designed for models without "
            "content covariates."
        )
    word_by_topic = model.components_.T  # (V, K)
    n_words, n_topics = word_by_topic.shape
    # share of each word's total mass that falls in each topic
    share = word_by_topic / word_by_topic.sum(axis=1, keepdims=True)

    excl_rank = np.apply_along_axis(rankdata, 0, share) / n_words
    freq_rank = np.apply_along_axis(rankdata, 0, word_by_topic) / n_words
    frex = 1.0 / (frexw / excl_rank + (1 - frexw) / freq_rank)

    top = np.argsort(-word_by_topic, axis=0)[:M]
    scores = np.empty(n_topics)
    for k in range(n_topics):
        scores[k] = frex[top[:, k], k].sum()
    return scores
90
+
91
+
92
def check_residuals(model, X, content=None, tol=0.01):
    """Multinomial dispersion of the residuals (Taddy 2012).

    A dispersion of 1 is expected under a correctly specified model;
    values above 1 suggest the number of topics is too small.

    Parameters
    ----------
    model : fitted model
        Read for ``n_components``, ``theta_``, ``content_levels_`` and
        ``_beta_list()``.
    X : array-like or sparse matrix of shape (n_docs, V)
        Must have as many rows as ``model.theta_``.
    content : array-like, optional
        Content labels; required when ``model.content_levels_`` is set.
    tol : float, default=0.01
        Expected counts above this threshold are tallied for the degrees
        of freedom.

    Returns
    -------
    dict with keys ``dispersion``, ``pvalue`` and ``df``. ``pvalue`` is
    NaN when ``df`` is not positive.

    Raises
    ------
    ValueError
        On a row-count mismatch with ``theta_`` or missing ``content``.
    """
    X = _as_csr(X)
    n, V = X.shape
    K = model.n_components
    theta = model.theta_
    if theta.shape[0] != n:
        raise ValueError("X must be the corpus the model was fitted on.")

    beta = model._beta_list()
    levels = model.content_levels_
    if levels is None:
        # a single beta shared by every document
        index = np.zeros(n, dtype=np.int64)
    elif content is None:
        raise ValueError(
            "The model was fitted with a content covariate; pass the "
            "matching content labels."
        )
    else:
        index = np.searchsorted(levels, np.asarray(content).ravel())

    d = n * (K - 1) + K * (V - 1)
    stat = 0.0
    n_active = 0
    for i in range(n):
        doc = X.getrow(i)
        q = theta[i] @ beta[index[i]]  # (V,) predicted word probabilities
        m = doc.sum()  # document length
        n_active += int((q * m > tol).sum())
        x = np.zeros(V)
        x[doc.indices] = doc.data
        fit_term = ((x**2 - 2 * x * q * m) / (m * q * (1 - q))).sum()
        stat += fit_term + (m * q / (1 - q)).sum()

    df = n_active - V - d
    with np.errstate(invalid="ignore"):
        dispersion = stat / df
        pvalue = stats.chi2.sf(stat, df) if df > 0 else np.nan
    return {"dispersion": dispersion, "pvalue": pvalue, "df": df}
136
+
137
+
138
class TopicCorrelations:
    """Container for the output of :func:`topic_corr` (topicCorr in R).

    Attributes
    ----------
    cor : ndarray (K, K)
        Correlations; entries whose absolute value is below ``cutoff``
        are zeroed.
    posadj : ndarray (K, K)
        Adjacency matrix marking positive correlations above the cutoff.
    poscor : ndarray (K, K)
        Correlations restricted to the positive adjacency structure.
    """

    def __init__(self, cor, posadj, poscor):
        self.cor, self.posadj, self.poscor = cor, posadj, poscor
156
+
157
+
158
def topic_corr(model, cutoff=0.01):
    """Topic correlation graph from theta (topicCorr method="simple").

    Correlates the columns of ``model.theta_`` and thresholds the result
    at ``cutoff``. The R package's "huge" method (semiparametric
    graphical model selection) depends on the huge package and is not
    implemented.

    Returns
    -------
    TopicCorrelations
    """
    corr = np.corrcoef(model.theta_, rowvar=False)
    adjacency = (corr > cutoff).astype(float)
    strong = np.abs(corr) > cutoff
    return TopicCorrelations(
        cor=np.where(strong, corr, 0.0),
        posadj=adjacency,
        poscor=corr * adjacency,
    )
pystm/effects.py ADDED
@@ -0,0 +1,203 @@
1
+ """Covariate effect estimation (port of estimateEffect.R / thetaPosterior.R).
2
+
3
+ Regressions where topic proportions are the outcome, propagating the
4
+ measurement uncertainty of theta via the method of composition: draw
5
+ theta from the variational posterior, run the OLS, repeat, then pool.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import warnings
11
+
12
+ import numpy as np
13
+ from scipy import stats
14
+ from sklearn.utils import check_random_state
15
+
16
+ from ._utils import row_softmax
17
+
18
+
19
+ def _global_sigma(model):
20
+ """Global approximation to the per-document posterior covariance.
21
+
22
+ Subtracts the contribution of deviations from the prior mean out of
23
+ the topic covariance, leaving the (average) variational covariance
24
+ (thetapost.global in R).
25
+ """
26
+ lambda_ = model.eta_
27
+ mu = model.mu_
28
+ diff = lambda_ - (mu[None, :] if mu.ndim == 1 else mu)
29
+ covariance = (diff.T @ diff) / lambda_.shape[0]
30
+ sigma = model.sigma_ - covariance
31
+ # guard against indefiniteness from the subtraction
32
+ evals, evecs = np.linalg.eigh(sigma)
33
+ if evals[0] <= 0:
34
+ evals = np.maximum(evals, 1e-10)
35
+ sigma = (evecs * evals) @ evecs.T
36
+ return sigma
37
+
38
+
39
def _draw_theta(model, rng):
    """Draw theta once for every document (Global approximation).

    Perturbs ``model.eta_`` with Gaussian noise whose covariance is the
    global sigma, appends a zero column and applies ``row_softmax``.
    """
    factor = np.linalg.cholesky(_global_sigma(model))
    noise = rng.standard_normal(model.eta_.shape)
    eta_draw = model.eta_ + noise @ factor.T
    zero_column = np.zeros(eta_draw.shape[0])
    return row_softmax(np.column_stack([eta_draw, zero_column]))
48
+
49
+
50
+ class _QRRegression:
51
+ """OLS with a cached QR decomposition (qr.lm / summary.qr.lm in R)."""
52
+
53
+ def __init__(self, xmat, prior=None):
54
+ self.n_obs = xmat.shape[0]
55
+ p = xmat.shape[1]
56
+ if prior is not None:
57
+ if np.isscalar(prior):
58
+ prior = np.diag(np.full(p, float(prior)))
59
+ xmat = np.vstack([xmat, np.linalg.cholesky(prior).T])
60
+ if np.linalg.matrix_rank(xmat) < p:
61
+ warnings.warn(
62
+ "Covariate matrix is singular; adding a small ridge prior "
63
+ "(1e-5) for numerical stability.", stacklevel=3,
64
+ )
65
+ xmat = np.vstack([xmat, np.sqrt(1e-5) * np.eye(p)])
66
+ self.xmat = xmat
67
+ self.q, self.r = np.linalg.qr(xmat)
68
+ self.rinv = np.linalg.inv(self.r)
69
+ self.df_residual = xmat.shape[0] - p
70
+
71
+ def fit(self, y):
72
+ if y.shape[0] != self.xmat.shape[0]:
73
+ y = np.concatenate(
74
+ [y, np.zeros(self.xmat.shape[0] - y.shape[0])]
75
+ )
76
+ coef = self.rinv @ (self.q.T @ y)
77
+ resid = y - self.xmat @ coef
78
+ resvar = (resid @ resid) / self.df_residual
79
+ vcov = resvar * (self.rinv @ self.rinv.T)
80
+ return coef, vcov
81
+
82
+
83
class EstimatedEffects:
    """Container returned by :func:`estimate_effect`.

    Attributes
    ----------
    parameters : dict
        Topic index -> list of ``(coef, vcov)`` pairs, one pair per
        composition draw.
    topics : list of int
        0-based topics for which effects were estimated.
    n_obs : int
        Number of documents.
    n_params : int
        Number of regression coefficients (including the intercept).
    """

    def __init__(self, parameters, topics, n_obs, n_params):
        self.parameters, self.topics = parameters, topics
        self.n_obs, self.n_params = n_obs, n_params

    def summary(self, topics=None, nsim=500, random_state=None):
        """Pooled coefficient tables (summary.estimateEffect in R).

        For each requested topic, ``nsim`` coefficient vectors are
        simulated from every stored ``(coef, vcov)`` pair; the pooled
        simulations give the estimate (mean) and standard error (sample
        standard deviation).

        Returns
        -------
        dict
            Topic index -> record array with fields ``estimate``,
            ``std_error``, ``t_value`` and ``p_value``, one row per
            regression coefficient.

        Raises
        ------
        ValueError
            If a requested topic has no stored parameters.
        """
        rng = check_random_state(random_state)
        selected = self.topics if topics is None else list(topics)
        fields = [("estimate", float), ("std_error", float),
                  ("t_value", float), ("p_value", float)]
        tables = {}
        for k in selected:
            if k not in self.parameters:
                raise ValueError(f"Topic {k} was not estimated.")
            draws = []
            for coef, vcov in self.parameters[k]:
                draws.append(rng.multivariate_normal(coef, vcov, size=nsim))
            sims = np.vstack(draws)

            estimate = sims.mean(axis=0)
            std_error = sims.std(axis=0, ddof=1)
            t_value = estimate / std_error
            resid_df = self.n_obs - self.n_params
            p_value = 2 * stats.t.sf(np.abs(t_value), resid_df)

            table = np.zeros(len(estimate), dtype=fields)
            for name, column in (("estimate", estimate),
                                 ("std_error", std_error),
                                 ("t_value", t_value),
                                 ("p_value", p_value)):
                table[name] = column
            tables[k] = table
        return tables
136
+
137
+
138
def estimate_effect(model, prevalence, topics=None,
                    uncertainty="Global", nsims=25, prior=None,
                    random_state=None):
    """Regress topic proportions on covariates (estimateEffect in R).

    Parameters
    ----------
    model : fitted StructuralTopicModel
    prevalence : array-like of shape (n_samples, n_covariates)
        Covariate design matrix for the regression. A 1-D input is
        treated as a single column. An intercept column of ones is
        prepended unless the first column already is (numerically) all
        ones. Should normally contain (at least) the covariates used when
        fitting the model. Categorical variables must be encoded
        numerically beforehand.
    topics : iterable of int, optional
        0-based topic indices to estimate effects for (default: all).
    uncertainty : {"Global", "None"}, default="Global"
        "Global" draws theta from the variational posterior using a
        globally shared covariance approximation; "None" uses the MAP
        theta without measurement uncertainty. (The R package's "Local"
        method is not implemented.)
    nsims : int, default=25
        Number of method-of-composition draws ("Global" only; a single
        pass is made for "None").
    prior : float or ndarray, optional
        Ridge penalty (scalar or full precision matrix) added to the
        regression for numerical stability.
    random_state : optional
        Passed to ``check_random_state`` to seed the theta draws.

    Returns
    -------
    EstimatedEffects

    Raises
    ------
    ValueError
        If the model has no ``theta_``, ``uncertainty`` is not
        recognised, a topic index is out of range, or ``prevalence`` has
        the wrong number of rows.
    """
    if not hasattr(model, "theta_"):
        raise ValueError("model must be a fitted StructuralTopicModel.")
    if uncertainty not in ("Global", "None"):
        raise ValueError(
            "uncertainty must be 'Global' or 'None' ('Local' is not "
            "implemented; 'Global' is the recommended method)."
        )
    rng = check_random_state(random_state)
    K = model.theta_.shape[1]
    if topics is None:
        topics = list(range(K))
    else:
        topics = list(topics)
    for k in topics:
        if k < 0 or k >= K:
            raise ValueError(
                "topics must be 0-based indices below n_components."
            )

    xmat = np.asarray(prevalence, dtype=np.float64)
    if xmat.ndim == 1:
        xmat = xmat[:, None]
    n_docs = model.theta_.shape[0]
    if xmat.shape[0] != n_docs:
        raise ValueError(
            "prevalence has a different number of rows than the fitted "
            "documents."
        )
    has_intercept = np.allclose(xmat[:, 0], 1.0)
    if not has_intercept:
        xmat = np.column_stack([np.ones(xmat.shape[0]), xmat])

    reg = _QRRegression(xmat, prior=prior)
    use_map = uncertainty == "None"
    n_draws = 1 if use_map else nsims

    parameters = {k: [] for k in topics}
    for _ in range(n_draws):
        theta = model.theta_ if use_map else _draw_theta(model, rng)
        for k in topics:
            parameters[k].append(reg.fit(theta[:, k]))

    return EstimatedEffects(parameters, topics,
                            n_obs=model.theta_.shape[0],
                            n_params=xmat.shape[1])
@@ -0,0 +1,166 @@
1
+ """Choosing the number of topics (ports of searchK.R and heldout.R)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from scipy.sparse import csr_matrix, issparse, lil_matrix
7
+ from scipy.special import gammaln
8
+ from sklearn.utils import check_random_state
9
+
10
+ from ._utils import safelog
11
+ from .diagnostics import check_residuals, exclusivity, semantic_coherence
12
+ from .stm import StructuralTopicModel
13
+
14
+
15
def make_heldout(X, N=None, proportion=0.5, random_state=None):
    """Hold out a fraction of tokens for document completion (make.heldout).

    Picks ``N`` documents at random (default: 10% of the corpus) and
    removes ``proportion`` of each one's tokens, always leaving at least
    one token behind. Documents with fewer than two distinct words are
    left untouched. Held-out tokens whose word no longer occurs anywhere
    in the training corpus are dropped (mirroring the R package's vocab
    remapping).

    Returns
    -------
    dict
        ``X_train`` (csr matrix), ``index`` (ids of the held-out
        documents) and ``docs`` (list of ``(word_indices, counts)``).

    Raises
    ------
    ValueError
        If ``N`` is outside ``1..n_docs`` or ``proportion`` is not in
        the open interval (0, 1).
    """
    X = X.tocsr() if issparse(X) else csr_matrix(X)
    rng = check_random_state(random_state)
    n_docs, V = X.shape
    if N is None:
        N = int(np.floor(0.1 * n_docs))
    if not 0 < N <= n_docs:
        raise ValueError("N must be between 1 and the number of documents.")
    if not 0 < proportion < 1:
        raise ValueError("proportion must be in (0, 1).")

    index = np.sort(rng.choice(n_docs, size=N, replace=False))
    X_train = lil_matrix(X.copy())
    candidates = []  # (document id, held-out words, held-out counts)
    for i in index:
        row = X.getrow(i)
        if row.indices.shape[0] < 2:
            continue  # too few distinct words to split (as in R)
        # expand the counts into one entry per token
        tokens = np.repeat(row.indices, row.data.astype(np.int64))
        n_tokens = tokens.shape[0]
        nsamp = max(1, int(np.floor(proportion * n_tokens)))
        nsamp = min(nsamp, n_tokens - 1)  # keep the doc non-empty
        held = rng.choice(n_tokens, size=nsamp, replace=False)
        held_counts = np.bincount(tokens[held], minlength=V)
        words = np.flatnonzero(held_counts)
        for w in words:
            X_train[i, w] -= held_counts[w]
        candidates.append(
            (i, words, held_counts[words].astype(np.float64))
        )
    X_train = csr_matrix(X_train)
    X_train.eliminate_zeros()

    # drop held-out tokens of words that vanished from the training corpus
    train_wcounts = np.asarray(X_train.sum(axis=0)).ravel()
    final_docs, final_index = [], []
    for i, words, counts in candidates:
        keep = train_wcounts[words] > 0
        if not keep.any():
            continue
        final_docs.append((words[keep], counts[keep]))
        final_index.append(i)
    return {
        "X_train": X_train,
        "index": np.asarray(final_index, dtype=np.int64),
        "docs": final_docs,
    }
70
+
71
+
72
def eval_heldout(model, heldout, content=None):
    """Heldout log-likelihood by document completion (eval.heldout).

    Scores each document's held-out tokens with the log of
    ``theta_[doc] @ beta[:, words]``, using the theta fitted on the
    partially observed document.

    Parameters
    ----------
    model : fitted model
        Read for ``theta_``, ``content_levels_`` and ``_beta_list()``.
    heldout : dict
        Needs the keys ``index`` and ``docs`` (see ``make_heldout``).
    content : array-like, optional
        Content labels; required when ``model.content_levels_`` is set.

    Returns
    -------
    dict
        ``expected_heldout`` (NaN-ignoring mean over documents of the
        mean per-token log-probability) and ``doc_heldout`` (the
        per-document values).

    Raises
    ------
    ValueError
        If the model has content levels and ``content`` is None.
    """
    beta = model._beta_list()
    levels = model.content_levels_
    if levels is None:
        # a single beta shared by every document
        index_all = np.zeros(model.theta_.shape[0], dtype=np.int64)
    elif content is None:
        raise ValueError(
            "The model was fitted with a content covariate; pass the "
            "matching content labels."
        )
    else:
        index_all = np.searchsorted(levels, np.asarray(content).ravel())

    docs = heldout["docs"]
    doc_scores = np.empty(len(docs))
    for j, (doc_id, (words, counts)) in enumerate(zip(heldout["index"], docs)):
        word_probs = model.theta_[doc_id] @ beta[index_all[doc_id]][:, words]
        logprobs = safelog(word_probs)
        # repeat each word's log-probability once per held-out token
        per_token = np.repeat(logprobs, counts.astype(np.int64))
        doc_scores[j] = per_token.mean()
    return {
        "expected_heldout": float(np.nanmean(doc_scores)),
        "doc_heldout": doc_scores,
    }
103
+
104
+
105
def search_k(X, K_values, *, prevalence=None, content=None, N=None,
             proportion=0.5, M=10, heldout_random_state=None,
             model_params=None, verbose=False):
    """Fit models over a grid of K and compute diagnostics (searchK).

    One heldout split is created up front and reused for every K, so
    all models are trained on the same partially observed corpus.

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
    K_values : iterable of int
        Topic numbers to evaluate.
    prevalence, content : optional
        Covariates forwarded to :meth:`StructuralTopicModel.fit`.
    N, proportion : heldout construction parameters (see make_heldout).
    M : int, default=10
        Number of top words for exclusivity / semantic coherence.
    heldout_random_state : optional
        Passed to make_heldout as ``random_state``.
    model_params : dict, optional
        Extra keyword arguments for the StructuralTopicModel constructor.
    verbose : bool, default=False
        Print a progress line before fitting each K.

    Returns
    -------
    dict of arrays keyed by "K", "heldout", "residual", "bound",
    "lbound", "exclus", "semcoh", "em_its" (exclusivity and semantic
    coherence are omitted for content covariate models, as in R).
    """
    K_values = list(K_values)
    model_params = dict(model_params or {})
    heldout = make_heldout(X, N=N, proportion=proportion,
                           random_state=heldout_random_state)
    X_train = heldout["X_train"]

    metric_names = ("K", "heldout", "residual", "bound", "lbound",
                    "exclus", "semcoh", "em_its")
    results = {name: [] for name in metric_names}
    for K in K_values:
        if verbose:
            print(f"searchK: fitting K={K} ...")
        model = StructuralTopicModel(n_components=K, **model_params)
        model.fit(X_train, prevalence=prevalence, content=content)

        results["K"].append(K)
        heldout_eval = eval_heldout(model, heldout, content=content)
        results["heldout"].append(heldout_eval["expected_heldout"])
        residual_check = check_residuals(model, X_train, content=content)
        results["residual"].append(residual_check["dispersion"])

        bound = max(model.bound_)
        results["bound"].append(bound)
        # add log(K!) to the bound
        results["lbound"].append(bound + gammaln(K + 1))

        if content is None:
            topic_exclusivity = exclusivity(model, M=M, frexw=0.7)
            results["exclus"].append(float(np.mean(topic_exclusivity)))
            topic_coherence = semantic_coherence(model, X_train, M=M)
            results["semcoh"].append(float(np.mean(topic_coherence)))
        results["em_its"].append(model.n_iter_)

    if content is not None:
        # never filled for content covariate models
        for name in ("exclus", "semcoh"):
            del results[name]
    return {name: np.asarray(values) for name, values in results.items()}