pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
"""Forward prediction for :class:`TemporalTrajectory`.
|
|
2
|
+
|
|
3
|
+
The other temporal modules answer *retrospective* questions —
|
|
4
|
+
"when did things change", "was there a step at this known event".
|
|
5
|
+
This one answers the *predictive* one: where is the trajectory
|
|
6
|
+
heading, and what's the uncertainty around that?
|
|
7
|
+
|
|
8
|
+
Method: state-space exponential smoothing (Hyndman et al. 2008) via
|
|
9
|
+
``statsmodels.tsa.exponential_smoothing.ets.ETSModel``. Default
|
|
10
|
+
``method="auto"`` selects ETS for series of length ≥ 8 and falls
|
|
11
|
+
back to a Holt linear-trend model for shorter histories where ETS
|
|
12
|
+
overfits. Rates are forecast on the logit scale and back-transformed
|
|
13
|
+
so prediction intervals are pinned to ``[0, 1]`` instead of admitting
|
|
14
|
+
nonsensical negative frequencies.
|
|
15
|
+
|
|
16
|
+
Prediction intervals are the analytical PIs that come out of the
|
|
17
|
+
state-space fit at the requested ``level``; for short series they
|
|
18
|
+
will be appropriately wide (the Holt fallback uses a residual-based approximation instead). Honest uncertainty beats false precision.
|
|
19
|
+
|
|
20
|
+
Reference
|
|
21
|
+
---------
|
|
22
|
+
Hyndman, R. J., Koehler, A. B., Ord, J. K., & Snyder, R. D. (2008).
|
|
23
|
+
*Forecasting with Exponential Smoothing: The State Space Approach*.
|
|
24
|
+
Springer.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
from collections.abc import Sequence
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
33
|
+
|
|
34
|
+
import numpy as np
|
|
35
|
+
import pandas as pd
|
|
36
|
+
|
|
37
|
+
from ..results import _table_to_html, _table_to_json
|
|
38
|
+
|
|
39
|
+
if TYPE_CHECKING:
|
|
40
|
+
import altair as alt
|
|
41
|
+
|
|
42
|
+
ForecastMethod = Literal["auto", "ets", "holt"]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True)
class ForecastResult:
    """Forward prediction of a :class:`TemporalTrajectory`.

    Carries both the original history (so :meth:`plot` can render the
    solid-then-dashed continuation in a single chart) and the forecast
    table with point estimates and prediction intervals.

    Attributes
    ----------
    history
        Observed rows. :meth:`to_combined` reads the ``period``,
        ``term`` and ``relfreq`` columns, plus ``ci_lower`` /
        ``ci_upper`` when present.
    forecast
        Forecast rows with ``period``, ``term``, ``point``,
        ``ci_lower`` and ``ci_upper`` columns.
    targets, freq, horizon, level, method
        Settings echoed by :meth:`summary`.
    params
        Free-form extra parameters; not read by any method here.
    """

    history: pd.DataFrame
    forecast: pd.DataFrame
    targets: list[str]
    freq: str
    horizon: int
    level: float
    method: str
    params: dict[str, Any] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Return just the forecast table (point + PI per term × period)."""
        return self.forecast.copy()

    def to_combined(self) -> pd.DataFrame:
        """Return history + forecast stacked, with a ``kind`` column
        distinguishing the two.

        The history's ``relfreq`` column is renamed to ``point`` so both
        halves share one schema. If the history carries no ``ci_lower``
        / ``ci_upper`` columns they are filled with NaN rather than
        raising, since only ``period``, ``term`` and ``relfreq`` are
        needed to produce a forecast in the first place.
        """
        columns = ["period", "term", "point", "ci_lower", "ci_upper", "kind"]

        # ``rename`` returns a new frame, so ``self.history`` is never mutated.
        observed = self.history.rename(columns={"relfreq": "point"})
        for bound in ("ci_lower", "ci_upper"):
            if bound not in observed.columns:
                observed[bound] = np.nan
        observed["kind"] = "observed"

        predicted = self.forecast.copy()
        predicted["kind"] = "forecast"

        return pd.concat(
            [observed[columns], predicted[columns]], ignore_index=True
        )

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the forecast table as HTML."""
        return _table_to_html(self.forecast, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the forecast table as JSON."""
        return _table_to_json(self.forecast, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Layered altair chart: solid observed + dashed forecast.

        The history portion shows the existing Wilson CI band; the
        forecast portion shows the prediction-interval band at the
        chosen ``level``.
        """
        # Imported lazily so the plotting stack is only needed when plotting.
        from ..viz.forecast import forecast_plot

        return forecast_plot(
            history=self.history, forecast=self.forecast, **kw
        )

    def summary(self) -> str:
        """Return a one-line description of the forecast settings."""
        return (
            f"ForecastResult(targets={self.targets!r}, freq={self.freq!r}, "
            f"horizon={self.horizon}, level={self.level}, method={self.method})"
        )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def forecast_trajectory(
    trajectory_table: pd.DataFrame,
    *,
    targets: Sequence[str] | None = None,
    horizon: int = 4,
    level: float = 0.95,
    method: ForecastMethod = "auto",
    logit_transform: bool = True,
) -> pd.DataFrame:
    """Forecast every (or selected) target term's trajectory ``horizon``
    periods forward.

    Parameters
    ----------
    trajectory_table
        The DataFrame attached to a :class:`TemporalTrajectory` —
        must carry ``period``, ``term``, ``relfreq`` columns at minimum.
    targets
        Which terms to forecast. ``None`` (default) forecasts every
        term present in the table.
    horizon
        Number of periods to extrapolate.
    level
        Prediction-interval level. ``0.95`` → 95% PI.
    method
        ``"auto"`` (default) picks ETS for series of length ≥ 8, Holt
        otherwise. ``"ets"`` or ``"holt"`` force a specific method.
    logit_transform
        If ``True`` (default), forecast on the logit scale and
        back-transform so the prediction intervals stay in ``[0, 1]``.
        Disable only if forecasting something other than a rate.

    Returns
    -------
    pandas.DataFrame
        Columns: ``period``, ``term``, ``point``, ``ci_lower``,
        ``ci_upper``. One row per (target, future period). Period
        values match the freq of the input trajectory. When there is
        nothing to forecast (empty table or empty ``targets``) an empty
        frame with these columns is returned.

    Raises
    ------
    ValueError
        If ``horizon < 1``, ``level`` is outside ``(0, 1)``, a
        requested target is absent from the table, or a target has
        fewer than 4 non-missing observations.
    """
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1; got {horizon}")
    if not 0 < level < 1:
        raise ValueError(f"level must be in (0, 1); got {level}")

    # Computed once; used for the default target list and for validation.
    known_terms = trajectory_table["term"].unique()
    if targets is None:
        target_list = list(known_terms)
    else:
        target_list = list(targets)
        missing = set(target_list) - set(known_terms)
        if missing:
            raise ValueError(
                f"unknown targets: {sorted(missing)!r}; "
                f"trajectory carries {sorted(known_terms)!r}"
            )

    # Nothing to forecast: return a well-formed empty table instead of
    # letting ``pd.concat([])`` fail with "No objects to concatenate".
    if not target_list:
        return pd.DataFrame(
            columns=["period", "term", "point", "ci_lower", "ci_upper"]
        )

    rows: list[pd.DataFrame] = []
    for term in target_list:
        sub = (
            trajectory_table[trajectory_table["term"] == term]
            .sort_values("period")
            .set_index("period")["relfreq"]
            .dropna()  # periods with an undefined rate carry no information
        )
        if len(sub) < 4:
            raise ValueError(
                f"need at least 4 observations to forecast {term!r}; "
                f"got {len(sub)}"
            )
        fc = _forecast_one(
            sub,
            horizon=horizon,
            level=level,
            method=method,
            logit_transform=logit_transform,
        )
        fc.insert(1, "term", term)
        rows.append(fc)

    return pd.concat(rows, ignore_index=True)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def forecast_semantic_drift(
    trajectory_df: pd.DataFrame,
    *,
    targets: Sequence[str] | None = None,
    horizon: int = 4,
    level: float = 0.95,
    method: ForecastMethod = "auto",
) -> pd.DataFrame:
    """Forecast a :func:`pycorpdiff.semantic_trajectory` output forward.

    Operates on the ``distance_from_baseline`` column — the cosine
    displacement of each per-period contextual centroid from the
    baseline period. Same state-space machinery as
    :func:`forecast_trajectory`, but the logit transform is *off*
    (cosine distance lives in roughly ``[0, 2]``, not ``[0, 1]``) and
    the point forecast and both interval bounds are clipped to be
    non-negative since negative cosine *distance* is nonsensical.

    Parameters
    ----------
    trajectory_df
        The DataFrame returned by :func:`pycorpdiff.semantic_trajectory`
        — must carry ``period``, ``target``, ``distance_from_baseline``
        columns.
    targets
        Restrict to a subset of targets. ``None`` forecasts every
        target in the table.
    horizon
        Number of periods to extrapolate.
    level
        Prediction-interval level. ``0.95`` → 95% PI.
    method
        ``"auto"`` (default), ``"ets"``, or ``"holt"``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``period``, ``target``, ``point``, ``ci_lower``,
        ``ci_upper``. One row per (target, future period). When there
        is nothing to forecast (empty table or empty ``targets``) an
        empty frame with these columns is returned.

    Raises
    ------
    ValueError
        If ``horizon < 1``, ``level`` is outside ``(0, 1)``, a required
        column is missing, a requested target is absent from the table,
        or a target has fewer than 4 non-missing observations.
    """
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1; got {horizon}")
    if not 0 < level < 1:
        raise ValueError(f"level must be in (0, 1); got {level}")

    required_cols = {"period", "target", "distance_from_baseline"}
    missing = required_cols - set(trajectory_df.columns)
    if missing:
        raise ValueError(
            f"trajectory_df is missing required columns: {sorted(missing)!r}"
        )

    # Computed once; used for the default target list and for validation.
    known_targets = trajectory_df["target"].unique()
    if targets is None:
        target_list = list(known_targets)
    else:
        target_list = list(targets)
        unknown = set(target_list) - set(known_targets)
        if unknown:
            raise ValueError(
                f"unknown targets: {sorted(unknown)!r}; "
                f"trajectory carries {sorted(known_targets)!r}"
            )

    # Nothing to forecast: return a well-formed empty table instead of
    # letting ``pd.concat([])`` fail with "No objects to concatenate".
    if not target_list:
        return pd.DataFrame(
            columns=["period", "target", "point", "ci_lower", "ci_upper"]
        )

    rows: list[pd.DataFrame] = []
    for tgt in target_list:
        sub = (
            trajectory_df[trajectory_df["target"] == tgt]
            .sort_values("period")
            .set_index("period")["distance_from_baseline"]
            .dropna()
        )
        if len(sub) < 4:
            raise ValueError(
                f"need at least 4 observations to forecast {tgt!r}; "
                f"got {len(sub)}"
            )
        fc = _forecast_one(
            sub,
            horizon=horizon,
            level=level,
            method=method,
            logit_transform=False,
        )
        # Cosine distance is non-negative. Clip all three columns: if only
        # ``point`` and ``ci_lower`` were floored, a negative extrapolation
        # could leave ``ci_upper`` below the (clipped) point estimate.
        for column in ("point", "ci_lower", "ci_upper"):
            fc[column] = fc[column].clip(lower=0.0)
        fc.insert(1, "target", tgt)
        rows.append(fc)

    return pd.concat(rows, ignore_index=True)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _forecast_one(
    series: pd.Series,
    *,
    horizon: int,
    level: float,
    method: ForecastMethod,
    logit_transform: bool,
) -> pd.DataFrame:
    """Forecast a single univariate series ``horizon`` steps ahead.

    Parameters
    ----------
    series
        Observations in chronological order, indexed by a
        ``PeriodIndex`` or by timestamps.
    horizon
        Number of future steps to produce.
    level
        Prediction-interval level, e.g. ``0.95``.
    method
        ``"ets"``, ``"holt"``, or ``"auto"`` (ETS when the series has at
        least 8 points, Holt otherwise).
    logit_transform
        When ``True`` the values are clipped to ``[1e-6, 1 - 1e-6]``,
        modelled on the logit scale and mapped back, which keeps the
        output inside ``(0, 1)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``period``, ``point``, ``ci_lower``, ``ci_upper``. For a
        ``PeriodIndex`` input the future periods continue that index;
        otherwise future timestamps are spaced by the index's inferred
        frequency, falling back to daily when none can be inferred.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    ImportError
        If statsmodels is not installed.
    """
    # Reject a bad ``method`` before importing the optional dependency
    # or fitting anything.
    if method not in ("auto", "ets", "holt"):
        raise ValueError(
            f"unknown method={method!r}; expected 'auto', 'ets', or 'holt'"
        )

    try:
        from statsmodels.tsa.exponential_smoothing.ets import ETSModel
        from statsmodels.tsa.holtwinters import Holt
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "forecast() requires statsmodels. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    # statsmodels prefers a DatetimeIndex with a known freq for ETS —
    # convert from PeriodIndex and attach the inferred frequency so the
    # state-space model can extrapolate cleanly.
    ts_series = series.copy()
    if isinstance(series.index, pd.PeriodIndex):
        period_freq = series.index.freq
        ts_index = series.index.to_timestamp()
        inferred = getattr(ts_index, "inferred_freq", None)
        ts_series.index = (
            pd.DatetimeIndex(ts_index, freq=inferred) if inferred else ts_index
        )
    else:
        period_freq = None

    if logit_transform:
        # Keep values strictly inside (0, 1) so the logit stays finite.
        eps = 1e-6
        y_arr = np.clip(ts_series.to_numpy(dtype=float), eps, 1.0 - eps)
        z = pd.Series(
            np.log(y_arr / (1.0 - y_arr)),
            index=ts_series.index,
            name=ts_series.name,
        )
    else:
        z = ts_series.astype(float)

    chosen = method
    if chosen == "auto":
        chosen = "ets" if len(z) >= 8 else "holt"

    alpha = 1.0 - level

    if chosen == "ets":
        model = ETSModel(
            z, error="add", trend="add", seasonal=None
        ).fit(disp=False)
        # Out-of-sample positions len(z) .. len(z) + horizon - 1.
        pred = model.get_prediction(start=len(z), end=len(z) + horizon - 1)
        summary = pred.summary_frame(alpha=alpha)
        z_point = summary["mean"].to_numpy(dtype=float)
        z_lower = summary["pi_lower"].to_numpy(dtype=float)
        z_upper = summary["pi_upper"].to_numpy(dtype=float)
    else:  # "holt" — the only remaining value after validation above
        holt = Holt(z, initialization_method="estimated").fit()
        z_point = np.asarray(holt.forecast(steps=horizon), dtype=float)
        # No interval is taken from the Holt fit itself. Approximate one
        # from the in-sample residuals: a normal band whose half-width is
        # the residual standard deviation scaled by sqrt(h), so it widens
        # with the forecast step h.
        resid_std = float(np.std(holt.resid, ddof=1))
        from scipy.stats import norm

        crit = float(norm.ppf(1.0 - alpha / 2.0))
        h_steps = np.arange(1, horizon + 1, dtype=float)
        widening = crit * resid_std * np.sqrt(h_steps)
        z_lower = z_point - widening
        z_upper = z_point + widening

    if logit_transform:
        point = _inv_logit(z_point)
        lower = _inv_logit(z_lower)
        upper = _inv_logit(z_upper)
    else:
        point = z_point
        lower = z_lower
        upper = z_upper

    # Future periods — match original index type. ``period_freq`` is only
    # set when the input index was a PeriodIndex.
    if period_freq is not None:
        last_period = series.index[-1]
        future_periods = pd.period_range(
            start=last_period + 1, periods=horizon, freq=period_freq
        )
        future_idx: pd.Index = pd.Index(future_periods)
    else:
        last_ts = series.index[-1]
        # Daily spacing is the fallback when no frequency can be inferred.
        inferred = getattr(series.index, "inferred_freq", None) or "D"
        offset = pd.tseries.frequencies.to_offset(inferred)
        future_idx = pd.date_range(
            start=last_ts + offset, periods=horizon, freq=offset
        )

    return pd.DataFrame(
        {
            "period": future_idx,
            "point": point,
            "ci_lower": lower,
            "ci_upper": upper,
        }
    )
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _inv_logit(x: np.ndarray) -> np.ndarray:
    """Numerically-stable inverse logit, ``1 / (1 + exp(-x))``.

    ``np.where`` evaluates both of its value arguments for every
    element, so each branch must be safe on its own. Both are therefore
    written in terms of ``exp(-|x|)``, which lies in ``(0, 1]`` and can
    never overflow.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    out: np.ndarray = np.where(
        x >= 0,
        1.0 / (1.0 + decay),  # x >= 0: decay == exp(-x)
        decay / (1.0 + decay),  # x < 0: decay == exp(x)
    )
    return out
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Interrupted time-series analysis around a known event date.
|
|
2
|
+
|
|
3
|
+
Implements the standard segmented-regression specification:
|
|
4
|
+
|
|
5
|
+
y_t = β₀ + β₁·t + β₂·post_t + β₃·(t − t_event)·post_t + ε
|
|
6
|
+
|
|
7
|
+
where ``post_t`` is 1 when ``t ≥ t_event`` and 0 otherwise. The
|
|
8
|
+
coefficients of interest:
|
|
9
|
+
|
|
10
|
+
- ``β₂`` (level change): the immediate step at the intervention.
|
|
11
|
+
- ``β₃`` (slope change): how the post-period trend differs from
|
|
12
|
+
the pre-period trend.
|
|
13
|
+
|
|
14
|
+
Reference
|
|
15
|
+
---------
|
|
16
|
+
Wagner, A. K., Soumerai, S. B., Zhang, F., & Ross-Degnan, D. (2002).
|
|
17
|
+
Segmented regression analysis of interrupted time series studies in
|
|
18
|
+
medication use research. *Journal of Clinical Pharmacy and
|
|
19
|
+
Therapeutics*, 27(4), 299-309.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
import pandas as pd
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def interrupted_time_series(
    series: pd.Series,
    event_date: str | pd.Period | pd.Timestamp,
) -> pd.DataFrame:
    """Fit a segmented-regression ITS model around ``event_date``.

    Parameters
    ----------
    series
        A series indexed by period (or datetime). Index values must
        compare against ``event_date`` to produce the pre/post split.
        The rows are used in the order given, so the index is expected
        to be in chronological order.
    event_date
        Where to place the intervention. Anything pandas can compare to
        the series index — a string, a :class:`pandas.Period`, a
        :class:`pandas.Timestamp`.

    Returns
    -------
    pandas.DataFrame
        Four rows (one per coefficient), columns ``term``, ``coef``,
        ``std_err``, ``t``, ``p_value``, ``ci_lower``, ``ci_upper``.
        ``term`` values: ``"intercept"``, ``"time"``, ``"level_change"``,
        ``"slope_change"``.

    Raises
    ------
    ValueError
        If the series has fewer than 4 observations, contains NaN, or
        ``event_date`` leaves the pre- or post-period empty.
    ImportError
        If statsmodels is not installed.

    Warns
    -----
    RuntimeWarning
        If the four-column design is rank-deficient or leaves no
        residual degrees of freedom (for example exactly 4
        observations, or a single pre- or post-event observation). The
        fit is still returned, but its standard errors, p-values and
        intervals are not meaningful.
    """
    # Cheap input checks first, so bad input is reported even when the
    # optional dependency is missing.
    if len(series) < 4:
        raise ValueError(f"need at least 4 observations; got {len(series)}")
    if series.isna().any():
        raise ValueError("series contains NaN values; impute or drop them first")

    try:
        import statsmodels.api as sm
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "interrupted_time_series requires statsmodels. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    y = series.to_numpy(dtype=float)
    # Use a 0-based time index — coefficients are then interpretable as
    # "per period" effects without anchoring to a Unix epoch.
    t = np.arange(len(series), dtype=float)

    event_norm = _normalise_event(event_date, series)
    # 1 for every observation at or after the event, 0 before it.
    post_mask = np.array(
        [_ge(period, event_norm) for period in series.index], dtype=int
    )
    n_post = int(post_mask.sum())
    if n_post == 0:
        raise ValueError(f"event_date={event_date!r} is after the last period")
    if n_post == len(series):
        raise ValueError(f"event_date={event_date!r} is before the first period")

    # Position of the first post-event observation; the slope-change
    # regressor counts periods elapsed since then and is 0 beforehand.
    event_t = float(t[post_mask.astype(bool)][0])
    time_after = (t - event_t) * post_mask

    x = np.column_stack([np.ones_like(t), t, post_mask.astype(float), time_after])

    n_obs, n_params = x.shape
    rank = int(np.linalg.matrix_rank(x))
    if rank < n_params or n_obs <= n_params:
        import warnings

        warnings.warn(
            "interrupted_time_series: the segmented-regression design is "
            f"under-identified (rank {rank} of {n_params} columns, "
            f"{n_obs} observations); standard errors, p-values and "
            "confidence intervals are unreliable",
            RuntimeWarning,
            stacklevel=2,
        )

    model = sm.OLS(y, x).fit()

    conf_int = model.conf_int()
    terms = ["intercept", "time", "level_change", "slope_change"]
    return pd.DataFrame(
        {
            "term": terms,
            "coef": model.params,
            "std_err": model.bse,
            "t": model.tvalues,
            "p_value": model.pvalues,
            "ci_lower": conf_int[:, 0],
            "ci_upper": conf_int[:, 1],
        }
    )
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _normalise_event(
    event: str | pd.Period | pd.Timestamp, series: pd.Series
) -> pd.Period | pd.Timestamp:
    """Coerce ``event`` to the same type as the series index for comparison.

    The pandas type hierarchy here is a bit fiddly: ``Period`` and
    ``Timestamp`` aren't directly comparable, and ``Period(event,
    freq=BaseOffset)`` isn't accepted by mypy even though it works at
    runtime, so frequencies are passed around as strings.

    - Period index: a ``Period`` event with the index's frequency is
      returned unchanged; one with a different frequency is converted
      to the index's frequency, anchored at the start of the event
      period; anything else is parsed from its string form.
    - Any other index: a ``Period`` event becomes the timestamp at
      which that period starts; anything else goes through
      :class:`pandas.Timestamp`.
    """
    sample = series.index[0]
    if isinstance(sample, pd.Period):
        freqstr: str = str(sample.freqstr)
        if isinstance(event, pd.Period):
            if event.freq == sample.freq:
                return event
            # Periods of different frequency cannot be compared, so
            # re-express the event at the index's frequency.
            return event.asfreq(freqstr, how="S")
        return pd.Period(str(event), freq=freqstr)
    if isinstance(event, pd.Period):
        # ``start_time`` works for every frequency, whereas the string form
        # of e.g. a weekly period ("2020-03-02/2020-03-08") is not a
        # parseable timestamp.
        return event.start_time
    return pd.Timestamp(event)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _ge(period_index_value: object, event: object) -> bool:
    """Return ``True`` when ``period_index_value >= event``.

    Thin wrapper that delegates to the operands' own ``>=`` and
    collapses the outcome to a plain ``bool``.
    """
    at_or_after = period_index_value >= event  # type: ignore[operator]
    return bool(at_or_after)
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Temporal slicing primitives — ``TemporalCorpus``, ``track``, ``Tracker``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterator
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from ..corpus import Corpus, CorpusSlice
|
|
12
|
+
from ..results import TemporalTrajectory
|
|
13
|
+
from ..stats import wilson_ci
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class TemporalCorpus:
    """A corpus indexed by time period for diachronic analysis.

    Constructed via :meth:`pycorpdiff.Corpus.by_time`; bucketing of the
    parent corpus's ``time_col`` follows the pandas offset alias
    ``freq``. Periods with no documents are skipped — there's no
    silent-zero entry in :meth:`periods` or :meth:`iter_slices`.
    Documents whose ``time_col`` value is missing belong to no period
    and are likewise left out of both.

    Attributes
    ----------
    parent
        The corpus being bucketed; its ``docs`` frame supplies
        ``time_col``.
    time_col
        Name of the column in ``parent.docs`` holding the document time.
    freq
        Pandas period frequency alias used for bucketing.
    """

    parent: Corpus
    time_col: str
    freq: str = "Y"

    def __len__(self) -> int:
        """Return the length of the parent corpus."""
        return len(self.parent)

    def _period_series(self) -> pd.Series:
        """Per-document Period values, indexed like the parent's docs frame."""
        times = pd.to_datetime(self.parent.docs[self.time_col])
        return times.dt.to_period(self.freq)

    def periods(self) -> list[pd.Period]:
        """Sorted list of populated periods.

        Missing timestamps (``NaT``) are dropped first: they are not a
        period, and they do not order against real periods.
        """
        return sorted(self._period_series().dropna().unique())

    def slice(self, period: pd.Period | str) -> CorpusSlice:
        """Return the :class:`CorpusSlice` for one period.

        ``period`` may be a :class:`pandas.Period` or any string pandas
        can parse to one (e.g. ``"2020"``, ``"2020Q1"``, ``"2020-03"``).
        Strings are parsed at this corpus's ``freq``.
        """
        idx = self._period_series()
        period_obj = (
            pd.Period(period, freq=self.freq) if isinstance(period, str) else period
        )
        # Boolean mask over the parent's documents, aligned on its index.
        mask = pd.Series(idx.values == period_obj, index=self.parent.docs.index)
        return CorpusSlice(
            parent=self.parent,
            mask=mask,
            filters={"period": str(period_obj)},
        )

    def iter_slices(self) -> Iterator[tuple[pd.Period, CorpusSlice]]:
        """Yield ``(period, CorpusSlice)`` pairs in chronological order."""
        # Bucket the documents once and reuse the result for both the
        # period list and every mask.
        idx = self._period_series()
        for period in sorted(idx.dropna().unique()):
            mask = pd.Series(idx.values == period, index=self.parent.docs.index)
            yield period, CorpusSlice(
                parent=self.parent, mask=mask, filters={"period": str(period)}
            )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass(frozen=True)
class Tracker:
    """A diachronic tracker over one or more target terms.

    Attributes
    ----------
    corpus
        The corpus (or slice of one) to analyse.
    targets
        The terms to follow through time.
    """

    corpus: Corpus | CorpusSlice
    targets: list[str]

    def over_time(
        self,
        freq: str = "Y",
        time_col: str = "date",
        confidence: float = 0.95,
    ) -> TemporalTrajectory:
        """Return a :class:`TemporalTrajectory` of relative frequencies.

        For every populated period × target, computes the raw count,
        period token total, relative frequency, and a Wilson score
        interval at ``confidence`` (default 95%). The output frame has
        one row per (period, term) pair, sorted by term then period.
        When there are no populated periods or no targets the table is
        empty but still carries all seven columns.
        """
        columns = [
            "period",
            "term",
            "count",
            "total",
            "relfreq",
            "ci_lower",
            "ci_upper",
        ]
        temporal = self.corpus.by_time(time_col, freq)
        rows: list[dict[str, object]] = []
        for period, slice_ in temporal.iter_slices():
            tokens_per_doc = slice_.tokens()
            all_tokens: list[str] = [tok for doc in tokens_per_doc for tok in doc]
            total = len(all_tokens)
            counter = pd.Series(all_tokens).value_counts() if total else pd.Series(dtype=int)
            for target in self.targets:
                count = int(counter.get(target, 0))
                # A period without tokens has no defined rate.
                relfreq = (count / total) if total > 0 else float("nan")
                lo, hi = wilson_ci(
                    np.array([count], dtype=np.int64),
                    np.array([total], dtype=np.int64),
                    confidence=confidence,
                )
                rows.append(
                    {
                        "period": period,
                        "term": target,
                        "count": count,
                        "total": total,
                        "relfreq": relfreq,
                        "ci_lower": float(lo[0]),
                        "ci_upper": float(hi[0]),
                    }
                )
        # Naming the columns explicitly keeps the frame well-formed when
        # ``rows`` is empty; otherwise ``sort_values`` would fail on the
        # missing "term" / "period" columns.
        table = (
            pd.DataFrame(rows, columns=columns)
            .sort_values(["term", "period"], kind="stable")
            .reset_index(drop=True)
        )
        return TemporalTrajectory(table=table, targets=list(self.targets), freq=freq)

    def trajectory(
        self,
        freq: str = "Y",
        time_col: str = "date",
        confidence: float = 0.95,
    ) -> TemporalTrajectory:
        """Alias for :meth:`over_time`."""
        return self.over_time(freq=freq, time_col=time_col, confidence=confidence)

    def semantic_over_time(
        self,
        freq: str = "Y",
        time_col: str = "date",
        embedder: object | None = None,
        window: int = 5,
        baseline_period: str | None = None,
    ) -> pd.DataFrame:
        """Track each target's *contextual centroid* across time periods.

        Where :meth:`over_time` returns relative frequencies, this
        returns a semantic trajectory: per-period averaged contextual
        embeddings with cosine distance to a baseline period. With
        SBERT this surfaces meaning shifts that pure frequency
        analysis misses.

        A single target is passed on as a bare string, several targets
        as a list.

        See :func:`pycorpdiff.semantic.semantic_trajectory` for the
        full parameter docs.
        """
        from ..semantic.shift import (  # noqa: F401 — keeps the import side-effect close to the use
            semantic_shift,
        )
        from ..semantic.trajectory import semantic_trajectory

        return semantic_trajectory(
            self.corpus,
            target=self.targets if len(self.targets) > 1 else self.targets[0],
            time_col=time_col,
            freq=freq,
            embedder=embedder,  # type: ignore[arg-type]
            window=window,
            baseline_period=baseline_period,
        )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def track(
    corpus: Corpus | CorpusSlice, target: str | list[str]
) -> Tracker:
    """Build a :class:`Tracker` that follows ``target`` through ``corpus``.

    ``corpus`` may be a full :class:`Corpus` or a :class:`CorpusSlice`,
    which is what lets
    ``pcd.track(corpus.slice(topic="immigration"), "criminal")`` work
    directly. ``target`` is either one term or a list of terms; a single
    string is wrapped in a one-element list, and a list is copied so the
    tracker does not share the caller's object.
    """
    if isinstance(target, str):
        wanted = [target]
    else:
        wanted = list(target)
    return Tracker(corpus=corpus, targets=wanted)
|