pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,405 @@
1
+ """Forward prediction for :class:`TemporalTrajectory`.
2
+
3
+ The other temporal modules answer *retrospective* questions —
4
+ "when did things change", "was there a step at this known event".
5
+ This one answers the *predictive* one: where is the trajectory
6
+ heading, and what's the uncertainty around that?
7
+
8
+ Method: state-space exponential smoothing (Hyndman et al. 2008) via
9
+ ``statsmodels.tsa.exponential_smoothing.ets.ETSModel``. Default
10
+ ``method="auto"`` selects ETS for series of length ≥ 8 and falls
11
+ back to a Holt linear-trend model for shorter histories where ETS
12
+ overfits. Rates are forecast on the logit scale and back-transformed
13
+ so prediction intervals are pinned to ``[0, 1]`` instead of admitting
14
+ nonsensical negative frequencies.
15
+
16
+ Prediction intervals are the analytical PIs that come out of the
17
+ state-space fit at the requested ``level``; for short series they
18
+ will be appropriately wide. Honest uncertainty beats false precision.
19
+
20
+ Reference
21
+ ---------
22
+ Hyndman, R. J., Koehler, A. B., Ord, J. K., & Snyder, R. D. (2008).
23
+ *Forecasting with Exponential Smoothing: The State Space Approach*.
24
+ Springer.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ from collections.abc import Sequence
30
+ from dataclasses import dataclass, field
31
+ from pathlib import Path
32
+ from typing import TYPE_CHECKING, Any, Literal
33
+
34
+ import numpy as np
35
+ import pandas as pd
36
+
37
+ from ..results import _table_to_html, _table_to_json
38
+
39
+ if TYPE_CHECKING:
40
+ import altair as alt
41
+
42
+ ForecastMethod = Literal["auto", "ets", "holt"]
43
+
44
+
45
@dataclass(frozen=True)
class ForecastResult:
    """Observed trajectory bundled with its forward forecast.

    ``history`` holds the observed rows and ``forecast`` the predicted
    rows (point estimate plus interval bounds). Keeping both on one
    object lets :meth:`plot` hand the two tables to the chart builder
    together.
    """

    history: pd.DataFrame
    forecast: pd.DataFrame
    targets: list[str]
    freq: str
    horizon: int
    level: float
    method: str
    params: dict[str, Any] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the forecast table (history excluded)."""
        table = self.forecast
        return table.copy()

    def to_combined(self) -> pd.DataFrame:
        """Stack history and forecast rows into a single frame.

        The history's ``relfreq`` column is renamed to ``point`` so both
        halves share one schema, and a ``kind`` column marks each row as
        ``"observed"`` or ``"forecast"``. Neither stored table is
        mutated.
        """
        wanted = ["period", "term", "point", "ci_lower", "ci_upper", "kind"]
        observed = self.history.assign(kind="observed").rename(
            columns={"relfreq": "point"}
        )[wanted]
        predicted = self.forecast.assign(kind="forecast")
        # Select the forecast columns in the same order as the history half.
        return pd.concat(
            [observed, predicted[observed.columns]], ignore_index=True
        )

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the forecast table as HTML via ``_table_to_html``."""
        return _table_to_html(self.forecast, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the forecast table as JSON via ``_table_to_json``."""
        return _table_to_json(self.forecast, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Build the layered history + forecast chart.

        Delegates to ``viz.forecast.forecast_plot``; extra keyword
        arguments are forwarded unchanged.
        """
        # Imported lazily so the plotting stack is only needed when plotting.
        from ..viz.forecast import forecast_plot

        return forecast_plot(
            history=self.history, forecast=self.forecast, **kw
        )

    def summary(self) -> str:
        """Return a one-line textual description of the forecast settings."""
        parts = (
            f"targets={self.targets!r}",
            f"freq={self.freq!r}",
            f"horizon={self.horizon}",
            f"level={self.level}",
            f"method={self.method}",
        )
        return "ForecastResult(" + ", ".join(parts) + ")"
105
+
106
+
107
def forecast_trajectory(
    trajectory_table: pd.DataFrame,
    *,
    targets: Sequence[str] | None = None,
    horizon: int = 4,
    level: float = 0.95,
    method: ForecastMethod = "auto",
    logit_transform: bool = True,
) -> pd.DataFrame:
    """Forecast every (or selected) target term ``horizon`` periods forward.

    Parameters
    ----------
    trajectory_table
        Long-format table with at least ``period``, ``term`` and
        ``relfreq`` columns.
    targets
        Which terms to forecast. ``None`` (default) forecasts every
        term present in the table.
    horizon
        Number of periods to extrapolate; must be >= 1.
    level
        Prediction-interval level, strictly between 0 and 1
        (``0.95`` → 95% PI).
    method
        ``"auto"`` (default), ``"ets"`` or ``"holt"``; forwarded to
        ``_forecast_one``.
    logit_transform
        Forwarded to ``_forecast_one``. If ``True`` (default) the
        series is forecast on the logit scale and back-transformed.

    Returns
    -------
    pandas.DataFrame
        Columns: ``period``, ``term``, ``point``, ``ci_lower``,
        ``ci_upper``. One row per (target, future period).

    Raises
    ------
    ValueError
        If ``horizon`` or ``level`` is out of range, a requested target
        is not in the table, there is nothing to forecast (empty table
        or empty ``targets``), or a term has fewer than 4 non-missing
        observations.
    """
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1; got {horizon}")
    if not 0 < level < 1:
        raise ValueError(f"level must be in (0, 1); got {level}")

    # Computed once; reused for defaulting, validation and the error text.
    known_terms = trajectory_table["term"].unique()
    if targets is None:
        target_list = list(known_terms)
    else:
        target_list = list(targets)
        missing = set(target_list) - set(known_terms)
        if missing:
            raise ValueError(
                f"unknown targets: {sorted(missing)!r}; "
                f"trajectory carries {sorted(known_terms)!r}"
            )

    # Fail early with a clear message; otherwise pd.concat([]) below would
    # raise the opaque "No objects to concatenate".
    if not target_list:
        raise ValueError(
            "no targets to forecast; pass at least one term or a "
            "non-empty trajectory table"
        )

    frames: list[pd.DataFrame] = []
    for term in target_list:
        # One chronologically ordered series per term, missing rates dropped.
        observed = (
            trajectory_table[trajectory_table["term"] == term]
            .sort_values("period")
            .set_index("period")["relfreq"]
            .dropna()
        )
        if len(observed) < 4:
            raise ValueError(
                f"need at least 4 observations to forecast {term!r}; "
                f"got {len(observed)}"
            )
        fc = _forecast_one(
            observed,
            horizon=horizon,
            level=level,
            method=method,
            logit_transform=logit_transform,
        )
        # Place the term label right after ``period``.
        fc.insert(1, "term", term)
        frames.append(fc)

    return pd.concat(frames, ignore_index=True)
187
+
188
+
189
def forecast_semantic_drift(
    trajectory_df: pd.DataFrame,
    *,
    targets: Sequence[str] | None = None,
    horizon: int = 4,
    level: float = 0.95,
    method: ForecastMethod = "auto",
) -> pd.DataFrame:
    """Forecast the ``distance_from_baseline`` column forward per target.

    Uses the same per-series machinery as :func:`forecast_trajectory`
    (``_forecast_one``) but always with the logit transform disabled,
    and clips the point estimate and both interval bounds at zero
    because a negative distance is nonsensical.

    Parameters
    ----------
    trajectory_df
        Long-format table that must carry ``period``, ``target`` and
        ``distance_from_baseline`` columns.
    targets
        Restrict to a subset of targets. ``None`` forecasts every
        target in the table.
    horizon
        Number of periods to extrapolate; must be >= 1.
    level
        Prediction-interval level, strictly between 0 and 1.
    method
        ``"auto"`` (default), ``"ets"``, or ``"holt"``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``period``, ``target``, ``point``, ``ci_lower``,
        ``ci_upper``. One row per (target, future period).

    Raises
    ------
    ValueError
        If ``horizon`` or ``level`` is out of range, a required column
        is missing, a requested target is unknown, there is nothing to
        forecast, or a target has fewer than 4 non-missing observations.
    """
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1; got {horizon}")
    if not 0 < level < 1:
        raise ValueError(f"level must be in (0, 1); got {level}")

    required_cols = {"period", "target", "distance_from_baseline"}
    missing = required_cols - set(trajectory_df.columns)
    if missing:
        raise ValueError(
            f"trajectory_df is missing required columns: {sorted(missing)!r}"
        )

    # Computed once; reused for defaulting, validation and the error text.
    known_targets = trajectory_df["target"].unique()
    if targets is None:
        target_list = list(known_targets)
    else:
        target_list = list(targets)
        unknown = set(target_list) - set(known_targets)
        if unknown:
            raise ValueError(
                f"unknown targets: {sorted(unknown)!r}; "
                f"trajectory carries {sorted(known_targets)!r}"
            )

    # Fail early with a clear message; otherwise pd.concat([]) below would
    # raise the opaque "No objects to concatenate".
    if not target_list:
        raise ValueError(
            "no targets to forecast; pass at least one target or a "
            "non-empty trajectory table"
        )

    frames: list[pd.DataFrame] = []
    for tgt in target_list:
        observed = (
            trajectory_df[trajectory_df["target"] == tgt]
            .sort_values("period")
            .set_index("period")["distance_from_baseline"]
            .dropna()
        )
        if len(observed) < 4:
            raise ValueError(
                f"need at least 4 observations to forecast {tgt!r}; "
                f"got {len(observed)}"
            )
        fc = _forecast_one(
            observed,
            horizon=horizon,
            level=level,
            method=method,
            logit_transform=False,
        )
        # Clip all three columns at 0. Leaving ``ci_upper`` unclipped could
        # put it below the clipped point/lower bound on a negative forecast.
        for col in ("point", "ci_lower", "ci_upper"):
            fc[col] = fc[col].clip(lower=0.0)
        fc.insert(1, "target", tgt)
        frames.append(fc)

    return pd.concat(frames, ignore_index=True)
279
+
280
+
281
def _forecast_one(
    series: pd.Series,
    *,
    horizon: int,
    level: float,
    method: ForecastMethod,
    logit_transform: bool,
) -> pd.DataFrame:
    """Forecast one univariate series ``horizon`` steps ahead.

    Returns a DataFrame with columns ``period``, ``point``,
    ``ci_lower`` and ``ci_upper``. Future ``period`` values continue a
    PeriodIndex when the input has one; otherwise a date range is built
    from the index's inferred frequency (falling back to daily).

    Raises
    ------
    ImportError
        If statsmodels is not installed.
    ValueError
        If ``method`` is not ``"auto"``, ``"ets"`` or ``"holt"``.
    """
    try:
        from statsmodels.tsa.exponential_smoothing.ets import ETSModel
        from statsmodels.tsa.holtwinters import Holt
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "forecast() requires statsmodels. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    has_period_index = isinstance(series.index, pd.PeriodIndex)

    # The models are fitted on a timestamp-indexed copy; when a regular
    # frequency can be inferred it is attached to the DatetimeIndex.
    working = series.copy()
    period_freq = None
    if has_period_index:
        period_freq = series.index.freq
        stamps = series.index.to_timestamp()
        guessed = getattr(stamps, "inferred_freq", None)
        if guessed:
            working.index = pd.DatetimeIndex(stamps, freq=guessed)
        else:
            working.index = stamps

    if logit_transform:
        # Keep values strictly inside (0, 1) so the logit stays finite.
        eps = 1e-6
        bounded = np.clip(working.to_numpy(dtype=float), eps, 1.0 - eps)
        z = pd.Series(
            np.log(bounded / (1.0 - bounded)),
            index=working.index,
            name=working.name,
        )
    else:
        z = working.astype(float)

    n_obs = len(z)
    if method == "auto":
        chosen = "ets" if n_obs >= 8 else "holt"
    else:
        chosen = method

    alpha = 1.0 - level

    if chosen == "ets":
        fitted = ETSModel(z, error="add", trend="add", seasonal=None).fit(
            disp=False
        )
        prediction = fitted.get_prediction(start=n_obs, end=n_obs + horizon - 1)
        frame = prediction.summary_frame(alpha=alpha)
        z_point = frame["mean"].to_numpy(dtype=float)
        z_lower = frame["pi_lower"].to_numpy(dtype=float)
        z_upper = frame["pi_upper"].to_numpy(dtype=float)
    elif chosen == "holt":
        holt_fit = Holt(z, initialization_method="estimated").fit()
        z_point = np.asarray(holt_fit.forecast(steps=horizon), dtype=float)
        # Residual-based interval: a normal quantile times the in-sample
        # residual standard deviation, widened by sqrt(step).
        resid_std = float(np.std(holt_fit.resid, ddof=1))
        from scipy.stats import norm

        crit = float(norm.ppf(1.0 - alpha / 2.0))
        steps_ahead = np.arange(1, horizon + 1, dtype=float)
        half_width = crit * resid_std * np.sqrt(steps_ahead)
        z_lower = z_point - half_width
        z_upper = z_point + half_width
    else:
        raise ValueError(
            f"unknown method={chosen!r}; expected 'auto', 'ets', or 'holt'"
        )

    # Undo the logit transform (if any) on the point and both bounds.
    if logit_transform:
        point, lower, upper = (
            _inv_logit(z_point),
            _inv_logit(z_lower),
            _inv_logit(z_upper),
        )
    else:
        point, lower, upper = z_point, z_lower, z_upper

    # Future periods use the same index type as the input series.
    future_idx: pd.Index
    if has_period_index and period_freq is not None:
        next_period = series.index[-1] + 1
        future_idx = pd.Index(
            pd.period_range(start=next_period, periods=horizon, freq=period_freq)
        )
    else:
        last_ts = series.index[-1]
        inferred = getattr(series.index, "inferred_freq", None) or "D"
        offset = pd.tseries.frequencies.to_offset(inferred)
        future_idx = pd.date_range(
            start=last_ts + offset, periods=horizon, freq=offset
        )

    return pd.DataFrame(
        {
            "period": future_idx,
            "point": point,
            "ci_lower": lower,
            "ci_upper": upper,
        }
    )
396
+
397
+
398
+ def _inv_logit(x: np.ndarray) -> np.ndarray:
399
+ """Numerically-stable inverse logit."""
400
+ out: np.ndarray = np.where(
401
+ x >= 0,
402
+ 1.0 / (1.0 + np.exp(-x)),
403
+ np.exp(x) / (1.0 + np.exp(x)),
404
+ )
405
+ return out
@@ -0,0 +1,123 @@
1
+ """Interrupted time-series analysis around a known event date.
2
+
3
+ Implements the standard segmented-regression specification:
4
+
5
+ y_t = β₀ + β₁·t + β₂·post_t + β₃·(t − t_event)·post_t + ε
6
+
7
+ where ``post_t`` is 1 when ``t ≥ t_event`` and 0 otherwise. The
8
+ coefficients of interest:
9
+
10
+ - ``β₂`` (level change): the immediate step at the intervention.
11
+ - ``β₃`` (slope change): how the post-period trend differs from
12
+ the pre-period trend.
13
+
14
+ Reference
15
+ ---------
16
+ Wagner, A. K., Soumerai, S. B., Zhang, F., & Ross-Degnan, D. (2002).
17
+ Segmented regression analysis of interrupted time series studies in
18
+ medication use research. *Journal of Clinical Pharmacy and
19
+ Therapeutics*, 27(4), 299-309.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+
27
+
28
def interrupted_time_series(
    series: pd.Series,
    event_date: str | pd.Period | pd.Timestamp,
) -> pd.DataFrame:
    """Fit a segmented-regression ITS model around ``event_date``.

    The design matrix has four columns: a constant, a 0-based time
    counter, a post-event indicator, and the time elapsed since the
    event (zero before it). The model is fitted with statsmodels OLS.

    Parameters
    ----------
    series
        Observations indexed by period or datetime. Each index value is
        compared against the normalised ``event_date`` to build the
        pre/post split.
    event_date
        Intervention point: a string, :class:`pandas.Period` or
        :class:`pandas.Timestamp`.

    Returns
    -------
    pandas.DataFrame
        Four rows (one per coefficient), columns ``term``, ``coef``,
        ``std_err``, ``t``, ``p_value``, ``ci_lower``, ``ci_upper``.
        ``term`` values: ``"intercept"``, ``"time"``,
        ``"level_change"``, ``"slope_change"``.

    Raises
    ------
    ImportError
        If statsmodels is not installed.
    ValueError
        If the series has fewer than 4 observations, contains NaN, or
        the event falls entirely before or after the observed index.
    """
    try:
        import statsmodels.api as sm
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "interrupted_time_series requires statsmodels. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    n_obs = len(series)
    if n_obs < 4:
        raise ValueError(f"need at least 4 observations; got {n_obs}")
    if series.isna().any():
        raise ValueError("series contains NaN values; impute or drop them first")

    y = series.to_numpy(dtype=float)
    # A 0-based counter keeps the coefficients in "per period" units.
    t = np.arange(n_obs, dtype=float)

    event_norm = _normalise_event(event_date, series)
    # 1 where the index value is at or after the event, 0 before it.
    flags = [_ge(stamp, event_norm) for stamp in series.index]
    post_mask = np.array(flags, dtype=int)

    n_post = post_mask.sum()
    if n_post == 0:
        raise ValueError(f"event_date={event_date!r} is after the last period")
    if n_post == n_obs:
        raise ValueError(f"event_date={event_date!r} is before the first period")

    # Time value of the first post-event observation.
    event_t = float(t[post_mask.astype(bool)][0])
    time_after = (t - event_t) * post_mask

    design = np.column_stack(
        [np.ones_like(t), t, post_mask.astype(float), time_after]
    )
    fit = sm.OLS(y, design).fit()

    bounds = fit.conf_int()
    return pd.DataFrame(
        {
            "term": ["intercept", "time", "level_change", "slope_change"],
            "coef": fit.params,
            "std_err": fit.bse,
            "t": fit.tvalues,
            "p_value": fit.pvalues,
            "ci_lower": bounds[:, 0],
            "ci_upper": bounds[:, 1],
        }
    )
98
+
99
+
100
+ def _normalise_event(
101
+ event: str | pd.Period | pd.Timestamp, series: pd.Series
102
+ ) -> pd.Period | pd.Timestamp:
103
+ """Coerce ``event`` to the same type as the series index for comparison.
104
+
105
+ The pandas type hierarchy here is a bit fiddly: ``Period`` and
106
+ ``Timestamp`` aren't directly comparable, and ``Period(event,
107
+ freq=BaseOffset)`` isn't accepted by mypy even though it works at
108
+ runtime. We route through string forms where types disagree.
109
+ """
110
+ sample = series.index[0]
111
+ if isinstance(sample, pd.Period):
112
+ if isinstance(event, pd.Period):
113
+ return event
114
+ freqstr: str = str(sample.freqstr)
115
+ return pd.Period(str(event), freq=freqstr)
116
+ if isinstance(event, pd.Period):
117
+ return pd.Timestamp(str(event))
118
+ return pd.Timestamp(event)
119
+
120
+
121
+ def _ge(period_index_value: object, event: object) -> bool:
122
+ """``period_index_value >= event`` with sensible coercions."""
123
+ return bool(period_index_value >= event) # type: ignore[operator]
@@ -0,0 +1,174 @@
1
+ """Temporal slicing primitives — ``TemporalCorpus``, ``track``, ``Tracker``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterator
6
+ from dataclasses import dataclass
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ from ..corpus import Corpus, CorpusSlice
12
+ from ..results import TemporalTrajectory
13
+ from ..stats import wilson_ci
14
+
15
+
16
@dataclass(frozen=True)
class TemporalCorpus:
    """A corpus indexed by time period for diachronic analysis.

    The parent corpus's ``time_col`` is parsed with ``pd.to_datetime``
    and bucketed with ``to_period(freq)``. Only populated periods are
    reported by :meth:`periods` and :meth:`iter_slices`; documents whose
    timestamp is missing (``NaT``) belong to no period and are left out
    of both.
    """

    parent: Corpus
    time_col: str
    freq: str = "Y"

    def __len__(self) -> int:
        """Number of documents in the parent corpus."""
        return len(self.parent)

    def _period_series(self) -> pd.Series:
        """Per-document Period values, indexed like the parent's docs frame."""
        times = pd.to_datetime(self.parent.docs[self.time_col])
        return times.dt.to_period(self.freq)

    def _mask_for(self, idx: pd.Series, period: pd.Period) -> pd.Series:
        """Boolean mask over the parent's docs selecting ``period``."""
        return pd.Series(idx.values == period, index=self.parent.docs.index)

    def periods(self) -> list[pd.Period]:
        """Sorted list of populated periods.

        ``NaT`` entries are dropped first: they are not periods, and
        they break the ordering because every comparison against
        ``NaT`` is false.
        """
        return sorted(self._period_series().dropna().unique())

    def slice(self, period: pd.Period | str) -> CorpusSlice:
        """Return the :class:`CorpusSlice` for one period.

        ``period`` may be a :class:`pandas.Period` or any string pandas
        can parse to one (e.g. ``"2020"``, ``"2020Q1"``, ``"2020-03"``);
        strings are parsed with this corpus's ``freq``.
        """
        idx = self._period_series()
        period_obj = (
            pd.Period(period, freq=self.freq) if isinstance(period, str) else period
        )
        return CorpusSlice(
            parent=self.parent,
            mask=self._mask_for(idx, period_obj),
            filters={"period": str(period_obj)},
        )

    def iter_slices(self) -> Iterator[tuple[pd.Period, CorpusSlice]]:
        """Yield ``(period, CorpusSlice)`` pairs in chronological order."""
        # Compute the period series once and derive the period list from it.
        idx = self._period_series()
        for period in sorted(idx.dropna().unique()):
            yield period, CorpusSlice(
                parent=self.parent,
                mask=self._mask_for(idx, period),
                filters={"period": str(period)},
            )
65
+
66
+
67
@dataclass(frozen=True)
class Tracker:
    """A diachronic tracker over one or more target terms."""

    corpus: Corpus | CorpusSlice
    targets: list[str]

    def over_time(
        self,
        freq: str = "Y",
        time_col: str = "date",
        confidence: float = 0.95,
    ) -> TemporalTrajectory:
        """Return a :class:`TemporalTrajectory` of relative frequencies.

        For every populated period × target this records the raw count,
        the period's token total, the relative frequency (``NaN`` when
        the period has no tokens) and the interval returned by
        ``wilson_ci`` at ``confidence``. The table has one row per
        (period, term) pair, sorted by term then period.

        When there are no populated periods or no targets the result is
        an empty table that still carries the full column set.
        """
        columns = [
            "period",
            "term",
            "count",
            "total",
            "relfreq",
            "ci_lower",
            "ci_upper",
        ]
        temporal = self.corpus.by_time(time_col, freq)
        rows: list[dict[str, object]] = []
        for period, slice_ in temporal.iter_slices():
            # Flatten the per-document token lists into one list per period.
            all_tokens: list[str] = [
                tok for doc in slice_.tokens() for tok in doc
            ]
            total = len(all_tokens)
            counter = (
                pd.Series(all_tokens).value_counts()
                if total
                else pd.Series(dtype=int)
            )
            for target in self.targets:
                count = int(counter.get(target, 0))
                relfreq = (count / total) if total > 0 else float("nan")
                lo, hi = wilson_ci(
                    np.array([count], dtype=np.int64),
                    np.array([total], dtype=np.int64),
                    confidence=confidence,
                )
                rows.append(
                    {
                        "period": period,
                        "term": target,
                        "count": count,
                        "total": total,
                        "relfreq": relfreq,
                        "ci_lower": float(lo[0]),
                        "ci_upper": float(hi[0]),
                    }
                )
        # Pinning ``columns`` keeps the sort keys present even when ``rows``
        # is empty, where a bare DataFrame([]) would have no columns.
        table = (
            pd.DataFrame(rows, columns=columns)
            .sort_values(["term", "period"], kind="stable")
            .reset_index(drop=True)
        )
        return TemporalTrajectory(table=table, targets=list(self.targets), freq=freq)

    def trajectory(
        self,
        freq: str = "Y",
        time_col: str = "date",
        confidence: float = 0.95,
    ) -> TemporalTrajectory:
        """Alias for :meth:`over_time`."""
        return self.over_time(freq=freq, time_col=time_col, confidence=confidence)

    def semantic_over_time(
        self,
        freq: str = "Y",
        time_col: str = "date",
        embedder: object | None = None,
        window: int = 5,
        baseline_period: str | None = None,
    ) -> pd.DataFrame:
        """Track each target's semantic trajectory across time periods.

        Thin wrapper around
        :func:`pycorpdiff.semantic.semantic_trajectory`, which documents
        the parameters in full. A single target is passed as a plain
        string, several targets as a list.
        """
        from ..semantic.shift import (  # noqa: F401 — keeps the import side-effect close to the use
            semantic_shift,
        )
        from ..semantic.trajectory import semantic_trajectory

        return semantic_trajectory(
            self.corpus,
            target=self.targets if len(self.targets) > 1 else self.targets[0],
            time_col=time_col,
            freq=freq,
            embedder=embedder,  # type: ignore[arg-type]
            window=window,
            baseline_period=baseline_period,
        )
162
+
163
+
164
def track(
    corpus: Corpus | CorpusSlice, target: str | list[str]
) -> Tracker:
    """Build a :class:`Tracker` for one or more target terms.

    ``corpus`` may be a :class:`Corpus` or a :class:`CorpusSlice`. A
    single string ``target`` is wrapped in a one-element list; any other
    value is copied into a new list.
    """
    if isinstance(target, str):
        term_list = [target]
    else:
        term_list = list(target)
    return Tracker(corpus=corpus, targets=term_list)