pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,233 @@
1
+ """Bayesian online changepoint detection (Adams & MacKay 2007).
2
+
3
+ Where :func:`detect_changepoints` runs PELT *offline* — needing the full
4
+ series and returning MAP locations after the fact — BOCPD runs *online*:
5
+ at each new observation it returns the posterior distribution over *run
6
+ length*, where the run length is the number of periods since the last
7
+ changepoint. The two most actionable summaries:
8
+
9
+ - **MAP run length** at time *t*: the most-probable number of periods
10
+ since the last changepoint. Drops sharply at changepoints, grows
11
+ steadily during stable regimes.
12
+ - **Changepoint probability** P(*r* = 0 | data through *t*): posterior
13
+ probability that a changepoint occurred at the most recent step
14
+ (the mass on run length zero). Useful for live monitoring.
15
+
16
+ Observation model: Gaussian with a Normal-Inverse-Gamma conjugate
17
+ prior over (mean, variance). The predictive distribution is Student's
18
+ *t*, computed analytically.
19
+
20
+ Reference
21
+ ---------
22
+ Adams, R. P., & MacKay, D. J. C. (2007). Bayesian online changepoint
23
+ detection. arXiv:0710.3742.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from dataclasses import dataclass, field
29
+ from pathlib import Path
30
+ from typing import TYPE_CHECKING, Any
31
+
32
+ import numpy as np
33
+ import pandas as pd
34
+ from scipy.stats import t as student_t
35
+
36
+ from ..results import _table_to_html, _table_to_json
37
+
38
+ if TYPE_CHECKING:
39
+ import altair as alt
40
+
41
+
42
@dataclass(frozen=True)
class BocpdResult:
    """Immutable container for everything :func:`bocpd` produces.

    ``run_length_posterior`` keeps the complete run-length posterior so
    it can be drawn as a heatmap; ``map_run_length`` and
    ``cp_probability`` are the per-step summaries derived from it.
    """

    series: pd.Series
    run_length_posterior: np.ndarray  # shape (T, max_r+1)
    map_run_length: pd.Series  # length T, indexed like series
    cp_probability: pd.Series  # length T, indexed like series
    hazard: float
    params: dict[str, Any] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Flatten the result into one row per step.

        Returns
        -------
        pandas.DataFrame
            Columns ``period`` (the index of ``series``), ``value``,
            ``map_run_length`` and ``cp_probability``.
        """
        columns = {
            "period": self.series.index,
            "value": self.series.to_numpy(dtype=float),
            "map_run_length": self.map_run_length.to_numpy(dtype=int),
            "cp_probability": self.cp_probability.to_numpy(dtype=float),
        }
        return pd.DataFrame(columns)

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the per-step table through ``_table_to_html``."""
        table = self.to_df()
        return _table_to_html(table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the per-step table through ``_table_to_json``."""
        table = self.to_df()
        return _table_to_json(table, path, **kw)

    def detected_changepoints(self, *, threshold: int = 3) -> pd.DataFrame:
        """Steps whose MAP run length is at or below ``threshold``.

        Returns
        -------
        pandas.DataFrame
            The matching rows of :meth:`to_df` (columns ``period``,
            ``value``, ``map_run_length``, ``cp_probability``),
            renumbered from zero.
        """
        table = self.to_df()
        is_short_run = table["map_run_length"].le(threshold)
        # Copy so the returned frame is independent of the intermediate table.
        hits = table.loc[is_short_run].copy()
        return hits.reset_index(drop=True)

    def plot(self, **kw: Any) -> alt.Chart:
        """Build the BOCPD chart; the viz module is imported lazily."""
        from ..viz.bocpd import bocpd_plot

        return bocpd_plot(self, **kw)

    def summary(self) -> str:
        """One-line description: hazard, series length and flagged-step count."""
        n_steps = len(self.series)
        n_detected = len(self.detected_changepoints(threshold=3))
        return (
            f"BocpdResult(hazard={self.hazard}, T={n_steps}, "
            f"detected={n_detected})"
        )
99
+
100
+
101
def bocpd(
    series: pd.Series,
    *,
    hazard: float = 0.01,
    mu_0: float | None = None,
    kappa_0: float = 1.0,
    alpha_0: float = 1.0,
    beta_0: float | None = None,
    max_run_length: int | None = None,
) -> BocpdResult:
    """Bayesian Online Changepoint Detection.

    Runs the run-length filter over ``series`` with a constant hazard
    and a Gaussian observation model under a Normal-Inverse-Gamma prior
    (Student-t predictive).

    Parameters
    ----------
    series
        Time-indexed series of scalar observations. Must contain at
        least two values, all finite.
    hazard
        Constant hazard rate — probability of a changepoint at any
        given step. Equivalent to an expected mean run length of
        ``1/hazard``. ``0.01`` (default) → expect a regime change every
        ~100 periods. For monthly data, ``hazard=1/24`` → "every 2
        years".
    mu_0
        Prior mean. Defaults to the series mean, but for serious work
        pass a sensible scale-anchored prior (e.g. zero for centered
        rates).
    kappa_0
        Prior pseudo-count on the mean. ``1.0`` is weakly informative.
    alpha_0, beta_0
        Inverse-Gamma hyperparameters on the variance. When ``beta_0``
        is omitted it is derived from the sample variance of ``series``
        (``var * (alpha_0 - 0.5)`` if ``alpha_0 > 0.5``, else ``0.01``)
        and floored at ``1e-6``; override if you have scale information
        from outside the data window.
    max_run_length
        Upper bound (>= 1) on the run lengths tracked. Defaults to the
        full series length and is capped at it. Once the bound is
        reached, the hypotheses that would grow past it are dropped and
        the remaining posterior mass is renormalised, so the number of
        hypotheses evaluated per step never exceeds ``max_run_length``.

    Returns
    -------
    :class:`BocpdResult`
        ``run_length_posterior`` has one row per observation and
        ``max_run_length + 1`` columns; ``map_run_length`` is its
        row-wise argmax and ``cp_probability`` is its first column
        (posterior mass on run length zero).

    Raises
    ------
    ValueError
        If ``hazard`` is outside ``(0, 1)``; if ``kappa_0``, ``alpha_0``
        or ``beta_0`` is not positive; if ``max_run_length`` is below 1;
        if ``series`` has fewer than two observations or contains NaN
        or infinite values.
    """
    if hazard <= 0 or hazard >= 1:
        raise ValueError(f"hazard must be in (0, 1); got {hazard}")
    if kappa_0 <= 0 or alpha_0 <= 0:
        raise ValueError(
            f"kappa_0 and alpha_0 must be positive; got {kappa_0}, {alpha_0}"
        )
    # A bound below 1 leaves no run-length hypothesis to evaluate: the
    # posterior would be all zeros (or the array allocation would fail).
    if max_run_length is not None and int(max_run_length) < 1:
        raise ValueError(f"max_run_length must be >= 1; got {max_run_length}")

    y = series.to_numpy(dtype=float)
    n = len(y)
    if n < 2:
        raise ValueError(f"need at least 2 observations; got {n}")
    # Non-finite values poison the default priors and every predictive
    # density after them, so reject them up front.
    if not np.isfinite(y).all():
        raise ValueError(
            "series contains NaN or infinite values; impute or drop them first"
        )

    if mu_0 is None:
        mu_0 = float(np.mean(y))
    if beta_0 is None:
        # Weakly informative: encode the data's order-of-magnitude
        # variance so the t-distribution scale is sensible.
        if alpha_0 > 0.5:
            beta_0 = float(np.var(y, ddof=1)) * (alpha_0 - 0.5)
        else:
            beta_0 = 0.01
        beta_0 = max(beta_0, 1e-6)
    if beta_0 <= 0:
        raise ValueError(f"beta_0 must be positive; got {beta_0}")

    max_r = n if max_run_length is None else min(n, int(max_run_length))

    # Row t of ``posterior`` is the run-length distribution before
    # observing y[t]; row 0 puts all mass on run length 0.
    posterior = np.zeros((n + 1, max_r + 1))
    posterior[0, 0] = 1.0
    # Per-run-length sufficient statistics; slot r belongs to run length r.
    mu_vec = np.array([mu_0], dtype=float)
    kappa_vec = np.array([kappa_0], dtype=float)
    alpha_vec = np.array([alpha_0], dtype=float)
    beta_vec = np.array([beta_0], dtype=float)

    for t in range(n):
        x = y[t]

        # Truncate state at max_r entries.
        if len(mu_vec) > max_r:
            mu_vec = mu_vec[:max_r]
            kappa_vec = kappa_vec[:max_r]
            alpha_vec = alpha_vec[:max_r]
            beta_vec = beta_vec[:max_r]
        n_r = len(mu_vec)

        # 1. Predictive probability for x under each run length.
        scale_vec = np.sqrt(beta_vec * (kappa_vec + 1.0) / (alpha_vec * kappa_vec))
        df_vec = 2.0 * alpha_vec
        pi_vec = student_t.pdf(x, df=df_vec, loc=mu_vec, scale=scale_vec)

        weighted = posterior[t, :n_r] * pi_vec
        # 2. Growth mass (no changepoint) — shifts up by one r.
        growth = weighted * (1.0 - hazard)
        # 3. Changepoint mass (everything collapses to r=0).
        cp_mass = float((weighted * hazard).sum())

        next_len = min(n_r + 1, max_r + 1)
        posterior[t + 1, 0] = cp_mass
        if next_len > 1:
            posterior[t + 1, 1:next_len] = growth[: next_len - 1]
        total = posterior[t + 1, :].sum()
        # Skip normalisation when every predictive density underflowed
        # to zero; the row then stays all-zero.
        if total > 0:
            posterior[t + 1, :] /= total

        # 4. Update sufficient statistics — shift up with prior in slot 0.
        new_mu = (kappa_vec * mu_vec + x) / (kappa_vec + 1.0)
        new_kappa = kappa_vec + 1.0
        new_alpha = alpha_vec + 0.5
        new_beta = beta_vec + (kappa_vec * (x - mu_vec) ** 2) / (
            2.0 * (kappa_vec + 1.0)
        )
        mu_vec = np.concatenate([[mu_0], new_mu])
        kappa_vec = np.concatenate([[kappa_0], new_kappa])
        alpha_vec = np.concatenate([[alpha_0], new_alpha])
        beta_vec = np.concatenate([[beta_0], new_beta])

    # Drop the initial row so that row i is the posterior after y[i].
    posterior_obs = posterior[1:, : max_r + 1]
    map_r = posterior_obs.argmax(axis=1).astype(int)
    cp_prob = posterior_obs[:, 0]

    return BocpdResult(
        series=series,
        run_length_posterior=posterior_obs,
        map_run_length=pd.Series(map_r, index=series.index, name="map_run_length"),
        cp_probability=pd.Series(
            cp_prob, index=series.index, name="cp_probability"
        ),
        hazard=hazard,
        params={
            "mu_0": mu_0,
            "kappa_0": kappa_0,
            "alpha_0": alpha_0,
            "beta_0": beta_0,
            "max_run_length": max_r,
        },
    )
@@ -0,0 +1,293 @@
1
+ """Bayesian counterfactual causal impact (Brodersen et al. 2015).
2
+
3
+ The :func:`interrupted_time_series` module answers "is there a step
4
+ discontinuity at this known event?" via segmented OLS. The harder —
5
+ and more interesting — question is "what *would* the trajectory have
6
+ looked like *without* the event?" That's the counterfactual, and the
7
+ gap between observed reality and counterfactual prediction is the
8
+ causal effect of the event.
9
+
10
+ Method: Bayesian structural time series (BSTS) — a state-space model
11
+ with a local linear trend fit on the pre-event window, then projected
12
+ forward as the counterfactual for the post-event window. Implemented
13
+ via :class:`statsmodels.tsa.UnobservedComponents`. The credible
14
+ intervals on the pointwise and cumulative effects come from Monte
15
+ Carlo simulation against the joint posterior of the state-space
16
+ filter — anchored at the end of the pre-event training data and rolled
17
+ forward through the post-event horizon.
18
+
19
+ Reference
20
+ ---------
21
+ Brodersen, K. H., Gallusser, F., Koehler, J., Remy, N., & Scott, S. L.
22
+ (2015). Inferring causal impact using Bayesian structural time-series
23
+ models. *Annals of Applied Statistics*, 9(1), 247-274.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from dataclasses import dataclass, field
29
+ from pathlib import Path
30
+ from typing import TYPE_CHECKING, Any
31
+
32
+ import numpy as np
33
+ import pandas as pd
34
+
35
+ from ..results import _table_to_html, _table_to_json
36
+
37
+ if TYPE_CHECKING:
38
+ import altair as alt
39
+
40
+
41
@dataclass(frozen=True)
class CausalImpactResult:
    """Counterfactual causal-impact analysis around a known event.

    Tables (all period-indexed, columns ``period``, ``observed``,
    ``counterfactual``, ``counterfactual_lower``, ``counterfactual_upper``,
    plus the same for the pointwise and cumulative effects).

    Summary scalars live in :attr:`metrics` as a dict keyed by the
    standard CausalImpact reporting set: avg effect, absolute effect,
    relative effect, posterior probability of no effect.
    """

    target: str
    event_date: pd.Timestamp
    table: pd.DataFrame
    metrics: dict[str, float] = field(default_factory=dict)
    level: float = 0.95
    n_pre: int = 0
    n_post: int = 0
    params: dict[str, Any] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the per-period table."""
        return self.table.copy()

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the per-period table through ``_table_to_html``."""
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the per-period table through ``_table_to_json``."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Build the causal-impact chart; the viz module is imported lazily."""
        from ..viz.causal_impact import causal_impact_plot

        return causal_impact_plot(self, **kw)

    def summary(self) -> str:
        """Return a multi-line, human-readable report of :attr:`metrics`.

        Metrics missing from the dict are shown as NaN. The relative
        effect is shown as ``n/a`` when it is NaN.
        """
        import math

        m = self.metrics
        nan = float("nan")
        avg = m.get("avg_effect", nan)
        cum = m.get("cumulative_effect", nan)
        rel = m.get("relative_effect", nan)
        ci_lo = m.get("avg_effect_lower", nan)
        ci_hi = m.get("avg_effect_upper", nan)
        p = m.get("posterior_prob_no_effect", nan)
        rel_str = (
            "n/a (counterfactual ≈ 0)" if math.isnan(rel) else f"{rel * 100:+.1f}%"
        )
        # ``:g`` rather than ``int()``: truncation would print 0.995 as
        # "99%" and, through float error, 0.57 as "56%".
        level_pct = f"{self.level * 100:g}"
        return (
            f"CausalImpactResult(target={self.target!r}, "
            f"event={self.event_date.date()}, pre={self.n_pre}, post={self.n_post})\n"
            f" avg effect: {avg:+.4f} per period "
            f"({level_pct}% CrI [{ci_lo:+.4f}, {ci_hi:+.4f}])\n"
            f" cumulative effect: {cum:+.4f}\n"
            f" relative effect: {rel_str} vs counterfactual mean\n"
            f" P(no effect): {p:.3f}"
        )
99
+
100
+
101
def causal_impact(
    series: pd.Series,
    event_date: str | pd.Period | pd.Timestamp,
    *,
    level: float = 0.95,
    n_samples: int = 1000,
    seed: int | None = 0,
    model: str = "local linear trend",
) -> CausalImpactResult:
    """Estimate the effect of an event as observed minus counterfactual.

    An unobserved-components model is fitted to the observations before
    ``event_date`` and forecast over the remaining periods; that
    forecast is the counterfactual. Interval bands on the pointwise,
    cumulative and average effects are quantiles over paths simulated
    from the fitted model, anchored at the end of the pre-event data.

    Parameters
    ----------
    series
        Period- or datetime-indexed values. A ``PeriodIndex`` is
        converted to timestamps internally; the output table still
        reports the original periods.
    event_date
        Intervention date. Observations strictly before it form the
        pre-event window; everything else is post-event.
    level
        Interval level in ``(0, 1)`` used for every band.
    n_samples
        Number of simulated paths (at least 100).
    seed
        Passed to the simulation as ``random_state``.
    model
        Trend specification handed to
        :class:`statsmodels.tsa.UnobservedComponents`, e.g.
        ``"local linear trend"`` or ``"local level"``.

    Returns
    -------
    :class:`CausalImpactResult`
        One table row per post-event period plus summary metrics.
        ``target`` is left empty for the caller to fill in.

    Raises
    ------
    ImportError
        If statsmodels is not installed.
    ValueError
        If ``level`` or ``n_samples`` is out of range, or if there are
        fewer than 4 pre-event or fewer than 1 post-event observations.
    """
    try:
        import statsmodels.api as sm
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "causal_impact requires statsmodels. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    if not 0 < level < 1:
        raise ValueError(f"level must be in (0, 1); got {level}")
    if n_samples < 100:
        raise ValueError(f"n_samples must be >= 100; got {n_samples}")

    # Work on a timestamp-indexed copy so the state-space model can use
    # the inferred frequency when extending the index forward.
    work = series.copy()
    if isinstance(work.index, pd.PeriodIndex):
        work.index = work.index.to_timestamp()
    freq_hint = getattr(work.index, "inferred_freq", None)
    if isinstance(work.index, pd.DatetimeIndex) and freq_hint is not None:
        # Build a new frequency-tagged index instead of assigning to the
        # read-only ``.freq`` attribute.
        work.index = pd.DatetimeIndex(work.index, freq=freq_hint)

    event_ts = _coerce_event(event_date, work)

    pre_mask = np.asarray(work.index < event_ts)
    post_mask = ~pre_mask
    n_pre = int(pre_mask.sum())
    n_post = int(post_mask.sum())
    if n_pre < 4:
        raise ValueError(
            f"need at least 4 pre-event observations; got {n_pre} "
            f"(event_date={event_date!r})"
        )
    if n_post < 1:
        raise ValueError(
            f"need at least 1 post-event observation; event_date={event_date!r} "
            "is at or after the last period"
        )

    pre = work[pre_mask]
    post = work[post_mask]
    h = len(post)

    # Fit on the pre-event window only.
    fit = sm.tsa.UnobservedComponents(pre, level=model).fit(disp=False)

    # Counterfactual point forecast and its marginal interval.
    forecast = fit.get_forecast(steps=h)
    cf_mean = np.asarray(forecast.predicted_mean, dtype=float)
    cf_bounds = np.asarray(forecast.conf_int(alpha=1.0 - level), dtype=float)
    cf_lower, cf_upper = cf_bounds[:, 0], cf_bounds[:, 1]

    # Simulated paths, anchored at the end of the pre-event data, feed
    # the pointwise, cumulative and average effect bands.
    sim = fit.simulate(
        nsimulations=h,
        repetitions=int(n_samples),
        anchor="end",
        random_state=seed,
    )
    if isinstance(sim, pd.DataFrame):
        paths = sim.to_numpy()
    else:
        paths = np.asarray(sim)
    # Expected layout is (h, repetitions); flip it to (repetitions, h).
    if paths.shape == (h, n_samples):
        paths = paths.T

    obs_arr = post.to_numpy(dtype=float)
    pointwise_paths = obs_arr[None, :] - paths
    cumulative_paths = np.cumsum(pointwise_paths, axis=1)
    avg_effect_per_path = pointwise_paths.mean(axis=1)

    tail = (1.0 - level) / 2.0
    lo_q, hi_q = tail, 1.0 - tail
    pw_mean = obs_arr - cf_mean
    pw_lower = np.quantile(pointwise_paths, lo_q, axis=0)
    pw_upper = np.quantile(pointwise_paths, hi_q, axis=0)
    cum_mean = cumulative_paths.mean(axis=0)
    cum_lower = np.quantile(cumulative_paths, lo_q, axis=0)
    cum_upper = np.quantile(cumulative_paths, hi_q, axis=0)

    # Summary scalars, averaged over the post-event window.
    avg_effect = float(pw_mean.mean())
    avg_lower = float(np.quantile(avg_effect_per_path, lo_q))
    avg_upper = float(np.quantile(avg_effect_per_path, hi_q))
    cf_mean_avg = float(cf_mean.mean())
    if abs(cf_mean_avg) > 1e-12:
        relative_effect = avg_effect / cf_mean_avg
    else:
        relative_effect = float("nan")

    # Two-tailed: twice the share of paths whose average effect lies at
    # or beyond zero on the side opposite the point estimate.
    if avg_effect >= 0:
        opposite_side = avg_effect_per_path <= 0
    else:
        opposite_side = avg_effect_per_path >= 0
    p_no_effect = min(1.0, max(0.0, float(opposite_side.mean()) * 2.0))

    # Report the caller's own Period objects when that is what they gave.
    periods_out: pd.Index
    if isinstance(series.index, pd.PeriodIndex):
        periods_out = series.index[post_mask]
    else:
        periods_out = work.index[post_mask]

    table = pd.DataFrame(
        {
            "period": periods_out,
            "observed": obs_arr,
            "counterfactual": cf_mean,
            "counterfactual_lower": cf_lower,
            "counterfactual_upper": cf_upper,
            "pointwise_effect": pw_mean,
            "pointwise_lower": pw_lower,
            "pointwise_upper": pw_upper,
            "cumulative_effect": cum_mean,
            "cumulative_lower": cum_lower,
            "cumulative_upper": cum_upper,
        }
    )

    metrics = {
        "avg_effect": avg_effect,
        "avg_effect_lower": avg_lower,
        "avg_effect_upper": avg_upper,
        "cumulative_effect": float(cum_mean[-1]),
        "cumulative_effect_lower": float(cum_lower[-1]),
        "cumulative_effect_upper": float(cum_upper[-1]),
        "relative_effect": relative_effect,
        "counterfactual_mean": cf_mean_avg,
        "posterior_prob_no_effect": p_no_effect,
    }
    return CausalImpactResult(
        target="",  # filled in by the caller — TemporalTrajectory knows the name
        event_date=event_ts,
        table=table,
        metrics=metrics,
        level=level,
        n_pre=n_pre,
        n_post=n_post,
        params={"model": model, "n_samples": n_samples, "seed": seed},
    )
285
+
286
+
287
+ def _coerce_event(event: object, series: pd.Series) -> pd.Timestamp:
288
+ """Coerce an event-date argument to a Timestamp aligned with the series."""
289
+ if isinstance(event, pd.Timestamp):
290
+ return event
291
+ if isinstance(event, pd.Period):
292
+ return pd.Timestamp(str(event))
293
+ return pd.Timestamp(str(event))
@@ -0,0 +1,92 @@
1
+ """Changepoint detection on temporal frequency / similarity series.
2
+
3
+ Wraps the ``ruptures`` library; importable only when the ``temporal``
4
+ extra is installed.
5
+
6
+ References
7
+ ----------
8
+ Truong, C., Oudre, L., & Vayatis, N. (2020). Selective review of
9
+ offline change point detection methods. *Signal Processing*, 167,
10
+ 107299.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Literal
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+ ChangepointMethod = Literal["pelt", "binseg", "window"]
21
+
22
+
23
def detect_changepoints(
    series: pd.Series,
    method: ChangepointMethod = "pelt",
    penalty: float | None = None,
    model: str = "rbf",
) -> pd.DataFrame:
    """Locate interior changepoints in a value series with ``ruptures``.

    Parameters
    ----------
    series
        1-D series with at least four non-NaN values. Its index values
        are echoed in the output so breakpoints can be reported in the
        caller's own time vocabulary.
    method
        ``"pelt"`` (default), ``"binseg"`` or ``"window"``, selecting
        ``ruptures.Pelt``, ``ruptures.Binseg`` or ``ruptures.Window``.
    penalty
        Penalty passed to the detector's ``predict``. ``None`` means
        ``log(n)`` for a series of length ``n``.
    model
        Cost-function name forwarded to the detector (``"rbf"`` by
        default). See the ruptures documentation for the full list.

    Returns
    -------
    pandas.DataFrame
        Columns ``period`` (index value at the breakpoint), ``index``
        (positional index), and ``method`` (echo of the chosen method).

    Raises
    ------
    ImportError
        If ruptures is not installed.
    ValueError
        If the series is not 1-D, has fewer than four observations,
        contains NaN, or ``method`` is not recognised.
    """
    try:
        import ruptures as rpt
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "detect_changepoints requires ruptures. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    signal = np.asarray(series.to_numpy(dtype=float))
    if signal.ndim != 1:
        raise ValueError("series must be 1-D")
    n_obs = len(signal)
    if n_obs < 4:
        raise ValueError(f"need at least 4 observations; got {n_obs}")
    if np.isnan(signal).any():
        raise ValueError("series contains NaN values; impute or drop them first")

    if penalty is None:
        pen = float(np.log(n_obs))
    else:
        pen = float(penalty)

    # Map each method name to the ruptures class implementing it.
    detector_names = {"pelt": "Pelt", "binseg": "Binseg", "window": "Window"}
    if not isinstance(method, str) or method not in detector_names:
        # pragma: no cover - typing makes this unreachable
        raise ValueError(f"unknown method={method!r}")
    algo = getattr(rpt, detector_names[method])(model=model)

    algo.fit(signal.reshape(-1, 1))

    rows = []
    for bp in algo.predict(pen=pen):
        # The final boundary equals the series length and is not an
        # interior changepoint, so it is skipped.
        if bp >= n_obs:
            continue
        rows.append({"period": series.index[bp], "index": bp, "method": method})
    return pd.DataFrame(rows, columns=["period", "index", "method"])