pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""Bayesian online changepoint detection (Adams & MacKay 2007).
|
|
2
|
+
|
|
3
|
+
Where :func:`detect_changepoints` runs PELT *offline* — needing the full
|
|
4
|
+
series and returning MAP locations after the fact — BOCPD runs *online*:
|
|
5
|
+
at each new observation it returns the posterior distribution over *run
|
|
6
|
+
length*, where the run length is the number of periods since the last
|
|
7
|
+
changepoint. The two most actionable summaries:
|
|
8
|
+
|
|
9
|
+
- **MAP run length** at time *t*: the most-probable number of periods
|
|
10
|
+
since the last changepoint. Drops sharply at changepoints, grows
|
|
11
|
+
steadily during stable regimes.
|
|
12
|
+
- **Changepoint probability** P(*r* < ``threshold`` | data through *t*):
|
|
13
|
+
posterior probability that a changepoint occurred within the last
|
|
14
|
+
``threshold`` steps. Useful for live monitoring.
|
|
15
|
+
|
|
16
|
+
Observation model: Gaussian with a Normal-Inverse-Gamma conjugate
|
|
17
|
+
prior over (mean, variance). The predictive distribution is Student's
|
|
18
|
+
*t*, computed analytically.
|
|
19
|
+
|
|
20
|
+
Reference
|
|
21
|
+
---------
|
|
22
|
+
Adams, R. P., & MacKay, D. J. C. (2007). Bayesian online changepoint
|
|
23
|
+
detection. arXiv:0710.3742.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import TYPE_CHECKING, Any
|
|
31
|
+
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
from scipy.stats import t as student_t
|
|
35
|
+
|
|
36
|
+
from ..results import _table_to_html, _table_to_json
|
|
37
|
+
|
|
38
|
+
if TYPE_CHECKING:
|
|
39
|
+
import altair as alt
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class BocpdResult:
    """Immutable container for the output of :func:`bocpd`.

    Holds the input series, the full run-length posterior matrix (kept so
    it can be drawn as a heatmap), and two per-step summaries derived from
    that matrix: the MAP run length and the changepoint probability.
    """

    # Input observations, as passed to ``bocpd``.
    series: pd.Series
    # One row per observation, one column per run length: shape (T, max_r+1).
    run_length_posterior: np.ndarray
    # Most probable run length after each observation; indexed like ``series``.
    map_run_length: pd.Series
    # Posterior changepoint probability per step; indexed like ``series``.
    cp_probability: pd.Series
    # Constant hazard rate the model was run with.
    hazard: float
    # Hyperparameters actually used (after defaults were resolved).
    params: dict[str, Any] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Return one row per step: period, value, map_run_length, cp_probability."""
        columns = {
            "period": self.series.index,
            "value": self.series.to_numpy(dtype=float),
            "map_run_length": self.map_run_length.to_numpy(dtype=int),
            "cp_probability": self.cp_probability.to_numpy(dtype=float),
        }
        return pd.DataFrame(columns)

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the per-step table through the shared HTML table helper."""
        per_step = self.to_df()
        return _table_to_html(per_step, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the per-step table through the shared JSON table helper."""
        per_step = self.to_df()
        return _table_to_json(per_step, path, **kw)

    def detected_changepoints(self, *, threshold: int = 3) -> pd.DataFrame:
        """Rows of the per-step table whose MAP run length is at most ``threshold``.

        Note that the comparison is inclusive (``<=``).

        Returns
        -------
        pandas.DataFrame
            Columns: ``period``, ``value``, ``map_run_length``,
            ``cp_probability``; re-indexed from 0.
        """
        per_step = self.to_df()
        is_short_run = per_step["map_run_length"] <= threshold
        return per_step[is_short_run].copy().reset_index(drop=True)

    def plot(self, **kw: Any) -> alt.Chart:
        """Delegate to :func:`bocpd_plot`, forwarding keyword arguments."""
        # Imported lazily so the plotting module is only loaded on demand.
        from ..viz.bocpd import bocpd_plot

        return bocpd_plot(self, **kw)

    def summary(self) -> str:
        """One-line description: hazard, series length, and flagged-step count."""
        n_flagged = len(self.detected_changepoints(threshold=3))
        n_steps = len(self.series)
        return (
            f"BocpdResult(hazard={self.hazard}, T={n_steps}, "
            f"detected={n_flagged})"
        )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def bocpd(
    series: pd.Series,
    *,
    hazard: float = 0.01,
    mu_0: float | None = None,
    kappa_0: float = 1.0,
    alpha_0: float = 1.0,
    beta_0: float | None = None,
    max_run_length: int | None = None,
    cp_threshold: int = 1,
) -> BocpdResult:
    """Bayesian Online Changepoint Detection.

    Runs the run-length recursion with a constant hazard and a Gaussian
    observation model under a Normal-Inverse-Gamma prior, whose posterior
    predictive is a Student's *t* evaluated with :mod:`scipy.stats`.

    Parameters
    ----------
    series
        Time-indexed series of scalar observations. Must contain at least
        two values, all finite.
    hazard
        Constant hazard rate — probability of a changepoint at any
        given step. Equivalent to an expected mean run length of
        ``1/hazard``. ``0.01`` (default) → expect a regime change every
        ~100 periods. For monthly data, ``hazard=1/24`` → "every 2
        years".
    mu_0
        Prior mean. Defaults to the series mean, but for serious work
        pass a sensible scale-anchored prior (e.g. zero for centered
        rates).
    kappa_0
        Prior pseudo-count on the mean. ``1.0`` is weakly informative.
    alpha_0, beta_0
        Inverse-Gamma hyperparameters on the variance. When ``beta_0`` is
        ``None`` it is set to ``var(series, ddof=1) * (alpha_0 - 0.5)`` if
        ``alpha_0 > 0.5`` and to ``0.01`` otherwise, floored at ``1e-6``.
        Override it if you have scale information from outside the data
        window.
    max_run_length
        Truncate the run-length state at this many steps (must be >= 1).
        Defaults to the full series length. Setting it (e.g. ``200``)
        bounds the work per step to vectors of that length, i.e. a total
        cost of O(T·max_run_length) for very long series.
    cp_threshold
        ``cp_probability`` is the posterior mass on run lengths strictly
        below this value, P(r < ``cp_threshold`` | data through t). With
        the default of ``1`` this is P(r = 0), which reproduces the
        historical output; note that under a constant hazard that quantity
        normalises to ``hazard`` at every step, so pass a larger value
        (e.g. ``3``) to obtain a data-dependent monitoring signal.

    Returns
    -------
    :class:`BocpdResult`

    Raises
    ------
    ValueError
        If ``hazard`` is not strictly inside (0, 1); if ``kappa_0``,
        ``alpha_0`` or ``beta_0`` is not positive; if ``cp_threshold`` or
        ``max_run_length`` is below 1; if the series has fewer than two
        observations or contains NaN / infinite values.
    """
    # The chained comparison also rejects NaN, which would otherwise pass
    # a ``hazard <= 0 or hazard >= 1`` test and poison every posterior row.
    if not 0 < hazard < 1:
        raise ValueError(f"hazard must be in (0, 1); got {hazard}")
    if kappa_0 <= 0 or alpha_0 <= 0:
        raise ValueError(
            f"kappa_0 and alpha_0 must be positive; got {kappa_0}, {alpha_0}"
        )
    if cp_threshold < 1:
        raise ValueError(f"cp_threshold must be >= 1; got {cp_threshold}")

    y = series.to_numpy(dtype=float)
    n = len(y)
    if n < 2:
        raise ValueError(f"need at least 2 observations; got {n}")
    # A single non-finite value turns the default priors and every later
    # posterior row into NaN, so fail loudly instead of returning garbage.
    if not np.isfinite(y).all():
        raise ValueError(
            "series contains NaN or infinite values; impute or drop them first"
        )

    if max_run_length is None:
        max_r = n
    else:
        max_r = min(n, int(max_run_length))
        # max_r == 0 would leave no run-length state to update (all-zero
        # posterior); a negative value would fail inside np.zeros.
        if max_r < 1:
            raise ValueError(f"max_run_length must be >= 1; got {max_run_length}")

    if mu_0 is None:
        mu_0 = float(np.mean(y))
    if beta_0 is None:
        # Weakly informative: encode the data's order-of-magnitude
        # variance so the t-distribution scale is sensible.
        if alpha_0 > 0.5:
            beta_0 = float(np.var(y, ddof=1)) * (alpha_0 - 0.5)
        else:
            beta_0 = 0.01
        beta_0 = max(beta_0, 1e-6)
    if beta_0 <= 0:
        raise ValueError(f"beta_0 must be positive; got {beta_0}")

    # Row 0 is the prior (all mass on run length 0); row t+1 is the
    # posterior after observation t. Column r is run length r.
    posterior = np.zeros((n + 1, max_r + 1))
    posterior[0, 0] = 1.0
    # Per-run-length sufficient statistics; slot 0 always holds the prior.
    mu_vec = np.array([mu_0], dtype=float)
    kappa_vec = np.array([kappa_0], dtype=float)
    alpha_vec = np.array([alpha_0], dtype=float)
    beta_vec = np.array([beta_0], dtype=float)

    for t in range(n):
        x = y[t]

        # Truncate state at max_r entries; the mass of the dropped longest
        # run is discarded and absorbed by the renormalisation below.
        if len(mu_vec) > max_r:
            mu_vec = mu_vec[:max_r]
            kappa_vec = kappa_vec[:max_r]
            alpha_vec = alpha_vec[:max_r]
            beta_vec = beta_vec[:max_r]
        n_r = len(mu_vec)  # 1 <= n_r <= max_r

        # 1. Predictive probability for x under each run length.
        scale_vec = np.sqrt(beta_vec * (kappa_vec + 1.0) / (alpha_vec * kappa_vec))
        df_vec = 2.0 * alpha_vec
        pi_vec = student_t.pdf(x, df=df_vec, loc=mu_vec, scale=scale_vec)

        # Mass of each current run length weighted by how well it predicts x.
        joint = posterior[t, :n_r] * pi_vec
        # 2. Growth mass (no changepoint) — shifts up by one r.
        growth = joint * (1.0 - hazard)
        # 3. Changepoint mass (everything collapses to r=0).
        cp_mass = float((joint * hazard).sum())

        posterior[t + 1, 0] = cp_mass
        posterior[t + 1, 1 : n_r + 1] = growth
        total = posterior[t + 1, :].sum()
        # If every predictive density underflowed to zero the row is left
        # unnormalised (all zeros) rather than dividing by zero.
        if total > 0:
            posterior[t + 1, :] /= total

        # 4. Update sufficient statistics — shift up with prior in slot 0.
        new_mu = (kappa_vec * mu_vec + x) / (kappa_vec + 1.0)
        new_kappa = kappa_vec + 1.0
        new_alpha = alpha_vec + 0.5
        new_beta = beta_vec + (kappa_vec * (x - mu_vec) ** 2) / (
            2.0 * (kappa_vec + 1.0)
        )
        mu_vec = np.concatenate([[mu_0], new_mu])
        kappa_vec = np.concatenate([[kappa_0], new_kappa])
        alpha_vec = np.concatenate([[alpha_0], new_alpha])
        beta_vec = np.concatenate([[beta_0], new_beta])

    # Drop the prior row so row i lines up with observation i.
    posterior_obs = posterior[1:, : max_r + 1]
    map_r = posterior_obs.argmax(axis=1).astype(int)
    # P(r < cp_threshold); with cp_threshold == 1 this is exactly column 0.
    cp_prob = posterior_obs[:, :cp_threshold].sum(axis=1)

    return BocpdResult(
        series=series,
        run_length_posterior=posterior_obs,
        map_run_length=pd.Series(map_r, index=series.index, name="map_run_length"),
        cp_probability=pd.Series(
            cp_prob, index=series.index, name="cp_probability"
        ),
        hazard=hazard,
        params={
            "mu_0": mu_0,
            "kappa_0": kappa_0,
            "alpha_0": alpha_0,
            "beta_0": beta_0,
            "max_run_length": max_r,
            "cp_threshold": cp_threshold,
        },
    )
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""Bayesian counterfactual causal impact (Brodersen et al. 2015).
|
|
2
|
+
|
|
3
|
+
The :func:`interrupted_time_series` module answers "is there a step
|
|
4
|
+
discontinuity at this known event?" via segmented OLS. The harder —
|
|
5
|
+
and more interesting — question is "what *would* the trajectory have
|
|
6
|
+
looked like *without* the event?" That's the counterfactual, and the
|
|
7
|
+
gap between observed reality and counterfactual prediction is the
|
|
8
|
+
causal effect of the event.
|
|
9
|
+
|
|
10
|
+
Method: Bayesian structural time series (BSTS) — a state-space model
|
|
11
|
+
with a local linear trend fit on the pre-event window, then projected
|
|
12
|
+
forward as the counterfactual for the post-event window. Implemented
|
|
13
|
+
via :class:`statsmodels.tsa.UnobservedComponents`. The credible
|
|
14
|
+
intervals on the pointwise and cumulative effects come from Monte
|
|
15
|
+
Carlo simulation against the joint posterior of the state-space
|
|
16
|
+
filter — anchored at the end of the pre-event training data and rolled
|
|
17
|
+
forward through the post-event horizon.
|
|
18
|
+
|
|
19
|
+
Reference
|
|
20
|
+
---------
|
|
21
|
+
Brodersen, K. H., Gallusser, F., Koehler, J., Remy, N., & Scott, S. L.
|
|
22
|
+
(2015). Inferring causal impact using Bayesian structural time-series
|
|
23
|
+
models. *Annals of Applied Statistics*, 9(1), 247-274.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import TYPE_CHECKING, Any
|
|
31
|
+
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
from ..results import _table_to_html, _table_to_json
|
|
36
|
+
|
|
37
|
+
if TYPE_CHECKING:
|
|
38
|
+
import altair as alt
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True)
class CausalImpactResult:
    """Immutable result of a counterfactual causal-impact analysis.

    ``table`` has one row per post-event period with the observed value,
    the counterfactual forecast and its band, and the pointwise and
    cumulative effects with their bands.

    ``metrics`` carries the summary scalars read by :meth:`summary`:
    ``avg_effect`` (plus ``_lower`` / ``_upper``), ``cumulative_effect``,
    ``relative_effect`` and ``posterior_prob_no_effect``.
    """

    # Name of the analysed series.
    target: str
    # Timestamp at which the intervention is placed.
    event_date: pd.Timestamp
    # Per-period results table.
    table: pd.DataFrame
    # Summary scalars keyed by metric name.
    metrics: dict[str, float] = field(default_factory=dict)
    # Credible-interval level used for every band.
    level: float = 0.95
    # Number of observations before / from the event onwards.
    n_pre: int = 0
    n_post: int = 0
    # Settings the analysis was run with.
    params: dict[str, Any] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the per-period table."""
        duplicate = self.table.copy()
        return duplicate

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the table through the shared HTML table helper."""
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the table through the shared JSON table helper."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Delegate to :func:`causal_impact_plot`, forwarding keyword arguments."""
        # Imported lazily so the plotting module is only loaded on demand.
        from ..viz.causal_impact import causal_impact_plot

        return causal_impact_plot(self, **kw)

    def summary(self) -> str:
        """Multi-line text report of the headline metrics.

        Metrics missing from ``metrics`` are shown as NaN; a NaN relative
        effect is reported as not available.
        """
        import math

        missing = float("nan")
        lookup = self.metrics.get
        avg = lookup("avg_effect", missing)
        cum = lookup("cumulative_effect", missing)
        rel = lookup("relative_effect", missing)
        ci_lo = lookup("avg_effect_lower", missing)
        ci_hi = lookup("avg_effect_upper", missing)
        p = lookup("posterior_prob_no_effect", missing)

        if math.isnan(rel):
            rel_str = "n/a (counterfactual ≈ 0)"
        else:
            rel_str = f"{rel * 100:+.1f}%"

        report_lines = [
            f"CausalImpactResult(target={self.target!r}, "
            f"event={self.event_date.date()}, pre={self.n_pre}, post={self.n_post})",
            f" avg effect: {avg:+.4f} per period "
            f"({int(self.level * 100)}% CrI [{ci_lo:+.4f}, {ci_hi:+.4f}])",
            f" cumulative effect: {cum:+.4f}",
            f" relative effect: {rel_str} vs counterfactual mean",
            f" P(no effect): {p:.3f}",
        ]
        return "\n".join(report_lines)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def causal_impact(
    series: pd.Series,
    event_date: str | pd.Period | pd.Timestamp,
    *,
    level: float = 0.95,
    n_samples: int = 1000,
    seed: int | None = 0,
    model: str = "local linear trend",
) -> CausalImpactResult:
    """Counterfactual causal impact of an event on a time series.

    A :class:`statsmodels.tsa.UnobservedComponents` model is fitted on the
    observations strictly before ``event_date`` and forecast across the
    remaining periods; that forecast is the counterfactual. Observed minus
    counterfactual is the effect. Bands on the pointwise, cumulative and
    average effects are empirical quantiles over ``n_samples`` paths
    simulated from the fitted model, anchored at the end of the pre-event
    data.

    Parameters
    ----------
    series
        Period-indexed time series of values (e.g. relative
        frequencies from :meth:`TemporalTrajectory.over_time`). A
        ``PeriodIndex`` is converted to timestamps internally.
    event_date
        Where to place the intervention. Anything pandas can compare
        to the series index.
    level
        Credible-interval level. ``0.95`` → 95% CrI on every band.
    n_samples
        Number of Monte Carlo paths to simulate for the pointwise and
        cumulative bands. Must be at least 100.
    seed
        RNG seed for reproducibility; forwarded as ``random_state``.
    model
        Trend specification passed to
        :class:`statsmodels.tsa.UnobservedComponents` — usually
        ``"local linear trend"`` (level + slope) or ``"local level"``.

    Returns
    -------
    :class:`CausalImpactResult`
        ``target`` is left empty for the caller to fill in.

    Raises
    ------
    ImportError
        If statsmodels is not installed.
    ValueError
        If ``level`` is outside (0, 1), ``n_samples`` is below 100, fewer
        than 4 observations precede the event, or none follow it.
    """
    try:
        import statsmodels.api as sm
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "causal_impact requires statsmodels. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    if not 0 < level < 1:
        raise ValueError(f"level must be in (0, 1); got {level}")
    if n_samples < 100:
        raise ValueError(f"n_samples must be >= 100; got {n_samples}")

    # Work on a timestamp-indexed copy so the state-space model can use
    # the inferred frequency when extending the index forward.
    work = series.copy()
    if isinstance(work.index, pd.PeriodIndex):
        work.index = work.index.to_timestamp()
    freq_guess = getattr(work.index, "inferred_freq", None)
    if freq_guess is not None and isinstance(work.index, pd.DatetimeIndex):
        # Build a new frequency-tagged index instead of assigning to the
        # read-only ``.freq`` attribute; statsmodels reads the frequency
        # from here for the forecast horizon.
        work.index = pd.DatetimeIndex(work.index, freq=freq_guess)

    event_ts = _coerce_event(event_date, work)

    # Split: strictly-before the event is training data, the rest is post.
    pre_mask = np.asarray(work.index < event_ts)
    post_mask = ~pre_mask
    pre_count = int(pre_mask.sum())
    post_count = int(post_mask.sum())
    if pre_count < 4:
        raise ValueError(
            f"need at least 4 pre-event observations; got {pre_count} "
            f"(event_date={event_date!r})"
        )
    if post_count < 1:
        raise ValueError(
            f"need at least 1 post-event observation; event_date={event_date!r} "
            "is at or after the last period"
        )

    pre = work[pre_mask]
    post = work[post_mask]
    horizon = len(post)

    # Fit the structural model on the pre-event window only.
    fitted = sm.tsa.UnobservedComponents(pre, level=model).fit(disp=False)

    # Counterfactual point forecast with its marginal interval.
    forecast = fitted.get_forecast(steps=horizon)
    cf_mean = np.asarray(forecast.predicted_mean, dtype=float)
    cf_band = np.asarray(forecast.conf_int(alpha=1.0 - level), dtype=float)
    cf_lower = cf_band[:, 0]
    cf_upper = cf_band[:, 1]

    # Simulated counterfactual paths, anchored at the end of the
    # pre-event data, for the pointwise and cumulative effect bands.
    simulated = fitted.simulate(
        nsimulations=horizon,
        repetitions=int(n_samples),
        anchor="end",
        random_state=seed,
    )
    if isinstance(simulated, pd.DataFrame):
        paths = simulated.to_numpy()
    else:
        paths = np.asarray(simulated)
    # statsmodels returns shape (h, repetitions); transpose to (N, h).
    if paths.shape == (horizon, n_samples):
        paths = paths.T

    observed = post.to_numpy(dtype=float)
    effect_paths = observed[None, :] - paths
    cumulative_paths = np.cumsum(effect_paths, axis=1)

    tail = 1.0 - level
    q_low = tail / 2.0
    q_high = 1.0 - tail / 2.0
    pw_mean = observed - cf_mean
    pw_lower = np.quantile(effect_paths, q_low, axis=0)
    pw_upper = np.quantile(effect_paths, q_high, axis=0)
    cum_mean = cumulative_paths.mean(axis=0)
    cum_lower = np.quantile(cumulative_paths, q_low, axis=0)
    cum_upper = np.quantile(cumulative_paths, q_high, axis=0)

    # Summary scalars — the average effect is taken over the post window.
    per_path_avg = effect_paths.mean(axis=1)
    avg_effect = float(pw_mean.mean())
    avg_lower = float(np.quantile(per_path_avg, q_low))
    avg_upper = float(np.quantile(per_path_avg, q_high))
    cf_mean_avg = float(cf_mean.mean())
    if abs(cf_mean_avg) > 1e-12:
        relative_effect = avg_effect / cf_mean_avg
    else:
        # A near-zero counterfactual mean makes the ratio meaningless.
        relative_effect = float("nan")

    # Probability of no effect: share of simulated paths whose average
    # effect lies on the opposite side of zero from the point estimate,
    # doubled (two-tailed) and clipped to [0, 1].
    if avg_effect >= 0:
        opposite_share = float((per_path_avg <= 0).mean())
    else:
        opposite_share = float((per_path_avg >= 0).mean())
    p_no_effect = min(1.0, max(0.0, opposite_share * 2.0))

    # One row per post-event period. Report periods from the caller's own
    # index when it was a PeriodIndex so they get Period objects back.
    periods_out: pd.Index
    if isinstance(series.index, pd.PeriodIndex):
        periods_out = series.index[post_mask]
    else:
        periods_out = work.index[post_mask]

    table = pd.DataFrame(
        {
            "period": periods_out,
            "observed": observed,
            "counterfactual": cf_mean,
            "counterfactual_lower": cf_lower,
            "counterfactual_upper": cf_upper,
            "pointwise_effect": pw_mean,
            "pointwise_lower": pw_lower,
            "pointwise_upper": pw_upper,
            "cumulative_effect": cum_mean,
            "cumulative_lower": cum_lower,
            "cumulative_upper": cum_upper,
        }
    )

    metrics = {
        "avg_effect": avg_effect,
        "avg_effect_lower": avg_lower,
        "avg_effect_upper": avg_upper,
        "cumulative_effect": float(cum_mean[-1]),
        "cumulative_effect_lower": float(cum_lower[-1]),
        "cumulative_effect_upper": float(cum_upper[-1]),
        "relative_effect": relative_effect,
        "counterfactual_mean": cf_mean_avg,
        "posterior_prob_no_effect": p_no_effect,
    }
    return CausalImpactResult(
        target="",  # filled in by the caller — TemporalTrajectory knows the name
        event_date=event_ts,
        table=table,
        metrics=metrics,
        level=level,
        n_pre=pre_count,
        n_post=post_count,
        params={"model": model, "n_samples": n_samples, "seed": seed},
    )
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _coerce_event(event: object, series: pd.Series) -> pd.Timestamp:
|
|
288
|
+
"""Coerce an event-date argument to a Timestamp aligned with the series."""
|
|
289
|
+
if isinstance(event, pd.Timestamp):
|
|
290
|
+
return event
|
|
291
|
+
if isinstance(event, pd.Period):
|
|
292
|
+
return pd.Timestamp(str(event))
|
|
293
|
+
return pd.Timestamp(str(event))
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Changepoint detection on temporal frequency / similarity series.
|
|
2
|
+
|
|
3
|
+
Wraps the ``ruptures`` library; importable only when the ``temporal``
|
|
4
|
+
extra is installed.
|
|
5
|
+
|
|
6
|
+
References
|
|
7
|
+
----------
|
|
8
|
+
Truong, C., Oudre, L., & Vayatis, N. (2020). Selective review of
|
|
9
|
+
offline change point detection methods. *Signal Processing*, 167,
|
|
10
|
+
107299.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Literal
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
ChangepointMethod = Literal["pelt", "binseg", "window"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def detect_changepoints(
    series: pd.Series,
    method: ChangepointMethod = "pelt",
    penalty: float | None = None,
    model: str = "rbf",
) -> pd.DataFrame:
    """Locate changepoints in a 1-D value series using ``ruptures``.

    Parameters
    ----------
    series
        A 1-D series indexed by period (or any orderable). The index
        value at each breakpoint is reported back, so results stay in
        the caller's own time vocabulary.
    method
        Search algorithm: ``"pelt"`` (default), ``"binseg"``, or
        ``"window"``, mapped to ``ruptures.Pelt``, ``ruptures.Binseg``
        and ``ruptures.Window`` respectively.
    penalty
        Penalty passed to the algorithm's ``predict``; larger values give
        fewer changepoints. ``None`` uses ``log(n)`` where ``n`` is the
        number of observations.
    model
        Cost-function name handed to ruptures, e.g. ``"rbf"`` (default),
        ``"l1"``, ``"l2"``, ``"normal"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``period`` (index value at the breakpoint), ``index``
        (positional index), and ``method`` (echo of the chosen method).

    Raises
    ------
    ImportError
        If ruptures is not installed.
    ValueError
        If the series is not 1-D, has fewer than 4 observations, contains
        NaN, or ``method`` is not one of the supported names.
    """
    try:
        import ruptures as rpt
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "detect_changepoints requires ruptures. "
            "Install with: pip install 'pycorpdiff[temporal]'"
        ) from exc

    values = np.asarray(series.to_numpy(dtype=float))
    if values.ndim != 1:
        raise ValueError("series must be 1-D")
    n_obs = len(values)
    if n_obs < 4:
        raise ValueError(f"need at least 4 observations; got {n_obs}")
    if np.isnan(values).any():
        raise ValueError("series contains NaN values; impute or drop them first")

    if penalty is None:
        pen = float(np.log(n_obs))
    else:
        pen = float(penalty)

    # Pick the search class by name; the for/else raises when none match.
    candidates = (
        ("pelt", rpt.Pelt),
        ("binseg", rpt.Binseg),
        ("window", rpt.Window),
    )
    for label, factory in candidates:
        if method == label:
            algo = factory(model=model)
            break
    else:  # pragma: no cover - typing makes this unreachable
        raise ValueError(f"unknown method={method!r}")

    algo.fit(values.reshape(-1, 1))

    # ruptures reports segment end positions and includes len(values) as
    # the final boundary; keep only the interior ones.
    rows = []
    for bp in algo.predict(pen=pen):
        if bp < n_obs:
            rows.append({"period": series.index[bp], "index": bp, "method": method})
    return pd.DataFrame(rows, columns=["period", "index", "method"])
|