panelkit 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {panelkit-0.2.2 → panelkit-0.2.4}/Cargo.lock +5 -5
  2. {panelkit-0.2.2 → panelkit-0.2.4}/Cargo.toml +1 -1
  3. {panelkit-0.2.2 → panelkit-0.2.4}/GUIDE.md +37 -6
  4. {panelkit-0.2.2 → panelkit-0.2.4}/PKG-INFO +19 -2
  5. {panelkit-0.2.2 → panelkit-0.2.4}/README.md +18 -1
  6. {panelkit-0.2.2 → panelkit-0.2.4}/crates/geo/src/selection.rs +68 -20
  7. {panelkit-0.2.2 → panelkit-0.2.4}/crates/geo/tests/geo.rs +10 -0
  8. {panelkit-0.2.2 → panelkit-0.2.4}/crates/pypanelkit/src/api_geo.rs +3 -1
  9. {panelkit-0.2.2 → panelkit-0.2.4}/pyproject.toml +1 -1
  10. {panelkit-0.2.2 → panelkit-0.2.4}/python/panelkit/_panelkit.pyi +1 -0
  11. {panelkit-0.2.2 → panelkit-0.2.4}/python/panelkit/design.py +323 -53
  12. {panelkit-0.2.2 → panelkit-0.2.4}/BENCHMARKS.md +0 -0
  13. {panelkit-0.2.2 → panelkit-0.2.4}/LICENSE-APACHE +0 -0
  14. {panelkit-0.2.2 → panelkit-0.2.4}/LICENSE-MIT +0 -0
  15. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/Cargo.toml +0 -0
  16. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/benches/estimators.rs +0 -0
  17. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/did/bacon.rs +0 -0
  18. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/did/callaway.rs +0 -0
  19. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/did/mod.rs +0 -0
  20. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/did/sunab.rs +0 -0
  21. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/did/twfe.rs +0 -0
  22. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/fe/mod.rs +0 -0
  23. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/fe/within.rs +0 -0
  24. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/lib.rs +0 -0
  25. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/mcnnm/mod.rs +0 -0
  26. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/mcnnm/softimpute.rs +0 -0
  27. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/panel.rs +0 -0
  28. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/result.rs +0 -0
  29. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/sc/augmented.rs +0 -0
  30. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/sc/cpasc.rs +0 -0
  31. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/sc/mod.rs +0 -0
  32. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/sc/sdid.rs +0 -0
  33. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/src/sc/synthetic.rs +0 -0
  34. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/tests/cpasc.rs +0 -0
  35. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/tests/did.rs +0 -0
  36. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/tests/sc.rs +0 -0
  37. {panelkit-0.2.2 → panelkit-0.2.4}/crates/estimators/tests/sc_family.rs +0 -0
  38. {panelkit-0.2.2 → panelkit-0.2.4}/crates/geo/Cargo.toml +0 -0
  39. {panelkit-0.2.2 → panelkit-0.2.4}/crates/geo/src/diagnostics.rs +0 -0
  40. {panelkit-0.2.2 → panelkit-0.2.4}/crates/geo/src/lib.rs +0 -0
  41. {panelkit-0.2.2 → panelkit-0.2.4}/crates/geo/src/power.rs +0 -0
  42. {panelkit-0.2.2 → panelkit-0.2.4}/crates/geo/src/types.rs +0 -0
  43. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/Cargo.toml +0 -0
  44. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/src/batch.rs +0 -0
  45. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/src/bootstrap.rs +0 -0
  46. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/src/ci.rs +0 -0
  47. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/src/lib.rs +0 -0
  48. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/src/parallel.rs +0 -0
  49. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/src/placebo.rs +0 -0
  50. {panelkit-0.2.2 → panelkit-0.2.4}/crates/inference/tests/inference.rs +0 -0
  51. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/Cargo.toml +0 -0
  52. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/error.rs +0 -0
  53. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/factor/cholesky.rs +0 -0
  54. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/factor/eig_sym.rs +0 -0
  55. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/factor/mod.rs +0 -0
  56. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/factor/qr.rs +0 -0
  57. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/factor/randomized.rs +0 -0
  58. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/factor/svd.rs +0 -0
  59. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/factor/svd_gram.rs +0 -0
  60. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/lib.rs +0 -0
  61. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/matrix.rs +0 -0
  62. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/ops/matmul.rs +0 -0
  63. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/ops/mod.rs +0 -0
  64. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/ops/norms.rs +0 -0
  65. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/ops/transform.rs +0 -0
  66. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/opt/mod.rs +0 -0
  67. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/opt/simplex.rs +0 -0
  68. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/opt/softthresh.rs +0 -0
  69. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/rng.rs +0 -0
  70. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/solve/lstsq.rs +0 -0
  71. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/solve/mod.rs +0 -0
  72. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/src/solve/spd.rs +0 -0
  73. {panelkit-0.2.2 → panelkit-0.2.4}/crates/linalg/tests/numerics.rs +0 -0
  74. {panelkit-0.2.2 → panelkit-0.2.4}/crates/pypanelkit/Cargo.toml +0 -0
  75. {panelkit-0.2.2 → panelkit-0.2.4}/crates/pypanelkit/src/api_did.rs +0 -0
  76. {panelkit-0.2.2 → panelkit-0.2.4}/crates/pypanelkit/src/api_sc.rs +0 -0
  77. {panelkit-0.2.2 → panelkit-0.2.4}/crates/pypanelkit/src/convert.rs +0 -0
  78. {panelkit-0.2.2 → panelkit-0.2.4}/crates/pypanelkit/src/lib.rs +0 -0
  79. {panelkit-0.2.2 → panelkit-0.2.4}/crates/pypanelkit/src/results.rs +0 -0
  80. {panelkit-0.2.2 → panelkit-0.2.4}/python/panelkit/__init__.py +0 -0
  81. {panelkit-0.2.2 → panelkit-0.2.4}/python/panelkit/estimators.py +0 -0
  82. {panelkit-0.2.2 → panelkit-0.2.4}/python/panelkit/py.typed +0 -0
@@ -462,7 +462,7 @@ checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
462
462
 
463
463
  [[package]]
464
464
  name = "panelkit-estimators"
465
- version = "0.2.2"
465
+ version = "0.2.4"
466
466
  dependencies = [
467
467
  "criterion",
468
468
  "panelkit-linalg",
@@ -471,7 +471,7 @@ dependencies = [
471
471
 
472
472
  [[package]]
473
473
  name = "panelkit-geo"
474
- version = "0.2.2"
474
+ version = "0.2.4"
475
475
  dependencies = [
476
476
  "panelkit-estimators",
477
477
  "panelkit-inference",
@@ -482,7 +482,7 @@ dependencies = [
482
482
 
483
483
  [[package]]
484
484
  name = "panelkit-inference"
485
- version = "0.2.2"
485
+ version = "0.2.4"
486
486
  dependencies = [
487
487
  "panelkit-estimators",
488
488
  "panelkit-linalg",
@@ -491,7 +491,7 @@ dependencies = [
491
491
 
492
492
  [[package]]
493
493
  name = "panelkit-linalg"
494
- version = "0.2.2"
494
+ version = "0.2.4"
495
495
  dependencies = [
496
496
  "proptest",
497
497
  "rayon",
@@ -623,7 +623,7 @@ dependencies = [
623
623
 
624
624
  [[package]]
625
625
  name = "pypanelkit"
626
- version = "0.2.2"
626
+ version = "0.2.4"
627
627
  dependencies = [
628
628
  "numpy",
629
629
  "panelkit-estimators",
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/linalg", "crates/estimators", "crates/inference", "crates/geo", "crates/pypanelkit"]
4
4
 
5
5
  [workspace.package]
6
- version = "0.2.2"
6
+ version = "0.2.4"
7
7
  edition = "2021"
8
8
  rust-version = "1.74"
9
9
  license = "MIT OR Apache-2.0"
@@ -294,20 +294,41 @@ weighted-average **ensemble** estimate.
294
294
 
295
295
  ```python
296
296
  ev = design.evaluate(treated=["chicago", "denver"], treat_start=52, level=0.90)
297
- print(ev.summary()) # per-method + ensemble lift, CI, cumulative
298
- ev.plot("evaluate.png") # observed-vs-counterfactual, effect path, lift bar
297
+ print(ev.summary()) # per-method + ensemble lift, CI, cumulative
298
+ ev.plot("evaluate.png") # observed-vs-cf, effect path (CI band), lift bar
299
+ ev.plot_effect_over_time("effect.png") # pointwise + cumulative over time, w/ CIs
299
300
  ev.lift, ev.cumulative, ev.significant
300
301
  ```
301
302
 
302
- Each estimate gets a confidence interval from a **stationary block bootstrap** of
303
- its post-period effect path; an **SC in-space placebo** supplies a p-value. The
304
- ensemble uses the same `weights` choices as `power()` (`"auto"` = inverse-variance
305
- from each method's bootstrap SE, `"equal"`, or an explicit dict/list). `ev` exposes
303
+ Inference is **in-space placebo** (Abadie): every donor market is refit as if it
304
+ were the treated one, and the spread of *their* post-period effects is the null
305
+ reference capturing out-of-sample extrapolation error, the real source of
306
+ uncertainty. (A bootstrap of the treated unit's own post-period only sees
307
+ in-sample noise and is wildly anti-conservative — on null data its 90% interval
308
+ falsely flags an effect ~50% of the time; the placebo version sits at/below the
309
+ nominal 10%.) Poorly-fit placebos (pre-period RMSPE > 2× the treated unit's) are
310
+ dropped, per Abadie. The p-value is the placebo rank of the treated effect, and
311
+ `"auto"` ensemble weights are inverse-variance from each method's placebo-null
312
+ spread. `ev` exposes
306
313
  `.lift`, `.att`, `.cumulative`, `.significant`, the per-method results in `ev.per`,
307
314
  and the ensemble in `ev.ensemble`. Reported numbers: **% lift** (effect ÷
308
315
  counterfactual), **per-period ATT**, and **cumulative incremental** over the
309
316
  window (summed across treated markets).
310
317
 
318
+ **Effect over time** (`ev.plot_effect_over_time(...)`) gives the event-study view:
319
+ the **pointwise** effect across the full timeline — *including the pre-period*, so
320
+ you can see it sits flat (centered on zero) inside the noise band before the test
321
+ starts (a placebo check) and breaks out after — and the running **cumulative
322
+ incremental**, each as a point estimate with a confidence band. The counterfactual
323
+ is centered on the pre-period, so the gap shows fit quality rather than a level
324
+ offset (SDID matches trends, not levels). The bands come from the **in-space
325
+ placebo** distribution: at each horizon, the pointwise band is the spread of the
326
+ donor placebos' per-period effects, and the cumulative band is the spread of their
327
+ cumulative sums (so it fans out with horizon). Placebo inference needs a decent
328
+ donor pool to have power — with only a handful of comparable donors the intervals
329
+ are necessarily wide. Pass `exclude=[…]` to drop markets from the control pool
330
+ (e.g. ones you don't trust as donors).
331
+
311
332
  ### Choosing a specification — `design.recommend(test_lengths, n_geos_options, target_lift, alphas=…)`
312
333
 
313
334
  Sweeps designs across **test length × number of geos × alpha** and recommends the
@@ -345,6 +366,16 @@ Searches candidate treatment-market sets and ranks them by power, MDE, pre-fit,
345
366
  holdout, and confidence. Pass `eligible=[…]` to restrict to markets you can
346
367
  actually run in.
347
368
 
369
+ Two real-world controls for *which* markets the search may use:
370
+
371
+ - **`include=[…]`** — force specific markets into **every** candidate treatment
372
+ set (must-treat markets, e.g. a flagship region you've already committed to).
373
+ The search fills the remaining slots from `eligible`, up to `max_treated`.
374
+ - **`exclude=[…]`** — drop markets **entirely**: they're never treated *and*
375
+ never used as a donor/control (e.g. a market with contaminated data or its own
376
+ concurrent campaign). `exclude` is also accepted by `power()`, `diagnose()`,
377
+ `evaluate()`, and `recommend()` to keep a market out of the control pool.
378
+
348
379
  ### Multi-cell tests — `design.multi_cell(cells, test_len, …)`
349
380
 
350
381
  Often you run several treatment cells at once — different creatives, budgets, or
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: panelkit
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Topic :: Scientific/Engineering
@@ -231,10 +231,15 @@ mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
231
231
  print(mc.summary()) # per-cell MDE / confidence / holdout
232
232
  mc.plot("multicell.png") # the multi-cell figure below
233
233
 
234
+ # pin in must-have markets, drop ones you don't trust:
235
+ ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3,
236
+ include=["chicago"], exclude=["miami"])
237
+
234
238
  # already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
235
239
  ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
236
240
  print(ev.summary()) # per-method + ensemble lift, CI, cumulative
237
241
  ev.plot("evaluate.png") # observed vs counterfactual + lift-by-method
242
+ ev.plot_effect_over_time("effect.png") # pointwise + cumulative over time, w/ CIs
238
243
 
239
244
  # or sweep specifications (length × #geos × significance) and recommend one:
240
245
  grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
@@ -268,11 +273,23 @@ per-cell MDE/confidence/holdout report and a combined figure:
268
273
  **Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
269
274
  the power analysis: fit SC / ASC / SDID on a test that already happened, blend
270
275
  them into a weighted-average **ensemble** estimate, and report each one's lift,
271
- confidence interval (stationary block bootstrap), and cumulative incremental —
276
+ confidence interval (in-space placebo), and cumulative incremental —
272
277
  with an SC in-space placebo p-value:
273
278
 
274
279
  ![test evaluation](assets/geo_evaluate.png)
275
280
 
281
+ And the **effect over time** — the pointwise effect across the full timeline
282
+ (pre-period included, so you can see it sit flat in the noise band before the test
283
+ and break out after) plus the running cumulative incremental, each as a point
284
+ estimate with a confidence band:
285
+
286
+ ![effect over time](assets/geo_effect_over_time.png)
287
+
288
+ **Pin in / drop markets.** `select_markets`/`recommend` take `include=[…]`
289
+ (force must-treat markets into every candidate) and `exclude=[…]` (drop markets
290
+ entirely — never treated, never a control). `exclude` is also accepted by
291
+ `power`, `diagnose`, and `evaluate` to keep a market out of the donor pool.
292
+
276
293
  **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
277
294
  strings → numeric (with a clear error on genuinely non-numeric values), dates
278
295
  (string or unsorted) → chronological columns, locations → market names, duplicate
@@ -201,10 +201,15 @@ mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
201
201
  print(mc.summary()) # per-cell MDE / confidence / holdout
202
202
  mc.plot("multicell.png") # the multi-cell figure below
203
203
 
204
+ # pin in must-have markets, drop ones you don't trust:
205
+ ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3,
206
+ include=["chicago"], exclude=["miami"])
207
+
204
208
  # already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
205
209
  ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
206
210
  print(ev.summary()) # per-method + ensemble lift, CI, cumulative
207
211
  ev.plot("evaluate.png") # observed vs counterfactual + lift-by-method
212
+ ev.plot_effect_over_time("effect.png") # pointwise + cumulative over time, w/ CIs
208
213
 
209
214
  # or sweep specifications (length × #geos × significance) and recommend one:
210
215
  grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
@@ -238,11 +243,23 @@ per-cell MDE/confidence/holdout report and a combined figure:
238
243
  **Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
239
244
  the power analysis: fit SC / ASC / SDID on a test that already happened, blend
240
245
  them into a weighted-average **ensemble** estimate, and report each one's lift,
241
- confidence interval (stationary block bootstrap), and cumulative incremental —
246
+ confidence interval (in-space placebo), and cumulative incremental —
242
247
  with an SC in-space placebo p-value:
243
248
 
244
249
  ![test evaluation](assets/geo_evaluate.png)
245
250
 
251
+ And the **effect over time** — the pointwise effect across the full timeline
252
+ (pre-period included, so you can see it sit flat in the noise band before the test
253
+ and break out after) plus the running cumulative incremental, each as a point
254
+ estimate with a confidence band:
255
+
256
+ ![effect over time](assets/geo_effect_over_time.png)
257
+
258
+ **Pin in / drop markets.** `select_markets`/`recommend` take `include=[…]`
259
+ (force must-treat markets into every candidate) and `exclude=[…]` (drop markets
260
+ entirely — never treated, never a control). `exclude` is also accepted by
261
+ `power`, `diagnose`, and `evaluate` to keep a market out of the donor pool.
262
+
246
263
  **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
247
264
  strings → numeric (with a clear error on genuinely non-numeric values), dates
248
265
  (string or unsorted) → chronological columns, locations → market names, duplicate
@@ -34,7 +34,10 @@ pub struct MarketCandidate {
34
34
  pub struct SelectConfig {
35
35
  /// Units eligible to be treated (e.g. markets you could actually run in).
36
36
  pub eligible: Vec<usize>,
37
- /// Maximum number of treated markets in a candidate set.
37
+ /// Units **forced into every** candidate treatment set (must-treat markets).
38
+ /// The search fills the remaining slots from `eligible`. Empty = no forcing.
39
+ pub include: Vec<usize>,
40
+ /// Maximum number of treated markets in a candidate set (counts `include`).
38
41
  pub max_treated: usize,
39
42
  pub test_len: usize,
40
43
  /// The lift you care about detecting (fraction, e.g. 0.05 = 5%).
@@ -95,46 +98,91 @@ pub fn evaluate(y: &Mat, treated: &[usize], cfg: &SelectConfig) -> MarketCandida
95
98
  }
96
99
  }
97
100
 
98
- /// Build the candidate list. With `exact_size = Some(k)`, every candidate has
99
- /// exactly `k` markets; otherwise it's every singleton plus sampled subsets of
100
- /// size 2..=max_treated.
101
+ /// Build the candidate list. Every candidate always contains the forced
102
+ /// `include` markets; the remaining slots are drawn from `eligible` (minus the
103
+ /// forced ones). With `exact_size = Some(k)`, every candidate has exactly `k`
104
+ /// markets total; otherwise it's the forced set alone (if non-empty), the forced set
105
+ /// plus each single extra market, and sampled larger sets of up to `max_treated`.
101
106
  fn candidate_sets(cfg: &SelectConfig) -> Vec<Vec<usize>> {
102
107
  let mut rng = Xoshiro256pp::seed_from_u64(cfg.seed);
103
- let mut seen = std::collections::HashSet::new();
108
+ let mut seen: std::collections::HashSet<Vec<usize>> = std::collections::HashSet::new();
104
109
  let mut sets: Vec<Vec<usize>> = Vec::new();
105
110
 
106
- if let Some(k) = cfg.exact_size {
107
- let k = k.min(cfg.eligible.len()).max(1);
108
- if k == 1 {
109
- return cfg.eligible.iter().map(|&u| vec![u]).collect();
111
+ // Forced (must-treat) markets, de-duplicated, and the pool of extra picks.
112
+ let mut forced: Vec<usize> = cfg.include.clone();
113
+ forced.sort_unstable();
114
+ forced.dedup();
115
+ let forced_set: std::collections::HashSet<usize> = forced.iter().copied().collect();
116
+ let extra_pool: Vec<usize> = cfg
117
+ .eligible
118
+ .iter()
119
+ .copied()
120
+ .filter(|u| !forced_set.contains(u))
121
+ .collect();
122
+
123
+ if let Some(k0) = cfg.exact_size {
124
+ let k = k0.max(1);
125
+ let need = k.saturating_sub(forced.len());
126
+ if need == 0 {
127
+ // The forced set already fills the requested size.
128
+ if !forced.is_empty() {
129
+ sets.push(forced.clone());
130
+ }
131
+ return sets;
132
+ }
133
+ if need == 1 {
134
+ // Deterministic: forced + each eligible single (preserves the old
135
+ // "all singletons" behavior when nothing is forced and k == 1).
136
+ for &u in &extra_pool {
137
+ let mut pick = forced.clone();
138
+ pick.push(u);
139
+ pick.sort_unstable();
140
+ if seen.insert(pick.clone()) {
141
+ sets.push(pick);
142
+ }
143
+ }
144
+ return sets;
110
145
  }
111
146
  let mut attempts = 0;
112
147
  while sets.len() < cfg.n_candidates && attempts < cfg.n_candidates * 40 {
113
148
  attempts += 1;
114
- let mut pool = cfg.eligible.clone();
149
+ let mut pool = extra_pool.clone();
115
150
  rng.shuffle(&mut pool);
116
- let mut pick: Vec<usize> = pool.into_iter().take(k).collect();
151
+ let mut pick: Vec<usize> = forced.clone();
152
+ pick.extend(pool.into_iter().take(need));
117
153
  pick.sort_unstable();
118
- if seen.insert(pick.clone()) {
154
+ if pick.len() == k && seen.insert(pick.clone()) {
119
155
  sets.push(pick);
120
156
  }
121
157
  }
122
158
  return sets;
123
159
  }
124
160
 
125
- // Mixed-size search: all singletons + sampled subsets of size 2..=max_treated.
126
- sets = cfg.eligible.iter().map(|&u| vec![u]).collect();
127
- if cfg.max_treated >= 2 && cfg.eligible.len() >= 2 {
128
- for s in &sets {
129
- seen.insert(s.clone());
161
+ // Mixed-size search. Extra slots available on top of the forced set.
162
+ let budget = cfg.max_treated.saturating_sub(forced.len());
163
+ if !forced.is_empty() {
164
+ seen.insert(forced.clone());
165
+ sets.push(forced.clone());
166
+ }
167
+ if budget >= 1 {
168
+ for &u in &extra_pool {
169
+ let mut pick = forced.clone();
170
+ pick.push(u);
171
+ pick.sort_unstable();
172
+ if seen.insert(pick.clone()) {
173
+ sets.push(pick);
174
+ }
130
175
  }
176
+ }
177
+ if budget >= 2 && extra_pool.len() >= 2 {
131
178
  let mut attempts = 0;
132
179
  while sets.len() < cfg.n_candidates && attempts < cfg.n_candidates * 20 {
133
180
  attempts += 1;
134
- let size = 2 + rng.gen_range(cfg.max_treated - 1); // 2..=max_treated
135
- let mut pool = cfg.eligible.clone();
181
+ let extra = 2 + rng.gen_range(budget - 1); // 2..=budget extra markets
182
+ let mut pool = extra_pool.clone();
136
183
  rng.shuffle(&mut pool);
137
- let mut pick: Vec<usize> = pool.into_iter().take(size).collect();
184
+ let mut pick: Vec<usize> = forced.clone();
185
+ pick.extend(pool.into_iter().take(extra));
138
186
  pick.sort_unstable();
139
187
  if seen.insert(pick.clone()) {
140
188
  sets.push(pick);
@@ -114,6 +114,7 @@ fn market_selection_ranks_candidates() {
114
114
  let y = geo_panel(12, 60, 5);
115
115
  let cfg = SelectConfig {
116
116
  eligible: (0..12).collect(),
117
+ include: vec![],
117
118
  max_treated: 3,
118
119
  test_len: 10,
119
120
  target_lift: 0.10,
@@ -139,6 +140,15 @@ fn market_selection_ranks_candidates() {
139
140
  };
140
141
  let ranked2 = select_markets(&y, &cfg2);
141
142
  assert!(ranked2.iter().all(|c| c.treated.len() == 2));
143
+ // include: market 5 is forced into every candidate set.
144
+ let cfg3 = SelectConfig {
145
+ include: vec![5],
146
+ ..cfg.clone()
147
+ };
148
+ let ranked3 = select_markets(&y, &cfg3);
149
+ assert!(!ranked3.is_empty());
150
+ assert!(ranked3.iter().all(|c| c.treated.contains(&5)));
151
+ assert!(ranked3.iter().all(|c| c.treated.len() <= 3));
142
152
  // Every candidate has a valid holdout and confidence.
143
153
  for c in &ranked {
144
154
  assert!(c.holdout_pct > 0.0 && c.holdout_pct < 1.0);
@@ -169,7 +169,7 @@ pub fn geo_diagnostics(
169
169
 
170
170
  /// Search and rank candidate treatment-market sets.
171
171
  #[pyfunction]
172
- #[pyo3(signature = (y, eligible, max_treated, test_len, target_lift, method="sdid", alpha=0.1, target_power=0.8, min_pre=0, n_candidates=200, seed=0, exact_size=None, lookback=None))]
172
+ #[pyo3(signature = (y, eligible, max_treated, test_len, target_lift, method="sdid", alpha=0.1, target_power=0.8, min_pre=0, n_candidates=200, seed=0, exact_size=None, lookback=None, include=None))]
173
173
  #[allow(clippy::too_many_arguments)]
174
174
  pub fn geo_select(
175
175
  py: Python<'_>,
@@ -186,6 +186,7 @@ pub fn geo_select(
186
186
  seed: u64,
187
187
  exact_size: Option<usize>,
188
188
  lookback: Option<usize>,
189
+ include: Option<Vec<usize>>,
189
190
  ) -> PyResult<Vec<PyMarketCandidate>> {
190
191
  let m = parse_method(method)?;
191
192
  let mat = mat_from_numpy(&y);
@@ -196,6 +197,7 @@ pub fn geo_select(
196
197
  };
197
198
  let cfg = SelectConfig {
198
199
  eligible,
200
+ include: include.unwrap_or_default(),
199
201
  max_treated,
200
202
  test_len,
201
203
  target_lift,
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "panelkit"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  description = "Fast, from-scratch causal-inference estimators for panel/geo experiments (SC, ASC, SDID, DiD, MC-NNM)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -191,6 +191,7 @@ def geo_select(
191
191
  seed: int = ...,
192
192
  exact_size: Optional[int] = ...,
193
193
  lookback: Optional[int] = ...,
194
+ include: Optional[Sequence[int]] = ...,
194
195
  ) -> list[MarketCandidate]: ...
195
196
  def fit_callaway_py(
196
197
  y: npt.NDArray[np.float64],
@@ -352,6 +352,22 @@ class GeoDesign:
352
352
  out.append(self._index[m])
353
353
  return out
354
354
 
355
+ def _names_of(self, markets) -> list:
356
+ """Resolve markets (names or indices) to their string names."""
357
+ return [self.names[i] for i in self._resolve(markets)]
358
+
359
+ def _without(self, exclude):
360
+ """Return ``(sub_design, excluded_name_set)`` with the excluded markets
361
+ dropped entirely (so they're neither treated nor used as controls). Names
362
+ are preserved, so callers can pass markets to the sub-design by name."""
363
+ ex = set(self._names_of(exclude)) if exclude else set()
364
+ if not ex:
365
+ return self, ex
366
+ keep = [i for i in range(self.n) if self.names[i] not in ex]
367
+ if not keep:
368
+ raise ValueError("exclude removes every market — nothing left to analyze")
369
+ return GeoDesign(self.Y[keep], names=[self.names[i] for i in keep]), ex
370
+
355
371
  def power(
356
372
  self,
357
373
  treated,
@@ -364,6 +380,7 @@ class GeoDesign:
364
380
  lookback: int | None = None,
365
381
  ensemble: bool = True,
366
382
  ensemble_weights="auto",
383
+ exclude=None,
367
384
  ) -> _PowerReport:
368
385
  """Power analysis for a specified treated-market set across methods.
369
386
 
@@ -376,7 +393,20 @@ class GeoDesign:
376
393
  power reflects the averaged estimator, which is usually steadier than any
377
394
  one method). ``ensemble_weights`` is ``"auto"`` (data-driven inverse-variance
378
395
  weighting from each method's historical-null spread), ``"equal"``, or a dict
379
- like ``{"SC": 0.5, "ASC": 0.2, "SDID": 0.3}``."""
396
+ like ``{"SC": 0.5, "ASC": 0.2, "SDID": 0.3}``.
397
+
398
+ ``exclude`` drops markets entirely (e.g. contaminated or untrustworthy
399
+ ones) so they're never used as donors/controls."""
400
+ if exclude:
401
+ sub, ex = self._without(exclude)
402
+ tnames = self._names_of(treated)
403
+ bad = [n for n in tnames if n in ex]
404
+ if bad:
405
+ raise ValueError(f"treated markets were also excluded: {bad}")
406
+ return sub.power(tnames, test_len, lifts=lifts, methods=methods, alpha=alpha,
407
+ target_power=target_power, recommended=recommended,
408
+ lookback=lookback, ensemble=ensemble,
409
+ ensemble_weights=ensemble_weights)
380
410
  idx = self._resolve(treated)
381
411
  names = [self.names[i] for i in idx]
382
412
  lifts = list(_DEFAULT_LIFTS if lifts is None else lifts)
@@ -398,13 +428,21 @@ class GeoDesign:
398
428
  rec = recommended if recommended in results else list(results)[0]
399
429
  return _PowerReport(self, idx, names, test_len, results, diag, rec, alpha, target_power)
400
430
 
401
- def diagnose(self, treated, test_len: int) -> "_DiagnosticsReport":
431
+ def diagnose(self, treated, test_len: int, exclude=None) -> "_DiagnosticsReport":
402
432
  """Real-world guardrails for a treated-market set: pre-period fit,
403
433
  seasonality, holdout, stability, and warnings — with a visual.
404
434
 
405
435
  Returns a report with ``.summary()`` and ``.plot(path)`` (the guardrails
406
436
  figure: treated-vs-synthetic pre-fit, seasonality ACF, holdout share, and
407
- a scorecard listing any warnings)."""
437
+ a scorecard listing any warnings). ``exclude`` drops markets from the
438
+ control pool entirely."""
439
+ if exclude:
440
+ sub, ex = self._without(exclude)
441
+ tnames = self._names_of(treated)
442
+ bad = [n for n in tnames if n in ex]
443
+ if bad:
444
+ raise ValueError(f"treated markets were also excluded: {bad}")
445
+ return sub.diagnose(tnames, test_len)
408
446
  idx = self._resolve(treated)
409
447
  names = [self.names[i] for i in idx]
410
448
  t0 = self.t - int(test_len)
@@ -431,18 +469,46 @@ class GeoDesign:
431
469
  top: int = 10,
432
470
  exact_size: int | None = None,
433
471
  lookback: int | None = None,
472
+ include=None,
473
+ exclude=None,
434
474
  ) -> list:
435
475
  """Search candidate treatment-market sets and return the top ranked.
436
476
 
437
477
  ``exact_size=k`` restricts the search to sets of exactly ``k`` markets
438
478
  (otherwise sizes 1..``max_treated`` are considered). ``lookback=k`` powers
439
- over the most-recent ``k`` historical windows."""
479
+ over the most-recent ``k`` historical windows.
480
+
481
+ ``include`` forces specific markets into **every** candidate treatment set
482
+ (must-treat markets); the search fills the remaining slots from
483
+ ``eligible``. ``exclude`` drops markets entirely — they're never treated
484
+ and never used as controls."""
485
+ if exclude:
486
+ sub, ex = self._without(exclude)
487
+ elig_names = self._names_of(eligible) if eligible is not None else None
488
+ if elig_names is not None:
489
+ elig_names = [n for n in elig_names if n not in ex]
490
+ inc_names = self._names_of(include) if include else None
491
+ if inc_names is not None:
492
+ bad = [n for n in inc_names if n in ex]
493
+ if bad:
494
+ raise ValueError(f"markets in both include and exclude: {bad}")
495
+ return sub.select_markets(
496
+ test_len, target_lift, max_treated, eligible=elig_names, method=method,
497
+ alpha=alpha, target_power=target_power, n_candidates=n_candidates,
498
+ seed=seed, top=top, exact_size=exact_size, lookback=lookback,
499
+ include=inc_names, exclude=None)
500
+
440
501
  elig = self._resolve(eligible) if eligible is not None else list(range(self.n))
502
+ inc = sorted(set(self._resolve(include))) if include else []
503
+ if len(inc) > int(max_treated):
504
+ raise ValueError(f"include has {len(inc)} markets but max_treated="
505
+ f"{max_treated}; raise max_treated or include fewer")
441
506
  ranked = _panelkit.geo_select(
442
507
  self.Y, elig, int(max_treated), int(test_len), float(target_lift),
443
508
  method.lower(), alpha, target_power, 0, int(n_candidates), int(seed),
444
509
  None if exact_size is None else int(exact_size),
445
510
  None if lookback is None else int(lookback),
511
+ inc or None,
446
512
  )
447
513
  out = []
448
514
  for c in ranked[:top]:
@@ -470,6 +536,8 @@ class GeoDesign:
470
536
  seed: int = 0,
471
537
  min_confidence: float = 60.0,
472
538
  lookback: int | None = None,
539
+ include=None,
540
+ exclude=None,
473
541
  ) -> "_ScenarioGrid":
474
542
  """Sweep designs across **specifications** — test length × number of geos
475
543
  × significance level (alpha) — and recommend the best.
@@ -477,7 +545,9 @@ class GeoDesign:
477
545
  For each (alpha, test_len, n_geos) cell it searches for the best set of
478
546
  exactly ``n_geos`` treatment markets and records its MDE, power, holdout,
479
547
  and confidence. Returns a :class:`_ScenarioGrid` with a recommendation,
480
- a plain-English summary, and a tradeoffs figure.
548
+ a plain-English summary, and a tradeoffs figure. ``include`` forces
549
+ must-treat markets into every candidate; ``exclude`` drops markets
550
+ entirely.
481
551
  """
482
552
  rows = []
483
553
  for alpha in alphas:
@@ -488,6 +558,7 @@ class GeoDesign:
488
558
  eligible=eligible, method=method, alpha=alpha,
489
559
  target_power=target_power, n_candidates=n_candidates,
490
560
  seed=seed, top=1, exact_size=ng, lookback=lookback,
561
+ include=include, exclude=exclude,
491
562
  )
492
563
  best = ranked[0] if ranked else None
493
564
  if best is None:
@@ -610,18 +681,24 @@ class GeoDesign:
610
681
  methods: Sequence[str] = _METHODS,
611
682
  weights="auto",
612
683
  level: float = 0.90,
613
- n_boot: int = 2000,
614
- block_len: int = 4,
684
+ max_placebo: int = 200,
615
685
  seed: int = 0,
686
+ exclude=None,
616
687
  ) -> "_EvalReport":
617
688
  """Estimate the realized effect of a geo test that has **already run**.
618
689
 
619
690
  This is the measurement counterpart to :meth:`power`: given the treated
620
691
  markets and the period treatment began (``treat_start``, the first
621
692
  post-period column), it fits SC / ASC / SDID, reports each one's effect,
622
- and combines them into a weighted-average **ensemble** estimate. Each
623
- estimate gets a confidence interval from a stationary block bootstrap of
624
- its post-period effect path; an SC in-space placebo supplies a p-value.
693
+ and combines them into a weighted-average **ensemble** estimate.
694
+
695
+ Inference is **in-space placebo** (Abadie): every donor market is refit as
696
+ if it were the treated one, and the spread of *their* post-period effects
697
+ is the null reference. This captures out-of-sample extrapolation error —
698
+ the dominant source of uncertainty — so the intervals are calibrated
699
+ (unlike a bootstrap of the treated unit's own post-period, which only sees
700
+ in-sample noise and is far too narrow). Poorly-fit placebos (pre-period
701
+ RMSPE > 2× the treated unit's) are dropped, per Abadie.
625
702
 
626
703
  Parameters
627
704
  ----------
@@ -633,18 +710,29 @@ class GeoDesign:
633
710
  Which estimators to fit and blend.
634
711
  weights : "auto" | "equal" | dict
635
712
  Ensemble weighting. ``"auto"`` is inverse-variance (precision)
636
- weighting from each method's bootstrap standard error.
713
+ weighting from each method's placebo-null spread.
637
714
  level : float
638
715
  Confidence level for the intervals (e.g. 0.90).
639
- n_boot, block_len, seed :
640
- Stationary-bootstrap settings for the effect-path CIs.
716
+ max_placebo : int
717
+ Cap on the number of donor placebos used (sampled if exceeded).
718
+ seed : int
719
+ Seed for placebo sampling when ``max_placebo`` is exceeded.
641
720
 
642
721
  Returns
643
722
  -------
644
723
  _EvalReport
645
724
  With ``.summary()``, ``.plot(path)``, per-method results, and the
646
- ensemble point estimate / interval / lift.
725
+ ensemble point estimate / interval / lift. ``exclude`` drops markets
726
+ from the control pool entirely.
647
727
  """
728
+ if exclude:
729
+ sub, ex = self._without(exclude)
730
+ tnames = self._names_of(treated)
731
+ bad = [n for n in tnames if n in ex]
732
+ if bad:
733
+ raise ValueError(f"treated markets were also excluded: {bad}")
734
+ return sub.evaluate(tnames, treat_start, methods=methods, weights=weights,
735
+ level=level, max_placebo=max_placebo, seed=seed)
648
736
  idx = self._resolve(treated)
649
737
  names = [self.names[i] for i in idx]
650
738
  t0 = int(treat_start)
@@ -656,37 +744,66 @@ class GeoDesign:
656
744
  if unknown:
657
745
  raise ValueError(f"unknown methods {unknown}; choose from {_METHODS}")
658
746
 
659
- fitters = {
660
- "SC": lambda: _panelkit.fit_sc(self.Y, idx, t0, 0.0, False, level),
661
- "ASC": lambda: _panelkit.fit_asc(self.Y, idx, t0, 0.0, None),
662
- "SDID": lambda: _panelkit.fit_sdid(self.Y, idx, t0, 1.0),
663
- }
747
+ def _fit(method, tr):
748
+ if method == "SC":
749
+ return _panelkit.fit_sc(self.Y, tr, t0, 0.0, False, level)
750
+ if method == "ASC":
751
+ return _panelkit.fit_asc(self.Y, tr, t0, 0.0, None)
752
+ return _panelkit.fit_sdid(self.Y, tr, t0, 1.0)
753
+
754
+ treated_series = self.Y[idx].mean(axis=0)
755
+ post_len = self.t - t0
756
+ order = methods
757
+
758
+ # --- point estimates on the treated set ---
664
759
  per = {}
665
760
  for m in methods:
666
- fit = fitters[m]()
761
+ fit = _fit(m, idx)
667
762
  att_path = np.asarray(fit.att_path, dtype=float)
668
763
  cf = np.asarray(fit.counterfactual, dtype=float)
669
764
  att = float(fit.att)
670
765
  cf_mean = float(np.mean(cf)) if cf.size else float("nan")
671
- se, lo, hi = _panelkit.bootstrap_mean(
672
- att_path.tolist(), "stationary", int(block_len), int(n_boot),
673
- int(seed), float(level))
766
+ # Full-timeline counterfactual via donor weights, centered on the
767
+ # pre-period so the gap reflects FIT, not a level offset (SDID matches
768
+ # trends, not levels).
769
+ dids = np.asarray(fit.donor_ids, dtype=int)
770
+ ws = np.asarray(fit.weights, dtype=float)
771
+ if dids.size:
772
+ full_cf = self.Y[dids].T @ ws
773
+ full_cf = full_cf + (treated_series[:t0].mean() - full_cf[:t0].mean())
774
+ else:
775
+ full_cf = np.full(self.t, np.nan)
674
776
  per[m] = {
675
777
  "att": att, "att_path": att_path, "counterfactual": cf,
676
- "cf_mean": cf_mean, "lift": att / cf_mean if cf_mean else float("nan"),
677
- "se": se, "att_lo": lo, "att_hi": hi,
678
- "lift_lo": lo / cf_mean if cf_mean else float("nan"),
679
- "lift_hi": hi / cf_mean if cf_mean else float("nan"),
778
+ "full_cf": full_cf, "cf_mean": cf_mean,
779
+ "lift": att / cf_mean if cf_mean else float("nan"),
680
780
  "cumulative": float(att_path.sum()) * n_treated,
681
781
  "pre_rmspe": float(fit.pre_rmspe),
682
782
  }
683
783
 
684
- # Ensemble: weight-average the post-period effect paths, then summarize.
685
- order = methods
784
+ # --- in-space placebo: refit each donor as if it were treated ---
785
+ treated_set = set(idx)
786
+ donors = [u for u in range(self.n) if u not in treated_set]
787
+ if len(donors) > int(max_placebo):
788
+ rng = np.random.default_rng(int(seed))
789
+ donors = sorted(int(j) for j in rng.choice(donors, int(max_placebo), replace=False))
790
+ pb = {m: [] for m in methods} # per method: list of (att_path, pre_rmspe)
791
+ for j in donors:
792
+ for m in methods:
793
+ fj = _fit(m, [j])
794
+ pb[m].append((np.asarray(fj.att_path, dtype=float), float(fj.pre_rmspe)))
795
+
796
+ # --- ensemble weights ---
797
+ def _placebo_att_sd(m):
798
+ if not pb[m]:
799
+ return 1.0
800
+ vals = np.array([p.mean() for (p, _) in pb[m]])
801
+ return float(np.std(vals)) if len(vals) > 1 else 1.0
686
802
  if isinstance(weights, str) and weights.lower() == "equal":
687
803
  wv = [1.0 / len(order)] * len(order)
688
804
  elif isinstance(weights, str) and weights.lower() == "auto":
689
- prec = [1.0 / max(per[m]["se"] ** 2, 1e-300) for m in order]
805
+ # inverse-variance from each method's placebo-null spread (precision)
806
+ prec = [1.0 / max(_placebo_att_sd(m) ** 2, 1e-300) for m in order]
690
807
  s = sum(prec)
691
808
  wv = [p / s for p in prec] if s > 0 else [1.0 / len(order)] * len(order)
692
809
  elif isinstance(weights, dict):
@@ -702,33 +819,91 @@ class GeoDesign:
702
819
  s = sum(raw)
703
820
  wv = [r / s for r in raw]
704
821
  wmap = dict(zip(order, wv))
705
-
822
+ a = (1.0 - float(level)) / 2.0
823
+
824
+ def _ci(point, null_samples):
825
+ """Pivot CI: point estimate ± the placebo null spread (null ≈ 0)."""
826
+ if len(null_samples) >= 2:
827
+ return point + float(np.quantile(null_samples, a)), \
828
+ point + float(np.quantile(null_samples, 1.0 - a))
829
+ return point, point
830
+
831
+ # --- per-method point CIs from each method's placebo att spread ---
832
+ for m in order:
833
+ mp = np.array([p.mean() for (p, _) in pb[m]]) if pb[m] else np.array([])
834
+ lo, hi = _ci(per[m]["att"], mp)
835
+ cfm = per[m]["cf_mean"]
836
+ per[m]["att_lo"], per[m]["att_hi"] = lo, hi
837
+ per[m]["lift_lo"] = lo / cfm if cfm else float("nan")
838
+ per[m]["lift_hi"] = hi / cfm if cfm else float("nan")
839
+
840
+ # --- ensemble estimate + ensemble placebo paths (Abadie pre-fit filter) ---
706
841
  ens_path = sum(wmap[m] * per[m]["att_path"] for m in order)
707
842
  ens_cf_mean = float(sum(wmap[m] * per[m]["cf_mean"] for m in order))
708
843
  ens_att = float(ens_path.mean())
709
- se, lo, hi = _panelkit.bootstrap_mean(
710
- ens_path.tolist(), "stationary", int(block_len), int(n_boot),
711
- int(seed), float(level))
844
+ treated_pre = sum(wmap[m] * per[m]["pre_rmspe"] for m in order)
845
+
846
+ ens_pb = [] # (path, pre_rmspe)
847
+ for di in range(len(donors)):
848
+ path = sum(wmap[m] * pb[m][di][0] for m in order)
849
+ pre = sum(wmap[m] * pb[m][di][1] for m in order)
850
+ ens_pb.append((path, pre))
851
+ kept = [p for (p, pre) in ens_pb if treated_pre <= 0 or pre <= 2.0 * treated_pre]
852
+ if len(kept) < 5: # too few comparable placebos → use all
853
+ kept = [p for (p, _) in ens_pb]
854
+ pb_mat = np.array(kept) if kept else np.zeros((0, post_len))
855
+ n_pb = pb_mat.shape[0]
856
+
857
+ # pointwise + cumulative + mean CIs, all from the placebo null
858
+ if n_pb >= 2:
859
+ point_lo = ens_path + np.quantile(pb_mat, a, axis=0)
860
+ point_hi = ens_path + np.quantile(pb_mat, 1.0 - a, axis=0)
861
+ point_hw = float(np.quantile(np.abs(pb_mat), float(level)))
862
+ cum_pb = np.cumsum(pb_mat, axis=1)
863
+ run = np.cumsum(ens_path)
864
+ cum_lo_band = np.quantile(cum_pb, a, axis=0)
865
+ cum_hi_band = np.quantile(cum_pb, 1.0 - a, axis=0)
866
+ pb_att = pb_mat.mean(axis=1)
867
+ p_value = float((1.0 + np.sum(np.abs(pb_att) >= abs(ens_att))) / (1.0 + n_pb))
868
+ else:
869
+ point_lo = point_hi = ens_path.copy()
870
+ point_hw = 0.0
871
+ run = np.cumsum(ens_path)
872
+ cum_lo_band = cum_hi_band = np.zeros(post_len)
873
+ pb_att = np.array([])
874
+ p_value = None
875
+ att_lo, att_hi = _ci(ens_att, pb_att)
876
+
877
+ cum_curve = run * n_treated
712
878
  ensemble = {
713
- "att": ens_att, "att_path": ens_path, "se": se,
714
- "att_lo": lo, "att_hi": hi,
879
+ "att": ens_att, "att_path": ens_path,
880
+ "att_lo": att_lo, "att_hi": att_hi,
715
881
  "lift": ens_att / ens_cf_mean if ens_cf_mean else float("nan"),
716
- "lift_lo": lo / ens_cf_mean if ens_cf_mean else float("nan"),
717
- "lift_hi": hi / ens_cf_mean if ens_cf_mean else float("nan"),
882
+ "lift_lo": att_lo / ens_cf_mean if ens_cf_mean else float("nan"),
883
+ "lift_hi": att_hi / ens_cf_mean if ens_cf_mean else float("nan"),
718
884
  "cumulative": float(ens_path.sum()) * n_treated,
719
- "weights": wmap,
885
+ "weights": wmap, "n_placebo": n_pb,
720
886
  }
721
887
 
722
- # Significance: SC in-space placebo p-value, plus a full SC counterfactual
723
- # (donor-weight reconstruction) for the timeline plot.
724
- sc = _panelkit.fit_sc(self.Y, idx, t0, 0.0, True, level)
725
- p_value = sc.p_value
726
- donors = np.asarray(sc.donor_ids, dtype=int)
727
- w_sc = np.asarray(sc.weights, dtype=float)
728
- full_cf = (self.Y[donors].T @ w_sc) if donors.size else np.full(self.t, np.nan)
729
- treated_series = self.Y[idx].mean(axis=0)
888
+ # full-timeline counterfactual + gap path (pre shows fit; post = effect)
889
+ ens_full_cf = sum(wmap[m] * per[m]["full_cf"] for m in order)
890
+ full_gap = treated_series - ens_full_cf
891
+ full_gap[t0:] = ens_path
892
+ counterfactual = treated_series - full_gap
893
+ ensemble["full_gap"] = full_gap
894
+ ensemble["sigma_pre"] = (float(np.std(full_gap[:t0], ddof=1)) if t0 > 1
895
+ else float(np.std(full_gap[:t0])))
896
+ ensemble["point_hw"] = point_hw
897
+ ensemble["point_lo"] = point_lo
898
+ ensemble["point_hi"] = point_hi
899
+ ensemble["cum_curve"] = cum_curve
900
+ ensemble["cum_lo_curve"] = (run + cum_lo_band) * n_treated
901
+ ensemble["cum_hi_curve"] = (run + cum_hi_band) * n_treated
902
+ ensemble["cum_lo"] = float(ensemble["cum_lo_curve"][-1]) if post_len else float("nan")
903
+ ensemble["cum_hi"] = float(ensemble["cum_hi_curve"][-1]) if post_len else float("nan")
904
+
730
905
  return _EvalReport(names, t0, n_treated, per, ensemble, p_value, level,
731
- treated_series, full_cf)
906
+ treated_series, counterfactual)
732
907
 
733
908
 
734
909
  class _ScenarioGrid:
@@ -924,15 +1099,26 @@ class _EvalReport:
924
1099
  "interval includes zero.")
925
1100
  lines.append(f"Headline (ensemble) : {100*e['lift']:+.2f}% lift, "
926
1101
  f"{e['cumulative']:,.0f} cumulative incremental")
1102
+ if "cum_lo" in e:
1103
+ lines.append(f"Cumulative {cl}% CI : "
1104
+ f"[{e['cum_lo']:,.0f}, {e['cum_hi']:,.0f}] "
1105
+ f"(in-space placebo, {e.get('n_placebo', 0)} donors)")
927
1106
  lines.append(verdict)
928
1107
  lines.append("=" * 66)
929
1108
  return "\n".join(lines)
930
1109
 
931
1110
  def plot(self, path: str | None = None):
932
- """Render the evaluation figure (observed vs counterfactual, effect path,
933
- and a lift-by-method bar). Returns the matplotlib Figure."""
1111
+ """Render the evaluation figure (observed vs counterfactual, effect path
1112
+ with CI band, and a lift-by-method bar). Returns the matplotlib Figure."""
934
1113
  return _plot_eval(self, path)
935
1114
 
1115
+ def plot_effect_over_time(self, path: str | None = None):
1116
+ """Render the effect-over-time figure: the **pointwise** effect across the
1117
+ full timeline (pre-period included, as a placebo check) and the running
1118
+ **cumulative** incremental, each as a point estimate with a confidence
1119
+ band. Returns the matplotlib Figure."""
1120
+ return _plot_eval_timeline(self, path)
1121
+
936
1122
  def __repr__(self):
937
1123
  sig = "sig" if self.significant else "ns"
938
1124
  return (f"EvalReport(lift={100*self.lift:+.2f}%, "
@@ -1362,13 +1548,18 @@ def _plot_eval(rep: "_EvalReport", path):
1362
1548
  ax.grid(True, alpha=0.25)
1363
1549
  ax.legend(loc="best", framealpha=0.9, fontsize=9)
1364
1550
 
1365
- # ---- B: effect path over the post-period (ensemble + per method). ----
1551
+ # ---- B: effect path over the post-period (ensemble + per method) + CI band.
1366
1552
  axb = fig.add_subplot(gs[1, 0])
1367
1553
  for m, r in rep.per.items():
1368
1554
  axb.plot(post, r["att_path"], color=_METHOD_COLORS.get(m, _PK_GREY),
1369
1555
  lw=1.3, alpha=0.7, label=m)
1370
- axb.plot(post, rep.ensemble["att_path"], color=_PK_PURPLE, lw=2.6,
1371
- label="ENSEMBLE")
1556
+ ens_post = rep.ensemble["att_path"]
1557
+ p_lo = rep.ensemble.get("point_lo")
1558
+ p_hi = rep.ensemble.get("point_hi")
1559
+ if p_lo is not None:
1560
+ axb.fill_between(post, p_lo, p_hi, color=_PK_PURPLE, alpha=0.18,
1561
+ label=f"ensemble {int(round(100*rep.level))}% band")
1562
+ axb.plot(post, ens_post, color=_PK_PURPLE, lw=2.6, label="ENSEMBLE")
1372
1563
  axb.axhline(0, color="#111827", lw=1.0)
1373
1564
  axb.set_title("Effect over time (per-period ATT)", fontweight="bold")
1374
1565
  axb.set_xlabel("period")
@@ -1405,3 +1596,82 @@ def _plot_eval(rep: "_EvalReport", path):
1405
1596
  if path:
1406
1597
  fig.savefig(path, dpi=150, bbox_inches="tight")
1407
1598
  return fig
1599
+
1600
+
1601
+ def _plot_eval_timeline(rep: "_EvalReport", path):
1602
+ """Pointwise + cumulative effect over the full timeline, with CI bands.
1603
+
1604
+ Bands come from the in-space placebo distribution (every donor refit as if
1605
+ treated): the pointwise band is the per-period placebo spread around the
1606
+ estimate; the cumulative band grows with horizon as the placebo
1607
+ cumulative-sums spread out."""
1608
+ _, plt = _require_mpl()
1609
+ import numpy as _np
1610
+ from matplotlib.gridspec import GridSpec
1611
+
1612
+ T = len(rep.treated_series)
1613
+ t0 = rep.t0
1614
+ e = rep.ensemble
1615
+ x = _np.arange(T)
1616
+ seg = x[t0:]
1617
+ gap = _np.asarray(e["full_gap"], dtype=float)
1618
+ hw = e.get("point_hw", 0.0)
1619
+ cl = int(round(100 * rep.level))
1620
+
1621
+ plt.rcParams.update({"font.size": 11, "axes.titlesize": 12})
1622
+ fig = plt.figure(figsize=(12, 7.8))
1623
+ fig.patch.set_facecolor("white")
1624
+ gs = GridSpec(2, 1, figure=fig, height_ratios=[1.0, 1.0], hspace=0.32)
1625
+
1626
+ # ---- Top: pointwise effect (treated − counterfactual), full timeline. ----
1627
+ ax = fig.add_subplot(gs[0])
1628
+ ax.axvspan(-0.5, t0 - 0.5, color="#f3f4f6", alpha=0.8)
1629
+ # Constant placebo band across the whole timeline (the pre-period sits inside
1630
+ # it as a fit/placebo check); the per-period CI on the post effect is shown
1631
+ # as a tighter band around the estimate.
1632
+ ax.fill_between(x, gap - hw, gap + hw, color=_PK_PURPLE, alpha=0.12,
1633
+ label=f"{cl}% placebo band")
1634
+ ax.fill_between(seg, e["point_lo"], e["point_hi"], color=_PK_PURPLE, alpha=0.22)
1635
+ ax.plot(x, gap, color=_PK_PURPLE, lw=2.0, label="pointwise effect")
1636
+ ax.axhline(0, color="#111827", lw=1.0)
1637
+ ax.axvline(t0 - 0.5, color="#374151", lw=1.2, ls=":")
1638
+ ax.annotate("pre-period (placebo)", (t0 / 2, ax.get_ylim()[1]), ha="center",
1639
+ va="top", color="#6b7280", fontsize=9)
1640
+ ax.annotate("test window", (t0 + (T - t0) / 2, ax.get_ylim()[1]), ha="center",
1641
+ va="top", color="#6b21a8", fontsize=9)
1642
+ ax.set_title("Pointwise effect over time (treated − counterfactual)",
1643
+ fontweight="bold")
1644
+ ax.set_xlabel("period")
1645
+ ax.set_ylabel("per-period effect")
1646
+ ax.grid(True, alpha=0.25)
1647
+ ax.legend(loc="upper left", framealpha=0.9, fontsize=9)
1648
+
1649
+ # ---- Bottom: cumulative incremental over the test window (×n_treated). ----
1650
+ axc = fig.add_subplot(gs[1])
1651
+ cum = e["cum_curve"]
1652
+ axc.axvspan(-0.5, t0 - 0.5, color="#f3f4f6", alpha=0.8)
1653
+ axc.fill_between(seg, e["cum_lo_curve"], e["cum_hi_curve"], color=_PK_GREEN,
1654
+ alpha=0.15, label=f"{cl}% band (in-space placebo)")
1655
+ axc.plot(seg, cum, color=_PK_GREEN, lw=2.4, label="cumulative incremental")
1656
+ axc.axhline(0, color="#111827", lw=1.0)
1657
+ axc.axvline(t0 - 0.5, color="#374151", lw=1.2, ls=":")
1658
+ final = cum[-1]
1659
+ axc.annotate(f"{final:,.0f}\n[{e['cum_lo']:,.0f}, {e['cum_hi']:,.0f}]",
1660
+ (T - 1, final), textcoords="offset points", xytext=(-6, 0),
1661
+ ha="right", va="center", fontweight="bold", color="#065f46", fontsize=9)
1662
+ axc.set_title("Cumulative incremental effect over the test window",
1663
+ fontweight="bold")
1664
+ axc.set_xlabel("period")
1665
+ axc.set_ylabel("cumulative incremental")
1666
+ axc.set_xlim(-0.5, T - 0.5)
1667
+ axc.grid(True, alpha=0.25)
1668
+ axc.legend(loc="upper left", framealpha=0.9, fontsize=9)
1669
+
1670
+ fig.suptitle(f"panelkit · effect over time — ensemble "
1671
+ f"{100*rep.ensemble['lift']:+.2f}% lift, "
1672
+ f"{rep.ensemble['cumulative']:,.0f} cumulative "
1673
+ f"[{e['cum_lo']:,.0f}, {e['cum_hi']:,.0f}]",
1674
+ fontsize=14, fontweight="bold", x=0.012, ha="left")
1675
+ if path:
1676
+ fig.savefig(path, dpi=150, bbox_inches="tight")
1677
+ return fig
File without changes
File without changes
File without changes
File without changes
File without changes