panelkit 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {panelkit-0.2.0 → panelkit-0.2.2}/Cargo.lock +5 -5
  2. {panelkit-0.2.0 → panelkit-0.2.2}/Cargo.toml +1 -1
  3. {panelkit-0.2.0 → panelkit-0.2.2}/GUIDE.md +117 -9
  4. {panelkit-0.2.0 → panelkit-0.2.2}/PKG-INFO +52 -8
  5. {panelkit-0.2.0 → panelkit-0.2.2}/README.md +51 -7
  6. {panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/Cargo.toml +1 -1
  7. {panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/src/lib.rs +1 -1
  8. {panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/src/power.rs +161 -1
  9. {panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/src/selection.rs +36 -5
  10. {panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/src/types.rs +4 -0
  11. {panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/tests/geo.rs +25 -3
  12. {panelkit-0.2.0 → panelkit-0.2.2}/crates/pypanelkit/src/api_geo.rs +82 -3
  13. {panelkit-0.2.0 → panelkit-0.2.2}/crates/pypanelkit/src/lib.rs +1 -0
  14. {panelkit-0.2.0 → panelkit-0.2.2}/crates/pypanelkit/src/results.rs +4 -0
  15. {panelkit-0.2.0 → panelkit-0.2.2}/pyproject.toml +1 -1
  16. {panelkit-0.2.0 → panelkit-0.2.2}/python/panelkit/_panelkit.pyi +15 -0
  17. panelkit-0.2.2/python/panelkit/design.py +1407 -0
  18. panelkit-0.2.0/python/panelkit/design.py +0 -653
  19. {panelkit-0.2.0 → panelkit-0.2.2}/BENCHMARKS.md +0 -0
  20. {panelkit-0.2.0 → panelkit-0.2.2}/LICENSE-APACHE +0 -0
  21. {panelkit-0.2.0 → panelkit-0.2.2}/LICENSE-MIT +0 -0
  22. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/Cargo.toml +0 -0
  23. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/benches/estimators.rs +0 -0
  24. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/did/bacon.rs +0 -0
  25. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/did/callaway.rs +0 -0
  26. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/did/mod.rs +0 -0
  27. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/did/sunab.rs +0 -0
  28. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/did/twfe.rs +0 -0
  29. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/fe/mod.rs +0 -0
  30. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/fe/within.rs +0 -0
  31. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/lib.rs +0 -0
  32. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/mcnnm/mod.rs +0 -0
  33. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/mcnnm/softimpute.rs +0 -0
  34. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/panel.rs +0 -0
  35. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/result.rs +0 -0
  36. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/sc/augmented.rs +0 -0
  37. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/sc/cpasc.rs +0 -0
  38. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/sc/mod.rs +0 -0
  39. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/sc/sdid.rs +0 -0
  40. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/src/sc/synthetic.rs +0 -0
  41. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/tests/cpasc.rs +0 -0
  42. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/tests/did.rs +0 -0
  43. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/tests/sc.rs +0 -0
  44. {panelkit-0.2.0 → panelkit-0.2.2}/crates/estimators/tests/sc_family.rs +0 -0
  45. {panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/src/diagnostics.rs +0 -0
  46. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/Cargo.toml +0 -0
  47. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/src/batch.rs +0 -0
  48. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/src/bootstrap.rs +0 -0
  49. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/src/ci.rs +0 -0
  50. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/src/lib.rs +0 -0
  51. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/src/parallel.rs +0 -0
  52. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/src/placebo.rs +0 -0
  53. {panelkit-0.2.0 → panelkit-0.2.2}/crates/inference/tests/inference.rs +0 -0
  54. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/Cargo.toml +0 -0
  55. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/error.rs +0 -0
  56. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/factor/cholesky.rs +0 -0
  57. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/factor/eig_sym.rs +0 -0
  58. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/factor/mod.rs +0 -0
  59. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/factor/qr.rs +0 -0
  60. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/factor/randomized.rs +0 -0
  61. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/factor/svd.rs +0 -0
  62. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/factor/svd_gram.rs +0 -0
  63. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/lib.rs +0 -0
  64. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/matrix.rs +0 -0
  65. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/ops/matmul.rs +0 -0
  66. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/ops/mod.rs +0 -0
  67. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/ops/norms.rs +0 -0
  68. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/ops/transform.rs +0 -0
  69. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/opt/mod.rs +0 -0
  70. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/opt/simplex.rs +0 -0
  71. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/opt/softthresh.rs +0 -0
  72. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/rng.rs +0 -0
  73. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/solve/lstsq.rs +0 -0
  74. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/solve/mod.rs +0 -0
  75. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/src/solve/spd.rs +0 -0
  76. {panelkit-0.2.0 → panelkit-0.2.2}/crates/linalg/tests/numerics.rs +0 -0
  77. {panelkit-0.2.0 → panelkit-0.2.2}/crates/pypanelkit/Cargo.toml +0 -0
  78. {panelkit-0.2.0 → panelkit-0.2.2}/crates/pypanelkit/src/api_did.rs +0 -0
  79. {panelkit-0.2.0 → panelkit-0.2.2}/crates/pypanelkit/src/api_sc.rs +0 -0
  80. {panelkit-0.2.0 → panelkit-0.2.2}/crates/pypanelkit/src/convert.rs +0 -0
  81. {panelkit-0.2.0 → panelkit-0.2.2}/python/panelkit/__init__.py +0 -0
  82. {panelkit-0.2.0 → panelkit-0.2.2}/python/panelkit/estimators.py +0 -0
  83. {panelkit-0.2.0 → panelkit-0.2.2}/python/panelkit/py.typed +0 -0
@@ -462,7 +462,7 @@ checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
462
462
 
463
463
  [[package]]
464
464
  name = "panelkit-estimators"
465
- version = "0.2.0"
465
+ version = "0.2.2"
466
466
  dependencies = [
467
467
  "criterion",
468
468
  "panelkit-linalg",
@@ -471,7 +471,7 @@ dependencies = [
471
471
 
472
472
  [[package]]
473
473
  name = "panelkit-geo"
474
- version = "0.2.0"
474
+ version = "0.2.2"
475
475
  dependencies = [
476
476
  "panelkit-estimators",
477
477
  "panelkit-inference",
@@ -482,7 +482,7 @@ dependencies = [
482
482
 
483
483
  [[package]]
484
484
  name = "panelkit-inference"
485
- version = "0.2.0"
485
+ version = "0.2.2"
486
486
  dependencies = [
487
487
  "panelkit-estimators",
488
488
  "panelkit-linalg",
@@ -491,7 +491,7 @@ dependencies = [
491
491
 
492
492
  [[package]]
493
493
  name = "panelkit-linalg"
494
- version = "0.2.0"
494
+ version = "0.2.2"
495
495
  dependencies = [
496
496
  "proptest",
497
497
  "rayon",
@@ -623,7 +623,7 @@ dependencies = [
623
623
 
624
624
  [[package]]
625
625
  name = "pypanelkit"
626
- version = "0.2.0"
626
+ version = "0.2.2"
627
627
  dependencies = [
628
628
  "numpy",
629
629
  "panelkit-estimators",
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/linalg", "crates/estimators", "crates/inference", "crates/geo", "crates/pypanelkit"]
4
4
 
5
5
  [workspace.package]
6
- version = "0.2.0"
6
+ version = "0.2.2"
7
7
  edition = "2021"
8
8
  rust-version = "1.74"
9
9
  license = "MIT OR Apache-2.0"
@@ -251,16 +251,93 @@ SC / ASC / SDID. Returns a report with:
251
251
  estimate-accuracy CI, design-quality bars).
252
252
 
253
253
  Key options: `alpha` (significance level, default 0.10), `target_power`
254
- (default 0.80), `lifts` (the % grid), `methods`, `recommended` (default SDID).
254
+ (default 0.80), `lifts` (the % grid), `methods`, `recommended` (default SDID),
255
+ `lookback`, `ensemble`/`ensemble_weights`.
256
+
257
+ **The ENSEMBLE method (weighted average of SC + ASC + SDID).** By default
258
+ `power()` adds an `"ENSEMBLE"` result alongside the three base methods: a
259
+ weighted average of their ATTs, combined *within each placebo window* before the
260
+ null and power are computed. (That ordering matters — the power of the averaged
261
+ estimator is not the average of three powers; the blend is usually steadier than
262
+ any single method, so its MDE is often the smallest.) `ensemble_weights="auto"`
263
+ (default) uses **inverse-variance** weighting — each method weighted by the
264
+ precision of its historical-null distribution, so a noisier estimator counts for
265
+ less. Pass `"equal"`, a dict like `{"SC": 0.5, "ASC": 0.2, "SDID": 0.3}`, or a
266
+ `[w_sc, w_asc, w_sdid]` list to set them yourself; `ensemble=False` turns it off.
267
+ The weights used are printed in the report and stored on
268
+ `rep.results["ENSEMBLE"].ensemble_weights`.
269
+
270
+ **How power is simulated (many placebos, not one).** For a treated set, the test
271
+ window of length `test_len` is *slid across the whole history*: every valid start
272
+ position is one placebo experiment. The detection threshold (critical |ATT|)
273
+ comes from those same windows with **no** injected lift (the historical null), and
274
+ power at lift τ is the share of windows whose injected effect clears that
275
+ threshold. So the estimate is averaged over **many** placebos — `result.n_windows`
276
+ reports how many.
277
+
278
+ **The `lookback` option — how far back to simulate.** By default panelkit powers
279
+ over *all* valid windows (more placebo samples → a more stable power estimate).
280
+ Pass `lookback=k` to use only the **most-recent k** windows: those have the
281
+ longest pre-periods and reflect current dynamics, so they're the most
282
+ representative of the test you're about to run — at the cost of fewer samples (a
283
+ noisier estimate). It matters when older history is unrepresentative (regime
284
+ change, growth, format changes) or when early windows have very short pre-periods;
285
+ set `lookback` to cover your relevant recent history (e.g. the last ~6–12
286
+ months of windows).
287
+
288
+ ### Evaluating a test that ran — `design.evaluate(treated, treat_start, …)`
289
+
290
+ `power()` *plans* a test; `evaluate()` *measures* one. Given the treated markets
291
+ and the period treatment began (`treat_start`, the first post-period column), it
292
+ fits SC / ASC / SDID, reports each one's realized effect, and blends them into a
293
+ weighted-average **ensemble** estimate.
294
+
295
+ ```python
296
+ ev = design.evaluate(treated=["chicago", "denver"], treat_start=52, level=0.90)
297
+ print(ev.summary()) # per-method + ensemble lift, CI, cumulative
298
+ ev.plot("evaluate.png") # observed-vs-counterfactual, effect path, lift bar
299
+ ev.lift, ev.cumulative, ev.significant
300
+ ```
301
+
302
+ Each estimate gets a confidence interval from a **stationary block bootstrap** of
303
+ its post-period effect path; an **SC in-space placebo** supplies a p-value. The
304
+ ensemble uses the same `weights` choices as `power()` (`"auto"` = inverse-variance
305
+ from each method's bootstrap SE, `"equal"`, or an explicit dict/list). `ev` exposes
306
+ `.lift`, `.att`, `.cumulative`, `.significant`, the per-method results in `ev.per`,
307
+ and the ensemble in `ev.ensemble`. Reported numbers: **% lift** (effect ÷
308
+ counterfactual), **per-period ATT**, and **cumulative incremental** over the
309
+ window (summed across treated markets).
255
310
 
256
311
  ### Choosing a specification — `design.recommend(test_lengths, n_geos_options, target_lift, alphas=…)`
257
312
 
258
313
  Sweeps designs across **test length × number of geos × alpha** and recommends the
259
314
  best (smallest MDE among trustworthy designs, ties broken toward shorter/cheaper).
260
315
  `grid.summary()` prints the recommendation + alternatives; `grid.plot(path)`
261
- renders the **tradeoffs figure** (MDE vs length per #geos, an MDE heatmap over
262
- length × #geos, and alpha sensitivity). Use it to find the "knee" — the cheapest
263
- design that still detects your target lift.
316
+ renders the **tradeoffs figure**. Use it to find the "knee" — the cheapest design
317
+ that still detects your target lift.
318
+
319
+ **Reading the tradeoffs figure:**
320
+ - **Top panel** — minimum detectable lift (%) vs test length, one line per number
321
+ of treated geos. *Lower is better.* The red band marks lifts you *can't*
322
+ detect; lines below your target lift are viable designs. More geos and longer
323
+ tests pull the line down (more signal), but cost more holdout/time — pick the
324
+ knee where the curve flattens.
325
+ - **Bottom-left heatmap** — the same MDE across every (test length × #geos) cell,
326
+ green = small detectable lift (good), red = large (bad), grey = underpowered.
327
+ - **Bottom-right** — with multiple alphas, how the MDE of the recommended design
328
+ moves with the significance level (looser α → smaller MDE, more false
329
+ positives); with one alpha, design confidence by spec.
330
+ - The black ★ marks the recommended design.
331
+
332
+ ### Guardrails — `design.diagnose(treated, test_len)`
333
+
334
+ Before trusting a design, check it. `diagnose` returns a report with
335
+ `.summary()` and `.plot(path)` (the **guardrails figure**): the pre-period fit
336
+ (treated vs synthetic control, so you can *see* whether the counterfactual
337
+ tracks), a seasonality ACF, the holdout share against a healthy band, and a
338
+ banner listing any plain-language warnings (weak fit, volatile markets, strong
339
+ seasonality vs short history, tiny/huge holdout, too few donors). It also exposes
340
+ `.confidence`, `.holdout_pct`, and `.warnings`.
264
341
 
265
342
  ### Picking markets — `design.select_markets(test_len, target_lift, max_treated, …)`
266
343
 
@@ -268,9 +345,40 @@ Searches candidate treatment-market sets and ranks them by power, MDE, pre-fit,
268
345
  holdout, and confidence. Pass `eligible=[…]` to restrict to markets you can
269
346
  actually run in.
270
347
 
271
- ### What it adds over GeoLift
348
+ ### Multi-cell tests — `design.multi_cell(cells, test_len, …)`
349
+
350
+ Often you run several treatment cells at once — different creatives, budgets, or
351
+ messages across disjoint groups of markets — and want each cell's lift measured
352
+ separately. The subtlety is the control pool: a market that's treated in one cell
353
+ can't be a clean control for another. `multi_cell` handles this by powering each
354
+ cell against a **shared donor pool that excludes every cell's treated markets**.
355
+
356
+ ```python
357
+ mc = design.multi_cell(
358
+ cells={
359
+ "West": ["los_angeles", "san_diego"],
360
+ "Midwest": ["chicago", "detroit"],
361
+ "Northeast": ["boston", "philadelphia"],
362
+ },
363
+ test_len=8, alpha=0.10,
364
+ )
365
+ print(mc.summary()) # per-cell MDE / confidence / holdout + combined holdout
366
+ mc.plot("multicell.png") # per-cell power curves + an MDE-by-cell bar
367
+ ```
272
368
 
273
- Multi-method (SC/ASC/SDID, not just augmented SCM), MDE in %/absolute/cumulative
274
- with CIs, an explicit confidence score + verdict, seasonality/stability/holdout
275
- guardrails with plain-English warnings, a specification-tradeoff sweep, and
276
- publication-clean figures out of the box.
369
+ `cells` maps each label to its markets (names or indices); the cells must be disjoint. By
370
+ default the donor pool is every market not assigned to any cell; pass
371
+ `shared_donors=[…]` to fix it explicitly. `lifts`, `methods`, `alpha`,
372
+ `target_power`, `recommended`, and `lookback` are forwarded to each cell's power
373
+ analysis. The report exposes `mc.cells[label]` (a full power report per cell) and
374
+ a combined holdout across all cells. Bigger cells get a smaller MDE; underpowered
375
+ cells are flagged so you can grow or merge them before spending.
376
+
377
+ ### What the design layer gives you
378
+
379
+ Multi-method power (SC/ASC/SDID plus a weighted-average **ensemble** and a
380
+ naive-DiD baseline), MDE in %/absolute/cumulative with CIs, an explicit 0–100
381
+ confidence score + one-line verdict, seasonality/stability/holdout guardrails with
382
+ plain-English warnings, a specification-tradeoff sweep, multi-cell designs,
383
+ **post-test evaluation** (`evaluate()`), and publication-clean figures out of the
384
+ box.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: panelkit
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Topic :: Scientific/Engineering
@@ -202,10 +202,10 @@ valid inference for each estimator.
202
202
 
203
203
  ## Geo test design (power analysis & market selection)
204
204
 
205
- `panelkit.design` is the planning layer in front of a geo experiment — a
206
- GeoLift-style toolkit, but multi-method and robustness-first, with the heavy
207
- simulation in Rust. It answers: **which markets should I treat, how big a lift
208
- can I detect, and can I trust this design?**
205
+ `panelkit.design` is the planning layer in front of a geo experiment —
206
+ multi-method and robustness-first, with the heavy simulation in Rust. It answers:
207
+ **which markets should I treat, how big a lift can I detect, can I trust this
208
+ design — and, once it's run, how big was the effect?**
209
209
 
210
210
  ```python
211
211
  from panelkit.design import GeoDesign
@@ -217,11 +217,27 @@ rep = design.power(treated=["chicago", "denver"], test_len=8, alpha=0.10)
217
217
  print(rep.summary()) # plain-English report: MDE, confidence, warnings
218
218
  rep.plot("design.png") # the figure below
219
219
 
220
+ # guardrails: is this design trustworthy? (pre-fit, seasonality, holdout, warnings)
221
+ guard = design.diagnose(treated=["chicago", "denver"], test_len=8)
222
+ print(guard.summary())
223
+ guard.plot("guardrails.png") # the guardrails figure below
224
+
220
225
  # let it pick the markets for you:
221
226
  ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3)
222
227
 
228
+ # run several disjoint treatment cells at once (each vs. a shared donor pool):
229
+ mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
230
+ "east": ["boston", "philadelphia"]}, test_len=8)
231
+ print(mc.summary()) # per-cell MDE / confidence / holdout
232
+ mc.plot("multicell.png") # the multi-cell figure below
233
+
234
+ # already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
235
+ ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
236
+ print(ev.summary()) # per-method + ensemble lift, CI, cumulative
237
+ ev.plot("evaluate.png") # observed vs counterfactual + lift-by-method
238
+
223
239
  # or sweep specifications (length × #geos × significance) and recommend one:
224
- grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[1, 2, 3, 4],
240
+ grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
225
241
  target_lift=0.05, alphas=[0.05, 0.10])
226
242
  print(grid.summary())
227
243
  grid.plot("tradeoffs.png") # the tradeoffs figure below
@@ -229,19 +245,41 @@ grid.plot("tradeoffs.png") # the tradeoffs figure below
229
245
 
230
246
  ![geo design report](assets/geo_design.png)
231
247
 
248
+ **Guardrails — can you trust the design?** `diagnose(...)` visualizes the
249
+ pre-period fit (treated vs synthetic control), seasonality, and holdout share, and
250
+ surfaces plain-language warnings when the design is risky:
251
+
252
+ ![guardrails](assets/geo_guardrails.png)
253
+
232
254
  **Recommendations across specifications.** `recommend(...)` sweeps test length ×
233
255
  number of geos × significance level (`alpha`) and points you at the cheapest
234
- design that still detects your target lift — with a figure of the tradeoffs:
256
+ design that still detects your target lift — with a readable figure of the
257
+ tradeoffs (MDE vs length per #geos, an intuitive heatmap, and alpha sensitivity):
235
258
 
236
259
  ![specification tradeoffs](assets/geo_scenarios.png)
237
260
 
261
+ **Multi-cell tests.** `multi_cell(...)` runs several disjoint treatment cells
262
+ simultaneously — each measured against a shared donor pool that excludes *every*
263
+ cell's treated markets, so cells never borrow each other as controls. You get a
264
+ per-cell MDE/confidence/holdout report and a combined figure:
265
+
266
+ ![multi-cell test](assets/geo_multicell.png)
267
+
268
+ **Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
269
+ the power analysis: fit SC / ASC / SDID on a test that already happened, blend
270
+ them into a weighted-average **ensemble** estimate, and report each one's lift,
271
+ confidence interval (stationary block bootstrap), and cumulative incremental —
272
+ with an SC in-space placebo p-value:
273
+
274
+ ![test evaluation](assets/geo_evaluate.png)
275
+
238
276
  **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
239
277
  strings → numeric (with a clear error on genuinely non-numeric values), dates
240
278
  (string or unsorted) → chronological columns, locations → market names, duplicate
241
279
  rows aggregated with a warning, and a clear error (with a count) if the panel is
242
280
  gappy. You don't pre-clean dtypes.
243
281
 
244
- What it does that GeoLift doesn't, out of the box:
282
+ What you get out of the box:
245
283
 
246
284
  - **Real-data power** — historical placebo with injected lift on your *actual*
247
285
  panel (not an assumed variance), across **SC, ASC, and SDID** with a
@@ -256,6 +294,12 @@ What it does that GeoLift doesn't, out of the box:
256
294
  go/no-go.
257
295
  - **Market selection** that searches candidate treatment sets and ranks them by
258
296
  power, MDE, fit, holdout, and confidence.
297
+ - **Multi-cell tests** — several disjoint treatment cells powered at once against
298
+ a shared donor pool, with a per-cell MDE/confidence report.
299
+ - **A weighted-average ensemble** of SC + ASC + SDID (combined per placebo window,
300
+ with auto inverse-variance weights) for a steadier estimate than any one method.
301
+ - **Post-test evaluation** — `evaluate()` measures a test that already ran:
302
+ per-method + ensemble lift, bootstrap CIs, cumulative incremental, and a p-value.
259
303
 
260
304
  See [`examples/geo_demo.py`](examples/geo_demo.py).
261
305
 
@@ -172,10 +172,10 @@ valid inference for each estimator.
172
172
 
173
173
  ## Geo test design (power analysis & market selection)
174
174
 
175
- `panelkit.design` is the planning layer in front of a geo experiment — a
176
- GeoLift-style toolkit, but multi-method and robustness-first, with the heavy
177
- simulation in Rust. It answers: **which markets should I treat, how big a lift
178
- can I detect, and can I trust this design?**
175
+ `panelkit.design` is the planning layer in front of a geo experiment —
176
+ multi-method and robustness-first, with the heavy simulation in Rust. It answers:
177
+ **which markets should I treat, how big a lift can I detect, can I trust this
178
+ design — and, once it's run, how big was the effect?**
179
179
 
180
180
  ```python
181
181
  from panelkit.design import GeoDesign
@@ -187,11 +187,27 @@ rep = design.power(treated=["chicago", "denver"], test_len=8, alpha=0.10)
187
187
  print(rep.summary()) # plain-English report: MDE, confidence, warnings
188
188
  rep.plot("design.png") # the figure below
189
189
 
190
+ # guardrails: is this design trustworthy? (pre-fit, seasonality, holdout, warnings)
191
+ guard = design.diagnose(treated=["chicago", "denver"], test_len=8)
192
+ print(guard.summary())
193
+ guard.plot("guardrails.png") # the guardrails figure below
194
+
190
195
  # let it pick the markets for you:
191
196
  ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3)
192
197
 
198
+ # run several disjoint treatment cells at once (each vs. a shared donor pool):
199
+ mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
200
+ "east": ["boston", "philadelphia"]}, test_len=8)
201
+ print(mc.summary()) # per-cell MDE / confidence / holdout
202
+ mc.plot("multicell.png") # the multi-cell figure below
203
+
204
+ # already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
205
+ ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
206
+ print(ev.summary()) # per-method + ensemble lift, CI, cumulative
207
+ ev.plot("evaluate.png") # observed vs counterfactual + lift-by-method
208
+
193
209
  # or sweep specifications (length × #geos × significance) and recommend one:
194
- grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[1, 2, 3, 4],
210
+ grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
195
211
  target_lift=0.05, alphas=[0.05, 0.10])
196
212
  print(grid.summary())
197
213
  grid.plot("tradeoffs.png") # the tradeoffs figure below
@@ -199,19 +215,41 @@ grid.plot("tradeoffs.png") # the tradeoffs figure below
199
215
 
200
216
  ![geo design report](assets/geo_design.png)
201
217
 
218
+ **Guardrails — can you trust the design?** `diagnose(...)` visualizes the
219
+ pre-period fit (treated vs synthetic control), seasonality, and holdout share, and
220
+ surfaces plain-language warnings when the design is risky:
221
+
222
+ ![guardrails](assets/geo_guardrails.png)
223
+
202
224
  **Recommendations across specifications.** `recommend(...)` sweeps test length ×
203
225
  number of geos × significance level (`alpha`) and points you at the cheapest
204
- design that still detects your target lift — with a figure of the tradeoffs:
226
+ design that still detects your target lift — with a readable figure of the
227
+ tradeoffs (MDE vs length per #geos, an intuitive heatmap, and alpha sensitivity):
205
228
 
206
229
  ![specification tradeoffs](assets/geo_scenarios.png)
207
230
 
231
+ **Multi-cell tests.** `multi_cell(...)` runs several disjoint treatment cells
232
+ simultaneously — each measured against a shared donor pool that excludes *every*
233
+ cell's treated markets, so cells never borrow each other as controls. You get a
234
+ per-cell MDE/confidence/holdout report and a combined figure:
235
+
236
+ ![multi-cell test](assets/geo_multicell.png)
237
+
238
+ **Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
239
+ the power analysis: fit SC / ASC / SDID on a test that already happened, blend
240
+ them into a weighted-average **ensemble** estimate, and report each one's lift,
241
+ confidence interval (stationary block bootstrap), and cumulative incremental —
242
+ with an SC in-space placebo p-value:
243
+
244
+ ![test evaluation](assets/geo_evaluate.png)
245
+
208
246
  **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
209
247
  strings → numeric (with a clear error on genuinely non-numeric values), dates
210
248
  (string or unsorted) → chronological columns, locations → market names, duplicate
211
249
  rows aggregated with a warning, and a clear error (with a count) if the panel is
212
250
  gappy. You don't pre-clean dtypes.
213
251
 
214
- What it does that GeoLift doesn't, out of the box:
252
+ What you get out of the box:
215
253
 
216
254
  - **Real-data power** — historical placebo with injected lift on your *actual*
217
255
  panel (not an assumed variance), across **SC, ASC, and SDID** with a
@@ -226,6 +264,12 @@ What it does that GeoLift doesn't, out of the box:
226
264
  go/no-go.
227
265
  - **Market selection** that searches candidate treatment sets and ranks them by
228
266
  power, MDE, fit, holdout, and confidence.
267
+ - **Multi-cell tests** — several disjoint treatment cells powered at once against
268
+ a shared donor pool, with a per-cell MDE/confidence report.
269
+ - **A weighted-average ensemble** of SC + ASC + SDID (combined per placebo window,
270
+ with auto inverse-variance weights) for a steadier estimate than any one method.
271
+ - **Post-test evaluation** — `evaluate()` measures a test that already ran:
272
+ per-method + ensemble lift, bootstrap CIs, cumulative incremental, and a p-value.
229
273
 
230
274
  See [`examples/geo_demo.py`](examples/geo_demo.py).
231
275
 
@@ -6,7 +6,7 @@ rust-version.workspace = true
6
6
  license.workspace = true
7
7
  authors.workspace = true
8
8
  repository.workspace = true
9
- description = "Geo-experiment design: power analysis, market selection, and real-world diagnostics for panelkit (GeoLift-style, but multi-method and robustness-first)."
9
+ description = "Geo-experiment design: multi-method power analysis, market selection, and real-world diagnostics for panelkit."
10
10
 
11
11
  [features]
12
12
  default = []
@@ -21,6 +21,6 @@ pub mod selection;
21
21
  pub mod types;
22
22
 
23
23
  pub use diagnostics::diagnostics;
24
- pub use power::power_curve;
24
+ pub use power::{power_curve, power_curve_ensemble};
25
25
  pub use selection::{evaluate, select_markets, MarketCandidate, SelectConfig};
26
26
  pub use types::{Diagnostics, Method, PowerPoint, PowerResult};
@@ -26,9 +26,39 @@ pub(crate) fn fit_method(panel: &Panel, t0: usize, method: Method) -> ScFit {
26
26
  Method::Sc => fit_sc_at(panel, t0, ScConfig::default()),
27
27
  Method::Asc => fit_asc_at(panel, t0, AscConfig::default()),
28
28
  Method::Sdid => fit_sdid_at(panel, t0, SdidConfig::default()),
29
+ Method::Ensemble => {
30
+ unreachable!("Ensemble is combined across methods, not a single fit")
31
+ }
29
32
  }
30
33
  }
31
34
 
35
/// Normalize three ensemble weights so they are nonnegative and sum to 1.
///
/// Each entry is first clamped at zero. Because `f64::max` returns the non-NaN
/// operand, a NaN entry is clamped to zero as well.
///
/// Returns:
/// - `c / Σc` when the clamped sum is finite and positive (the common case);
/// - equal weights `[1/3; 3]` when every clamped entry is zero (degenerate input);
/// - if one or more entries are `+∞`, those entries share the weight equally and
///   every finite entry gets 0 (the limit of the ratio), instead of `NaN`;
/// - if the entries are finite but their sum overflows, the weights are rescaled
///   by the largest entry before normalizing so the result stays exact.
fn normalize_weights(w: [f64; 3]) -> [f64; 3] {
    let c = w.map(|x| x.max(0.0));
    let s = c[0] + c[1] + c[2];

    if s.is_finite() {
        return if s > 0.0 {
            c.map(|x| x / s)
        } else {
            [1.0 / 3.0; 3]
        };
    }

    // From here on the sum is +inf (it cannot be NaN: every entry is >= 0).
    // Dividing by it directly would give inf/inf = NaN or collapse finite
    // entries to 0, so handle the two possible causes explicitly.
    let n_inf = c.iter().filter(|x| x.is_infinite()).count();
    if n_inf > 0 {
        let share = 1.0 / n_inf as f64;
        return c.map(|x| if x.is_infinite() { share } else { 0.0 });
    }

    // Finite entries whose sum overflowed: scale into [0, 1] first, so the
    // rescaled sum lies in [1, 3] and the division is well-behaved.
    let peak = c.iter().copied().fold(0.0_f64, f64::max);
    let scaled = c.map(|x| x / peak);
    let total = scaled[0] + scaled[1] + scaled[2];
    scaled.map(|x| x / total)
}
46
+
47
+ /// Inverse-variance ("precision") weights from each method's null variance:
48
+ /// a method with a tighter placebo distribution gets more weight. A small floor
49
+ /// (relative to the mean variance) keeps a near-perfect fit from taking all the
50
+ /// weight and avoids divide-by-zero.
51
+ fn inverse_variance_weights(var: [f64; 3]) -> [f64; 3] {
52
+ let mean = (var[0] + var[1] + var[2]) / 3.0;
53
+ let floor = 1e-6 * mean + f64::MIN_POSITIVE;
54
+ let prec = [
55
+ 1.0 / (var[0] + floor),
56
+ 1.0 / (var[1] + floor),
57
+ 1.0 / (var[2] + floor),
58
+ ];
59
+ normalize_weights(prec)
60
+ }
61
+
32
62
  /// Build the sub-panel on periods `[0, end)` with a multiplicative `lift` applied
33
63
  /// to the treated units over the test window `[s, end)`.
34
64
  fn injected_subpanel(y: &Mat, treated: &[usize], s: usize, end: usize, lift: f64) -> Panel {
@@ -103,6 +133,7 @@ pub fn power_curve(
103
133
  alpha: f64,
104
134
  target_power: f64,
105
135
  min_pre: usize,
136
+ lookback: Option<usize>,
106
137
  ) -> PowerResult {
107
138
  let t = y.cols();
108
139
  assert!(test_len >= 1 && test_len < t, "test_len out of range");
@@ -111,7 +142,18 @@ pub fn power_curve(
111
142
  first <= t - test_len,
112
143
  "not enough periods for the requested pre-window + test_len"
113
144
  );
114
- let starts: Vec<usize> = (first..=(t - test_len)).collect();
145
+ // Every valid sliding test-window start position is one historical placebo.
146
+ // We power over MANY of them (the count is `n_windows`). `lookback`, when set,
147
+ // keeps only the most-recent K windows: those are the most representative of
148
+ // the upcoming test (recent dynamics, longest pre-periods), at the cost of
149
+ // fewer placebo samples.
150
+ let mut starts: Vec<usize> = (first..=(t - test_len)).collect();
151
+ if let Some(k) = lookback {
152
+ let k = k.max(1);
153
+ if starts.len() > k {
154
+ starts = starts.split_off(starts.len() - k);
155
+ }
156
+ }
115
157
  let n_windows = starts.len();
116
158
  let (base_mean, base_sum) = treated_baseline(y, treated);
117
159
 
@@ -169,6 +211,124 @@ pub fn power_curve(
169
211
  }
170
212
  }
171
213
 
214
+ /// Power analysis for a **weighted-average ensemble** of SC + ASC + SDID.
215
+ ///
216
+ /// Each historical placebo window is fit with all three estimators and combined
217
+ /// into a single ATT, `Σ wₘ · ATTₘ`, *before* the null distribution and power are
218
+ /// computed — so this reports the power of the averaged estimator (which is
219
+ /// generally more stable than any single one), not the average of three powers.
220
+ ///
221
+ /// `weights` is `[w_sc, w_asc, w_sdid]`; `None` uses data-driven inverse-variance
222
+ /// weights from each method's historical-null spread. Returns the result plus the
223
+ /// (normalized) weights actually used.
224
+ #[allow(clippy::too_many_arguments)]
225
+ pub fn power_curve_ensemble(
226
+ y: &Mat,
227
+ treated: &[usize],
228
+ test_len: usize,
229
+ lifts: &[f64],
230
+ alpha: f64,
231
+ target_power: f64,
232
+ min_pre: usize,
233
+ lookback: Option<usize>,
234
+ weights: Option<[f64; 3]>,
235
+ ) -> (PowerResult, [f64; 3]) {
236
+ let t = y.cols();
237
+ assert!(test_len >= 1 && test_len < t, "test_len out of range");
238
+ let first = min_pre.max(1);
239
+ assert!(
240
+ first <= t - test_len,
241
+ "not enough periods for the requested pre-window + test_len"
242
+ );
243
+ let mut starts: Vec<usize> = (first..=(t - test_len)).collect();
244
+ if let Some(k) = lookback {
245
+ let k = k.max(1);
246
+ if starts.len() > k {
247
+ starts = starts.split_off(starts.len() - k);
248
+ }
249
+ }
250
+ let n_windows = starts.len();
251
+ let (base_mean, base_sum) = treated_baseline(y, treated);
252
+
253
+ // Per-window null ATTs for each of the three methods (one fit-set, reused for
254
+ // both weight estimation and the lift-0 power point).
255
+ let null_by_window: Vec<[f64; 3]> = par_map_items(starts.clone(), |s| {
256
+ let panel = injected_subpanel(y, treated, s, s + test_len, 0.0);
257
+ [
258
+ fit_method(&panel, s, Method::Sc).att,
259
+ fit_method(&panel, s, Method::Asc).att,
260
+ fit_method(&panel, s, Method::Sdid).att,
261
+ ]
262
+ });
263
+
264
+ let w = match weights {
265
+ Some(w) => normalize_weights(w),
266
+ None => {
267
+ let mut var = [0.0f64; 3];
268
+ for m in 0..3 {
269
+ let col: Vec<f64> = null_by_window.iter().map(|a| a[m]).collect();
270
+ let sd = std_dev(&col);
271
+ var[m] = sd * sd;
272
+ }
273
+ inverse_variance_weights(var)
274
+ }
275
+ };
276
+ let combine = |a: [f64; 3]| w[0] * a[0] + w[1] * a[1] + w[2] * a[2];
277
+
278
+ let null_atts: Vec<f64> = null_by_window.iter().map(|&a| combine(a)).collect();
279
+ let mut abs_null: Vec<f64> = null_atts.iter().map(|a| a.abs()).collect();
280
+ abs_null.sort_by(|a, b| a.partial_cmp(b).unwrap());
281
+ let crit = quantile(&abs_null, 1.0 - alpha);
282
+ let se_null = std_dev(&null_atts);
283
+
284
+ let mut points = Vec::with_capacity(lifts.len());
285
+ for &lift in lifts {
286
+ let atts: Vec<f64> = if lift == 0.0 {
287
+ null_atts.clone()
288
+ } else {
289
+ par_map_items(starts.clone(), |s| {
290
+ let panel = injected_subpanel(y, treated, s, s + test_len, lift);
291
+ combine([
292
+ fit_method(&panel, s, Method::Sc).att,
293
+ fit_method(&panel, s, Method::Asc).att,
294
+ fit_method(&panel, s, Method::Sdid).att,
295
+ ])
296
+ })
297
+ };
298
+ let power = atts.iter().filter(|a| a.abs() > crit).count() as f64 / n_windows as f64;
299
+ let mut est_pct: Vec<f64> = atts.iter().map(|a| a / base_mean).collect();
300
+ let mean_pct = est_pct.iter().sum::<f64>() / est_pct.len() as f64;
301
+ est_pct.sort_by(|a, b| a.partial_cmp(b).unwrap());
302
+ points.push(PowerPoint {
303
+ lift_pct: lift,
304
+ power,
305
+ est_pct_mean: mean_pct,
306
+ est_pct_lo: quantile(&est_pct, alpha / 2.0),
307
+ est_pct_hi: quantile(&est_pct, 1.0 - alpha / 2.0),
308
+ });
309
+ }
310
+
311
+ let mde_pct = mde_from_points(&points, target_power);
312
+ let (mde_abs_per_period, mde_cumulative) = match mde_pct {
313
+ Some(m) => (Some(m * base_mean), Some(m * base_sum * test_len as f64)),
314
+ None => (None, None),
315
+ };
316
+
317
+ (
318
+ PowerResult {
319
+ method: Method::Ensemble,
320
+ points,
321
+ mde_pct,
322
+ mde_abs_per_period,
323
+ mde_cumulative,
324
+ crit,
325
+ se_null,
326
+ n_windows,
327
+ },
328
+ w,
329
+ )
330
+ }
331
+
172
332
  /// Smallest lift with power ≥ `target`, interpolating between bracketing grid
173
333
  /// points. Assumes `points` are in ascending lift order.
174
334
  fn mde_from_points(points: &[PowerPoint], target: f64) -> Option<f64> {