panelkit 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. {panelkit-0.2.1 → panelkit-0.2.3}/Cargo.lock +5 -5
  2. {panelkit-0.2.1 → panelkit-0.2.3}/Cargo.toml +1 -1
  3. {panelkit-0.2.1 → panelkit-0.2.3}/GUIDE.md +103 -12
  4. {panelkit-0.2.1 → panelkit-0.2.3}/PKG-INFO +55 -6
  5. {panelkit-0.2.1 → panelkit-0.2.3}/README.md +54 -5
  6. {panelkit-0.2.1 → panelkit-0.2.3}/crates/geo/Cargo.toml +1 -1
  7. {panelkit-0.2.1 → panelkit-0.2.3}/crates/geo/src/lib.rs +1 -1
  8. {panelkit-0.2.1 → panelkit-0.2.3}/crates/geo/src/power.rs +151 -3
  9. {panelkit-0.2.1 → panelkit-0.2.3}/crates/geo/src/selection.rs +70 -22
  10. {panelkit-0.2.1 → panelkit-0.2.3}/crates/geo/src/types.rs +4 -0
  11. {panelkit-0.2.1 → panelkit-0.2.3}/crates/geo/tests/geo.rs +10 -0
  12. {panelkit-0.2.1 → panelkit-0.2.3}/crates/pypanelkit/src/api_geo.rs +77 -2
  13. {panelkit-0.2.1 → panelkit-0.2.3}/crates/pypanelkit/src/lib.rs +1 -0
  14. {panelkit-0.2.1 → panelkit-0.2.3}/crates/pypanelkit/src/results.rs +4 -0
  15. {panelkit-0.2.1 → panelkit-0.2.3}/pyproject.toml +1 -1
  16. {panelkit-0.2.1 → panelkit-0.2.3}/python/panelkit/_panelkit.pyi +13 -0
  17. {panelkit-0.2.1 → panelkit-0.2.3}/python/panelkit/design.py +799 -8
  18. {panelkit-0.2.1 → panelkit-0.2.3}/BENCHMARKS.md +0 -0
  19. {panelkit-0.2.1 → panelkit-0.2.3}/LICENSE-APACHE +0 -0
  20. {panelkit-0.2.1 → panelkit-0.2.3}/LICENSE-MIT +0 -0
  21. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/Cargo.toml +0 -0
  22. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/benches/estimators.rs +0 -0
  23. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/did/bacon.rs +0 -0
  24. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/did/callaway.rs +0 -0
  25. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/did/mod.rs +0 -0
  26. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/did/sunab.rs +0 -0
  27. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/did/twfe.rs +0 -0
  28. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/fe/mod.rs +0 -0
  29. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/fe/within.rs +0 -0
  30. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/lib.rs +0 -0
  31. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/mcnnm/mod.rs +0 -0
  32. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/mcnnm/softimpute.rs +0 -0
  33. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/panel.rs +0 -0
  34. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/result.rs +0 -0
  35. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/sc/augmented.rs +0 -0
  36. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/sc/cpasc.rs +0 -0
  37. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/sc/mod.rs +0 -0
  38. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/sc/sdid.rs +0 -0
  39. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/src/sc/synthetic.rs +0 -0
  40. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/tests/cpasc.rs +0 -0
  41. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/tests/did.rs +0 -0
  42. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/tests/sc.rs +0 -0
  43. {panelkit-0.2.1 → panelkit-0.2.3}/crates/estimators/tests/sc_family.rs +0 -0
  44. {panelkit-0.2.1 → panelkit-0.2.3}/crates/geo/src/diagnostics.rs +0 -0
  45. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/Cargo.toml +0 -0
  46. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/src/batch.rs +0 -0
  47. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/src/bootstrap.rs +0 -0
  48. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/src/ci.rs +0 -0
  49. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/src/lib.rs +0 -0
  50. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/src/parallel.rs +0 -0
  51. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/src/placebo.rs +0 -0
  52. {panelkit-0.2.1 → panelkit-0.2.3}/crates/inference/tests/inference.rs +0 -0
  53. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/Cargo.toml +0 -0
  54. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/error.rs +0 -0
  55. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/factor/cholesky.rs +0 -0
  56. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/factor/eig_sym.rs +0 -0
  57. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/factor/mod.rs +0 -0
  58. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/factor/qr.rs +0 -0
  59. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/factor/randomized.rs +0 -0
  60. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/factor/svd.rs +0 -0
  61. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/factor/svd_gram.rs +0 -0
  62. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/lib.rs +0 -0
  63. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/matrix.rs +0 -0
  64. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/ops/matmul.rs +0 -0
  65. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/ops/mod.rs +0 -0
  66. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/ops/norms.rs +0 -0
  67. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/ops/transform.rs +0 -0
  68. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/opt/mod.rs +0 -0
  69. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/opt/simplex.rs +0 -0
  70. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/opt/softthresh.rs +0 -0
  71. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/rng.rs +0 -0
  72. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/solve/lstsq.rs +0 -0
  73. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/solve/mod.rs +0 -0
  74. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/src/solve/spd.rs +0 -0
  75. {panelkit-0.2.1 → panelkit-0.2.3}/crates/linalg/tests/numerics.rs +0 -0
  76. {panelkit-0.2.1 → panelkit-0.2.3}/crates/pypanelkit/Cargo.toml +0 -0
  77. {panelkit-0.2.1 → panelkit-0.2.3}/crates/pypanelkit/src/api_did.rs +0 -0
  78. {panelkit-0.2.1 → panelkit-0.2.3}/crates/pypanelkit/src/api_sc.rs +0 -0
  79. {panelkit-0.2.1 → panelkit-0.2.3}/crates/pypanelkit/src/convert.rs +0 -0
  80. {panelkit-0.2.1 → panelkit-0.2.3}/python/panelkit/__init__.py +0 -0
  81. {panelkit-0.2.1 → panelkit-0.2.3}/python/panelkit/estimators.py +0 -0
  82. {panelkit-0.2.1 → panelkit-0.2.3}/python/panelkit/py.typed +0 -0
@@ -462,7 +462,7 @@ checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
462
462
 
463
463
  [[package]]
464
464
  name = "panelkit-estimators"
465
- version = "0.2.1"
465
+ version = "0.2.3"
466
466
  dependencies = [
467
467
  "criterion",
468
468
  "panelkit-linalg",
@@ -471,7 +471,7 @@ dependencies = [
471
471
 
472
472
  [[package]]
473
473
  name = "panelkit-geo"
474
- version = "0.2.1"
474
+ version = "0.2.3"
475
475
  dependencies = [
476
476
  "panelkit-estimators",
477
477
  "panelkit-inference",
@@ -482,7 +482,7 @@ dependencies = [
482
482
 
483
483
  [[package]]
484
484
  name = "panelkit-inference"
485
- version = "0.2.1"
485
+ version = "0.2.3"
486
486
  dependencies = [
487
487
  "panelkit-estimators",
488
488
  "panelkit-linalg",
@@ -491,7 +491,7 @@ dependencies = [
491
491
 
492
492
  [[package]]
493
493
  name = "panelkit-linalg"
494
- version = "0.2.1"
494
+ version = "0.2.3"
495
495
  dependencies = [
496
496
  "proptest",
497
497
  "rayon",
@@ -623,7 +623,7 @@ dependencies = [
623
623
 
624
624
  [[package]]
625
625
  name = "pypanelkit"
626
- version = "0.2.1"
626
+ version = "0.2.3"
627
627
  dependencies = [
628
628
  "numpy",
629
629
  "panelkit-estimators",
@@ -3,7 +3,7 @@ resolver = "2"
3
3
  members = ["crates/linalg", "crates/estimators", "crates/inference", "crates/geo", "crates/pypanelkit"]
4
4
 
5
5
  [workspace.package]
6
- version = "0.2.1"
6
+ version = "0.2.3"
7
7
  edition = "2021"
8
8
  rust-version = "1.74"
9
9
  license = "MIT OR Apache-2.0"
@@ -252,7 +252,20 @@ SC / ASC / SDID. Returns a report with:
252
252
 
253
253
  Key options: `alpha` (significance level, default 0.10), `target_power`
254
254
  (default 0.80), `lifts` (the % grid), `methods`, `recommended` (default SDID),
255
- `lookback`.
255
+ `lookback`, `ensemble`/`ensemble_weights`.
256
+
257
+ **The ENSEMBLE method (weighted average of SC + ASC + SDID).** By default
258
+ `power()` adds an `"ENSEMBLE"` result alongside the three base methods: a
259
+ weighted average of their ATTs, combined *within each placebo window* before the
260
+ null and power are computed. (That ordering matters — the power of the averaged
261
+ estimator is not the average of three powers; the blend is usually steadier than
262
+ any single method, so its MDE is often the smallest.) `ensemble_weights="auto"`
263
+ (default) uses **inverse-variance** weighting — each method weighted by the
264
+ precision of its historical-null distribution, so a noisier estimator counts for
265
+ less. Pass `"equal"`, a dict like `{"SC": 0.5, "ASC": 0.2, "SDID": 0.3}`, or a
266
+ `[w_sc, w_asc, w_sdid]` list to set them yourself; `ensemble=False` turns it off.
267
+ The weights used are printed in the report and stored on
268
+ `rep.results["ENSEMBLE"].ensemble_weights`.
256
269
 
257
270
  **How power is simulated (many placebos, not one).** For a treated set, the test
258
271
  window of length `test_len` is *slid across the whole history*: every valid start
@@ -262,17 +275,54 @@ power at lift τ is the share of windows whose injected effect clears that
262
275
  threshold. So the estimate is averaged over **many** placebos — `result.n_windows`
263
276
  reports how many.
264
277
 
265
- **Relationship to GeoLift's `lookback_window`.** GeoLift's lookback is exactly
266
- this idea — how many recent test-start points to simulate over. By default
267
- panelkit uses *all* available windows (more placebo samples → a more stable power
268
- estimate). Pass `lookback=k` to use only the **most-recent k** windows: those have
269
- the longest pre-periods and reflect current dynamics, so they're the most
278
+ **The `lookback` option — how far back to simulate.** By default panelkit powers
279
+ over *all* valid windows (more placebo samples → a more stable power estimate).
280
+ Pass `lookback=k` to use only the **most-recent k** windows: those have the
281
+ longest pre-periods and reflect current dynamics, so they're the most
270
282
  representative of the test you're about to run — at the cost of fewer samples (a
271
283
  noisier estimate). It matters when older history is unrepresentative (regime
272
284
  change, growth, format changes) or when early windows have very short pre-periods;
273
- use a `lookback` covering your relevant recent history (e.g. the last ~6–12
285
+ set `lookback` to cover your relevant recent history (e.g. the last ~6–12
274
286
  months of windows).
275
287
 
288
+ ### Evaluating a test that ran — `design.evaluate(treated, treat_start, …)`
289
+
290
+ `power()` *plans* a test; `evaluate()` *measures* one. Given the treated markets
291
+ and the period treatment began (`treat_start`, the first post-period column), it
292
+ fits SC / ASC / SDID, reports each one's realized effect, and blends them into a
293
+ weighted-average **ensemble** estimate.
294
+
295
+ ```python
296
+ ev = design.evaluate(treated=["chicago", "denver"], treat_start=52, level=0.90)
297
+ print(ev.summary()) # per-method + ensemble lift, CI, cumulative
298
+ ev.plot("evaluate.png") # observed-vs-cf, effect path (CI band), lift bar
299
+ ev.plot_effect_over_time("effect.png") # pointwise + cumulative over time, w/ CIs
300
+ ev.lift, ev.cumulative, ev.significant
301
+ ```
302
+
303
+ Each estimate gets a confidence interval from a **stationary block bootstrap** of
304
+ its post-period effect path; an **SC in-space placebo** supplies a p-value. The
305
+ ensemble uses the same `weights` choices as `power()` (`"auto"` = inverse-variance
306
+ from each method's bootstrap SE, `"equal"`, or an explicit dict/list). `ev` exposes
307
+ `.lift`, `.att`, `.cumulative`, `.significant`, the per-method results in `ev.per`,
308
+ and the ensemble in `ev.ensemble`. Reported numbers: **% lift** (effect ÷
309
+ counterfactual), **per-period ATT**, and **cumulative incremental** over the
310
+ window (summed across treated markets).
311
+
312
+ **Effect over time** (`ev.plot_effect_over_time(...)`) gives the event-study view:
313
+ the **pointwise** effect across the full timeline — *including the pre-period*, so
314
+ you can see it sits flat (centered on zero) inside the noise band before the test
315
+ starts (a placebo check) and breaks out after — and the running **cumulative
316
+ incremental**, each as a point estimate with a confidence band. The counterfactual
317
+ is centered on the pre-period, so the gap shows fit quality rather than a level
318
+ offset (SDID matches trends, not levels). The bands come from a **moving-block
319
+ bootstrap** of the pre-period residuals: resampling whole blocks preserves their
320
+ autocorrelation, so the intervals are more conservative than an iid normal
321
+ approximation — the cumulative band in particular widens faster than √k when the
322
+ residuals are positively autocorrelated. Raise `block_len` to capture longer-range
323
+ dependence (wider, more conservative cumulative bands). Pass `exclude=[…]` to drop
324
+ markets from the control pool (e.g. ones you don't trust as donors).
325
+
276
326
  ### Choosing a specification — `design.recommend(test_lengths, n_geos_options, target_lift, alphas=…)`
277
327
 
278
328
  Sweeps designs across **test length × number of geos × alpha** and recommends the
@@ -310,9 +360,50 @@ Searches candidate treatment-market sets and ranks them by power, MDE, pre-fit,
310
360
  holdout, and confidence. Pass `eligible=[…]` to restrict to markets you can
311
361
  actually run in.
312
362
 
313
- ### What it adds over GeoLift
363
+ Two real-world controls for *which* markets the search may use:
364
+
365
+ - **`include=[…]`** — force specific markets into **every** candidate treatment
366
+ set (must-treat markets, e.g. a flagship region you've already committed to).
367
+ The search fills the remaining slots from `eligible`, up to `max_treated`.
368
+ - **`exclude=[…]`** — drop markets **entirely**: they're never treated *and*
369
+ never used as a donor/control (e.g. a market with contaminated data or its own
370
+ concurrent campaign). `exclude` is also accepted by `power()`, `diagnose()`,
371
+ `evaluate()`, and `recommend()` to keep a market out of the control pool.
372
+
373
+ ### Multi-cell tests — `design.multi_cell(cells, test_len, …)`
374
+
375
+ Often you run several treatment cells at once — different creatives, budgets, or
376
+ messages across disjoint groups of markets — and want each cell's lift measured
377
+ separately. The subtlety is the control pool: a market that's treated in one cell
378
+ can't be a clean control for another. `multi_cell` handles this by powering each
379
+ cell against a **shared donor pool that excludes every cell's treated markets**.
380
+
381
+ ```python
382
+ mc = design.multi_cell(
383
+ cells={
384
+ "West": ["los_angeles", "san_diego"],
385
+ "Midwest": ["chicago", "detroit"],
386
+ "Northeast": ["boston", "philadelphia"],
387
+ },
388
+ test_len=8, alpha=0.10,
389
+ )
390
+ print(mc.summary()) # per-cell MDE / confidence / holdout + combined holdout
391
+ mc.plot("multicell.png") # per-cell power curves + an MDE-by-cell bar
392
+ ```
314
393
 
315
- Multi-method (SC/ASC/SDID, not just augmented SCM), MDE in %/absolute/cumulative
316
- with CIs, an explicit confidence score + verdict, seasonality/stability/holdout
317
- guardrails with plain-English warnings, a specification-tradeoff sweep, and
318
- publication-clean figures out of the box.
394
+ `cells` maps a label to its markets (names or indices) and must be disjoint. By
395
+ default the donor pool is every market not assigned to any cell; pass
396
+ `shared_donors=[…]` to fix it explicitly. `lifts`, `methods`, `alpha`,
397
+ `target_power`, `recommended`, and `lookback` are forwarded to each cell's power
398
+ analysis. The report exposes `mc.cells[label]` (a full power report per cell) and
399
+ a combined holdout across all cells. Bigger cells get a smaller MDE; underpowered
400
+ cells are flagged so you can grow or merge them before spending.
401
+
402
+ ### What the design layer gives you
403
+
404
+ Multi-method power (SC/ASC/SDID plus a weighted-average **ensemble** and a
405
+ naive-DiD baseline), MDE in %/absolute/cumulative with CIs, an explicit 0–100
406
+ confidence score + one-line verdict, seasonality/stability/holdout guardrails with
407
+ plain-English warnings, a specification-tradeoff sweep, multi-cell designs,
408
+ **post-test evaluation** (`evaluate()`), and publication-clean figures out of the
409
+ box.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: panelkit
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Topic :: Scientific/Engineering
@@ -202,10 +202,10 @@ valid inference for each estimator.
202
202
 
203
203
  ## Geo test design (power analysis & market selection)
204
204
 
205
- `panelkit.design` is the planning layer in front of a geo experiment — a
206
- GeoLift-style toolkit, but multi-method and robustness-first, with the heavy
207
- simulation in Rust. It answers: **which markets should I treat, how big a lift
208
- can I detect, and can I trust this design?**
205
+ `panelkit.design` is the planning layer in front of a geo experiment —
206
+ multi-method and robustness-first, with the heavy simulation in Rust. It answers:
207
+ **which markets should I treat, how big a lift can I detect, can I trust this
208
+ design and, once it's run, how big was the effect?**
209
209
 
210
210
  ```python
211
211
  from panelkit.design import GeoDesign
@@ -225,6 +225,22 @@ guard.plot("guardrails.png") # the guardrails figure below
225
225
  # let it pick the markets for you:
226
226
  ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3)
227
227
 
228
+ # run several disjoint treatment cells at once (each vs. a shared donor pool):
229
+ mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
230
+ "east": ["boston", "philadelphia"]}, test_len=8)
231
+ print(mc.summary()) # per-cell MDE / confidence / holdout
232
+ mc.plot("multicell.png") # the multi-cell figure below
233
+
234
+ # pin in must-have markets, drop ones you don't trust:
235
+ ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3,
236
+ include=["chicago"], exclude=["miami"])
237
+
238
+ # already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
239
+ ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
240
+ print(ev.summary()) # per-method + ensemble lift, CI, cumulative
241
+ ev.plot("evaluate.png") # observed vs counterfactual + lift-by-method
242
+ ev.plot_effect_over_time("effect.png") # pointwise + cumulative over time, w/ CIs
243
+
228
244
  # or sweep specifications (length × #geos × significance) and recommend one:
229
245
  grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
230
246
  target_lift=0.05, alphas=[0.05, 0.10])
@@ -247,13 +263,40 @@ tradeoffs (MDE vs length per #geos, an intuitive heatmap, and alpha sensitivity)
247
263
 
248
264
  ![specification tradeoffs](assets/geo_scenarios.png)
249
265
 
266
+ **Multi-cell tests.** `multi_cell(...)` runs several disjoint treatment cells
267
+ simultaneously — each measured against a shared donor pool that excludes *every*
268
+ cell's treated markets, so cells never borrow each other as controls. You get a
269
+ per-cell MDE/confidence/holdout report and a combined figure:
270
+
271
+ ![multi-cell test](assets/geo_multicell.png)
272
+
273
+ **Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
274
+ the power analysis: fit SC / ASC / SDID on a test that already happened, blend
275
+ them into a weighted-average **ensemble** estimate, and report each one's lift,
276
+ confidence interval (stationary block bootstrap), and cumulative incremental —
277
+ with an SC in-space placebo p-value:
278
+
279
+ ![test evaluation](assets/geo_evaluate.png)
280
+
281
+ And the **effect over time** — the pointwise effect across the full timeline
282
+ (pre-period included, so you can see it sit flat in the noise band before the test
283
+ and break out after) plus the running cumulative incremental, each as a point
284
+ estimate with a confidence band:
285
+
286
+ ![effect over time](assets/geo_effect_over_time.png)
287
+
288
+ **Pin in / drop markets.** `select_markets`/`recommend` take `include=[…]`
289
+ (force must-treat markets into every candidate) and `exclude=[…]` (drop markets
290
+ entirely — never treated, never a control). `exclude` is also accepted by
291
+ `power`, `diagnose`, and `evaluate` to keep a market out of the donor pool.
292
+
250
293
  **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
251
294
  strings → numeric (with a clear error on genuinely non-numeric values), dates
252
295
  (string or unsorted) → chronological columns, locations → market names, duplicate
253
296
  rows aggregated with a warning, and a clear error (with a count) if the panel is
254
297
  gappy. You don't pre-clean dtypes.
255
298
 
256
- What it does that GeoLift doesn't, out of the box:
299
+ What you get out of the box:
257
300
 
258
301
  - **Real-data power** — historical placebo with injected lift on your *actual*
259
302
  panel (not an assumed variance), across **SC, ASC, and SDID** with a
@@ -268,6 +311,12 @@ What it does that GeoLift doesn't, out of the box:
268
311
  go/no-go.
269
312
  - **Market selection** that searches candidate treatment sets and ranks them by
270
313
  power, MDE, fit, holdout, and confidence.
314
+ - **Multi-cell tests** — several disjoint treatment cells powered at once against
315
+ a shared donor pool, with a per-cell MDE/confidence report.
316
+ - **A weighted-average ensemble** of SC + ASC + SDID (combined per placebo window,
317
+ with auto inverse-variance weights) for a steadier estimate than any one method.
318
+ - **Post-test evaluation** — `evaluate()` measures a test that already ran:
319
+ per-method + ensemble lift, bootstrap CIs, cumulative incremental, and a p-value.
271
320
 
272
321
  See [`examples/geo_demo.py`](examples/geo_demo.py).
273
322
 
@@ -172,10 +172,10 @@ valid inference for each estimator.
172
172
 
173
173
  ## Geo test design (power analysis & market selection)
174
174
 
175
- `panelkit.design` is the planning layer in front of a geo experiment — a
176
- GeoLift-style toolkit, but multi-method and robustness-first, with the heavy
177
- simulation in Rust. It answers: **which markets should I treat, how big a lift
178
- can I detect, and can I trust this design?**
175
+ `panelkit.design` is the planning layer in front of a geo experiment —
176
+ multi-method and robustness-first, with the heavy simulation in Rust. It answers:
177
+ **which markets should I treat, how big a lift can I detect, can I trust this
178
+ design and, once it's run, how big was the effect?**
179
179
 
180
180
  ```python
181
181
  from panelkit.design import GeoDesign
@@ -195,6 +195,22 @@ guard.plot("guardrails.png") # the guardrails figure below
195
195
  # let it pick the markets for you:
196
196
  ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3)
197
197
 
198
+ # run several disjoint treatment cells at once (each vs. a shared donor pool):
199
+ mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
200
+ "east": ["boston", "philadelphia"]}, test_len=8)
201
+ print(mc.summary()) # per-cell MDE / confidence / holdout
202
+ mc.plot("multicell.png") # the multi-cell figure below
203
+
204
+ # pin in must-have markets, drop ones you don't trust:
205
+ ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3,
206
+ include=["chicago"], exclude=["miami"])
207
+
208
+ # already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
209
+ ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
210
+ print(ev.summary()) # per-method + ensemble lift, CI, cumulative
211
+ ev.plot("evaluate.png") # observed vs counterfactual + lift-by-method
212
+ ev.plot_effect_over_time("effect.png") # pointwise + cumulative over time, w/ CIs
213
+
198
214
  # or sweep specifications (length × #geos × significance) and recommend one:
199
215
  grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
200
216
  target_lift=0.05, alphas=[0.05, 0.10])
@@ -217,13 +233,40 @@ tradeoffs (MDE vs length per #geos, an intuitive heatmap, and alpha sensitivity)
217
233
 
218
234
  ![specification tradeoffs](assets/geo_scenarios.png)
219
235
 
236
+ **Multi-cell tests.** `multi_cell(...)` runs several disjoint treatment cells
237
+ simultaneously — each measured against a shared donor pool that excludes *every*
238
+ cell's treated markets, so cells never borrow each other as controls. You get a
239
+ per-cell MDE/confidence/holdout report and a combined figure:
240
+
241
+ ![multi-cell test](assets/geo_multicell.png)
242
+
243
+ **Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
244
+ the power analysis: fit SC / ASC / SDID on a test that already happened, blend
245
+ them into a weighted-average **ensemble** estimate, and report each one's lift,
246
+ confidence interval (stationary block bootstrap), and cumulative incremental —
247
+ with an SC in-space placebo p-value:
248
+
249
+ ![test evaluation](assets/geo_evaluate.png)
250
+
251
+ And the **effect over time** — the pointwise effect across the full timeline
252
+ (pre-period included, so you can see it sit flat in the noise band before the test
253
+ and break out after) plus the running cumulative incremental, each as a point
254
+ estimate with a confidence band:
255
+
256
+ ![effect over time](assets/geo_effect_over_time.png)
257
+
258
+ **Pin in / drop markets.** `select_markets`/`recommend` take `include=[…]`
259
+ (force must-treat markets into every candidate) and `exclude=[…]` (drop markets
260
+ entirely — never treated, never a control). `exclude` is also accepted by
261
+ `power`, `diagnose`, and `evaluate` to keep a market out of the donor pool.
262
+
220
263
  **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
221
264
  strings → numeric (with a clear error on genuinely non-numeric values), dates
222
265
  (string or unsorted) → chronological columns, locations → market names, duplicate
223
266
  rows aggregated with a warning, and a clear error (with a count) if the panel is
224
267
  gappy. You don't pre-clean dtypes.
225
268
 
226
- What it does that GeoLift doesn't, out of the box:
269
+ What you get out of the box:
227
270
 
228
271
  - **Real-data power** — historical placebo with injected lift on your *actual*
229
272
  panel (not an assumed variance), across **SC, ASC, and SDID** with a
@@ -238,6 +281,12 @@ What it does that GeoLift doesn't, out of the box:
238
281
  go/no-go.
239
282
  - **Market selection** that searches candidate treatment sets and ranks them by
240
283
  power, MDE, fit, holdout, and confidence.
284
+ - **Multi-cell tests** — several disjoint treatment cells powered at once against
285
+ a shared donor pool, with a per-cell MDE/confidence report.
286
+ - **A weighted-average ensemble** of SC + ASC + SDID (combined per placebo window,
287
+ with auto inverse-variance weights) for a steadier estimate than any one method.
288
+ - **Post-test evaluation** — `evaluate()` measures a test that already ran:
289
+ per-method + ensemble lift, bootstrap CIs, cumulative incremental, and a p-value.
241
290
 
242
291
  See [`examples/geo_demo.py`](examples/geo_demo.py).
243
292
 
@@ -6,7 +6,7 @@ rust-version.workspace = true
6
6
  license.workspace = true
7
7
  authors.workspace = true
8
8
  repository.workspace = true
9
- description = "Geo-experiment design: power analysis, market selection, and real-world diagnostics for panelkit (GeoLift-style, but multi-method and robustness-first)."
9
+ description = "Geo-experiment design: multi-method power analysis, market selection, and real-world diagnostics for panelkit."
10
10
 
11
11
  [features]
12
12
  default = []
@@ -21,6 +21,6 @@ pub mod selection;
21
21
  pub mod types;
22
22
 
23
23
  pub use diagnostics::diagnostics;
24
- pub use power::power_curve;
24
+ pub use power::{power_curve, power_curve_ensemble};
25
25
  pub use selection::{evaluate, select_markets, MarketCandidate, SelectConfig};
26
26
  pub use types::{Diagnostics, Method, PowerPoint, PowerResult};
@@ -26,9 +26,39 @@ pub(crate) fn fit_method(panel: &Panel, t0: usize, method: Method) -> ScFit {
26
26
  Method::Sc => fit_sc_at(panel, t0, ScConfig::default()),
27
27
  Method::Asc => fit_asc_at(panel, t0, AscConfig::default()),
28
28
  Method::Sdid => fit_sdid_at(panel, t0, SdidConfig::default()),
29
+ Method::Ensemble => {
30
+ unreachable!("Ensemble is combined across methods, not a single fit")
31
+ }
29
32
  }
30
33
  }
31
34
 
35
+ /// Normalize three (clamped-nonnegative) weights to sum to 1. Falls back to
36
+ /// equal weights if the inputs are degenerate (all ≤ 0).
37
+ fn normalize_weights(w: [f64; 3]) -> [f64; 3] {
38
+ let c = [w[0].max(0.0), w[1].max(0.0), w[2].max(0.0)];
39
+ let s = c[0] + c[1] + c[2];
40
+ if s > 0.0 {
41
+ [c[0] / s, c[1] / s, c[2] / s]
42
+ } else {
43
+ [1.0 / 3.0; 3]
44
+ }
45
+ }
46
+
47
+ /// Inverse-variance ("precision") weights from each method's null variance:
48
+ /// a method with a tighter placebo distribution gets more weight. A small floor
49
+ /// (relative to the mean variance) keeps a near-perfect fit from taking all the
50
+ /// weight and avoids divide-by-zero.
51
+ fn inverse_variance_weights(var: [f64; 3]) -> [f64; 3] {
52
+ let mean = (var[0] + var[1] + var[2]) / 3.0;
53
+ let floor = 1e-6 * mean + f64::MIN_POSITIVE;
54
+ let prec = [
55
+ 1.0 / (var[0] + floor),
56
+ 1.0 / (var[1] + floor),
57
+ 1.0 / (var[2] + floor),
58
+ ];
59
+ normalize_weights(prec)
60
+ }
61
+
32
62
  /// Build the sub-panel on periods `[0, end)` with a multiplicative `lift` applied
33
63
  /// to the treated units over the test window `[s, end)`.
34
64
  fn injected_subpanel(y: &Mat, treated: &[usize], s: usize, end: usize, lift: f64) -> Panel {
@@ -114,9 +144,9 @@ pub fn power_curve(
114
144
  );
115
145
  // Every valid sliding test-window start position is one historical placebo.
116
146
  // We power over MANY of them (the count is `n_windows`). `lookback`, when set,
117
- // keeps only the most-recent K windows — GeoLift's "lookback_window": those
118
- // are the most representative of the upcoming test (recent dynamics, longest
119
- // pre-periods), at the cost of fewer placebo samples.
147
+ // keeps only the most-recent K windows: those are the most representative of
148
+ // the upcoming test (recent dynamics, longest pre-periods), at the cost of
149
+ // fewer placebo samples.
120
150
  let mut starts: Vec<usize> = (first..=(t - test_len)).collect();
121
151
  if let Some(k) = lookback {
122
152
  let k = k.max(1);
@@ -181,6 +211,124 @@ pub fn power_curve(
181
211
  }
182
212
  }
183
213
 
214
+ /// Power analysis for a **weighted-average ensemble** of SC + ASC + SDID.
215
+ ///
216
+ /// Each historical placebo window is fit with all three estimators and combined
217
+ /// into a single ATT, `Σ wₘ · ATTₘ`, *before* the null distribution and power are
218
+ /// computed — so this reports the power of the averaged estimator (which is
219
+ /// generally more stable than any single one), not the average of three powers.
220
+ ///
221
+ /// `weights` is `[w_sc, w_asc, w_sdid]`; `None` uses data-driven inverse-variance
222
+ /// weights from each method's historical-null spread. Returns the result plus the
223
+ /// (normalized) weights actually used.
224
+ #[allow(clippy::too_many_arguments)]
225
+ pub fn power_curve_ensemble(
226
+ y: &Mat,
227
+ treated: &[usize],
228
+ test_len: usize,
229
+ lifts: &[f64],
230
+ alpha: f64,
231
+ target_power: f64,
232
+ min_pre: usize,
233
+ lookback: Option<usize>,
234
+ weights: Option<[f64; 3]>,
235
+ ) -> (PowerResult, [f64; 3]) {
236
+ let t = y.cols();
237
+ assert!(test_len >= 1 && test_len < t, "test_len out of range");
238
+ let first = min_pre.max(1);
239
+ assert!(
240
+ first <= t - test_len,
241
+ "not enough periods for the requested pre-window + test_len"
242
+ );
243
+ let mut starts: Vec<usize> = (first..=(t - test_len)).collect();
244
+ if let Some(k) = lookback {
245
+ let k = k.max(1);
246
+ if starts.len() > k {
247
+ starts = starts.split_off(starts.len() - k);
248
+ }
249
+ }
250
+ let n_windows = starts.len();
251
+ let (base_mean, base_sum) = treated_baseline(y, treated);
252
+
253
+ // Per-window null ATTs for each of the three methods (one fit-set, reused for
254
+ // both weight estimation and the lift-0 power point).
255
+ let null_by_window: Vec<[f64; 3]> = par_map_items(starts.clone(), |s| {
256
+ let panel = injected_subpanel(y, treated, s, s + test_len, 0.0);
257
+ [
258
+ fit_method(&panel, s, Method::Sc).att,
259
+ fit_method(&panel, s, Method::Asc).att,
260
+ fit_method(&panel, s, Method::Sdid).att,
261
+ ]
262
+ });
263
+
264
+ let w = match weights {
265
+ Some(w) => normalize_weights(w),
266
+ None => {
267
+ let mut var = [0.0f64; 3];
268
+ for m in 0..3 {
269
+ let col: Vec<f64> = null_by_window.iter().map(|a| a[m]).collect();
270
+ let sd = std_dev(&col);
271
+ var[m] = sd * sd;
272
+ }
273
+ inverse_variance_weights(var)
274
+ }
275
+ };
276
+ let combine = |a: [f64; 3]| w[0] * a[0] + w[1] * a[1] + w[2] * a[2];
277
+
278
+ let null_atts: Vec<f64> = null_by_window.iter().map(|&a| combine(a)).collect();
279
+ let mut abs_null: Vec<f64> = null_atts.iter().map(|a| a.abs()).collect();
280
+ abs_null.sort_by(|a, b| a.partial_cmp(b).unwrap());
281
+ let crit = quantile(&abs_null, 1.0 - alpha);
282
+ let se_null = std_dev(&null_atts);
283
+
284
+ let mut points = Vec::with_capacity(lifts.len());
285
+ for &lift in lifts {
286
+ let atts: Vec<f64> = if lift == 0.0 {
287
+ null_atts.clone()
288
+ } else {
289
+ par_map_items(starts.clone(), |s| {
290
+ let panel = injected_subpanel(y, treated, s, s + test_len, lift);
291
+ combine([
292
+ fit_method(&panel, s, Method::Sc).att,
293
+ fit_method(&panel, s, Method::Asc).att,
294
+ fit_method(&panel, s, Method::Sdid).att,
295
+ ])
296
+ })
297
+ };
298
+ let power = atts.iter().filter(|a| a.abs() > crit).count() as f64 / n_windows as f64;
299
+ let mut est_pct: Vec<f64> = atts.iter().map(|a| a / base_mean).collect();
300
+ let mean_pct = est_pct.iter().sum::<f64>() / est_pct.len() as f64;
301
+ est_pct.sort_by(|a, b| a.partial_cmp(b).unwrap());
302
+ points.push(PowerPoint {
303
+ lift_pct: lift,
304
+ power,
305
+ est_pct_mean: mean_pct,
306
+ est_pct_lo: quantile(&est_pct, alpha / 2.0),
307
+ est_pct_hi: quantile(&est_pct, 1.0 - alpha / 2.0),
308
+ });
309
+ }
310
+
311
+ let mde_pct = mde_from_points(&points, target_power);
312
+ let (mde_abs_per_period, mde_cumulative) = match mde_pct {
313
+ Some(m) => (Some(m * base_mean), Some(m * base_sum * test_len as f64)),
314
+ None => (None, None),
315
+ };
316
+
317
+ (
318
+ PowerResult {
319
+ method: Method::Ensemble,
320
+ points,
321
+ mde_pct,
322
+ mde_abs_per_period,
323
+ mde_cumulative,
324
+ crit,
325
+ se_null,
326
+ n_windows,
327
+ },
328
+ w,
329
+ )
330
+ }
331
+
184
332
  /// Smallest lift with power ≥ `target`, interpolating between bracketing grid
185
333
  /// points. Assumes `points` are in ascending lift order.
186
334
  fn mde_from_points(points: &[PowerPoint], target: f64) -> Option<f64> {