PyPI - panelkit - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

panelkit 0.2.0.tar.gz → 0.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

{panelkit-0.2.0 → panelkit-0.2.2}/Cargo.lock RENAMED Viewed

@@ -462,7 +462,7 @@ checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
 [[package]]
 name = "panelkit-estimators"
-version = "0.2.0"
+version = "0.2.2"
 dependencies = [
  "criterion",
  "panelkit-linalg",
@@ -471,7 +471,7 @@ dependencies = [
 [[package]]
 name = "panelkit-geo"
-version = "0.2.0"
+version = "0.2.2"
 dependencies = [
  "panelkit-estimators",
  "panelkit-inference",
@@ -482,7 +482,7 @@ dependencies = [
 [[package]]
 name = "panelkit-inference"
-version = "0.2.0"
+version = "0.2.2"
 dependencies = [
  "panelkit-estimators",
  "panelkit-linalg",
@@ -491,7 +491,7 @@ dependencies = [
 [[package]]
 name = "panelkit-linalg"
-version = "0.2.0"
+version = "0.2.2"
 dependencies = [
  "proptest",
  "rayon",
@@ -623,7 +623,7 @@ dependencies = [
 [[package]]
 name = "pypanelkit"
-version = "0.2.0"
+version = "0.2.2"
 dependencies = [
  "numpy",
  "panelkit-estimators",

{panelkit-0.2.0 → panelkit-0.2.2}/Cargo.toml RENAMED Viewed

@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/linalg", "crates/estimators", "crates/inference", "crates/geo", "crates/pypanelkit"]
 [workspace.package]
-version = "0.2.0"
+version = "0.2.2"
 edition = "2021"
 rust-version = "1.74"
 license = "MIT OR Apache-2.0"

{panelkit-0.2.0 → panelkit-0.2.2}/GUIDE.md RENAMED Viewed

@@ -251,16 +251,93 @@ SC / ASC / SDID. Returns a report with:
   estimate-accuracy CI, design-quality bars).
 Key options: `alpha` (significance level, default 0.10), `target_power`
-(default 0.80), `lifts` (the % grid), `methods`, `recommended` (default SDID).
+(default 0.80), `lifts` (the % grid), `methods`, `recommended` (default SDID),
+`lookback`, `ensemble`/`ensemble_weights`.
+**The ENSEMBLE method (weighted average of SC + ASC + SDID).** By default
+`power()` adds an `"ENSEMBLE"` result alongside the three base methods: a
+weighted average of their ATTs, combined *within each placebo window* before the
+null and power are computed. (That ordering matters — the power of the averaged
+estimator is not the average of three powers; the blend is usually steadier than
+any single method, so its MDE is often the smallest.) `ensemble_weights="auto"`
+(default) uses **inverse-variance** weighting — each method weighted by the
+precision of its historical-null distribution, so a noisier estimator counts for
+less. Pass `"equal"`, a dict like `{"SC": 0.5, "ASC": 0.2, "SDID": 0.3}`, or a
+`[w_sc, w_asc, w_sdid]` list to set them yourself; `ensemble=False` turns it off.
+The weights used are printed in the report and stored on
+`rep.results["ENSEMBLE"].ensemble_weights`.
+**How power is simulated (many placebos, not one).** For a treated set, the test
+window of length `test_len` is *slid across the whole history*: every valid start
+position is one placebo experiment. The detection threshold (critical |ATT|)
+comes from those same windows with **no** injected lift (the historical null), and
+power at lift τ is the share of windows whose injected effect clears that
+threshold. So the estimate is averaged over **many** placebos — `result.n_windows`
+reports how many.
+**The `lookback` option — how far back to simulate.** By default panelkit powers
+over *all* valid windows (more placebo samples → a more stable power estimate).
+Pass `lookback=k` to use only the **most-recent k** windows: those have the
+longest pre-periods and reflect current dynamics, so they're the most
+representative of the test you're about to run — at the cost of fewer samples (a
+noisier estimate). It matters when older history is unrepresentative (regime
+change, growth, format changes) or when early windows have very short pre-periods;
+set `lookback` to cover your relevant recent history (e.g. the last ~6–12
+months of windows).
+### Evaluating a test that ran — `design.evaluate(treated, treat_start, …)`
+`power()` *plans* a test; `evaluate()` *measures* one. Given the treated markets
+and the period treatment began (`treat_start`, the first post-period column), it
+fits SC / ASC / SDID, reports each one's realized effect, and blends them into a
+weighted-average **ensemble** estimate.
+```python
+ev = design.evaluate(treated=["chicago", "denver"], treat_start=52, level=0.90)
+print(ev.summary())          # per-method + ensemble lift, CI, cumulative
+ev.plot("evaluate.png")      # observed-vs-counterfactual, effect path, lift bar
+ev.lift, ev.cumulative, ev.significant
+```
+Each estimate gets a confidence interval from a **stationary block bootstrap** of
+its post-period effect path; an **SC in-space placebo** supplies a p-value. The
+ensemble uses the same `weights` choices as `power()` (`"auto"` = inverse-variance
+from each method's bootstrap SE, `"equal"`, or an explicit dict/list). `ev` exposes
+`.lift`, `.att`, `.cumulative`, `.significant`, the per-method results in `ev.per`,
+and the ensemble in `ev.ensemble`. Reported numbers: **% lift** (effect ÷
+counterfactual), **per-period ATT**, and **cumulative incremental** over the
+window (summed across treated markets).
 ### Choosing a specification — `design.recommend(test_lengths, n_geos_options, target_lift, alphas=…)`
 Sweeps designs across **test length × number of geos × alpha** and recommends the
 best (smallest MDE among trustworthy designs, ties broken toward shorter/cheaper).
 `grid.summary()` prints the recommendation + alternatives; `grid.plot(path)`
-renders the **tradeoffs figure** (MDE vs length per #geos, an MDE heatmap over
-length × #geos, and alpha sensitivity). Use it to find the "knee" — the cheapest
-design that still detects your target lift.
+renders the **tradeoffs figure**. Use it to find the "knee" — the cheapest design
+that still detects your target lift.
+**Reading the tradeoffs figure:**
+- **Top panel** — minimum detectable lift (%) vs test length, one line per number
+  of treated geos. *Lower is better.* The red band marks lifts you *can't*
+  detect; lines below your target lift are viable designs. More geos and longer
+  tests pull the line down (more signal), but cost more holdout/time — pick the
+  knee where the curve flattens.
+- **Bottom-left heatmap** — the same MDE across every (test length × #geos) cell,
+  green = small detectable lift (good), red = large (bad), grey = underpowered.
+- **Bottom-right** — with multiple alphas, how the MDE of the recommended design
+  moves with the significance level (looser α → smaller MDE, more false
+  positives); with one alpha, design confidence by spec.
+- The black ★ marks the recommended design.
+### Guardrails — `design.diagnose(treated, test_len)`
+Before trusting a design, check it. `diagnose` returns a report with
+`.summary()` and `.plot(path)` (the **guardrails figure**): the pre-period fit
+(treated vs synthetic control, so you can *see* whether the counterfactual
+tracks), a seasonality ACF, the holdout share against a healthy band, and a
+banner listing any plain-language warnings (weak fit, volatile markets, strong
+seasonality vs short history, tiny/huge holdout, too few donors). It also exposes
+`.confidence`, `.holdout_pct`, and `.warnings`.
 ### Picking markets — `design.select_markets(test_len, target_lift, max_treated, …)`
@@ -268,9 +345,40 @@ Searches candidate treatment-market sets and ranks them by power, MDE, pre-fit,
 holdout, and confidence. Pass `eligible=[…]` to restrict to markets you can
 actually run in.
-### What it adds over GeoLift
+### Multi-cell tests — `design.multi_cell(cells, test_len, …)`
+Often you run several treatment cells at once — different creatives, budgets, or
+messages across disjoint groups of markets — and want each cell's lift measured
+separately. The subtlety is the control pool: a market that's treated in one cell
+can't be a clean control for another. `multi_cell` handles this by powering each
+cell against a **shared donor pool that excludes every cell's treated markets**.
+```python
+mc = design.multi_cell(
+    cells={
+        "West":      ["los_angeles", "san_diego"],
+        "Midwest":   ["chicago", "detroit"],
+        "Northeast": ["boston", "philadelphia"],
+    },
+    test_len=8, alpha=0.10,
+)
+print(mc.summary())          # per-cell MDE / confidence / holdout + combined holdout
+mc.plot("multicell.png")     # per-cell power curves + an MDE-by-cell bar
+```
-Multi-method (SC/ASC/SDID, not just augmented SCM), MDE in %/absolute/cumulative
-with CIs, an explicit confidence score + verdict, seasonality/stability/holdout
-guardrails with plain-English warnings, a specification-tradeoff sweep, and
-publication-clean figures out of the box.
+`cells` maps each label to its markets (names or indices); the market sets must be disjoint. By
+default the donor pool is every market not assigned to any cell; pass
+`shared_donors=[…]` to fix it explicitly. `lifts`, `methods`, `alpha`,
+`target_power`, `recommended`, and `lookback` are forwarded to each cell's power
+analysis. The report exposes `mc.cells[label]` (a full power report per cell) and
+a combined holdout across all cells. Bigger cells get a smaller MDE; underpowered
+cells are flagged so you can grow or merge them before spending.
+### What the design layer gives you
+Multi-method power (SC/ASC/SDID plus a weighted-average **ensemble** and a
+naive-DiD baseline), MDE in %/absolute/cumulative with CIs, an explicit 0–100
+confidence score + one-line verdict, seasonality/stability/holdout guardrails with
+plain-English warnings, a specification-tradeoff sweep, multi-cell designs,
+**post-test evaluation** (`evaluate()`), and publication-clean figures out of the
+box.

{panelkit-0.2.0 → panelkit-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: panelkit
-Version: 0.2.0
+Version: 0.2.2
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Scientific/Engineering
@@ -202,10 +202,10 @@ valid inference for each estimator.
 ## Geo test design (power analysis & market selection)
-`panelkit.design` is the planning layer in front of a geo experiment — a
-GeoLift-style toolkit, but multi-method and robustness-first, with the heavy
-simulation in Rust. It answers: **which markets should I treat, how big a lift
-can I detect, and can I trust this design?**
+`panelkit.design` is the planning layer in front of a geo experiment —
+multi-method and robustness-first, with the heavy simulation in Rust. It answers:
+**which markets should I treat, how big a lift can I detect, can I trust this
+design — and, once it's run, how big was the effect?**
 ```python
 from panelkit.design import GeoDesign
@@ -217,11 +217,27 @@ rep = design.power(treated=["chicago", "denver"], test_len=8, alpha=0.10)
 print(rep.summary())          # plain-English report: MDE, confidence, warnings
 rep.plot("design.png")        # the figure below
+# guardrails: is this design trustworthy? (pre-fit, seasonality, holdout, warnings)
+guard = design.diagnose(treated=["chicago", "denver"], test_len=8)
+print(guard.summary())
+guard.plot("guardrails.png")  # the guardrails figure below
 # let it pick the markets for you:
 ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3)
+# run several disjoint treatment cells at once (each vs. a shared donor pool):
+mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
+                              "east": ["boston", "philadelphia"]}, test_len=8)
+print(mc.summary())           # per-cell MDE / confidence / holdout
+mc.plot("multicell.png")      # the multi-cell figure below
+# already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
+ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
+print(ev.summary())           # per-method + ensemble lift, CI, cumulative
+ev.plot("evaluate.png")       # observed vs counterfactual + lift-by-method
 # or sweep specifications (length × #geos × significance) and recommend one:
-grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[1, 2, 3, 4],
+grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
                         target_lift=0.05, alphas=[0.05, 0.10])
 print(grid.summary())
 grid.plot("tradeoffs.png")    # the tradeoffs figure below
@@ -229,19 +245,41 @@ grid.plot("tradeoffs.png")    # the tradeoffs figure below
 ![geo design report](assets/geo_design.png)
+**Guardrails — can you trust the design?** `diagnose(...)` visualizes the
+pre-period fit (treated vs synthetic control), seasonality, and holdout share,
+and surfaces plain-language warnings when the design is risky:
+![guardrails](assets/geo_guardrails.png)
 **Recommendations across specifications.** `recommend(...)` sweeps test length ×
 number of geos × significance level (`alpha`) and points you at the cheapest
-design that still detects your target lift — with a figure of the tradeoffs:
+design that still detects your target lift — with a readable figure of the
+tradeoffs (MDE vs length per #geos, an intuitive heatmap, and alpha sensitivity):
 ![specification tradeoffs](assets/geo_scenarios.png)
+**Multi-cell tests.** `multi_cell(...)` runs several disjoint treatment cells
+simultaneously — each measured against a shared donor pool that excludes *every*
+cell's treated markets, so cells never borrow each other as controls. You get a
+per-cell MDE/confidence/holdout report and a combined figure:
+![multi-cell test](assets/geo_multicell.png)
+**Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
+the power analysis: fit SC / ASC / SDID on a test that already happened, blend
+them into a weighted-average **ensemble** estimate, and report each one's lift,
+confidence interval (stationary block bootstrap), and cumulative incremental —
+with an SC in-space placebo p-value:
+![test evaluation](assets/geo_evaluate.png)
 **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
 strings → numeric (with a clear error on genuinely non-numeric values), dates
 (string or unsorted) → chronological columns, locations → market names, duplicate
 rows aggregated with a warning, and a clear error (with a count) if the panel is
 gappy. You don't pre-clean dtypes.
-What it does that GeoLift doesn't, out of the box:
+What you get out of the box:
 - **Real-data power** — historical placebo with injected lift on your *actual*
   panel (not an assumed variance), across **SC, ASC, and SDID** with a
@@ -256,6 +294,12 @@ What it does that GeoLift doesn't, out of the box:
   go/no-go.
 - **Market selection** that searches candidate treatment sets and ranks them by
   power, MDE, fit, holdout, and confidence.
+- **Multi-cell tests** — several disjoint treatment cells powered at once against
+  a shared donor pool, with a per-cell MDE/confidence report.
+- **A weighted-average ensemble** of SC + ASC + SDID (combined per placebo window,
+  with auto inverse-variance weights) for a steadier estimate than any one method.
+- **Post-test evaluation** — `evaluate()` measures a test that already ran:
+  per-method + ensemble lift, bootstrap CIs, cumulative incremental, and a p-value.
 See [`examples/geo_demo.py`](examples/geo_demo.py).

{panelkit-0.2.0 → panelkit-0.2.2}/README.md RENAMED Viewed

@@ -172,10 +172,10 @@ valid inference for each estimator.
 ## Geo test design (power analysis & market selection)
-`panelkit.design` is the planning layer in front of a geo experiment — a
-GeoLift-style toolkit, but multi-method and robustness-first, with the heavy
-simulation in Rust. It answers: **which markets should I treat, how big a lift
-can I detect, and can I trust this design?**
+`panelkit.design` is the planning layer in front of a geo experiment —
+multi-method and robustness-first, with the heavy simulation in Rust. It answers:
+**which markets should I treat, how big a lift can I detect, can I trust this
+design — and, once it's run, how big was the effect?**
 ```python
 from panelkit.design import GeoDesign
@@ -187,11 +187,27 @@ rep = design.power(treated=["chicago", "denver"], test_len=8, alpha=0.10)
 print(rep.summary())          # plain-English report: MDE, confidence, warnings
 rep.plot("design.png")        # the figure below
+# guardrails: is this design trustworthy? (pre-fit, seasonality, holdout, warnings)
+guard = design.diagnose(treated=["chicago", "denver"], test_len=8)
+print(guard.summary())
+guard.plot("guardrails.png")  # the guardrails figure below
 # let it pick the markets for you:
 ranked = design.select_markets(test_len=8, target_lift=0.05, max_treated=3)
+# run several disjoint treatment cells at once (each vs. a shared donor pool):
+mc = design.multi_cell(cells={"west": ["los_angeles", "san_diego"],
+                              "east": ["boston", "philadelphia"]}, test_len=8)
+print(mc.summary())           # per-cell MDE / confidence / holdout
+mc.plot("multicell.png")      # the multi-cell figure below
+# already ran the test? measure it (SC/ASC/SDID + a weighted-average ensemble):
+ev = design.evaluate(treated=["chicago", "denver"], treat_start=52)
+print(ev.summary())           # per-method + ensemble lift, CI, cumulative
+ev.plot("evaluate.png")       # observed vs counterfactual + lift-by-method
 # or sweep specifications (length × #geos × significance) and recommend one:
-grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[1, 2, 3, 4],
+grid = design.recommend(test_lengths=[4, 6, 8, 12], n_geos_options=[3, 5, 10, 20],
                         target_lift=0.05, alphas=[0.05, 0.10])
 print(grid.summary())
 grid.plot("tradeoffs.png")    # the tradeoffs figure below
@@ -199,19 +215,41 @@ grid.plot("tradeoffs.png")    # the tradeoffs figure below
 ![geo design report](assets/geo_design.png)
+**Guardrails — can you trust the design?** `diagnose(...)` visualizes the
+pre-period fit (treated vs synthetic control), seasonality, and holdout share,
+and surfaces plain-language warnings when the design is risky:
+![guardrails](assets/geo_guardrails.png)
 **Recommendations across specifications.** `recommend(...)` sweeps test length ×
 number of geos × significance level (`alpha`) and points you at the cheapest
-design that still detects your target lift — with a figure of the tradeoffs:
+design that still detects your target lift — with a readable figure of the
+tradeoffs (MDE vs length per #geos, an intuitive heatmap, and alpha sensitivity):
 ![specification tradeoffs](assets/geo_scenarios.png)
+**Multi-cell tests.** `multi_cell(...)` runs several disjoint treatment cells
+simultaneously — each measured against a shared donor pool that excludes *every*
+cell's treated markets, so cells never borrow each other as controls. You get a
+per-cell MDE/confidence/holdout report and a combined figure:
+![multi-cell test](assets/geo_multicell.png)
+**Evaluate a test that ran.** `evaluate(...)` is the measurement counterpart to
+the power analysis: fit SC / ASC / SDID on a test that already happened, blend
+them into a weighted-average **ensemble** estimate, and report each one's lift,
+confidence interval (stationary block bootstrap), and cumulative incremental —
+with an SC in-space placebo p-value:
+![test evaluation](assets/geo_evaluate.png)
 **Messy DataFrame? No problem.** `from_long` coerces real-world data: outcome
 strings → numeric (with a clear error on genuinely non-numeric values), dates
 (string or unsorted) → chronological columns, locations → market names, duplicate
 rows aggregated with a warning, and a clear error (with a count) if the panel is
 gappy. You don't pre-clean dtypes.
-What it does that GeoLift doesn't, out of the box:
+What you get out of the box:
 - **Real-data power** — historical placebo with injected lift on your *actual*
   panel (not an assumed variance), across **SC, ASC, and SDID** with a
@@ -226,6 +264,12 @@ What it does that GeoLift doesn't, out of the box:
   go/no-go.
 - **Market selection** that searches candidate treatment sets and ranks them by
   power, MDE, fit, holdout, and confidence.
+- **Multi-cell tests** — several disjoint treatment cells powered at once against
+  a shared donor pool, with a per-cell MDE/confidence report.
+- **A weighted-average ensemble** of SC + ASC + SDID (combined per placebo window,
+  with auto inverse-variance weights) for a steadier estimate than any one method.
+- **Post-test evaluation** — `evaluate()` measures a test that already ran:
+  per-method + ensemble lift, bootstrap CIs, cumulative incremental, and a p-value.
 See [`examples/geo_demo.py`](examples/geo_demo.py).

{panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/Cargo.toml RENAMED Viewed

@@ -6,7 +6,7 @@ rust-version.workspace = true
 license.workspace = true
 authors.workspace = true
 repository.workspace = true
-description = "Geo-experiment design: power analysis, market selection, and real-world diagnostics for panelkit (GeoLift-style, but multi-method and robustness-first)."
+description = "Geo-experiment design: multi-method power analysis, market selection, and real-world diagnostics for panelkit."
 [features]
 default = []

{panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/src/lib.rs RENAMED Viewed

@@ -21,6 +21,6 @@ pub mod selection;
 pub mod types;
 pub use diagnostics::diagnostics;
-pub use power::power_curve;
+pub use power::{power_curve, power_curve_ensemble};
 pub use selection::{evaluate, select_markets, MarketCandidate, SelectConfig};
 pub use types::{Diagnostics, Method, PowerPoint, PowerResult};

{panelkit-0.2.0 → panelkit-0.2.2}/crates/geo/src/power.rs RENAMED Viewed

@@ -26,9 +26,39 @@ pub(crate) fn fit_method(panel: &Panel, t0: usize, method: Method) -> ScFit {
         Method::Sc => fit_sc_at(panel, t0, ScConfig::default()),
         Method::Asc => fit_asc_at(panel, t0, AscConfig::default()),
         Method::Sdid => fit_sdid_at(panel, t0, SdidConfig::default()),
+        Method::Ensemble => {
+            unreachable!("Ensemble is combined across methods, not a single fit")
+        }
     }
 }
+/// Normalize three weights to sum to 1 after clamping each to ≥ 0 (negative or NaN inputs become 0).
+/// Falls back to equal weights `[1/3; 3]` when the clamped sum is not > 0 (no positive input).
+fn normalize_weights(w: [f64; 3]) -> [f64; 3] {
+    let c = [w[0].max(0.0), w[1].max(0.0), w[2].max(0.0)]; // `f64::max` yields 0.0 for a NaN input
+    let s = c[0] + c[1] + c[2]; // can reach +inf for huge/infinite weights; not guarded here
+    if s > 0.0 {
+        [c[0] / s, c[1] / s, c[2] / s]
+    } else {
+        [1.0 / 3.0; 3]
+    }
+}
+/// Inverse-variance ("precision") weights from each method's null variance:
+/// a method with a tighter placebo distribution gets more weight. A small floor
+/// (relative to the mean variance) keeps a near-perfect fit from taking all the
+/// weight and avoids divide-by-zero. Returns weights normalized to sum to 1.
+fn inverse_variance_weights(var: [f64; 3]) -> [f64; 3] {
+    let mean = (var[0] + var[1] + var[2]) / 3.0;
+    let floor = 1e-6 * mean + f64::MIN_POSITIVE; // MIN_POSITIVE keeps floor > 0 when every variance is 0
+    let prec = [
+        1.0 / (var[0] + floor),
+        1.0 / (var[1] + floor),
+        1.0 / (var[2] + floor),
+    ];
+    normalize_weights(prec) // rescales to sum to 1; equal weights if no precision is positive
+}
 /// Build the sub-panel on periods `[0, end)` with a multiplicative `lift` applied
 /// to the treated units over the test window `[s, end)`.
 fn injected_subpanel(y: &Mat, treated: &[usize], s: usize, end: usize, lift: f64) -> Panel {
@@ -103,6 +133,7 @@ pub fn power_curve(
     alpha: f64,
     target_power: f64,
     min_pre: usize,
+    lookback: Option<usize>,
 ) -> PowerResult {
     let t = y.cols();
     assert!(test_len >= 1 && test_len < t, "test_len out of range");
@@ -111,7 +142,18 @@ pub fn power_curve(
         first <= t - test_len,
         "not enough periods for the requested pre-window + test_len"
     );
-    let starts: Vec<usize> = (first..=(t - test_len)).collect();
+    // Every valid sliding test-window start position is one historical placebo.
+    // We power over MANY of them (the count is `n_windows`). `lookback`, when set,
+    // keeps only the most-recent K windows: those are the most representative of
+    // the upcoming test (recent dynamics, longest pre-periods), at the cost of
+    // fewer placebo samples.
+    let mut starts: Vec<usize> = (first..=(t - test_len)).collect();
+    if let Some(k) = lookback {
+        let k = k.max(1);
+        if starts.len() > k {
+            starts = starts.split_off(starts.len() - k);
+        }
+    }
     let n_windows = starts.len();
     let (base_mean, base_sum) = treated_baseline(y, treated);
@@ -169,6 +211,124 @@ pub fn power_curve(
     }
 }
+/// Power analysis for a **weighted-average ensemble** of SC + ASC + SDID.
+///
+/// Each historical placebo window is fit with all three estimators and combined
+/// into a single ATT, `Σ wₘ · ATTₘ`, *before* the null distribution and power are
+/// computed — so this reports the power of the averaged estimator (which is
+/// generally more stable than any single one), not the average of three powers.
+///
+/// `weights` is `[w_sc, w_asc, w_sdid]`; `None` uses data-driven inverse-variance
+/// weights from each method's historical-null spread. Returns the result plus the
+/// (normalized) weights actually used. Panics if the `test_len` / `min_pre` asserts below fail.
+#[allow(clippy::too_many_arguments)]
+pub fn power_curve_ensemble(
+    y: &Mat,
+    treated: &[usize],
+    test_len: usize,
+    lifts: &[f64],
+    alpha: f64,
+    target_power: f64,
+    min_pre: usize,
+    lookback: Option<usize>,
+    weights: Option<[f64; 3]>,
+) -> (PowerResult, [f64; 3]) {
+    let t = y.cols();
+    assert!(test_len >= 1 && test_len < t, "test_len out of range");
+    let first = min_pre.max(1); // window starts are ≥ 1, so periods [0, s) are never empty
+    assert!(
+        first <= t - test_len,
+        "not enough periods for the requested pre-window + test_len"
+    );
+    let mut starts: Vec<usize> = (first..=(t - test_len)).collect();
+    if let Some(k) = lookback {
+        let k = k.max(1); // a lookback of 0 is treated as 1
+        if starts.len() > k {
+            starts = starts.split_off(starts.len() - k); // keep only the last k starts
+        }
+    }
+    let n_windows = starts.len(); // ≥ 1: the assert above guarantees at least one start
+    let (base_mean, base_sum) = treated_baseline(y, treated);
+    // Per-window null ATTs for each of the three methods (one fit-set, reused for
+    // both weight estimation and the lift-0 power point).
+    let null_by_window: Vec<[f64; 3]> = par_map_items(starts.clone(), |s| {
+        let panel = injected_subpanel(y, treated, s, s + test_len, 0.0);
+        [
+            fit_method(&panel, s, Method::Sc).att,
+            fit_method(&panel, s, Method::Asc).att,
+            fit_method(&panel, s, Method::Sdid).att,
+        ]
+    });
+    let w = match weights {
+        Some(w) => normalize_weights(w),
+        None => {
+            let mut var = [0.0f64; 3];
+            for m in 0..3 { // m indexes the [Sc, Asc, Sdid] columns built above
+                let col: Vec<f64> = null_by_window.iter().map(|a| a[m]).collect();
+                let sd = std_dev(&col);
+                var[m] = sd * sd;
+            }
+            inverse_variance_weights(var)
+        }
+    };
+    let combine = |a: [f64; 3]| w[0] * a[0] + w[1] * a[1] + w[2] * a[2]; // Σ wₘ · ATTₘ
+    let null_atts: Vec<f64> = null_by_window.iter().map(|&a| combine(a)).collect();
+    let mut abs_null: Vec<f64> = null_atts.iter().map(|a| a.abs()).collect();
+    abs_null.sort_by(|a, b| a.partial_cmp(b).unwrap()); // panics if a NaN is compared
+    let crit = quantile(&abs_null, 1.0 - alpha); // detection threshold on |ATT|
+    let se_null = std_dev(&null_atts); // computed on the signed null ATTs, not |ATT|
+    let mut points = Vec::with_capacity(lifts.len());
+    for &lift in lifts {
+        let atts: Vec<f64> = if lift == 0.0 { // exact zero reuses the null fits instead of refitting
+            null_atts.clone()
+        } else {
+            par_map_items(starts.clone(), |s| {
+                let panel = injected_subpanel(y, treated, s, s + test_len, lift);
+                combine([
+                    fit_method(&panel, s, Method::Sc).att,
+                    fit_method(&panel, s, Method::Asc).att,
+                    fit_method(&panel, s, Method::Sdid).att,
+                ])
+            })
+        };
+        let power = atts.iter().filter(|a| a.abs() > crit).count() as f64 / n_windows as f64;
+        let mut est_pct: Vec<f64> = atts.iter().map(|a| a / base_mean).collect();
+        let mean_pct = est_pct.iter().sum::<f64>() / est_pct.len() as f64;
+        est_pct.sort_by(|a, b| a.partial_cmp(b).unwrap()); // panics if a NaN is compared
+        points.push(PowerPoint {
+            lift_pct: lift,
+            power,
+            est_pct_mean: mean_pct,
+            est_pct_lo: quantile(&est_pct, alpha / 2.0),
+            est_pct_hi: quantile(&est_pct, 1.0 - alpha / 2.0),
+        });
+    }
+    let mde_pct = mde_from_points(&points, target_power);
+    let (mde_abs_per_period, mde_cumulative) = match mde_pct {
+        Some(m) => (Some(m * base_mean), Some(m * base_sum * test_len as f64)),
+        None => (None, None),
+    };
+    (
+        PowerResult {
+            method: Method::Ensemble,
+            points,
+            mde_pct,
+            mde_abs_per_period,
+            mde_cumulative,
+            crit,
+            se_null,
+            n_windows,
+        },
+        w,
+    )
+}
 /// Smallest lift with power ≥ `target`, interpolating between bracketing grid
 /// points. Assumes `points` are in ascending lift order.
 fn mde_from_points(points: &[PowerPoint], target: f64) -> Option<f64> {

panelkit 0.2.0__tar.gz → 0.2.2__tar.gz

panelkit 0.2.0.tar.gz → 0.2.2.tar.gz