abtestwise 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abtestwise/__init__.py +14 -0
- abtestwise/bayesian.py +77 -0
- abtestwise/binary.py +143 -0
- abtestwise/frequentist.py +48 -0
- abtestwise/plotting.py +143 -0
- abtestwise/result.py +209 -0
- abtestwise/validation.py +80 -0
- abtestwise-0.1.0.dist-info/METADATA +131 -0
- abtestwise-0.1.0.dist-info/RECORD +10 -0
- abtestwise-0.1.0.dist-info/WHEEL +4 -0
abtestwise/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""abtestwise: a lightweight toolkit for binary A/B experiment analysis.
|
|
2
|
+
|
|
3
|
+
Version 0.1 combines frequentist and Bayesian summaries for binary proportions
|
|
4
|
+
using aggregate count data.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from .binary import BinaryABTest
|
|
10
|
+
from .result import BinaryABResult
|
|
11
|
+
|
|
12
|
+
__version__ = "0.1.0"
|
|
13
|
+
|
|
14
|
+
__all__ = ["BinaryABTest", "BinaryABResult", "__version__"]
|
abtestwise/bayesian.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Bayesian analysis: beta-binomial posterior simulation.
|
|
2
|
+
|
|
3
|
+
For a binary metric with a Beta prior, the posterior for each group's true
|
|
4
|
+
success rate is conjugate and also Beta:
|
|
5
|
+
|
|
6
|
+
posterior = Beta(prior_alpha + successes, prior_beta + failures)
|
|
7
|
+
|
|
8
|
+
We draw samples from each group's posterior, form the lift distribution
|
|
9
|
+
(Treatment B - Control A), and summarize it using NumPy operations.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def posterior_samples(
    successes: int,
    total: int,
    prior_alpha: float,
    prior_beta: float,
    n_simulations: int,
    rng: np.random.Generator,
) -> np.ndarray:
    """Draw posterior samples of a group's true success rate from its Beta posterior.

    The posterior parameters are ``prior_alpha + successes`` and
    ``prior_beta + failures``, where ``failures = total - successes``.
    ``n_simulations`` values are drawn from that Beta distribution using the
    supplied generator ``rng``, so the draws advance ``rng``'s state.

    Returns the array produced by ``rng.beta(..., size=n_simulations)``.
    """
    failures = total - successes
    alpha = prior_alpha + successes
    beta = prior_beta + failures
    return rng.beta(alpha, beta, size=n_simulations)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def simulate_lift_samples(
    control_successes: int,
    control_total: int,
    treatment_successes: int,
    treatment_total: int,
    prior_alpha: float,
    prior_beta: float,
    n_simulations: int,
    rng: np.random.Generator,
) -> np.ndarray:
    """Simulate the posterior lift, defined as treatment rate minus control rate.

    Both arms use the same prior parameters and the same generator ``rng``.
    The result is the element-wise difference of the two sample arrays.
    """
    arm_counts = (
        (control_successes, control_total),
        (treatment_successes, treatment_total),
    )
    # Control is sampled before treatment; with a shared generator this order
    # determines which random draws each arm receives.
    control_draws, treatment_draws = [
        posterior_samples(hits, trials, prior_alpha, prior_beta, n_simulations, rng)
        for hits, trials in arm_counts
    ]
    return treatment_draws - control_draws
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def credible_interval_bounds(
    lift_samples: np.ndarray, credible_interval: float
) -> tuple[float, float]:
    """Return the equal-tailed credible interval of the lift samples.

    The probability mass outside the interval, ``1 - credible_interval``, is
    split evenly between the two tails. A level of 0.95 therefore yields the
    2.5th and 97.5th percentiles of ``lift_samples``.
    """
    lower_quantile = (1.0 - credible_interval) / 2.0
    upper_quantile = 1.0 - lower_quantile
    return (
        float(np.quantile(lift_samples, lower_quantile)),
        float(np.quantile(lift_samples, upper_quantile)),
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def expected_loss_treatment(lift_samples: np.ndarray) -> float:
    """Average shortfall incurred by picking treatment: mean(max(-lift, 0))."""
    # Only samples with negative lift contribute; the rest are floored at zero.
    shortfall = np.maximum(-lift_samples, 0.0)
    return float(np.mean(shortfall))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def expected_loss_control(lift_samples: np.ndarray) -> float:
    """Average shortfall incurred by picking control: mean(max(lift, 0))."""
    # Only samples with positive lift contribute; the rest are floored at zero.
    forgone_gain = np.maximum(lift_samples, 0.0)
    return float(np.mean(forgone_gain))
|
abtestwise/binary.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Run binary A/B tests from aggregate count data.
|
|
2
|
+
|
|
3
|
+
This module validates inputs, sets up the random number generator so results are
|
|
4
|
+
reproducible from a seed, and combines the frequentist and Bayesian helpers to
|
|
5
|
+
build a :class:`BinaryABResult`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
from . import bayesian, frequentist, validation
|
|
16
|
+
from .result import BinaryABResult
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class BinaryABTest:
    """Immutable description of a two-arm binary experiment given as counts.

    Use :meth:`from_counts` to build an instance: it validates every argument.
    Calling the dataclass constructor directly performs no validation.
    """

    control_successes: int
    control_total: int
    treatment_successes: int
    treatment_total: int
    prior_alpha: float
    prior_beta: float
    n_simulations: int
    credible_interval: float
    seed: int | None

    @classmethod
    def from_counts(
        cls,
        control_successes: int,
        control_total: int,
        treatment_successes: int,
        treatment_total: int,
        *,
        prior_alpha: float = 1.0,
        prior_beta: float = 1.0,
        n_simulations: int = 100_000,
        credible_interval: float = 0.95,
        seed: int | None = None,
    ) -> "BinaryABTest":
        """Validate the inputs and build a test from aggregate counts.

        The four counts may be passed positionally; every other setting is
        keyword-only. The default prior is Beta(1, 1). The prior parameters
        are stored as ``float``.
        """
        # The checks run in a fixed order, so the first invalid argument in
        # this sequence is the one that gets reported.
        named_checks = (
            (validation.validate_count, "control_successes", control_successes),
            (validation.validate_total, "control_total", control_total),
            (validation.validate_count, "treatment_successes", treatment_successes),
            (validation.validate_total, "treatment_total", treatment_total),
        )
        for check, field_name, value in named_checks:
            check(field_name, value)

        arm_pairs = (
            ("control_successes", control_successes, "control_total", control_total),
            (
                "treatment_successes",
                treatment_successes,
                "treatment_total",
                treatment_total,
            ),
        )
        for pair in arm_pairs:
            validation.validate_successes_le_total(*pair)

        for prior_name, prior_value in (
            ("prior_alpha", prior_alpha),
            ("prior_beta", prior_beta),
        ):
            validation.validate_prior(prior_name, prior_value)

        validation.validate_n_simulations(n_simulations)
        validation.validate_credible_interval(credible_interval)
        validation.validate_seed(seed)

        return cls(
            control_successes=control_successes,
            control_total=control_total,
            treatment_successes=treatment_successes,
            treatment_total=treatment_total,
            prior_alpha=float(prior_alpha),
            prior_beta=float(prior_beta),
            n_simulations=n_simulations,
            credible_interval=credible_interval,
            seed=seed,
        )

    def run(self) -> BinaryABResult:
        """Compute observed, frequentist and Bayesian summaries of the test.

        A new generator is created from ``self.seed`` on every call.
        """
        counts = (
            self.control_successes,
            self.control_total,
            self.treatment_successes,
            self.treatment_total,
        )

        rate_a = self.control_successes / self.control_total
        rate_b = self.treatment_successes / self.treatment_total
        lift = rate_b - rate_a

        # A zero control rate leaves the relative lift undefined, hence NaN.
        if rate_a != 0:
            relative = lift / rate_a
        else:
            relative = math.nan

        # Frequentist part: pooled two-proportion z-test.
        z_statistic, p_value = frequentist.two_proportion_z_test(*counts)

        # Bayesian part: simulate the posterior lift distribution.
        generator = np.random.default_rng(self.seed)
        draws = bayesian.simulate_lift_samples(
            *counts,
            self.prior_alpha,
            self.prior_beta,
            self.n_simulations,
            generator,
        )
        lower, upper = bayesian.credible_interval_bounds(
            draws, self.credible_interval
        )

        # Echo the inputs back on the result next to the computed values.
        echoed_inputs = {
            "control_successes": self.control_successes,
            "control_total": self.control_total,
            "treatment_successes": self.treatment_successes,
            "treatment_total": self.treatment_total,
            "prior_alpha": self.prior_alpha,
            "prior_beta": self.prior_beta,
            "n_simulations": self.n_simulations,
            "credible_interval": self.credible_interval,
            "seed": self.seed,
        }
        return BinaryABResult(
            **echoed_inputs,
            control_rate=rate_a,
            treatment_rate=rate_b,
            absolute_lift=lift,
            relative_lift=relative,
            z_statistic=z_statistic,
            p_value=p_value,
            posterior_mean_lift=float(np.mean(draws)),
            posterior_median_lift=float(np.median(draws)),
            prob_treatment_better=float(np.mean(draws > 0)),
            prob_control_better=float(np.mean(draws < 0)),
            credible_interval_bounds=(lower, upper),
            expected_loss_treatment=bayesian.expected_loss_treatment(draws),
            expected_loss_control=bayesian.expected_loss_control(draws),
            lift_samples=draws,
        )
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Frequentist two-sided pooled two-proportion z-test."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
|
|
7
|
+
from scipy.stats import norm
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def two_proportion_z_test(
    control_successes: int,
    control_total: int,
    treatment_successes: int,
    treatment_total: int,
) -> tuple[float, float]:
    """Run a two-sided pooled two-proportion z-test.

    Returns ``(z_statistic, p_value)``.

    The pooled estimate combines both arms under the null hypothesis that the
    two proportions are equal:

        p_pool = (x_c + x_t) / (n_c + n_t)
        se     = sqrt(p_pool * (1 - p_pool) * (1/n_c + 1/n_t))
        z      = (rate_t - rate_c) / se
        p      = 2 * (1 - Phi(|z|))

    The tail probability ``1 - Phi(|z|)`` is evaluated with the normal
    survival function rather than by subtracting the CDF from one. The
    subtraction rounds to exactly zero once ``|z|`` exceeds roughly 8, which
    would report a p-value of 0.0 for strong effects.

    If the standard error is zero (the pooled rate is 0 or 1), return
    ``(0.0, 1.0)`` to avoid dividing by zero.
    """
    control_rate = control_successes / control_total
    treatment_rate = treatment_successes / treatment_total

    pooled_rate = (control_successes + treatment_successes) / (
        control_total + treatment_total
    )
    standard_error = math.sqrt(
        pooled_rate
        * (1.0 - pooled_rate)
        * (1.0 / control_total + 1.0 / treatment_total)
    )

    if standard_error == 0.0:
        return 0.0, 1.0

    z_statistic = (treatment_rate - control_rate) / standard_error
    # sf(x) equals 1 - cdf(x) but keeps precision far out in the tail.
    p_value = 2.0 * norm.sf(abs(z_statistic))

    return float(z_statistic), float(p_value)
|
abtestwise/plotting.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Plotting for binary A/B test results.
|
|
2
|
+
|
|
3
|
+
Lift is always Treatment B - Control A. Plots show the lift in percentage points,
|
|
4
|
+
so a raw lift of 0.025 will be displayed as +2.5 percentage points.
|
|
5
|
+
|
|
6
|
+
These functions return matplotlib Axes objects. Note that they do not call plt.show()
|
|
7
|
+
or save files automatically.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
17
|
+
from matplotlib.axes import Axes
|
|
18
|
+
|
|
19
|
+
# Raw lift is a proportion difference; multiply by 100 to get percentage points.
|
|
20
|
+
_PCT_POINTS = 100.0
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def plot_lift_distribution(
    lift_samples: np.ndarray,
    median_lift: float,
    credible_interval_bounds: tuple[float, float],
    credible_interval: float,
    *,
    ax: "Axes | None" = None,
    bins: int = 50,
    density: bool = True,
    title: str | None = "Posterior Distribution of Lift",
) -> "Axes":
    """Draw a histogram of the posterior lift (Treatment B - Control A).

    Vertical lines mark zero, the posterior median and both credible-interval
    bounds. Every lift value is converted to percentage points before being
    drawn. When ``ax`` is ``None`` a new 8x5 figure is created. The Axes that
    was drawn on is returned; nothing is shown or saved.
    """
    # Imported inside the function, so importing this module alone does not
    # import pyplot.
    import matplotlib.pyplot as plt

    if ax is None:
        ax = plt.subplots(figsize=(8, 5))[1]

    # Rescale raw proportion differences to percentage points.
    samples_pp = np.asarray(lift_samples) * _PCT_POINTS
    median_pp = median_lift * _PCT_POINTS
    lower_pp = credible_interval_bounds[0] * _PCT_POINTS
    upper_pp = credible_interval_bounds[1] * _PCT_POINTS
    ci_pct = credible_interval * 100

    histogram_style = {
        "color": "#4C72B0",
        "alpha": 0.7,
        "edgecolor": "white",
        "linewidth": 0.5,
    }
    ax.hist(samples_pp, bins=bins, density=density, **histogram_style)

    # Zero marks the point where Treatment B and Control A perform equally.
    ax.axvline(0.0, color="#444444", linestyle="--", linewidth=1.5, label="No difference")

    # Posterior median of the lift.
    ax.axvline(
        median_pp,
        color="#C44E52",
        linestyle="-",
        linewidth=2.0,
        label=f"Median {median_pp:+.2f} pp",
    )

    # Both interval bounds share one style; only the lower line carries the
    # legend label so the interval appears once in the legend.
    bound_style = {"color": "#55A868", "linestyle": ":", "linewidth": 1.8}
    ax.axvline(
        lower_pp,
        label=f"{ci_pct:g}% CI [{lower_pp:+.2f}, {upper_pp:+.2f}] pp",
        **bound_style,
    )
    ax.axvline(upper_pp, **bound_style)

    ax.set_xlabel("Lift: Treatment B - Control A (percentage points)")
    if density:
        ax.set_ylabel("Density")
    else:
        ax.set_ylabel("Frequency")
    if title is not None:
        ax.set_title(title)
    ax.legend(loc="best", frameon=True, fontsize=9)
    ax.margins(x=0.02)

    return ax
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def plot_probability_bar(
    prob_treatment_better: float,
    prob_control_better: float,
    *,
    ax: "Axes | None" = None,
    title: str | None = "Posterior Probability of Being Better",
) -> "Axes":
    """Draw two bars: P(Treatment B better) next to P(Control A better).

    Each bar is annotated with its value formatted as a percentage. When
    ``ax`` is ``None`` a new 6x5 figure is created. The Axes that was drawn on
    is returned; nothing is shown or saved.
    """
    # Imported inside the function, so importing this module alone does not
    # import pyplot.
    import matplotlib.pyplot as plt

    if ax is None:
        ax = plt.subplots(figsize=(6, 5))[1]

    probabilities = [prob_treatment_better, prob_control_better]
    bars = ax.bar(
        ["Treatment B better", "Control A better"],
        probabilities,
        color=["#4C72B0", "#C44E52"],
        width=0.6,
        edgecolor="white",
    )

    # Bars at or above the threshold get a white label inside the bar, since
    # the y-axis stops at 1.0; shorter bars get a dark label just above them.
    high_bar_threshold = 0.9
    for bar, value in zip(bars, probabilities):
        if value >= high_bar_threshold:
            label_y, anchor, text_color = value - 0.03, "top", "white"
        else:
            label_y, anchor, text_color = value + 0.02, "bottom", "#222222"
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            label_y,
            f"{value:.1%}",
            ha="center",
            va=anchor,
            fontsize=11,
            fontweight="bold",
            color=text_color,
        )

    ax.set_ylim(0.0, 1.0)
    ax.set_ylabel("Posterior probability")
    if title is not None:
        ax.set_title(title)

    return ax
|
abtestwise/result.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""Result object for binary A/B tests."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import math
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from . import validation
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _format_percent(x: float, digits: int = 2) -> str:
    """Render a proportion as a percentage with ``digits`` decimal places.

    ``None`` and a float NaN are rendered as ``"undefined"``.
    """
    if x is None:
        return "undefined"
    if isinstance(x, float) and math.isnan(x):
        return "undefined"
    as_percent = x * 100
    return f"{as_percent:.{digits}f}%"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _format_pp(x: float, digits: int = 2) -> str:
    """Render a proportion difference as a signed number of percentage points.

    The sign is always shown and no unit suffix is appended. ``None`` and a
    float NaN are rendered as ``"undefined"``.
    """
    if x is None:
        return "undefined"
    if isinstance(x, float) and math.isnan(x):
        return "undefined"
    points = x * 100
    return f"{points:+.{digits}f}"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _format_probability(x: float, digits: int = 1) -> str:
    """Render a probability as a percentage with ``digits`` decimal places.

    ``None`` and a float NaN are rendered as ``"undefined"``.
    """
    if x is None:
        return "undefined"
    if isinstance(x, float) and math.isnan(x):
        return "undefined"
    as_percent = x * 100
    return f"{as_percent:.{digits}f}%"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class BinaryABResult:
    """Results from a binary A/B test.

    Lift is always Treatment B - Control A.

    ``lift_samples`` is excluded from the generated ``__eq__`` and
    ``__hash__`` (and from ``repr``). An element-wise array comparison has no
    single truth value and NumPy arrays are unhashable, so including the
    field would make ``result_a == result_b`` and ``hash(result)`` raise.
    Two results compare equal when every other field matches.
    """

    # Inputs
    control_successes: int
    control_total: int
    treatment_successes: int
    treatment_total: int
    prior_alpha: float
    prior_beta: float
    n_simulations: int
    credible_interval: float
    seed: int | None

    # Observed and frequentist results
    control_rate: float
    treatment_rate: float
    absolute_lift: float
    relative_lift: float
    z_statistic: float
    p_value: float

    # Bayesian results
    posterior_mean_lift: float
    posterior_median_lift: float
    prob_treatment_better: float
    prob_control_better: float
    credible_interval_bounds: tuple[float, float]
    expected_loss_treatment: float
    expected_loss_control: float

    # Posterior samples (kept out of repr, equality and hashing; see above)
    lift_samples: np.ndarray = field(repr=False, compare=False)

    def prob_lift_above(self, threshold: float) -> float:
        """Return the posterior probability that lift is above a threshold.

        This is the fraction of ``lift_samples`` strictly greater than
        ``threshold``, which is in raw decimal units.
        """
        return float(np.mean(self.lift_samples > threshold))

    def prob_no_harm(self, margin: float = 0.0) -> float:
        """Posterior probability that Treatment B does no harm beyond ``margin``.

        Computes ``P(lift >= -margin | data)``, where lift is
        ``treatment_rate - control_rate``. ``margin`` is in raw decimal units, so
        ``margin=0.005`` means "Treatment B is not worse than Control A by more
        than 0.5 percentage points". With ``margin=0.0`` this is the probability
        that lift is at least zero.

        ``margin`` is checked with ``validation.validate_margin`` first.
        """
        validation.validate_margin(margin)
        return float(np.mean(self.lift_samples >= -margin))

    def prob_harm_above(self, margin: float = 0.0) -> float:
        """Posterior probability that Treatment B is harmful beyond ``margin``.

        Computes ``P(lift < -margin | data)``, the exact complement of
        :meth:`prob_no_harm` for the same ``margin``.

        ``margin`` is checked with ``validation.validate_margin`` first.
        """
        validation.validate_margin(margin)
        return float(np.mean(self.lift_samples < -margin))

    def plot_lift_distribution(
        self,
        ax: Any = None,
        *,
        bins: int = 50,
        density: bool = True,
        title: str | None = "Posterior Distribution of Lift",
    ) -> Any:
        """Plot the posterior lift distribution.

        Delegates to ``plotting.plot_lift_distribution`` with this result's
        samples, posterior median and credible interval, and returns whatever
        that function returns.
        """
        # Imported here rather than at module level, so the plotting module is
        # only loaded when a plot is requested.
        from . import plotting

        return plotting.plot_lift_distribution(
            self.lift_samples,
            self.posterior_median_lift,
            self.credible_interval_bounds,
            self.credible_interval,
            ax=ax,
            bins=bins,
            density=density,
            title=title,
        )

    def plot_probability_bar(
        self,
        ax: Any = None,
        *,
        title: str | None = "Posterior Probability of Being Better",
    ) -> Any:
        """Plot the probability that each group is better.

        Delegates to ``plotting.plot_probability_bar`` and returns whatever
        that function returns.
        """
        # Imported here rather than at module level, so the plotting module is
        # only loaded when a plot is requested.
        from . import plotting

        return plotting.plot_probability_bar(
            self.prob_treatment_better,
            self.prob_control_better,
            ax=ax,
            title=title,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return result fields as a dictionary.

        Every field except ``lift_samples`` is included.
        """
        return {
            "control_successes": self.control_successes,
            "control_total": self.control_total,
            "treatment_successes": self.treatment_successes,
            "treatment_total": self.treatment_total,
            "prior_alpha": self.prior_alpha,
            "prior_beta": self.prior_beta,
            "n_simulations": self.n_simulations,
            "credible_interval": self.credible_interval,
            "seed": self.seed,
            "control_rate": self.control_rate,
            "treatment_rate": self.treatment_rate,
            "absolute_lift": self.absolute_lift,
            "relative_lift": self.relative_lift,
            "z_statistic": self.z_statistic,
            "p_value": self.p_value,
            "posterior_mean_lift": self.posterior_mean_lift,
            "posterior_median_lift": self.posterior_median_lift,
            "prob_treatment_better": self.prob_treatment_better,
            "prob_control_better": self.prob_control_better,
            "credible_interval_bounds": self.credible_interval_bounds,
            "expected_loss_treatment": self.expected_loss_treatment,
            "expected_loss_control": self.expected_loss_control,
        }

    def summary(self) -> str:
        """Return a formatted, multi-line, human-readable summary.

        Rates are shown as percentages and lifts in percentage points.
        """
        ci_pct = self.credible_interval * 100
        lower, upper = self.credible_interval_bounds

        # Relative lift is a ratio, not percentage points.
        if math.isnan(self.relative_lift):
            relative_lift_str = "undefined"
        else:
            relative_lift_str = f"{self.relative_lift * 100:+.2f}%"

        lines = [
            "Binary A/B test result",
            "=" * 40,
            "Observed (lift is always Treatment B - Control A)",
            f" Control (A): {self.control_successes} / {self.control_total} "
            f"= {_format_percent(self.control_rate)}",
            f" Treatment (B): {self.treatment_successes} / {self.treatment_total} "
            f"= {_format_percent(self.treatment_rate)}",
            f" Observed lift (B - A): {_format_pp(self.absolute_lift)} "
            "percentage points",
            f" Relative lift: {relative_lift_str}",
            "",
            "Frequentist (two-sided pooled z-test)",
            f" z statistic: {self.z_statistic:+.4f}",
            f" p-value: {self.p_value:.4f}",
            "",
            f"Bayesian (Beta({self.prior_alpha:g}, {self.prior_beta:g}) prior, "
            f"{self.n_simulations:,} sims)",
            f" Posterior mean lift: {_format_pp(self.posterior_mean_lift)} "
            "percentage points",
            f" Posterior median lift: {_format_pp(self.posterior_median_lift)} "
            "percentage points",
            f" P(Treatment B > Control A): "
            f"{_format_probability(self.prob_treatment_better)}",
            f" P(Control A > Treatment B): "
            f"{_format_probability(self.prob_control_better)}",
            f" {ci_pct:g}% credible interval for lift: "
            f"[{_format_pp(lower)}, {_format_pp(upper)}] percentage points",
            " Expected loss",
            f" Choosing treatment B: "
            f"{self.expected_loss_treatment * 100:.2f} percentage points",
            f" Choosing control A: "
            f"{self.expected_loss_control * 100:.2f} percentage points",
        ]
        return "\n".join(lines)
|
abtestwise/validation.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Input validation helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def validate_count(name: str, value: int) -> None:
    """Check that ``value`` is an integer that is not negative.

    ``bool`` is rejected even though it subclasses ``int``. Raises
    ``ValueError`` naming ``name`` when the check fails.
    """
    is_plain_int = isinstance(value, int) and not isinstance(value, bool)
    if not is_plain_int:
        raise ValueError(f"{name} must be an integer, got {value!r}.")
    if value < 0:
        raise ValueError(f"{name} must be >= 0, got {value}.")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def validate_total(name: str, value: int) -> None:
    """Check that ``value`` is an integer greater than zero.

    ``bool`` is rejected even though it subclasses ``int``. Raises
    ``ValueError`` naming ``name`` when the check fails.
    """
    is_plain_int = isinstance(value, int) and not isinstance(value, bool)
    if not is_plain_int:
        raise ValueError(f"{name} must be an integer, got {value!r}.")
    if value <= 0:
        raise ValueError(f"{name} must be > 0, got {value}.")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def validate_successes_le_total(
    successes_name: str,
    successes: int,
    total_name: str,
    total: int,
) -> None:
    """Raise ``ValueError`` when ``successes`` is greater than ``total``.

    The two name arguments are used only to build the error message.
    """
    exceeds_total = successes > total
    if exceeds_total:
        message = (
            f"{successes_name} ({successes}) cannot exceed {total_name} ({total})."
        )
        raise ValueError(message)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def validate_prior(name: str, value: float) -> None:
    """Validate a positive, finite Beta prior parameter.

    Accepts an ``int`` or ``float`` (but not ``bool``) that is finite and
    strictly greater than zero. NaN and infinity are rejected explicitly:
    NaN compares false against everything and positive infinity is greater
    than zero, so both would otherwise pass the ``> 0`` check.

    Raises ``ValueError`` naming ``name`` when the check fails.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        raise ValueError(f"{name} must be a number, got {value!r}.")
    # Only floats can be non-finite. Skipping ints also avoids converting
    # arbitrarily large integers to float.
    if isinstance(value, float) and not math.isfinite(value):
        raise ValueError(f"{name} must be finite, got {value}.")
    if value <= 0:
        raise ValueError(f"{name} must be > 0, got {value}.")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def validate_credible_interval(value: float) -> None:
    """Check that the credible level is a number strictly inside (0, 1).

    ``bool`` is rejected. Raises ``ValueError`` when the check fails.
    """
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if not is_number:
        raise ValueError(f"credible_interval must be a number, got {value!r}.")
    # Kept as a positive chained comparison so that NaN, which fails every
    # comparison, is rejected as well.
    inside_open_unit = 0.0 < value < 1.0
    if not inside_open_unit:
        raise ValueError(
            f"credible_interval must be strictly between 0 and 1, got {value}."
        )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def validate_n_simulations(value: int) -> None:
    """Check that the simulation count is an integer greater than zero.

    ``bool`` is rejected. Raises ``ValueError`` when the check fails.
    """
    is_plain_int = isinstance(value, int) and not isinstance(value, bool)
    if not is_plain_int:
        raise ValueError(f"n_simulations must be an integer, got {value!r}.")
    if value <= 0:
        raise ValueError(f"n_simulations must be > 0, got {value}.")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def validate_seed(value: int | None) -> None:
|
|
64
|
+
"""Validate a random seed."""
|
|
65
|
+
if value is None:
|
|
66
|
+
return
|
|
67
|
+
if isinstance(value, bool) or not isinstance(value, int):
|
|
68
|
+
raise ValueError(f"seed must be None or an integer, got {value!r}.")
|
|
69
|
+
if value < 0:
|
|
70
|
+
raise ValueError(f"seed must be >= 0, got {value}.")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def validate_margin(value: float) -> None:
    """Check that the do-no-harm margin is a finite number that is not negative.

    ``bool`` is rejected. Raises ``ValueError`` when the check fails.
    """
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if not is_number:
        raise ValueError(f"margin must be a number, got {value!r}.")
    # Finiteness is tested before the sign, so -inf is reported as non-finite.
    if not math.isfinite(value):
        raise ValueError(f"margin must be finite, got {value}.")
    if value < 0:
        raise ValueError(f"margin must be >= 0, got {value}.")
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: abtestwise
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight toolkit for binary A/B experiment analysis using aggregate counts.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: ab-testing,bayesian,experimentation,frequentist,statistics
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: matplotlib>=3.5
|
|
9
|
+
Requires-Dist: numpy>=1.22
|
|
10
|
+
Requires-Dist: scipy>=1.8
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# abtestwise
|
|
16
|
+
|
|
17
|
+
A lightweight Python toolkit for **binary A/B experiment analysis** using
|
|
18
|
+
aggregate count data. Version 0.1 combines frequentist and Bayesian summaries
|
|
19
|
+
for binary proportions.
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
Install from PyPI:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install abtestwise
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Development install
|
|
30
|
+
|
|
31
|
+
To work on the package locally (with the test dependencies):
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install -e ".[dev]"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quickstart
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from abtestwise import BinaryABTest
|
|
41
|
+
|
|
42
|
+
test = BinaryABTest.from_counts(
|
|
43
|
+
control_successes=120,
|
|
44
|
+
control_total=1000,
|
|
45
|
+
treatment_successes=145,
|
|
46
|
+
treatment_total=1000,
|
|
47
|
+
prior_alpha=1,
|
|
48
|
+
prior_beta=1,
|
|
49
|
+
n_simulations=100_000,
|
|
50
|
+
credible_interval=0.95,
|
|
51
|
+
seed=42,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
result = test.run()
|
|
55
|
+
|
|
56
|
+
print(result.summary())
|
|
57
|
+
print(result.prob_lift_above(0.01))
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
`prob_lift_above(0.01)` gives the posterior probability that Treatment B improves
|
|
61
|
+
the metric by more than 1 percentage point.
|
|
62
|
+
|
|
63
|
+
### Do-no-harm checks
|
|
64
|
+
|
|
65
|
+
`prob_no_harm(margin)` gives the posterior probability that Treatment B is **not**
|
|
66
|
+
worse than Control A by more than `margin` (in raw decimal units, so `0.005` means
|
|
67
|
+
0.5 percentage points). `prob_harm_above(margin)` is its complement.
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
result.prob_no_harm(0.005) # P(lift >= -0.005): B is not worse by more than 0.5pp
|
|
71
|
+
result.prob_harm_above(0.005) # P(lift < -0.005): B is worse by more than 0.5pp
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Raw result values are also available:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
result.to_dict()
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Plotting
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
import matplotlib.pyplot as plt
|
|
84
|
+
|
|
85
|
+
result.plot_lift_distribution()
|
|
86
|
+
result.plot_probability_bar()
|
|
87
|
+
|
|
88
|
+
plt.show()
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
The lift distribution plot shows posterior lift in percentage points.
|
|
92
|
+
|
|
93
|
+
The probability bar plot shows:
|
|
94
|
+
|
|
95
|
+
```text
|
|
96
|
+
P(Treatment B > Control A)
|
|
97
|
+
P(Control A > Treatment B)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Groups and sign convention
|
|
101
|
+
|
|
102
|
+
In product A/B testing terms:
|
|
103
|
+
|
|
104
|
+
- **Control (A)** is the baseline group.
|
|
105
|
+
- **Treatment (B)** is the test group or variant B.
|
|
106
|
+
- **Lift is always Treatment B - Control A.**
|
|
107
|
+
- **Positive lift means Treatment B is better than Control A.**
|
|
108
|
+
- **Negative lift means Control A is better than Treatment B.**
|
|
109
|
+
|
|
110
|
+
## Scope
|
|
111
|
+
|
|
112
|
+
Current package scope:
|
|
113
|
+
|
|
114
|
+
- Binary proportions only.
|
|
115
|
+
- Aggregate counts only.
|
|
116
|
+
- Two groups only.
|
|
117
|
+
- Frequentist: two-sided pooled two-proportion z-test.
|
|
118
|
+
- Bayesian: beta-binomial posterior simulation with default prior `Beta(1, 1)`.
|
|
119
|
+
- Equal-tailed credible interval.
|
|
120
|
+
- Expected loss.
|
|
121
|
+
- Practical lift thresholds.
|
|
122
|
+
- Do-no-harm probabilities using a user-defined harm margin.
|
|
123
|
+
- Simple plots.
|
|
124
|
+
|
|
125
|
+
## Development
|
|
126
|
+
|
|
127
|
+
Run tests with:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
python -m pytest -q
|
|
131
|
+
```
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
abtestwise/__init__.py,sha256=S-NlVV4h2eFs3ClUNaEqYkMrjLr2m8IZ43j379THHXA,388
|
|
2
|
+
abtestwise/bayesian.py,sha256=2Cz9Q9V170MOt275lM6Cy15JRsoG7q4CH87RXClfzUM,2350
|
|
3
|
+
abtestwise/binary.py,sha256=2krX5pqcMP8yVMfnKPSY-FoPvFSPVx0DIV9D1DbcLcY,5132
|
|
4
|
+
abtestwise/frequentist.py,sha256=gSkvwlA9-FZYyEcarRPCyLmdsfpkMsFTNK2-HhpmI7I,1377
|
|
5
|
+
abtestwise/plotting.py,sha256=WVXFkT9oKCmnFPnBW8HtIglUgsNkWgZGEdxqroZVcO8,4243
|
|
6
|
+
abtestwise/result.py,sha256=FHiPYjEBVI4vlBPrxpUBVMSOBALCCfVLoE7CwbBzCpo,7519
|
|
7
|
+
abtestwise/validation.py,sha256=vu5n4TcXjPz2eKfuKVqM-J08MQ-z1nDZZcOj0CEKU4E,2883
|
|
8
|
+
abtestwise-0.1.0.dist-info/METADATA,sha256=X4NoGjL2eKwTWNKwwIoxyvkHplk1Xukz7etUmpdPD5o,2952
|
|
9
|
+
abtestwise-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
10
|
+
abtestwise-0.1.0.dist-info/RECORD,,
|