abtoolkit 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. abtoolkit-1.0.0/LICENSE +7 -0
  2. abtoolkit-1.0.0/PKG-INFO +113 -0
  3. abtoolkit-1.0.0/README.md +92 -0
  4. abtoolkit-1.0.0/abtoolkit/__init__.py +0 -0
  5. abtoolkit-1.0.0/abtoolkit/continuous/__init__.py +0 -0
  6. abtoolkit-1.0.0/abtoolkit/continuous/simulation.py +269 -0
  7. abtoolkit-1.0.0/abtoolkit/continuous/stattests.py +192 -0
  8. abtoolkit-1.0.0/abtoolkit/continuous/utils.py +87 -0
  9. abtoolkit-1.0.0/abtoolkit/discrete/__init__.py +0 -0
  10. abtoolkit-1.0.0/abtoolkit/discrete/utils.py +30 -0
  11. abtoolkit-1.0.0/abtoolkit.egg-info/PKG-INFO +113 -0
  12. abtoolkit-1.0.0/abtoolkit.egg-info/SOURCES.txt +34 -0
  13. abtoolkit-1.0.0/abtoolkit.egg-info/dependency_links.txt +1 -0
  14. abtoolkit-1.0.0/abtoolkit.egg-info/requires.txt +6 -0
  15. abtoolkit-1.0.0/abtoolkit.egg-info/top_level.txt +6 -0
  16. abtoolkit-1.0.0/examples/continuous_var_analysis.py +51 -0
  17. abtoolkit-1.0.0/pyproject.toml +34 -0
  18. abtoolkit-1.0.0/setup.cfg +4 -0
  19. abtoolkit-1.0.0/tests/__init__.py +0 -0
  20. abtoolkit-1.0.0/tests/test_continuous_simulation.py +63 -0
  21. abtoolkit-1.0.0/tests/test_continuous_tests.py +87 -0
  22. abtoolkit-1.0.0/tests/test_continuous_utils.py +18 -0
  23. abtoolkit-1.0.0/tests/test_discrete_utils.py +16 -0
  24. abtoolkit-1.0.0/venv/bin/activate_this.py +31 -0
  25. abtoolkit-1.0.0/venv/bin/rst2html.py +23 -0
  26. abtoolkit-1.0.0/venv/bin/rst2html4.py +26 -0
  27. abtoolkit-1.0.0/venv/bin/rst2html5.py +33 -0
  28. abtoolkit-1.0.0/venv/bin/rst2latex.py +26 -0
  29. abtoolkit-1.0.0/venv/bin/rst2man.py +27 -0
  30. abtoolkit-1.0.0/venv/bin/rst2odt.py +28 -0
  31. abtoolkit-1.0.0/venv/bin/rst2odt_prepstyles.py +20 -0
  32. abtoolkit-1.0.0/venv/bin/rst2pseudoxml.py +23 -0
  33. abtoolkit-1.0.0/venv/bin/rst2s5.py +24 -0
  34. abtoolkit-1.0.0/venv/bin/rst2xetex.py +27 -0
  35. abtoolkit-1.0.0/venv/bin/rst2xml.py +23 -0
  36. abtoolkit-1.0.0/venv/bin/rstpep2html.py +25 -0
@@ -0,0 +1,7 @@
1
+ Copyright 2024 Nikita Altukhov
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,113 @@
1
+ Metadata-Version: 2.1
2
+ Name: abtoolkit
3
+ Version: 1.0.0
4
+ Summary: Package with tools for AB testing
5
+ Author-email: Nikita Altukhov <altuxov.nikita@gmail.com>
6
+ Project-URL: Homepage, https://github.com/nikitosl/abtoolkit
7
+ Project-URL: Issues, https://github.com/nikitosl/abtoolkit/issues
8
+ Keywords: ab_test,cuped,did,ttest
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: pandas>=2.2.1
16
+ Requires-Dist: numpy>=1.26.4
17
+ Requires-Dist: statsmodels>=0.14.1
18
+ Requires-Dist: scipy>=1.12.0
19
+ Requires-Dist: linearmodels>=5.4
20
+ Requires-Dist: tqdm>=4.66.2
21
+
22
+ # ABToolkit
23
+ Set of tools for AA and AB tests, sample size estimation, confidence intervals estimation.
24
+ For continuous and discrete variables.
25
+
26
+ ## Install using pip:
27
+ ```pip install abtoolkit```
28
+
29
+ ## Continuous variables analysis
30
+ #### Sample size estimation:
31
+ ```
32
+ from abtoolkit.continuous.utils import calculate_sample_size_by_mde
33
+ calculate_sample_size_by_mde(
34
+ std=variable.std(),
35
+ alpha=alpha_level,
36
+ power=power,
37
+ mde=mde
38
+ )
39
+ ```
40
+
41
+ #### AA and AB tests simulation:
42
+ Using ```abtoolkit.continuous.simulation.StatTestsSimulation``` class you can simulate and check different stat-test,
43
+ compare them in terms of stat test power to choose the best test for your data. As result of simulation for each
44
+ stat test you will get the 1-st Type error estimation with confidence interval, 2-nd Type error estimation with
45
+ confidence interval and plot of p-value distribution for different tests.
46
+
47
+ ```
48
+ from abtoolkit.continuous.simulation import StatTestsSimulation
49
+ simulation = StatTestsSimulation(
50
+ control,
51
+ test,
52
+ stattests_list=["ttest", "regression_test", "cuped_ttest", "did_regression_test", "additional_vars_regression_test"],
53
+ experiments_num=experiments_num,
54
+ sample_size=sample_size,
55
+ mde=mde,
56
+ alpha_level=alpha_level,
57
+
58
+ control_previous_values=control_previous_value,
59
+ test_previous_values=test_previous_value,
60
+ control_cuped_covariant=control_previous_value,
61
+ test_cuped_covariant=test_previous_value,
62
+ control_additional_vars=[control_previous_value],
63
+ test_additional_vars=[test_previous_value],
64
+ )
65
+ simulation.run() # Run simulation
66
+ simulation.print_results() # Print results of simulation
67
+ simulation.plot_p_values() # Plot p-values distribution
68
+ ```
69
+ Output:
70
+ ![output-plot.png](static%2Foutput-plot.png)
71
+ ![p-value-plot.png](static%2Fp-value-plot.png)
72
+
73
+ Full example of usage you can find in ```examples/continuous_var_analysis.py``` script.
74
+
75
+ #### Next stat tests implemented for treatment effect estimation:
76
+ - ***T-Test*** - estimates treatment effect by comparing variables between test and control groups.
77
+ - ***Difference T-Test*** - estimates treatment effect by comparing difference between actual and previous values
78
+ of variables in test and control groups.
79
+ - ***Regression Test*** - estimates treatment effect using linear regression by tested predicting variable.
80
+ Fact of treatment represented in model as binary flag (treated or not). Weight for this flag shows the significance
81
+ of treatment impact.
82
+ ```y = bias + w * treated```
83
+ - ***Regression Difference-in-Difference Test*** - estimates treatment effect using linear regression by predicting
84
+ difference between test and control groups which is represented as difference between current variable value and
85
+ previous period variable value (two differences). Weight for treated and current variable values shows
86
+ significance of treatment. ```y = bias + w0 * treated + w1 * after + w2 * treated * after```
87
+ - ***CUPED*** - estimates treatment effect by comparing variables between test and control groups and uses covariant
88
+ to reduce variance and speedup test. ```y = y - Q * covariant```, where ```Q = cov(y, covariant) / var(covariant)```.
89
+ Cuped variable has same mean value (unbiased), but smaller variance, that speedup test.
90
+ - ***Regression with Additional Variables*** - estimates treatment effect using linear regression by predicting
91
+ tested variable with additional variables, which describe part of main variable variance and speedup test.
92
+ Fact of treatment represented in model as binary flag (treated or not). Weight for this flag shows the significance
93
+ of treatment impact.
94
+ ```y = bias + w0 * treated + w1 * additional_variable1 + w2 * additional_variable2 + ...```
95
+
96
+
97
+ ## Discrete variables analysis
98
+ #### Sample size estimation:
99
+ ```
100
+ from abtoolkit.discrete.utils import estimate_ci_binomial
101
+ estimate_ci_binomial(
102
+ p,
103
+ sample_size,
104
+ alpha=0.05
105
+ )
106
+ ```
107
+ #### AA and AB tests simulation:
108
+ To Be Done
109
+ #### Next stat tests implemented for treatment effect estimation:
110
+ To Be Done
111
+
112
+ ---
113
+ You can find examples of toolkit usage in [examples/](https://github.com/nikitosl/abtoolkit/tree/master/examples) directory.
@@ -0,0 +1,92 @@
1
+ # ABToolkit
2
+ Set of tools for AA and AB tests, sample size estimation, confidence intervals estimation.
3
+ For continuous and discrete variables.
4
+
5
+ ## Install using pip:
6
+ ```pip install abtoolkit```
7
+
8
+ ## Continuous variables analysis
9
+ #### Sample size estimation:
10
+ ```
11
+ from abtoolkit.continuous.utils import calculate_sample_size_by_mde
12
+ calculate_sample_size_by_mde(
13
+ std=variable.std(),
14
+ alpha=alpha_level,
15
+ power=power,
16
+ mde=mde
17
+ )
18
+ ```
19
+
20
+ #### AA and AB tests simulation:
21
+ Using ```abtoolkit.continuous.simulation.StatTestsSimulation``` class you can simulate and check different stat-test,
22
+ compare them in terms of stat test power to choose the best test for your data. As result of simulation for each
23
+ stat test you will get the 1-st Type error estimation with confidence interval, 2-nd Type error estimation with
24
+ confidence interval and plot of p-value distribution for different tests.
25
+
26
+ ```
27
+ from abtoolkit.continuous.simulation import StatTestsSimulation
28
+ simulation = StatTestsSimulation(
29
+ control,
30
+ test,
31
+ stattests_list=["ttest", "regression_test", "cuped_ttest", "did_regression_test", "additional_vars_regression_test"],
32
+ experiments_num=experiments_num,
33
+ sample_size=sample_size,
34
+ mde=mde,
35
+ alpha_level=alpha_level,
36
+
37
+ control_previous_values=control_previous_value,
38
+ test_previous_values=test_previous_value,
39
+ control_cuped_covariant=control_previous_value,
40
+ test_cuped_covariant=test_previous_value,
41
+ control_additional_vars=[control_previous_value],
42
+ test_additional_vars=[test_previous_value],
43
+ )
44
+ simulation.run() # Run simulation
45
+ simulation.print_results() # Print results of simulation
46
+ simulation.plot_p_values() # Plot p-values distribution
47
+ ```
48
+ Output:
49
+ ![output-plot.png](static%2Foutput-plot.png)
50
+ ![p-value-plot.png](static%2Fp-value-plot.png)
51
+
52
+ Full example of usage you can find in ```examples/continuous_var_analysis.py``` script.
53
+
54
+ #### Next stat tests implemented for treatment effect estimation:
55
+ - ***T-Test*** - estimates treatment effect by comparing variables between test and control groups.
56
+ - ***Difference T-Test*** - estimates treatment effect by comparing difference between actual and previous values
57
+ of variables in test and control groups.
58
+ - ***Regression Test*** - estimates treatment effect using linear regression by tested predicting variable.
59
+ Fact of treatment represented in model as binary flag (treated or not). Weight for this flag shows the significance
60
+ of treatment impact.
61
+ ```y = bias + w * treated```
62
+ - ***Regression Difference-in-Difference Test*** - estimates treatment effect using linear regression by predicting
63
+ difference between test and control groups which is represented as difference between current variable value and
64
+ previous period variable value (two differences). Weight for treated and current variable values shows
65
+ significance of treatment. ```y = bias + w0 * treated + w1 * after + w2 * treated * after```
66
+ - ***CUPED*** - estimates treatment effect by comparing variables between test and control groups and uses covariant
67
+ to reduce variance and speedup test. ```y = y - Q * covariant```, where ```Q = cov(y, covariant) / var(covariant)```.
68
+ Cuped variable has same mean value (unbiased), but smaller variance, that speedup test.
69
+ - ***Regression with Additional Variables*** - estimates treatment effect using linear regression by predicting
70
+ tested variable with additional variables, which describe part of main variable variance and speedup test.
71
+ Fact of treatment represented in model as binary flag (treated or not). Weight for this flag shows the significance
72
+ of treatment impact.
73
+ ```y = bias + w0 * treated + w1 * additional_variable1 + w2 * additional_variable2 + ...```
74
+
75
+
76
+ ## Discrete variables analysis
77
+ #### Sample size estimation:
78
+ ```
79
+ from abtoolkit.discrete.utils import estimate_ci_binomial
80
+ estimate_ci_binomial(
81
+ p,
82
+ sample_size,
83
+ alpha=0.05
84
+ )
85
+ ```
86
+ #### AA and AB tests simulation:
87
+ To Be Done
88
+ #### Next stat tests implemented for treatment effect estimation:
89
+ To Be Done
90
+
91
+ ---
92
+ You can find examples of toolkit usage in [examples/](https://github.com/nikitosl/abtoolkit/tree/master/examples) directory.
File without changes
File without changes
@@ -0,0 +1,269 @@
1
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from abtoolkit.continuous.stattests import additional_vars_regression_test
from abtoolkit.continuous.stattests import cuped_ttest
from abtoolkit.continuous.stattests import did_regression_test
from abtoolkit.continuous.stattests import difference_ttest
from abtoolkit.continuous.stattests import regression_test
from abtoolkit.continuous.stattests import ttest
from abtoolkit.discrete.utils import estimate_ci_binomial
14
+
15
+
16
+ class StatTestsSimulation:
17
+ def __init__(
18
+ self,
19
+ control: pd.Series,
20
+ test: pd.Series,
21
+ stattests_list: List[str],
22
+ sample_size: int,
23
+ experiments_num: int,
24
+ mde: float,
25
+ alpha_level: float = 0.05,
26
+ power: float = 0.8,
27
+ control_previous_values: pd.Series = None,
28
+ test_previous_values: pd.Series = None,
29
+ control_cuped_covariant: pd.Series = None,
30
+ test_cuped_covariant: pd.Series = None,
31
+ control_additional_vars: List[pd.Series] = None,
32
+ test_additional_vars: List[pd.Series] = None,
33
+ ):
34
+ """
35
+ Simulates AA and AB tests for given stat-tests. Prints result (alpha and power) for each test
36
+ and builds plot for p-value distributions.
37
+
38
+ :param control: control variable
39
+ :param test: test variable
40
+ :param stattests_list: list of stat-tests for estimation
41
+ :param sample_size: number of examples to sample from variables in each iteration
42
+ :param experiments_num: number of experiments to perform for each stat-test
43
+ :param mde: minimal detectable effect, used to perform AB test (add to test variable)
44
+ :param alpha_level: test alpha-level
45
+ :param power: test power
46
+ :param control_previous_values: previous values of control variable used to reduce variance and speedup
47
+ test in difference-in-difference test
48
+ :param test_previous_values: previous values of test variable used to reduce variance and speedup test
49
+ in difference-in-difference test
50
+ :param control_cuped_covariant: covariant for control group variable used to reduce variance and speedup test
51
+ in cuped test
52
+ :param test_cuped_covariant: covariant for test group variable used to reduce variance and speedup test
53
+ in cuped test
54
+ :param control_additional_vars: list of additional variables for control group variable used to
55
+ reduce variance and speedup test in 'regression_with_additional_variables' test
56
+ :param test_additional_vars: list of additional variables for test group variable used to
57
+ reduce variance and speedup test in 'regression_with_additional_variables' test
58
+ """
59
+
60
+ self.control = control
61
+ self.test = test
62
+
63
+ self.stattests_list = stattests_list
64
+ self.experiments_num = experiments_num
65
+ self.sample_size = sample_size
66
+ self.mde = mde
67
+ self.alpha_level = alpha_level
68
+ self.power = power
69
+
70
+ self.stattests_func_map = {
71
+ "ttest": self.simulate_ttest,
72
+ "diff_ttest": self.simulate_difference_ttest,
73
+ "cuped_ttest": self.simulate_cuped,
74
+ "regression_test": self.simulate_reg,
75
+ "did_regression_test": self.simulate_reg_did,
76
+ "additional_vars_regression_test": self.simulate_reg_add,
77
+ }
78
+ self.info = {}
79
+
80
+ # Optional
81
+ self.control_previous_values = control_previous_values
82
+ self.test_previous_values = test_previous_values
83
+ self.control_cuped_covariant = control_cuped_covariant
84
+ self.test_cuped_covariant = test_cuped_covariant
85
+ self.control_additional_vars = control_additional_vars
86
+ self.test_additional_vars = test_additional_vars
87
+
88
+ def plot_p_values(self):
89
+ """
90
+ Plot p-values distribution for each test
91
+ :return: None
92
+ """
93
+ if len(self.info) == 0:
94
+ return
95
+
96
+ X = np.linspace(0, 1, 1000)
97
+ for test, test_info in self.info.items():
98
+ ab_pvalues = np.array(test_info["ab_pvalues"])
99
+ Y = [np.mean(ab_pvalues < x) for x in X]
100
+ plt.plot(X, Y, label=test)
101
+
102
+ plt.plot([self.alpha_level, self.alpha_level], [0, 1], '--k', alpha=0.8)
103
+ plt.plot([0, 1], [self.power, self.power], '--k', alpha=0.8)
104
+ plt.title('P-Value Distribution for AB Simulation', size=12)
105
+ plt.xlabel('p-value', size=10)
106
+ plt.legend(fontsize=10)
107
+ plt.grid()
108
+ plt.show()
109
+
110
+ def print_results(self):
111
+ """
112
+ Print simulation results for each test (alpha and power + confidence intervals)
113
+ :return: None
114
+ """
115
+ for test in self.info:
116
+ a, p = self.info[test]['alpha'], self.info[test]['power']
117
+ aci1, aci2 = round(self.info[test]['alpha_ci'][0], 4), round(self.info[test]['alpha_ci'][1], 4)
118
+ pci1, pci2 = round(self.info[test]['power_ci'][0], 4), round(self.info[test]['power_ci'][1], 4)
119
+
120
+ if (aci1 > self.alpha_level) or (self.power > pci2):
121
+ print('\033[91m' + f"'{test}'; alpha={a} ci[{aci1}; {aci2}], power={p} [{pci1}; {pci2}]" + '\033[0m')
122
+ else:
123
+ print('\033[92m' + f"'{test}'; alpha={a} ci[{aci1}; {aci2}], power={p} [{pci1}; {pci2}]" + '\033[0m')
124
+
125
+ def run(self):
126
+ """
127
+ Simulate all tests from 'self.stattests_list' by given data and save information to 'info' dictionary
128
+ :return:
129
+ """
130
+ self.info = {}
131
+ for stattest in self.stattests_list:
132
+ self.simulate_test_by_name(stattest)
133
+ return self.info
134
+
135
+ def simulate_test_by_name(self, test_name: str):
136
+ """
137
+ Simulate AA and AB test and save results to 'info' dictionary
138
+ :param test_name: name of test for simulation (ttest | cuped_ttest | regression_test | did_regression_test
139
+ | additional_vars_regression_test)
140
+ :return: None
141
+ """
142
+
143
+ assert test_name in self.stattests_func_map, f"Given test_name {test_name} not found"
144
+ stattest_func = self.stattests_func_map[test_name]
145
+
146
+ test_success_no_effect_cnt = 0
147
+ test_pvalues_no_effect = []
148
+ test_success_effect_cnt = 0
149
+ test_pvalues_effect = []
150
+
151
+ for _ in tqdm(range(self.experiments_num), desc=f"Simulation test '{test_name}'"):
152
+ p_value = stattest_func(mde=0)
153
+ test_pvalues_no_effect.append(p_value)
154
+ if p_value < self.alpha_level:
155
+ test_success_no_effect_cnt += 1
156
+
157
+ p_value = stattest_func(mde=self.mde)
158
+ test_pvalues_effect.append(p_value)
159
+ if p_value < self.alpha_level:
160
+ test_success_effect_cnt += 1
161
+
162
+ alpha = test_success_no_effect_cnt / self.experiments_num
163
+ power = test_success_effect_cnt / self.experiments_num
164
+
165
+ alpha_ci = estimate_ci_binomial(alpha, self.experiments_num, alpha=0.05)
166
+ power_ci = estimate_ci_binomial(power, self.experiments_num, alpha=0.05)
167
+
168
+ if test_name in self.info:
169
+ del self.info[test_name]
170
+ self.info[test_name] = {
171
+ "alpha": alpha,
172
+ "alpha_ci": alpha_ci,
173
+ "power": power,
174
+ "power_ci": power_ci,
175
+ "aa_pvalues": test_pvalues_no_effect,
176
+ "ab_pvalues": test_pvalues_effect,
177
+ }
178
+
179
+ def simulate_ttest(self, mde: float) -> float:
180
+ """
181
+ Simulate ttest
182
+ :param mde: minimal detectable effect, to sum with test variable
183
+ :return: p_value
184
+ """
185
+ control_sample = self.control.sample(self.sample_size, replace=True)
186
+ test_sample = self.test.sample(self.sample_size, replace=True)
187
+ test_sample += mde
188
+
189
+ return ttest(control_sample, test_sample)
190
+
191
+ def simulate_difference_ttest(self, mde: float) -> float:
192
+ """
193
+ Simulate ttest for difference between actual variable value and previous period variable value
194
+ :param mde: minimal detectable effect, to sum with test variable
195
+ :return: p_value
196
+ """
197
+ control_index_sample = self.control.index[np.random.randint(0, len(self.control), size=self.sample_size)]
198
+ test_index_sample = self.test.index[np.random.randint(0, len(self.test), size=self.sample_size)]
199
+
200
+ control_sample = self.control.loc[control_index_sample]
201
+ control_pre_sample = self.control_previous_values.loc[control_index_sample]
202
+ test_sample = self.test.loc[test_index_sample]
203
+ test_pre_sample = self.test_previous_values.loc[test_index_sample]
204
+ test_sample += mde
205
+
206
+ return cuped_ttest(control_sample, control_pre_sample, test_sample, test_pre_sample)
207
+
208
+ def simulate_cuped(self, mde: float) -> float:
209
+ """
210
+ Simulate CUPED ttest
211
+ :param mde: minimal detectable effect, to sum with test variable
212
+ :return: p_value
213
+ """
214
+ control_index_sample = self.control.index[np.random.randint(0, len(self.control), size=self.sample_size)]
215
+ test_index_sample = self.test.index[np.random.randint(0, len(self.test), size=self.sample_size)]
216
+
217
+ control_sample = self.control.loc[control_index_sample]
218
+ control_covariant_sample = self.control_cuped_covariant.loc[control_index_sample]
219
+ test_sample = self.test.loc[test_index_sample]
220
+ test_covariant_sample = self.test_cuped_covariant.loc[test_index_sample]
221
+ test_sample += mde
222
+
223
+ return cuped_ttest(control_sample, control_covariant_sample, test_sample, test_covariant_sample)
224
+
225
+ def simulate_reg(self, mde: float) -> float:
226
+ """
227
+ Simulate test using regression
228
+ :param mde: minimal detectable effect, to sum with test variable
229
+ :return: p_value
230
+ """
231
+ control_sample = self.control.sample(self.sample_size, replace=True)
232
+ test_sample = self.test.sample(self.sample_size, replace=True)
233
+ test_sample += mde
234
+
235
+ return regression_test(control_sample, test_sample)
236
+
237
+ def simulate_reg_did(self, mde: float) -> float:
238
+ """
239
+ Simulate test using regression with difference-in-difference technique
240
+ :param mde: minimal detectable effect, to sum with test variable
241
+ :return: p_value
242
+ """
243
+ control_index_sample = self.control.index[np.random.randint(0, len(self.control), size=self.sample_size)]
244
+ test_index_sample = self.test.index[np.random.randint(0, len(self.test), size=self.sample_size)]
245
+
246
+ control_sample = self.control.loc[control_index_sample]
247
+ control_previous_sample = self.control_previous_values.loc[control_index_sample]
248
+ test_sample = self.test.loc[test_index_sample]
249
+ test_previous_sample = self.test_previous_values.loc[test_index_sample]
250
+ test_sample += mde
251
+
252
+ return did_regression_test(control_sample, control_previous_sample, test_sample, test_previous_sample)
253
+
254
+ def simulate_reg_add(self, mde: float) -> float:
255
+ """
256
+ Simulate test using regression with additional variables
257
+ :param mde: minimal detectable effect, to sum with test variable
258
+ :return: p_value
259
+ """
260
+ control_index_sample = self.control.index[np.random.randint(0, len(self.control), size=self.sample_size)]
261
+ test_index_sample = self.test.index[np.random.randint(0, len(self.test), size=self.sample_size)]
262
+
263
+ control_sample = self.control.loc[control_index_sample]
264
+ control_add_samples = [a.loc[control_index_sample] for a in self.control_additional_vars]
265
+ test_sample = self.test.loc[test_index_sample]
266
+ test_add_samples = [a.loc[test_index_sample] for a in self.test_additional_vars]
267
+ test_sample += mde
268
+
269
+ return additional_vars_regression_test(control_sample, control_add_samples, test_sample, test_add_samples)
@@ -0,0 +1,192 @@
1
+ from typing import List
2
+
3
+ import linearmodels as lm
4
+ import numpy as np
5
+ import pandas as pd
6
+ from scipy.stats import ttest_ind
7
+
8
+
9
+ def ttest(
10
+ control: pd.Series,
11
+ test: pd.Series,
12
+ ) -> float:
13
+ """
14
+ Simple two-side t-test
15
+ :param control: pd.Series for control sample
16
+ :param test: pd.Series for test sample
17
+ :return: p-value
18
+ """
19
+
20
+ return ttest_ind(control, test, alternative="less").pvalue
21
+
22
+
23
+ def difference_ttest(
24
+ control: pd.Series,
25
+ control_pre: pd.Series,
26
+ test: pd.Series,
27
+ test_pre: pd.Series,
28
+ ) -> float:
29
+ """
30
+ Estimation treatment effect using ttest and CUPED to increase test's power
31
+ :param control: pd.Series, control sample
32
+ :param control_pre: pd.Series, control previous period value
33
+ :param test: pd.Series, test sample
34
+ :param test_pre: pd.Series, test previous period value
35
+ :return: p-value
36
+ """
37
+ control = control - control_pre
38
+ test = test - test_pre
39
+
40
+ return ttest(control, test)
41
+
42
+
43
+ def cuped_ttest(
44
+ control: pd.Series,
45
+ control_covariant: pd.Series,
46
+ test: pd.Series,
47
+ test_covariant: pd.Series,
48
+ ) -> float:
49
+ """
50
+ Estimation treatment effect using ttest and CUPED to increase test's power
51
+ :param control: pd.Series, control sample
52
+ :param control_covariant: pd.Series, control sample covariant
53
+ :param test: pd.Series, test sample
54
+ :param test_covariant: pd.Series, test sample covariant
55
+ :return: p-value
56
+ """
57
+
58
+ full_value = pd.concat([
59
+ control.rename("value"),
60
+ test.rename("value"),
61
+ ], axis=0)
62
+
63
+ full_covariant = pd.concat([
64
+ control_covariant.rename("covariant"),
65
+ test_covariant.rename("covariant"),
66
+ ], axis=0)
67
+
68
+ cov = np.cov(full_covariant, full_value)[0, 1]
69
+ var = full_covariant.var()
70
+ theta = cov / var
71
+
72
+ cuped_test = test - theta * test_covariant
73
+ cuped_control = control - theta * control_covariant
74
+
75
+ return ttest(cuped_control, cuped_test)
76
+
77
+
78
+
79
+ def regression_test(
80
+ control: pd.Series,
81
+ test: pd.Series,
82
+ ) -> float:
83
+ """
84
+ Treatment effect estimation using linear regression
85
+ :param control: pd.Series with index [entity, dt], where dt could be int of datetime. Control sample
86
+ :param test: pd.Series with index [entity, dt], where dt could be int of datetime. Test sample
87
+ :return: p-value
88
+ """
89
+ df = pd.concat([
90
+ control.rename("value").to_frame().assign(treated=0),
91
+ test.rename("value").to_frame().assign(treated=1),
92
+ ], axis=0)
93
+ df["bias"] = 1
94
+
95
+ if not isinstance(df.index, pd.MultiIndex):
96
+ df["index1"] = 0
97
+ df["index2"] = 1
98
+ df = df.set_index(["index1", "index2"])
99
+
100
+ mod = lm.PanelOLS.from_formula("value ~ bias + treated", data=df)
101
+ result = mod.fit()
102
+ return result.pvalues["treated"]
103
+
104
+
105
+ def did_regression_test(
106
+ control: pd.Series,
107
+ control_pre: pd.Series,
108
+ test: pd.Series,
109
+ test_pre: pd.Series,
110
+ ) -> float:
111
+ """
112
+ Difference-in-Difference treatment effect estimation using linear regression.
113
+ Calculates difference between current and last values in test and control groups and then
114
+ calculates difference between differences to increase test power
115
+ :param control_pre: pd.Series with index [entity, dt], where dt could be int of datetime.
116
+ Control sample before treatment
117
+ :param control: pd.Series with index [entity, dt], where dt could be int of datetime.
118
+ Control sample after treatment
119
+ :param test_pre: pd.Series with index [entity, dt], where dt could be int of datetime. Test sample before treatment
120
+ :param test: pd.Series with index [entity, dt], where dt could be int of datetime. Test sample after treatment
121
+ :return: p-value
122
+ """
123
+ df = pd.concat([
124
+ control_pre.rename("value").to_frame().assign(treated=0).assign(after=0),
125
+ control.rename("value").to_frame().assign(treated=0).assign(after=1),
126
+ test_pre.rename("value").to_frame().assign(treated=1).assign(after=0),
127
+ test.rename("value").to_frame().assign(treated=1).assign(after=1),
128
+ ], axis=0)
129
+
130
+ df["bias"] = 1
131
+
132
+ if not isinstance(df.index, pd.MultiIndex):
133
+ df["index1"] = 0
134
+ df["index2"] = 1
135
+ df = df.set_index(["index1", "index2"])
136
+
137
+ mod = lm.PanelOLS.from_formula("value ~ bias + + after + treated + treated*after", data=df)
138
+ result = mod.fit()
139
+ return result.pvalues["treated:after"]
140
+
141
+
142
+ def additional_vars_regression_test(
143
+ control: pd.Series,
144
+ control_additional_vars: List[pd.Series],
145
+ test: pd.Series,
146
+ test_additional_vars: List[pd.Series],
147
+ ) -> float:
148
+ """
149
+ Treatment effect estimation using additional variables in linear regression. Additional
150
+ variables should reduce deviation of target variable and increase test power
151
+ :param control: pd.Series with index [entity, dt], where dt could be int of datetime.
152
+ Control sample
153
+ :param control_additional_vars: List of pd.Series with index [entity, dt], where dt could be int of datetime.
154
+ Additional variables which can describe some deviation of tested variable
155
+ :param test: pd.Series with index [entity, dt], where dt could be int of datetime.
156
+ Test sample
157
+ :param test_additional_vars: List of pd.Series with index [entity, dt], where dt could be int of datetime.
158
+ Additional variables which can describe some deviation of tested variable
159
+ :return: p-value
160
+ """
161
+
162
+ assert len(test_additional_vars) > 0, "No additional vars for 'additional_vars_regression_test' test given"
163
+
164
+ additional_vars_names_test = [v.name for v in test_additional_vars]
165
+ additional_vars_names_control = [v.name for v in control_additional_vars]
166
+ assert set(additional_vars_names_test) == set(additional_vars_names_control), \
167
+ (f"Lists of control and test additional vars should the same. "
168
+ f"Got {set(additional_vars_names_test)} vars for test "
169
+ f"and {set(additional_vars_names_control)} vars for control")
170
+
171
+ control_df = pd.concat([control.rename("value").to_frame()] + control_additional_vars, axis=1)
172
+ test_df = pd.concat([test.rename("value").to_frame()] + test_additional_vars, axis=1)
173
+
174
+ control_df.index = test_df.index
175
+ df = pd.concat([
176
+ control_df.assign(treated=0),
177
+ test_df.assign(treated=1),
178
+ ], axis=0)
179
+
180
+ df["bias"] = 1
181
+
182
+ if not isinstance(df.index, pd.MultiIndex):
183
+ df["index1"] = 0
184
+ df["index2"] = 1
185
+ df = df.set_index(["index1", "index2"])
186
+
187
+ additional_vars_formula = " + ".join(map(str, additional_vars_names_test))
188
+
189
+ formula = f"value ~ bias + treated + {additional_vars_formula}"
190
+ mod = lm.PanelOLS.from_formula(formula, data=df)
191
+ result = mod.fit()
192
+ return result.pvalues["treated"]