pycausal-inference-joshlim 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycausal_inference_joshlim-0.1.0/LICENSE +21 -0
- pycausal_inference_joshlim-0.1.0/PKG-INFO +80 -0
- pycausal_inference_joshlim-0.1.0/README.md +42 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim/__init__.py +24 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim/meta_learners.py +163 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim/propensity.py +60 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim/rct.py +61 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim.egg-info/PKG-INFO +80 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim.egg-info/SOURCES.txt +13 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim.egg-info/dependency_links.txt +1 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim.egg-info/requires.txt +13 -0
- pycausal_inference_joshlim-0.1.0/pycausal_inference_joshlim.egg-info/top_level.txt +1 -0
- pycausal_inference_joshlim-0.1.0/pyproject.toml +69 -0
- pycausal_inference_joshlim-0.1.0/setup.cfg +4 -0
- pycausal_inference_joshlim-0.1.0/tests/test_meta_learners.py +96 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Josh Lim
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pycausal-inference-joshlim
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python package for causal inference methods including ATE estimation, propensity score methods, and meta-learners
|
|
5
|
+
Author-email: Josh Lim <j.lim703@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jhl126/pycausal-inference-joshlim
|
|
8
|
+
Project-URL: Documentation, https://github.com/jhl126/pycausal-inference-joshlim#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/jhl126/pycausal-inference-joshlim
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/jhl126/pycausal-inference-joshlim/issues
|
|
11
|
+
Keywords: causal inference,statistics,machine learning,treatment effects
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pandas>=1.3.0
|
|
26
|
+
Requires-Dist: numpy>=1.21.0
|
|
27
|
+
Requires-Dist: scipy>=1.7.0
|
|
28
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
29
|
+
Requires-Dist: lightgbm>=3.3.0
|
|
30
|
+
Requires-Dist: patsy>=0.5.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pylint>=2.12.0; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy>=0.950; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
[](https://github.com/jhl126/pycausal-inference-joshlim/actions)
|
|
40
|
+
|
|
41
|
+
# Causal Inference Python Package - Josh Lim
|
|
42
|
+
|
|
43
|
+
This package provides key causal inference methods. These methods include ATE estimation from randomized experiments, propensity score methods, and meta-learners.
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Clone the repository
|
|
49
|
+
git clone https://github.com/jhl126/pycausal-inference-joshlim.git
|
|
50
|
+
cd pycausal-inference-joshlim
|
|
51
|
+
|
|
52
|
+
# Install in editable mode
|
|
53
|
+
uv pip install -e .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
Import functions with the following code:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from pycausal_inference_joshlim import calculate_ate_ci, calculate_ate_pvalue
|
|
62
|
+
from pycausal_inference_joshlim import ipw, doubly_robust
|
|
63
|
+
from pycausal_inference_joshlim import s_learner_discrete, t_learner_discrete, x_learner_discrete, double_ml_cate
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## API Documentation
|
|
67
|
+
|
|
68
|
+
### RCT Module
|
|
69
|
+
- `calculate_ate_ci(data)` - Calculates the average treatment effect (ATE) and confidence interval from randomized experiment data
|
|
70
|
+
- `calculate_ate_pvalue(data)` - Calculates the p-value for the ATE estimate
|
|
71
|
+
|
|
72
|
+
### Propensity Score Module
|
|
73
|
+
- `ipw(data)` - Estimates the ATE using inverse probability weighting
|
|
74
|
+
- `doubly_robust(data)` - Estimates the ATE using the doubly robust estimator
|
|
75
|
+
|
|
76
|
+
### Meta-Learners Module
|
|
77
|
+
- `s_learner_discrete(data)` - Estimates heterogeneous treatment effects using the S-Learner approach
|
|
78
|
+
- `t_learner_discrete(data)` - Estimates heterogeneous treatment effects using the T-Learner approach
|
|
79
|
+
- `x_learner_discrete(data)` - Estimates heterogeneous treatment effects using the X-Learner approach
|
|
80
|
+
- `double_ml_cate(data)` - Estimates heterogeneous treatment effects using Double ML
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[](https://github.com/jhl126/pycausal-inference-joshlim/actions)
|
|
2
|
+
|
|
3
|
+
# Causal Inference Python Package - Josh Lim
|
|
4
|
+
|
|
5
|
+
This package provides key causal inference methods. These methods include ATE estimation from randomized experiments, propensity score methods, and meta-learners.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# Clone the repository
|
|
11
|
+
git clone https://github.com/jhl126/pycausal-inference-joshlim.git
|
|
12
|
+
cd pycausal-inference-joshlim
|
|
13
|
+
|
|
14
|
+
# Install in editable mode
|
|
15
|
+
uv pip install -e .
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
Import functions with the following code:
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from pycausal_inference_joshlim import calculate_ate_ci, calculate_ate_pvalue
|
|
24
|
+
from pycausal_inference_joshlim import ipw, doubly_robust
|
|
25
|
+
from pycausal_inference_joshlim import s_learner_discrete, t_learner_discrete, x_learner_discrete, double_ml_cate
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## API Documentation
|
|
29
|
+
|
|
30
|
+
### RCT Module
|
|
31
|
+
- `calculate_ate_ci(data)` - Calculates the average treatment effect (ATE) and confidence interval from randomized experiment data
|
|
32
|
+
- `calculate_ate_pvalue(data)` - Calculates the p-value for the ATE estimate
|
|
33
|
+
|
|
34
|
+
### Propensity Score Module
|
|
35
|
+
- `ipw(df, ps_formula, T, Y)` - Estimates the ATE using inverse probability weighting
|
|
36
|
+
- `doubly_robust(df, formula, T, Y)` - Estimates the ATE using the doubly robust estimator
|
|
37
|
+
|
|
38
|
+
### Meta-Learners Module
|
|
39
|
+
- `s_learner_discrete(train, test, X, T, y)` - Estimates heterogeneous treatment effects using the S-Learner approach
|
|
40
|
+
- `t_learner_discrete(train, test, X, T, y)` - Estimates heterogeneous treatment effects using the T-Learner approach
|
|
41
|
+
- `x_learner_discrete(train, test, X, T, y)` - Estimates heterogeneous treatment effects using the X-Learner approach
|
|
42
|
+
- `double_ml_cate(train, test, X, T, y)` - Estimates heterogeneous treatment effects using Double ML
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""pycausal_inference_joshlim - Causal Inference Toolkit"""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
# Import key functions for easy access
|
|
6
|
+
from .rct import calculate_ate_ci, calculate_ate_pvalue
|
|
7
|
+
from .propensity import ipw, doubly_robust
|
|
8
|
+
from .meta_learners import (
|
|
9
|
+
s_learner_discrete,
|
|
10
|
+
t_learner_discrete,
|
|
11
|
+
x_learner_discrete,
|
|
12
|
+
double_ml_cate
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"calculate_ate_ci",
|
|
17
|
+
"calculate_ate_pvalue",
|
|
18
|
+
"ipw",
|
|
19
|
+
"doubly_robust",
|
|
20
|
+
"s_learner_discrete",
|
|
21
|
+
"t_learner_discrete",
|
|
22
|
+
"x_learner_discrete",
|
|
23
|
+
"double_ml_cate",
|
|
24
|
+
]
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from lightgbm import LGBMRegressor
|
|
4
|
+
from sklearn.linear_model import LogisticRegression
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def simple_data(self=None):
    """Generate a confounded synthetic dataset with a constant treatment effect of 2.0.

    Fix: this module-level function kept a spurious ``self`` parameter
    (apparently copied out of a test class), which made ``simple_data()``
    uncallable with zero arguments. ``self`` now defaults to ``None`` and is
    ignored, so both old and new call styles work.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (train, test) split 800/200, each
        with columns 'x1', 'x2', 't' (0/1 treatment), 'y' (outcome).
    """
    np.random.seed(42)
    n = 1000

    # Covariates
    x1 = np.random.normal(0, 1, n)
    x2 = np.random.normal(0, 1, n)

    # Treatment assignment (confounded: probability depends on x1, x2)
    prob_t = 1 / (1 + np.exp(-(0.5 * x1 + 0.3 * x2)))
    t = np.random.binomial(1, prob_t, n)

    # Outcome with constant treatment effect = 2.0
    y = 2.0 * t + x1 + 0.5 * x2 + np.random.normal(0, 0.5, n)

    df = pd.DataFrame({'x1': x1, 'x2': x2, 't': t, 'y': y})

    # Split into train/test
    train = df.iloc[:800].copy()
    test = df.iloc[800:].copy()

    return train, test
|
|
30
|
+
|
|
31
|
+
def heterogeneous_data(self=None):
    """Generate data with a heterogeneous treatment effect CATE(x1) = 1 + 0.5*x1.

    Fix: this module-level function kept a spurious ``self`` parameter
    (apparently copied out of a test class), which made it uncallable with
    zero arguments. ``self`` now defaults to ``None`` and is ignored.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (train, test) split 1200/300, each
        with columns 'x1', 'x2', 't' (0/1 treatment), 'y' (outcome).
    """
    np.random.seed(123)
    n = 1500

    # Covariates
    x1 = np.random.normal(0, 1, n)
    x2 = np.random.normal(0, 1, n)

    # Treatment assignment (confounded through x1 only)
    prob_t = 1 / (1 + np.exp(-(0.4 * x1)))
    t = np.random.binomial(1, prob_t, n)

    # Outcome with heterogeneous effect: CATE(x1) = 1 + 0.5*x1
    te = 1.0 + 0.5 * x1
    y = te * t + x1 + 0.3 * x2 + np.random.normal(0, 0.5, n)

    df = pd.DataFrame({'x1': x1, 'x2': x2, 't': t, 'y': y})

    train = df.iloc[:1200].copy()
    test = df.iloc[1200:].copy()

    return train, test
|
|
55
|
+
|
|
56
|
+
def continuous_treatment_data(self=None):
    """Generate data with a continuous treatment and unit linear effect on y.

    Fix: this module-level function kept a spurious ``self`` parameter
    (apparently copied out of a test class), which made it uncallable with
    zero arguments. ``self`` now defaults to ``None`` and is ignored.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (train, test) split 800/200, each
        with columns 'x1', 'x2', 't' (continuous treatment), 'y' (outcome).
    """
    np.random.seed(789)
    n = 1000

    # Covariates
    x1 = np.random.normal(0, 1, n)
    x2 = np.random.normal(0, 1, n)

    # Continuous treatment, shifted positive and driven by the covariates
    t = 10 + x1 + 2*x2 + np.random.normal(0, 1, n)

    # Outcome: linear effect of treatment (coefficient 1)
    y = t + x1 + 0.5*x2 + np.random.normal(0, 0.5, n)

    df = pd.DataFrame({'x1': x1, 'x2': x2, 't': t, 'y': y})

    train = df.iloc[:800].copy()
    test = df.iloc[800:].copy()

    return train, test
|
|
77
|
+
|
|
78
|
+
def s_learner_discrete(train, test, X, T, y) -> pd.DataFrame:
    """Estimate per-row treatment effects with the S-Learner.

    Fits a single LGBMRegressor on the covariates plus the treatment
    indicator, then scores every test row twice — once with the treatment
    forced to 1 and once forced to 0. The difference of the two predictions
    is the estimated CATE.

    Args:
        train, test: DataFrames holding covariates, treatment and outcome.
        X: list of covariate column names.
        T: treatment column name (assumed binary 0/1).
        y: outcome column name.

    Returns:
        Copy of ``test`` with an added 'cate' column.
    """
    features = X + [T]
    learner = LGBMRegressor()
    learner.fit(train[features], train[y])

    # Counterfactual copies of the test set: everyone treated / untreated.
    treated = test.copy()
    treated[T] = 1
    untreated = test.copy()
    untreated[T] = 0

    result = test.copy()
    result['cate'] = (
        learner.predict(treated[features]) - learner.predict(untreated[features])
    )
    return result
|
|
94
|
+
|
|
95
|
+
def t_learner_discrete(train, test, X, T, y) -> pd.DataFrame:
    """Estimate per-row treatment effects with the T-Learner.

    Fits one outcome model per treatment arm and takes the difference of
    their test-set predictions as the CATE.

    Args:
        train, test: DataFrames holding covariates, treatment and outcome.
        X: list of covariate column names.
        T: treatment column name (assumed binary 0/1).
        y: outcome column name.

    Returns:
        Copy of ``test`` with an added 'cate' column.
    """
    control_rows = train.loc[train[T] == 0]
    treated_rows = train.loc[train[T] == 1]

    control_model = LGBMRegressor()
    treated_model = LGBMRegressor()
    control_model.fit(control_rows[X], control_rows[y])
    treated_model.fit(treated_rows[X], treated_rows[y])

    result = test.copy()
    result['cate'] = treated_model.predict(test[X]) - control_model.predict(test[X])
    return result
|
|
111
|
+
|
|
112
|
+
def x_learner_discrete(train, test, X, T, y) -> pd.DataFrame:
    """Estimate per-row treatment effects with the X-Learner.

    Stage 1 fits per-arm outcome models; stage 2 regresses imputed
    individual effects on the covariates within each arm; the final CATE is
    a propensity-weighted blend of the two stage-2 models.

    Args:
        train, test: DataFrames holding covariates, treatment and outcome.
        X: list of covariate column names.
        T: treatment column name (assumed binary 0/1).
        y: outcome column name.

    Returns:
        Copy of ``test`` with an added 'cate' column.
    """
    control_rows = train.loc[train[T] == 0]
    treated_rows = train.loc[train[T] == 1]

    # Stage 1: per-arm outcome models.
    mu0 = LGBMRegressor()
    mu1 = LGBMRegressor()
    mu0.fit(control_rows[X], control_rows[y])
    mu1.fit(treated_rows[X], treated_rows[y])

    # Imputed individual effects: mu1(x) - y on controls, y - mu0(x) on treated.
    imputed_control = mu1.predict(control_rows[X]) - control_rows[y]
    imputed_treated = treated_rows[y] - mu0.predict(treated_rows[X])

    # Stage 2: regress the imputed effects on covariates, per arm.
    tau_control = LGBMRegressor()
    tau_treated = LGBMRegressor()
    tau_control.fit(control_rows[X], imputed_control)
    tau_treated.fit(treated_rows[X], imputed_treated)

    # Propensity model for the blending weights.
    propensity = LogisticRegression(penalty=None)
    propensity.fit(train[X], train[T])
    e_hat = propensity.predict_proba(test[X])[:, 1]

    result = test.copy()
    result['cate'] = (
        e_hat * tau_control.predict(test[X])
        + (1 - e_hat) * tau_treated.predict(test[X])
    )
    return result
|
|
142
|
+
|
|
143
|
+
def double_ml_cate(train, test, X, T, y) -> pd.DataFrame:
    """Estimate per-row treatment effects via residual-on-residual Double ML.

    Residualizes treatment and outcome on the covariates, then fits a final
    model to the transformed target ``Y_res / T_res`` with sample weights
    ``T_res ** 2`` (the R-learner weighting).

    NOTE(review): treatment residuals near zero make the transformed target
    explode; the squared-residual weights damp those rows but there is no
    explicit guard — confirm this is acceptable for the intended data.

    Args:
        train, test: DataFrames holding covariates, treatment and outcome.
        X: list of covariate column names.
        T: treatment column name (continuous treatment supported).
        y: outcome column name.

    Returns:
        Copy of ``test`` with an added 'cate' column.
    """
    # Nuisance model 1: treatment given covariates.
    treatment_model = LGBMRegressor()
    treatment_model.fit(train[X], train[T])
    treatment_residual = train[T] - treatment_model.predict(train[X])

    # Nuisance model 2: outcome given covariates.
    outcome_model = LGBMRegressor()
    outcome_model.fit(train[X], train[y])
    outcome_residual = train[y] - outcome_model.predict(train[X])

    # Transformed target and weights for the final CATE regression.
    transformed_target = outcome_residual / treatment_residual
    weights = treatment_residual ** 2

    cate_model = LGBMRegressor()
    cate_model.fit(train[X], transformed_target, sample_weight=weights)

    result = test.copy()
    result['cate'] = cate_model.predict(test[X])
    return result
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Solution obtained from Claude to solve for packaging error
# NOTE(review): running pip at import time is a serious side effect — it is
# slow, touches the network/environment on every import, and its result is
# silently discarded (capture_output=True, return code unchecked). This
# dependency belongs in pyproject.toml; confirm and remove this block.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "pip", "install", "packaging"], capture_output=True)

# Add imports
import numpy as np
import pandas as pd
from patsy import dmatrices, dmatrix
from sklearn.linear_model import LinearRegression, LogisticRegression

# NOTE(review): everything below looks like leftover test-fixture code that
# executes on every import of this module. It seeds the *global* NumPy RNG
# (twice) as a side effect, and `df` is rebound twice and never used by
# ipw/doubly_robust — candidate for moving into the test suite.
"""Test 1: IPW with simple positive treatment effect"""
np.random.seed(42)
n = 1000
# Generate data with known ATE = 2
x = np.random.normal(0, 1, n)
prob_t = 1 / (1 + np.exp(-(0.5 * x)))
t = np.random.binomial(1, prob_t, n)
y = 2 * t + x + np.random.normal(0, 0.5, n)
df = pd.DataFrame({'x': x, 't': t, 'y': y})

"""Test 5: IPW with categorical covariate"""
np.random.seed(101)
n = 1000
# Generate data with categorical confounder
group = np.random.choice(['A', 'B', 'C'], n)
group_effect = {'A': 0, 'B': 1, 'C': 2}
x_numeric = np.array([group_effect[g] for g in group])
prob_t = 1 / (1 + np.exp(-(0.5 * x_numeric)))
t = np.random.binomial(1, prob_t, n)
y = 2.0 * t + x_numeric + np.random.normal(0, 0.5, n)
df = pd.DataFrame({'group': group, 't': t, 'y': y})
|
|
34
|
+
|
|
35
|
+
def ipw(df: pd.DataFrame, ps_formula: str, T: str, Y: str) -> float:
    """Estimate the ATE by inverse probability weighting.

    Fits a logistic propensity model on the design matrix built from the
    patsy formula, then averages the transformed outcome
    (T - e(x)) / (e(x) * (1 - e(x))) * Y over the sample.

    Args:
        df: data with treatment and outcome columns plus formula covariates.
        ps_formula: patsy formula (right-hand side) for the propensity model.
        T: treatment column name (assumed binary 0/1).
        Y: outcome column name.

    Returns:
        The IPW estimate of the average treatment effect.
    """
    design = dmatrix(ps_formula, df)
    propensity_model = LogisticRegression(penalty=None, max_iter=1000)
    propensity_model.fit(design, df[T])
    e_hat = propensity_model.predict_proba(design)[:, 1]

    # NOTE(review): propensities near 0 or 1 make these weights explode;
    # no trimming is applied here.
    weights = (df[T] - e_hat) / (e_hat * (1 - e_hat))
    return np.mean(weights * df[Y])
|
|
40
|
+
|
|
41
|
+
def doubly_robust(df: pd.DataFrame, formula: str, T: str, Y: str) -> float:
    """Estimate the ATE with the AIPW doubly robust estimator.

    Combines a logistic propensity model with a linear outcome model; the
    estimate is consistent when either model is correctly specified.

    Args:
        df: data with treatment and outcome columns plus formula covariates.
        formula: patsy formula (right-hand side) for the covariates.
        T: treatment column name (assumed binary 0/1).
        Y: outcome column name.

    Returns:
        The doubly robust estimate of the average treatment effect.
    """
    # Propensity scores from the covariate-only design matrix.
    design = dmatrix(formula, df)
    ps_model = LogisticRegression(penalty=None, max_iter=1000)
    ps_model.fit(design, df[T])
    e_hat = ps_model.predict_proba(design)[:, 1]

    # Outcome regression on treatment + covariates.
    y_mat, x_outcome = dmatrices(f"{Y} ~ {T} + {formula}", df)
    outcome_model = LinearRegression()
    outcome_model.fit(x_outcome, np.array(y_mat).flatten())

    # Counterfactual predictions with the treatment forced to 1 and to 0.
    everyone_treated = df.copy()
    everyone_treated[T] = 1
    everyone_control = df.copy()
    everyone_control[T] = 0
    mu1 = outcome_model.predict(dmatrix(f"{T} + {formula}", everyone_treated)).flatten()
    mu0 = outcome_model.predict(dmatrix(f"{T} + {formula}", everyone_control)).flatten()

    # AIPW: bias-corrected mean per arm, then their difference.
    treated_mean = np.mean(df[T] * (df[Y] - mu1) / e_hat + mu1)
    control_mean = np.mean((1 - df[T]) * (df[Y] - mu0) / (1 - e_hat) + mu0)
    return treated_mean - control_mean
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from scipy.stats import norm
|
|
6
|
+
|
|
7
|
+
# Positive effect data
# NOTE(review): these module-level fixtures execute on every import of this
# module, seeding the *global* NumPy RNG as a side effect. They are not used
# by the functions below — candidate for moving into the test suite.
np.random.seed(42)
n = 1000
positive_effect_data = pd.DataFrame({
    'I': range(n),
    'T': np.random.binomial(1, 0.5, n),
})
# Outcome mean 10 for treated vs 8 for control -> true ATE of 2.
positive_effect_data['Y'] = np.where(
    positive_effect_data['T'] == 1,
    np.random.normal(10, 2, n),
    np.random.normal(8, 2, n)
)

# No effect data
np.random.seed(123)
n = 500
no_effect_data = pd.DataFrame({
    'I': range(n),
    'T': np.random.binomial(1, 0.5, n),
})
# Outcome independent of treatment -> true ATE of 0.
no_effect_data['Y'] = np.random.normal(5, 3, n)
|
|
28
|
+
|
|
29
|
+
def calculate_ate_ci(data: pd.DataFrame, alpha: float = 0.05) -> Tuple[float, float, float]:
    """Compute the difference-in-means ATE and its normal-approximation CI.

    Args:
        data: DataFrame with columns 'T' (0/1 treatment) and 'Y' (outcome).
        alpha: significance level; the CI covers (1 - alpha).

    Returns:
        (ate, ci_lower, ci_upper).
    """
    treated = data.loc[data['T'] == 1, 'Y']
    control = data.loc[data['T'] == 0, 'Y']

    ate = treated.mean() - control.mean()

    # Standard error of the mean difference (unpooled sample variances).
    se = np.sqrt(treated.var() / len(treated) + control.var() / len(control))

    z = norm.ppf(1 - alpha / 2)
    return ate, ate - z * se, ate + z * se
|
|
46
|
+
|
|
47
|
+
def calculate_ate_pvalue(data: pd.DataFrame) -> Tuple[float, float, float]:
    """Compute the difference-in-means ATE, its test statistic and p-value.

    Args:
        data: DataFrame with columns 'T' (0/1 treatment) and 'Y' (outcome).

    Returns:
        (ate, t_statistic, p_value); the p-value is two-sided under a
        standard normal reference distribution.
    """
    treated = data.loc[data['T'] == 1, 'Y']
    control = data.loc[data['T'] == 0, 'Y']

    ate = treated.mean() - control.mean()

    # Standard error of the mean difference (unpooled sample variances).
    se = np.sqrt(treated.var() / len(treated) + control.var() / len(control))

    t_statistic = ate / se
    p_value = 2 * (1 - norm.cdf(abs(t_statistic)))
    return ate, t_statistic, p_value
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pycausal-inference-joshlim
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python package for causal inference methods including ATE estimation, propensity score methods, and meta-learners
|
|
5
|
+
Author-email: Josh Lim <j.lim703@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jhl126/pycausal-inference-joshlim
|
|
8
|
+
Project-URL: Documentation, https://github.com/jhl126/pycausal-inference-joshlim#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/jhl126/pycausal-inference-joshlim
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/jhl126/pycausal-inference-joshlim/issues
|
|
11
|
+
Keywords: causal inference,statistics,machine learning,treatment effects
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pandas>=1.3.0
|
|
26
|
+
Requires-Dist: numpy>=1.21.0
|
|
27
|
+
Requires-Dist: scipy>=1.7.0
|
|
28
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
29
|
+
Requires-Dist: lightgbm>=3.3.0
|
|
30
|
+
Requires-Dist: patsy>=0.5.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pylint>=2.12.0; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy>=0.950; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
[](https://github.com/jhl126/pycausal-inference-joshlim/actions)
|
|
40
|
+
|
|
41
|
+
# Causal Inference Python Package - Josh Lim
|
|
42
|
+
|
|
43
|
+
This package provides key causal inference methods. These methods include ATE estimation from randomized experiments, propensity score methods, and meta-learners.
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Clone the repository
|
|
49
|
+
git clone https://github.com/jhl126/pycausal-inference-joshlim.git
|
|
50
|
+
cd pycausal-inference-joshlim
|
|
51
|
+
|
|
52
|
+
# Install in editable mode
|
|
53
|
+
uv pip install -e .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
Import functions with the following code:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from pycausal_inference_joshlim import calculate_ate_ci, calculate_ate_pvalue
|
|
62
|
+
from pycausal_inference_joshlim import ipw, doubly_robust
|
|
63
|
+
from pycausal_inference_joshlim import s_learner_discrete, t_learner_discrete, x_learner_discrete, double_ml_cate
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## API Documentation
|
|
67
|
+
|
|
68
|
+
### RCT Module
|
|
69
|
+
- `calculate_ate_ci(data)` - Calculates the average treatment effect (ATE) and confidence interval from randomized experiment data
|
|
70
|
+
- `calculate_ate_pvalue(data)` - Calculates the p-value for the ATE estimate
|
|
71
|
+
|
|
72
|
+
### Propensity Score Module
|
|
73
|
+
- `ipw(data)` - Estimates the ATE using inverse probability weighting
|
|
74
|
+
- `doubly_robust(data)` - Estimates the ATE using the doubly robust estimator
|
|
75
|
+
|
|
76
|
+
### Meta-Learners Module
|
|
77
|
+
- `s_learner_discrete(data)` - Estimates heterogeneous treatment effects using the S-Learner approach
|
|
78
|
+
- `t_learner_discrete(data)` - Estimates heterogeneous treatment effects using the T-Learner approach
|
|
79
|
+
- `x_learner_discrete(data)` - Estimates heterogeneous treatment effects using the X-Learner approach
|
|
80
|
+
- `double_ml_cate(data)` - Estimates heterogeneous treatment effects using Double ML
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
pycausal_inference_joshlim/__init__.py
|
|
5
|
+
pycausal_inference_joshlim/meta_learners.py
|
|
6
|
+
pycausal_inference_joshlim/propensity.py
|
|
7
|
+
pycausal_inference_joshlim/rct.py
|
|
8
|
+
pycausal_inference_joshlim.egg-info/PKG-INFO
|
|
9
|
+
pycausal_inference_joshlim.egg-info/SOURCES.txt
|
|
10
|
+
pycausal_inference_joshlim.egg-info/dependency_links.txt
|
|
11
|
+
pycausal_inference_joshlim.egg-info/requires.txt
|
|
12
|
+
pycausal_inference_joshlim.egg-info/top_level.txt
|
|
13
|
+
tests/test_meta_learners.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pycausal_inference_joshlim
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pycausal-inference-joshlim"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A Python package for causal inference methods including ATE estimation, propensity score methods, and meta-learners"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Josh Lim", email = "j.lim703@gmail.com"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["causal inference", "statistics", "machine learning", "treatment effects"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Mathematics",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
dependencies = [
|
|
30
|
+
"pandas>=1.3.0",
|
|
31
|
+
"numpy>=1.21.0",
|
|
32
|
+
"scipy>=1.7.0",
|
|
33
|
+
"scikit-learn>=1.0.0",
|
|
34
|
+
"lightgbm>=3.3.0",
|
|
35
|
+
"patsy>=0.5.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=7.0.0",
|
|
41
|
+
"pytest-cov>=3.0.0",
|
|
42
|
+
"black>=22.0.0",
|
|
43
|
+
"pylint>=2.12.0",
|
|
44
|
+
"mypy>=0.950",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://github.com/jhl126/pycausal-inference-joshlim"
|
|
49
|
+
Documentation = "https://github.com/jhl126/pycausal-inference-joshlim#readme"
|
|
50
|
+
Repository = "https://github.com/jhl126/pycausal-inference-joshlim"
|
|
51
|
+
"Bug Tracker" = "https://github.com/jhl126/pycausal-inference-joshlim/issues"
|
|
52
|
+
|
|
53
|
+
[tool.pytest.ini_options]
|
|
54
|
+
testpaths = ["tests"]
|
|
55
|
+
python_files = ["test_*.py"]
|
|
56
|
+
python_classes = ["Test*"]
|
|
57
|
+
python_functions = ["test_*"]
|
|
58
|
+
addopts = "-v --cov=pycausal_inference_joshlim --cov-report=html --cov-report=term"
|
|
59
|
+
|
|
60
|
+
[tool.black]
|
|
61
|
+
line-length = 88
|
|
62
|
+
target-version = ['py38', 'py39', 'py310', 'py311']
|
|
63
|
+
include = '\.pyi?$'
|
|
64
|
+
|
|
65
|
+
[tool.mypy]
|
|
66
|
+
python_version = "3.8"
|
|
67
|
+
warn_return_any = true
|
|
68
|
+
warn_unused_configs = true
|
|
69
|
+
disallow_untyped_defs = false
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
from pycausal_inference_joshlim import s_learner_discrete, t_learner_discrete, x_learner_discrete, double_ml_cate
|
|
5
|
+
|
|
6
|
+
def simple_data():
    """Build a deterministic confounded dataset (true constant ATE = 2.0).

    Returns:
        (train, test): DataFrames split 800/200 with columns x1, x2, t, y.
    """
    np.random.seed(42)
    size = 1000

    cov1 = np.random.normal(0, 1, size)
    cov2 = np.random.normal(0, 1, size)

    # Confounded treatment: assignment probability depends on the covariates.
    treat_prob = 1 / (1 + np.exp(-(0.5 * cov1 + 0.3 * cov2)))
    treat = np.random.binomial(1, treat_prob, size)

    # Outcome with a constant treatment effect of 2.0.
    outcome = 2.0 * treat + cov1 + 0.5 * cov2 + np.random.normal(0, 0.5, size)

    frame = pd.DataFrame({'x1': cov1, 'x2': cov2, 't': treat, 'y': outcome})
    return frame.iloc[:800].copy(), frame.iloc[800:].copy()
|
|
29
|
+
|
|
30
|
+
def continuous_treatment_data():
    """Build a deterministic dataset with a continuous treatment (unit effect).

    Returns:
        (train, test): DataFrames split 800/200 with columns x1, x2, t, y.
    """
    np.random.seed(789)
    size = 1000

    cov1 = np.random.normal(0, 1, size)
    cov2 = np.random.normal(0, 1, size)

    # Continuous treatment shifted positive and driven by the covariates.
    treat = 10 + cov1 + 2*cov2 + np.random.normal(0, 1, size)

    # Outcome: linear in the treatment with coefficient 1.
    outcome = treat + cov1 + 0.5*cov2 + np.random.normal(0, 0.5, size)

    frame = pd.DataFrame({'x1': cov1, 'x2': cov2, 't': treat, 'y': outcome})
    return frame.iloc[:800].copy(), frame.iloc[800:].copy()
|
|
51
|
+
|
|
52
|
+
def test_s_learner_returns_dataframe():
    """S-Learner output is a DataFrame."""
    train, test = simple_data()
    result = s_learner_discrete(train, test, ['x1', 'x2'], 't', 'y')
    assert isinstance(result, pd.DataFrame)

def test_s_learner_has_cate_column():
    """S-Learner output carries the 'cate' estimate column."""
    train, test = simple_data()
    result = s_learner_discrete(train, test, ['x1', 'x2'], 't', 'y')
    assert 'cate' in result.columns

def test_s_learner_constant_effect():
    """S-Learner recovers the known constant ATE of 2.0 within tolerance."""
    train, test = simple_data()
    result = s_learner_discrete(train, test, ['x1', 'x2'], 't', 'y')
    assert abs(result['cate'].mean() - 2.0) < 0.5

def test_s_learner_return_numeric_cate():
    """'cate' column has a numeric dtype."""
    train, test = simple_data()
    result = s_learner_discrete(train, test, ['x1','x2'], 't','y')
    assert pd.api.types.is_numeric_dtype(result['cate'])

def test_s_learner_no_nan_values():
    """'cate' column contains no NaNs."""
    train, test = simple_data()
    result = s_learner_discrete(train, test, ['x1', 'x2'], 't', 'y')
    assert result['cate'].isna().sum() == 0

def test_t_learner_returns_dataframe():
    """T-Learner output is a DataFrame."""
    train, test = simple_data()
    result = t_learner_discrete(train, test, ['x1','x2'], 't', 'y')
    assert isinstance(result, pd.DataFrame)

def test_x_learner_returns_dataframe():
    """X-Learner output is a DataFrame."""
    train, test = simple_data()
    result = x_learner_discrete(train, test, ['x1','x2'],'t','y')
    assert isinstance(result, pd.DataFrame)

def test_double_ml_returns_dataframe():
    """Double ML output is a DataFrame (continuous treatment fixture)."""
    train, test = continuous_treatment_data()
    result = double_ml_cate(train, test, ['x1','x2'],'t','y')
    assert isinstance(result, pd.DataFrame)

def test_double_ml_continuous_treatment():
    """Double ML recovers the known unit treatment effect within tolerance."""
    train, test = continuous_treatment_data()
    result = double_ml_cate(train, test, ['x1','x2'],'t','y')
    estimated_ate = result['cate'].mean()
    assert abs(estimated_ate - 1) < 0.5