aigroup-econ-mcp 1.3.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .gitignore +253 -0
- PKG-INFO +732 -0
- README.md +687 -0
- __init__.py +14 -0
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- aigroup_econ_mcp-2.0.1.dist-info/entry_points.txt +2 -0
- aigroup_econ_mcp-2.0.1.dist-info/licenses/LICENSE +21 -0
- cli.py +32 -0
- econometrics/README.md +18 -0
- econometrics/__init__.py +191 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/basic_parametric_estimation/__init__.py +31 -0
- econometrics/basic_parametric_estimation/gmm/__init__.py +13 -0
- econometrics/basic_parametric_estimation/gmm/gmm_model.py +256 -0
- econometrics/basic_parametric_estimation/mle/__init__.py +13 -0
- econometrics/basic_parametric_estimation/mle/mle_model.py +241 -0
- econometrics/basic_parametric_estimation/ols/__init__.py +13 -0
- econometrics/basic_parametric_estimation/ols/ols_model.py +141 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/missing_data/missing_data_measurement_error/__init__.py +0 -0
- econometrics/model_specification_diagnostics_robust_inference/README.md +173 -0
- econometrics/model_specification_diagnostics_robust_inference/__init__.py +78 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/__init__.py +20 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/diagnostic_tests_model.py +149 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/gls_model.py +130 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/__init__.py +18 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/model_selection_model.py +286 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/regularization_model.py +177 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/robust_errors_model.py +122 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/simultaneous_equations_model.py +246 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/wls_model.py +127 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/nonparametric_semiparametric_methods/__init__.py +0 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_econometrics_new/__init__.py +0 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/specific_data_modeling/survival_duration_data/__init__.py +0 -0
- econometrics/specific_data_modeling/time_series_panel_data/__init__.py +143 -0
- econometrics/specific_data_modeling/time_series_panel_data/arima_model.py +104 -0
- econometrics/specific_data_modeling/time_series_panel_data/cointegration_vecm.py +334 -0
- econometrics/specific_data_modeling/time_series_panel_data/dynamic_panel_models.py +653 -0
- econometrics/specific_data_modeling/time_series_panel_data/exponential_smoothing.py +176 -0
- econometrics/specific_data_modeling/time_series_panel_data/garch_model.py +198 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_diagnostics.py +125 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_var.py +60 -0
- econometrics/specific_data_modeling/time_series_panel_data/structural_break_tests.py +87 -0
- econometrics/specific_data_modeling/time_series_panel_data/time_varying_parameter_models.py +106 -0
- econometrics/specific_data_modeling/time_series_panel_data/unit_root_tests.py +204 -0
- econometrics/specific_data_modeling/time_series_panel_data/var_svar_model.py +372 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/statistical_inference/statistical_inference_techniques/__init__.py +0 -0
- econometrics/statistics/distribution_decomposition_methods/__init__.py +0 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/basic_parametric_estimation_tests/__init__.py +3 -0
- econometrics/tests/basic_parametric_estimation_tests/test_gmm.py +128 -0
- econometrics/tests/basic_parametric_estimation_tests/test_mle.py +127 -0
- econometrics/tests/basic_parametric_estimation_tests/test_ols.py +100 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/model_specification_diagnostics_tests/__init__.py +3 -0
- econometrics/tests/model_specification_diagnostics_tests/test_diagnostic_tests.py +86 -0
- econometrics/tests/model_specification_diagnostics_tests/test_robust_errors.py +89 -0
- econometrics/tests/specific_data_modeling_tests/__init__.py +3 -0
- econometrics/tests/specific_data_modeling_tests/test_arima.py +98 -0
- econometrics/tests/specific_data_modeling_tests/test_dynamic_panel.py +198 -0
- econometrics/tests/specific_data_modeling_tests/test_exponential_smoothing.py +105 -0
- econometrics/tests/specific_data_modeling_tests/test_garch.py +118 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics/tests/specific_data_modeling_tests/test_unit_root.py +156 -0
- econometrics/tests/specific_data_modeling_tests/test_var.py +124 -0
- econometrics/未开发大类优先级分析.md +544 -0
- prompts/__init__.py +0 -0
- prompts/analysis_guides.py +43 -0
- pyproject.toml +85 -0
- resources/MCP_MASTER_GUIDE.md +422 -0
- resources/MCP_TOOLS_DATA_FORMAT_GUIDE.md +185 -0
- resources/__init__.py +0 -0
- server.py +97 -0
- tools/README.md +88 -0
- tools/__init__.py +119 -0
- tools/causal_inference_adapter.py +658 -0
- tools/data_loader.py +213 -0
- tools/decorators.py +38 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/econometrics_adapter.py +286 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -0
- tools/mcp_tool_groups/basic_parametric_tools.py +173 -0
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/model_specification_tools.py +402 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tool_groups/time_series_tools.py +494 -0
- tools/mcp_tools_registry.py +124 -0
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/model_specification_adapter.py +369 -0
- tools/nonparametric_adapter.py +190 -0
- tools/output_formatter.py +563 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- tools/time_series_panel_data_adapter.py +858 -0
- tools/time_series_panel_data_tools.py +65 -0
- aigroup_econ_mcp/__init__.py +0 -19
- aigroup_econ_mcp/cli.py +0 -82
- aigroup_econ_mcp/config.py +0 -561
- aigroup_econ_mcp/server.py +0 -452
- aigroup_econ_mcp/tools/__init__.py +0 -19
- aigroup_econ_mcp/tools/base.py +0 -470
- aigroup_econ_mcp/tools/cache.py +0 -533
- aigroup_econ_mcp/tools/data_loader.py +0 -195
- aigroup_econ_mcp/tools/file_parser.py +0 -1027
- aigroup_econ_mcp/tools/machine_learning.py +0 -60
- aigroup_econ_mcp/tools/ml_ensemble.py +0 -210
- aigroup_econ_mcp/tools/ml_evaluation.py +0 -272
- aigroup_econ_mcp/tools/ml_models.py +0 -54
- aigroup_econ_mcp/tools/ml_regularization.py +0 -186
- aigroup_econ_mcp/tools/monitoring.py +0 -555
- aigroup_econ_mcp/tools/optimized_example.py +0 -229
- aigroup_econ_mcp/tools/panel_data.py +0 -619
- aigroup_econ_mcp/tools/regression.py +0 -214
- aigroup_econ_mcp/tools/statistics.py +0 -154
- aigroup_econ_mcp/tools/time_series.py +0 -698
- aigroup_econ_mcp/tools/timeout.py +0 -283
- aigroup_econ_mcp/tools/tool_descriptions.py +0 -410
- aigroup_econ_mcp/tools/tool_handlers.py +0 -1016
- aigroup_econ_mcp/tools/tool_registry.py +0 -478
- aigroup_econ_mcp/tools/validation.py +0 -482
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +0 -525
- aigroup_econ_mcp-1.3.3.dist-info/RECORD +0 -30
- aigroup_econ_mcp-1.3.3.dist-info/entry_points.txt +0 -2
- /aigroup_econ_mcp-1.3.3.dist-info/licenses/LICENSE → /LICENSE +0 -0
- {aigroup_econ_mcp-1.3.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py (new file)

@@ -0,0 +1,253 @@
+"""
+Causal Forest implementation for heterogeneous treatment effect estimation
+"""
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from typing import Union, Optional, Dict, Any
+
+
+class CausalForest:
+    """
+    Causal Forest for estimating heterogeneous treatment effects
+    """
+
+    def __init__(self, n_estimators: int = 100, min_samples_leaf: int = 5,
+                 max_depth: Optional[int] = None, random_state: int = 42,
+                 honest: bool = True, n_jobs: int = -1):
+        """
+        Initialize Causal Forest model
+
+        Parameters:
+        -----------
+        n_estimators : int
+            Number of trees in the forest
+        min_samples_leaf : int
+            Minimum number of samples required to be at a leaf node
+        max_depth : int, optional
+            Maximum depth of the tree
+        random_state : int
+            Random state for reproducibility
+        honest : bool
+            Whether to use honest splitting (separate samples for splitting and estimation)
+        n_jobs : int
+            Number of jobs to run in parallel
+        """
+        self.n_estimators = n_estimators
+        self.min_samples_leaf = min_samples_leaf
+        self.max_depth = max_depth
+        self.random_state = random_state
+        self.honest = honest
+        self.n_jobs = n_jobs
+
+        # We'll implement a simplified version using two random forests
+        # One for the outcome regression and one for the treatment regression
+        self.mu_model = RandomForestRegressor(
+            n_estimators=n_estimators,
+            min_samples_leaf=min_samples_leaf,
+            max_depth=max_depth,
+            random_state=random_state,
+            n_jobs=n_jobs
+        )
+
+        self.pi_model = RandomForestRegressor(
+            n_estimators=n_estimators,
+            min_samples_leaf=min_samples_leaf,
+            max_depth=max_depth,
+            random_state=random_state,
+            n_jobs=n_jobs
+        )
+
+        # Store results
+        self.fitted = False
+
+    def fit(self, X: Union[np.ndarray, pd.DataFrame],
+            y: Union[np.ndarray, pd.Series],
+            w: Union[np.ndarray, pd.Series]) -> 'CausalForest':
+        """
+        Fit the Causal Forest model
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Covariates
+        y : array-like of shape (n_samples,)
+            Outcome variable
+        w : array-like of shape (n_samples,)
+            Treatment assignment (binary)
+
+        Returns:
+        --------
+        self : CausalForest
+        """
+        # Convert to numpy arrays
+        X = np.asarray(X)
+        y = np.asarray(y)
+        w = np.asarray(w)
+
+        # Fit outcome regression E[Y|X]
+        self.mu_model.fit(X, y)
+
+        # Fit treatment regression E[W|X]
+        self.pi_model.fit(X, w)
+
+        self.fitted = True
+        return self
+
+    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, np.ndarray]:
+        """
+        Predict treatment effects for new samples
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Samples
+
+        Returns:
+        --------
+        results : dict
+            Dictionary with treatment effect estimates and related statistics
+        """
+        if not self.fitted:
+            raise ValueError("Model must be fitted before making predictions")
+
+        # Convert to numpy array
+        X = np.asarray(X)
+
+        # Get base predictions
+        mu_pred = self.mu_model.predict(X)
+        pi_pred = self.pi_model.predict(X)
+
+        # In a full implementation, we would compute heterogeneous treatment effects
+        # For this simplified version, we return the predicted values
+        # A full implementation would involve:
+        # 1. Using honest splitting
+        # 2. Computing R-learner or similar estimates in the leaves
+        # 3. Aggregating across trees
+
+        return {
+            'outcome_prediction': mu_pred,
+            'treatment_propensity': pi_pred,
+            'treatment_effect': mu_pred  # Placeholder - in practice would be different
+        }
+
+    def estimate_treatment_effect(self, X: Union[np.ndarray, pd.DataFrame],
+                                  y: Union[np.ndarray, pd.Series],
+                                  w: Union[np.ndarray, pd.Series]) -> Dict[str, Any]:
+        """
+        Estimate treatment effects using the fitted model
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Covariates
+        y : array-like of shape (n_samples,)
+            Outcome variable
+        w : array-like of shape (n_samples,)
+            Treatment assignment (binary)
+
+        Returns:
+        --------
+        results : dict
+            Dictionary with treatment effect estimates
+        """
+        if not self.fitted:
+            raise ValueError("Model must be fitted first")
+
+        # Convert to numpy arrays
+        X = np.asarray(X)
+        y = np.asarray(y)
+        w = np.asarray(w)
+
+        # Get predictions
+        mu_pred = self.mu_model.predict(X)
+        pi_pred = self.pi_model.predict(X)
+
+        # Compute doubly robust scores for treatment effect estimation
+        # psi = (w - pi_pred) * (y - mu_pred) / (pi_pred * (1 - pi_pred)) + mu_pred
+
+        # Handle edge cases for propensity scores
+        pi_pred = np.clip(pi_pred, 1e-5, 1 - 1e-5)
+
+        # Compute AIPW (Augmented Inverse Probability Weighting) scores
+        w1 = w / pi_pred
+        w0 = (1 - w) / (1 - pi_pred)
+
+        # Estimate treatment effects
+        y1_est = w1 * y + (1 - w1) * mu_pred
+        y0_est = w0 * y + (1 - w0) * mu_pred
+
+        # Individual treatment effects (CATE - Conditional Average Treatment Effect)
+        cate = y1_est - y0_est
+
+        # Average treatment effect
+        ate = np.mean(cate)
+
+        # Standard error (naive)
+        cate_se = np.std(cate) / np.sqrt(len(cate))
+
+        return {
+            'cate': cate,  # Conditional Average Treatment Effects
+            'ate': ate,  # Average Treatment Effect
+            'cate_se': cate_se,
+            'outcome_prediction': mu_pred,
+            'treatment_propensity': pi_pred
+        }
+
+
+def causal_forest_analysis(X: Union[np.ndarray, pd.DataFrame],
+                           y: Union[np.ndarray, pd.Series],
+                           w: Union[np.ndarray, pd.Series],
+                           n_estimators: int = 100,
+                           min_samples_leaf: int = 5,
+                           max_depth: Optional[int] = None,
+                           random_state: int = 42,
+                           honest: bool = True) -> dict:
+    """
+    Perform complete Causal Forest analysis
+
+    Parameters:
+    -----------
+    X : array-like of shape (n_samples, n_features)
+        Covariates
+    y : array-like of shape (n_samples,)
+        Outcome variable
+    w : array-like of shape (n_samples,)
+        Treatment assignment (binary)
+    n_estimators : int
+        Number of trees in the forest
+    min_samples_leaf : int
+        Minimum number of samples required to be at a leaf node
+    max_depth : int, optional
+        Maximum depth of the tree
+    random_state : int
+        Random state for reproducibility
+    honest : bool
+        Whether to use honest splitting
+
+    Returns:
+    --------
+    results : dict
+        Dictionary with model and estimation results
+    """
+    # Initialize and fit model
+    cf_model = CausalForest(
+        n_estimators=n_estimators,
+        min_samples_leaf=min_samples_leaf,
+        max_depth=max_depth,
+        random_state=random_state,
+        honest=honest
+    )
+    cf_model.fit(X, y, w)
+
+    # Estimate treatment effects
+    te_results = cf_model.estimate_treatment_effect(X, y, w)
+
+    return {
+        'model': cf_model,
+        'treatment_effects': te_results,
+        'X': X,
+        'y': y,
+        'w': w
+    }
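For orientation, here is a minimal usage sketch of the new module, assuming the wheel's top-level econometrics package is importable; the synthetic data, seed, and parameter choices are illustrative only, while causal_forest_analysis and the result keys come from the file above.

import numpy as np
from econometrics.advanced_methods.modern_computing_machine_learning.causal_forest import causal_forest_analysis

# Illustrative synthetic data: randomized binary treatment whose effect varies with X[:, 1]
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
w = rng.binomial(1, 0.5, size=500)
y = X[:, 0] + w * (1.0 + X[:, 1]) + rng.normal(size=500)

results = causal_forest_analysis(X, y, w, n_estimators=200, min_samples_leaf=10)
te = results['treatment_effects']
print(f"ATE: {te['ate']:.3f} (naive SE: {te['cate_se']:.3f})")

Note that, as the in-file comments state, this is a simplified two-forest implementation rather than a full honest causal forest: predict() returns the outcome prediction as a placeholder 'treatment_effect', and only estimate_treatment_effect() computes the AIPW-style CATE/ATE used above.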
econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py (new file)

@@ -0,0 +1,268 @@
+"""
+Double Machine Learning implementation for causal inference
+"""
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.metrics import mean_squared_error
+from typing import Union, Optional, Dict, Any, Tuple
+from scipy import stats
+
+
+class DoubleML:
+    """
+    Double Machine Learning for causal inference with treatment effects
+    """
+
+    def __init__(self, learner_g: Any = None, learner_m: Any = None,
+                 treatment_type: str = 'continuous', n_folds: int = 5,
+                 random_state: int = 42):
+        """
+        Initialize Double Machine Learning model
+
+        Parameters:
+        -----------
+        learner_g : sklearn estimator, optional
+            Estimator for the outcome regression (g)
+            Default: RandomForestRegressor for continuous, RandomForestClassifier for binary
+        learner_m : sklearn estimator, optional
+            Estimator for the treatment regression (m)
+            Default: RandomForestRegressor for continuous, RandomForestClassifier for binary
+        treatment_type : str, 'continuous' or 'binary'
+            Type of treatment variable
+        n_folds : int
+            Number of cross-fitting folds
+        random_state : int
+            Random state for reproducibility
+        """
+        self.learner_g = learner_g
+        self.learner_m = learner_m
+        self.treatment_type = treatment_type
+        self.n_folds = n_folds
+        self.random_state = random_state
+
+        # Set default learners if not provided
+        if self.learner_g is None:
+            if treatment_type == 'continuous':
+                self.learner_g = RandomForestRegressor(n_estimators=100, random_state=random_state)
+            else:
+                self.learner_g = RandomForestClassifier(n_estimators=100, random_state=random_state)
+
+        if self.learner_m is None:
+            if treatment_type == 'continuous':
+                self.learner_m = RandomForestRegressor(n_estimators=100, random_state=random_state)
+            else:
+                self.learner_m = RandomForestClassifier(n_estimators=100, random_state=random_state)
+
+        # Store results
+        self.effect = None
+        self.se = None
+        self.ci = None
+        self.pval = None
+
+    def fit(self, X: Union[np.ndarray, pd.DataFrame],
+            y: Union[np.ndarray, pd.Series],
+            d: Union[np.ndarray, pd.Series]) -> 'DoubleML':
+        """
+        Fit the Double Machine Learning model
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Covariates
+        y : array-like of shape (n_samples,)
+            Outcome variable
+        d : array-like of shape (n_samples,)
+            Treatment variable
+
+        Returns:
+        --------
+        self : DoubleML
+        """
+        # Convert to numpy arrays if needed
+        X = np.asarray(X)
+        y = np.asarray(y)
+        d = np.asarray(d)
+
+        n_samples = X.shape[0]
+
+        # Initialize arrays to store residuals
+        y_res = np.zeros(n_samples)
+        d_res = np.zeros(n_samples)
+
+        # Create folds for cross-fitting
+        np.random.seed(self.random_state)
+        indices = np.random.permutation(n_samples)
+        fold_size = n_samples // self.n_folds
+        folds = [indices[i*fold_size:(i+1)*fold_size] for i in range(self.n_folds)]
+        # Add remaining samples to the last fold
+        if n_samples % self.n_folds != 0:
+            folds[-1] = np.concatenate([folds[-1], indices[self.n_folds*fold_size:]])
+
+        # Cross-fitting
+        for fold_idx, test_idx in enumerate(folds):
+            # Training indices (all except test fold)
+            train_idx = np.concatenate([folds[i] for i in range(self.n_folds) if i != fold_idx])
+
+            # Split data
+            X_train, X_test = X[train_idx], X[test_idx]
+            y_train, y_test = y[train_idx], y[test_idx]
+            d_train, d_test = d[train_idx], d[test_idx]
+
+            # Fit outcome regression and get residuals
+            self.learner_g.fit(X_train, y_train)
+            if self.treatment_type == 'continuous':
+                y_pred = self.learner_g.predict(X_test)
+            else:
+                y_pred = self.learner_g.predict_proba(X_test)[:, 1]
+            y_res[test_idx] = y_test - y_pred
+
+            # Fit treatment regression and get residuals
+            self.learner_m.fit(X_train, d_train)
+            if self.treatment_type == 'continuous':
+                d_pred = self.learner_m.predict(X_test)
+            else:
+                d_pred = self.learner_m.predict_proba(X_test)[:, 1]
+            d_res[test_idx] = d_test - d_pred
+
+        # Estimate treatment effect using partially linear regression
+        # theta = E[d_res * y_res] / E[d_res^2]
+        numerator = np.mean(d_res * y_res)
+        denominator = np.mean(d_res**2)
+
+        self.effect = numerator / denominator
+
+        # Calculate standard error
+        # Using the formula for the variance of the DML estimator
+        residuals = y_res - self.effect * d_res
+        variance = np.mean(residuals**2) / np.mean(d_res**2)**2 / n_samples
+        self.se = np.sqrt(variance)
+
+        # Calculate 95% confidence interval
+        crit_val = 1.96  # 95% CI
+        self.ci = (self.effect - crit_val * self.se,
+                   self.effect + crit_val * self.se)
+
+        # Calculate p-value (two-sided test)
+        z_score = self.effect / self.se
+        # Use scipy.stats.norm for calculating p-value
+        self.pval = 2 * (1 - stats.norm.cdf(np.abs(z_score)))
+
+        return self
+
+    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+        """
+        Predict treatment effects (constant for this implementation)
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Samples (not used, treatment effect is constant)
+
+        Returns:
+        --------
+        effects : ndarray of shape (n_samples,)
+            Estimated treatment effects
+        """
+        return np.full(X.shape[0], self.effect) if hasattr(X, 'shape') else np.full(len(X), self.effect)
+
+    def get_effect(self) -> float:
+        """
+        Get the estimated treatment effect
+
+        Returns:
+        --------
+        effect : float
+            Estimated treatment effect
+        """
+        return self.effect
+
+    def get_se(self) -> float:
+        """
+        Get the standard error of the treatment effect
+
+        Returns:
+        --------
+        se : float
+            Standard error of the treatment effect
+        """
+        return self.se
+
+    def get_ci(self) -> Tuple[float, float]:
+        """
+        Get the 95% confidence interval for the treatment effect
+
+        Returns:
+        --------
+        ci : tuple
+            95% confidence interval (lower, upper)
+        """
+        return self.ci
+
+    def get_pval(self) -> float:
+        """
+        Get the p-value for the treatment effect
+
+        Returns:
+        --------
+        pval : float
+            P-value for the treatment effect
+        """
+        return self.pval
+
+
+def double_ml_analysis(X: Union[np.ndarray, pd.DataFrame],
+                       y: Union[np.ndarray, pd.Series],
+                       d: Union[np.ndarray, pd.Series],
+                       treatment_type: str = 'continuous',
+                       n_folds: int = 5,
+                       random_state: int = 42) -> dict:
+    """
+    Perform complete Double Machine Learning analysis
+
+    Parameters:
+    -----------
+    X : array-like of shape (n_samples, n_features)
+        Covariates
+    y : array-like of shape (n_samples,)
+        Outcome variable
+    d : array-like of shape (n_samples,)
+        Treatment variable
+    treatment_type : str, 'continuous' or 'binary'
+        Type of treatment variable
+    n_folds : int
+        Number of cross-fitting folds
+    random_state : int
+        Random state for reproducibility
+
+    Returns:
+    --------
+    results : dict
+        Dictionary with model and estimation results
+    """
+    # Initialize and fit model
+    dml_model = DoubleML(
+        treatment_type=treatment_type,
+        n_folds=n_folds,
+        random_state=random_state
+    )
+    dml_model.fit(X, y, d)
+
+    # Get results
+    effect = dml_model.get_effect()
+    se = dml_model.get_se()
+    ci = dml_model.get_ci()
+    pval = dml_model.get_pval()
+
+    return {
+        'model': dml_model,
+        'effect': effect,
+        'se': se,
+        'ci': ci,
+        'pval': pval,
+        'X': X,
+        'y': y,
+        'd': d
+    }
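Similarly, a minimal sketch of the DoubleML estimator under a partially linear model y = theta*d + g(X) + noise, where theta is 0.5 by construction; the data and parameter choices are illustrative, while double_ml_analysis and the result keys come from the file above. Cross-fitting keeps every nuisance prediction out-of-fold, which is what lets the orthogonalized estimator theta_hat = mean(d_res * y_res) / mean(d_res**2) (per the comment in fit()) tolerate flexible ML learners for g and m.

import numpy as np
from econometrics.advanced_methods.modern_computing_machine_learning.double_ml import double_ml_analysis

# Illustrative partially linear data: continuous treatment confounded through X[:, 0]
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
d = X[:, 0] + rng.normal(size=1000)
y = 0.5 * d + np.sin(X[:, 0]) + X[:, 1] ** 2 + rng.normal(size=1000)

res = double_ml_analysis(X, y, d, treatment_type='continuous', n_folds=5)
print(f"theta_hat = {res['effect']:.3f}, 95% CI = ({res['ci'][0]:.3f}, {res['ci'][1]:.3f}), p = {res['pval']:.4f}")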