aigroup-econ-mcp 1.4.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- PKG-INFO +344 -322
- README.md +335 -320
- __init__.py +1 -1
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- cli.py +4 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics//321/206/320/254/320/272/321/205/342/225/235/320/220/321/205/320/237/320/241/321/205/320/264/320/267/321/207/342/226/222/342/225/227/321/204/342/225/235/320/250/321/205/320/225/320/230/321/207/342/225/221/320/267/321/205/320/230/320/226/321/206/320/256/320/240.md +544 -0
- pyproject.toml +9 -2
- server.py +15 -1
- tools/__init__.py +75 -1
- tools/causal_inference_adapter.py +658 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -1
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tools_registry.py +13 -3
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/nonparametric_adapter.py +190 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- aigroup_econ_mcp-1.4.3.dist-info/METADATA +0 -710
- aigroup_econ_mcp-1.4.3.dist-info/RECORD +0 -92
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/licenses/LICENSE +0 -0
econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py (new file)

@@ -0,0 +1,268 @@

```python
"""
Double Machine Learning implementation for causal inference
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from typing import Union, Optional, Dict, Any, Tuple
from scipy import stats


class DoubleML:
    """
    Double Machine Learning for causal inference with treatment effects
    """

    def __init__(self, learner_g: Any = None, learner_m: Any = None,
                 treatment_type: str = 'continuous', n_folds: int = 5,
                 random_state: int = 42):
        """
        Initialize Double Machine Learning model

        Parameters:
        -----------
        learner_g : sklearn estimator, optional
            Estimator for the outcome regression (g)
            Default: RandomForestRegressor for continuous, RandomForestClassifier for binary
        learner_m : sklearn estimator, optional
            Estimator for the treatment regression (m)
            Default: RandomForestRegressor for continuous, RandomForestClassifier for binary
        treatment_type : str, 'continuous' or 'binary'
            Type of treatment variable
        n_folds : int
            Number of cross-fitting folds
        random_state : int
            Random state for reproducibility
        """
        self.learner_g = learner_g
        self.learner_m = learner_m
        self.treatment_type = treatment_type
        self.n_folds = n_folds
        self.random_state = random_state

        # Set default learners if not provided
        if self.learner_g is None:
            if treatment_type == 'continuous':
                self.learner_g = RandomForestRegressor(n_estimators=100, random_state=random_state)
            else:
                self.learner_g = RandomForestClassifier(n_estimators=100, random_state=random_state)

        if self.learner_m is None:
            if treatment_type == 'continuous':
                self.learner_m = RandomForestRegressor(n_estimators=100, random_state=random_state)
            else:
                self.learner_m = RandomForestClassifier(n_estimators=100, random_state=random_state)

        # Store results
        self.effect = None
        self.se = None
        self.ci = None
        self.pval = None

    def fit(self, X: Union[np.ndarray, pd.DataFrame],
            y: Union[np.ndarray, pd.Series],
            d: Union[np.ndarray, pd.Series]) -> 'DoubleML':
        """
        Fit the Double Machine Learning model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Covariates
        y : array-like of shape (n_samples,)
            Outcome variable
        d : array-like of shape (n_samples,)
            Treatment variable

        Returns:
        --------
        self : DoubleML
        """
        # Convert to numpy arrays if needed
        X = np.asarray(X)
        y = np.asarray(y)
        d = np.asarray(d)

        n_samples = X.shape[0]

        # Initialize arrays to store residuals
        y_res = np.zeros(n_samples)
        d_res = np.zeros(n_samples)

        # Create folds for cross-fitting
        np.random.seed(self.random_state)
        indices = np.random.permutation(n_samples)
        fold_size = n_samples // self.n_folds
        folds = [indices[i*fold_size:(i+1)*fold_size] for i in range(self.n_folds)]
        # Add remaining samples to the last fold
        if n_samples % self.n_folds != 0:
            folds[-1] = np.concatenate([folds[-1], indices[self.n_folds*fold_size:]])

        # Cross-fitting
        for fold_idx, test_idx in enumerate(folds):
            # Training indices (all except test fold)
            train_idx = np.concatenate([folds[i] for i in range(self.n_folds) if i != fold_idx])

            # Split data
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            d_train, d_test = d[train_idx], d[test_idx]

            # Fit outcome regression and get residuals
            self.learner_g.fit(X_train, y_train)
            if self.treatment_type == 'continuous':
                y_pred = self.learner_g.predict(X_test)
            else:
                y_pred = self.learner_g.predict_proba(X_test)[:, 1]
            y_res[test_idx] = y_test - y_pred

            # Fit treatment regression and get residuals
            self.learner_m.fit(X_train, d_train)
            if self.treatment_type == 'continuous':
                d_pred = self.learner_m.predict(X_test)
            else:
                d_pred = self.learner_m.predict_proba(X_test)[:, 1]
            d_res[test_idx] = d_test - d_pred

        # Estimate treatment effect using partially linear regression
        # theta = E[d_res * y_res] / E[d_res^2]
        numerator = np.mean(d_res * y_res)
        denominator = np.mean(d_res**2)

        self.effect = numerator / denominator

        # Calculate standard error
        # Using the formula for the variance of the DML estimator
        residuals = y_res - self.effect * d_res
        variance = np.mean(residuals**2) / np.mean(d_res**2)**2 / n_samples
        self.se = np.sqrt(variance)

        # Calculate 95% confidence interval
        crit_val = 1.96  # 95% CI
        self.ci = (self.effect - crit_val * self.se,
                   self.effect + crit_val * self.se)

        # Calculate p-value (two-sided test)
        z_score = self.effect / self.se
        # Use scipy.stats.norm for calculating p-value
        self.pval = 2 * (1 - stats.norm.cdf(np.abs(z_score)))

        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict treatment effects (constant for this implementation)

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples (not used, treatment effect is constant)

        Returns:
        --------
        effects : ndarray of shape (n_samples,)
            Estimated treatment effects
        """
        return np.full(X.shape[0], self.effect) if hasattr(X, 'shape') else np.full(len(X), self.effect)

    def get_effect(self) -> float:
        """
        Get the estimated treatment effect

        Returns:
        --------
        effect : float
            Estimated treatment effect
        """
        return self.effect

    def get_se(self) -> float:
        """
        Get the standard error of the treatment effect

        Returns:
        --------
        se : float
            Standard error of the treatment effect
        """
        return self.se

    def get_ci(self) -> Tuple[float, float]:
        """
        Get the 95% confidence interval for the treatment effect

        Returns:
        --------
        ci : tuple
            95% confidence interval (lower, upper)
        """
        return self.ci

    def get_pval(self) -> float:
        """
        Get the p-value for the treatment effect

        Returns:
        --------
        pval : float
            P-value for the treatment effect
        """
        return self.pval


def double_ml_analysis(X: Union[np.ndarray, pd.DataFrame],
                       y: Union[np.ndarray, pd.Series],
                       d: Union[np.ndarray, pd.Series],
                       treatment_type: str = 'continuous',
                       n_folds: int = 5,
                       random_state: int = 42) -> dict:
    """
    Perform complete Double Machine Learning analysis

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Covariates
    y : array-like of shape (n_samples,)
        Outcome variable
    d : array-like of shape (n_samples,)
        Treatment variable
    treatment_type : str, 'continuous' or 'binary'
        Type of treatment variable
    n_folds : int
        Number of cross-fitting folds
    random_state : int
        Random state for reproducibility

    Returns:
    --------
    results : dict
        Dictionary with model and estimation results
    """
    # Initialize and fit model
    dml_model = DoubleML(
        treatment_type=treatment_type,
        n_folds=n_folds,
        random_state=random_state
    )
    dml_model.fit(X, y, d)

    # Get results
    effect = dml_model.get_effect()
    se = dml_model.get_se()
    ci = dml_model.get_ci()
    pval = dml_model.get_pval()

    return {
        'model': dml_model,
        'effect': effect,
        'se': se,
        'ci': ci,
        'pval': pval,
        'X': X,
        'y': y,
        'd': d
    }
```
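For orientation, the estimator above targets a partially linear model: after cross-fitting, the effect is recovered from the residual-on-residual moment `theta = E[d_res * y_res] / E[d_res**2]`, with a normal-approximation standard error, 95% CI, and two-sided p-value. Below is a minimal usage sketch of the `double_ml_analysis` helper added in this release; the synthetic data and the import path are illustrative assumptions, while the call signature and returned keys match the code above.

```python
import numpy as np

# Import path is an assumption based on the package layout listed above.
from econometrics.advanced_methods.modern_computing_machine_learning.double_ml import double_ml_analysis

rng = np.random.default_rng(0)
n = 500
X = rng.normal(size=(n, 5))                   # covariates
d = X[:, 0] + rng.normal(size=n)              # continuous treatment that depends on X
y = 2.0 * d + X[:, 1] + rng.normal(size=n)    # outcome with a true effect of 2.0

res = double_ml_analysis(X, y, d, treatment_type='continuous', n_folds=5, random_state=42)
print(res['effect'], res['se'])   # point estimate and standard error
print(res['ci'], res['pval'])     # 95% confidence interval and two-sided p-value
```

For a binary treatment, pass `treatment_type='binary'` so the nuisance models use `predict_proba`, as in the `fit` method above.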
econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py (new file)

@@ -0,0 +1,249 @@

```python
"""
Gradient Boosting Machine (GBM/XGBoost) implementation for econometric analysis
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
from typing import Union, Optional, Dict, Any


class EconGradientBoosting:
    """
    Gradient Boosting for econometric analysis with both scikit-learn and XGBoost implementations
    """

    def __init__(self, algorithm: str = 'sklearn', problem_type: str = 'regression',
                 n_estimators: int = 100, learning_rate: float = 0.1,
                 max_depth: int = 3, random_state: int = 42):
        """
        Initialize Gradient Boosting model

        Parameters:
        -----------
        algorithm : str, 'sklearn' or 'xgboost'
            Which implementation to use
        problem_type : str, 'regression' or 'classification'
            Type of problem to solve
        n_estimators : int
            Number of boosting stages
        learning_rate : float
            Learning rate shrinks the contribution of each tree
        max_depth : int
            Maximum depth of the individual regression estimators
        random_state : int
            Random state for reproducibility
        """
        self.algorithm = algorithm
        self.problem_type = problem_type
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state

        if algorithm == 'sklearn':
            if problem_type == 'regression':
                self.model = GradientBoostingRegressor(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    random_state=random_state
                )
            elif problem_type == 'classification':
                self.model = GradientBoostingClassifier(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    random_state=random_state
                )
        elif algorithm == 'xgboost':
            if not XGBOOST_AVAILABLE:
                raise ImportError("XGBoost is not installed. Please install it with 'pip install xgboost'")

            if problem_type == 'regression':
                self.model = xgb.XGBRegressor(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    random_state=random_state
                )
            elif problem_type == 'classification':
                self.model = xgb.XGBClassifier(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    random_state=random_state
                )
        else:
            raise ValueError("algorithm must be either 'sklearn' or 'xgboost'")

    def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> 'EconGradientBoosting':
        """
        Fit the Gradient Boosting model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns:
        --------
        self : EconGradientBoosting
        """
        self.model.fit(X, y)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict using the Gradient Boosting model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_pred : ndarray of shape (n_samples,)
            Predicted values
        """
        return self.model.predict(X)

    def feature_importance(self) -> Dict[str, np.ndarray]:
        """
        Get feature importances

        Returns:
        --------
        importances : dict
            Dictionary with feature importances (depends on algorithm)
        """
        if self.algorithm == 'sklearn':
            return {
                'importances': self.model.feature_importances_
            }
        elif self.algorithm == 'xgboost':
            # XGBoost provides multiple importance types
            importance_types = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
            importances = {}
            for imp_type in importance_types:
                try:
                    importances[imp_type] = self.model.feature_importances_
                except:
                    pass
            return importances

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame],
                 y: Union[np.ndarray, pd.Series]) -> dict:
        """
        Evaluate model performance

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Test data
        y : array-like of shape (n_samples,)
            True values

        Returns:
        --------
        metrics : dict
            Dictionary with evaluation metrics
        """
        y_pred = self.predict(X)

        if self.problem_type == 'regression':
            mse = mean_squared_error(y, y_pred)
            rmse = np.sqrt(mse)
            return {
                'mse': mse,
                'rmse': rmse,
                'predictions': y_pred
            }
        else:
            accuracy = accuracy_score(y, y_pred)
            return {
                'accuracy': accuracy,
                'predictions': y_pred
            }


def gradient_boosting_analysis(X: Union[np.ndarray, pd.DataFrame],
                               y: Union[np.ndarray, pd.Series],
                               algorithm: str = 'sklearn',
                               problem_type: str = 'regression',
                               test_size: float = 0.2,
                               n_estimators: int = 100,
                               learning_rate: float = 0.1,
                               max_depth: int = 3,
                               random_state: int = 42) -> dict:
    """
    Perform complete Gradient Boosting analysis

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    y : array-like of shape (n_samples,)
        Target variable
    algorithm : str, 'sklearn' or 'xgboost'
        Which implementation to use
    problem_type : str, 'regression' or 'classification'
        Type of problem to solve
    test_size : float
        Proportion of dataset to include in test split
    n_estimators : int
        Number of boosting stages
    learning_rate : float
        Learning rate shrinks the contribution of each tree
    max_depth : int
        Maximum depth of the individual regression estimators
    random_state : int
        Random state for reproducibility

    Returns:
    --------
    results : dict
        Dictionary with model, predictions, and feature importances
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize and fit model
    gb_model = EconGradientBoosting(
        algorithm=algorithm,
        problem_type=problem_type,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state
    )
    gb_model.fit(X_train, y_train)

    # Evaluate model
    train_results = gb_model.evaluate(X_train, y_train)
    test_results = gb_model.evaluate(X_test, y_test)

    # Get feature importances
    importances = gb_model.feature_importance()

    return {
        'model': gb_model,
        'train_results': train_results,
        'test_results': test_results,
        'feature_importances': importances,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }
```
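A similarly minimal sketch of the `gradient_boosting_analysis` helper; the regression data and the import path are illustrative assumptions, while the call signature and returned keys match the code above. The `'sklearn'` backend avoids the optional xgboost dependency guarded by `XGBOOST_AVAILABLE`.

```python
import numpy as np

# Import path is an assumption based on the package layout listed above.
from econometrics.advanced_methods.modern_computing_machine_learning.gradient_boosting import gradient_boosting_analysis

rng = np.random.default_rng(0)
X = rng.normal(size=(400, 4))
y = X[:, 0] ** 2 + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=400)   # nonlinear regression target

res = gradient_boosting_analysis(
    X, y,
    algorithm='sklearn',        # pass 'xgboost' only if the optional dependency is installed
    problem_type='regression',
    test_size=0.2,
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)
print(res['test_results']['rmse'])                # held-out root-mean-squared error
print(res['feature_importances']['importances'])  # impurity-based importances from scikit-learn
```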