aigroup-econ-mcp 1.4.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PKG-INFO +344 -322
- README.md +335 -320
- __init__.py +1 -1
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- cli.py +4 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics//321/206/320/254/320/272/321/205/342/225/235/320/220/321/205/320/237/320/241/321/205/320/264/320/267/321/207/342/226/222/342/225/227/321/204/342/225/235/320/250/321/205/320/225/320/230/321/207/342/225/221/320/267/321/205/320/230/320/226/321/206/320/256/320/240.md +544 -0
- pyproject.toml +9 -2
- server.py +15 -1
- tools/__init__.py +75 -1
- tools/causal_inference_adapter.py +658 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -1
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tools_registry.py +13 -3
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/nonparametric_adapter.py +190 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- aigroup_econ_mcp-1.4.3.dist-info/METADATA +0 -710
- aigroup_econ_mcp-1.4.3.dist-info/RECORD +0 -92
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Neural Network implementation for econometric analysis
|
|
3
|
+
"""
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.neural_network import MLPRegressor, MLPClassifier
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from sklearn.metrics import mean_squared_error, accuracy_score
|
|
9
|
+
from sklearn.preprocessing import StandardScaler
|
|
10
|
+
from typing import Union, Optional, List, Tuple
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EconNeuralNetwork:
    """
    Multi-layer perceptron wrapper for econometric regression or classification.

    Features are standardized with a ``StandardScaler`` that is fitted in
    ``fit`` and re-applied by ``predict`` and ``predict_proba``.
    """

    def __init__(self, problem_type: str = 'regression', hidden_layer_sizes: tuple = (100,),
                 activation: str = 'relu', solver: str = 'adam', alpha: float = 0.0001,
                 learning_rate: str = 'constant', learning_rate_init: float = 0.001,
                 max_iter: int = 200, random_state: int = 42):
        """
        Build the scaler and the underlying scikit-learn MLP estimator.

        Parameters:
        -----------
        problem_type : str, 'regression' or 'classification'
            Selects MLPRegressor or MLPClassifier
        hidden_layer_sizes : tuple
            The ith element is the number of neurons in the ith hidden layer
        activation : str, 'identity', 'logistic', 'tanh', 'relu'
            Activation function for the hidden layers
        solver : str, 'lbfgs', 'sgd', 'adam'
            Weight optimization solver
        alpha : float
            L2 penalty (regularization term) parameter
        learning_rate : str, 'constant', 'invscaling', 'adaptive'
            Learning rate schedule for weight updates
        learning_rate_init : float
            Initial learning rate
        max_iter : int
            Maximum number of iterations
        random_state : int
            Random state for reproducibility

        Raises:
        -------
        ValueError
            If problem_type is neither 'regression' nor 'classification'
        """
        self.problem_type = problem_type
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.solver = solver
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.learning_rate_init = learning_rate_init
        self.max_iter = max_iter
        self.random_state = random_state
        self.scaler = StandardScaler()

        # Both estimators take the same hyper-parameters, so only the class differs.
        if problem_type == 'regression':
            estimator_cls = MLPRegressor
        elif problem_type == 'classification':
            estimator_cls = MLPClassifier
        else:
            raise ValueError("problem_type must be either 'regression' or 'classification'")

        self.model = estimator_cls(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            alpha=alpha,
            learning_rate=learning_rate,
            learning_rate_init=learning_rate_init,
            max_iter=max_iter,
            random_state=random_state,
        )

    def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> 'EconNeuralNetwork':
        """
        Fit the scaler on X, then train the network on the scaled features.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns:
        --------
        self : EconNeuralNetwork
        """
        self.model.fit(self.scaler.fit_transform(X), y)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict targets for X after applying the scaler fitted in ``fit``.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_pred : ndarray of shape (n_samples,)
            Predicted values
        """
        standardized = self.scaler.transform(X)
        return self.model.predict(standardized)

    def predict_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict class probabilities for X (classification only).

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_proba : ndarray of shape (n_samples, n_classes)
            Predicted class probabilities

        Raises:
        -------
        ValueError
            If the model was created with a problem_type other than 'classification'
        """
        if self.problem_type != 'classification':
            raise ValueError("predict_proba is only available for classification problems")

        standardized = self.scaler.transform(X)
        return self.model.predict_proba(standardized)

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame],
                 y: Union[np.ndarray, pd.Series]) -> dict:
        """
        Score the model on (X, y).

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Test data
        y : array-like of shape (n_samples,)
            True values

        Returns:
        --------
        metrics : dict
            'mse', 'rmse' and 'predictions' for regression;
            'accuracy' and 'predictions' otherwise
        """
        predicted = self.predict(X)

        if self.problem_type != 'regression':
            return {
                'accuracy': accuracy_score(y, predicted),
                'predictions': predicted,
            }

        squared_error = mean_squared_error(y, predicted)
        return {
            'mse': squared_error,
            'rmse': np.sqrt(squared_error),
            'predictions': predicted,
        }
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def neural_network_analysis(X: Union[np.ndarray, pd.DataFrame],
                            y: Union[np.ndarray, pd.Series],
                            problem_type: str = 'regression',
                            hidden_layer_sizes: tuple = (100,),
                            activation: str = 'relu',
                            solver: str = 'adam',
                            test_size: float = 0.2,
                            alpha: float = 0.0001,
                            learning_rate: str = 'constant',
                            learning_rate_init: float = 0.001,
                            max_iter: int = 200,
                            random_state: int = 42) -> dict:
    """
    Split the data, train an EconNeuralNetwork and evaluate it on both splits.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    y : array-like of shape (n_samples,)
        Target variable
    problem_type : str, 'regression' or 'classification'
        Type of problem to solve
    hidden_layer_sizes : tuple
        The ith element is the number of neurons in the ith hidden layer
    activation : str, 'identity', 'logistic', 'tanh', 'relu'
        Activation function for the hidden layers
    solver : str, 'lbfgs', 'sgd', 'adam'
        Weight optimization solver
    test_size : float
        Proportion of the dataset held out as the test split
    alpha : float
        L2 penalty (regularization term) parameter
    learning_rate : str, 'constant', 'invscaling', 'adaptive'
        Learning rate schedule for weight updates
    learning_rate_init : float
        Initial learning rate
    max_iter : int
        Maximum number of iterations
    random_state : int
        Seed used for both the train/test split and the model

    Returns:
    --------
    results : dict
        Keys 'model', 'train_results', 'test_results', 'train_proba',
        'test_proba', 'X_train', 'X_test', 'y_train', 'y_test'.
        The two '*_proba' entries are None unless problem_type is 'classification'.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    network = EconNeuralNetwork(
        problem_type=problem_type,
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        learning_rate_init=learning_rate_init,
        max_iter=max_iter,
        random_state=random_state,
    )
    network.fit(X_train, y_train)

    results = {
        'model': network,
        'train_results': network.evaluate(X_train, y_train),
        'test_results': network.evaluate(X_test, y_test),
        'train_proba': None,
        'test_proba': None,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

    # Class probabilities only exist for classifiers.
    if problem_type == 'classification':
        results['train_proba'] = network.predict_proba(X_train)
        results['test_proba'] = network.predict_proba(X_test)

    return results
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Random Forest implementation for econometric analysis
|
|
3
|
+
"""
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from sklearn.metrics import mean_squared_error, accuracy_score
|
|
9
|
+
from typing import Union, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EconRandomForest:
    """
    Random forest wrapper for econometric regression or classification.
    """

    def __init__(self, problem_type: str = 'regression', n_estimators: int = 100,
                 max_depth: Optional[int] = None, random_state: int = 42):
        """
        Build the underlying scikit-learn random forest estimator.

        Parameters:
        -----------
        problem_type : str, 'regression' or 'classification'
            Selects RandomForestRegressor or RandomForestClassifier
        n_estimators : int
            Number of trees in the forest
        max_depth : int, optional
            Maximum depth of each tree
        random_state : int
            Random state for reproducibility

        Raises:
        -------
        ValueError
            If problem_type is neither 'regression' nor 'classification'
        """
        self.problem_type = problem_type
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state

        # Both estimators take the same hyper-parameters, so only the class differs.
        if problem_type == 'regression':
            forest_cls = RandomForestRegressor
        elif problem_type == 'classification':
            forest_cls = RandomForestClassifier
        else:
            raise ValueError("problem_type must be either 'regression' or 'classification'")

        self.model = forest_cls(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )

    def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> 'EconRandomForest':
        """
        Train the forest on (X, y).

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns:
        --------
        self : EconRandomForest
        """
        self.model.fit(X, y)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict targets for X with the fitted forest.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_pred : ndarray of shape (n_samples,)
            Predicted values
        """
        return self.model.predict(X)

    def feature_importance(self) -> np.ndarray:
        """
        Return the fitted estimator's ``feature_importances_``.

        Returns:
        --------
        importances : ndarray of shape (n_features,)
            Feature importances
        """
        return self.model.feature_importances_

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame],
                 y: Union[np.ndarray, pd.Series]) -> dict:
        """
        Score the model on (X, y).

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Test data
        y : array-like of shape (n_samples,)
            True values

        Returns:
        --------
        metrics : dict
            'mse', 'rmse' and 'predictions' for regression;
            'accuracy' and 'predictions' otherwise
        """
        predicted = self.predict(X)

        if self.problem_type != 'regression':
            return {
                'accuracy': accuracy_score(y, predicted),
                'predictions': predicted,
            }

        squared_error = mean_squared_error(y, predicted)
        return {
            'mse': squared_error,
            'rmse': np.sqrt(squared_error),
            'predictions': predicted,
        }
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def random_forest_analysis(X: Union[np.ndarray, pd.DataFrame],
                           y: Union[np.ndarray, pd.Series],
                           problem_type: str = 'regression',
                           test_size: float = 0.2,
                           n_estimators: int = 100,
                           max_depth: Optional[int] = None,
                           random_state: int = 42) -> dict:
    """
    Split the data, train an EconRandomForest and evaluate it on both splits.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    y : array-like of shape (n_samples,)
        Target variable
    problem_type : str, 'regression' or 'classification'
        Type of problem to solve
    test_size : float
        Proportion of the dataset held out as the test split
    n_estimators : int
        Number of trees in the forest
    max_depth : int, optional
        Maximum depth of each tree
    random_state : int
        Seed used for both the train/test split and the model

    Returns:
    --------
    results : dict
        Keys 'model', 'train_results', 'test_results', 'feature_importances',
        'X_train', 'X_test', 'y_train', 'y_test'
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    forest = EconRandomForest(
        problem_type=problem_type,
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    forest.fit(X_train, y_train)

    # Evaluate on the training split first, then on the held-out split.
    train_metrics = forest.evaluate(X_train, y_train)
    test_metrics = forest.evaluate(X_test, y_test)

    return {
        'model': forest,
        'train_results': train_metrics,
        'test_results': test_metrics,
        'feature_importances': forest.feature_importance(),
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Support Vector Machine (SVM) implementation for econometric analysis
|
|
3
|
+
"""
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.svm import SVR, SVC
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from sklearn.metrics import mean_squared_error, accuracy_score
|
|
9
|
+
from sklearn.preprocessing import StandardScaler
|
|
10
|
+
from typing import Union, Optional, Dict, Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EconSVM:
    """
    Support Vector Machine wrapper for econometric regression or classification.

    Features are standardized with a ``StandardScaler`` that is fitted in
    ``fit`` and re-applied by ``predict`` and ``predict_proba``.
    """

    def __init__(self, problem_type: str = 'regression', kernel: str = 'rbf',
                 C: float = 1.0, gamma: Union[str, float] = 'scale', random_state: int = 42):
        """
        Build the scaler and the underlying scikit-learn SVM estimator.

        Parameters:
        -----------
        problem_type : str, 'regression' or 'classification'
            Selects SVR or SVC
        kernel : str, 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
            Kernel type used by the algorithm
        C : float
            Regularization parameter
        gamma : str or float, 'scale' or 'auto' or float
            Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        random_state : int
            Random state for reproducibility; forwarded to SVC only,
            the SVR regressor is built without it

        Raises:
        -------
        ValueError
            If problem_type is neither 'regression' nor 'classification'
        """
        self.problem_type = problem_type
        self.kernel = kernel
        self.C = C
        self.gamma = gamma
        self.random_state = random_state

        # Reject an unsupported problem type before building any estimator.
        if problem_type not in ('regression', 'classification'):
            raise ValueError("problem_type must be either 'regression' or 'classification'")

        self.scaler = StandardScaler()

        if problem_type == 'regression':
            self.model = SVR(
                kernel=kernel,
                C=C,
                gamma=gamma
            )
        else:
            # probability=True is required so that predict_proba is available.
            self.model = SVC(
                kernel=kernel,
                C=C,
                gamma=gamma,
                random_state=random_state,
                probability=True
            )

    def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> 'EconSVM':
        """
        Fit the scaler on X, then train the SVM on the scaled features.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns:
        --------
        self : EconSVM
        """
        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict targets for X after applying the scaler fitted in ``fit``.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_pred : ndarray of shape (n_samples,)
            Predicted values
        """
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict class probabilities for X (classification only).

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_proba : ndarray of shape (n_samples, n_classes)
            Predicted class probabilities

        Raises:
        -------
        ValueError
            If the model was created with a problem_type other than 'classification'
        """
        if self.problem_type != 'classification':
            raise ValueError("predict_proba is only available for classification problems")

        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame],
                 y: Union[np.ndarray, pd.Series]) -> dict:
        """
        Score the model on (X, y).

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Test data
        y : array-like of shape (n_samples,)
            True values

        Returns:
        --------
        metrics : dict
            'mse', 'rmse' and 'predictions' for regression;
            'accuracy' and 'predictions' otherwise
        """
        y_pred = self.predict(X)

        if self.problem_type == 'regression':
            mse = mean_squared_error(y, y_pred)
            return {
                'mse': mse,
                'rmse': np.sqrt(mse),
                'predictions': y_pred
            }

        return {
            'accuracy': accuracy_score(y, y_pred),
            'predictions': y_pred
        }
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def svm_analysis(X: Union[np.ndarray, pd.DataFrame],
                 y: Union[np.ndarray, pd.Series],
                 problem_type: str = 'regression',
                 kernel: str = 'rbf',
                 test_size: float = 0.2,
                 C: float = 1.0,
                 gamma: Union[str, float] = 'scale',
                 random_state: int = 42) -> dict:
    """
    Split the data, train an EconSVM and evaluate it on both splits.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    y : array-like of shape (n_samples,)
        Target variable
    problem_type : str, 'regression' or 'classification'
        Type of problem to solve
    kernel : str, 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
        Kernel type used by the algorithm
    test_size : float
        Proportion of the dataset held out as the test split
    C : float
        Regularization parameter
    gamma : str or float, 'scale' or 'auto' or float
        Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    random_state : int
        Seed used for the train/test split and passed on to EconSVM

    Returns:
    --------
    results : dict
        Keys 'model', 'train_results', 'test_results', 'train_proba',
        'test_proba', 'X_train', 'X_test', 'y_train', 'y_test'.
        The two '*_proba' entries are None unless problem_type is 'classification'.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize and fit model
    svm_model = EconSVM(
        problem_type=problem_type,
        kernel=kernel,
        C=C,
        gamma=gamma,
        random_state=random_state
    )
    svm_model.fit(X_train, y_train)

    # Evaluate on the training split first, then on the held-out split
    train_results = svm_model.evaluate(X_train, y_train)
    test_results = svm_model.evaluate(X_test, y_test)

    # Class probabilities only exist for classifiers
    train_proba = None
    test_proba = None
    if problem_type == 'classification':
        train_proba = svm_model.predict_proba(X_train)
        test_proba = svm_model.predict_proba(X_test)

    return {
        'model': svm_model,
        'train_results': train_results,
        'test_results': test_results,
        'train_proba': train_proba,
        'test_proba': test_proba,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }
|