aigroup-econ-mcp 1.3.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .gitignore +253 -0
- PKG-INFO +732 -0
- README.md +687 -0
- __init__.py +14 -0
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- aigroup_econ_mcp-2.0.1.dist-info/entry_points.txt +2 -0
- aigroup_econ_mcp-2.0.1.dist-info/licenses/LICENSE +21 -0
- cli.py +32 -0
- econometrics/README.md +18 -0
- econometrics/__init__.py +191 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/basic_parametric_estimation/__init__.py +31 -0
- econometrics/basic_parametric_estimation/gmm/__init__.py +13 -0
- econometrics/basic_parametric_estimation/gmm/gmm_model.py +256 -0
- econometrics/basic_parametric_estimation/mle/__init__.py +13 -0
- econometrics/basic_parametric_estimation/mle/mle_model.py +241 -0
- econometrics/basic_parametric_estimation/ols/__init__.py +13 -0
- econometrics/basic_parametric_estimation/ols/ols_model.py +141 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/missing_data/missing_data_measurement_error/__init__.py +0 -0
- econometrics/model_specification_diagnostics_robust_inference/README.md +173 -0
- econometrics/model_specification_diagnostics_robust_inference/__init__.py +78 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/__init__.py +20 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/diagnostic_tests_model.py +149 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/gls_model.py +130 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/__init__.py +18 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/model_selection_model.py +286 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/regularization_model.py +177 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/robust_errors_model.py +122 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/simultaneous_equations_model.py +246 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/wls_model.py +127 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/nonparametric_semiparametric_methods/__init__.py +0 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_econometrics_new/__init__.py +0 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/specific_data_modeling/survival_duration_data/__init__.py +0 -0
- econometrics/specific_data_modeling/time_series_panel_data/__init__.py +143 -0
- econometrics/specific_data_modeling/time_series_panel_data/arima_model.py +104 -0
- econometrics/specific_data_modeling/time_series_panel_data/cointegration_vecm.py +334 -0
- econometrics/specific_data_modeling/time_series_panel_data/dynamic_panel_models.py +653 -0
- econometrics/specific_data_modeling/time_series_panel_data/exponential_smoothing.py +176 -0
- econometrics/specific_data_modeling/time_series_panel_data/garch_model.py +198 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_diagnostics.py +125 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_var.py +60 -0
- econometrics/specific_data_modeling/time_series_panel_data/structural_break_tests.py +87 -0
- econometrics/specific_data_modeling/time_series_panel_data/time_varying_parameter_models.py +106 -0
- econometrics/specific_data_modeling/time_series_panel_data/unit_root_tests.py +204 -0
- econometrics/specific_data_modeling/time_series_panel_data/var_svar_model.py +372 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/statistical_inference/statistical_inference_techniques/__init__.py +0 -0
- econometrics/statistics/distribution_decomposition_methods/__init__.py +0 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/basic_parametric_estimation_tests/__init__.py +3 -0
- econometrics/tests/basic_parametric_estimation_tests/test_gmm.py +128 -0
- econometrics/tests/basic_parametric_estimation_tests/test_mle.py +127 -0
- econometrics/tests/basic_parametric_estimation_tests/test_ols.py +100 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/model_specification_diagnostics_tests/__init__.py +3 -0
- econometrics/tests/model_specification_diagnostics_tests/test_diagnostic_tests.py +86 -0
- econometrics/tests/model_specification_diagnostics_tests/test_robust_errors.py +89 -0
- econometrics/tests/specific_data_modeling_tests/__init__.py +3 -0
- econometrics/tests/specific_data_modeling_tests/test_arima.py +98 -0
- econometrics/tests/specific_data_modeling_tests/test_dynamic_panel.py +198 -0
- econometrics/tests/specific_data_modeling_tests/test_exponential_smoothing.py +105 -0
- econometrics/tests/specific_data_modeling_tests/test_garch.py +118 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics/tests/specific_data_modeling_tests/test_unit_root.py +156 -0
- econometrics/tests/specific_data_modeling_tests/test_var.py +124 -0
- econometrics/未开发大类优先级分析.md +544 -0
- prompts/__init__.py +0 -0
- prompts/analysis_guides.py +43 -0
- pyproject.toml +85 -0
- resources/MCP_MASTER_GUIDE.md +422 -0
- resources/MCP_TOOLS_DATA_FORMAT_GUIDE.md +185 -0
- resources/__init__.py +0 -0
- server.py +97 -0
- tools/README.md +88 -0
- tools/__init__.py +119 -0
- tools/causal_inference_adapter.py +658 -0
- tools/data_loader.py +213 -0
- tools/decorators.py +38 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/econometrics_adapter.py +286 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -0
- tools/mcp_tool_groups/basic_parametric_tools.py +173 -0
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/model_specification_tools.py +402 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tool_groups/time_series_tools.py +494 -0
- tools/mcp_tools_registry.py +124 -0
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/model_specification_adapter.py +369 -0
- tools/nonparametric_adapter.py +190 -0
- tools/output_formatter.py +563 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- tools/time_series_panel_data_adapter.py +858 -0
- tools/time_series_panel_data_tools.py +65 -0
- aigroup_econ_mcp/__init__.py +0 -19
- aigroup_econ_mcp/cli.py +0 -82
- aigroup_econ_mcp/config.py +0 -561
- aigroup_econ_mcp/server.py +0 -452
- aigroup_econ_mcp/tools/__init__.py +0 -19
- aigroup_econ_mcp/tools/base.py +0 -470
- aigroup_econ_mcp/tools/cache.py +0 -533
- aigroup_econ_mcp/tools/data_loader.py +0 -195
- aigroup_econ_mcp/tools/file_parser.py +0 -1027
- aigroup_econ_mcp/tools/machine_learning.py +0 -60
- aigroup_econ_mcp/tools/ml_ensemble.py +0 -210
- aigroup_econ_mcp/tools/ml_evaluation.py +0 -272
- aigroup_econ_mcp/tools/ml_models.py +0 -54
- aigroup_econ_mcp/tools/ml_regularization.py +0 -186
- aigroup_econ_mcp/tools/monitoring.py +0 -555
- aigroup_econ_mcp/tools/optimized_example.py +0 -229
- aigroup_econ_mcp/tools/panel_data.py +0 -619
- aigroup_econ_mcp/tools/regression.py +0 -214
- aigroup_econ_mcp/tools/statistics.py +0 -154
- aigroup_econ_mcp/tools/time_series.py +0 -698
- aigroup_econ_mcp/tools/timeout.py +0 -283
- aigroup_econ_mcp/tools/tool_descriptions.py +0 -410
- aigroup_econ_mcp/tools/tool_handlers.py +0 -1016
- aigroup_econ_mcp/tools/tool_registry.py +0 -478
- aigroup_econ_mcp/tools/validation.py +0 -482
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +0 -525
- aigroup_econ_mcp-1.3.3.dist-info/RECORD +0 -30
- aigroup_econ_mcp-1.3.3.dist-info/entry_points.txt +0 -2
- /aigroup_econ_mcp-1.3.3.dist-info/licenses/LICENSE → /LICENSE +0 -0
- {aigroup_econ_mcp-1.3.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gradient Boosting Machine (GBM/XGBoost) implementation for econometric analysis
|
|
3
|
+
"""
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from sklearn.metrics import mean_squared_error, accuracy_score
|
|
9
|
+
try:
|
|
10
|
+
import xgboost as xgb
|
|
11
|
+
XGBOOST_AVAILABLE = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
XGBOOST_AVAILABLE = False
|
|
14
|
+
from typing import Union, Optional, Dict, Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class EconGradientBoosting:
    """
    Gradient Boosting for econometric analysis with both scikit-learn and XGBoost implementations

    The class wraps a single estimator, stored in ``self.model``, that is chosen
    from the ``algorithm`` / ``problem_type`` combination given at construction.
    """

    def __init__(self, algorithm: str = 'sklearn', problem_type: str = 'regression',
                 n_estimators: int = 100, learning_rate: float = 0.1,
                 max_depth: int = 3, random_state: int = 42):
        """
        Initialize Gradient Boosting model

        Parameters:
        -----------
        algorithm : str, 'sklearn' or 'xgboost'
            Which implementation to use
        problem_type : str, 'regression' or 'classification'
            Type of problem to solve
        n_estimators : int
            Number of boosting stages
        learning_rate : float
            Learning rate shrinks the contribution of each tree
        max_depth : int
            Maximum depth of the individual regression estimators
        random_state : int
            Random state for reproducibility

        Raises:
        -------
        ValueError
            If ``algorithm`` or ``problem_type`` is not one of the supported values
        ImportError
            If ``algorithm`` is 'xgboost' but the xgboost package could not be imported
        """
        # Validate up front so that an unsupported option fails here, with a
        # clear message, instead of leaving ``self.model`` undefined.
        if algorithm not in ('sklearn', 'xgboost'):
            raise ValueError("algorithm must be either 'sklearn' or 'xgboost'")
        if problem_type not in ('regression', 'classification'):
            raise ValueError("problem_type must be either 'regression' or 'classification'")
        if algorithm == 'xgboost' and not XGBOOST_AVAILABLE:
            raise ImportError("XGBoost is not installed. Please install it with 'pip install xgboost'")

        self.algorithm = algorithm
        self.problem_type = problem_type
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state

        is_regression = problem_type == 'regression'
        if algorithm == 'sklearn':
            estimator_cls = GradientBoostingRegressor if is_regression else GradientBoostingClassifier
        else:
            estimator_cls = xgb.XGBRegressor if is_regression else xgb.XGBClassifier

        # All four estimators are configured with the same four hyper-parameters.
        self.model = estimator_cls(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=random_state
        )

    def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> 'EconGradientBoosting':
        """
        Fit the Gradient Boosting model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns:
        --------
        self : EconGradientBoosting
        """
        self.model.fit(X, y)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict using the Gradient Boosting model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_pred : ndarray of shape (n_samples,)
            Predicted values
        """
        return self.model.predict(X)

    def feature_importance(self) -> Dict[str, np.ndarray]:
        """
        Get feature importances

        Returns:
        --------
        importances : dict
            For 'sklearn': ``{'importances': <feature_importances_ of the model>}``.
            For 'xgboost': one entry per importance type ('weight', 'gain',
            'cover', 'total_gain', 'total_cover'). Each value is an array with
            one score per feature, in the booster's feature order, scaled to sum
            to 1 (left unscaled when all scores are zero). Importance types the
            booster rejects are omitted.

        Raises:
        -------
        ValueError
            If ``self.algorithm`` is neither 'sklearn' nor 'xgboost'

        Errors raised by ``get_booster()`` (presumably when the XGBoost model
        has not been fitted -- confirm against the xgboost docs) propagate to
        the caller.
        """
        if self.algorithm == 'sklearn':
            return {
                'importances': self.model.feature_importances_
            }

        if self.algorithm == 'xgboost':
            booster = self.model.get_booster()
            feature_names = booster.feature_names
            if feature_names is None:
                # Without explicit names the booster reports scores under
                # 'f0', 'f1', ...
                feature_names = [f'f{i}' for i in range(booster.num_features())]

            importances = {}
            for imp_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
                try:
                    scores = booster.get_score(importance_type=imp_type)
                except ValueError:
                    # Best effort: skip importance types this booster does
                    # not support instead of failing the whole call.
                    continue
                # get_score() leaves out features that were never used in a
                # split; report those as 0.
                values = np.array([scores.get(name, 0.0) for name in feature_names], dtype=float)
                total = values.sum()
                importances[imp_type] = values / total if total > 0 else values
            return importances

        raise ValueError("algorithm must be either 'sklearn' or 'xgboost'")

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame],
                 y: Union[np.ndarray, pd.Series]) -> dict:
        """
        Evaluate model performance

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Test data
        y : array-like of shape (n_samples,)
            True values

        Returns:
        --------
        metrics : dict
            Dictionary with evaluation metrics: 'mse', 'rmse' and 'predictions'
            for regression; 'accuracy' and 'predictions' otherwise
        """
        y_pred = self.predict(X)

        if self.problem_type == 'regression':
            mse = mean_squared_error(y, y_pred)
            rmse = np.sqrt(mse)
            return {
                'mse': mse,
                'rmse': rmse,
                'predictions': y_pred
            }
        else:
            accuracy = accuracy_score(y, y_pred)
            return {
                'accuracy': accuracy,
                'predictions': y_pred
            }
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def gradient_boosting_analysis(X: Union[np.ndarray, pd.DataFrame],
                               y: Union[np.ndarray, pd.Series],
                               algorithm: str = 'sklearn',
                               problem_type: str = 'regression',
                               test_size: float = 0.2,
                               n_estimators: int = 100,
                               learning_rate: float = 0.1,
                               max_depth: int = 3,
                               random_state: int = 42) -> dict:
    """
    Run a train/test Gradient Boosting workflow in one call

    The data is split once, an EconGradientBoosting model is fitted on the
    training part, and the model is then scored on both parts.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    y : array-like of shape (n_samples,)
        Target variable
    algorithm : str, 'sklearn' or 'xgboost'
        Which implementation to use
    problem_type : str, 'regression' or 'classification'
        Type of problem to solve
    test_size : float
        Proportion of dataset to include in test split
    n_estimators : int
        Number of boosting stages
    learning_rate : float
        Learning rate shrinks the contribution of each tree
    max_depth : int
        Maximum depth of the individual regression estimators
    random_state : int
        Random state, used for both the split and the model

    Returns:
    --------
    results : dict
        Keys: 'model', 'train_results', 'test_results',
        'feature_importances', 'X_train', 'X_test', 'y_train', 'y_test'
    """
    # Hold out a test set; the same seed drives the split and the model.
    split = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    gb_model = EconGradientBoosting(
        algorithm=algorithm,
        problem_type=problem_type,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
    )
    gb_model.fit(X_train, y_train)

    results = {'model': gb_model}
    # In-sample first, then out-of-sample, then the importances.
    results['train_results'] = gb_model.evaluate(X_train, y_train)
    results['test_results'] = gb_model.evaluate(X_test, y_test)
    results['feature_importances'] = gb_model.feature_importance()
    results['X_train'] = X_train
    results['X_test'] = X_test
    results['y_train'] = y_train
    results['y_test'] = y_test
    return results
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hierarchical Clustering implementation for econometric analysis
|
|
3
|
+
"""
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.cluster import AgglomerativeClustering, linkage_tree
|
|
7
|
+
from sklearn.metrics import silhouette_score, calinski_harabasz_score
|
|
8
|
+
from sklearn.preprocessing import StandardScaler
|
|
9
|
+
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
|
10
|
+
from scipy.spatial.distance import pdist
|
|
11
|
+
from typing import Union, Optional, Dict, Any
|
|
12
|
+
|
|
13
|
+
# Optional matplotlib import
|
|
14
|
+
try:
|
|
15
|
+
import matplotlib.pyplot as plt
|
|
16
|
+
MATPLOTLIB_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
MATPLOTLIB_AVAILABLE = False
|
|
19
|
+
except UnicodeDecodeError:
|
|
20
|
+
# Handle encoding problems raised during import
|
|
21
|
+
MATPLOTLIB_AVAILABLE = False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EconHierarchicalClustering:
    """
    Hierarchical Clustering for econometric analysis

    Features are standardised with a StandardScaler before clustering. Cluster
    labels come from scikit-learn's AgglomerativeClustering; a separate scipy
    linkage matrix is kept in ``self.linkage_matrix`` for dendrogram plotting.
    """

    # scikit-learn metric names that scipy's pdist knows under another name.
    _SCIPY_METRIC_ALIASES = {
        'l1': 'cityblock',
        'manhattan': 'cityblock',
        'l2': 'euclidean',
    }

    def __init__(self, n_clusters: int = 2, linkage: str = 'ward',
                 metric: str = 'euclidean'):
        """
        Initialize Hierarchical Clustering model

        Parameters:
        -----------
        n_clusters : int
            Number of clusters to find
        linkage : str, 'ward', 'complete', 'average', 'single'
            Which linkage criterion to use
        metric : str or callable
            Metric used to compute the linkage. Can be 'euclidean', 'l1', 'l2',
            'manhattan', 'cosine', or 'precomputed'. Ignored (forced to
            'euclidean') when ``linkage`` is 'ward'.
        """
        self.n_clusters = n_clusters
        self.linkage = linkage
        # 'ward' linkage requires the 'euclidean' metric, so override the request.
        self.metric = 'euclidean' if linkage == 'ward' else metric
        self.scaler = StandardScaler()

        self.model = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage=linkage,
            metric=self.metric
        )

        # Linkage matrix for the dendrogram; filled in by fit() or plot_dendrogram()
        self.linkage_matrix = None

    @staticmethod
    def _pdist_metric(metric):
        """Translate a scikit-learn metric name into the name scipy's pdist expects."""
        if isinstance(metric, str):
            return EconHierarchicalClustering._SCIPY_METRIC_ALIASES.get(metric, metric)
        # Callables are passed through unchanged.
        return metric

    def _compute_linkage_matrix(self, X_scaled) -> np.ndarray:
        """Build the scipy linkage matrix for already-scaled observations."""
        distance_matrix = pdist(X_scaled, metric=self._pdist_metric(self.metric))
        return linkage(distance_matrix, method=self.linkage)

    def fit(self, X: Union[np.ndarray, pd.DataFrame]) -> 'EconHierarchicalClustering':
        """
        Fit the Hierarchical Clustering model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data

        Returns:
        --------
        self : EconHierarchicalClustering
        """
        # Scale features
        # NOTE(review): the scaler is also applied when metric == 'precomputed',
        # i.e. to what is presumably a distance matrix -- confirm this is intended.
        X_scaled = self.scaler.fit_transform(X)

        # Fit the model
        self.model.fit(X_scaled)

        # Compute linkage matrix for dendrogram (needs raw observations, so it
        # is skipped for precomputed distances)
        if self.metric != 'precomputed':
            self.linkage_matrix = self._compute_linkage_matrix(X_scaled)

        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame] = None) -> np.ndarray:
        """
        Get cluster labels

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features) or None
            Data to predict (not used in hierarchical clustering,
            returns labels from fit)

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        return self.model.labels_

    def fit_predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Fit the hierarchical clustering model and return cluster labels

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        self.fit(X)
        return self.model.labels_

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, float]:
        """
        Evaluate clustering performance

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Data to evaluate; scored against the labels stored by fit()

        Returns:
        --------
        metrics : dict
            'silhouette_score' and 'calinski_harabasz_score'; both are 0.0
            when only one cluster label is present
        """
        # Scale features with the scaler fitted in fit()
        X_scaled = self.scaler.transform(X)
        labels = self.model.labels_

        # Calculate metrics if more than 1 cluster
        if len(np.unique(labels)) > 1:
            silhouette = silhouette_score(X_scaled, labels)
            calinski_harabasz = calinski_harabasz_score(X_scaled, labels)
        else:
            silhouette = 0.0
            calinski_harabasz = 0.0

        return {
            'silhouette_score': silhouette,
            'calinski_harabasz_score': calinski_harabasz
        }

    def plot_dendrogram(self, X: Union[np.ndarray, pd.DataFrame] = None,
                        truncate_mode: str = 'level', p: int = 5,
                        figsize: tuple = (12, 8)) -> Optional[Any]:
        """
        Plot dendrogram for hierarchical clustering

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features) or None
            Data used to build the linkage matrix when none has been stored
            yet; ignored when a linkage matrix already exists
        truncate_mode : str
            Truncation mode for dendrogram
        p : int
            Parameter for truncation
        figsize : tuple
            Figure size

        Returns:
        --------
        fig : matplotlib Figure or None
            The figure object, or None if matplotlib is not available

        Raises:
        -------
        ValueError
            If no linkage matrix is stored and none can be built from ``X``
        """
        if not MATPLOTLIB_AVAILABLE:
            print("Matplotlib is not available. Skipping visualization.")
            return None

        # Compute linkage matrix if not already computed
        if self.linkage_matrix is None and X is not None:
            if self.metric == 'precomputed':
                # pdist has no 'precomputed' metric; fail with a clear message
                # instead of scipy's "unknown metric" error.
                raise ValueError(
                    "Cannot build a linkage matrix from data when metric is 'precomputed'."
                )
            X_scaled = self.scaler.transform(X)
            self.linkage_matrix = self._compute_linkage_matrix(X_scaled)

        if self.linkage_matrix is None:
            raise ValueError("No linkage matrix available. Please fit the model first or provide data.")

        # Create plot
        fig, ax = plt.subplots(figsize=figsize)
        dendrogram(
            self.linkage_matrix,
            truncate_mode=truncate_mode,
            p=p,
            ax=ax
        )
        ax.set_xlabel('Sample Index or (Cluster Size)')
        ax.set_ylabel('Distance')
        ax.set_title('Hierarchical Clustering Dendrogram')

        return fig
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def hierarchical_clustering_analysis(X: Union[np.ndarray, pd.DataFrame],
                                     n_clusters: int = 2,
                                     linkage: str = 'ward',
                                     metric: str = 'euclidean') -> dict:
    """
    Cluster the data hierarchically and score the result in one call

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    n_clusters : int
        Number of clusters to find
    linkage : str, 'ward', 'complete', 'average', 'single'
        Which linkage criterion to use
    metric : str or callable
        Metric used to compute the linkage

    Returns:
    --------
    results : dict
        Keys: 'model' (the fitted EconHierarchicalClustering), 'labels',
        'metrics' (output of the model's evaluate()) and 'X' (the input,
        returned as given)
    """
    hc_model = EconHierarchicalClustering(n_clusters=n_clusters, linkage=linkage, metric=metric)

    # Fit and label first, then score the same data.
    results = {'model': hc_model}
    results['labels'] = hc_model.fit_predict(X)
    results['metrics'] = hc_model.evaluate(X)
    results['X'] = X
    return results
|