aigroup-econ-mcp 1.3.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. .gitignore +253 -0
  2. PKG-INFO +732 -0
  3. README.md +687 -0
  4. __init__.py +14 -0
  5. aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
  6. aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
  7. aigroup_econ_mcp-2.0.1.dist-info/entry_points.txt +2 -0
  8. aigroup_econ_mcp-2.0.1.dist-info/licenses/LICENSE +21 -0
  9. cli.py +32 -0
  10. econometrics/README.md +18 -0
  11. econometrics/__init__.py +191 -0
  12. econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
  13. econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
  14. econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
  15. econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
  16. econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
  17. econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
  18. econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
  19. econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
  20. econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
  21. econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
  22. econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
  23. econometrics/basic_parametric_estimation/__init__.py +31 -0
  24. econometrics/basic_parametric_estimation/gmm/__init__.py +13 -0
  25. econometrics/basic_parametric_estimation/gmm/gmm_model.py +256 -0
  26. econometrics/basic_parametric_estimation/mle/__init__.py +13 -0
  27. econometrics/basic_parametric_estimation/mle/mle_model.py +241 -0
  28. econometrics/basic_parametric_estimation/ols/__init__.py +13 -0
  29. econometrics/basic_parametric_estimation/ols/ols_model.py +141 -0
  30. econometrics/causal_inference/__init__.py +66 -0
  31. econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
  32. econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
  33. econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
  34. econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
  35. econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
  36. econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
  37. econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
  38. econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
  39. econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
  40. econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
  41. econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
  42. econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
  43. econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
  44. econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
  45. econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
  46. econometrics/distribution_analysis/__init__.py +28 -0
  47. econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
  48. econometrics/distribution_analysis/time_series_decomposition.py +152 -0
  49. econometrics/distribution_analysis/variance_decomposition.py +179 -0
  50. econometrics/missing_data/__init__.py +18 -0
  51. econometrics/missing_data/imputation_methods.py +219 -0
  52. econometrics/missing_data/missing_data_measurement_error/__init__.py +0 -0
  53. econometrics/model_specification_diagnostics_robust_inference/README.md +173 -0
  54. econometrics/model_specification_diagnostics_robust_inference/__init__.py +78 -0
  55. econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/__init__.py +20 -0
  56. econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/diagnostic_tests_model.py +149 -0
  57. econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/__init__.py +15 -0
  58. econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/gls_model.py +130 -0
  59. econometrics/model_specification_diagnostics_robust_inference/model_selection/__init__.py +18 -0
  60. econometrics/model_specification_diagnostics_robust_inference/model_selection/model_selection_model.py +286 -0
  61. econometrics/model_specification_diagnostics_robust_inference/regularization/__init__.py +15 -0
  62. econometrics/model_specification_diagnostics_robust_inference/regularization/regularization_model.py +177 -0
  63. econometrics/model_specification_diagnostics_robust_inference/robust_errors/__init__.py +15 -0
  64. econometrics/model_specification_diagnostics_robust_inference/robust_errors/robust_errors_model.py +122 -0
  65. econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/__init__.py +15 -0
  66. econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/simultaneous_equations_model.py +246 -0
  67. econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/__init__.py +15 -0
  68. econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/wls_model.py +127 -0
  69. econometrics/nonparametric/__init__.py +35 -0
  70. econometrics/nonparametric/gam_model.py +117 -0
  71. econometrics/nonparametric/kernel_regression.py +161 -0
  72. econometrics/nonparametric/nonparametric_semiparametric_methods/__init__.py +0 -0
  73. econometrics/nonparametric/quantile_regression.py +249 -0
  74. econometrics/nonparametric/spline_regression.py +100 -0
  75. econometrics/spatial_econometrics/__init__.py +68 -0
  76. econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
  77. econometrics/spatial_econometrics/gwr_simple.py +154 -0
  78. econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
  79. econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
  80. econometrics/spatial_econometrics/spatial_econometrics_new/__init__.py +0 -0
  81. econometrics/spatial_econometrics/spatial_regression.py +315 -0
  82. econometrics/spatial_econometrics/spatial_weights.py +226 -0
  83. econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
  84. econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
  85. econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
  86. econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
  87. econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
  88. econometrics/specific_data_modeling/survival_duration_data/__init__.py +0 -0
  89. econometrics/specific_data_modeling/time_series_panel_data/__init__.py +143 -0
  90. econometrics/specific_data_modeling/time_series_panel_data/arima_model.py +104 -0
  91. econometrics/specific_data_modeling/time_series_panel_data/cointegration_vecm.py +334 -0
  92. econometrics/specific_data_modeling/time_series_panel_data/dynamic_panel_models.py +653 -0
  93. econometrics/specific_data_modeling/time_series_panel_data/exponential_smoothing.py +176 -0
  94. econometrics/specific_data_modeling/time_series_panel_data/garch_model.py +198 -0
  95. econometrics/specific_data_modeling/time_series_panel_data/panel_diagnostics.py +125 -0
  96. econometrics/specific_data_modeling/time_series_panel_data/panel_var.py +60 -0
  97. econometrics/specific_data_modeling/time_series_panel_data/structural_break_tests.py +87 -0
  98. econometrics/specific_data_modeling/time_series_panel_data/time_varying_parameter_models.py +106 -0
  99. econometrics/specific_data_modeling/time_series_panel_data/unit_root_tests.py +204 -0
  100. econometrics/specific_data_modeling/time_series_panel_data/var_svar_model.py +372 -0
  101. econometrics/statistical_inference/__init__.py +21 -0
  102. econometrics/statistical_inference/bootstrap_methods.py +162 -0
  103. econometrics/statistical_inference/permutation_test.py +177 -0
  104. econometrics/statistical_inference/statistical_inference_techniques/__init__.py +0 -0
  105. econometrics/statistics/distribution_decomposition_methods/__init__.py +0 -0
  106. econometrics/survival_analysis/__init__.py +18 -0
  107. econometrics/survival_analysis/survival_models.py +259 -0
  108. econometrics/tests/basic_parametric_estimation_tests/__init__.py +3 -0
  109. econometrics/tests/basic_parametric_estimation_tests/test_gmm.py +128 -0
  110. econometrics/tests/basic_parametric_estimation_tests/test_mle.py +127 -0
  111. econometrics/tests/basic_parametric_estimation_tests/test_ols.py +100 -0
  112. econometrics/tests/causal_inference_tests/__init__.py +3 -0
  113. econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
  114. econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
  115. econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
  116. econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
  117. econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
  118. econometrics/tests/model_specification_diagnostics_tests/__init__.py +3 -0
  119. econometrics/tests/model_specification_diagnostics_tests/test_diagnostic_tests.py +86 -0
  120. econometrics/tests/model_specification_diagnostics_tests/test_robust_errors.py +89 -0
  121. econometrics/tests/specific_data_modeling_tests/__init__.py +3 -0
  122. econometrics/tests/specific_data_modeling_tests/test_arima.py +98 -0
  123. econometrics/tests/specific_data_modeling_tests/test_dynamic_panel.py +198 -0
  124. econometrics/tests/specific_data_modeling_tests/test_exponential_smoothing.py +105 -0
  125. econometrics/tests/specific_data_modeling_tests/test_garch.py +118 -0
  126. econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
  127. econometrics/tests/specific_data_modeling_tests/test_unit_root.py +156 -0
  128. econometrics/tests/specific_data_modeling_tests/test_var.py +124 -0
  129. econometrics/未开发大类优先级分析.md +544 -0
  130. prompts/__init__.py +0 -0
  131. prompts/analysis_guides.py +43 -0
  132. pyproject.toml +85 -0
  133. resources/MCP_MASTER_GUIDE.md +422 -0
  134. resources/MCP_TOOLS_DATA_FORMAT_GUIDE.md +185 -0
  135. resources/__init__.py +0 -0
  136. server.py +97 -0
  137. tools/README.md +88 -0
  138. tools/__init__.py +119 -0
  139. tools/causal_inference_adapter.py +658 -0
  140. tools/data_loader.py +213 -0
  141. tools/decorators.py +38 -0
  142. tools/distribution_analysis_adapter.py +121 -0
  143. tools/econometrics_adapter.py +286 -0
  144. tools/gwr_simple_adapter.py +54 -0
  145. tools/machine_learning_adapter.py +567 -0
  146. tools/mcp_tool_groups/__init__.py +15 -0
  147. tools/mcp_tool_groups/basic_parametric_tools.py +173 -0
  148. tools/mcp_tool_groups/causal_inference_tools.py +643 -0
  149. tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
  150. tools/mcp_tool_groups/machine_learning_tools.py +422 -0
  151. tools/mcp_tool_groups/microecon_tools.py +325 -0
  152. tools/mcp_tool_groups/missing_data_tools.py +117 -0
  153. tools/mcp_tool_groups/model_specification_tools.py +402 -0
  154. tools/mcp_tool_groups/nonparametric_tools.py +225 -0
  155. tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
  156. tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
  157. tools/mcp_tool_groups/time_series_tools.py +494 -0
  158. tools/mcp_tools_registry.py +124 -0
  159. tools/microecon_adapter.py +412 -0
  160. tools/missing_data_adapter.py +73 -0
  161. tools/model_specification_adapter.py +369 -0
  162. tools/nonparametric_adapter.py +190 -0
  163. tools/output_formatter.py +563 -0
  164. tools/spatial_econometrics_adapter.py +318 -0
  165. tools/statistical_inference_adapter.py +90 -0
  166. tools/survival_analysis_adapter.py +46 -0
  167. tools/time_series_panel_data_adapter.py +858 -0
  168. tools/time_series_panel_data_tools.py +65 -0
  169. aigroup_econ_mcp/__init__.py +0 -19
  170. aigroup_econ_mcp/cli.py +0 -82
  171. aigroup_econ_mcp/config.py +0 -561
  172. aigroup_econ_mcp/server.py +0 -452
  173. aigroup_econ_mcp/tools/__init__.py +0 -19
  174. aigroup_econ_mcp/tools/base.py +0 -470
  175. aigroup_econ_mcp/tools/cache.py +0 -533
  176. aigroup_econ_mcp/tools/data_loader.py +0 -195
  177. aigroup_econ_mcp/tools/file_parser.py +0 -1027
  178. aigroup_econ_mcp/tools/machine_learning.py +0 -60
  179. aigroup_econ_mcp/tools/ml_ensemble.py +0 -210
  180. aigroup_econ_mcp/tools/ml_evaluation.py +0 -272
  181. aigroup_econ_mcp/tools/ml_models.py +0 -54
  182. aigroup_econ_mcp/tools/ml_regularization.py +0 -186
  183. aigroup_econ_mcp/tools/monitoring.py +0 -555
  184. aigroup_econ_mcp/tools/optimized_example.py +0 -229
  185. aigroup_econ_mcp/tools/panel_data.py +0 -619
  186. aigroup_econ_mcp/tools/regression.py +0 -214
  187. aigroup_econ_mcp/tools/statistics.py +0 -154
  188. aigroup_econ_mcp/tools/time_series.py +0 -698
  189. aigroup_econ_mcp/tools/timeout.py +0 -283
  190. aigroup_econ_mcp/tools/tool_descriptions.py +0 -410
  191. aigroup_econ_mcp/tools/tool_handlers.py +0 -1016
  192. aigroup_econ_mcp/tools/tool_registry.py +0 -478
  193. aigroup_econ_mcp/tools/validation.py +0 -482
  194. aigroup_econ_mcp-1.3.3.dist-info/METADATA +0 -525
  195. aigroup_econ_mcp-1.3.3.dist-info/RECORD +0 -30
  196. aigroup_econ_mcp-1.3.3.dist-info/entry_points.txt +0 -2
  197. /aigroup_econ_mcp-1.3.3.dist-info/licenses/LICENSE → /LICENSE +0 -0
  198. {aigroup_econ_mcp-1.3.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,249 @@
1
+ """
2
+ Gradient Boosting Machine (GBM/XGBoost) implementation for econometric analysis
3
+ """
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.metrics import mean_squared_error, accuracy_score
9
+ try:
10
+ import xgboost as xgb
11
+ XGBOOST_AVAILABLE = True
12
+ except ImportError:
13
+ XGBOOST_AVAILABLE = False
14
+ from typing import Union, Optional, Dict, Any
15
+
16
+
17
class EconGradientBoosting:
    """
    Gradient Boosting for econometric analysis with both scikit-learn and XGBoost implementations

    The wrapped estimator is stored in ``self.model``; ``fit``, ``predict`` and
    ``evaluate`` delegate to it.
    """

    def __init__(self, algorithm: str = 'sklearn', problem_type: str = 'regression',
                 n_estimators: int = 100, learning_rate: float = 0.1,
                 max_depth: int = 3, random_state: int = 42):
        """
        Initialize Gradient Boosting model

        Parameters:
        -----------
        algorithm : str, 'sklearn' or 'xgboost'
            Which implementation to use
        problem_type : str, 'regression' or 'classification'
            Type of problem to solve
        n_estimators : int
            Number of boosting stages
        learning_rate : float
            Learning rate shrinks the contribution of each tree
        max_depth : int
            Maximum depth of the individual regression estimators
        random_state : int
            Random state for reproducibility

        Raises:
        -------
        ValueError
            If ``algorithm`` or ``problem_type`` is not one of the supported values
        ImportError
            If ``algorithm='xgboost'`` is requested but XGBoost is not installed
        """
        self.algorithm = algorithm
        self.problem_type = problem_type
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state

        # Validate everything before building the estimator so a bad argument
        # fails here instead of surfacing later as a missing ``self.model``.
        if algorithm not in ('sklearn', 'xgboost'):
            raise ValueError("algorithm must be either 'sklearn' or 'xgboost'")
        if algorithm == 'xgboost' and not XGBOOST_AVAILABLE:
            raise ImportError("XGBoost is not installed. Please install it with 'pip install xgboost'")
        if problem_type not in ('regression', 'classification'):
            raise ValueError("problem_type must be either 'regression' or 'classification'")

        is_regression = problem_type == 'regression'
        if algorithm == 'sklearn':
            estimator_cls = GradientBoostingRegressor if is_regression else GradientBoostingClassifier
        else:
            estimator_cls = xgb.XGBRegressor if is_regression else xgb.XGBClassifier

        self.model = estimator_cls(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=random_state
        )

    def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> 'EconGradientBoosting':
        """
        Fit the Gradient Boosting model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns:
        --------
        self : EconGradientBoosting
        """
        self.model.fit(X, y)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict using the Gradient Boosting model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples

        Returns:
        --------
        y_pred : ndarray of shape (n_samples,)
            Predicted values
        """
        return self.model.predict(X)

    def feature_importance(self) -> Dict[str, np.ndarray]:
        """
        Get feature importances

        Returns:
        --------
        importances : dict
            For 'sklearn': ``{'importances': feature_importances_}``.
            For 'xgboost': one array per importance type ('weight', 'gain',
            'cover', 'total_gain', 'total_cover'), each holding the raw booster
            score per feature in feature order. Features the booster never
            split on get 0.0. Types the booster cannot report are omitted, and
            an unfitted XGBoost model yields an empty dict.

        Raises:
        -------
        ValueError
            If ``self.algorithm`` was changed to an unsupported value
        """
        if self.algorithm == 'sklearn':
            return {
                'importances': self.model.feature_importances_
            }
        if self.algorithm != 'xgboost':
            raise ValueError("algorithm must be either 'sklearn' or 'xgboost'")

        try:
            booster = self.model.get_booster()
        except (ValueError, AttributeError):
            # Model not fitted yet: stay best-effort and report nothing
            return {}

        feature_names = booster.feature_names
        if feature_names is None:
            # Without explicit names the booster keys its scores as f0, f1, ...
            feature_names = [f'f{i}' for i in range(booster.num_features())]

        importances = {}
        for imp_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
            try:
                scores = booster.get_score(importance_type=imp_type)
            except ValueError:
                # This importance type is unavailable for the booster; skip it
                continue
            # get_score leaves out unused features, so fill those with 0.0
            importances[imp_type] = np.array(
                [scores.get(name, 0.0) for name in feature_names], dtype=float
            )
        return importances

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame],
                 y: Union[np.ndarray, pd.Series]) -> dict:
        """
        Evaluate model performance

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Test data
        y : array-like of shape (n_samples,)
            True values

        Returns:
        --------
        metrics : dict
            'mse', 'rmse' and 'predictions' for regression;
            'accuracy' and 'predictions' otherwise
        """
        y_pred = self.predict(X)

        if self.problem_type == 'regression':
            mse = mean_squared_error(y, y_pred)
            rmse = np.sqrt(mse)
            return {
                'mse': mse,
                'rmse': rmse,
                'predictions': y_pred
            }
        else:
            accuracy = accuracy_score(y, y_pred)
            return {
                'accuracy': accuracy,
                'predictions': y_pred
            }
177
+
178
+
179
def gradient_boosting_analysis(X: Union[np.ndarray, pd.DataFrame],
                               y: Union[np.ndarray, pd.Series],
                               algorithm: str = 'sklearn',
                               problem_type: str = 'regression',
                               test_size: float = 0.2,
                               n_estimators: int = 100,
                               learning_rate: float = 0.1,
                               max_depth: int = 3,
                               random_state: int = 42) -> dict:
    """
    Run the full Gradient Boosting workflow: split, fit, evaluate, rank features

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    y : array-like of shape (n_samples,)
        Target variable
    algorithm : str, 'sklearn' or 'xgboost'
        Which implementation to use
    problem_type : str, 'regression' or 'classification'
        Type of problem to solve
    test_size : float
        Proportion of dataset to include in test split
    n_estimators : int
        Number of boosting stages
    learning_rate : float
        Learning rate shrinks the contribution of each tree
    max_depth : int
        Maximum depth of the individual regression estimators
    random_state : int
        Seed used for both the train/test split and the model

    Returns:
    --------
    results : dict
        Keys 'model', 'train_results', 'test_results', 'feature_importances',
        'X_train', 'X_test', 'y_train' and 'y_test'
    """
    # Hold out a test partition
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Build the estimator and train it on the training partition only
    estimator = EconGradientBoosting(
        algorithm=algorithm,
        problem_type=problem_type,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state
    )
    estimator.fit(features_train, target_train)

    # Score both partitions, then collect the importances
    results = {'model': estimator}
    results['train_results'] = estimator.evaluate(features_train, target_train)
    results['test_results'] = estimator.evaluate(features_test, target_test)
    results['feature_importances'] = estimator.feature_importance()
    results.update(
        X_train=features_train,
        X_test=features_test,
        y_train=target_train,
        y_test=target_test,
    )
    return results
@@ -0,0 +1,243 @@
1
+ """
2
+ Hierarchical Clustering implementation for econometric analysis
3
+ """
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.cluster import AgglomerativeClustering, linkage_tree
7
+ from sklearn.metrics import silhouette_score, calinski_harabasz_score
8
+ from sklearn.preprocessing import StandardScaler
9
+ from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
10
+ from scipy.spatial.distance import pdist
11
+ from typing import Union, Optional, Dict, Any
12
+
13
+ # 可选导入matplotlib
14
+ try:
15
+ import matplotlib.pyplot as plt
16
+ MATPLOTLIB_AVAILABLE = True
17
+ except ImportError:
18
+ MATPLOTLIB_AVAILABLE = False
19
+ except UnicodeDecodeError:
20
+ # 处理编码问题
21
+ MATPLOTLIB_AVAILABLE = False
22
+
23
+
24
class EconHierarchicalClustering:
    """
    Hierarchical Clustering for econometric analysis

    Wraps scikit-learn's AgglomerativeClustering for the cluster labels and
    builds a SciPy linkage matrix alongside it for dendrogram plotting.
    """

    # scikit-learn metric aliases that SciPy's pdist does not recognise,
    # mapped to the SciPy name for the same distance
    _SCIPY_METRIC_ALIASES = {
        'l1': 'cityblock',
        'manhattan': 'cityblock',
        'l2': 'euclidean',
    }

    def __init__(self, n_clusters: int = 2, linkage: str = 'ward',
                 metric: str = 'euclidean'):
        """
        Initialize Hierarchical Clustering model

        Parameters:
        -----------
        n_clusters : int
            Number of clusters to find
        linkage : str, 'ward', 'complete', 'average', 'single'
            Which linkage criterion to use
        metric : str or callable
            Metric used to compute the linkage. Can be 'euclidean', 'l1', 'l2',
            'manhattan', 'cosine', or 'precomputed'. Ignored (forced to
            'euclidean') when linkage is 'ward'.
        """
        self.n_clusters = n_clusters
        self.linkage = linkage
        # 'ward' linkage requires 'euclidean' metric
        self.metric = 'euclidean' if linkage == 'ward' else metric
        self.scaler = StandardScaler()

        self.model = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage=linkage,
            metric=self.metric
        )

        # Store linkage matrix for dendrogram
        self.linkage_matrix = None

    def _prepare_features(self, X: Union[np.ndarray, pd.DataFrame], fit: bool) -> np.ndarray:
        """Return the matrix to cluster on: standardized features, or the raw distances when precomputed."""
        if self.metric == 'precomputed':
            # Standardizing a distance matrix would distort the distances
            return np.asarray(X, dtype=float)
        if fit:
            return self.scaler.fit_transform(X)
        return self.scaler.transform(X)

    def _compute_linkage_matrix(self, X_prepared: np.ndarray) -> np.ndarray:
        """Build the SciPy linkage matrix from prepared features or a precomputed distance matrix."""
        if self.metric == 'precomputed':
            distances = np.asarray(X_prepared, dtype=float)
            # Row-major upper triangle is the condensed form linkage() expects
            rows, cols = np.triu_indices(distances.shape[0], k=1)
            condensed = distances[rows, cols]
        else:
            scipy_metric = self.metric
            if isinstance(scipy_metric, str):
                scipy_metric = self._SCIPY_METRIC_ALIASES.get(scipy_metric, scipy_metric)
            condensed = pdist(X_prepared, metric=scipy_metric)
        return linkage(condensed, method=self.linkage)

    def fit(self, X: Union[np.ndarray, pd.DataFrame]) -> 'EconHierarchicalClustering':
        """
        Fit the Hierarchical Clustering model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data, or a square (n_samples, n_samples) distance matrix
            when metric is 'precomputed'

        Returns:
        --------
        self : EconHierarchicalClustering
        """
        X_prepared = self._prepare_features(X, fit=True)

        # Fit the model
        self.model.fit(X_prepared)

        # Compute linkage matrix for dendrogram
        self.linkage_matrix = self._compute_linkage_matrix(X_prepared)

        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame] = None) -> np.ndarray:
        """
        Get cluster labels

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features) or None
            Data to predict (not used in hierarchical clustering,
            returns labels from fit)

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        return self.model.labels_

    def fit_predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Fit the hierarchical clustering model and return cluster labels

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        self.fit(X)
        return self.model.labels_

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, float]:
        """
        Evaluate clustering performance

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            The data the model was fitted on (labels come from fit)

        Returns:
        --------
        metrics : dict
            'silhouette_score' and 'calinski_harabasz_score'. Both are 0.0 when
            the scores are undefined (a single cluster, or as many clusters as
            samples). With metric 'precomputed' the Calinski-Harabasz score
            needs raw features, so it is reported as 0.0.
        """
        X_prepared = self._prepare_features(X, fit=False)
        labels = self.model.labels_

        silhouette = 0.0
        calinski_harabasz = 0.0

        # Both scores require 2 <= n_labels <= n_samples - 1
        n_labels = len(np.unique(labels))
        if 1 < n_labels < len(labels):
            if self.metric == 'precomputed':
                silhouette = silhouette_score(X_prepared, labels, metric='precomputed')
            else:
                silhouette = silhouette_score(X_prepared, labels)
                calinski_harabasz = calinski_harabasz_score(X_prepared, labels)

        return {
            'silhouette_score': silhouette,
            'calinski_harabasz_score': calinski_harabasz
        }

    def plot_dendrogram(self, X: Union[np.ndarray, pd.DataFrame] = None,
                        truncate_mode: str = 'level', p: int = 5,
                        figsize: tuple = (12, 8)) -> Optional[Any]:
        """
        Plot dendrogram for hierarchical clustering

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features) or None
            Only used to build the linkage matrix when none is stored yet
        truncate_mode : str
            Truncation mode for dendrogram
        p : int
            Parameter for truncation
        figsize : tuple
            Figure size

        Returns:
        --------
        fig : matplotlib Figure or None
            The figure object, or None if matplotlib is not available

        Raises:
        -------
        ValueError
            If no linkage matrix is stored and no data was provided
        """
        if not MATPLOTLIB_AVAILABLE:
            print("Matplotlib is not available. Skipping visualization.")
            return None

        # Compute linkage matrix if not already computed
        if self.linkage_matrix is None and X is not None:
            X_prepared = self._prepare_features(X, fit=False)
            self.linkage_matrix = self._compute_linkage_matrix(X_prepared)

        if self.linkage_matrix is None:
            raise ValueError("No linkage matrix available. Please fit the model first or provide data.")

        # Create plot
        fig, ax = plt.subplots(figsize=figsize)
        dendrogram(
            self.linkage_matrix,
            truncate_mode=truncate_mode,
            p=p,
            ax=ax
        )
        ax.set_xlabel('Sample Index or (Cluster Size)')
        ax.set_ylabel('Distance')
        ax.set_title('Hierarchical Clustering Dendrogram')

        return fig
202
+
203
+
204
def hierarchical_clustering_analysis(X: Union[np.ndarray, pd.DataFrame],
                                     n_clusters: int = 2,
                                     linkage: str = 'ward',
                                     metric: str = 'euclidean') -> dict:
    """
    Cluster the data hierarchically and score the result in one call

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    n_clusters : int
        Number of clusters to find
    linkage : str, 'ward', 'complete', 'average', 'single'
        Which linkage criterion to use
    metric : str or callable
        Metric used to compute the linkage

    Returns:
    --------
    results : dict
        Keys 'model' (the fitted EconHierarchicalClustering), 'labels',
        'metrics' and 'X' (the input, passed back unchanged)
    """
    clusterer = EconHierarchicalClustering(
        n_clusters=n_clusters,
        linkage=linkage,
        metric=metric
    )

    # Fit first; evaluate() below reads the labels produced by this fit
    assignments = clusterer.fit_predict(X)

    return {
        'model': clusterer,
        'labels': assignments,
        'metrics': clusterer.evaluate(X),
        'X': X
    }