aigroup-econ-mcp 1.3.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- .gitignore +253 -0
- PKG-INFO +732 -0
- README.md +687 -0
- __init__.py +14 -0
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- aigroup_econ_mcp-2.0.1.dist-info/entry_points.txt +2 -0
- aigroup_econ_mcp-2.0.1.dist-info/licenses/LICENSE +21 -0
- cli.py +32 -0
- econometrics/README.md +18 -0
- econometrics/__init__.py +191 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/basic_parametric_estimation/__init__.py +31 -0
- econometrics/basic_parametric_estimation/gmm/__init__.py +13 -0
- econometrics/basic_parametric_estimation/gmm/gmm_model.py +256 -0
- econometrics/basic_parametric_estimation/mle/__init__.py +13 -0
- econometrics/basic_parametric_estimation/mle/mle_model.py +241 -0
- econometrics/basic_parametric_estimation/ols/__init__.py +13 -0
- econometrics/basic_parametric_estimation/ols/ols_model.py +141 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/missing_data/missing_data_measurement_error/__init__.py +0 -0
- econometrics/model_specification_diagnostics_robust_inference/README.md +173 -0
- econometrics/model_specification_diagnostics_robust_inference/__init__.py +78 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/__init__.py +20 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/diagnostic_tests_model.py +149 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/gls_model.py +130 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/__init__.py +18 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/model_selection_model.py +286 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/regularization_model.py +177 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/robust_errors_model.py +122 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/simultaneous_equations_model.py +246 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/wls_model.py +127 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/nonparametric_semiparametric_methods/__init__.py +0 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_econometrics_new/__init__.py +0 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/specific_data_modeling/survival_duration_data/__init__.py +0 -0
- econometrics/specific_data_modeling/time_series_panel_data/__init__.py +143 -0
- econometrics/specific_data_modeling/time_series_panel_data/arima_model.py +104 -0
- econometrics/specific_data_modeling/time_series_panel_data/cointegration_vecm.py +334 -0
- econometrics/specific_data_modeling/time_series_panel_data/dynamic_panel_models.py +653 -0
- econometrics/specific_data_modeling/time_series_panel_data/exponential_smoothing.py +176 -0
- econometrics/specific_data_modeling/time_series_panel_data/garch_model.py +198 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_diagnostics.py +125 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_var.py +60 -0
- econometrics/specific_data_modeling/time_series_panel_data/structural_break_tests.py +87 -0
- econometrics/specific_data_modeling/time_series_panel_data/time_varying_parameter_models.py +106 -0
- econometrics/specific_data_modeling/time_series_panel_data/unit_root_tests.py +204 -0
- econometrics/specific_data_modeling/time_series_panel_data/var_svar_model.py +372 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/statistical_inference/statistical_inference_techniques/__init__.py +0 -0
- econometrics/statistics/distribution_decomposition_methods/__init__.py +0 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/basic_parametric_estimation_tests/__init__.py +3 -0
- econometrics/tests/basic_parametric_estimation_tests/test_gmm.py +128 -0
- econometrics/tests/basic_parametric_estimation_tests/test_mle.py +127 -0
- econometrics/tests/basic_parametric_estimation_tests/test_ols.py +100 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/model_specification_diagnostics_tests/__init__.py +3 -0
- econometrics/tests/model_specification_diagnostics_tests/test_diagnostic_tests.py +86 -0
- econometrics/tests/model_specification_diagnostics_tests/test_robust_errors.py +89 -0
- econometrics/tests/specific_data_modeling_tests/__init__.py +3 -0
- econometrics/tests/specific_data_modeling_tests/test_arima.py +98 -0
- econometrics/tests/specific_data_modeling_tests/test_dynamic_panel.py +198 -0
- econometrics/tests/specific_data_modeling_tests/test_exponential_smoothing.py +105 -0
- econometrics/tests/specific_data_modeling_tests/test_garch.py +118 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics/tests/specific_data_modeling_tests/test_unit_root.py +156 -0
- econometrics/tests/specific_data_modeling_tests/test_var.py +124 -0
- econometrics//321/206/320/254/320/272/321/205/342/225/235/320/220/321/205/320/237/320/241/321/205/320/264/320/267/321/207/342/226/222/342/225/227/321/204/342/225/235/320/250/321/205/320/225/320/230/321/207/342/225/221/320/267/321/205/320/230/320/226/321/206/320/256/320/240.md +544 -0
- prompts/__init__.py +0 -0
- prompts/analysis_guides.py +43 -0
- pyproject.toml +85 -0
- resources/MCP_MASTER_GUIDE.md +422 -0
- resources/MCP_TOOLS_DATA_FORMAT_GUIDE.md +185 -0
- resources/__init__.py +0 -0
- server.py +97 -0
- tools/README.md +88 -0
- tools/__init__.py +119 -0
- tools/causal_inference_adapter.py +658 -0
- tools/data_loader.py +213 -0
- tools/decorators.py +38 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/econometrics_adapter.py +286 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -0
- tools/mcp_tool_groups/basic_parametric_tools.py +173 -0
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/model_specification_tools.py +402 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tool_groups/time_series_tools.py +494 -0
- tools/mcp_tools_registry.py +124 -0
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/model_specification_adapter.py +369 -0
- tools/nonparametric_adapter.py +190 -0
- tools/output_formatter.py +563 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- tools/time_series_panel_data_adapter.py +858 -0
- tools/time_series_panel_data_tools.py +65 -0
- aigroup_econ_mcp/__init__.py +0 -19
- aigroup_econ_mcp/cli.py +0 -82
- aigroup_econ_mcp/config.py +0 -561
- aigroup_econ_mcp/server.py +0 -452
- aigroup_econ_mcp/tools/__init__.py +0 -19
- aigroup_econ_mcp/tools/base.py +0 -470
- aigroup_econ_mcp/tools/cache.py +0 -533
- aigroup_econ_mcp/tools/data_loader.py +0 -195
- aigroup_econ_mcp/tools/file_parser.py +0 -1027
- aigroup_econ_mcp/tools/machine_learning.py +0 -60
- aigroup_econ_mcp/tools/ml_ensemble.py +0 -210
- aigroup_econ_mcp/tools/ml_evaluation.py +0 -272
- aigroup_econ_mcp/tools/ml_models.py +0 -54
- aigroup_econ_mcp/tools/ml_regularization.py +0 -186
- aigroup_econ_mcp/tools/monitoring.py +0 -555
- aigroup_econ_mcp/tools/optimized_example.py +0 -229
- aigroup_econ_mcp/tools/panel_data.py +0 -619
- aigroup_econ_mcp/tools/regression.py +0 -214
- aigroup_econ_mcp/tools/statistics.py +0 -154
- aigroup_econ_mcp/tools/time_series.py +0 -698
- aigroup_econ_mcp/tools/timeout.py +0 -283
- aigroup_econ_mcp/tools/tool_descriptions.py +0 -410
- aigroup_econ_mcp/tools/tool_handlers.py +0 -1016
- aigroup_econ_mcp/tools/tool_registry.py +0 -478
- aigroup_econ_mcp/tools/validation.py +0 -482
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +0 -525
- aigroup_econ_mcp-1.3.3.dist-info/RECORD +0 -30
- aigroup_econ_mcp-1.3.3.dist-info/entry_points.txt +0 -2
- /aigroup_econ_mcp-1.3.3.dist-info/licenses/LICENSE → /LICENSE +0 -0
- {aigroup_econ_mcp-1.3.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
--- /dev/null
+++ b/econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py
@@ -0,0 +1,293 @@
+"""
+K-Means Clustering implementation for econometric analysis
+"""
+import numpy as np
+import pandas as pd
+from sklearn.cluster import KMeans, MiniBatchKMeans
+from sklearn.metrics import silhouette_score, calinski_harabasz_score
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+from typing import Union, Optional, Dict, Any
+
+# Optional import of matplotlib
+try:
+    import matplotlib.pyplot as plt
+    MATPLOTLIB_AVAILABLE = True
+except ImportError:
+    MATPLOTLIB_AVAILABLE = False
+except UnicodeDecodeError:
+    # Handle encoding issues
+    MATPLOTLIB_AVAILABLE = False
+
+
+class EconKMeans:
+    """
+    K-Means clustering for econometric analysis
+    """
+
+    def __init__(self, n_clusters: int = 8, init: str = 'k-means++', n_init: int = 10,
+                 max_iter: int = 300, random_state: int = 42, algorithm: str = 'lloyd',
+                 use_minibatch: bool = False, batch_size: int = 1000):
+        """
+        Initialize K-Means clustering model
+
+        Parameters:
+        -----------
+        n_clusters : int
+            Number of clusters to form
+        init : str, 'k-means++' or 'random'
+            Method for initialization
+        n_init : int
+            Number of times the k-means algorithm is run with different centroid seeds
+        max_iter : int
+            Maximum number of iterations of the k-means algorithm for a single run
+        random_state : int
+            Random state for reproducibility
+        algorithm : str, 'lloyd' or 'elkan'
+            K-means algorithm to use
+        use_minibatch : bool
+            Whether to use MiniBatchKMeans for large datasets
+        batch_size : int
+            Size of the mini-batches (only used when use_minibatch=True)
+        """
+        self.n_clusters = n_clusters
+        self.init = init
+        self.n_init = n_init
+        self.max_iter = max_iter
+        self.random_state = random_state
+        self.algorithm = algorithm
+        self.use_minibatch = use_minibatch
+        self.batch_size = batch_size
+        self.scaler = StandardScaler()
+
+        if use_minibatch:
+            self.model = MiniBatchKMeans(
+                n_clusters=n_clusters,
+                init=init,
+                max_iter=max_iter,
+                random_state=random_state,
+                batch_size=batch_size
+            )
+        else:
+            self.model = KMeans(
+                n_clusters=n_clusters,
+                init=init,
+                n_init=n_init,
+                max_iter=max_iter,
+                random_state=random_state,
+                algorithm=algorithm
+            )
+
+    def fit(self, X: Union[np.ndarray, pd.DataFrame]) -> 'EconKMeans':
+        """
+        Fit the K-Means clustering model
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Training data
+
+        Returns:
+        --------
+        self : EconKMeans
+        """
+        # Scale features
+        X_scaled = self.scaler.fit_transform(X)
+        self.model.fit(X_scaled)
+        return self
+
+    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+        """
+        Predict the closest cluster each sample in X belongs to
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            New data to predict
+
+        Returns:
+        --------
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to
+        """
+        # Scale features using the same scaler
+        X_scaled = self.scaler.transform(X)
+        return self.model.predict(X_scaled)
+
+    def fit_predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+        """
+        Compute cluster centers and predict cluster index for each sample
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Training data
+
+        Returns:
+        --------
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to
+        """
+        # Scale features
+        X_scaled = self.scaler.fit_transform(X)
+        return self.model.fit_predict(X_scaled)
+
+    def cluster_centers(self) -> np.ndarray:
+        """
+        Get the cluster centers
+
+        Returns:
+        --------
+        centers : ndarray of shape (n_clusters, n_features)
+            Coordinates of cluster centers, in the original (unscaled) units
+        """
+        return self.scaler.inverse_transform(self.model.cluster_centers_)
+
+    def evaluate(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, float]:
+        """
+        Evaluate clustering performance
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Data to evaluate
+
+        Returns:
+        --------
+        metrics : dict
+            Dictionary with evaluation metrics
+        """
+        # Scale features
+        X_scaled = self.scaler.transform(X)
+        labels = self.model.predict(X_scaled)
+
+        # Calculate metrics
+        silhouette = silhouette_score(X_scaled, labels)
+        calinski_harabasz = calinski_harabasz_score(X_scaled, labels)
+
+        return {
+            'silhouette_score': silhouette,
+            'calinski_harabasz_score': calinski_harabasz,
+            'inertia': self.model.inertia_,
+            'n_iter': self.model.n_iter_
+        }
+
+    def visualize_clusters(self, X: Union[np.ndarray, pd.DataFrame],
+                           max_features: int = 10, figsize: tuple = (12, 8)) -> Optional[Any]:
+        """
+        Visualize clusters using PCA for dimensionality reduction
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Data to visualize
+        max_features : int
+            Maximum number of features to show in the plot (currently unused)
+        figsize : tuple
+            Figure size
+
+        Returns:
+        --------
+        fig : matplotlib Figure or None
+            The figure object, or None if matplotlib is not available
+        """
+        if not MATPLOTLIB_AVAILABLE:
+            print("Matplotlib is not available. Skipping visualization.")
+            return None
+
+        # Scale features
+        X_scaled = self.scaler.transform(X)
+        labels = self.model.predict(X_scaled)
+
+        # Use PCA for dimensionality reduction if there are more than 2 features
+        if X_scaled.shape[1] > 2:
+            pca = PCA(n_components=2)
+            X_pca = pca.fit_transform(X_scaled)
+        else:
+            X_pca = X_scaled
+
+        # Create plot
+        fig, ax = plt.subplots(figsize=figsize)
+
+        # Plot points colored by cluster
+        scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.7)
+
+        # Plot cluster centers, projected into PCA space when PCA was applied
+        if hasattr(self.model, 'cluster_centers_'):
+            centers_pca = pca.transform(self.model.cluster_centers_) if X_scaled.shape[1] > 2 else self.model.cluster_centers_
+            ax.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='x', s=200, linewidths=3)
+
+        ax.set_xlabel('Principal Component 1' if X_scaled.shape[1] > 2 else 'Feature 1')
+        ax.set_ylabel('Principal Component 2' if X_scaled.shape[1] > 2 else 'Feature 2')
+        ax.set_title('K-Means Clustering Results')
+
+        # Add colorbar
+        plt.colorbar(scatter, ax=ax)
+
+        return fig
+
+
+def kmeans_analysis(X: Union[np.ndarray, pd.DataFrame],
+                    n_clusters: int = 8,
+                    init: str = 'k-means++',
+                    n_init: int = 10,
+                    max_iter: int = 300,
+                    random_state: int = 42,
+                    algorithm: str = 'lloyd',
+                    use_minibatch: bool = False,
+                    batch_size: int = 1000) -> dict:
+    """
+    Perform complete K-Means clustering analysis
+
+    Parameters:
+    -----------
+    X : array-like of shape (n_samples, n_features)
+        Features
+    n_clusters : int
+        Number of clusters to form
+    init : str, 'k-means++' or 'random'
+        Method for initialization
+    n_init : int
+        Number of times the k-means algorithm is run with different centroid seeds
+    max_iter : int
+        Maximum number of iterations of the k-means algorithm for a single run
+    random_state : int
+        Random state for reproducibility
+    algorithm : str, 'lloyd' or 'elkan'
+        K-means algorithm to use
+    use_minibatch : bool
+        Whether to use MiniBatchKMeans for large datasets
+    batch_size : int
+        Size of the mini-batches (only used when use_minibatch=True)
+
+    Returns:
+    --------
+    results : dict
+        Dictionary with model, cluster labels, centers, and evaluation metrics
+    """
+    # Initialize and fit model
+    kmeans_model = EconKMeans(
+        n_clusters=n_clusters,
+        init=init,
+        n_init=n_init,
+        max_iter=max_iter,
+        random_state=random_state,
+        algorithm=algorithm,
+        use_minibatch=use_minibatch,
+        batch_size=batch_size
+    )
+    labels = kmeans_model.fit_predict(X)
+
+    # Get cluster centers
+    centers = kmeans_model.cluster_centers()
+
+    # Evaluate clustering
+    metrics = kmeans_model.evaluate(X)
+
+    return {
+        'model': kmeans_model,
+        'labels': labels,
+        'cluster_centers': centers,
+        'metrics': metrics,
+        'X': X
+    }
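The kmeans_analysis helper at the bottom of the file wraps the whole fit/evaluate cycle. For orientation, here is a minimal usage sketch (not part of the diff; the synthetic data and variable names are illustrative, and it assumes the econometrics tree installs as the top-level package shown in the RECORD above):

# Illustrative sketch: cluster three synthetic macro "regimes" and read back the results dict.
import numpy as np
import pandas as pd
from econometrics.advanced_methods.modern_computing_machine_learning.kmeans_clustering import kmeans_analysis

rng = np.random.default_rng(0)
# Three artificial groups of economies: (gdp_growth, inflation, unemployment)
X = pd.DataFrame(
    np.vstack([
        rng.normal([3.0, 2.0, 4.0], 0.5, size=(50, 3)),   # stable growth
        rng.normal([0.5, 8.0, 9.0], 0.5, size=(50, 3)),   # stagflation-like
        rng.normal([6.0, 4.0, 3.0], 0.5, size=(50, 3)),   # fast growth
    ]),
    columns=["gdp_growth", "inflation", "unemployment"],
)

results = kmeans_analysis(X, n_clusters=3, random_state=42)
print(results["labels"][:10])                  # cluster index per observation
print(results["cluster_centers"])              # centers, back in original units
print(results["metrics"]["silhouette_score"])  # closer to 1 = better separation

Note that scaling happens inside EconKMeans, so X is passed in raw units and cluster_centers are returned in those same units via inverse_transform.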
--- /dev/null
+++ b/econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py
@@ -0,0 +1,264 @@
+"""
+Neural Network implementation for econometric analysis
+"""
+import numpy as np
+import pandas as pd
+from sklearn.neural_network import MLPRegressor, MLPClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, accuracy_score
+from sklearn.preprocessing import StandardScaler
+from typing import Union
+
+
+class EconNeuralNetwork:
+    """
+    Neural Network for econometric analysis with both regression and classification capabilities
+    """
+
+    def __init__(self, problem_type: str = 'regression', hidden_layer_sizes: tuple = (100,),
+                 activation: str = 'relu', solver: str = 'adam', alpha: float = 0.0001,
+                 learning_rate: str = 'constant', learning_rate_init: float = 0.001,
+                 max_iter: int = 200, random_state: int = 42):
+        """
+        Initialize Neural Network model
+
+        Parameters:
+        -----------
+        problem_type : str, 'regression' or 'classification'
+            Type of problem to solve
+        hidden_layer_sizes : tuple
+            The ith element represents the number of neurons in the ith hidden layer
+        activation : str, 'identity', 'logistic', 'tanh', or 'relu'
+            Activation function for the hidden layer
+        solver : str, 'lbfgs', 'sgd', or 'adam'
+            The solver for weight optimization
+        alpha : float
+            L2 penalty (regularization term) parameter
+        learning_rate : str, 'constant', 'invscaling', or 'adaptive'
+            Learning rate schedule for weight updates
+        learning_rate_init : float
+            The initial learning rate used
+        max_iter : int
+            Maximum number of iterations
+        random_state : int
+            Random state for reproducibility
+        """
+        self.problem_type = problem_type
+        self.hidden_layer_sizes = hidden_layer_sizes
+        self.activation = activation
+        self.solver = solver
+        self.alpha = alpha
+        self.learning_rate = learning_rate
+        self.learning_rate_init = learning_rate_init
+        self.max_iter = max_iter
+        self.random_state = random_state
+        self.scaler = StandardScaler()
+
+        if problem_type == 'regression':
+            self.model = MLPRegressor(
+                hidden_layer_sizes=hidden_layer_sizes,
+                activation=activation,
+                solver=solver,
+                alpha=alpha,
+                learning_rate=learning_rate,
+                learning_rate_init=learning_rate_init,
+                max_iter=max_iter,
+                random_state=random_state
+            )
+        elif problem_type == 'classification':
+            self.model = MLPClassifier(
+                hidden_layer_sizes=hidden_layer_sizes,
+                activation=activation,
+                solver=solver,
+                alpha=alpha,
+                learning_rate=learning_rate,
+                learning_rate_init=learning_rate_init,
+                max_iter=max_iter,
+                random_state=random_state
+            )
+        else:
+            raise ValueError("problem_type must be either 'regression' or 'classification'")
+
+    def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> 'EconNeuralNetwork':
+        """
+        Fit the Neural Network model
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Training data
+        y : array-like of shape (n_samples,)
+            Target values
+
+        Returns:
+        --------
+        self : EconNeuralNetwork
+        """
+        # Scale features
+        X_scaled = self.scaler.fit_transform(X)
+        self.model.fit(X_scaled, y)
+        return self
+
+    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+        """
+        Predict using the Neural Network model
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Samples
+
+        Returns:
+        --------
+        y_pred : ndarray of shape (n_samples,)
+            Predicted values
+        """
+        # Scale features using the same scaler
+        X_scaled = self.scaler.transform(X)
+        return self.model.predict(X_scaled)
+
+    def predict_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+        """
+        Predict class probabilities using the Neural Network model (classification only)
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Samples
+
+        Returns:
+        --------
+        y_proba : ndarray of shape (n_samples, n_classes)
+            Predicted class probabilities
+        """
+        if self.problem_type != 'classification':
+            raise ValueError("predict_proba is only available for classification problems")
+
+        # Scale features using the same scaler
+        X_scaled = self.scaler.transform(X)
+        return self.model.predict_proba(X_scaled)
+
+    def evaluate(self, X: Union[np.ndarray, pd.DataFrame],
+                 y: Union[np.ndarray, pd.Series]) -> dict:
+        """
+        Evaluate model performance
+
+        Parameters:
+        -----------
+        X : array-like of shape (n_samples, n_features)
+            Test data
+        y : array-like of shape (n_samples,)
+            True values
+
+        Returns:
+        --------
+        metrics : dict
+            Dictionary with evaluation metrics
+        """
+        y_pred = self.predict(X)
+
+        if self.problem_type == 'regression':
+            mse = mean_squared_error(y, y_pred)
+            rmse = np.sqrt(mse)
+            return {
+                'mse': mse,
+                'rmse': rmse,
+                'predictions': y_pred
+            }
+        else:
+            accuracy = accuracy_score(y, y_pred)
+            return {
+                'accuracy': accuracy,
+                'predictions': y_pred
+            }
+
+
+def neural_network_analysis(X: Union[np.ndarray, pd.DataFrame],
+                            y: Union[np.ndarray, pd.Series],
+                            problem_type: str = 'regression',
+                            hidden_layer_sizes: tuple = (100,),
+                            activation: str = 'relu',
+                            solver: str = 'adam',
+                            test_size: float = 0.2,
+                            alpha: float = 0.0001,
+                            learning_rate: str = 'constant',
+                            learning_rate_init: float = 0.001,
+                            max_iter: int = 200,
+                            random_state: int = 42) -> dict:
+    """
+    Perform complete Neural Network analysis
+
+    Parameters:
+    -----------
+    X : array-like of shape (n_samples, n_features)
+        Features
+    y : array-like of shape (n_samples,)
+        Target variable
+    problem_type : str, 'regression' or 'classification'
+        Type of problem to solve
+    hidden_layer_sizes : tuple
+        The ith element represents the number of neurons in the ith hidden layer
+    activation : str, 'identity', 'logistic', 'tanh', or 'relu'
+        Activation function for the hidden layer
+    solver : str, 'lbfgs', 'sgd', or 'adam'
+        The solver for weight optimization
+    test_size : float
+        Proportion of the dataset to include in the test split
+    alpha : float
+        L2 penalty (regularization term) parameter
+    learning_rate : str, 'constant', 'invscaling', or 'adaptive'
+        Learning rate schedule for weight updates
+    learning_rate_init : float
+        The initial learning rate used
+    max_iter : int
+        Maximum number of iterations
+    random_state : int
+        Random state for reproducibility
+
+    Returns:
+    --------
+    results : dict
+        Dictionary with model, predictions, and evaluation metrics
+    """
+    # Split data
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state
+    )
+
+    # Initialize and fit model
+    nn_model = EconNeuralNetwork(
+        problem_type=problem_type,
+        hidden_layer_sizes=hidden_layer_sizes,
+        activation=activation,
+        solver=solver,
+        alpha=alpha,
+        learning_rate=learning_rate,
+        learning_rate_init=learning_rate_init,
+        max_iter=max_iter,
+        random_state=random_state
+    )
+    nn_model.fit(X_train, y_train)
+
+    # Evaluate model
+    train_results = nn_model.evaluate(X_train, y_train)
+    test_results = nn_model.evaluate(X_test, y_test)
+
+    # For classification, also get probabilities
+    if problem_type == 'classification':
+        train_proba = nn_model.predict_proba(X_train)
+        test_proba = nn_model.predict_proba(X_test)
+    else:
+        train_proba = None
+        test_proba = None
+
+    return {
+        'model': nn_model,
+        'train_results': train_results,
+        'test_results': test_results,
+        'train_proba': train_proba,
+        'test_proba': test_proba,
+        'X_train': X_train,
+        'X_test': X_test,
+        'y_train': y_train,
+        'y_test': y_test
+    }
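As with the clustering module, neural_network_analysis bundles split, fit, and evaluation into one call. A minimal usage sketch follows (not part of the diff; the synthetic regression problem and parameter choices are illustrative, assuming the same importable package layout):

# Illustrative sketch: fit a small MLP on a synthetic nonlinear regression problem.
import numpy as np
from econometrics.advanced_methods.modern_computing_machine_learning.neural_network import neural_network_analysis

rng = np.random.default_rng(0)
X = rng.uniform(-2, 2, size=(500, 2))
y = np.sin(X[:, 0]) + 0.5 * X[:, 1] ** 2 + rng.normal(0, 0.1, size=500)  # nonlinear signal + noise

results = neural_network_analysis(
    X, y,
    problem_type="regression",
    hidden_layer_sizes=(50, 25),  # two hidden layers
    max_iter=1000,                # the default of 200 often hits the iteration cap
    random_state=42,
)
print(results["train_results"]["rmse"])  # in-sample fit
print(results["test_results"]["rmse"])   # held-out fit (20% split by default)

For classification problems, the same call with problem_type="classification" also populates results["train_proba"] and results["test_proba"] via predict_proba.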