aigroup-econ-mcp 1.4.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PKG-INFO +344 -322
- README.md +335 -320
- __init__.py +1 -1
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- cli.py +4 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics//321/206/320/254/320/272/321/205/342/225/235/320/220/321/205/320/237/320/241/321/205/320/264/320/267/321/207/342/226/222/342/225/227/321/204/342/225/235/320/250/321/205/320/225/320/230/321/207/342/225/221/320/267/321/205/320/230/320/226/321/206/320/256/320/240.md +544 -0
- pyproject.toml +9 -2
- server.py +15 -1
- tools/__init__.py +75 -1
- tools/causal_inference_adapter.py +658 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -1
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tools_registry.py +13 -3
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/nonparametric_adapter.py +190 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- aigroup_econ_mcp-1.4.3.dist-info/METADATA +0 -710
- aigroup_econ_mcp-1.4.3.dist-info/RECORD +0 -92
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/licenses/LICENSE +0 -0
econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py
@@ -0,0 +1,243 @@
"""
Hierarchical Clustering implementation for econometric analysis
"""
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering, linkage_tree
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist
from typing import Union, Optional, Dict, Any

# Optional matplotlib import
try:
    import matplotlib.pyplot as plt
    MATPLOTLIB_AVAILABLE = True
except ImportError:
    MATPLOTLIB_AVAILABLE = False
except UnicodeDecodeError:
    # Handle encoding issues
    MATPLOTLIB_AVAILABLE = False


class EconHierarchicalClustering:
    """
    Hierarchical Clustering for econometric analysis
    """

    def __init__(self, n_clusters: int = 2, linkage: str = 'ward',
                 metric: str = 'euclidean'):
        """
        Initialize Hierarchical Clustering model

        Parameters:
        -----------
        n_clusters : int
            Number of clusters to find
        linkage : str, 'ward', 'complete', 'average', 'single'
            Which linkage criterion to use
        metric : str or callable
            Metric used to compute the linkage. Can be 'euclidean', 'l1', 'l2',
            'manhattan', 'cosine', or 'precomputed'
        """
        self.n_clusters = n_clusters
        self.linkage = linkage
        self.metric = metric
        self.scaler = StandardScaler()

        # Initialize model
        # Note: 'ward' linkage requires 'euclidean' metric
        if linkage == 'ward':
            self.metric = 'euclidean'

        self.model = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage=linkage,
            metric=metric if linkage != 'ward' else 'euclidean'
        )

        # Store linkage matrix for dendrogram
        self.linkage_matrix = None

    def fit(self, X: Union[np.ndarray, pd.DataFrame]) -> 'EconHierarchicalClustering':
        """
        Fit the Hierarchical Clustering model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data

        Returns:
        --------
        self : EconHierarchicalClustering
        """
        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Fit the model
        self.model.fit(X_scaled)

        # Compute linkage matrix for dendrogram
        if self.metric != 'precomputed':
            distance_matrix = pdist(X_scaled, metric=self.metric)
            self.linkage_matrix = linkage(distance_matrix, method=self.linkage)

        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame] = None) -> np.ndarray:
        """
        Get cluster labels

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features) or None
            Data to predict (not used in hierarchical clustering,
            returns labels from fit)

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        return self.model.labels_

    def fit_predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Fit the hierarchical clustering model and return cluster labels

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        self.fit(X)
        return self.model.labels_

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, float]:
        """
        Evaluate clustering performance

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Data to evaluate

        Returns:
        --------
        metrics : dict
            Dictionary with evaluation metrics
        """
        # Scale features
        X_scaled = self.scaler.transform(X)
        labels = self.model.labels_

        # Calculate metrics if more than 1 cluster
        if len(np.unique(labels)) > 1:
            silhouette = silhouette_score(X_scaled, labels)
            calinski_harabasz = calinski_harabasz_score(X_scaled, labels)
        else:
            silhouette = 0.0
            calinski_harabasz = 0.0

        return {
            'silhouette_score': silhouette,
            'calinski_harabasz_score': calinski_harabasz
        }

    def plot_dendrogram(self, X: Union[np.ndarray, pd.DataFrame] = None,
                        truncate_mode: str = 'level', p: int = 5,
                        figsize: tuple = (12, 8)) -> Optional:
        """
        Plot dendrogram for hierarchical clustering

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features) or None
            Data to visualize (if None, uses data from fit)
        truncate_mode : str
            Truncation mode for dendrogram
        p : int
            Parameter for truncation
        figsize : tuple
            Figure size

        Returns:
        --------
        fig : matplotlib Figure or None
            The figure object, or None if matplotlib is not available
        """
        if not MATPLOTLIB_AVAILABLE:
            print("Matplotlib is not available. Skipping visualization.")
            return None

        # Compute linkage matrix if not already computed
        if self.linkage_matrix is None and X is not None:
            X_scaled = self.scaler.transform(X)
            distance_matrix = pdist(X_scaled, metric=self.metric)
            self.linkage_matrix = linkage(distance_matrix, method=self.linkage)

        if self.linkage_matrix is None:
            raise ValueError("No linkage matrix available. Please fit the model first or provide data.")

        # Create plot
        fig, ax = plt.subplots(figsize=figsize)
        dendrogram(
            self.linkage_matrix,
            truncate_mode=truncate_mode,
            p=p,
            ax=ax
        )
        ax.set_xlabel('Sample Index or (Cluster Size)')
        ax.set_ylabel('Distance')
        ax.set_title('Hierarchical Clustering Dendrogram')

        return fig


def hierarchical_clustering_analysis(X: Union[np.ndarray, pd.DataFrame],
                                     n_clusters: int = 2,
                                     linkage: str = 'ward',
                                     metric: str = 'euclidean') -> dict:
    """
    Perform complete Hierarchical Clustering analysis

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    n_clusters : int
        Number of clusters to find
    linkage : str, 'ward', 'complete', 'average', 'single'
        Which linkage criterion to use
    metric : str or callable
        Metric used to compute the linkage

    Returns:
    --------
    results : dict
        Dictionary with model, cluster labels, and evaluation metrics
    """
    # Initialize and fit model
    hc_model = EconHierarchicalClustering(
        n_clusters=n_clusters,
        linkage=linkage,
        metric=metric
    )
    labels = hc_model.fit_predict(X)

    # Evaluate clustering
    metrics = hc_model.evaluate(X)

    return {
        'model': hc_model,
        'labels': labels,
        'metrics': metrics,
        'X': X
    }
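For reference, a minimal usage sketch of the new module follows. It assumes the installed wheel exposes the module under the econometrics.advanced_methods.modern_computing_machine_learning package path shown in the file list above and that scikit-learn, scipy, and numpy are installed; the synthetic data and the chosen parameter values are illustrative only, not part of the package.

# Minimal usage sketch (import path and data are assumptions, see note above)
import numpy as np
from econometrics.advanced_methods.modern_computing_machine_learning.hierarchical_clustering import (
    hierarchical_clustering_analysis,
)

rng = np.random.default_rng(0)
# Two loosely separated groups of observations with three features each
X = np.vstack([rng.normal(0, 1, size=(50, 3)), rng.normal(5, 1, size=(50, 3))])

results = hierarchical_clustering_analysis(X, n_clusters=2, linkage='ward')
print(results['labels'][:10])                  # cluster assignment per sample
print(results['metrics']['silhouette_score'])  # higher is better, max 1.0
fig = results['model'].plot_dendrogram(X)      # returns None if matplotlib is unavailable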
econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py
@@ -0,0 +1,293 @@
"""
K-Means Clustering implementation for econometric analysis
"""
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from typing import Union, Optional, Dict, Any

# Optional matplotlib import
try:
    import matplotlib.pyplot as plt
    MATPLOTLIB_AVAILABLE = True
except ImportError:
    MATPLOTLIB_AVAILABLE = False
except UnicodeDecodeError:
    # Handle encoding issues
    MATPLOTLIB_AVAILABLE = False


class EconKMeans:
    """
    K-Means Clustering for econometric analysis
    """

    def __init__(self, n_clusters: int = 8, init: str = 'k-means++', n_init: int = 10,
                 max_iter: int = 300, random_state: int = 42, algorithm: str = 'lloyd',
                 use_minibatch: bool = False, batch_size: int = 1000):
        """
        Initialize K-Means clustering model

        Parameters:
        -----------
        n_clusters : int
            Number of clusters to form
        init : str, 'k-means++', 'random'
            Method for initialization
        n_init : int
            Number of times the k-means algorithm will be run with different centroid seeds
        max_iter : int
            Maximum number of iterations of the k-means algorithm for a single run
        random_state : int
            Random state for reproducibility
        algorithm : str, 'lloyd', 'elkan'
            K-means algorithm to use
        use_minibatch : bool
            Whether to use MiniBatchKMeans for large datasets
        batch_size : int
            Size of the mini batches (only used when use_minibatch=True)
        """
        self.n_clusters = n_clusters
        self.init = init
        self.n_init = n_init
        self.max_iter = max_iter
        self.random_state = random_state
        self.algorithm = algorithm
        self.use_minibatch = use_minibatch
        self.batch_size = batch_size
        self.scaler = StandardScaler()

        if use_minibatch:
            self.model = MiniBatchKMeans(
                n_clusters=n_clusters,
                init=init,
                max_iter=max_iter,
                random_state=random_state,
                batch_size=batch_size
            )
        else:
            self.model = KMeans(
                n_clusters=n_clusters,
                init=init,
                n_init=n_init,
                max_iter=max_iter,
                random_state=random_state,
                algorithm=algorithm
            )

    def fit(self, X: Union[np.ndarray, pd.DataFrame]) -> 'EconKMeans':
        """
        Fit the K-Means clustering model

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data

        Returns:
        --------
        self : EconKMeans
        """
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Predict the closest cluster each sample in X belongs to

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            New data to predict

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        # Scale features using the same scaler
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def fit_predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Compute cluster centers and predict cluster index for each sample

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data

        Returns:
        --------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        return self.model.fit_predict(X_scaled)

    def cluster_centers(self) -> np.ndarray:
        """
        Get the cluster centers

        Returns:
        --------
        centers : ndarray of shape (n_clusters, n_features)
            Coordinates of cluster centers
        """
        return self.scaler.inverse_transform(self.model.cluster_centers_)

    def evaluate(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, float]:
        """
        Evaluate clustering performance

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Data to evaluate

        Returns:
        --------
        metrics : dict
            Dictionary with evaluation metrics
        """
        # Scale features
        X_scaled = self.scaler.transform(X)
        labels = self.model.predict(X_scaled)

        # Calculate metrics
        silhouette = silhouette_score(X_scaled, labels)
        calinski_harabasz = calinski_harabasz_score(X_scaled, labels)

        return {
            'silhouette_score': silhouette,
            'calinski_harabasz_score': calinski_harabasz,
            'inertia': self.model.inertia_,
            'n_iter': self.model.n_iter_
        }

    def visualize_clusters(self, X: Union[np.ndarray, pd.DataFrame],
                           max_features: int = 10, figsize: tuple = (12, 8)) -> Optional:
        """
        Visualize clusters using PCA for dimensionality reduction

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Data to visualize
        max_features : int
            Maximum number of features to show in the plot
        figsize : tuple
            Figure size

        Returns:
        --------
        fig : matplotlib Figure or None
            The figure object, or None if matplotlib is not available
        """
        if not MATPLOTLIB_AVAILABLE:
            print("Matplotlib is not available. Skipping visualization.")
            return None

        # Scale features
        X_scaled = self.scaler.transform(X)
        labels = self.model.predict(X_scaled)

        # Use PCA for dimensionality reduction if there are more than 2 features
        if X_scaled.shape[1] > 2:
            pca = PCA(n_components=min(2, X_scaled.shape[1]))
            X_pca = pca.fit_transform(X_scaled)
        else:
            X_pca = X_scaled

        # Create plot
        fig, ax = plt.subplots(figsize=figsize)

        # Plot points colored by cluster
        scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.7)

        # Plot cluster centers if available in PCA space
        if hasattr(self.model, 'cluster_centers_'):
            centers_pca = pca.transform(self.model.cluster_centers_) if X_scaled.shape[1] > 2 else self.model.cluster_centers_
            ax.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='x', s=200, linewidths=3)

        ax.set_xlabel('Principal Component 1' if X_scaled.shape[1] > 2 else 'Feature 1')
        ax.set_ylabel('Principal Component 2' if X_scaled.shape[1] > 2 else 'Feature 2')
        ax.set_title('K-Means Clustering Results')

        # Add colorbar
        plt.colorbar(scatter, ax=ax)

        return fig


def kmeans_analysis(X: Union[np.ndarray, pd.DataFrame],
                    n_clusters: int = 8,
                    init: str = 'k-means++',
                    n_init: int = 10,
                    max_iter: int = 300,
                    random_state: int = 42,
                    algorithm: str = 'lloyd',
                    use_minibatch: bool = False,
                    batch_size: int = 1000) -> dict:
    """
    Perform complete K-Means clustering analysis

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Features
    n_clusters : int
        Number of clusters to form
    init : str, 'k-means++', 'random'
        Method for initialization
    n_init : int
        Number of times the k-means algorithm will be run with different centroid seeds
    max_iter : int
        Maximum number of iterations of the k-means algorithm for a single run
    random_state : int
        Random state for reproducibility
    algorithm : str, 'lloyd', 'elkan'
        K-means algorithm to use
    use_minibatch : bool
        Whether to use MiniBatchKMeans for large datasets
    batch_size : int
        Size of the mini batches (only used when use_minibatch=True)

    Returns:
    --------
    results : dict
        Dictionary with model, cluster labels, centers, and evaluation metrics
    """
    # Initialize and fit model
    kmeans_model = EconKMeans(
        n_clusters=n_clusters,
        init=init,
        n_init=n_init,
        max_iter=max_iter,
        random_state=random_state,
        algorithm=algorithm,
        use_minibatch=use_minibatch,
        batch_size=batch_size
    )
    labels = kmeans_model.fit_predict(X)

    # Get cluster centers
    centers = kmeans_model.cluster_centers()

    # Evaluate clustering
    metrics = kmeans_model.evaluate(X)

    return {
        'model': kmeans_model,
        'labels': labels,
        'cluster_centers': centers,
        'metrics': metrics,
        'X': X
    }
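As with the hierarchical module, here is a minimal usage sketch under the same assumptions (package import path as listed above, scikit-learn and numpy installed); the data and parameter choices are illustrative, not taken from the package.

# Minimal usage sketch (import path and data are assumptions, see note above)
import numpy as np
from econometrics.advanced_methods.modern_computing_machine_learning.kmeans_clustering import (
    kmeans_analysis,
)

rng = np.random.default_rng(0)
# Two synthetic groups of observations with four features each
X = np.vstack([rng.normal(0, 1, size=(100, 4)), rng.normal(4, 1, size=(100, 4))])

results = kmeans_analysis(X, n_clusters=2, use_minibatch=False)
print(results['cluster_centers'])         # centers reported on the original feature scale
print(results['metrics']['inertia'])      # within-cluster sum of squares (scaled space)
new_labels = results['model'].predict(rng.normal(0, 1, size=(5, 4)))  # assign new samples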