aigroup-econ-mcp 1.4.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. PKG-INFO +344 -322
  2. README.md +335 -320
  3. __init__.py +1 -1
  4. aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
  5. aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
  6. cli.py +4 -0
  7. econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
  8. econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
  9. econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
  10. econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
  11. econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
  12. econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
  13. econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
  14. econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
  15. econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
  16. econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
  17. econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
  18. econometrics/causal_inference/__init__.py +66 -0
  19. econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
  20. econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
  21. econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
  22. econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
  23. econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
  24. econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
  25. econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
  26. econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
  27. econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
  28. econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
  29. econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
  30. econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
  31. econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
  32. econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
  33. econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
  34. econometrics/distribution_analysis/__init__.py +28 -0
  35. econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
  36. econometrics/distribution_analysis/time_series_decomposition.py +152 -0
  37. econometrics/distribution_analysis/variance_decomposition.py +179 -0
  38. econometrics/missing_data/__init__.py +18 -0
  39. econometrics/missing_data/imputation_methods.py +219 -0
  40. econometrics/nonparametric/__init__.py +35 -0
  41. econometrics/nonparametric/gam_model.py +117 -0
  42. econometrics/nonparametric/kernel_regression.py +161 -0
  43. econometrics/nonparametric/quantile_regression.py +249 -0
  44. econometrics/nonparametric/spline_regression.py +100 -0
  45. econometrics/spatial_econometrics/__init__.py +68 -0
  46. econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
  47. econometrics/spatial_econometrics/gwr_simple.py +154 -0
  48. econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
  49. econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
  50. econometrics/spatial_econometrics/spatial_regression.py +315 -0
  51. econometrics/spatial_econometrics/spatial_weights.py +226 -0
  52. econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
  53. econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
  54. econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
  55. econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
  56. econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
  57. econometrics/statistical_inference/__init__.py +21 -0
  58. econometrics/statistical_inference/bootstrap_methods.py +162 -0
  59. econometrics/statistical_inference/permutation_test.py +177 -0
  60. econometrics/survival_analysis/__init__.py +18 -0
  61. econometrics/survival_analysis/survival_models.py +259 -0
  62. econometrics/tests/causal_inference_tests/__init__.py +3 -0
  63. econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
  64. econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
  65. econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
  66. econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
  67. econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
  68. econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
  69. econometrics//321/206/320/254/320/272/321/205/342/225/235/320/220/321/205/320/237/320/241/321/205/320/264/320/267/321/207/342/226/222/342/225/227/321/204/342/225/235/320/250/321/205/320/225/320/230/321/207/342/225/221/320/267/321/205/320/230/320/226/321/206/320/256/320/240.md +544 -0
  70. pyproject.toml +9 -2
  71. server.py +15 -1
  72. tools/__init__.py +75 -1
  73. tools/causal_inference_adapter.py +658 -0
  74. tools/distribution_analysis_adapter.py +121 -0
  75. tools/gwr_simple_adapter.py +54 -0
  76. tools/machine_learning_adapter.py +567 -0
  77. tools/mcp_tool_groups/__init__.py +15 -1
  78. tools/mcp_tool_groups/causal_inference_tools.py +643 -0
  79. tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
  80. tools/mcp_tool_groups/machine_learning_tools.py +422 -0
  81. tools/mcp_tool_groups/microecon_tools.py +325 -0
  82. tools/mcp_tool_groups/missing_data_tools.py +117 -0
  83. tools/mcp_tool_groups/nonparametric_tools.py +225 -0
  84. tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
  85. tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
  86. tools/mcp_tools_registry.py +13 -3
  87. tools/microecon_adapter.py +412 -0
  88. tools/missing_data_adapter.py +73 -0
  89. tools/nonparametric_adapter.py +190 -0
  90. tools/spatial_econometrics_adapter.py +318 -0
  91. tools/statistical_inference_adapter.py +90 -0
  92. tools/survival_analysis_adapter.py +46 -0
  93. aigroup_econ_mcp-1.4.3.dist-info/METADATA +0 -710
  94. aigroup_econ_mcp-1.4.3.dist-info/RECORD +0 -92
  95. {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
  96. {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/entry_points.txt +0 -0
  97. {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/licenses/LICENSE +0 -0
econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py
@@ -0,0 +1,243 @@
+ """
+ Hierarchical Clustering implementation for econometric analysis
+ """
+ import numpy as np
+ import pandas as pd
+ from sklearn.cluster import AgglomerativeClustering
+ from sklearn.metrics import silhouette_score, calinski_harabasz_score
+ from sklearn.preprocessing import StandardScaler
+ from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
+ from scipy.spatial.distance import pdist
+ from typing import Union, Optional, Dict, Any
+
+ # Optional matplotlib import
+ try:
+     import matplotlib.pyplot as plt
+     MATPLOTLIB_AVAILABLE = True
+ except ImportError:
+     MATPLOTLIB_AVAILABLE = False
+ except UnicodeDecodeError:
+     # Handle encoding issues on import
+     MATPLOTLIB_AVAILABLE = False
+
+
+ class EconHierarchicalClustering:
+     """
+     Hierarchical Clustering for econometric analysis
+     """
+
+     def __init__(self, n_clusters: int = 2, linkage: str = 'ward',
+                  metric: str = 'euclidean'):
+         """
+         Initialize Hierarchical Clustering model
+
+         Parameters:
+         -----------
+         n_clusters : int
+             Number of clusters to find
+         linkage : str, 'ward', 'complete', 'average', or 'single'
+             Which linkage criterion to use
+         metric : str or callable
+             Metric used to compute the linkage. Can be 'euclidean', 'l1', 'l2',
+             'manhattan', 'cosine', or 'precomputed'
+         """
+         self.n_clusters = n_clusters
+         self.linkage = linkage
+         self.metric = metric
+         self.scaler = StandardScaler()
+
+         # Initialize model
+         # Note: 'ward' linkage requires the 'euclidean' metric
+         if linkage == 'ward':
+             self.metric = 'euclidean'
+
+         self.model = AgglomerativeClustering(
+             n_clusters=n_clusters,
+             linkage=linkage,
+             metric=metric if linkage != 'ward' else 'euclidean'
+         )
+
+         # Linkage matrix for the dendrogram, computed in fit()
+         self.linkage_matrix = None
+
+     def fit(self, X: Union[np.ndarray, pd.DataFrame]) -> 'EconHierarchicalClustering':
+         """
+         Fit the Hierarchical Clustering model
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             Training data
+
+         Returns:
+         --------
+         self : EconHierarchicalClustering
+         """
+         # Scale features
+         X_scaled = self.scaler.fit_transform(X)
+
+         # Fit the model
+         self.model.fit(X_scaled)
+
+         # Compute linkage matrix for dendrogram
+         if self.metric != 'precomputed':
+             distance_matrix = pdist(X_scaled, metric=self.metric)
+             self.linkage_matrix = linkage(distance_matrix, method=self.linkage)
+
+         return self
+
+     def predict(self, X: Union[np.ndarray, pd.DataFrame] = None) -> np.ndarray:
+         """
+         Get cluster labels
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features) or None
+             Ignored: agglomerative clustering cannot assign new samples,
+             so this returns the labels from fit
+
+         Returns:
+         --------
+         labels : ndarray of shape (n_samples,)
+             Index of the cluster each sample belongs to
+         """
+         return self.model.labels_
+
+     def fit_predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+         """
+         Fit the hierarchical clustering model and return cluster labels
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             Training data
+
+         Returns:
+         --------
+         labels : ndarray of shape (n_samples,)
+             Index of the cluster each sample belongs to
+         """
+         self.fit(X)
+         return self.model.labels_
+
+     def evaluate(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, float]:
+         """
+         Evaluate clustering performance
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             Data to evaluate (should be the data used in fit, since the
+             labels come from the fitted model)
+
+         Returns:
+         --------
+         metrics : dict
+             Dictionary with evaluation metrics
+         """
+         # Scale features
+         X_scaled = self.scaler.transform(X)
+         labels = self.model.labels_
+
+         # Metrics are only defined for more than one cluster
+         if len(np.unique(labels)) > 1:
+             silhouette = silhouette_score(X_scaled, labels)
+             calinski_harabasz = calinski_harabasz_score(X_scaled, labels)
+         else:
+             silhouette = 0.0
+             calinski_harabasz = 0.0
+
+         return {
+             'silhouette_score': silhouette,
+             'calinski_harabasz_score': calinski_harabasz
+         }
+
+     def plot_dendrogram(self, X: Union[np.ndarray, pd.DataFrame] = None,
+                         truncate_mode: str = 'level', p: int = 5,
+                         figsize: tuple = (12, 8)) -> Optional[Any]:
+         """
+         Plot dendrogram for hierarchical clustering
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features) or None
+             Data to visualize (if None, uses the linkage matrix from fit)
+         truncate_mode : str
+             Truncation mode for the dendrogram
+         p : int
+             Parameter for truncation
+         figsize : tuple
+             Figure size
+
+         Returns:
+         --------
+         fig : matplotlib Figure or None
+             The figure object, or None if matplotlib is not available
+         """
+         if not MATPLOTLIB_AVAILABLE:
+             print("Matplotlib is not available. Skipping visualization.")
+             return None
+
+         # Compute linkage matrix if not already computed
+         if self.linkage_matrix is None and X is not None:
+             X_scaled = self.scaler.transform(X)
+             distance_matrix = pdist(X_scaled, metric=self.metric)
+             self.linkage_matrix = linkage(distance_matrix, method=self.linkage)
+
+         if self.linkage_matrix is None:
+             raise ValueError("No linkage matrix available. Please fit the model first or provide data.")
+
+         # Create plot
+         fig, ax = plt.subplots(figsize=figsize)
+         dendrogram(
+             self.linkage_matrix,
+             truncate_mode=truncate_mode,
+             p=p,
+             ax=ax
+         )
+         ax.set_xlabel('Sample Index or (Cluster Size)')
+         ax.set_ylabel('Distance')
+         ax.set_title('Hierarchical Clustering Dendrogram')
+
+         return fig
+
+
+ def hierarchical_clustering_analysis(X: Union[np.ndarray, pd.DataFrame],
+                                      n_clusters: int = 2,
+                                      linkage: str = 'ward',
+                                      metric: str = 'euclidean') -> dict:
+     """
+     Perform complete Hierarchical Clustering analysis
+
+     Parameters:
+     -----------
+     X : array-like of shape (n_samples, n_features)
+         Features
+     n_clusters : int
+         Number of clusters to find
+     linkage : str, 'ward', 'complete', 'average', or 'single'
+         Which linkage criterion to use
+     metric : str or callable
+         Metric used to compute the linkage
+
+     Returns:
+     --------
+     results : dict
+         Dictionary with model, cluster labels, and evaluation metrics
+     """
+     # Initialize and fit model
+     hc_model = EconHierarchicalClustering(
+         n_clusters=n_clusters,
+         linkage=linkage,
+         metric=metric
+     )
+     labels = hc_model.fit_predict(X)
+
+     # Evaluate clustering
+     metrics = hc_model.evaluate(X)
+
+     return {
+         'model': hc_model,
+         'labels': labels,
+         'metrics': metrics,
+         'X': X
+     }
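
As a quick orientation to the new module, here is a minimal usage sketch. The import path mirrors the file location above; the synthetic data, variable names, and printed fields are illustrative assumptions, not part of the package:

```python
import numpy as np
# Assumed import path, matching the file listed in this diff
from econometrics.advanced_methods.modern_computing_machine_learning.hierarchical_clustering import (
    hierarchical_clustering_analysis,
)

# Synthetic example: two well-separated groups of observations
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 3)), rng.normal(5, 1, (50, 3))])

results = hierarchical_clustering_analysis(X, n_clusters=2, linkage='ward')
print(results['labels'][:10])                  # cluster assignment per sample
print(results['metrics']['silhouette_score'])  # cohesion/separation, in [-1, 1]

# Dendrogram of the fitted hierarchy (returns None if matplotlib is unavailable)
fig = results['model'].plot_dendrogram()
```

Note that with `linkage='ward'` the class silently forces the metric to `'euclidean'`, since Ward's criterion is only defined for Euclidean distances.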
econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py
@@ -0,0 +1,293 @@
+ """
+ K-Means Clustering implementation for econometric analysis
+ """
+ import numpy as np
+ import pandas as pd
+ from sklearn.cluster import KMeans, MiniBatchKMeans
+ from sklearn.metrics import silhouette_score, calinski_harabasz_score
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.decomposition import PCA
+ from typing import Union, Optional, Dict, Any
+
+ # Optional matplotlib import
+ try:
+     import matplotlib.pyplot as plt
+     MATPLOTLIB_AVAILABLE = True
+ except ImportError:
+     MATPLOTLIB_AVAILABLE = False
+ except UnicodeDecodeError:
+     # Handle encoding issues on import
+     MATPLOTLIB_AVAILABLE = False
+
+
+ class EconKMeans:
+     """
+     K-Means Clustering for econometric analysis
+     """
+
+     def __init__(self, n_clusters: int = 8, init: str = 'k-means++', n_init: int = 10,
+                  max_iter: int = 300, random_state: int = 42, algorithm: str = 'lloyd',
+                  use_minibatch: bool = False, batch_size: int = 1000):
+         """
+         Initialize K-Means clustering model
+
+         Parameters:
+         -----------
+         n_clusters : int
+             Number of clusters to form
+         init : str, 'k-means++' or 'random'
+             Method for initialization
+         n_init : int
+             Number of times the k-means algorithm will be run with different centroid seeds
+         max_iter : int
+             Maximum number of iterations of the k-means algorithm for a single run
+         random_state : int
+             Random state for reproducibility
+         algorithm : str, 'lloyd' or 'elkan'
+             K-means algorithm to use
+         use_minibatch : bool
+             Whether to use MiniBatchKMeans for large datasets
+         batch_size : int
+             Size of the mini batches (only used when use_minibatch=True)
+         """
+         self.n_clusters = n_clusters
+         self.init = init
+         self.n_init = n_init
+         self.max_iter = max_iter
+         self.random_state = random_state
+         self.algorithm = algorithm
+         self.use_minibatch = use_minibatch
+         self.batch_size = batch_size
+         self.scaler = StandardScaler()
+
+         if use_minibatch:
+             self.model = MiniBatchKMeans(
+                 n_clusters=n_clusters,
+                 init=init,
+                 max_iter=max_iter,
+                 random_state=random_state,
+                 batch_size=batch_size
+             )
+         else:
+             self.model = KMeans(
+                 n_clusters=n_clusters,
+                 init=init,
+                 n_init=n_init,
+                 max_iter=max_iter,
+                 random_state=random_state,
+                 algorithm=algorithm
+             )
+
+     def fit(self, X: Union[np.ndarray, pd.DataFrame]) -> 'EconKMeans':
+         """
+         Fit the K-Means clustering model
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             Training data
+
+         Returns:
+         --------
+         self : EconKMeans
+         """
+         # Scale features
+         X_scaled = self.scaler.fit_transform(X)
+         self.model.fit(X_scaled)
+         return self
+
+     def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+         """
+         Predict the closest cluster each sample in X belongs to
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             New data to predict
+
+         Returns:
+         --------
+         labels : ndarray of shape (n_samples,)
+             Index of the cluster each sample belongs to
+         """
+         # Scale features using the fitted scaler
+         X_scaled = self.scaler.transform(X)
+         return self.model.predict(X_scaled)
+
+     def fit_predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+         """
+         Compute cluster centers and predict cluster index for each sample
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             Training data
+
+         Returns:
+         --------
+         labels : ndarray of shape (n_samples,)
+             Index of the cluster each sample belongs to
+         """
+         # Scale features
+         X_scaled = self.scaler.fit_transform(X)
+         return self.model.fit_predict(X_scaled)
+
+     def cluster_centers(self) -> np.ndarray:
+         """
+         Get the cluster centers
+
+         Returns:
+         --------
+         centers : ndarray of shape (n_clusters, n_features)
+             Coordinates of cluster centers, mapped back to the original feature scale
+         """
+         return self.scaler.inverse_transform(self.model.cluster_centers_)
+
+     def evaluate(self, X: Union[np.ndarray, pd.DataFrame]) -> Dict[str, float]:
+         """
+         Evaluate clustering performance
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             Data to evaluate
+
+         Returns:
+         --------
+         metrics : dict
+             Dictionary with evaluation metrics
+         """
+         # Scale features
+         X_scaled = self.scaler.transform(X)
+         labels = self.model.predict(X_scaled)
+
+         # Calculate metrics
+         silhouette = silhouette_score(X_scaled, labels)
+         calinski_harabasz = calinski_harabasz_score(X_scaled, labels)
+
+         return {
+             'silhouette_score': silhouette,
+             'calinski_harabasz_score': calinski_harabasz,
+             'inertia': self.model.inertia_,
+             'n_iter': self.model.n_iter_
+         }
+
+     def visualize_clusters(self, X: Union[np.ndarray, pd.DataFrame],
+                            max_features: int = 10, figsize: tuple = (12, 8)) -> Optional[Any]:
+         """
+         Visualize clusters using PCA for dimensionality reduction
+
+         Parameters:
+         -----------
+         X : array-like of shape (n_samples, n_features)
+             Data to visualize
+         max_features : int
+             Maximum number of features to show in the plot
+         figsize : tuple
+             Figure size
+
+         Returns:
+         --------
+         fig : matplotlib Figure or None
+             The figure object, or None if matplotlib is not available
+         """
+         if not MATPLOTLIB_AVAILABLE:
+             print("Matplotlib is not available. Skipping visualization.")
+             return None
+
+         # Scale features
+         X_scaled = self.scaler.transform(X)
+         labels = self.model.predict(X_scaled)
+
+         # Use PCA for dimensionality reduction if there are more than 2 features
+         if X_scaled.shape[1] > 2:
+             pca = PCA(n_components=2)
+             X_pca = pca.fit_transform(X_scaled)
+         else:
+             X_pca = X_scaled
+
+         # Create plot
+         fig, ax = plt.subplots(figsize=figsize)
+
+         # Plot points colored by cluster
+         scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.7)
+
+         # Plot cluster centers, projected into PCA space when PCA was used
+         if hasattr(self.model, 'cluster_centers_'):
+             centers_pca = pca.transform(self.model.cluster_centers_) if X_scaled.shape[1] > 2 else self.model.cluster_centers_
+             ax.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='x', s=200, linewidths=3)
+
+         ax.set_xlabel('Principal Component 1' if X_scaled.shape[1] > 2 else 'Feature 1')
+         ax.set_ylabel('Principal Component 2' if X_scaled.shape[1] > 2 else 'Feature 2')
+         ax.set_title('K-Means Clustering Results')
+
+         # Add colorbar
+         plt.colorbar(scatter, ax=ax)
+
+         return fig
+
+
+ def kmeans_analysis(X: Union[np.ndarray, pd.DataFrame],
+                     n_clusters: int = 8,
+                     init: str = 'k-means++',
+                     n_init: int = 10,
+                     max_iter: int = 300,
+                     random_state: int = 42,
+                     algorithm: str = 'lloyd',
+                     use_minibatch: bool = False,
+                     batch_size: int = 1000) -> dict:
+     """
+     Perform complete K-Means clustering analysis
+
+     Parameters:
+     -----------
+     X : array-like of shape (n_samples, n_features)
+         Features
+     n_clusters : int
+         Number of clusters to form
+     init : str, 'k-means++' or 'random'
+         Method for initialization
+     n_init : int
+         Number of times the k-means algorithm will be run with different centroid seeds
+     max_iter : int
+         Maximum number of iterations of the k-means algorithm for a single run
+     random_state : int
+         Random state for reproducibility
+     algorithm : str, 'lloyd' or 'elkan'
+         K-means algorithm to use
+     use_minibatch : bool
+         Whether to use MiniBatchKMeans for large datasets
+     batch_size : int
+         Size of the mini batches (only used when use_minibatch=True)
+
+     Returns:
+     --------
+     results : dict
+         Dictionary with model, cluster labels, centers, and evaluation metrics
+     """
+     # Initialize and fit model
+     kmeans_model = EconKMeans(
+         n_clusters=n_clusters,
+         init=init,
+         n_init=n_init,
+         max_iter=max_iter,
+         random_state=random_state,
+         algorithm=algorithm,
+         use_minibatch=use_minibatch,
+         batch_size=batch_size
+     )
+     labels = kmeans_model.fit_predict(X)
+
+     # Get cluster centers
+     centers = kmeans_model.cluster_centers()
+
+     # Evaluate clustering
+     metrics = kmeans_model.evaluate(X)
+
+     return {
+         'model': kmeans_model,
+         'labels': labels,
+         'cluster_centers': centers,
+         'metrics': metrics,
+         'X': X
+     }
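
Likewise, a minimal sketch of how the K-Means helper might be called (the import path and synthetic data are assumptions for illustration). Setting `use_minibatch=True` switches to `MiniBatchKMeans`, trading some accuracy for speed on large samples:

```python
import numpy as np
# Assumed import path, matching the file listed in this diff
from econometrics.advanced_methods.modern_computing_machine_learning.kmeans_clustering import (
    kmeans_analysis,
)

rng = np.random.default_rng(42)
X = rng.normal(size=(2_000, 5))  # illustrative data only

# Full-batch Lloyd's algorithm
res = kmeans_analysis(X, n_clusters=3, random_state=42)
print(res['cluster_centers'].shape)   # (3, 5), in the original feature scale
print(res['metrics']['inertia'])      # within-cluster sum of squares

# Mini-batch variant for larger datasets
res_mb = kmeans_analysis(X, n_clusters=3, use_minibatch=True, batch_size=512)

# New observations are assigned via the fitted scaler + model
labels_new = res['model'].predict(rng.normal(size=(5, 5)))
```

Because both helpers standardize features internally, `cluster_centers()` inverse-transforms the centroids so they are reported in the original units rather than in z-scores.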