@icyfenix-dmla/cli 2026.5.2-7 → 2026.5.3-821

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +9 -6
  2. package/scripts/build.js +44 -11
  3. package/shared_modules/__init__.py +10 -0
  4. package/shared_modules/bayesian/__init__.py +6 -0
  5. package/shared_modules/bayesian/bayesian_network.py +105 -0
  6. package/shared_modules/bayesian/gaussian_mixture_model.py +141 -0
  7. package/shared_modules/bayesian/gaussian_mixturemodel.py +141 -0
  8. package/shared_modules/bayesian/multinomial_naive_bayes.py +74 -0
  9. package/shared_modules/bayesian/simple_bayesian_network.py +99 -0
  10. package/shared_modules/bayesian/simple_bayesiannetwork.py +99 -0
  11. package/shared_modules/cnn/__init__.py +5 -0
  12. package/shared_modules/cnn/alex_net.py +65 -0
  13. package/shared_modules/cnn/alexnet.py +65 -0
  14. package/shared_modules/cnn/t_e_r_m1.py +65 -0
  15. package/shared_modules/cnn/tiny_image_net_dataset.py +67 -0
  16. package/shared_modules/cnn/tiny_imagenet_dataset.py +67 -0
  17. package/shared_modules/cnn/tiny_imagenetdataset.py +67 -0
  18. package/shared_modules/cnn/tinyimagenetdataset.py +67 -0
  19. package/shared_modules/linear/__init__.py +6 -0
  20. package/shared_modules/linear/lasso_regression.py +93 -0
  21. package/shared_modules/linear/logistic_regression.py +78 -0
  22. package/shared_modules/linear/naive_bayes.py +141 -0
  23. package/shared_modules/linear/ridge_regression.py +58 -0
  24. package/shared_modules/neural/__init__.py +4 -0
  25. package/shared_modules/neural/perceptron.py +80 -0
  26. package/shared_modules/svm/__init__.py +5 -0
  27. package/shared_modules/svm/kernel_s_v_m.py +98 -0
  28. package/shared_modules/svm/kernel_svm.py +98 -0
  29. package/shared_modules/svm/simple_s_v_m.py +111 -0
  30. package/shared_modules/svm/simple_svm.py +111 -0
  31. package/shared_modules/tree/__init__.py +6 -0
  32. package/shared_modules/tree/ada_boost.py +77 -0
  33. package/shared_modules/tree/decision_tree_classifier.py +235 -0
  34. package/shared_modules/tree/decision_treeclassifier.py +235 -0
  35. package/shared_modules/tree/random_forest_classifier.py +88 -0
  36. package/shared_modules/tree/random_forestclassifier.py +88 -0
  37. package/shared_modules/unsupervised/__init__.py +5 -0
  38. package/shared_modules/unsupervised/k_means.py +127 -0
  39. package/shared_modules/unsupervised/kmeans.py +127 -0
  40. package/shared_modules/unsupervised/p_c_a.py +111 -0
  41. package/shared_modules/unsupervised/pca.py +111 -0
  42. package/src/commands/data.js +823 -0
  43. package/src/commands/server.js +209 -4
  44. package/src/index.js +23 -2
  45. package/src/server/routes/sandbox.js +70 -3
  46. package/src/server/sandbox.js +87 -11
  47. package/version.json +4 -0
@@ -0,0 +1,235 @@
1
+ # DecisionTreeClassifier 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class DecisionTreeClassifier:
    """
    CART decision-tree classifier.

    Builds a binary tree using the Gini index as the split criterion.
    Pre-pruning is supported through a maximum depth, a minimum number of
    samples required to split a node, and a minimum Gini decrease.

    Parameters:
        max_depth : int, default 10
            Maximum depth of the tree (limits overfitting).
        min_samples_split : int, default 2
            Minimum number of samples a node must contain to be split.
        min_gain_threshold : float, default 0.0
            Minimum decrease in Gini index (parent Gini minus weighted child
            Gini) a split must reach; otherwise the node becomes a leaf.
    """

    def __init__(self, max_depth=10, min_samples_split=2, min_gain_threshold=0.0):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_gain_threshold = min_gain_threshold
        self.tree = None  # root node (dict); populated by fit()

    @staticmethod
    def _make_leaf(y):
        """
        Build a leaf node that predicts the majority class of ``y``.

        Ties are resolved in favour of the smallest label, because
        ``np.unique`` returns sorted values and ``np.argmax`` picks the first
        maximum.
        """
        values, counts = np.unique(y, return_counts=True)
        return {'leaf': True, 'class': values[np.argmax(counts)]}

    def _gini(self, y):
        """
        Compute the Gini index of a label array.

        The Gini index measures impurity; smaller means purer.

        Parameters:
            y : ndarray
                Target labels.

        Returns:
            float : Gini index (0.0 for an empty array).
        """
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1 - np.sum(probs ** 2)

    def _gini_split(self, y_left, y_right):
        """
        Compute the weighted Gini index of a binary split.

        Each side's Gini index is weighted by its share of the samples.

        Parameters:
            y_left : ndarray
                Labels falling into the left branch.
            y_right : ndarray
                Labels falling into the right branch.

        Returns:
            float : weighted Gini index after the split.
        """
        n = len(y_left) + len(y_right)
        return (len(y_left) / n) * self._gini(y_left) + \
               (len(y_right) / n) * self._gini(y_right)

    def _best_split(self, X, y):
        """
        Find the feature and threshold giving the lowest weighted Gini index.

        For every feature, the candidate thresholds are the midpoints between
        adjacent sorted unique values of that feature.

        Parameters:
            X : ndarray, shape (n_samples, n_features)
                Feature matrix.
            y : ndarray, shape (n_samples,)
                Target labels.

        Returns:
            tuple : (best feature index, best threshold, its weighted Gini).
            The feature and threshold are None (and the Gini is ``inf``) when
            no valid split exists, e.g. every feature is constant.
        """
        best_gini = float('inf')
        best_feature = None
        best_threshold = None

        n_features = X.shape[1]

        for feature in range(n_features):
            column = X[:, feature]
            unique_values = np.unique(column)
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2

            for threshold in thresholds:
                left_mask = column <= threshold
                y_left = y[left_mask]
                y_right = y[~left_mask]

                # A midpoint can round onto one of its neighbours, which
                # would leave one side empty; such splits are skipped.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini = self._gini_split(y_left, y_right)

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold, best_gini

    def _build_tree(self, X, y, depth):
        """
        Recursively build the tree.

        Steps:
            1. Check the stopping conditions (depth limit, sample count,
               purity) and return a majority-class leaf if any holds.
            2. Otherwise search for the best split.
            3. Return a leaf if no split exists or its gain is insufficient.
            4. Recurse into the left and right partitions.

        Parameters:
            X : ndarray
                Feature matrix of the samples reaching this node.
            y : ndarray
                Labels of the samples reaching this node.
            depth : int
                Depth of the current node (root is 0).

        Returns:
            dict : tree node. Leaves are ``{'leaf': True, 'class': label}``;
            internal nodes hold 'feature', 'threshold', 'left' and 'right'.
        """
        n_samples = len(y)

        # Pre-pruning stopping conditions.
        if (depth >= self.max_depth or
                n_samples < self.min_samples_split or
                len(np.unique(y)) == 1):
            return self._make_leaf(y)

        feature, threshold, gini = self._best_split(X, y)

        # No usable split, or the Gini decrease is below the threshold.
        if feature is None or gini > self._gini(y) - self.min_gain_threshold:
            return self._make_leaf(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'leaf': False,
            'feature': feature,
            'threshold': threshold,
            'left': left_tree,
            'right': right_tree
        }

    def fit(self, X, y):
        """
        Train the decision tree.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Feature matrix (lists are converted to ndarrays).
            y : array-like, shape (n_samples,)
                Target labels.

        Returns:
            self : the fitted model.

        Raises:
            ValueError : if X is not 2-D, X and y have different numbers of
            samples, or the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")

        self.tree = self._build_tree(X, y, depth=0)
        return self

    def _predict_one(self, x, node):
        """
        Predict the class of a single sample.

        Walks down from ``node``, following the left branch when the sample's
        feature value is <= the node threshold and the right branch
        otherwise, until a leaf is reached.

        Parameters:
            x : ndarray
                Feature vector of one sample.
            node : dict
                Node to start from (normally the root).

        Returns:
            The class label stored in the reached leaf.
        """
        while not node['leaf']:
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['class']

    def predict(self, X):
        """
        Predict classes for a batch of samples.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Feature matrix.

        Returns:
            ndarray : predicted class labels.
        """
        X = np.asarray(X)
        return np.array([self._predict_one(x, self.tree) for x in X])

    def score(self, X, y):
        """
        Compute the accuracy on the given data.

        Parameters:
            X : array-like
                Feature matrix.
            y : array-like
                True class labels.

        Returns:
            float : fraction of samples classified correctly.
        """
        y_pred = self.predict(X)
        return np.mean(y_pred == np.asarray(y))
@@ -0,0 +1,88 @@
1
+ # RandomForestClassifier 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class RandomForestClassifier:
    """
    Random forest classifier.

    Implementation outline:
        1. Bootstrap sampling (random samples per tree).
        2. Training of several decision trees, each on its own bootstrap
           sample and with a feature-subset size passed to the tree.
        3. Majority-vote prediction (hard voting).

    Parameters:
        n_estimators : int, default 100
            Number of trees.
        max_depth : int, default 10
            Maximum depth of each tree.
        max_features : str or int, default 'sqrt'
            Number of features considered at each split: 'sqrt', 'log2', an
            explicit integer, or anything else to use all features.

    NOTE(review): ``DecisionTreeForRF`` is referenced by ``fit`` but is not
    defined or imported in the visible file; it must be supplied by the
    surrounding module/namespace - confirm.
    """

    def __init__(self, n_estimators=100, max_depth=10, max_features='sqrt'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def _resolve_max_features(self, n_features):
        """
        Translate ``self.max_features`` into a concrete feature count.

        'sqrt' and 'log2' are computed from ``n_features``; an integer is
        used as given; any other value selects all features. The result is
        clamped to the range [1, n_features] so that, for example, 'log2'
        with a single feature does not yield zero features.
        """
        setting = self.max_features
        if isinstance(setting, str):
            if setting == 'sqrt':
                count = int(np.sqrt(n_features))
            elif setting == 'log2':
                count = int(np.log2(n_features))
            else:
                count = n_features
        elif isinstance(setting, (int, np.integer)) and not isinstance(setting, bool):
            count = int(setting)
        else:
            count = n_features
        return max(1, min(count, n_features))

    def _bootstrap_sample(self, X, y):
        """
        Draw a bootstrap sample (sampling with replacement).

        Draws n row indices with replacement from the n original samples and
        returns the corresponding rows of X and y.
        """
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """
        Train the random forest.

        Steps:
            1. Determine the feature-subset size m.
            2. For each tree: draw a bootstrap sample and fit a tree on it.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Feature matrix (lists are converted to ndarrays).
            y : array-like, shape (n_samples,)
                Target labels.

        Returns:
            self : the fitted model.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        max_features = self._resolve_max_features(X.shape[1])

        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(X, y)

            # NOTE(review): DecisionTreeForRF is expected to exist in this
            # module's namespace; it is not visible in this file.
            tree = DecisionTreeForRF(
                max_depth=self.max_depth,
                max_features=max_features
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

        return self

    def predict(self, X):
        """
        Predict by majority vote (hard voting).

        Every tree predicts one class per sample; the class with the most
        votes wins. Ties go to the smallest label because ``np.unique``
        returns sorted values and ``np.argmax`` picks the first maximum.
        """
        X = np.asarray(X)
        # Row t holds the predictions of tree t for every sample.
        predictions = np.array([tree.predict(X) for tree in self.trees])
        result = []
        for i in range(X.shape[0]):
            values, counts = np.unique(predictions[:, i], return_counts=True)
            result.append(values[np.argmax(counts)])
        return np.array(result)

    def score(self, X, y):
        """Compute the accuracy of the forest on the given data."""
        return np.mean(self.predict(X) == np.asarray(y))
@@ -0,0 +1,88 @@
1
+ # RandomForestClassifier 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class RandomForestClassifier:
    """
    Random forest classifier.

    Implementation outline:
        1. Bootstrap sampling (random samples per tree).
        2. Training of several decision trees, each on its own bootstrap
           sample and with a feature-subset size passed to the tree.
        3. Majority-vote prediction (hard voting).

    Parameters:
        n_estimators : int, default 100
            Number of trees.
        max_depth : int, default 10
            Maximum depth of each tree.
        max_features : str or int, default 'sqrt'
            Number of features considered at each split: 'sqrt', 'log2', an
            explicit integer, or anything else to use all features.

    NOTE(review): ``DecisionTreeForRF`` is referenced by ``fit`` but is not
    defined or imported in the visible file; it must be supplied by the
    surrounding module/namespace - confirm.
    """

    def __init__(self, n_estimators=100, max_depth=10, max_features='sqrt'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def _resolve_max_features(self, n_features):
        """
        Translate ``self.max_features`` into a concrete feature count.

        'sqrt' and 'log2' are computed from ``n_features``; an integer is
        used as given; any other value selects all features. The result is
        clamped to the range [1, n_features] so that, for example, 'log2'
        with a single feature does not yield zero features.
        """
        setting = self.max_features
        if isinstance(setting, str):
            if setting == 'sqrt':
                count = int(np.sqrt(n_features))
            elif setting == 'log2':
                count = int(np.log2(n_features))
            else:
                count = n_features
        elif isinstance(setting, (int, np.integer)) and not isinstance(setting, bool):
            count = int(setting)
        else:
            count = n_features
        return max(1, min(count, n_features))

    def _bootstrap_sample(self, X, y):
        """
        Draw a bootstrap sample (sampling with replacement).

        Draws n row indices with replacement from the n original samples and
        returns the corresponding rows of X and y.
        """
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """
        Train the random forest.

        Steps:
            1. Determine the feature-subset size m.
            2. For each tree: draw a bootstrap sample and fit a tree on it.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Feature matrix (lists are converted to ndarrays).
            y : array-like, shape (n_samples,)
                Target labels.

        Returns:
            self : the fitted model.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        max_features = self._resolve_max_features(X.shape[1])

        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(X, y)

            # NOTE(review): DecisionTreeForRF is expected to exist in this
            # module's namespace; it is not visible in this file.
            tree = DecisionTreeForRF(
                max_depth=self.max_depth,
                max_features=max_features
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

        return self

    def predict(self, X):
        """
        Predict by majority vote (hard voting).

        Every tree predicts one class per sample; the class with the most
        votes wins. Ties go to the smallest label because ``np.unique``
        returns sorted values and ``np.argmax`` picks the first maximum.
        """
        X = np.asarray(X)
        # Row t holds the predictions of tree t for every sample.
        predictions = np.array([tree.predict(X) for tree in self.trees])
        result = []
        for i in range(X.shape[0]):
            values, counts = np.unique(predictions[:, i], return_counts=True)
            result.append(values[np.argmax(counts)])
        return np.array(result)

    def score(self, X, y):
        """Compute the accuracy of the forest on the given data."""
        return np.mean(self.predict(X) == np.asarray(y))
@@ -0,0 +1,5 @@
1
+ # Unsupervised learning module: re-exports the KMeans and PCA classes
2
+ from .kmeans import KMeans
3
+ from .pca import PCA
4
+
5
+ __all__ = ['KMeans', 'PCA']
@@ -0,0 +1,127 @@
1
+ # KMeans 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class KMeans:
    """
    K-means clustering.

    Parameters:
        n_clusters : int
            Number of clusters K.
        max_iter : int
            Maximum number of assign/update iterations per initialization.
        tol : float
            Convergence threshold; iteration stops once the largest absolute
            change of any center coordinate is below this value.
        n_init : int
            Number of random initializations; the run with the lowest
            inertia is kept.
    """

    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, n_init=10):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.n_init = n_init

        self.cluster_centers_ = None  # cluster centers of the best run
        self.labels_ = None           # cluster index of each training sample
        self.inertia_ = None          # objective value (sum of squared distances)

    def _init_centers(self, X):
        """
        Randomly initialize the cluster centers.

        Picks K distinct samples from the data as the initial centers.
        ``np.random.choice`` raises ValueError when K exceeds the number of
        samples.
        """
        indices = np.random.choice(len(X), self.n_clusters, replace=False)
        return X[indices].copy()

    def _assign_clusters(self, X, centers):
        """
        Assignment step: map every sample to its nearest center.

        Computes the squared distance from each sample to each center and
        returns the index of the closest center per sample.
        """
        distances = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            # Squared Euclidean distance ||x - mu_k||^2 to the k-th center.
            distances[:, k] = np.sum((X - centers[k]) ** 2, axis=1)
        return np.argmin(distances, axis=1)

    def _update_centers(self, X, labels):
        """
        Update step: recompute every cluster center.

        A center is the mean of the samples assigned to its cluster. An
        empty cluster is re-seeded with a randomly chosen sample.
        """
        centers = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            mask = labels == k
            if np.sum(mask) > 0:
                centers[k] = X[mask].mean(axis=0)
            else:
                # Rare empty-cluster case: re-initialize at a random sample.
                centers[k] = X[np.random.randint(len(X))]
        return centers

    def _compute_inertia(self, X, labels, centers):
        """
        Compute the objective value J.

        J is the sum over all samples of the squared distance to the center
        of the cluster the sample is assigned to.
        """
        inertia = 0
        for k in range(self.n_clusters):
            mask = labels == k
            inertia += np.sum((X[mask] - centers[k]) ** 2)
        return inertia

    def fit(self, X):
        """
        Fit the K-means model.

        Runs ``n_init`` random initializations and keeps the run with the
        lowest objective value. The stored labels are always the nearest-
        center assignment for the stored centers, including when
        ``max_iter`` is exhausted before convergence.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Training data (converted to a float ndarray).

        Returns:
            self : the fitted model.
        """
        X = np.asarray(X, dtype=float)

        best_inertia = float('inf')
        best_centers = None
        best_labels = None

        for _ in range(self.n_init):
            centers = self._init_centers(X)

            for _ in range(self.max_iter):
                labels = self._assign_clusters(X, centers)
                new_centers = self._update_centers(X, labels)

                # Converged: centers moved less than tol. ``labels`` already
                # matches ``centers``, so both are kept unchanged.
                if np.max(np.abs(new_centers - centers)) < self.tol:
                    break

                centers = new_centers
            else:
                # Iteration budget exhausted (or max_iter == 0): ``centers``
                # was updated after the last assignment, so re-assign to keep
                # labels, centers and inertia mutually consistent.
                labels = self._assign_clusters(X, centers)

            inertia = self._compute_inertia(X, labels, centers)

            if inertia < best_inertia:
                best_inertia = inertia
                best_centers = centers.copy()
                best_labels = labels.copy()

        self.cluster_centers_ = best_centers
        self.labels_ = best_labels
        self.inertia_ = best_inertia

        return self

    def predict(self, X):
        """
        Predict the cluster of new samples.

        Assigns each sample to the nearest of the fitted cluster centers.
        """
        X = np.asarray(X, dtype=float)
        return self._assign_clusters(X, self.cluster_centers_)
@@ -0,0 +1,127 @@
1
+ # KMeans 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class KMeans:
    """
    K-means clustering.

    Parameters:
        n_clusters : int
            Number of clusters K.
        max_iter : int
            Maximum number of assign/update iterations per initialization.
        tol : float
            Convergence threshold; iteration stops once the largest absolute
            change of any center coordinate is below this value.
        n_init : int
            Number of random initializations; the run with the lowest
            inertia is kept.
    """

    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, n_init=10):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.n_init = n_init

        self.cluster_centers_ = None  # cluster centers of the best run
        self.labels_ = None           # cluster index of each training sample
        self.inertia_ = None          # objective value (sum of squared distances)

    def _init_centers(self, X):
        """
        Randomly initialize the cluster centers.

        Picks K distinct samples from the data as the initial centers.
        ``np.random.choice`` raises ValueError when K exceeds the number of
        samples.
        """
        indices = np.random.choice(len(X), self.n_clusters, replace=False)
        return X[indices].copy()

    def _assign_clusters(self, X, centers):
        """
        Assignment step: map every sample to its nearest center.

        Computes the squared distance from each sample to each center and
        returns the index of the closest center per sample.
        """
        distances = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            # Squared Euclidean distance ||x - mu_k||^2 to the k-th center.
            distances[:, k] = np.sum((X - centers[k]) ** 2, axis=1)
        return np.argmin(distances, axis=1)

    def _update_centers(self, X, labels):
        """
        Update step: recompute every cluster center.

        A center is the mean of the samples assigned to its cluster. An
        empty cluster is re-seeded with a randomly chosen sample.
        """
        centers = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            mask = labels == k
            if np.sum(mask) > 0:
                centers[k] = X[mask].mean(axis=0)
            else:
                # Rare empty-cluster case: re-initialize at a random sample.
                centers[k] = X[np.random.randint(len(X))]
        return centers

    def _compute_inertia(self, X, labels, centers):
        """
        Compute the objective value J.

        J is the sum over all samples of the squared distance to the center
        of the cluster the sample is assigned to.
        """
        inertia = 0
        for k in range(self.n_clusters):
            mask = labels == k
            inertia += np.sum((X[mask] - centers[k]) ** 2)
        return inertia

    def fit(self, X):
        """
        Fit the K-means model.

        Runs ``n_init`` random initializations and keeps the run with the
        lowest objective value. The stored labels are always the nearest-
        center assignment for the stored centers, including when
        ``max_iter`` is exhausted before convergence.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Training data (converted to a float ndarray).

        Returns:
            self : the fitted model.
        """
        X = np.asarray(X, dtype=float)

        best_inertia = float('inf')
        best_centers = None
        best_labels = None

        for _ in range(self.n_init):
            centers = self._init_centers(X)

            for _ in range(self.max_iter):
                labels = self._assign_clusters(X, centers)
                new_centers = self._update_centers(X, labels)

                # Converged: centers moved less than tol. ``labels`` already
                # matches ``centers``, so both are kept unchanged.
                if np.max(np.abs(new_centers - centers)) < self.tol:
                    break

                centers = new_centers
            else:
                # Iteration budget exhausted (or max_iter == 0): ``centers``
                # was updated after the last assignment, so re-assign to keep
                # labels, centers and inertia mutually consistent.
                labels = self._assign_clusters(X, centers)

            inertia = self._compute_inertia(X, labels, centers)

            if inertia < best_inertia:
                best_inertia = inertia
                best_centers = centers.copy()
                best_labels = labels.copy()

        self.cluster_centers_ = best_centers
        self.labels_ = best_labels
        self.inertia_ = best_inertia

        return self

    def predict(self, X):
        """
        Predict the cluster of new samples.

        Assigns each sample to the nearest of the fitted cluster centers.
        """
        X = np.asarray(X, dtype=float)
        return self._assign_clusters(X, self.cluster_centers_)