@icyfenix-dmla/cli 2026.5.2-7 → 2026.5.3-1019

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +9 -6
  2. package/scripts/build.js +44 -11
  3. package/shared_modules/__init__.py +10 -0
  4. package/shared_modules/bayesian/__init__.py +6 -0
  5. package/shared_modules/bayesian/bayesian_network.py +105 -0
  6. package/shared_modules/bayesian/gaussian_mixture_model.py +141 -0
  7. package/shared_modules/bayesian/gaussian_mixturemodel.py +141 -0
  8. package/shared_modules/bayesian/multinomial_naive_bayes.py +74 -0
  9. package/shared_modules/bayesian/simple_bayesian_network.py +99 -0
  10. package/shared_modules/bayesian/simple_bayesiannetwork.py +99 -0
  11. package/shared_modules/cnn/__init__.py +5 -0
  12. package/shared_modules/cnn/alex_net.py +65 -0
  13. package/shared_modules/cnn/alexnet.py +65 -0
  14. package/shared_modules/cnn/t_e_r_m1.py +65 -0
  15. package/shared_modules/cnn/tiny_image_net_dataset.py +67 -0
  16. package/shared_modules/cnn/tiny_imagenet_dataset.py +67 -0
  17. package/shared_modules/cnn/tiny_imagenetdataset.py +67 -0
  18. package/shared_modules/cnn/tinyimagenetdataset.py +67 -0
  19. package/shared_modules/linear/__init__.py +6 -0
  20. package/shared_modules/linear/lasso_regression.py +93 -0
  21. package/shared_modules/linear/logistic_regression.py +78 -0
  22. package/shared_modules/linear/naive_bayes.py +141 -0
  23. package/shared_modules/linear/ridge_regression.py +58 -0
  24. package/shared_modules/neural/__init__.py +4 -0
  25. package/shared_modules/neural/perceptron.py +80 -0
  26. package/shared_modules/svm/__init__.py +5 -0
  27. package/shared_modules/svm/kernel_s_v_m.py +98 -0
  28. package/shared_modules/svm/kernel_svm.py +98 -0
  29. package/shared_modules/svm/simple_s_v_m.py +111 -0
  30. package/shared_modules/svm/simple_svm.py +111 -0
  31. package/shared_modules/tree/__init__.py +6 -0
  32. package/shared_modules/tree/ada_boost.py +77 -0
  33. package/shared_modules/tree/decision_tree_classifier.py +235 -0
  34. package/shared_modules/tree/decision_treeclassifier.py +235 -0
  35. package/shared_modules/tree/random_forest_classifier.py +88 -0
  36. package/shared_modules/tree/random_forestclassifier.py +88 -0
  37. package/shared_modules/unsupervised/__init__.py +5 -0
  38. package/shared_modules/unsupervised/k_means.py +127 -0
  39. package/shared_modules/unsupervised/kmeans.py +127 -0
  40. package/shared_modules/unsupervised/p_c_a.py +111 -0
  41. package/shared_modules/unsupervised/pca.py +111 -0
  42. package/src/commands/data.js +823 -0
  43. package/src/commands/server.js +209 -4
  44. package/src/index.js +23 -2
  45. package/src/server/routes/sandbox.js +70 -3
  46. package/src/server/sandbox.js +87 -11
  47. package/version.json +4 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@icyfenix-dmla/cli",
3
- "version": "2026.5.2-7",
3
+ "version": "2026.5.3-1019",
4
4
  "description": "DMLA 沙箱服务命令行工具",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
@@ -17,16 +17,19 @@
17
17
  "bin/",
18
18
  "src/",
19
19
  "scripts/",
20
+ "shared_modules/",
21
+ "version.json",
20
22
  "README.md"
21
23
  ],
22
24
  "dependencies": {
23
- "commander": "^12.1.0",
25
+ "@icyfenix-dmla/install": "*",
26
+ "adm-zip": "^0.5.17",
24
27
  "chalk": "^5.3.0",
25
- "enquirer": "^2.4.1",
26
- "dockerode": "^5.0.0",
27
- "express": "^4.21.2",
28
+ "commander": "^12.1.0",
28
29
  "cors": "^2.8.5",
29
- "@icyfenix-dmla/install": "*"
30
+ "dockerode": "^5.0.0",
31
+ "enquirer": "^2.4.1",
32
+ "express": "^4.21.2"
30
33
  },
31
34
  "devDependencies": {
32
35
  "jest": "^29.7.0"
package/scripts/build.js CHANGED
@@ -1,6 +1,6 @@
1
1
  /**
2
2
  * 构建脚本:将 local-server 代码复制到 CLI 包中
3
- * 用于 npm 发布时包含完整的服务器代码
3
+ * 用于 npm 发布时包含完整的服务器代码和共享模块
4
4
  */
5
5
  import fs from 'fs'
6
6
  import path from 'path'
@@ -11,17 +11,17 @@ const __dirname = path.dirname(__filename)
11
11
 
12
12
  const rootDir = path.resolve(__dirname, '../../..')
13
13
  const localServerSrc = path.resolve(rootDir, 'local-server/src')
14
+ const localServerShared = path.resolve(rootDir, 'local-server/shared_modules')
14
15
  const cliServerDest = path.resolve(__dirname, '../src/server')
16
+ const cliSharedDest = path.resolve(__dirname, '../shared_modules')
15
17
 
16
18
  console.log('📦 构建 CLI 包...')
17
- console.log(` 源目录: ${localServerSrc}`)
18
- console.log(` 目标目录: ${cliServerDest}`)
19
19
 
20
20
  // 递归复制目录
21
- function copyDir(src, dest) {
21
+ function copyDir(src, dest, filter = null) {
22
22
  if (!fs.existsSync(src)) {
23
23
  console.error(`❌ 源目录不存在: ${src}`)
24
- process.exit(1)
24
+ return false
25
25
  }
26
26
 
27
27
  // 创建目标目录
@@ -36,15 +36,48 @@ function copyDir(src, dest) {
36
36
  const destPath = path.join(dest, entry.name)
37
37
 
38
38
  if (entry.isDirectory()) {
39
- copyDir(srcPath, destPath)
40
- } else if (entry.isFile() && entry.name.endsWith('.js')) {
39
+ copyDir(srcPath, destPath, filter)
40
+ } else if (entry.isFile()) {
41
+ // 应用过滤器
42
+ if (filter && !filter(entry.name)) {
43
+ continue
44
+ }
41
45
  fs.copyFileSync(srcPath, destPath)
42
- console.log(` ✓ 复制: ${entry.name}`)
46
+ console.log(` ✓ ${path.relative(rootDir, srcPath)} → ${path.relative(__dirname, destPath)}`)
43
47
  }
44
48
  }
49
+ return true
45
50
  }
46
51
 
47
- // 执行复制
48
- copyDir(localServerSrc, cliServerDest)
52
+ // 复制服务器代码(只复制 .js 文件)
53
+ console.log('\n📋 复制服务器代码...')
54
+ console.log(` 源目录: ${localServerSrc}`)
55
+ console.log(` 目标目录: ${cliServerDest}`)
56
+ copyDir(localServerSrc, cliServerDest, (name) => name.endsWith('.js'))
57
+
58
+ // 复制共享模块(复制所有 .py 文件和 __init__.py)
59
+ console.log('\n📋 复制共享模块...')
60
+ console.log(` 源目录: ${localServerShared}`)
61
+ console.log(` 目标目录: ${cliSharedDest}`)
62
+ const sharedCopied = copyDir(localServerShared, cliSharedDest, (name) => {
63
+ // 复制 Python 文件和初始化文件
64
+ return name.endsWith('.py') || name === '__init__.py'
65
+ })
66
+
67
+ if (!sharedCopied) {
68
+ console.log('⚠️ 共享模块目录不存在,跳过')
69
+ }
70
+
71
+ // 创建版本信息文件(用于 --dev 模式的版本比较)
72
+ const versionInfo = {
73
+ buildTime: new Date().toISOString(),
74
+ // 从 package.json 读取版本
75
+ cliVersion: JSON.parse(fs.readFileSync(path.resolve(__dirname, '../package.json'), 'utf8')).version
76
+ }
77
+ fs.writeFileSync(
78
+ path.resolve(__dirname, '../version.json'),
79
+ JSON.stringify(versionInfo, null, 2)
80
+ )
81
+ console.log('\n✅ 版本信息已生成')
49
82
 
50
- console.log('✅ 服务器代码已复制到 CLI ')
83
+ console.log('\n✅ CLI 包构建完成')
@@ -0,0 +1,10 @@
1
+ # shared 模块包初始化
2
+ # 包含统计学习系列文档中可复用的类定义
3
+
4
+ from .linear import *
5
+ from .cnn import *
6
+ from .bayesian import *
7
+ from .tree import *
8
+ from .svm import *
9
+ from .unsupervised import *
10
+ from .neural import *
@@ -0,0 +1,6 @@
1
+ # Bayesian 模块
2
+ from .simple_bayesian_network import SimpleBayesianNetwork
3
+ from .gaussian_mixture_model import GaussianMixtureModel
4
+ from .multinomial_naive_bayes import MultinomialNaiveBayes
5
+
6
+ __all__ = ['SimpleBayesianNetwork', 'GaussianMixtureModel', 'MultinomialNaiveBayes']
@@ -0,0 +1,105 @@
1
+ """
2
+ 贝叶斯网络实现
3
+ 支持离散变量和精确推断(枚举法)
4
+ """
5
+
6
+ import numpy as np
7
+
8
+
9
class SimpleBayesianNetwork:
    """Discrete Bayesian network with exact inference by enumeration.

    Nodes are registered with ``add_node`` and their conditional
    probability tables (CPTs) with ``set_cpt``.  Nodes may be added in any
    order: the topological order is recomputed after every ``add_node``
    call and simply ignores parents that have not been registered yet.
    """

    def __init__(self):
        # Node info: {name: {'parents': [...], 'values': [...]}}
        self.nodes = {}
        # CPTs: {name: {parent_value_tuple: {value: prob}}}
        self.cpts = {}
        # Registered node names, parents before children.
        self.topo_order = []

    def add_node(self, name, values, parents=None):
        """Register a node.

        Parameters:
            name : hashable node identifier
            values : iterable of the discrete values the node can take
            parents : optional list of parent node names (default: none)
        """
        self.nodes[name] = {
            # Copy so that later mutation of the caller's list cannot
            # silently change the graph structure.
            'parents': list(parents) if parents is not None else [],
            'values': values,
        }
        self._update_topo_order()

    def set_cpt(self, name, cpt):
        """Set the conditional probability table of ``name``.

        ``cpt`` has the form ``{parent_value_tuple: {value: prob}}``; for a
        node without parents use ``{(): {value: prob}}``.
        """
        self.cpts[name] = cpt

    def _update_topo_order(self):
        """Recompute ``topo_order`` with a depth-first post-order walk."""
        visited = set()
        order = []

        def visit(node):
            # Parents that are referenced but not registered yet are
            # skipped; they are picked up when they are added, because the
            # order is rebuilt on every add_node call.
            if node in visited or node not in self.nodes:
                return
            visited.add(node)
            for parent in self.nodes[node]['parents']:
                visit(parent)
            order.append(node)

        for node in self.nodes:
            visit(node)

        self.topo_order = order

    def get_prob(self, name, value, parent_values):
        """Return P(name=value | parent_values), or 0 if the entry is absent.

        Raises KeyError if no CPT has been set for ``name``.
        """
        parent_key = tuple(parent_values) if parent_values else ()
        return self.cpts[name].get(parent_key, {}).get(value, 0)

    def joint_prob(self, assignment):
        """Return the joint probability of a full assignment.

        ``assignment`` maps every registered node name to a value; the
        result is the product of each node's conditional probability given
        its parents' assigned values.
        """
        prob = 1.0
        for node in self.topo_order:
            parent_values = [assignment[p] for p in self.nodes[node]['parents']]
            prob *= self.get_prob(node, assignment[node], parent_values)
        return prob

    def enumerate_inference(self, query, evidence):
        """Compute P(query | evidence) by summing out hidden variables.

        Parameters:
            query : dict ``{node: '?'}``; only a single query node is
                supported (the first key is used)
            evidence : dict ``{node: observed_value}``

        Returns:
            dict mapping each value of the query node to its posterior
            probability.

        Raises:
            ZeroDivisionError: if the evidence has zero probability, so the
                posterior cannot be normalised.
        """
        from itertools import product

        query_nodes = list(query.keys())
        query_node = query_nodes[0]
        hidden = [
            n for n in self.nodes
            if n not in query_nodes and n not in evidence
        ]
        hidden_domains = [self.nodes[h]['values'] for h in hidden]

        query_values = {}
        total = 0.0
        for qv in self.nodes[query_node]['values']:
            prob_sum = 0.0
            # product() yields one empty tuple when there are no hidden
            # variables, so the evidence-only case is handled uniformly.
            for combo in product(*hidden_domains):
                assignment = dict(zip(hidden, combo))
                assignment.update(evidence)
                assignment[query_node] = qv
                prob_sum += self.joint_prob(assignment)
            query_values[qv] = prob_sum
            total += prob_sum

        if total == 0:
            raise ZeroDivisionError(
                "evidence has zero probability; posterior is undefined"
            )

        # Normalise the unnormalised posterior.
        return {value: p / total for value, p in query_values.items()}
@@ -0,0 +1,141 @@
1
+ # GaussianMixtureModel 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class GaussianMixtureModel:
    """Gaussian mixture model fitted with the EM algorithm.

    Parameters:
        n_components : number of mixture components K
        max_iter : maximum number of EM iterations
        tol : convergence threshold on the change in log-likelihood
    """

    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol

        self.weights_ = None       # mixing coefficients, shape (K,)
        self.means_ = None         # component means, shape (K, n_features)
        self.covariances_ = None   # shape (K, n_features, n_features)
        self.log_likelihood_history_ = []

    def _initialize(self, X):
        """Initialise means, covariances and weights from the data."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Means: K distinct data points chosen at random.
        indices = np.random.choice(n_samples, K, replace=False)
        self.means_ = X[indices].copy()

        # Covariances: diagonal of the data covariance plus a small ridge.
        # np.cov returns a 0-d array for single-feature data, so promote it
        # to 2-d before taking the diagonal.
        data_cov = np.atleast_2d(np.cov(X.T))
        base_cov = np.diag(np.diag(data_cov)) + 1e-6 * np.eye(n_features)
        self.covariances_ = np.tile(base_cov, (K, 1, 1))

        # Weights: uniform.
        self.weights_ = np.ones(K) / K

    def _gaussian_pdf(self, X, mean, cov):
        """Return the multivariate Gaussian LOG-density of each row of X.

        Despite the name, the value returned is log N(x | mean, cov), which
        is what the E-step needs.
        """
        n_features = X.shape[1]
        diff = X - mean

        # Small ridge for numerical stability.
        cov_reg = cov + 1e-6 * np.eye(n_features)

        try:
            # Cholesky gives both the log-determinant and the Mahalanobis
            # distance without forming an explicit inverse.
            chol = np.linalg.cholesky(cov_reg)
            log_det = 2 * np.sum(np.log(np.diag(chol)))
            whitened = np.linalg.solve(chol, diff.T).T
            mahalanobis = np.sum(whitened ** 2, axis=1)
        except np.linalg.LinAlgError:
            # Fall back to the generic route if Cholesky fails.
            _, log_det = np.linalg.slogdet(cov_reg)
            cov_inv = np.linalg.inv(cov_reg)
            mahalanobis = np.sum(diff @ cov_inv * diff, axis=1)

        return -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)

    def _e_step(self, X):
        """E-step: return (responsibilities, total log-likelihood)."""
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        K = self.n_components

        # Weighted log-density of every sample under every component.
        log_probs = np.zeros((n_samples, K))
        for k in range(K):
            log_probs[:, k] = (
                np.log(self.weights_[k] + 1e-10)
                + self._gaussian_pdf(X, self.means_[k], self.covariances_[k])
            )

        # log-sum-exp: subtracting the row maximum keeps exp() from
        # underflowing to zero for samples far away from every component.
        log_max = log_probs.max(axis=1, keepdims=True)
        log_sum = log_max + np.log(
            np.sum(np.exp(log_probs - log_max), axis=1, keepdims=True)
        )
        responsibilities = np.exp(log_probs - log_sum)

        # Reuse the stable per-sample log-marginal for the log-likelihood
        # instead of a naive log(sum(exp(.))) that can yield -inf.
        log_likelihood = np.sum(log_sum)

        return responsibilities, log_likelihood

    def _m_step(self, X, responsibilities):
        """M-step: update weights, means and covariances in place."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Effective number of samples per component (guarded against 0).
        N_k = responsibilities.sum(axis=0) + 1e-10

        self.weights_ = N_k / n_samples
        self.means_ = (responsibilities.T @ X) / N_k[:, np.newaxis]

        for k in range(K):
            diff = X - self.means_[k]
            weighted_diff = responsibilities[:, k:k + 1] * diff
            self.covariances_[k] = (weighted_diff.T @ diff) / N_k[k]
            # Ridge keeps the covariance positive definite.
            self.covariances_[k] += 1e-6 * np.eye(n_features)

    def fit(self, X):
        """Fit the model to X of shape (n_samples, n_features); return self."""
        X = np.asarray(X, dtype=float)
        self._initialize(X)
        self.log_likelihood_history_ = []

        prev_log_likelihood = -np.inf

        for iteration in range(self.max_iter):
            responsibilities, log_likelihood = self._e_step(X)
            self.log_likelihood_history_.append(log_likelihood)

            # Stop once the log-likelihood no longer changes meaningfully.
            if abs(log_likelihood - prev_log_likelihood) < self.tol:
                print(f"EM收敛于第{iteration}次迭代")
                break

            self._m_step(X, responsibilities)
            prev_log_likelihood = log_likelihood

        return self

    def predict(self, X):
        """Return the index of the most responsible component per sample."""
        responsibilities, _ = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

    def predict_proba(self, X):
        """Return the responsibilities, shape (n_samples, K)."""
        responsibilities, _ = self._e_step(X)
        return responsibilities

    def score(self, X):
        """Return the total log-likelihood of X under the fitted model."""
        _, log_likelihood = self._e_step(X)
        return log_likelihood
@@ -0,0 +1,141 @@
1
+ # GaussianMixtureModel 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class GaussianMixtureModel:
    """Gaussian mixture model fitted with the EM algorithm.

    Parameters:
        n_components : number of mixture components K
        max_iter : maximum number of EM iterations
        tol : convergence threshold on the change in log-likelihood
    """

    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol

        self.weights_ = None       # mixing coefficients, shape (K,)
        self.means_ = None         # component means, shape (K, n_features)
        self.covariances_ = None   # shape (K, n_features, n_features)
        self.log_likelihood_history_ = []

    def _initialize(self, X):
        """Initialise means, covariances and weights from the data."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Means: K distinct data points chosen at random.
        indices = np.random.choice(n_samples, K, replace=False)
        self.means_ = X[indices].copy()

        # Covariances: diagonal of the data covariance plus a small ridge.
        # np.cov returns a 0-d array for single-feature data, so promote it
        # to 2-d before taking the diagonal.
        data_cov = np.atleast_2d(np.cov(X.T))
        base_cov = np.diag(np.diag(data_cov)) + 1e-6 * np.eye(n_features)
        self.covariances_ = np.tile(base_cov, (K, 1, 1))

        # Weights: uniform.
        self.weights_ = np.ones(K) / K

    def _gaussian_pdf(self, X, mean, cov):
        """Return the multivariate Gaussian LOG-density of each row of X.

        Despite the name, the value returned is log N(x | mean, cov), which
        is what the E-step needs.
        """
        n_features = X.shape[1]
        diff = X - mean

        # Small ridge for numerical stability.
        cov_reg = cov + 1e-6 * np.eye(n_features)

        try:
            # Cholesky gives both the log-determinant and the Mahalanobis
            # distance without forming an explicit inverse.
            chol = np.linalg.cholesky(cov_reg)
            log_det = 2 * np.sum(np.log(np.diag(chol)))
            whitened = np.linalg.solve(chol, diff.T).T
            mahalanobis = np.sum(whitened ** 2, axis=1)
        except np.linalg.LinAlgError:
            # Fall back to the generic route if Cholesky fails.
            _, log_det = np.linalg.slogdet(cov_reg)
            cov_inv = np.linalg.inv(cov_reg)
            mahalanobis = np.sum(diff @ cov_inv * diff, axis=1)

        return -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)

    def _e_step(self, X):
        """E-step: return (responsibilities, total log-likelihood)."""
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        K = self.n_components

        # Weighted log-density of every sample under every component.
        log_probs = np.zeros((n_samples, K))
        for k in range(K):
            log_probs[:, k] = (
                np.log(self.weights_[k] + 1e-10)
                + self._gaussian_pdf(X, self.means_[k], self.covariances_[k])
            )

        # log-sum-exp: subtracting the row maximum keeps exp() from
        # underflowing to zero for samples far away from every component.
        log_max = log_probs.max(axis=1, keepdims=True)
        log_sum = log_max + np.log(
            np.sum(np.exp(log_probs - log_max), axis=1, keepdims=True)
        )
        responsibilities = np.exp(log_probs - log_sum)

        # Reuse the stable per-sample log-marginal for the log-likelihood
        # instead of a naive log(sum(exp(.))) that can yield -inf.
        log_likelihood = np.sum(log_sum)

        return responsibilities, log_likelihood

    def _m_step(self, X, responsibilities):
        """M-step: update weights, means and covariances in place."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Effective number of samples per component (guarded against 0).
        N_k = responsibilities.sum(axis=0) + 1e-10

        self.weights_ = N_k / n_samples
        self.means_ = (responsibilities.T @ X) / N_k[:, np.newaxis]

        for k in range(K):
            diff = X - self.means_[k]
            weighted_diff = responsibilities[:, k:k + 1] * diff
            self.covariances_[k] = (weighted_diff.T @ diff) / N_k[k]
            # Ridge keeps the covariance positive definite.
            self.covariances_[k] += 1e-6 * np.eye(n_features)

    def fit(self, X):
        """Fit the model to X of shape (n_samples, n_features); return self."""
        X = np.asarray(X, dtype=float)
        self._initialize(X)
        self.log_likelihood_history_ = []

        prev_log_likelihood = -np.inf

        for iteration in range(self.max_iter):
            responsibilities, log_likelihood = self._e_step(X)
            self.log_likelihood_history_.append(log_likelihood)

            # Stop once the log-likelihood no longer changes meaningfully.
            if abs(log_likelihood - prev_log_likelihood) < self.tol:
                print(f"EM收敛于第{iteration}次迭代")
                break

            self._m_step(X, responsibilities)
            prev_log_likelihood = log_likelihood

        return self

    def predict(self, X):
        """Return the index of the most responsible component per sample."""
        responsibilities, _ = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

    def predict_proba(self, X):
        """Return the responsibilities, shape (n_samples, K)."""
        responsibilities, _ = self._e_step(X)
        return responsibilities

    def score(self, X):
        """Return the total log-likelihood of X under the fitted model."""
        _, log_likelihood = self._e_step(X)
        return log_likelihood
@@ -0,0 +1,74 @@
1
+ # MultinomialNaiveBayes 类定义
2
+ # 从文档自动提取生成
3
+
4
+ import numpy as np
5
+
6
class MultinomialNaiveBayes:
    """Multinomial naive Bayes classifier.

    Suited to discrete count features such as word frequencies.
    """

    # Lower bound applied to the smoothing term so that alpha=0 cannot
    # produce zero probabilities (log(0) = -inf, and 0 * -inf = nan).
    _MIN_ALPHA = 1e-10

    def __init__(self, alpha=1.0):
        """
        Parameters:
            alpha : float, additive (Laplace) smoothing parameter
        """
        self.alpha = alpha
        self.class_prior_ = None   # P(y), shape (n_classes,)
        self.feature_prob_ = None  # P(x|y), shape (n_classes, n_features)
        self.classes_ = None       # sorted unique class labels

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Feature matrix of counts / frequencies.
            y : array-like, shape (n_samples,)
                Class labels.

        Returns:
            self
        """
        # Accept plain nested lists as well as ndarrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Prior P(y) from the class frequencies.
        self.classes_, class_counts = np.unique(y, return_counts=True)
        self.class_prior_ = class_counts / n_samples

        # Likelihood P(x|y): smoothed per-class feature totals, normalised
        # so each class's feature probabilities sum to one.
        alpha = max(self.alpha, self._MIN_ALPHA)
        self.feature_prob_ = np.zeros((len(self.classes_), n_features))
        for i, c in enumerate(self.classes_):
            feature_counts = X[y == c].sum(axis=0) + alpha
            self.feature_prob_[i] = feature_counts / feature_counts.sum()

        return self

    def predict_log_proba(self, X):
        """Return the joint log score log P(y) + sum_j x_j * log P(x_j|y).

        The scores are NOT normalised over classes; they are only
        comparable within a row.  Shape: (n_samples, n_classes).
        """
        X = np.asarray(X, dtype=float)
        log_prior = np.log(self.class_prior_)
        log_likelihood = X @ np.log(self.feature_prob_.T)
        return log_prior + log_likelihood

    def predict(self, X):
        """Return the highest-scoring class label for each row of X."""
        log_proba = self.predict_log_proba(X)
        return self.classes_[np.argmax(log_proba, axis=1)]

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against labels ``y``."""
        y_pred = self.predict(X)
        return np.mean(y_pred == np.asarray(y))