@icyfenix-dmla/cli 2026.5.2-7 → 2026.5.3-726
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +9 -6
- package/scripts/build.js +44 -11
- package/shared_modules/__init__.py +10 -0
- package/shared_modules/bayesian/__init__.py +6 -0
- package/shared_modules/bayesian/bayesian_network.py +105 -0
- package/shared_modules/bayesian/gaussian_mixture_model.py +141 -0
- package/shared_modules/bayesian/gaussian_mixturemodel.py +141 -0
- package/shared_modules/bayesian/multinomial_naive_bayes.py +74 -0
- package/shared_modules/bayesian/simple_bayesian_network.py +99 -0
- package/shared_modules/bayesian/simple_bayesiannetwork.py +99 -0
- package/shared_modules/cnn/__init__.py +5 -0
- package/shared_modules/cnn/alex_net.py +65 -0
- package/shared_modules/cnn/alexnet.py +65 -0
- package/shared_modules/cnn/t_e_r_m1.py +65 -0
- package/shared_modules/cnn/tiny_image_net_dataset.py +67 -0
- package/shared_modules/cnn/tiny_imagenet_dataset.py +67 -0
- package/shared_modules/cnn/tiny_imagenetdataset.py +67 -0
- package/shared_modules/cnn/tinyimagenetdataset.py +67 -0
- package/shared_modules/linear/__init__.py +6 -0
- package/shared_modules/linear/lasso_regression.py +93 -0
- package/shared_modules/linear/logistic_regression.py +78 -0
- package/shared_modules/linear/naive_bayes.py +141 -0
- package/shared_modules/linear/ridge_regression.py +58 -0
- package/shared_modules/neural/__init__.py +4 -0
- package/shared_modules/neural/perceptron.py +80 -0
- package/shared_modules/svm/__init__.py +5 -0
- package/shared_modules/svm/kernel_s_v_m.py +98 -0
- package/shared_modules/svm/kernel_svm.py +98 -0
- package/shared_modules/svm/simple_s_v_m.py +111 -0
- package/shared_modules/svm/simple_svm.py +111 -0
- package/shared_modules/tree/__init__.py +6 -0
- package/shared_modules/tree/ada_boost.py +77 -0
- package/shared_modules/tree/decision_tree_classifier.py +235 -0
- package/shared_modules/tree/decision_treeclassifier.py +235 -0
- package/shared_modules/tree/random_forest_classifier.py +88 -0
- package/shared_modules/tree/random_forestclassifier.py +88 -0
- package/shared_modules/unsupervised/__init__.py +5 -0
- package/shared_modules/unsupervised/k_means.py +127 -0
- package/shared_modules/unsupervised/kmeans.py +127 -0
- package/shared_modules/unsupervised/p_c_a.py +111 -0
- package/shared_modules/unsupervised/pca.py +111 -0
- package/src/commands/data.js +823 -0
- package/src/commands/server.js +209 -4
- package/src/index.js +23 -2
- package/src/server/routes/sandbox.js +70 -3
- package/src/server/sandbox.js +84 -9
- package/version.json +4 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@icyfenix-dmla/cli",
|
|
3
|
-
"version": "2026.5.
|
|
3
|
+
"version": "2026.5.3-726",
|
|
4
4
|
"description": "DMLA 沙箱服务命令行工具",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.js",
|
|
@@ -17,16 +17,19 @@
|
|
|
17
17
|
"bin/",
|
|
18
18
|
"src/",
|
|
19
19
|
"scripts/",
|
|
20
|
+
"shared_modules/",
|
|
21
|
+
"version.json",
|
|
20
22
|
"README.md"
|
|
21
23
|
],
|
|
22
24
|
"dependencies": {
|
|
23
|
-
"
|
|
25
|
+
"@icyfenix-dmla/install": "*",
|
|
26
|
+
"adm-zip": "^0.5.17",
|
|
24
27
|
"chalk": "^5.3.0",
|
|
25
|
-
"
|
|
26
|
-
"dockerode": "^5.0.0",
|
|
27
|
-
"express": "^4.21.2",
|
|
28
|
+
"commander": "^12.1.0",
|
|
28
29
|
"cors": "^2.8.5",
|
|
29
|
-
"
|
|
30
|
+
"dockerode": "^5.0.0",
|
|
31
|
+
"enquirer": "^2.4.1",
|
|
32
|
+
"express": "^4.21.2"
|
|
30
33
|
},
|
|
31
34
|
"devDependencies": {
|
|
32
35
|
"jest": "^29.7.0"
|
package/scripts/build.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* 构建脚本:将 local-server 代码复制到 CLI 包中
|
|
3
|
-
* 用于 npm
|
|
3
|
+
* 用于 npm 发布时包含完整的服务器代码和共享模块
|
|
4
4
|
*/
|
|
5
5
|
import fs from 'fs'
|
|
6
6
|
import path from 'path'
|
|
@@ -11,17 +11,17 @@ const __dirname = path.dirname(__filename)
|
|
|
11
11
|
|
|
12
12
|
const rootDir = path.resolve(__dirname, '../../..')
|
|
13
13
|
const localServerSrc = path.resolve(rootDir, 'local-server/src')
|
|
14
|
+
const localServerShared = path.resolve(rootDir, 'local-server/shared_modules')
|
|
14
15
|
const cliServerDest = path.resolve(__dirname, '../src/server')
|
|
16
|
+
const cliSharedDest = path.resolve(__dirname, '../shared_modules')
|
|
15
17
|
|
|
16
18
|
console.log('📦 构建 CLI 包...')
|
|
17
|
-
console.log(` 源目录: ${localServerSrc}`)
|
|
18
|
-
console.log(` 目标目录: ${cliServerDest}`)
|
|
19
19
|
|
|
20
20
|
// 递归复制目录
|
|
21
|
-
function copyDir(src, dest) {
|
|
21
|
+
function copyDir(src, dest, filter = null) {
|
|
22
22
|
if (!fs.existsSync(src)) {
|
|
23
23
|
console.error(`❌ 源目录不存在: ${src}`)
|
|
24
|
-
|
|
24
|
+
return false
|
|
25
25
|
}
|
|
26
26
|
|
|
27
27
|
// 创建目标目录
|
|
@@ -36,15 +36,48 @@ function copyDir(src, dest) {
|
|
|
36
36
|
const destPath = path.join(dest, entry.name)
|
|
37
37
|
|
|
38
38
|
if (entry.isDirectory()) {
|
|
39
|
-
copyDir(srcPath, destPath)
|
|
40
|
-
} else if (entry.isFile()
|
|
39
|
+
copyDir(srcPath, destPath, filter)
|
|
40
|
+
} else if (entry.isFile()) {
|
|
41
|
+
// 应用过滤器
|
|
42
|
+
if (filter && !filter(entry.name)) {
|
|
43
|
+
continue
|
|
44
|
+
}
|
|
41
45
|
fs.copyFileSync(srcPath, destPath)
|
|
42
|
-
console.log(` ✓
|
|
46
|
+
console.log(` ✓ ${path.relative(rootDir, srcPath)} → ${path.relative(__dirname, destPath)}`)
|
|
43
47
|
}
|
|
44
48
|
}
|
|
49
|
+
return true
|
|
45
50
|
}
|
|
46
51
|
|
|
47
|
-
//
|
|
48
|
-
|
|
52
|
+
// Copy the server code (only .js files). The CLI cannot work without it, so a
// missing source directory aborts the build instead of being silently ignored.
console.log('\n📋 复制服务器代码...')
console.log(` 源目录: ${localServerSrc}`)
console.log(` 目标目录: ${cliServerDest}`)
const serverCopied = copyDir(localServerSrc, cliServerDest, (name) => name.endsWith('.js'))

if (!serverCopied) {
  console.error('❌ 服务器代码复制失败,构建中止')
  process.exit(1)
}

// Copy the shared Python modules. `__init__.py` already ends with `.py`, so a
// single suffix check covers both regular modules and package initialisers.
console.log('\n📋 复制共享模块...')
console.log(` 源目录: ${localServerShared}`)
console.log(` 目标目录: ${cliSharedDest}`)
const sharedCopied = copyDir(localServerShared, cliSharedDest, (name) => name.endsWith('.py'))

// Shared modules are optional: warn and continue when the directory is absent.
if (!sharedCopied) {
  console.log('⚠️ 共享模块目录不存在,跳过')
}

// Write the version info file (used for version comparison in --dev mode).
const packageJsonPath = path.resolve(__dirname, '../package.json')
const versionInfo = {
  buildTime: new Date().toISOString(),
  // The CLI version is read from package.json
  cliVersion: JSON.parse(fs.readFileSync(packageJsonPath, 'utf8')).version
}
fs.writeFileSync(
  path.resolve(__dirname, '../version.json'),
  JSON.stringify(versionInfo, null, 2)
)
console.log('\n✅ 版本信息已生成')
|
|
49
82
|
|
|
50
|
-
console.log('✅
|
|
83
|
+
console.log('\n✅ CLI 包构建完成')
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
# Bayesian 模块
|
|
2
|
+
from .simple_bayesian_network import SimpleBayesianNetwork
|
|
3
|
+
from .gaussian_mixture_model import GaussianMixtureModel
|
|
4
|
+
from .multinomial_naive_bayes import MultinomialNaiveBayes
|
|
5
|
+
|
|
6
|
+
__all__ = ['SimpleBayesianNetwork', 'GaussianMixtureModel', 'MultinomialNaiveBayes']
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
贝叶斯网络实现
|
|
3
|
+
支持离散变量和精确推断(枚举法)
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SimpleBayesianNetwork:
    """
    Minimal Bayesian network over discrete variables.

    Nodes are registered with ``add_node`` and their conditional probability
    tables (CPTs) with ``set_cpt``. Exact inference is done by enumerating
    every assignment of the hidden variables.
    """

    def __init__(self):
        # Node metadata: {name: {'parents': [...], 'values': [...]}}
        self.nodes = {}
        # Conditional probability tables: {name: {parent_values: {value: prob}}}
        self.cpts = {}
        # Node names ordered so that registered parents precede their children
        self.topo_order = []

    def add_node(self, name, values, parents=None):
        """
        Register a node and refresh the topological order.

        Parameters:
            name : hashable node identifier
            values : list of values the node can take
            parents : list of parent node names (default: no parents)
        """
        if parents is None:
            parents = []
        self.nodes[name] = {'parents': parents, 'values': values}
        self._update_topo_order()

    def set_cpt(self, name, cpt):
        """
        Set the conditional probability table of a node.

        cpt format: {parent_value_tuple: {value: prob}}
        For a node without parents: {(): {value: prob}}
        """
        self.cpts[name] = cpt

    def _update_topo_order(self):
        """Recompute ``topo_order`` with a depth-first traversal over parents."""
        visited = set()
        order = []

        def visit(node):
            if node in visited:
                return
            visited.add(node)
            for parent in self.nodes[node]['parents']:
                # A child may be registered before its parent; skip parents
                # that are not known yet. The order is rebuilt on every
                # add_node call, so they are placed once they are added.
                if parent in self.nodes:
                    visit(parent)
            order.append(node)

        for node in self.nodes:
            visit(node)

        self.topo_order = order

    def get_prob(self, name, value, parent_values):
        """
        Return P(name=value | parent_values).

        Missing CPT rows or values yield probability 0.
        """
        parent_key = tuple(parent_values) if parent_values else ()
        return self.cpts[name].get(parent_key, {}).get(value, 0)

    def joint_prob(self, assignment):
        """
        Return the joint probability of a full assignment {node: value}.

        The result is the product of each node's conditional probability
        given the values assigned to its parents.
        """
        prob = 1.0
        for node in self.topo_order:
            parents = self.nodes[node]['parents']
            parent_values = [assignment[p] for p in parents]
            value = assignment[node]
            prob *= self.get_prob(node, value, parent_values)
        return prob

    def enumerate_inference(self, query, evidence):
        """
        Inference by enumeration: compute P(query | evidence).

        Parameters:
            query : {node: '?'}; the distribution of the first key is returned
            evidence : {node: value}

        Returns:
            dict mapping each value of the query node to its posterior
            probability.

        Raises:
            ValueError : if ``query`` is empty, or names additional nodes that
                are not fixed by ``evidence`` (only one query node is supported)
            ZeroDivisionError : if the evidence has zero probability, so the
                posterior cannot be normalised
        """
        from itertools import product

        query_nodes = list(query.keys())
        if not query_nodes:
            raise ValueError("query must contain exactly one node")
        query_node = query_nodes[0]

        # Extra query nodes only make sense when evidence already fixes them;
        # otherwise they would be left unassigned in the joint probability.
        unsupported = [n for n in query_nodes[1:] if n not in evidence]
        if unsupported:
            raise ValueError(
                "only a single query node is supported; unexpected: %r" % (unsupported,)
            )

        hidden = [n for n in self.nodes if n not in query_nodes and n not in evidence]
        hidden_domains = [self.nodes[n]['values'] for n in hidden]

        query_values = {}
        total = 0.0

        for qv in self.nodes[query_node]['values']:
            prob_sum = 0.0
            # Sum the joint probability over every assignment of hidden nodes
            for combo in product(*hidden_domains):
                assignment = dict(zip(hidden, combo))
                assignment.update(evidence)
                assignment[query_node] = qv
                prob_sum += self.joint_prob(assignment)
            query_values[qv] = prob_sum
            total += prob_sum

        if total == 0:
            raise ZeroDivisionError(
                "evidence has zero probability; posterior is undefined"
            )

        # Normalise
        for k in query_values:
            query_values[k] /= total
        return query_values
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# GaussianMixtureModel 类定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class GaussianMixtureModel:
    """
    Gaussian mixture model with full covariance matrices, fitted with the
    EM algorithm.

    Parameters:
        n_components : int, number of mixture components K
        max_iter : int, maximum number of EM iterations
        tol : float, stop when the log-likelihood changes by less than this
    """

    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol  # convergence threshold

        self.weights_ = None  # mixing coefficients (K,)
        self.means_ = None  # means (K, n_features)
        self.covariances_ = None  # covariance matrices (K, n_features, n_features)
        self.log_likelihood_history_ = []

    def _initialize(self, X):
        """Initialise means, covariances and weights from the data."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Means: K distinct samples drawn at random from the data
        indices = np.random.choice(n_samples, K, replace=False)
        self.means_ = X[indices].copy()

        # Covariances: the diagonal of the data covariance. np.cov returns a
        # 0-d array for single-feature data, so force it to 2-D first.
        data_cov = np.atleast_2d(np.cov(X.T))
        base_cov = np.diag(np.diag(data_cov)) + 1e-6 * np.eye(n_features)
        self.covariances_ = np.array([base_cov.copy() for _ in range(K)])

        # Mixing coefficients: uniform
        self.weights_ = np.ones(K) / K

    def _gaussian_pdf(self, X, mean, cov):
        """
        Return the log-density of each row of X under N(mean, cov).

        Despite the name, the returned values are log-probabilities,
        shape (n_samples,).
        """
        n_features = X.shape[1]
        diff = X - mean

        # Small ridge for numerical stability
        cov_reg = cov + 1e-6 * np.eye(n_features)

        # Use the Cholesky factor for the determinant and Mahalanobis distance
        try:
            L = np.linalg.cholesky(cov_reg)
            log_det = 2 * np.sum(np.log(np.diag(L)))
            diff_L = np.linalg.solve(L, diff.T).T
            mahalanobis = np.sum(diff_L ** 2, axis=1)
        except np.linalg.LinAlgError:
            # Cholesky failed (matrix not positive definite): fall back to
            # slogdet and an explicit inverse
            _, log_det = np.linalg.slogdet(cov_reg)
            cov_inv = np.linalg.inv(cov_reg)
            mahalanobis = np.sum(diff @ cov_inv * diff, axis=1)

        log_prob = -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)
        return log_prob

    def _e_step(self, X):
        """
        E-step: compute responsibilities and the total log-likelihood.

        Returns:
            responsibilities : ndarray (n_samples, K), rows sum to 1
            log_likelihood : float, sum over samples of log p(x)
        """
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        K = self.n_components

        # Weighted log-density of every sample under every component
        log_probs = np.zeros((n_samples, K))
        for k in range(K):
            log_probs[:, k] = (np.log(self.weights_[k] + 1e-10) +
                               self._gaussian_pdf(X, self.means_[k], self.covariances_[k]))

        # log-sum-exp trick: subtracting the row maximum keeps exp() from
        # underflowing to zero for samples far from every component
        log_max = log_probs.max(axis=1, keepdims=True)
        log_sum = np.log(np.sum(np.exp(log_probs - log_max), axis=1, keepdims=True)) + log_max
        responsibilities = np.exp(log_probs - log_sum)

        # Reuse the stable per-sample log p(x) for the log-likelihood; a naive
        # log(sum(exp(.))) yields -inf on underflow and breaks convergence.
        log_likelihood = np.sum(log_sum)

        return responsibilities, log_likelihood

    def _m_step(self, X, responsibilities):
        """M-step: update weights, means and covariances in place."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Effective number of samples per component (epsilon avoids 0-division)
        N_k = responsibilities.sum(axis=0) + 1e-10

        # Mixing coefficients
        self.weights_ = N_k / n_samples

        # Means
        self.means_ = (responsibilities.T @ X) / N_k[:, np.newaxis]

        # Covariances
        for k in range(K):
            diff = X - self.means_[k]
            weighted_diff = responsibilities[:, k:k+1] * diff
            self.covariances_[k] = (weighted_diff.T @ diff) / N_k[k]
            # Regularise to keep the matrix positive definite
            self.covariances_[k] += 1e-6 * np.eye(n_features)

    def fit(self, X):
        """
        Fit the model with EM.

        Parameters:
            X : array-like, shape (n_samples, n_features)

        Returns:
            self
        """
        X = np.asarray(X, dtype=float)
        self._initialize(X)
        self.log_likelihood_history_ = []

        prev_log_likelihood = -np.inf

        for iteration in range(self.max_iter):
            # E-step
            responsibilities, log_likelihood = self._e_step(X)
            self.log_likelihood_history_.append(log_likelihood)

            # Convergence check
            if abs(log_likelihood - prev_log_likelihood) < self.tol:
                print(f"EM收敛于第{iteration}次迭代")
                break

            # M-step
            self._m_step(X, responsibilities)

            prev_log_likelihood = log_likelihood

        return self

    def predict(self, X):
        """Return the index of the most responsible component per sample."""
        responsibilities, _ = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

    def predict_proba(self, X):
        """Return the responsibilities, shape (n_samples, K)."""
        responsibilities, _ = self._e_step(X)
        return responsibilities

    def score(self, X):
        """Return the total log-likelihood of X under the fitted model."""
        _, log_likelihood = self._e_step(X)
        return log_likelihood
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# GaussianMixtureModel 类定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class GaussianMixtureModel:
    """
    Gaussian mixture model with full covariance matrices, fitted with the
    EM algorithm.

    Parameters:
        n_components : int, number of mixture components K
        max_iter : int, maximum number of EM iterations
        tol : float, stop when the log-likelihood changes by less than this
    """

    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol  # convergence threshold

        self.weights_ = None  # mixing coefficients (K,)
        self.means_ = None  # means (K, n_features)
        self.covariances_ = None  # covariance matrices (K, n_features, n_features)
        self.log_likelihood_history_ = []

    def _initialize(self, X):
        """Initialise means, covariances and weights from the data."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Means: K distinct samples drawn at random from the data
        indices = np.random.choice(n_samples, K, replace=False)
        self.means_ = X[indices].copy()

        # Covariances: the diagonal of the data covariance. np.cov returns a
        # 0-d array for single-feature data, so force it to 2-D first.
        data_cov = np.atleast_2d(np.cov(X.T))
        base_cov = np.diag(np.diag(data_cov)) + 1e-6 * np.eye(n_features)
        self.covariances_ = np.array([base_cov.copy() for _ in range(K)])

        # Mixing coefficients: uniform
        self.weights_ = np.ones(K) / K

    def _gaussian_pdf(self, X, mean, cov):
        """
        Return the log-density of each row of X under N(mean, cov).

        Despite the name, the returned values are log-probabilities,
        shape (n_samples,).
        """
        n_features = X.shape[1]
        diff = X - mean

        # Small ridge for numerical stability
        cov_reg = cov + 1e-6 * np.eye(n_features)

        # Use the Cholesky factor for the determinant and Mahalanobis distance
        try:
            L = np.linalg.cholesky(cov_reg)
            log_det = 2 * np.sum(np.log(np.diag(L)))
            diff_L = np.linalg.solve(L, diff.T).T
            mahalanobis = np.sum(diff_L ** 2, axis=1)
        except np.linalg.LinAlgError:
            # Cholesky failed (matrix not positive definite): fall back to
            # slogdet and an explicit inverse
            _, log_det = np.linalg.slogdet(cov_reg)
            cov_inv = np.linalg.inv(cov_reg)
            mahalanobis = np.sum(diff @ cov_inv * diff, axis=1)

        log_prob = -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)
        return log_prob

    def _e_step(self, X):
        """
        E-step: compute responsibilities and the total log-likelihood.

        Returns:
            responsibilities : ndarray (n_samples, K), rows sum to 1
            log_likelihood : float, sum over samples of log p(x)
        """
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        K = self.n_components

        # Weighted log-density of every sample under every component
        log_probs = np.zeros((n_samples, K))
        for k in range(K):
            log_probs[:, k] = (np.log(self.weights_[k] + 1e-10) +
                               self._gaussian_pdf(X, self.means_[k], self.covariances_[k]))

        # log-sum-exp trick: subtracting the row maximum keeps exp() from
        # underflowing to zero for samples far from every component
        log_max = log_probs.max(axis=1, keepdims=True)
        log_sum = np.log(np.sum(np.exp(log_probs - log_max), axis=1, keepdims=True)) + log_max
        responsibilities = np.exp(log_probs - log_sum)

        # Reuse the stable per-sample log p(x) for the log-likelihood; a naive
        # log(sum(exp(.))) yields -inf on underflow and breaks convergence.
        log_likelihood = np.sum(log_sum)

        return responsibilities, log_likelihood

    def _m_step(self, X, responsibilities):
        """M-step: update weights, means and covariances in place."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Effective number of samples per component (epsilon avoids 0-division)
        N_k = responsibilities.sum(axis=0) + 1e-10

        # Mixing coefficients
        self.weights_ = N_k / n_samples

        # Means
        self.means_ = (responsibilities.T @ X) / N_k[:, np.newaxis]

        # Covariances
        for k in range(K):
            diff = X - self.means_[k]
            weighted_diff = responsibilities[:, k:k+1] * diff
            self.covariances_[k] = (weighted_diff.T @ diff) / N_k[k]
            # Regularise to keep the matrix positive definite
            self.covariances_[k] += 1e-6 * np.eye(n_features)

    def fit(self, X):
        """
        Fit the model with EM.

        Parameters:
            X : array-like, shape (n_samples, n_features)

        Returns:
            self
        """
        X = np.asarray(X, dtype=float)
        self._initialize(X)
        self.log_likelihood_history_ = []

        prev_log_likelihood = -np.inf

        for iteration in range(self.max_iter):
            # E-step
            responsibilities, log_likelihood = self._e_step(X)
            self.log_likelihood_history_.append(log_likelihood)

            # Convergence check
            if abs(log_likelihood - prev_log_likelihood) < self.tol:
                print(f"EM收敛于第{iteration}次迭代")
                break

            # M-step
            self._m_step(X, responsibilities)

            prev_log_likelihood = log_likelihood

        return self

    def predict(self, X):
        """Return the index of the most responsible component per sample."""
        responsibilities, _ = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

    def predict_proba(self, X):
        """Return the responsibilities, shape (n_samples, K)."""
        responsibilities, _ = self._e_step(X)
        return responsibilities

    def score(self, X):
        """Return the total log-likelihood of X under the fitted model."""
        _, log_likelihood = self._e_step(X)
        return log_likelihood
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# MultinomialNaiveBayes 类定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class MultinomialNaiveBayes:
    """
    Multinomial naive Bayes classifier.

    Intended for discrete count features (e.g. word frequencies in text).
    """

    def __init__(self, alpha=1.0):
        """
        Parameters:
            alpha : float, Laplace (additive) smoothing parameter
        """
        self.alpha = alpha  # Laplace smoothing
        self.class_prior_ = None  # P(y), shape (n_classes,)
        self.feature_prob_ = None  # P(x|y), shape (n_classes, n_features)
        self.classes_ = None  # sorted unique class labels

    def fit(self, X, y):
        """
        Fit the model.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Feature matrix (term frequencies / counts)
            y : array-like, shape (n_samples,)
                Class labels

        Returns:
            self
        """
        # Accept plain lists as well as arrays; boolean masking below needs
        # real ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        # Prior P(y): relative frequency of each class
        class_counts = np.array([np.sum(y == c) for c in self.classes_])
        self.class_prior_ = class_counts / n_samples

        # Conditional P(x|y): per class, the smoothed total count of each
        # feature divided by the smoothed total count of all features
        self.feature_prob_ = np.zeros((n_classes, n_features))

        for i, c in enumerate(self.classes_):
            # All samples belonging to class c
            X_c = X[y == c]
            # Total count of each feature in this class, plus smoothing
            feature_counts = X_c.sum(axis=0) + self.alpha
            # Normalise into a probability distribution over features
            total_count = feature_counts.sum()
            self.feature_prob_[i] = feature_counts / total_count

        return self

    def predict_log_proba(self, X):
        """
        Return the joint log scores log P(y) + sum_j x_j * log P(x_j|y).

        The scores are not normalised over classes, so they are not log
        posteriors; their argmax is the predicted class.

        Returns:
            ndarray, shape (n_samples, n_classes)
        """
        X = np.asarray(X)
        log_prior = np.log(self.class_prior_)
        log_likelihood = X @ np.log(self.feature_prob_.T)  # (n_samples, n_classes)
        return log_prior + log_likelihood

    def predict(self, X):
        """Return the most probable class label for each sample."""
        log_proba = self.predict_log_proba(X)
        return self.classes_[np.argmax(log_proba, axis=1)]

    def score(self, X, y):
        """Return the accuracy of the predictions for X against labels y."""
        y_pred = self.predict(X)
        return np.mean(y_pred == np.asarray(y))
|