@icyfenix-dmla/cli 2026.5.2-2114 → 2026.5.2-2234
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -1
- package/scripts/build.js +44 -11
- package/shared_modules/__init__.py +39 -0
- package/shared_modules/bayesian/__init__.py +13 -0
- package/shared_modules/bayesian/bayesian_network.py +105 -0
- package/shared_modules/bayesian/gaussian_mixture_model.py +141 -0
- package/shared_modules/bayesian/gaussian_mixturemodel.py +141 -0
- package/shared_modules/bayesian/multinomial_naive_bayes.py +74 -0
- package/shared_modules/bayesian/simple_bayesian_network.py +99 -0
- package/shared_modules/bayesian/simple_bayesiannetwork.py +99 -0
- package/shared_modules/cnn/__init__.py +9 -0
- package/shared_modules/cnn/alex_net.py +65 -0
- package/shared_modules/cnn/alexnet.py +65 -0
- package/shared_modules/cnn/t_e_r_m1.py +65 -0
- package/shared_modules/cnn/tiny_image_net_dataset.py +67 -0
- package/shared_modules/cnn/tiny_imagenet_dataset.py +67 -0
- package/shared_modules/cnn/tiny_imagenetdataset.py +67 -0
- package/shared_modules/cnn/tinyimagenetdataset.py +67 -0
- package/shared_modules/linear/__init__.py +9 -0
- package/shared_modules/linear/lasso_regression.py +93 -0
- package/shared_modules/linear/logistic_regression.py +78 -0
- package/shared_modules/linear/naive_bayes.py +141 -0
- package/shared_modules/linear/ridge_regression.py +58 -0
- package/shared_modules/neural/__init__.py +2 -0
- package/shared_modules/neural/perceptron.py +80 -0
- package/shared_modules/svm/__init__.py +8 -0
- package/shared_modules/svm/kernel_s_v_m.py +98 -0
- package/shared_modules/svm/kernel_svm.py +98 -0
- package/shared_modules/svm/simple_s_v_m.py +111 -0
- package/shared_modules/svm/simple_svm.py +111 -0
- package/shared_modules/tree/__init__.py +11 -0
- package/shared_modules/tree/ada_boost.py +77 -0
- package/shared_modules/tree/decision_tree_classifier.py +235 -0
- package/shared_modules/tree/decision_treeclassifier.py +235 -0
- package/shared_modules/tree/random_forest_classifier.py +88 -0
- package/shared_modules/tree/random_forestclassifier.py +88 -0
- package/shared_modules/unsupervised/__init__.py +8 -0
- package/shared_modules/unsupervised/k_means.py +127 -0
- package/shared_modules/unsupervised/kmeans.py +127 -0
- package/shared_modules/unsupervised/p_c_a.py +111 -0
- package/shared_modules/unsupervised/pca.py +111 -0
- package/src/commands/server.js +209 -4
- package/src/index.js +7 -2
- package/version.json +4 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@icyfenix-dmla/cli",
|
|
3
|
-
"version": "2026.5.2-2114",
|
|
3
|
+
"version": "2026.5.2-2234",
|
|
4
4
|
"description": "DMLA 沙箱服务命令行工具",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.js",
|
|
@@ -17,6 +17,8 @@
|
|
|
17
17
|
"bin/",
|
|
18
18
|
"src/",
|
|
19
19
|
"scripts/",
|
|
20
|
+
"shared_modules/",
|
|
21
|
+
"version.json",
|
|
20
22
|
"README.md"
|
|
21
23
|
],
|
|
22
24
|
"dependencies": {
|
package/scripts/build.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* 构建脚本:将 local-server 代码复制到 CLI 包中
|
|
3
|
-
* 用于 npm
|
|
3
|
+
* 用于 npm 发布时包含完整的服务器代码和共享模块
|
|
4
4
|
*/
|
|
5
5
|
import fs from 'fs'
|
|
6
6
|
import path from 'path'
|
|
@@ -11,17 +11,17 @@ const __dirname = path.dirname(__filename)
|
|
|
11
11
|
|
|
12
12
|
const rootDir = path.resolve(__dirname, '../../..')
|
|
13
13
|
const localServerSrc = path.resolve(rootDir, 'local-server/src')
|
|
14
|
+
const localServerShared = path.resolve(rootDir, 'local-server/shared_modules')
|
|
14
15
|
const cliServerDest = path.resolve(__dirname, '../src/server')
|
|
16
|
+
const cliSharedDest = path.resolve(__dirname, '../shared_modules')
|
|
15
17
|
|
|
16
18
|
console.log('📦 构建 CLI 包...')
|
|
17
|
-
console.log(` 源目录: ${localServerSrc}`)
|
|
18
|
-
console.log(` 目标目录: ${cliServerDest}`)
|
|
19
19
|
|
|
20
20
|
// 递归复制目录
|
|
21
|
-
function copyDir(src, dest) {
|
|
21
|
+
function copyDir(src, dest, filter = null) {
|
|
22
22
|
if (!fs.existsSync(src)) {
|
|
23
23
|
console.error(`❌ 源目录不存在: ${src}`)
|
|
24
|
-
|
|
24
|
+
return false
|
|
25
25
|
}
|
|
26
26
|
|
|
27
27
|
// 创建目标目录
|
|
@@ -36,15 +36,48 @@ function copyDir(src, dest) {
|
|
|
36
36
|
const destPath = path.join(dest, entry.name)
|
|
37
37
|
|
|
38
38
|
if (entry.isDirectory()) {
|
|
39
|
-
copyDir(srcPath, destPath)
|
|
40
|
-
} else if (entry.isFile()
|
|
39
|
+
copyDir(srcPath, destPath, filter)
|
|
40
|
+
} else if (entry.isFile()) {
|
|
41
|
+
// 应用过滤器
|
|
42
|
+
if (filter && !filter(entry.name)) {
|
|
43
|
+
continue
|
|
44
|
+
}
|
|
41
45
|
fs.copyFileSync(srcPath, destPath)
|
|
42
|
-
console.log(` ✓
|
|
46
|
+
console.log(` ✓ ${path.relative(rootDir, srcPath)} → ${path.relative(__dirname, destPath)}`)
|
|
43
47
|
}
|
|
44
48
|
}
|
|
49
|
+
return true
|
|
45
50
|
}
|
|
46
51
|
|
|
47
|
-
//
|
|
48
|
-
|
|
52
|
+
// 复制服务器代码(只复制 .js 文件)
|
|
53
|
+
console.log('\n📋 复制服务器代码...')
|
|
54
|
+
console.log(` 源目录: ${localServerSrc}`)
|
|
55
|
+
console.log(` 目标目录: ${cliServerDest}`)
|
|
56
|
+
copyDir(localServerSrc, cliServerDest, (name) => name.endsWith('.js'))
|
|
57
|
+
|
|
58
|
+
// 复制共享模块(复制所有 .py 文件和 __init__.py)
|
|
59
|
+
console.log('\n📋 复制共享模块...')
|
|
60
|
+
console.log(` 源目录: ${localServerShared}`)
|
|
61
|
+
console.log(` 目标目录: ${cliSharedDest}`)
|
|
62
|
+
const sharedCopied = copyDir(localServerShared, cliSharedDest, (name) => {
|
|
63
|
+
// 复制 Python 文件和初始化文件
|
|
64
|
+
return name.endsWith('.py') || name === '__init__.py'
|
|
65
|
+
})
|
|
66
|
+
|
|
67
|
+
if (!sharedCopied) {
|
|
68
|
+
console.log('⚠️ 共享模块目录不存在,跳过')
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// 创建版本信息文件(用于 --dev 模式的版本比较)
|
|
72
|
+
const versionInfo = {
|
|
73
|
+
buildTime: new Date().toISOString(),
|
|
74
|
+
// 从 package.json 读取版本
|
|
75
|
+
cliVersion: JSON.parse(fs.readFileSync(path.resolve(__dirname, '../package.json'), 'utf8')).version
|
|
76
|
+
}
|
|
77
|
+
fs.writeFileSync(
|
|
78
|
+
path.resolve(__dirname, '../version.json'),
|
|
79
|
+
JSON.stringify(versionInfo, null, 2)
|
|
80
|
+
)
|
|
81
|
+
console.log('\n✅ 版本信息已生成')
|
|
49
82
|
|
|
50
|
-
console.log('✅
|
|
83
|
+
console.log('\n✅ CLI 包构建完成')
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from .cnn.tiny_imagenet_dataset import TinyImageNetDataset
|
|
2
|
+
from .cnn.tiny_imagenetdataset import TinyImageNetDataset
|
|
3
|
+
from .cnn.tinyimagenetdataset import TinyImageNetDataset
|
|
4
|
+
from .cnn.alexnet import AlexNet
|
|
5
|
+
from .bayesian.simple_bayesiannetwork import SimpleBayesianNetwork
|
|
6
|
+
from .bayesian.gaussian_mixturemodel import GaussianMixtureModel
|
|
7
|
+
from .tree.decision_treeclassifier import DecisionTreeClassifier
|
|
8
|
+
from .tree.random_forestclassifier import RandomForestClassifier
|
|
9
|
+
from .svm.kernel_svm import KernelSVM
|
|
10
|
+
from .svm.simple_svm import SimpleSVM
|
|
11
|
+
from .unsupervised.kmeans import KMeans
|
|
12
|
+
from .unsupervised.pca import PCA
|
|
13
|
+
from .cnn.tiny_t_e_r_m0_t_e_r_m11 import TinyImageNetDataset
|
|
14
|
+
from .cnn.t_e_r_m1 import AlexNet
|
|
15
|
+
from .bayesian.simple_bayesian_t_e_r_m17 import SimpleBayesianNetwork
|
|
16
|
+
from .bayesian.gaussian_mixture_t_e_r_m18 import GaussianMixtureModel
|
|
17
|
+
from .tree.decision_tree_t_e_r_m15 import DecisionTreeClassifier
|
|
18
|
+
from .tree.random_forest_t_e_r_m15 import RandomForestClassifier
|
|
19
|
+
from .cnn.tiny_image_net_dataset import TinyImageNetDataset
|
|
20
|
+
from .cnn.alex_net import AlexNet
|
|
21
|
+
from .neural.perceptron import Perceptron
|
|
22
|
+
from .unsupervised.p_c_a import PCA
|
|
23
|
+
from .tree.ada_boost import AdaBoost
|
|
24
|
+
from .tree.random_forest_classifier import RandomForestClassifier
|
|
25
|
+
from .unsupervised.k_means import KMeans
|
|
26
|
+
from .tree.decision_tree_classifier import DecisionTreeClassifier
|
|
27
|
+
from .tree.decision_tree_classifier import DecisionTreeClassifier
|
|
28
|
+
from .svm.kernel_s_v_m import KernelSVM
|
|
29
|
+
from .bayesian.gaussian_mixture_model import GaussianMixtureModel
|
|
30
|
+
from .svm.simple_s_v_m import SimpleSVM
|
|
31
|
+
from .bayesian.simple_bayesian_network import SimpleBayesianNetwork
|
|
32
|
+
from .bayesian.multinomial_naive_bayes import MultinomialNaiveBayes
|
|
33
|
+
from .linear.ridge_regression import RidgeRegression
|
|
34
|
+
from .linear.lasso_regression import LassoRegression
|
|
35
|
+
from .linear.logistic_regression import LogisticRegression
|
|
36
|
+
# shared 模块包初始化
|
|
37
|
+
# 包含统计学习系列文档中可复用的类定义
|
|
38
|
+
|
|
39
|
+
from .linear import *
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# 贝叶斯方法模块
|
|
2
|
+
# 包含朴素贝叶斯、贝叶斯网络、EM算法等实现
|
|
3
|
+
|
|
4
|
+
from .bayesian_network import SimpleBayesianNetwork
|
|
5
|
+
|
|
6
|
+
from .simple_bayesian_network import SimpleBayesianNetwork
|
|
7
|
+
from .multinomial_naive_bayes import MultinomialNaiveBayes
|
|
8
|
+
from .gaussian_mixture_model import GaussianMixtureModel
|
|
9
|
+
from .simple_bayesian_t_e_r_m17 import SimpleBayesianNetwork
|
|
10
|
+
from .gaussian_mixture_t_e_r_m18 import GaussianMixtureModel
|
|
11
|
+
from .simple_bayesiannetwork import SimpleBayesianNetwork
|
|
12
|
+
from .gaussian_mixturemodel import GaussianMixtureModel
|
|
13
|
+
__all__ = ['SimpleBayesianNetwork', 'SimpleBayesianNetwork', 'MultinomialNaiveBayes', 'GaussianMixtureModel', 'SimpleBayesianNetwork', 'GaussianMixtureModel', 'SimpleBayesianNetwork', 'GaussianMixtureModel']
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
贝叶斯网络实现
|
|
3
|
+
支持离散变量和精确推断(枚举法)
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SimpleBayesianNetwork:
    """Discrete Bayesian network with exact inference by enumeration.

    Nodes are registered with :meth:`add_node` and their conditional
    probability tables with :meth:`set_cpt`.  Inference sums the joint
    probability over every assignment of the hidden variables, so its cost
    grows with the product of the hidden variables' value counts.
    """

    def __init__(self):
        # Node metadata: {name: {'parents': [...], 'values': [...]}}
        self.nodes = {}
        # Conditional probability tables: {name: {parent_value_tuple: {value: prob}}}
        self.cpts = {}
        # Node names ordered so that every registered parent precedes its children.
        self.topo_order = []

    def add_node(self, name, values, parents=None):
        """Register a node (or overwrite an existing one) and refresh the ordering.

        Parameters:
            name: node identifier, used as a dictionary key.
            values: the values the node can take.
            parents: names of the parent nodes; ``None`` means no parents.
                Parents may be registered before or after the child.
        """
        if parents is None:
            parents = []
        self.nodes[name] = {'parents': parents, 'values': values}
        self._update_topo_order()

    def set_cpt(self, name, cpt):
        """Store the conditional probability table of ``name``.

        ``cpt`` has the form ``{parent_value_tuple: {value: prob}}``; a node
        without parents uses the empty tuple as its only key:
        ``{(): {value: prob}}``.
        """
        self.cpts[name] = cpt

    def _update_topo_order(self):
        """Recompute ``topo_order`` with a depth-first walk over the parents."""
        visited = set()
        order = []

        def visit(node):
            if node in visited:
                return
            visited.add(node)
            for parent in self.nodes[node]['parents']:
                # A parent may not be registered yet because add_node re-sorts
                # after every insertion.  Skip it for now; the ordering is
                # rebuilt when that parent is added.
                if parent in self.nodes:
                    visit(parent)
            order.append(node)

        for node in self.nodes:
            visit(node)

        self.topo_order = order

    def get_prob(self, name, value, parent_values):
        """Return P(name=value | parent_values).

        Returns 0 when the parent combination or the value is missing from
        the table.  Raises ``KeyError`` if no CPT was set for ``name``.
        """
        parent_key = tuple(parent_values) if parent_values else ()
        return self.cpts[name].get(parent_key, {}).get(value, 0)

    def joint_prob(self, assignment):
        """Return the joint probability of a full assignment ``{node: value}``.

        The result is the product of each node's conditional probability given
        its parents' assigned values.  ``assignment`` must contain every
        registered node and every parent they reference.
        """
        prob = 1.0
        for node in self.topo_order:
            parents = self.nodes[node]['parents']
            parent_values = [assignment[p] for p in parents]
            value = assignment[node]
            prob *= self.get_prob(node, value, parent_values)
        return prob

    def enumerate_inference(self, query, evidence):
        """Compute the posterior distribution P(query | evidence) by enumeration.

        Parameters:
            query: ``{node: '?'}``.  Only the first key is used as the query
                variable.
            evidence: ``{node: value}`` with the observed values.

        Returns:
            ``{value: probability}`` for the query node, normalised to sum to 1.

        Raises:
            ZeroDivisionError: when the evidence has zero probability under
                the network, so the posterior is undefined.
        """
        query_nodes = list(query.keys())
        hidden = [n for n in self.nodes if n not in query_nodes and n not in evidence]

        def enumerate_assignments(variables, current):
            # Yield a copy of every complete assignment of ``variables``.
            if not variables:
                yield current.copy()
                return
            var = variables[0]
            for value in self.nodes[var]['values']:
                current[var] = value
                yield from enumerate_assignments(variables[1:], current)
                del current[var]

        query_values = {}
        total = 0.0

        query_node = query_nodes[0]
        for qv in self.nodes[query_node]['values']:
            prob_sum = 0.0
            for assignment in enumerate_assignments(hidden, {}):
                assignment.update(evidence)
                assignment[query_node] = qv
                prob_sum += self.joint_prob(assignment)
            query_values[qv] = prob_sum
            total += prob_sum

        # Fail with an explanation instead of a bare "float division by zero".
        if query_values and total == 0:
            raise ZeroDivisionError(
                "cannot normalise the posterior: the evidence has zero "
                "probability under the network"
            )

        # Normalise so the returned values form a distribution.
        for k in query_values:
            query_values[k] /= total
        return query_values
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# GaussianMixtureModel 类定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class GaussianMixtureModel:
    """Gaussian mixture model fitted with the EM algorithm.

    Parameters:
        n_components: number of mixture components K.
        max_iter: maximum number of EM iterations run by ``fit``.
        tol: ``fit`` stops once the absolute change of the log-likelihood
            between two iterations drops below this threshold.
    """

    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol  # convergence threshold

        self.weights_ = None      # mixing coefficients, (K,)
        self.means_ = None        # component means, (K, n_features)
        self.covariances_ = None  # covariance matrices, (K, n_features, n_features)
        self.log_likelihood_history_ = []

    def _initialize(self, X):
        """Initialise means, covariances and weights from the data ``X``."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Means: K distinct samples drawn at random from the data.
        indices = np.random.choice(n_samples, K, replace=False)
        self.means_ = X[indices].copy()

        # Covariances: the diagonal of the data covariance plus a small ridge.
        # np.cov collapses to a 0-d array for single-feature data, which
        # np.diag rejects, hence the atleast_2d.
        data_cov = np.atleast_2d(np.cov(X.T))
        self.covariances_ = np.array([np.diag(np.diag(data_cov)) + 1e-6 * np.eye(n_features)
                                      for _ in range(K)])

        # Weights: uniform.
        self.weights_ = np.ones(K) / K

    def _gaussian_pdf(self, X, mean, cov):
        """Return the LOG density of a multivariate Gaussian for each row of ``X``.

        Despite the name, the value returned is the logarithm of the density.
        """
        n_features = X.shape[1]
        diff = X - mean

        # Small ridge for numerical stability.
        cov_reg = cov + 1e-6 * np.eye(n_features)

        try:
            # Cholesky factor gives both the log-determinant and the
            # Mahalanobis distance without forming an explicit inverse.
            L = np.linalg.cholesky(cov_reg)
            log_det = 2 * np.sum(np.log(np.diag(L)))
            diff_L = np.linalg.solve(L, diff.T).T
            mahalanobis = np.sum(diff_L ** 2, axis=1)
        except np.linalg.LinAlgError:
            # Cholesky failed: fall back to slogdet and an explicit inverse.
            _, log_det = np.linalg.slogdet(cov_reg)
            cov_inv = np.linalg.inv(cov_reg)
            mahalanobis = np.sum(diff @ cov_inv * diff, axis=1)

        log_prob = -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)
        return log_prob

    def _e_step(self, X):
        """E-step: return ``(responsibilities, log_likelihood)`` for ``X``.

        ``responsibilities`` has shape (n_samples, K); ``log_likelihood`` is
        the total log-likelihood of ``X`` under the current parameters.
        """
        n_samples = X.shape[0]
        K = self.n_components

        # Weighted log-density of every sample under every component.
        log_probs = np.zeros((n_samples, K))
        for k in range(K):
            log_probs[:, k] = (np.log(self.weights_[k] + 1e-10) +
                               self._gaussian_pdf(X, self.means_[k], self.covariances_[k]))

        # Log-sum-exp with the row maximum subtracted, so exp() cannot
        # underflow to zero for every component at once.
        log_max = log_probs.max(axis=1, keepdims=True)
        log_sum = np.log(np.sum(np.exp(log_probs - log_max), axis=1, keepdims=True)) + log_max
        responsibilities = np.exp(log_probs - log_sum)

        # Reuse the stabilised per-sample value for the likelihood; taking
        # log(sum(exp(log_probs))) directly yields -inf for distant samples.
        log_likelihood = np.sum(log_sum)

        return responsibilities, log_likelihood

    def _m_step(self, X, responsibilities):
        """M-step: update weights, means and covariances from the responsibilities."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Effective number of samples per component (offset avoids division by zero).
        N_k = responsibilities.sum(axis=0) + 1e-10

        # Mixing coefficients.
        self.weights_ = N_k / n_samples

        # Means.
        self.means_ = (responsibilities.T @ X) / N_k[:, np.newaxis]

        # Covariances, each with a small ridge added.
        for k in range(K):
            diff = X - self.means_[k]
            weighted_diff = responsibilities[:, k:k+1] * diff
            self.covariances_[k] = (weighted_diff.T @ diff) / N_k[k]
            self.covariances_[k] += 1e-6 * np.eye(n_features)

    def fit(self, X):
        """Fit the model to ``X`` of shape (n_samples, n_features) and return ``self``.

        The log-likelihood of every iteration is appended to
        ``log_likelihood_history_``.  A message is printed when the
        convergence threshold is reached.
        """
        self._initialize(X)
        self.log_likelihood_history_ = []

        prev_log_likelihood = -np.inf

        for iteration in range(self.max_iter):
            # E-step
            responsibilities, log_likelihood = self._e_step(X)
            self.log_likelihood_history_.append(log_likelihood)

            # Convergence check
            if abs(log_likelihood - prev_log_likelihood) < self.tol:
                print(f"EM收敛于第{iteration}次迭代")
                break

            # M-step
            self._m_step(X, responsibilities)

            prev_log_likelihood = log_likelihood

        return self

    def predict(self, X):
        """Return the index of the most responsible component for each sample."""
        responsibilities, _ = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

    def predict_proba(self, X):
        """Return the responsibilities, shape (n_samples, K)."""
        responsibilities, _ = self._e_step(X)
        return responsibilities

    def score(self, X):
        """Return the total log-likelihood of ``X`` under the fitted model."""
        _, log_likelihood = self._e_step(X)
        return log_likelihood
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# GaussianMixtureModel 类定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class GaussianMixtureModel:
    """Gaussian mixture model fitted with the EM algorithm.

    Parameters:
        n_components: number of mixture components K.
        max_iter: maximum number of EM iterations run by ``fit``.
        tol: ``fit`` stops once the absolute change of the log-likelihood
            between two iterations drops below this threshold.
    """

    def __init__(self, n_components=3, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol  # convergence threshold

        self.weights_ = None      # mixing coefficients, (K,)
        self.means_ = None        # component means, (K, n_features)
        self.covariances_ = None  # covariance matrices, (K, n_features, n_features)
        self.log_likelihood_history_ = []

    def _initialize(self, X):
        """Initialise means, covariances and weights from the data ``X``."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Means: K distinct samples drawn at random from the data.
        indices = np.random.choice(n_samples, K, replace=False)
        self.means_ = X[indices].copy()

        # Covariances: the diagonal of the data covariance plus a small ridge.
        # np.cov collapses to a 0-d array for single-feature data, which
        # np.diag rejects, hence the atleast_2d.
        data_cov = np.atleast_2d(np.cov(X.T))
        self.covariances_ = np.array([np.diag(np.diag(data_cov)) + 1e-6 * np.eye(n_features)
                                      for _ in range(K)])

        # Weights: uniform.
        self.weights_ = np.ones(K) / K

    def _gaussian_pdf(self, X, mean, cov):
        """Return the LOG density of a multivariate Gaussian for each row of ``X``.

        Despite the name, the value returned is the logarithm of the density.
        """
        n_features = X.shape[1]
        diff = X - mean

        # Small ridge for numerical stability.
        cov_reg = cov + 1e-6 * np.eye(n_features)

        try:
            # Cholesky factor gives both the log-determinant and the
            # Mahalanobis distance without forming an explicit inverse.
            L = np.linalg.cholesky(cov_reg)
            log_det = 2 * np.sum(np.log(np.diag(L)))
            diff_L = np.linalg.solve(L, diff.T).T
            mahalanobis = np.sum(diff_L ** 2, axis=1)
        except np.linalg.LinAlgError:
            # Cholesky failed: fall back to slogdet and an explicit inverse.
            _, log_det = np.linalg.slogdet(cov_reg)
            cov_inv = np.linalg.inv(cov_reg)
            mahalanobis = np.sum(diff @ cov_inv * diff, axis=1)

        log_prob = -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)
        return log_prob

    def _e_step(self, X):
        """E-step: return ``(responsibilities, log_likelihood)`` for ``X``.

        ``responsibilities`` has shape (n_samples, K); ``log_likelihood`` is
        the total log-likelihood of ``X`` under the current parameters.
        """
        n_samples = X.shape[0]
        K = self.n_components

        # Weighted log-density of every sample under every component.
        log_probs = np.zeros((n_samples, K))
        for k in range(K):
            log_probs[:, k] = (np.log(self.weights_[k] + 1e-10) +
                               self._gaussian_pdf(X, self.means_[k], self.covariances_[k]))

        # Log-sum-exp with the row maximum subtracted, so exp() cannot
        # underflow to zero for every component at once.
        log_max = log_probs.max(axis=1, keepdims=True)
        log_sum = np.log(np.sum(np.exp(log_probs - log_max), axis=1, keepdims=True)) + log_max
        responsibilities = np.exp(log_probs - log_sum)

        # Reuse the stabilised per-sample value for the likelihood; taking
        # log(sum(exp(log_probs))) directly yields -inf for distant samples.
        log_likelihood = np.sum(log_sum)

        return responsibilities, log_likelihood

    def _m_step(self, X, responsibilities):
        """M-step: update weights, means and covariances from the responsibilities."""
        n_samples, n_features = X.shape
        K = self.n_components

        # Effective number of samples per component (offset avoids division by zero).
        N_k = responsibilities.sum(axis=0) + 1e-10

        # Mixing coefficients.
        self.weights_ = N_k / n_samples

        # Means.
        self.means_ = (responsibilities.T @ X) / N_k[:, np.newaxis]

        # Covariances, each with a small ridge added.
        for k in range(K):
            diff = X - self.means_[k]
            weighted_diff = responsibilities[:, k:k+1] * diff
            self.covariances_[k] = (weighted_diff.T @ diff) / N_k[k]
            self.covariances_[k] += 1e-6 * np.eye(n_features)

    def fit(self, X):
        """Fit the model to ``X`` of shape (n_samples, n_features) and return ``self``.

        The log-likelihood of every iteration is appended to
        ``log_likelihood_history_``.  A message is printed when the
        convergence threshold is reached.
        """
        self._initialize(X)
        self.log_likelihood_history_ = []

        prev_log_likelihood = -np.inf

        for iteration in range(self.max_iter):
            # E-step
            responsibilities, log_likelihood = self._e_step(X)
            self.log_likelihood_history_.append(log_likelihood)

            # Convergence check
            if abs(log_likelihood - prev_log_likelihood) < self.tol:
                print(f"EM收敛于第{iteration}次迭代")
                break

            # M-step
            self._m_step(X, responsibilities)

            prev_log_likelihood = log_likelihood

        return self

    def predict(self, X):
        """Return the index of the most responsible component for each sample."""
        responsibilities, _ = self._e_step(X)
        return np.argmax(responsibilities, axis=1)

    def predict_proba(self, X):
        """Return the responsibilities, shape (n_samples, K)."""
        responsibilities, _ = self._e_step(X)
        return responsibilities

    def score(self, X):
        """Return the total log-likelihood of ``X`` under the fitted model."""
        _, log_likelihood = self._e_step(X)
        return log_likelihood
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# MultinomialNaiveBayes 类定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class MultinomialNaiveBayes:
    """Multinomial naive Bayes classifier for count features (e.g. word counts)."""

    def __init__(self, alpha=1.0):
        """
        Parameters:
            alpha : float, additive (Laplace) smoothing added to every
                feature count.  Should be positive: with ``alpha=0`` a
                feature never seen in a class gets probability 0, whose
                logarithm is ``-inf``.
        """
        self.alpha = alpha          # smoothing constant
        self.class_prior_ = None    # P(y), one entry per class
        self.feature_prob_ = None   # P(x|y), shape (n_classes, n_features)
        self.classes_ = None        # sorted unique labels seen in fit

    def fit(self, X, y):
        """
        Fit the model and return ``self``.

        Parameters:
            X : array-like, shape (n_samples, n_features)
                Feature matrix of counts.
            y : array-like, shape (n_samples,)
                Class labels.
        """
        # Accept plain Python sequences as well as arrays.  Objects that
        # already expose ``shape`` are left untouched.
        if not hasattr(X, 'shape'):
            X = np.asarray(X)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        # Prior P(y): relative frequency of each class.
        class_counts = np.array([np.sum(y == c) for c in self.classes_])
        self.class_prior_ = class_counts / n_samples

        # Conditional P(x|y): smoothed per-class feature totals, normalised
        # so each class row sums to 1.
        self.feature_prob_ = np.zeros((n_classes, n_features))

        for i, c in enumerate(self.classes_):
            X_c = X[y == c]
            feature_counts = X_c.sum(axis=0) + self.alpha
            total_count = feature_counts.sum()
            self.feature_prob_[i] = feature_counts / total_count

        return self

    def predict_log_proba(self, X):
        """
        Return the unnormalised log posterior, shape (n_samples, n_classes):
        ``log P(y) + sum_j x_j * log P(x_j | y)``.
        """
        log_prior = np.log(self.class_prior_)
        log_likelihood = X @ np.log(self.feature_prob_.T)  # (n_samples, n_classes)
        return log_prior + log_likelihood

    def predict(self, X):
        """
        Return the class with the highest log posterior for each sample.
        """
        log_proba = self.predict_log_proba(X)
        return self.classes_[np.argmax(log_proba, axis=1)]

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the labels ``y``."""
        y_pred = self.predict(X)
        return np.mean(y_pred == np.asarray(y))
|