aigroup-econ-mcp 0.4.0__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aigroup_econ_mcp/__init__.py +1 -1
- aigroup_econ_mcp/cli.py +81 -86
- aigroup_econ_mcp/server.py +451 -451
- aigroup_econ_mcp/tools/__init__.py +8 -7
- aigroup_econ_mcp/tools/base.py +204 -5
- aigroup_econ_mcp/tools/data_loader.py +51 -27
- aigroup_econ_mcp/tools/file_parser.py +1027 -560
- aigroup_econ_mcp/tools/machine_learning.py +56 -669
- aigroup_econ_mcp/tools/ml_ensemble.py +210 -0
- aigroup_econ_mcp/tools/ml_evaluation.py +272 -0
- aigroup_econ_mcp/tools/ml_models.py +54 -0
- aigroup_econ_mcp/tools/ml_regularization.py +186 -0
- aigroup_econ_mcp/tools/panel_data.py +70 -4
- aigroup_econ_mcp/tools/time_series.py +53 -22
- aigroup_econ_mcp/tools/tool_descriptions.py +410 -0
- aigroup_econ_mcp/tools/tool_handlers.py +681 -43
- aigroup_econ_mcp/tools/tool_registry.py +329 -21
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +525 -0
- aigroup_econ_mcp-1.3.3.dist-info/RECORD +30 -0
- aigroup_econ_mcp/server_v1_backup.py +0 -1250
- aigroup_econ_mcp/server_v1_old.py +0 -1250
- aigroup_econ_mcp/server_with_file_support.py +0 -259
- aigroup_econ_mcp/tools/decorators.py +0 -178
- aigroup_econ_mcp/tools/file_input_handler.py +0 -268
- aigroup_econ_mcp-0.4.0.dist-info/METADATA +0 -718
- aigroup_econ_mcp-0.4.0.dist-info/RECORD +0 -30
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
集成学习方法模块
|
|
3
|
+
包含随机森林和梯度提升树回归算法
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
|
9
|
+
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
|
|
10
|
+
from sklearn.preprocessing import StandardScaler
|
|
11
|
+
import warnings
|
|
12
|
+
warnings.filterwarnings('ignore')
|
|
13
|
+
|
|
14
|
+
from .ml_models import RandomForestResult, GradientBoostingResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def random_forest_regression(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    n_estimators: int = 100,
    max_depth: Optional[int] = None,
    random_state: int = 42
) -> RandomForestResult:
    """
    Fit a random forest regressor and summarise its fit on the training data.

    The features are standardised with ``StandardScaler``, a
    ``RandomForestRegressor`` with out-of-bag scoring enabled is fitted on
    all observations, and R², MSE and MAE are computed from predictions on
    those same observations. The reported metrics therefore describe
    in-sample fit, not held-out performance.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one row per observation.
        feature_names: Names for the columns of ``x_data``; defaults to
            ``x0``, ``x1``, ... when omitted.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth; ``None`` leaves the depth unlimited.
        random_state: Seed passed to the estimator.

    Returns:
        RandomForestResult: Fit metrics, per-feature importances, the
        hyper-parameters used (``max_depth`` is reported as 0 when it was
        ``None``) and the out-of-bag score.

    Raises:
        ValueError: If either input is empty, ``y_data`` and ``x_data`` have
            different numbers of observations, ``x_data`` is not
            two-dimensional, or ``feature_names`` does not have exactly one
            entry per column.
    """
    # Input validation
    if not y_data or not x_data:
        raise ValueError("因变量和自变量数据不能为空")

    if len(y_data) != len(x_data):
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")

    X = np.asarray(x_data)
    y = np.asarray(y_data)

    # X.shape[1] is used below; fail with a clear message instead of an
    # IndexError when the caller passed a flat list of numbers.
    if X.ndim != 2:
        raise ValueError(f"自变量数据必须是二维列表: ndim={X.ndim}")

    n_features = X.shape[1]
    if feature_names is None:
        feature_names = [f"x{i}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")

    # Standardise the features before fitting
    X_scaled = StandardScaler().fit_transform(X)

    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        oob_score=True
    )
    rf_model.fit(X_scaled, y)

    # Predictions on the training data itself (in-sample metrics)
    y_pred = rf_model.predict(X_scaled)

    # Convert numpy scalars to plain floats so the result holds built-in types
    feature_importance = {
        name: float(weight)
        for name, weight in zip(feature_names, rf_model.feature_importances_)
    }
    oob = getattr(rf_model, 'oob_score_', None)

    return RandomForestResult(
        model_type="random_forest",
        r2_score=float(r2_score(y, y_pred)),
        mse=float(mean_squared_error(y, y_pred)),
        mae=float(mean_absolute_error(y, y_pred)),
        n_obs=len(y),
        feature_names=feature_names,
        feature_importance=feature_importance,
        n_estimators=n_estimators,
        # The result field is a plain int, so 0 stands for "no depth limit"
        max_depth=max_depth if max_depth is not None else 0,
        oob_score=float(oob) if oob is not None else None
    )
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def gradient_boosting_regression(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    n_estimators: int = 100,
    learning_rate: float = 0.1,
    max_depth: int = 3,
    random_state: int = 42
) -> GradientBoostingResult:
    """
    Fit a gradient boosting regressor and summarise its fit on the training data.

    The features are standardised with ``StandardScaler``, a
    ``GradientBoostingRegressor`` is fitted on all observations, and R², MSE
    and MAE are computed from predictions on those same observations. The
    reported metrics therefore describe in-sample fit, not held-out
    performance.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one row per observation.
        feature_names: Names for the columns of ``x_data``; defaults to
            ``x0``, ``x1``, ... when omitted.
        n_estimators: Number of boosting stages (trees).
        learning_rate: Learning rate passed to the estimator.
        max_depth: Maximum depth of each tree.
        random_state: Seed passed to the estimator.

    Returns:
        GradientBoostingResult: Fit metrics, per-feature importances and the
        hyper-parameters used.

    Raises:
        ValueError: If either input is empty, ``y_data`` and ``x_data`` have
            different numbers of observations, ``x_data`` is not
            two-dimensional, or ``feature_names`` does not have exactly one
            entry per column.
    """
    # Input validation
    if not y_data or not x_data:
        raise ValueError("因变量和自变量数据不能为空")

    if len(y_data) != len(x_data):
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")

    X = np.asarray(x_data)
    y = np.asarray(y_data)

    # X.shape[1] is used below; fail with a clear message instead of an
    # IndexError when the caller passed a flat list of numbers.
    if X.ndim != 2:
        raise ValueError(f"自变量数据必须是二维列表: ndim={X.ndim}")

    n_features = X.shape[1]
    if feature_names is None:
        feature_names = [f"x{i}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")

    # Standardise the features before fitting
    X_scaled = StandardScaler().fit_transform(X)

    gb_model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state
    )
    gb_model.fit(X_scaled, y)

    # Predictions on the training data itself (in-sample metrics)
    y_pred = gb_model.predict(X_scaled)

    # Convert numpy scalars to plain floats so the result holds built-in types
    feature_importance = {
        name: float(weight)
        for name, weight in zip(feature_names, gb_model.feature_importances_)
    }

    return GradientBoostingResult(
        model_type="gradient_boosting",
        r2_score=float(r2_score(y, y_pred)),
        mse=float(mean_squared_error(y, y_pred)),
        mae=float(mean_absolute_error(y, y_pred)),
        n_obs=len(y),
        feature_names=feature_names,
        feature_importance=feature_importance,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth
    )
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""
|
|
2
|
+
机器学习评估和比较模块
|
|
3
|
+
包含交叉验证、特征重要性分析和模型比较功能
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from typing import List, Dict, Any, Optional
|
|
8
|
+
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
|
9
|
+
from sklearn.model_selection import cross_val_score, KFold
|
|
10
|
+
from sklearn.preprocessing import StandardScaler
|
|
11
|
+
from sklearn.linear_model import Lasso, Ridge
|
|
12
|
+
import warnings
|
|
13
|
+
warnings.filterwarnings('ignore')
|
|
14
|
+
|
|
15
|
+
from .ml_models import CrossValidationResult, FeatureImportanceResult
|
|
16
|
+
from .ml_ensemble import random_forest_regression, gradient_boosting_regression
|
|
17
|
+
from .ml_regularization import lasso_regression, ridge_regression
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cross_validation(
    y_data: List[float],
    x_data: List[List[float]],
    model_type: str = "random_forest",
    cv_folds: int = 5,
    scoring: str = "r2",
    **model_params
) -> CrossValidationResult:
    """
    Score a regression model with shuffled K-fold cross-validation.

    The chosen estimator is wrapped in a pipeline together with a
    ``StandardScaler`` so that the scaler is refitted on the training part
    of every fold. Fitting the scaler once on the full data set would let
    statistics of the held-out fold leak into training.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one row per observation.
        model_type: One of ``random_forest``, ``gradient_boosting``,
            ``lasso`` or ``ridge``.
        cv_folds: Number of folds; must be between 2 and the number of
            observations.
        scoring: Scoring name forwarded to ``cross_val_score``.
        **model_params: Keyword arguments forwarded to the estimator's
            constructor. For the two tree ensembles ``random_state``
            defaults to 42 when not supplied, so repeated calls give the
            same scores.

    Returns:
        CrossValidationResult: Per-fold scores with their mean and standard
        deviation, plus the model type and fold count.

    Raises:
        ValueError: If either input is empty, ``y_data`` and ``x_data`` have
            different numbers of observations, ``cv_folds`` is out of
            range, or ``model_type`` is not supported.
    """
    # Imported here so the block does not depend on a new module-level import
    from sklearn.pipeline import make_pipeline

    # Input validation
    if not y_data or not x_data:
        raise ValueError("因变量和自变量数据不能为空")

    if len(y_data) != len(x_data):
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")

    if cv_folds < 2 or cv_folds > len(y_data):
        raise ValueError(f"交叉验证折数应在2到样本数量之间: cv_folds={cv_folds}, n_obs={len(y_data)}")

    X = np.asarray(x_data)
    y = np.asarray(y_data)

    estimator_classes = {
        "random_forest": RandomForestRegressor,
        "gradient_boosting": GradientBoostingRegressor,
        "lasso": Lasso,
        "ridge": Ridge,
    }
    if model_type not in estimator_classes:
        raise ValueError(f"不支持的模型类型: {model_type}")

    params = dict(model_params)
    if model_type in ("random_forest", "gradient_boosting"):
        # Match the seed the fold splitter uses so results are reproducible
        params.setdefault("random_state", 42)
    model = estimator_classes[model_type](**params)

    # Scaling happens inside the pipeline, i.e. per training fold
    estimator = make_pipeline(StandardScaler(), model)

    cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)

    return CrossValidationResult(
        model_type=model_type,
        cv_scores=cv_scores.tolist(),
        mean_score=float(np.mean(cv_scores)),
        std_score=float(np.std(cv_scores)),
        n_splits=cv_folds
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def feature_importance_analysis(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    method: str = "random_forest",
    top_k: int = 5
) -> FeatureImportanceResult:
    """
    Rank features by the importance a tree ensemble assigns to them.

    The features are standardised with ``StandardScaler``, a 100-tree
    ``RandomForestRegressor`` or ``GradientBoostingRegressor`` (seed 42) is
    fitted on all observations, and its ``feature_importances_`` are
    returned both as a mapping and sorted in descending order.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one row per observation.
        feature_names: Names for the columns of ``x_data``; defaults to
            ``x0``, ``x1``, ... when omitted.
        method: ``random_forest`` or ``gradient_boosting``.
        top_k: How many of the highest-ranked feature names to return in
            ``top_features``; must not be negative.

    Returns:
        FeatureImportanceResult: The importance mapping, the
        (name, importance) pairs sorted from most to least important, and
        the names of the first ``top_k`` of them.

    Raises:
        ValueError: If either input is empty, ``y_data`` and ``x_data`` have
            different numbers of observations, ``top_k`` is negative,
            ``x_data`` is not two-dimensional, ``feature_names`` does not
            have exactly one entry per column, or ``method`` is not
            supported.
    """
    # Input validation
    if not y_data or not x_data:
        raise ValueError("因变量和自变量数据不能为空")

    if len(y_data) != len(x_data):
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")

    # A negative top_k would make the slice below drop the *last* k entries
    # instead of selecting the first k, which is never what the caller wants.
    if top_k < 0:
        raise ValueError(f"top_k不能为负数: top_k={top_k}")

    X = np.asarray(x_data)
    y = np.asarray(y_data)

    # X.shape[1] is used below; fail with a clear message instead of an
    # IndexError when the caller passed a flat list of numbers.
    if X.ndim != 2:
        raise ValueError(f"自变量数据必须是二维列表: ndim={X.ndim}")

    n_features = X.shape[1]
    if feature_names is None:
        feature_names = [f"x{i}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")

    # Standardise the features before fitting
    X_scaled = StandardScaler().fit_transform(X)

    if method == "random_forest":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif method == "gradient_boosting":
        model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    else:
        raise ValueError(f"不支持的特征重要性分析方法: {method}")

    model.fit(X_scaled, y)

    # Convert numpy scalars to plain floats so the result holds built-in types
    feature_importance = {
        name: float(score)
        for name, score in zip(feature_names, model.feature_importances_)
    }

    # Most important first
    sorted_features = sorted(
        feature_importance.items(), key=lambda item: item[1], reverse=True
    )
    top_features = [name for name, _ in sorted_features[:top_k]]

    return FeatureImportanceResult(
        feature_importance=feature_importance,
        sorted_features=sorted_features,
        top_features=top_features
    )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def compare_ml_models(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    models: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Run several regression models on the same data and pick the best by R².

    Each requested model is fitted with its default settings through the
    corresponding ``*_regression`` function. A model that raises is skipped,
    and the failure is logged and recorded, so one bad model does not abort
    the comparison.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one row per observation.
        feature_names: Names for the columns of ``x_data``.
        models: Model names to compare; defaults to ``random_forest``,
            ``gradient_boosting``, ``lasso`` and ``ridge``.

    Returns:
        Dict[str, Any]: A dictionary with

        * ``model_results``: model name -> dumped result of each successful
          model;
        * ``best_model``: name of the model with the highest ``r2_score``,
          or ``None`` when no model succeeded;
        * ``best_r2``: that R² value (``-inf`` when no model succeeded);
        * ``comparison_summary``: ``total_models`` (number requested),
          ``successful_models`` (number that produced a result),
          ``failed_models`` (name -> error message for the rest) and
          ``best_performing`` (same as ``best_model``).
    """
    import logging

    # NOTE(review): failures are logged rather than printed. This package
    # looks like an MCP server; if it talks over stdio, stray prints on
    # stdout would corrupt the protocol stream — confirm with the server code.
    logger = logging.getLogger(__name__)

    supported = ("random_forest", "gradient_boosting", "lasso", "ridge")
    if models is None:
        models = list(supported)

    results = {}
    failed_models = {}

    for model_name in models:
        if model_name not in supported:
            # Previously dropped silently; keep going but report it
            failed_models[model_name] = f"不支持的模型类型: {model_name}"
            continue

        try:
            if model_name == "random_forest":
                result = random_forest_regression(y_data, x_data, feature_names)
            elif model_name == "gradient_boosting":
                result = gradient_boosting_regression(y_data, x_data, feature_names)
            elif model_name == "lasso":
                result = lasso_regression(y_data, x_data, feature_names)
            else:
                result = ridge_regression(y_data, x_data, feature_names)

            results[model_name] = result.model_dump()

        except Exception as e:
            # Deliberately broad: a failing model must not abort the comparison
            logger.warning("模型 %s 运行失败: %s", model_name, e)
            failed_models[model_name] = str(e)

    # Pick the best model by R²; strict '>' keeps the first of any tie
    best_model = None
    best_r2 = -float('inf')

    for model_name, result in results.items():
        if result['r2_score'] > best_r2:
            best_r2 = result['r2_score']
            best_model = model_name

    return {
        "model_results": results,
        "best_model": best_model,
        "best_r2": best_r2,
        "comparison_summary": {
            # Count what was requested, not only what succeeded
            "total_models": len(models),
            "successful_models": len(results),
            "failed_models": failed_models,
            "best_performing": best_model
        }
    }
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
机器学习模型数据类定义
|
|
3
|
+
定义各种机器学习算法的结果数据结构
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MLModelResult(BaseModel):
    """Base result model holding the fields shared by every regression result."""

    # Every field is required except feature_importance, which defaults to None.
    model_type: str = Field(..., description="模型类型")  # model type
    r2_score: float = Field(..., description="R²得分")  # R² score
    mse: float = Field(..., description="均方误差")  # mean squared error
    mae: float = Field(..., description="平均绝对误差")  # mean absolute error
    n_obs: int = Field(..., description="样本数量")  # number of observations
    feature_names: List[str] = Field(..., description="特征名称")  # feature names
    feature_importance: Optional[Dict[str, float]] = Field(
        None, description="特征重要性"  # feature importance
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class RandomForestResult(MLModelResult):
    """Result of a random forest regression."""

    n_estimators: int = Field(..., description="树的数量")  # number of trees
    max_depth: int = Field(..., description="最大深度")  # maximum depth
    # Optional: defaults to None when no out-of-bag score is supplied
    oob_score: Optional[float] = Field(None, description="袋外得分")  # out-of-bag score
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class GradientBoostingResult(MLModelResult):
    """Result of a gradient boosting tree regression."""

    n_estimators: int = Field(..., description="树的数量")  # number of trees
    learning_rate: float = Field(..., description="学习率")  # learning rate
    max_depth: int = Field(..., description="最大深度")  # maximum depth
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class RegularizedRegressionResult(MLModelResult):
    """Result of a regularized regression."""

    alpha: float = Field(..., description="正则化强度")  # regularization strength
    coefficients: Dict[str, float] = Field(
        ..., description="回归系数"  # regression coefficients
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class CrossValidationResult(BaseModel):
    """Result of a cross-validation run."""

    # All fields are required.
    model_type: str = Field(..., description="模型类型")  # model type
    cv_scores: List[float] = Field(..., description="交叉验证得分")  # per-fold scores
    mean_score: float = Field(..., description="平均得分")  # mean score
    std_score: float = Field(..., description="标准差")  # standard deviation
    n_splits: int = Field(..., description="交叉验证折数")  # number of folds
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class FeatureImportanceResult(BaseModel):
    """Result of a feature-importance analysis."""

    # All fields are required.
    feature_importance: Dict[str, float] = Field(
        ..., description="特征重要性分数"  # importance score per feature
    )
    sorted_features: List[Tuple[str, float]] = Field(
        ..., description="按重要性排序的特征"  # features sorted by importance
    )
    top_features: List[str] = Field(
        ..., description="最重要的特征"  # most important features
    )
|