aigroup-econ-mcp 0.4.0__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aigroup_econ_mcp/__init__.py +1 -1
- aigroup_econ_mcp/cli.py +81 -86
- aigroup_econ_mcp/server.py +451 -451
- aigroup_econ_mcp/tools/__init__.py +8 -7
- aigroup_econ_mcp/tools/base.py +204 -5
- aigroup_econ_mcp/tools/data_loader.py +51 -27
- aigroup_econ_mcp/tools/file_parser.py +1027 -560
- aigroup_econ_mcp/tools/machine_learning.py +56 -669
- aigroup_econ_mcp/tools/ml_ensemble.py +210 -0
- aigroup_econ_mcp/tools/ml_evaluation.py +272 -0
- aigroup_econ_mcp/tools/ml_models.py +54 -0
- aigroup_econ_mcp/tools/ml_regularization.py +186 -0
- aigroup_econ_mcp/tools/panel_data.py +70 -4
- aigroup_econ_mcp/tools/time_series.py +53 -22
- aigroup_econ_mcp/tools/tool_descriptions.py +410 -0
- aigroup_econ_mcp/tools/tool_handlers.py +681 -43
- aigroup_econ_mcp/tools/tool_registry.py +329 -21
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +525 -0
- aigroup_econ_mcp-1.3.3.dist-info/RECORD +30 -0
- aigroup_econ_mcp/server_v1_backup.py +0 -1250
- aigroup_econ_mcp/server_v1_old.py +0 -1250
- aigroup_econ_mcp/server_with_file_support.py +0 -259
- aigroup_econ_mcp/tools/decorators.py +0 -178
- aigroup_econ_mcp/tools/file_input_handler.py +0 -268
- aigroup_econ_mcp-0.4.0.dist-info/METADATA +0 -718
- aigroup_econ_mcp-0.4.0.dist-info/RECORD +0 -30
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-0.4.0.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,673 +1,60 @@
|
|
|
1
|
-
|
|
2
1
|
"""
|
|
3
|
-
机器学习集成模块
|
|
2
|
+
机器学习集成模块 - 统一导出接口
|
|
4
3
|
提供基于scikit-learn的机器学习算法,用于经济数据分析
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pandas as pd
|
|
9
|
-
from typing import List, Dict, Any, Optional, Tuple
|
|
10
|
-
from pydantic import BaseModel, Field
|
|
11
|
-
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
|
12
|
-
from sklearn.linear_model import Lasso, Ridge
|
|
13
|
-
from sklearn.model_selection import cross_val_score, KFold
|
|
14
|
-
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
|
|
15
|
-
from sklearn.preprocessing import StandardScaler
|
|
16
|
-
import warnings
|
|
17
|
-
warnings.filterwarnings('ignore')
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class MLModelResult(BaseModel):
|
|
21
|
-
"""机器学习模型结果基类"""
|
|
22
|
-
model_type: str = Field(description="模型类型")
|
|
23
|
-
r2_score: float = Field(description="R²得分")
|
|
24
|
-
mse: float = Field(description="均方误差")
|
|
25
|
-
mae: float = Field(description="平均绝对误差")
|
|
26
|
-
n_obs: int = Field(description="样本数量")
|
|
27
|
-
feature_names: List[str] = Field(description="特征名称")
|
|
28
|
-
feature_importance: Optional[Dict[str, float]] = Field(default=None, description="特征重要性")
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class RandomForestResult(MLModelResult):
|
|
32
|
-
"""随机森林回归结果"""
|
|
33
|
-
n_estimators: int = Field(description="树的数量")
|
|
34
|
-
max_depth: int = Field(description="最大深度")
|
|
35
|
-
oob_score: Optional[float] = Field(default=None, description="袋外得分")
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class GradientBoostingResult(MLModelResult):
|
|
39
|
-
"""梯度提升树回归结果"""
|
|
40
|
-
n_estimators: int = Field(description="树的数量")
|
|
41
|
-
learning_rate: float = Field(description="学习率")
|
|
42
|
-
max_depth: int = Field(description="最大深度")
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
class RegularizedRegressionResult(MLModelResult):
|
|
46
|
-
"""正则化回归结果"""
|
|
47
|
-
alpha: float = Field(description="正则化强度")
|
|
48
|
-
coefficients: Dict[str, float] = Field(description="回归系数")
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class CrossValidationResult(BaseModel):
|
|
52
|
-
"""交叉验证结果"""
|
|
53
|
-
model_type: str = Field(description="模型类型")
|
|
54
|
-
cv_scores: List[float] = Field(description="交叉验证得分")
|
|
55
|
-
mean_score: float = Field(description="平均得分")
|
|
56
|
-
std_score: float = Field(description="标准差")
|
|
57
|
-
n_splits: int = Field(description="交叉验证折数")
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class FeatureImportanceResult(BaseModel):
|
|
61
|
-
"""特征重要性分析结果"""
|
|
62
|
-
feature_importance: Dict[str, float] = Field(description="特征重要性分数")
|
|
63
|
-
sorted_features: List[Tuple[str, float]] = Field(description="按重要性排序的特征")
|
|
64
|
-
top_features: List[str] = Field(description="最重要的特征")
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def random_forest_regression(
|
|
68
|
-
y_data: List[float],
|
|
69
|
-
x_data: List[List[float]],
|
|
70
|
-
feature_names: Optional[List[str]] = None,
|
|
71
|
-
n_estimators: int = 100,
|
|
72
|
-
max_depth: Optional[int] = None,
|
|
73
|
-
random_state: int = 42
|
|
74
|
-
) -> RandomForestResult:
|
|
75
|
-
"""
|
|
76
|
-
随机森林回归
|
|
77
|
-
|
|
78
|
-
📊 功能说明:
|
|
79
|
-
使用随机森林算法进行回归分析,适用于非线性关系和复杂交互效应。
|
|
80
|
-
|
|
81
|
-
📈 算法特点:
|
|
82
|
-
- 集成学习:多个决策树的组合
|
|
83
|
-
- 抗过拟合:通过袋外样本和特征随机选择
|
|
84
|
-
- 非线性建模:能够捕捉复杂的非线性关系
|
|
85
|
-
- 特征重要性:提供特征重要性排序
|
|
86
|
-
|
|
87
|
-
💡 使用场景:
|
|
88
|
-
- 复杂非线性关系建模
|
|
89
|
-
- 特征重要性分析
|
|
90
|
-
- 高维数据回归
|
|
91
|
-
- 稳健预测建模
|
|
92
|
-
|
|
93
|
-
⚠️ 注意事项:
|
|
94
|
-
- 计算复杂度较高
|
|
95
|
-
- 需要调整超参数(n_estimators, max_depth)
|
|
96
|
-
- 对异常值相对稳健
|
|
97
|
-
|
|
98
|
-
Args:
|
|
99
|
-
y_data: 因变量数据
|
|
100
|
-
x_data: 自变量数据,二维列表格式
|
|
101
|
-
feature_names: 特征名称列表
|
|
102
|
-
n_estimators: 树的数量,默认100
|
|
103
|
-
max_depth: 最大深度,None表示不限制
|
|
104
|
-
random_state: 随机种子
|
|
105
|
-
|
|
106
|
-
Returns:
|
|
107
|
-
RandomForestResult: 随机森林回归结果
|
|
108
|
-
"""
|
|
109
|
-
# 数据验证
|
|
110
|
-
if not y_data or not x_data:
|
|
111
|
-
raise ValueError("因变量和自变量数据不能为空")
|
|
112
|
-
|
|
113
|
-
if len(y_data) != len(x_data):
|
|
114
|
-
raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")
|
|
115
|
-
|
|
116
|
-
# 准备数据
|
|
117
|
-
X = np.array(x_data)
|
|
118
|
-
y = np.array(y_data)
|
|
119
|
-
|
|
120
|
-
# 特征名称处理
|
|
121
|
-
if feature_names is None:
|
|
122
|
-
feature_names = [f"x{i}" for i in range(X.shape[1])]
|
|
123
|
-
elif len(feature_names) != X.shape[1]:
|
|
124
|
-
raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")
|
|
125
|
-
|
|
126
|
-
# 数据标准化
|
|
127
|
-
scaler = StandardScaler()
|
|
128
|
-
X_scaled = scaler.fit_transform(X)
|
|
129
|
-
|
|
130
|
-
# 训练随机森林模型
|
|
131
|
-
rf_model = RandomForestRegressor(
|
|
132
|
-
n_estimators=n_estimators,
|
|
133
|
-
max_depth=max_depth,
|
|
134
|
-
random_state=random_state,
|
|
135
|
-
oob_score=True
|
|
136
|
-
)
|
|
137
|
-
rf_model.fit(X_scaled, y)
|
|
138
|
-
|
|
139
|
-
# 预测
|
|
140
|
-
y_pred = rf_model.predict(X_scaled)
|
|
141
|
-
|
|
142
|
-
# 计算评估指标
|
|
143
|
-
r2 = r2_score(y, y_pred)
|
|
144
|
-
mse = mean_squared_error(y, y_pred)
|
|
145
|
-
mae = mean_absolute_error(y, y_pred)
|
|
146
|
-
|
|
147
|
-
# 特征重要性
|
|
148
|
-
feature_importance = dict(zip(feature_names, rf_model.feature_importances_))
|
|
149
|
-
|
|
150
|
-
return RandomForestResult(
|
|
151
|
-
model_type="random_forest",
|
|
152
|
-
r2_score=r2,
|
|
153
|
-
mse=mse,
|
|
154
|
-
mae=mae,
|
|
155
|
-
n_obs=len(y),
|
|
156
|
-
feature_names=feature_names,
|
|
157
|
-
feature_importance=feature_importance,
|
|
158
|
-
n_estimators=n_estimators,
|
|
159
|
-
max_depth=max_depth if max_depth is not None else 0, # 0表示无限制
|
|
160
|
-
oob_score=rf_model.oob_score_ if hasattr(rf_model, 'oob_score_') else None
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def gradient_boosting_regression(
|
|
165
|
-
y_data: List[float],
|
|
166
|
-
x_data: List[List[float]],
|
|
167
|
-
feature_names: Optional[List[str]] = None,
|
|
168
|
-
n_estimators: int = 100,
|
|
169
|
-
learning_rate: float = 0.1,
|
|
170
|
-
max_depth: int = 3,
|
|
171
|
-
random_state: int = 42
|
|
172
|
-
) -> GradientBoostingResult:
|
|
173
|
-
"""
|
|
174
|
-
梯度提升树回归
|
|
175
|
-
|
|
176
|
-
📊 功能说明:
|
|
177
|
-
使用梯度提升算法进行回归分析,通过逐步优化残差来提升模型性能。
|
|
178
|
-
|
|
179
|
-
📈 算法特点:
|
|
180
|
-
- 逐步优化:通过梯度下降逐步改进模型
|
|
181
|
-
- 高精度:通常比随机森林有更好的预测精度
|
|
182
|
-
- 正则化:通过学习率和树深度控制过拟合
|
|
183
|
-
- 特征重要性:提供特征重要性排序
|
|
184
|
-
|
|
185
|
-
💡 使用场景:
|
|
186
|
-
- 高精度预测需求
|
|
187
|
-
- 结构化数据建模
|
|
188
|
-
- 竞赛和实际应用
|
|
189
|
-
- 需要精细调优的场景
|
|
190
|
-
|
|
191
|
-
⚠️ 注意事项:
|
|
192
|
-
- 对超参数敏感
|
|
193
|
-
- 训练时间较长
|
|
194
|
-
- 容易过拟合(需要仔细调参)
|
|
195
|
-
|
|
196
|
-
Args:
|
|
197
|
-
y_data: 因变量数据
|
|
198
|
-
x_data: 自变量数据,二维列表格式
|
|
199
|
-
feature_names: 特征名称列表
|
|
200
|
-
n_estimators: 树的数量,默认100
|
|
201
|
-
learning_rate: 学习率,默认0.1
|
|
202
|
-
max_depth: 最大深度,默认3
|
|
203
|
-
random_state: 随机种子
|
|
204
|
-
|
|
205
|
-
Returns:
|
|
206
|
-
GradientBoostingResult: 梯度提升树回归结果
|
|
207
|
-
"""
|
|
208
|
-
# 数据验证
|
|
209
|
-
if not y_data or not x_data:
|
|
210
|
-
raise ValueError("因变量和自变量数据不能为空")
|
|
211
|
-
|
|
212
|
-
if len(y_data) != len(x_data):
|
|
213
|
-
raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")
|
|
214
|
-
|
|
215
|
-
# 准备数据
|
|
216
|
-
X = np.array(x_data)
|
|
217
|
-
y = np.array(y_data)
|
|
218
|
-
|
|
219
|
-
# 特征名称处理
|
|
220
|
-
if feature_names is None:
|
|
221
|
-
feature_names = [f"x{i}" for i in range(X.shape[1])]
|
|
222
|
-
elif len(feature_names) != X.shape[1]:
|
|
223
|
-
raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")
|
|
224
|
-
|
|
225
|
-
# 数据标准化
|
|
226
|
-
scaler = StandardScaler()
|
|
227
|
-
X_scaled = scaler.fit_transform(X)
|
|
228
|
-
|
|
229
|
-
# 训练梯度提升树模型
|
|
230
|
-
gb_model = GradientBoostingRegressor(
|
|
231
|
-
n_estimators=n_estimators,
|
|
232
|
-
learning_rate=learning_rate,
|
|
233
|
-
max_depth=max_depth,
|
|
234
|
-
random_state=random_state
|
|
235
|
-
)
|
|
236
|
-
gb_model.fit(X_scaled, y)
|
|
237
|
-
|
|
238
|
-
# 预测
|
|
239
|
-
y_pred = gb_model.predict(X_scaled)
|
|
240
|
-
|
|
241
|
-
# 计算评估指标
|
|
242
|
-
r2 = r2_score(y, y_pred)
|
|
243
|
-
mse = mean_squared_error(y, y_pred)
|
|
244
|
-
mae = mean_absolute_error(y, y_pred)
|
|
245
|
-
|
|
246
|
-
# 特征重要性
|
|
247
|
-
feature_importance = dict(zip(feature_names, gb_model.feature_importances_))
|
|
248
|
-
|
|
249
|
-
return GradientBoostingResult(
|
|
250
|
-
model_type="gradient_boosting",
|
|
251
|
-
r2_score=r2,
|
|
252
|
-
mse=mse,
|
|
253
|
-
mae=mae,
|
|
254
|
-
n_obs=len(y),
|
|
255
|
-
feature_names=feature_names,
|
|
256
|
-
feature_importance=feature_importance,
|
|
257
|
-
n_estimators=n_estimators,
|
|
258
|
-
learning_rate=learning_rate,
|
|
259
|
-
max_depth=max_depth
|
|
260
|
-
)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
def lasso_regression(
|
|
264
|
-
y_data: List[float],
|
|
265
|
-
x_data: List[List[float]],
|
|
266
|
-
feature_names: Optional[List[str]] = None,
|
|
267
|
-
alpha: float = 1.0,
|
|
268
|
-
random_state: int = 42
|
|
269
|
-
) -> RegularizedRegressionResult:
|
|
270
|
-
"""
|
|
271
|
-
Lasso回归(L1正则化)
|
|
272
|
-
|
|
273
|
-
📊 功能说明:
|
|
274
|
-
使用L1正则化的线性回归,能够进行特征选择和稀疏建模。
|
|
275
|
-
|
|
276
|
-
📈 算法特点:
|
|
277
|
-
- 特征选择:自动将不重要的特征系数压缩为0
|
|
278
|
-
- 稀疏解:产生稀疏的系数向量
|
|
279
|
-
- 可解释性:保留重要特征,去除冗余特征
|
|
280
|
-
- 处理多重共线性:对高度相关的特征进行选择
|
|
281
|
-
|
|
282
|
-
💡 使用场景:
|
|
283
|
-
- 高维数据特征选择
|
|
284
|
-
- 多重共线性问题
|
|
285
|
-
- 稀疏建模需求
|
|
286
|
-
- 可解释性要求高的场景
|
|
287
|
-
|
|
288
|
-
⚠️ 注意事项:
|
|
289
|
-
- 对alpha参数敏感
|
|
290
|
-
- 可能过度压缩重要特征
|
|
291
|
-
- 需要数据标准化
|
|
292
|
-
|
|
293
|
-
Args:
|
|
294
|
-
y_data: 因变量数据
|
|
295
|
-
x_data: 自变量数据,二维列表格式
|
|
296
|
-
feature_names: 特征名称列表
|
|
297
|
-
alpha: 正则化强度,默认1.0
|
|
298
|
-
random_state: 随机种子
|
|
299
|
-
|
|
300
|
-
Returns:
|
|
301
|
-
RegularizedRegressionResult: Lasso回归结果
|
|
302
|
-
"""
|
|
303
|
-
return _regularized_regression(
|
|
304
|
-
y_data, x_data, feature_names, alpha, random_state, "lasso"
|
|
305
|
-
)
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
def ridge_regression(
|
|
309
|
-
y_data: List[float],
|
|
310
|
-
x_data: List[List[float]],
|
|
311
|
-
feature_names: Optional[List[str]] = None,
|
|
312
|
-
alpha: float = 1.0,
|
|
313
|
-
random_state: int = 42
|
|
314
|
-
) -> RegularizedRegressionResult:
|
|
315
|
-
"""
|
|
316
|
-
Ridge回归(L2正则化)
|
|
317
|
-
|
|
318
|
-
📊 功能说明:
|
|
319
|
-
使用L2正则化的线性回归,能够处理多重共线性问题。
|
|
320
|
-
|
|
321
|
-
📈 算法特点:
|
|
322
|
-
- 稳定性:对多重共线性稳健
|
|
323
|
-
- 收缩系数:将所有系数向0收缩
|
|
324
|
-
- 无特征选择:保留所有特征
|
|
325
|
-
- 数值稳定性:改善矩阵条件数
|
|
326
|
-
|
|
327
|
-
💡 使用场景:
|
|
328
|
-
- 多重共线性问题
|
|
329
|
-
- 需要稳定估计的场景
|
|
330
|
-
- 所有特征都可能有贡献的情况
|
|
331
|
-
- 小样本高维数据
|
|
332
|
-
|
|
333
|
-
⚠️ 注意事项:
|
|
334
|
-
- 不进行特征选择
|
|
335
|
-
- 对alpha参数敏感
|
|
336
|
-
- 需要数据标准化
|
|
337
|
-
|
|
338
|
-
Args:
|
|
339
|
-
y_data: 因变量数据
|
|
340
|
-
x_data: 自变量数据,二维列表格式
|
|
341
|
-
feature_names: 特征名称列表
|
|
342
|
-
alpha: 正则化强度,默认1.0
|
|
343
|
-
random_state: 随机种子
|
|
344
|
-
|
|
345
|
-
Returns:
|
|
346
|
-
RegularizedRegressionResult: Ridge回归结果
|
|
347
|
-
"""
|
|
348
|
-
return _regularized_regression(
|
|
349
|
-
y_data, x_data, feature_names, alpha, random_state, "ridge"
|
|
350
|
-
)
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
def _regularized_regression(
|
|
354
|
-
y_data: List[float],
|
|
355
|
-
x_data: List[List[float]],
|
|
356
|
-
feature_names: Optional[List[str]],
|
|
357
|
-
alpha: float,
|
|
358
|
-
random_state: int,
|
|
359
|
-
model_type: str
|
|
360
|
-
) -> RegularizedRegressionResult:
|
|
361
|
-
"""正则化回归内部实现"""
|
|
362
|
-
# 数据验证
|
|
363
|
-
if not y_data or not x_data:
|
|
364
|
-
raise ValueError("因变量和自变量数据不能为空")
|
|
365
|
-
|
|
366
|
-
if len(y_data) != len(x_data):
|
|
367
|
-
raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")
|
|
368
|
-
|
|
369
|
-
# 准备数据
|
|
370
|
-
X = np.array(x_data)
|
|
371
|
-
y = np.array(y_data)
|
|
372
|
-
|
|
373
|
-
# 特征名称处理
|
|
374
|
-
if feature_names is None:
|
|
375
|
-
feature_names = [f"x{i}" for i in range(X.shape[1])]
|
|
376
|
-
elif len(feature_names) != X.shape[1]:
|
|
377
|
-
raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")
|
|
378
|
-
|
|
379
|
-
# 数据标准化
|
|
380
|
-
scaler = StandardScaler()
|
|
381
|
-
X_scaled = scaler.fit_transform(X)
|
|
382
|
-
y_scaled = (y - np.mean(y)) / np.std(y) # 标准化因变量
|
|
383
|
-
|
|
384
|
-
# 选择模型
|
|
385
|
-
if model_type == "lasso":
|
|
386
|
-
model = Lasso(alpha=alpha, random_state=random_state, max_iter=10000)
|
|
387
|
-
elif model_type == "ridge":
|
|
388
|
-
model = Ridge(alpha=alpha, random_state=random_state)
|
|
389
|
-
else:
|
|
390
|
-
raise ValueError(f"不支持的模型类型: {model_type}")
|
|
391
|
-
|
|
392
|
-
# 训练模型
|
|
393
|
-
model.fit(X_scaled, y_scaled)
|
|
394
|
-
|
|
395
|
-
# 预测
|
|
396
|
-
y_pred_scaled = model.predict(X_scaled)
|
|
397
|
-
|
|
398
|
-
# 将预测值转换回原始尺度
|
|
399
|
-
y_pred = y_pred_scaled * np.std(y) + np.mean(y)
|
|
400
|
-
|
|
401
|
-
# 计算评估指标
|
|
402
|
-
r2 = r2_score(y, y_pred)
|
|
403
|
-
mse = mean_squared_error(y, y_pred)
|
|
404
|
-
mae = mean_absolute_error(y, y_pred)
|
|
405
|
-
|
|
406
|
-
# 系数(注意:由于标准化,系数需要适当解释)
|
|
407
|
-
coefficients = dict(zip(feature_names, model.coef_))
|
|
408
|
-
|
|
409
|
-
return RegularizedRegressionResult(
|
|
410
|
-
model_type=model_type,
|
|
411
|
-
r2_score=r2,
|
|
412
|
-
mse=mse,
|
|
413
|
-
mae=mae,
|
|
414
|
-
n_obs=len(y),
|
|
415
|
-
feature_names=feature_names,
|
|
416
|
-
alpha=alpha,
|
|
417
|
-
coefficients=coefficients
|
|
418
|
-
)
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
def cross_validation(
|
|
422
|
-
y_data: List[float],
|
|
423
|
-
x_data: List[List[float]],
|
|
424
|
-
model_type: str = "random_forest",
|
|
425
|
-
cv_folds: int = 5,
|
|
426
|
-
scoring: str = "r2",
|
|
427
|
-
**model_params
|
|
428
|
-
) -> CrossValidationResult:
|
|
429
|
-
"""
|
|
430
|
-
交叉验证
|
|
431
|
-
|
|
432
|
-
📊 功能说明:
|
|
433
|
-
通过交叉验证评估模型的泛化能力和稳定性。
|
|
434
|
-
|
|
435
|
-
📈 验证方法:
|
|
436
|
-
- K折交叉验证:将数据分为K份,轮流使用K-1份训练,1份测试
|
|
437
|
-
- 稳定性评估:通过多次验证评估模型稳定性
|
|
438
|
-
- 泛化能力:评估模型在未见数据上的表现
|
|
439
|
-
|
|
440
|
-
💡 使用场景:
|
|
441
|
-
- 模型选择和比较
|
|
442
|
-
- 超参数调优
|
|
443
|
-
- 评估模型稳定性
|
|
444
|
-
- 防止过拟合
|
|
445
|
-
|
|
446
|
-
⚠️ 注意事项:
|
|
447
|
-
- 计算成本较高
|
|
448
|
-
- 需要足够的数据量
|
|
449
|
-
- 折数选择影响结果稳定性
|
|
450
|
-
|
|
451
|
-
Args:
|
|
452
|
-
y_data: 因变量数据
|
|
453
|
-
x_data: 自变量数据,二维列表格式
|
|
454
|
-
model_type: 模型类型(random_forest, gradient_boosting, lasso, ridge)
|
|
455
|
-
cv_folds: 交叉验证折数,默认5
|
|
456
|
-
scoring: 评分指标,默认"r2"
|
|
457
|
-
**model_params: 模型参数
|
|
458
|
-
|
|
459
|
-
Returns:
|
|
460
|
-
CrossValidationResult: 交叉验证结果
|
|
461
|
-
"""
|
|
462
|
-
# 数据验证
|
|
463
|
-
if not y_data or not x_data:
|
|
464
|
-
raise ValueError("因变量和自变量数据不能为空")
|
|
465
|
-
|
|
466
|
-
if len(y_data) != len(x_data):
|
|
467
|
-
raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")
|
|
468
|
-
|
|
469
|
-
if cv_folds < 2 or cv_folds > len(y_data):
|
|
470
|
-
raise ValueError(f"交叉验证折数应在2到样本数量之间: cv_folds={cv_folds}, n_obs={len(y_data)}")
|
|
471
|
-
|
|
472
|
-
# 准备数据
|
|
473
|
-
X = np.array(x_data)
|
|
474
|
-
y = np.array(y_data)
|
|
475
|
-
|
|
476
|
-
# 数据标准化
|
|
477
|
-
scaler = StandardScaler()
|
|
478
|
-
X_scaled = scaler.fit_transform(X)
|
|
479
|
-
|
|
480
|
-
# 选择模型
|
|
481
|
-
if model_type == "random_forest":
|
|
482
|
-
model = RandomForestRegressor(**model_params)
|
|
483
|
-
elif model_type == "gradient_boosting":
|
|
484
|
-
model = GradientBoostingRegressor(**model_params)
|
|
485
|
-
elif model_type == "lasso":
|
|
486
|
-
model = Lasso(**model_params)
|
|
487
|
-
elif model_type == "ridge":
|
|
488
|
-
model = Ridge(**model_params)
|
|
489
|
-
else:
|
|
490
|
-
raise ValueError(f"不支持的模型类型: {model_type}")
|
|
491
|
-
|
|
492
|
-
# 执行交叉验证
|
|
493
|
-
cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
|
|
494
|
-
cv_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring=scoring)
|
|
495
|
-
|
|
496
|
-
return CrossValidationResult(
|
|
497
|
-
model_type=model_type,
|
|
498
|
-
cv_scores=cv_scores.tolist(),
|
|
499
|
-
mean_score=np.mean(cv_scores),
|
|
500
|
-
std_score=np.std(cv_scores),
|
|
501
|
-
n_splits=cv_folds
|
|
502
|
-
)
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
def feature_importance_analysis(
|
|
506
|
-
y_data: List[float],
|
|
507
|
-
x_data: List[List[float]],
|
|
508
|
-
feature_names: Optional[List[str]] = None,
|
|
509
|
-
method: str = "random_forest",
|
|
510
|
-
top_k: int = 5
|
|
511
|
-
) -> FeatureImportanceResult:
|
|
512
|
-
"""
|
|
513
|
-
特征重要性分析
|
|
514
|
-
|
|
515
|
-
📊 功能说明:
|
|
516
|
-
分析各个特征对预测目标的重要性,帮助理解数据中的关键因素。
|
|
517
|
-
|
|
518
|
-
📈 分析方法:
|
|
519
|
-
- 基于模型:使用机器学习模型计算特征重要性
|
|
520
|
-
- 排序分析:按重要性对特征进行排序
|
|
521
|
-
- 关键特征识别:识别最重要的top-k个特征
|
|
522
|
-
|
|
523
|
-
💡 使用场景:
|
|
524
|
-
- 特征选择和降维
|
|
525
|
-
- 模型可解释性分析
|
|
526
|
-
- 业务洞察提取
|
|
527
|
-
- 数据理解增强
|
|
528
|
-
|
|
529
|
-
⚠️ 注意事项:
|
|
530
|
-
- 不同方法可能给出不同的重要性排序
|
|
531
|
-
- 重要性分数是相对的,不是绝对的
|
|
532
|
-
- 需要结合业务知识解释结果
|
|
533
|
-
|
|
534
|
-
Args:
|
|
535
|
-
y_data: 因变量数据
|
|
536
|
-
x_data: 自变量数据,二维列表格式
|
|
537
|
-
feature_names: 特征名称列表
|
|
538
|
-
method: 分析方法(random_forest, gradient_boosting)
|
|
539
|
-
top_k: 最重要的特征数量,默认5
|
|
540
|
-
|
|
541
|
-
Returns:
|
|
542
|
-
FeatureImportanceResult: 特征重要性分析结果
|
|
543
|
-
"""
|
|
544
|
-
# 数据验证
|
|
545
|
-
if not y_data or not x_data:
|
|
546
|
-
raise ValueError("因变量和自变量数据不能为空")
|
|
547
|
-
|
|
548
|
-
if len(y_data) != len(x_data):
|
|
549
|
-
raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")
|
|
550
|
-
|
|
551
|
-
# 准备数据
|
|
552
|
-
X = np.array(x_data)
|
|
553
|
-
y = np.array(y_data)
|
|
554
|
-
|
|
555
|
-
# 特征名称处理
|
|
556
|
-
if feature_names is None:
|
|
557
|
-
feature_names = [f"x{i}" for i in range(X.shape[1])]
|
|
558
|
-
elif len(feature_names) != X.shape[1]:
|
|
559
|
-
raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")
|
|
560
|
-
|
|
561
|
-
# 数据标准化
|
|
562
|
-
scaler = StandardScaler()
|
|
563
|
-
X_scaled = scaler.fit_transform(X)
|
|
564
|
-
|
|
565
|
-
# 选择模型并计算特征重要性
|
|
566
|
-
if method == "random_forest":
|
|
567
|
-
model = RandomForestRegressor(n_estimators=100, random_state=42)
|
|
568
|
-
elif method == "gradient_boosting":
|
|
569
|
-
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
|
|
570
|
-
else:
|
|
571
|
-
raise ValueError(f"不支持的特征重要性分析方法: {method}")
|
|
572
|
-
|
|
573
|
-
# 训练模型
|
|
574
|
-
model.fit(X_scaled, y)
|
|
575
|
-
|
|
576
|
-
# 获取特征重要性
|
|
577
|
-
importance_scores = model.feature_importances_
|
|
578
|
-
feature_importance = dict(zip(feature_names, importance_scores))
|
|
579
|
-
|
|
580
|
-
# 按重要性排序
|
|
581
|
-
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
|
|
582
|
-
|
|
583
|
-
# 获取最重要的特征
|
|
584
|
-
top_features = [feature for feature, score in sorted_features[:top_k]]
|
|
585
|
-
|
|
586
|
-
return FeatureImportanceResult(
|
|
587
|
-
feature_importance=feature_importance,
|
|
588
|
-
sorted_features=sorted_features,
|
|
589
|
-
top_features=top_features
|
|
590
|
-
)
|
|
591
4
|
|
|
5
|
+
此模块作为统一入口,导出所有机器学习相关功能:
|
|
6
|
+
- ml_models: 数据模型定义
|
|
7
|
+
- ml_ensemble: 集成学习方法(随机森林、梯度提升树)
|
|
8
|
+
- ml_regularization: 正则化回归(Lasso、Ridge)
|
|
9
|
+
- ml_evaluation: 评估和比较功能
|
|
10
|
+
"""
|
|
592
11
|
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
""
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
elif model_name == "lasso":
|
|
643
|
-
result = lasso_regression(y_data, x_data, feature_names)
|
|
644
|
-
elif model_name == "ridge":
|
|
645
|
-
result = ridge_regression(y_data, x_data, feature_names)
|
|
646
|
-
else:
|
|
647
|
-
continue
|
|
648
|
-
|
|
649
|
-
results[model_name] = result.model_dump()
|
|
650
|
-
|
|
651
|
-
except Exception as e:
|
|
652
|
-
print(f"模型 {model_name} 运行失败: {e}")
|
|
653
|
-
continue
|
|
654
|
-
|
|
655
|
-
# 找出最佳模型(基于R²得分)
|
|
656
|
-
best_model = None
|
|
657
|
-
best_r2 = -float('inf')
|
|
658
|
-
|
|
659
|
-
for model_name, result in results.items():
|
|
660
|
-
if result['r2_score'] > best_r2:
|
|
661
|
-
best_r2 = result['r2_score']
|
|
662
|
-
best_model = model_name
|
|
663
|
-
|
|
664
|
-
return {
|
|
665
|
-
"model_results": results,
|
|
666
|
-
"best_model": best_model,
|
|
667
|
-
"best_r2": best_r2,
|
|
668
|
-
"comparison_summary": {
|
|
669
|
-
"total_models": len(results),
|
|
670
|
-
"successful_models": len(results),
|
|
671
|
-
"best_performing": best_model
|
|
672
|
-
}
|
|
673
|
-
}
|
|
12
|
+
# 导入数据模型
|
|
13
|
+
from .ml_models import (
|
|
14
|
+
MLModelResult,
|
|
15
|
+
RandomForestResult,
|
|
16
|
+
GradientBoostingResult,
|
|
17
|
+
RegularizedRegressionResult,
|
|
18
|
+
CrossValidationResult,
|
|
19
|
+
FeatureImportanceResult
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# 导入集成学习方法
|
|
23
|
+
from .ml_ensemble import (
|
|
24
|
+
random_forest_regression,
|
|
25
|
+
gradient_boosting_regression
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# 导入正则化回归
|
|
29
|
+
from .ml_regularization import (
|
|
30
|
+
lasso_regression,
|
|
31
|
+
ridge_regression
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# 导入评估和比较功能
|
|
35
|
+
from .ml_evaluation import (
|
|
36
|
+
cross_validation,
|
|
37
|
+
feature_importance_analysis,
|
|
38
|
+
compare_ml_models
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# 导出所有公共接口
|
|
42
|
+
__all__ = [
|
|
43
|
+
# 数据模型
|
|
44
|
+
"MLModelResult",
|
|
45
|
+
"RandomForestResult",
|
|
46
|
+
"GradientBoostingResult",
|
|
47
|
+
"RegularizedRegressionResult",
|
|
48
|
+
"CrossValidationResult",
|
|
49
|
+
"FeatureImportanceResult",
|
|
50
|
+
# 集成学习
|
|
51
|
+
"random_forest_regression",
|
|
52
|
+
"gradient_boosting_regression",
|
|
53
|
+
# 正则化回归
|
|
54
|
+
"lasso_regression",
|
|
55
|
+
"ridge_regression",
|
|
56
|
+
# 评估和比较
|
|
57
|
+
"cross_validation",
|
|
58
|
+
"feature_importance_analysis",
|
|
59
|
+
"compare_ml_models"
|
|
60
|
+
]
|