aigroup-econ-mcp 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aigroup_econ_mcp/__init__.py +1 -1
- aigroup_econ_mcp/tools/ml_regularization.py +22 -8
- aigroup_econ_mcp/tools/panel_data.py +70 -4
- aigroup_econ_mcp/tools/tool_descriptions.py +176 -30
- aigroup_econ_mcp/tools/tool_handlers.py +454 -19
- {aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/METADATA +2 -2
- {aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/RECORD +10 -10
- {aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/licenses/LICENSE +0 -0
aigroup_econ_mcp/tools/ml_regularization.py CHANGED

@@ -130,36 +130,50 @@ def _regularized_regression(
     elif len(feature_names) != X.shape[1]:
         raise ValueError(f"Number of feature names ({len(feature_names)}) does not match the number of regressors ({X.shape[1]})")
 
-    #
+    # Check data quality
+    if len(y) < 5:
+        warnings.warn(f"⚠️ Warning: small sample ({len(y)} observations); regularized regression may be unstable")
+
+    # Standardize the data - standardize only the regressors, not the dependent variable
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
-    y_scaled = (y - np.mean(y)) / np.std(y)  # standardize the dependent variable
 
     # Select the model
     if model_type == "lasso":
-        model = Lasso(alpha=alpha, random_state=random_state, max_iter=10000)
+        model = Lasso(alpha=alpha, random_state=random_state, max_iter=10000, tol=1e-4)
+        # For Lasso, if alpha is too large, suggest a smaller value
+        if alpha > 10:
+            warnings.warn(f"⚠️ Warning: Lasso regularization parameter alpha={alpha} may be too large; try a smaller value (e.g. 0.1-1.0)")
     elif model_type == "ridge":
         model = Ridge(alpha=alpha, random_state=random_state)
     else:
         raise ValueError(f"Unsupported model type: {model_type}")
 
     # Fit the model
-    model.fit(X_scaled, y_scaled)
+    try:
+        model.fit(X_scaled, y)
+    except Exception as e:
+        raise ValueError(f"{model_type} model fitting failed: {str(e)}. Suggestions: 1) check data quality 2) try a different alpha 3) increase the sample size")
 
     # Predict
-    y_pred_scaled = model.predict(X_scaled)
-
-    # Transform the predictions back to the original scale
-    y_pred = y_pred_scaled * np.std(y) + np.mean(y)
+    y_pred = model.predict(X_scaled)
 
     # Compute evaluation metrics
     r2 = r2_score(y, y_pred)
     mse = mean_squared_error(y, y_pred)
     mae = mean_absolute_error(y, y_pred)
 
+    # Check whether R² is negative
+    if r2 < 0:
+        warnings.warn(f"⚠️ Warning: the {model_type} model has a negative R² ({r2:.4f}), i.e. it performs worse than predicting the mean. Possible causes: 1) very noisy data 2) features unrelated to the target 3) regularization parameter too large 4) sample too small")
+
     # Coefficients (note: because of standardization, interpret them with care)
     coefficients = dict(zip(feature_names, model.coef_))
 
+    # Check whether all coefficients are zero (Lasso over-shrinkage)
+    if model_type == "lasso" and all(abs(coef) < 1e-10 for coef in model.coef_):
+        warnings.warn(f"⚠️ Warning: all Lasso coefficients were shrunk to 0; the regularization parameter alpha={alpha} may be too large, consider decreasing it")
+
     return RegularizedRegressionResult(
         model_type=model_type,
         r2_score=r2,
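The substance of this hunk is a bug fix: 0.6.0 standardized the dependent variable before fitting and rescaled the predictions afterwards, and the two halves of that transformation had drifted apart, which is exactly the kind of mismatch that produces negative R². In 0.8.0 the model is fit against y on its original scale, so model.predict needs no back-transformation. A minimal sketch of the corrected flow on synthetic data (illustrative only, not code from the package):

```python
# Sketch of the 0.8.0 fit/predict flow: standardize the regressors only,
# fit against y on its original scale, predict without rescaling.
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))                      # synthetic regressors
y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + rng.normal(scale=0.1, size=50)

X_scaled = StandardScaler().fit_transform(X)      # standardize X only

model = Lasso(alpha=0.1, max_iter=10000, tol=1e-4)
model.fit(X_scaled, y)                            # y stays on its own scale
y_pred = model.predict(X_scaled)                  # already on y's scale

print(r2_score(y, y_pred))                        # no back-transformation needed
```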
aigroup_econ_mcp/tools/panel_data.py CHANGED

@@ -63,6 +63,23 @@ def prepare_panel_data(
     """
     Prepare data in panel format
 
+    📊 Data format requirements:
+    - Dependent variable (y_data): a list of numbers, e.g. [1.2, 3.4, 5.6, ...]
+    - Regressors (X_data): a two-dimensional list of numbers, e.g. [[1, 2], [3, 4], [5, 6], ...]
+    - Entity IDs (entity_ids): a list of strings identifying the individuals, e.g. ['A', 'A', 'B', 'B', ...]
+    - Time identifiers (time_periods): a list of strings or numbers identifying the time points, e.g. ['2020', '2020', '2021', '2021', ...]
+
+    💡 Usage example:
+    y_data = [10, 12, 8, 9]  # 4 observations
+    X_data = [[1, 2], [2, 3], [1, 1], [2, 2]]  # 2 regressors, 4 observations
+    entity_ids = ['A', 'A', 'B', 'B']  # 2 entities with 2 time points each
+    time_periods = ['2020', '2021', '2020', '2021']  # 2 time points
+
+    ⚠️ Notes:
+    - Make sure every entity has the same number of time points (balanced panel)
+    - Each combination of entity ID and time identifier must be unique
+    - At least 3 entities with at least 2 time points each are recommended
+
     Args:
         y_data: dependent variable data
         X_data: regressor data, a two-dimensional list

@@ -73,13 +90,62 @@ def prepare_panel_data(
     Returns:
         pd.DataFrame: a DataFrame in panel format
     """
-    # Validate the data
+    # Validate the data - give more detailed error messages
+    if not y_data or not X_data or not entity_ids or not time_periods:
+        raise ValueError("None of the inputs may be empty. Please provide: dependent variable (y_data), regressors (X_data), entity IDs (entity_ids), and time identifiers (time_periods)")
+
     if len(y_data) != len(X_data):
-        raise ValueError("
+        raise ValueError(f"Dependent variable and regressors have inconsistent observation counts: y_data has {len(y_data)} observations, X_data has {len(X_data)}")
+
     if len(y_data) != len(entity_ids):
-        raise ValueError("
+        raise ValueError(f"Dependent variable and entity identifiers have inconsistent counts: y_data has {len(y_data)} observations, entity_ids has {len(entity_ids)}")
+
     if len(y_data) != len(time_periods):
-        raise ValueError("
+        raise ValueError(f"Dependent variable and time identifiers have inconsistent counts: y_data has {len(y_data)} observations, time_periods has {len(time_periods)}")
+
+    # Check that the regressor dimensions are consistent
+    if len(X_data) > 0:
+        first_dim = len(X_data[0])
+        for i, x_row in enumerate(X_data):
+            if len(x_row) != first_dim:
+                raise ValueError(f"Inconsistent regressor dimensions: row {i} has {len(x_row)} variables but the first row has {first_dim}")
+
+    # Check panel data balance
+    entity_time_counts = {}
+    for entity, time_period in zip(entity_ids, time_periods):
+        key = (entity, time_period)
+        if key in entity_time_counts:
+            raise ValueError(f"Duplicate entity-time combination: entity '{entity}' has multiple observations at time '{time_period}'")
+        entity_time_counts[key] = True
+
+    # Check the number of time points for each entity
+    entity_counts = {}
+    for entity in entity_ids:
+        entity_counts[entity] = entity_counts.get(entity, 0) + 1
+
+    unique_entities = len(entity_counts)
+    if unique_entities < 2:
+        raise ValueError(f"Panel data needs at least 2 distinct entities; there is currently only {unique_entities}")
+
+    # Check the number of time points
+    time_counts = {}
+    for time_period in time_periods:
+        time_counts[time_period] = time_counts.get(time_period, 0) + 1
+
+    unique_times = len(time_counts)
+    if unique_times < 2:
+        raise ValueError(f"Panel data needs at least 2 distinct time points; there is currently only {unique_times}")
+
+    # Check whether the panel is balanced
+    time_counts_per_entity = {}
+    for entity in set(entity_ids):
+        entity_times = [time for e, time in zip(entity_ids, time_periods) if e == entity]
+        time_counts_per_entity[entity] = len(set(entity_times))
+
+    min_times = min(time_counts_per_entity.values())
+    max_times = max(time_counts_per_entity.values())
+    if min_times != max_times:
+        warnings.warn(f"⚠️ Warning: the panel is unbalanced - entities have different numbers of time points (minimum {min_times}, maximum {max_times}). A balanced panel is recommended for more reliable results.")
 
     # Handle time identifier format compatibility
     processed_time_periods = []
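The new validation is easiest to see in use. Below is a hypothetical call that reuses the example data from the new docstring, plus one that trips the duplicate entity-time check; the import path is inferred from the diff's file layout and the signature from the docstring, so treat both as assumptions:

```python
# Hypothetical usage of prepare_panel_data with the docstring's example data.
from aigroup_econ_mcp.tools.panel_data import prepare_panel_data

y_data = [10, 12, 8, 9]                          # 4 observations
X_data = [[1, 2], [2, 3], [1, 1], [2, 2]]        # 2 regressors per observation
entity_ids = ['A', 'A', 'B', 'B']                # 2 entities
time_periods = ['2020', '2021', '2020', '2021']  # 2 time points each

df = prepare_panel_data(y_data, X_data, entity_ids, time_periods)

# The new validation fails fast on a duplicate entity-time pair:
try:
    prepare_panel_data(y_data, X_data,
                       ['A', 'A', 'A', 'B'],
                       ['2020', '2020', '2021', '2021'])
except ValueError as err:
    print(err)  # duplicate observation for entity 'A' at time '2020'
```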
aigroup_econ_mcp/tools/tool_descriptions.py CHANGED

@@ -570,47 +570,193 @@ VARIANCE_DECOMPOSITION_ANALYSIS = ToolDescription(
 
 RANDOM_FOREST_REGRESSION_ANALYSIS = ToolDescription(
     name="random_forest_regression_analysis",
-    description="
+    description="""Random forest regression analysis
+
+📊 What it does:
+Random forests build many decision trees and ensemble their results, handling complex nonlinear relationships and feature interactions.
+
+📈 Algorithm characteristics:
+- Ensemble learning: multiple decision trees vote on or average the result
+- Robustness: robust to outliers and noisy data
+- Feature importance: importance scores are computed automatically
+- Out-of-bag evaluation: out-of-bag samples are used to evaluate the model
+- Parallel training: supports parallelized training for speed
+
+💡 When to use:
+- Modeling complex nonlinear relationships
+- Feature interaction analysis
+- Robust prediction needs
+- Feature importance assessment
+- Large-dataset processing
+
+⚠️ Caveats:
+- Black-box model with limited interpretability
+- High memory use (with many trees)
+- Training time grows with the number of trees
+- May overfit noisy data
+
+🔧 Parameter suggestions:
+- n_estimators: number of trees, default 100
+  - small datasets: 50-100
+  - large datasets: 100-500
+- max_depth: maximum depth, default None (unlimited)
+  - to control overfitting: 5-15
+  - complex relationships: None (unlimited)
+
+📋 Data requirements:
+- At least 10 samples
+- Numeric and categorical data
+- Missing-value handling supported""",
     field_descriptions={
-        "file_path": "
-        "file_content": "
-        "file_format": "
-        "y_data": "
-        "x_data": "
-        "feature_names": "
-        "n_estimators": "
-        "max_depth": "
-    }
+        "file_path": "Path to a CSV/JSON file. CSV format: the last column is the dependent variable, all other columns are regressors",
+        "file_content": "File content string. JSON format: {'y': [dependent variable], 'x1': [regressor 1], ...}",
+        "file_format": "File format: csv/json/auto",
+        "y_data": "Dependent variable data as a list of numbers, e.g. [1.2, 3.4, 5.6, ...]",
+        "x_data": "Regressor data matrix as a two-dimensional list, e.g. [[1, 2], [3, 4], [5, 6], ...]",
+        "feature_names": "List of regressor names, e.g. ['GDP', 'Population', 'Investment']",
+        "n_estimators": "Number of decision trees, controls model complexity and stability, default 100",
+        "max_depth": "Maximum tree depth, controls overfitting, default None (unlimited)"
+    },
+    examples=[
+        "Predict the nonlinear relationship between house prices and housing features",
+        "Analyze complex interactions between consumer behavior and marketing variables",
+        "Assess the impact of economic indicators on stock returns"
+    ],
+    use_cases=[
+        "Complex nonlinear modeling",
+        "Feature importance analysis",
+        "Robust predictive modeling",
+        "Large-dataset processing",
+        "Ensemble learning applications"
+    ]
 )
 
 GRADIENT_BOOSTING_REGRESSION_ANALYSIS = ToolDescription(
     name="gradient_boosting_regression_analysis",
-    description="
+    description="""Gradient boosting regression analysis
+
+📊 What it does:
+Gradient boosting builds decision trees sequentially, each tree correcting the errors of the previous one, handling complex nonlinear relationships.
+
+📈 Algorithm characteristics:
+- Sequential learning: each tree fits the residuals of the previous tree
+- High accuracy: usually achieves very high predictive accuracy
+- Feature importance: computed automatically
+- Flexible: handles many kinds of data
+- Regularization: built-in regularization guards against overfitting
+
+💡 When to use:
+- High-accuracy prediction needs
+- Complex nonlinear relationships
+- Small-sample, high-dimensional data
+- Competitions and performance-critical scenarios
+- Feature importance analysis
+
+⚠️ Caveats:
+- Sensitive to parameters; careful tuning required
+- Long training time
+- May overfit noisy data
+- High memory use
+
+🔧 Parameter suggestions:
+- n_estimators: number of trees, default 100
+  - small datasets: 50-200
+  - large datasets: 200-1000
+- learning_rate: learning rate, default 0.1
+  - conservative learning: 0.01-0.1
+  - fast convergence: 0.1-0.3
+- max_depth: maximum depth, default 3
+  - simple relationships: 2-4
+  - complex relationships: 5-8
+
+📋 Data requirements:
+- At least 10 samples
+- Numeric and categorical data
+- Standardizing the data is recommended""",
     field_descriptions={
-        "file_path": "
-        "file_content": "
-        "file_format": "
-        "y_data": "
-        "x_data": "
-        "feature_names": "
-        "n_estimators": "
-        "learning_rate": "
-        "max_depth": "
-    }
+        "file_path": "Path to a CSV/JSON file. CSV format: the last column is the dependent variable, all other columns are regressors",
+        "file_content": "File content string. JSON format: {'y': [dependent variable], 'x1': [regressor 1], ...}",
+        "file_format": "File format: csv/json/auto",
+        "y_data": "Dependent variable data as a list of numbers, e.g. [1.2, 3.4, 5.6, ...]",
+        "x_data": "Regressor data matrix as a two-dimensional list, e.g. [[1, 2], [3, 4], [5, 6], ...]",
+        "feature_names": "List of regressor names, e.g. ['GDP', 'Population', 'Investment']",
+        "n_estimators": "Number of boosting stages (trees), controls model complexity, default 100",
+        "learning_rate": "Learning rate, controls the contribution of each tree, default 0.1",
+        "max_depth": "Maximum depth of each regression estimator, controls overfitting, default 3"
+    },
+    examples=[
+        "Predict stock price movements with high accuracy",
+        "Analyze complex relationships among economic indicators",
+        "Predict precise probabilities of consumer purchase behavior",
+        "Competition-grade predictive modeling"
+    ],
+    use_cases=[
+        "High-accuracy predictive modeling",
+        "Complex nonlinear relationship analysis",
+        "Feature importance assessment",
+        "Small-sample, high-dimensional data processing",
+        "Competition-grade model building"
+    ]
 )
 
 LASSO_REGRESSION_ANALYSIS = ToolDescription(
     name="lasso_regression_analysis",
-    description="Lasso
+    description="""Lasso regression analysis
+
+📊 What it does:
+Lasso regression uses L1 regularization for feature selection and sparse modeling, automatically shrinking the coefficients of unimportant features to 0.
+
+📈 Algorithm characteristics:
+- Feature selection: automatically identifies important features and shrinks redundant coefficients to 0
+- Sparse solutions: produces a sparse coefficient vector, improving interpretability
+- Handles multicollinearity: selects among highly correlated features
+- Regularization strength control: the alpha parameter controls how strict feature selection is
+
+💡 When to use:
+- Feature selection in high-dimensional data (more features than samples)
+- Multicollinearity problems
+- Sparse modeling needs
+- Scenarios that demand high interpretability
+- Variable screening and dimensionality reduction
+
+⚠️ Caveats:
+- Sensitive to the alpha parameter; try several values (e.g. 0.01, 0.1, 1.0, 10.0)
+- May over-shrink important features and lose information
+- Requires data standardization
+- A negative R² means the model performs worse than predicting the mean
+- May be unstable with very small samples
+
+🔧 Parameter suggestions:
+- alpha: regularization strength, default 1.0
+  - small alpha (0.01-0.1): light regularization, keeps more features
+  - medium alpha (0.1-1.0): balances feature selection and fit
+  - large alpha (>1.0): strong regularization, shrinks more features
+
+📋 Data requirements:
+- At least 5 samples
+- Numeric data
+- The number of features should ideally not exceed 80% of the number of samples""",
     field_descriptions={
-        "file_path": "
-        "file_content": "
-        "file_format": "
-        "y_data": "
-        "x_data": "
-        "feature_names": "
-        "alpha": "
-    }
+        "file_path": "Path to a CSV/JSON file. CSV format: the last column is the dependent variable, all other columns are regressors",
+        "file_content": "File content string. JSON format: {'y': [dependent variable], 'x1': [regressor 1], ...}",
+        "file_format": "File format: csv/json/auto",
+        "y_data": "Dependent variable data as a list of numbers, e.g. [1.2, 3.4, 5.6, ...]",
+        "x_data": "Regressor data matrix as a two-dimensional list, e.g. [[1, 2], [3, 4], [5, 6], ...]",
+        "feature_names": "List of regressor names, e.g. ['GDP', 'Population', 'Investment']",
+        "alpha": "Regularization strength parameter, controls how strict feature selection is, default 1.0. Trying several values for tuning is recommended"
+    },
+    examples=[
+        "Select the key drivers of GDP growth from 100 economic indicators",
+        "Identify the most important predictors in consumer behavior data",
+        "Build predictive models from highly correlated macroeconomic variables"
+    ],
+    use_cases=[
+        "Feature selection in high-dimensional data",
+        "Ranking variable importance",
+        "Handling multicollinearity",
+        "Sparse linear modeling",
+        "Interpretable machine learning"
+    ]
 )
 
 RIDGE_REGRESSION_ANALYSIS = ToolDescription(
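The Lasso description above tells callers to try several alpha values (0.01, 0.1, 1.0, 10.0). Outside this package, a standard way to automate that search is scikit-learn's LassoCV; the sketch below is illustrative and not part of the diff:

```python
# Cross-validated alpha selection over the grid suggested in the description.
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 10))                   # synthetic data, 10 features
y = 3.0 * X[:, 0] + rng.normal(scale=0.5, size=100)

X_scaled = StandardScaler().fit_transform(X)     # Lasso needs standardized regressors
model = LassoCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5).fit(X_scaled, y)

print(model.alpha_)  # the alpha chosen by cross-validation
print(model.coef_)   # sparse coefficients; irrelevant features shrink to 0
```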
aigroup_econ_mcp/tools/tool_handlers.py CHANGED

@@ -228,35 +228,99 @@ async def handle_correlation_analysis(ctx, data: Dict[str, List[float]],
 
 
 # Panel data handlers
-async def handle_panel_fixed_effects(ctx, y_data, x_data, entity_ids, time_periods,
+async def handle_panel_fixed_effects(ctx, y_data, x_data, entity_ids, time_periods,
                                      feature_names=None, entity_effects=True, time_effects=False, **kwargs):
+    """Handle the fixed effects model - unified output format"""
     result = fixed_effects_model(y_data, x_data, entity_ids, time_periods, feature_names, entity_effects, time_effects)
+
+    # Build the detailed result text
+    result_text = f"""📊 Fixed effects model results
+
+🔍 Model fit:
+- R² = {result.rsquared:.4f}
+- Adjusted R² = {result.rsquared_adj:.4f}
+- F statistic = {result.f_statistic:.4f} (p = {result.f_pvalue:.4f})
+- AIC = {result.aic:.2f}, BIC = {result.bic:.2f}
+- Number of observations = {result.n_obs}
+- Entity effects = {'yes' if result.entity_effects else 'no'}
+- Time effects = {'yes' if result.time_effects else 'no'}
+
+📈 Coefficient details:"""
+
+    # Append the coefficient information
+    for var_name, coef_info in result.coefficients.items():
+        significance = "***" if coef_info["p_value"] < 0.01 else "**" if coef_info["p_value"] < 0.05 else "*" if coef_info["p_value"] < 0.1 else ""
+        result_text += f"\n- {var_name}: {coef_info['coef']:.4f}{significance} (se={coef_info['std_err']:.4f}, p={coef_info['p_value']:.4f})"
+
+    result_text += "\n\n💡 Model notes: the fixed effects model removes fixed individual differences through the within transformation; use it when individuals have unobserved fixed characteristics."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_panel_random_effects(ctx, y_data, x_data, entity_ids, time_periods,
                                       feature_names=None, entity_effects=True, time_effects=False, **kwargs):
+    """Handle the random effects model - unified output format"""
     result = random_effects_model(y_data, x_data, entity_ids, time_periods, feature_names, entity_effects, time_effects)
+
+    # Build the detailed result text
+    result_text = f"""📊 Random effects model results
+
+🔍 Model fit:
+- R² = {result.rsquared:.4f}
+- Adjusted R² = {result.rsquared_adj:.4f}
+- F statistic = {result.f_statistic:.4f} (p = {result.f_pvalue:.4f})
+- AIC = {result.aic:.2f}, BIC = {result.bic:.2f}
+- Number of observations = {result.n_obs}
+- Entity effects = {'yes' if result.entity_effects else 'no'}
+- Time effects = {'yes' if result.time_effects else 'no'}
+
+📈 Coefficient details:"""
+
+    # Append the coefficient information
+    for var_name, coef_info in result.coefficients.items():
+        significance = "***" if coef_info["p_value"] < 0.01 else "**" if coef_info["p_value"] < 0.05 else "*" if coef_info["p_value"] < 0.1 else ""
+        result_text += f"\n- {var_name}: {coef_info['coef']:.4f}{significance} (se={coef_info['std_err']:.4f}, p={coef_info['p_value']:.4f})"
+
+    result_text += "\n\n💡 Model notes: the random effects model assumes individual differences are random; it is more efficient than fixed effects but requires the individual effects to be uncorrelated with the regressors."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_panel_hausman_test(ctx, y_data, x_data, entity_ids, time_periods, feature_names=None, **kwargs):
+    """Handle the Hausman test - unified output format"""
     result = hausman_test(y_data, x_data, entity_ids, time_periods, feature_names)
+
+    result_text = f"""📊 Hausman test results
+
+🔍 Test information:
+- Test statistic = {result.statistic:.4f}
+- p-value = {result.p_value:.4f}
+- Significant = {'yes' if result.significant else 'no'} (5% level)
+
+💡 Model selection advice:
+{result.recommendation}
+
+📋 Decision rule:
+- p-value < 0.05: reject the null hypothesis, choose the fixed effects model
+- p-value >= 0.05: fail to reject the null hypothesis, choose the random effects model
+
+🔬 How the test works: the Hausman test checks whether the individual effects are correlated with the regressors. The null hypothesis is that the random effects model is consistent."""
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_panel_unit_root_test(ctx, **kwargs):
     """
-    Handle the panel unit root test
+    Handle the panel unit root test - unified output format
 
     panel_unit_root_test expects: data, entity_ids, time_periods
     but the panel decorator passes in: y_data, x_data, entity_ids, time_periods
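Both panel handlers above inline the same p-value-to-stars expression twice. A small helper capturing that convention (hypothetical; the diff itself keeps the expression inline):

```python
def significance_stars(p_value: float) -> str:
    """Map a p-value to the conventional significance stars used in the handlers."""
    if p_value < 0.01:
        return "***"
    if p_value < 0.05:
        return "**"
    if p_value < 0.1:
        return "*"
    return ""

assert significance_stars(0.003) == "***"
assert significance_stars(0.07) == "*"
assert significance_stars(0.20) == ""
```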
@@ -280,99 +344,470 @@ async def handle_panel_unit_root_test(ctx, **kwargs):
 
     # Pass only the arguments that panel_unit_root_test needs
     result = panel_unit_root_test(data, entity_ids, time_periods, test_type)
+
+    # Build the detailed result text
+    result_text = f"""📊 Panel unit root test results
+
+🔍 Test information:
+- Test method = {test_type.upper()}
+- Number of entities = {len(set(entity_ids))}
+- Number of time periods = {len(set(time_periods))}
+- Test statistic = {result.statistic:.4f}
+- p-value = {result.p_value:.4f}
+- Stationarity = {'stationary' if result.stationary else 'non-stationary'} (5% level)
+
+📈 Test details:"""
+
+    # Append the test detail information
+    if hasattr(result, 'critical_values'):
+        result_text += f"\n- Critical values: {result.critical_values}"
+    if hasattr(result, 'lags_used'):
+        result_text += f"\n- Lags used: {result.lags_used}"
+    if hasattr(result, 'test_statistic'):
+        result_text += f"\n- Test statistic: {result.test_statistic:.4f}"
+
+    result_text += f"\n\n💡 Test notes: panel unit root tests check whether panel data are stationary, an important pre-test for panel data analysis."
+    result_text += f"\n\n⚠️ Caveat: if the data are non-stationary, difference them or use a panel cointegration test."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 # Time series handlers
 async def handle_var_model(ctx, data, max_lags=5, ic="aic", **kwargs):
+    """Handle VAR model analysis - unified output format"""
     result = var_model(data, max_lags=max_lags, ic=ic)
+
+    # Build the detailed result text
+    result_text = f"""📊 VAR model results
+
+🔍 Basic model information:
+- Optimal lag order = {result.order}
+- Number of variables = {len(result.variables) if hasattr(result, 'variables') else 'unknown'}
+- Information criterion = {ic.upper()}
+- AIC = {result.aic:.2f}
+- BIC = {getattr(result, 'bic', 'N/A')}
+- HQIC = {getattr(result, 'hqic', 'N/A')}
+
+📈 Model diagnostics:"""
+
+    # Append the model diagnostics
+    if hasattr(result, 'residuals_normality'):
+        result_text += f"\n- Residual normality test: {result.residuals_normality}"
+    if hasattr(result, 'serial_correlation'):
+        result_text += f"\n- Serial correlation test: {result.serial_correlation}"
+    if hasattr(result, 'stability'):
+        result_text += f"\n- Model stability: {result.stability}"
+
+    # Append the variable information
+    if hasattr(result, 'variables'):
+        result_text += f"\n\n🔬 Variables analyzed:"
+        for var in result.variables:
+            result_text += f"\n- {var}"
+
+    result_text += f"\n\n💡 Model notes: VAR models analyze the dynamic relationships among multiple time series, capturing mutual influence and lag effects."
+    result_text += f"\n\n⚠️ Caveat: VAR models treat all variables as endogenous; they suit analysis of dynamic interactions among variables."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_vecm_model(ctx, data, coint_rank=1, deterministic="co", max_lags=5, **kwargs):
+    """Handle VECM model analysis - unified output format"""
     result = vecm_model(data, coint_rank=coint_rank, deterministic=deterministic, max_lags=max_lags)
+
+    # Build the detailed result text
+    result_text = f"""📊 VECM model results
+
+🔍 Basic model information:
+- Cointegration rank = {result.coint_rank}
+- Deterministic term type = {deterministic}
+- Maximum lag order = {max_lags}
+- AIC = {result.aic:.2f}
+- BIC = {getattr(result, 'bic', 'N/A')}
+- HQIC = {getattr(result, 'hqic', 'N/A')}
+
+📈 Cointegration analysis:"""
+
+    # Append the cointegration information
+    if hasattr(result, 'coint_relations'):
+        result_text += f"\n- Number of cointegrating relations: {len(result.coint_relations)}"
+        for i, relation in enumerate(result.coint_relations[:3], 1):  # show the first 3 relations
+            result_text += f"\n- Relation {i}: {relation}"
+        if len(result.coint_relations) > 3:
+            result_text += f"\n- ... {len(result.coint_relations) - 3} more cointegrating relations"
+
+    # Append the error-correction information
+    if hasattr(result, 'error_correction'):
+        result_text += f"\n\n🔧 Error-correction mechanism:"
+        result_text += f"\n- Error-correction term significance: {result.error_correction}"
+
+    result_text += f"\n\n💡 Model notes: VECM models analyze the long-run equilibrium among non-stationary time series, with an error-correction mechanism that captures short-run adjustment."
+    result_text += f"\n\n⚠️ Caveat: VECM requires cointegration among the variables; it suits analysis of long-run equilibrium and short-run dynamic adjustment of economic variables."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_garch_model(ctx, data, order=(1, 1), dist="normal", **kwargs):
+    """Handle GARCH model analysis - unified output format"""
     result = garch_model(data, order=order, dist=dist)
+
+    # Build the detailed result text
+    result_text = f"""📊 GARCH model results
+
+🔍 Basic model information:
+- GARCH order = ({order[0]}, {order[1]})
+- Error distribution = {dist}
+- Persistence = {result.persistence:.4f}
+- AIC = {result.aic:.2f}
+- BIC = {getattr(result, 'bic', 'N/A')}
+
+📈 Volatility characteristics:"""
+
+    # Append the volatility information
+    if hasattr(result, 'volatility_persistence'):
+        result_text += f"\n- Volatility persistence: {result.volatility_persistence:.4f}"
+    if hasattr(result, 'unconditional_variance'):
+        result_text += f"\n- Unconditional variance: {result.unconditional_variance:.4f}"
+    if hasattr(result, 'leverage_effect'):
+        result_text += f"\n- Leverage effect: {result.leverage_effect}"
+
+    # Append the model diagnostics
+    if hasattr(result, 'residuals_test'):
+        result_text += f"\n\n🔧 Model diagnostics:"
+        result_text += f"\n- Residual tests: {result.residuals_test}"
+
+    result_text += f"\n\n💡 Model notes: GARCH models capture volatility clustering in financial time series, modeling conditional heteroskedasticity."
+    result_text += f"\n\n⚠️ Caveat: GARCH suits volatility modeling of financial data; the order choice affects how well volatility persistence is captured."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
-async def handle_state_space_model(ctx, data, state_dim=1, observation_dim=1,
+async def handle_state_space_model(ctx, data, state_dim=1, observation_dim=1,
                                    trend=True, seasonal=False, period=12, **kwargs):
+    """Handle state space model analysis - unified output format"""
     result = state_space_model(data, state_dim, observation_dim, trend, seasonal, period)
+
+    # Build the detailed result text
+    result_text = f"""📊 State space model results
+
+🔍 Model structure:
+- State dimension = {result.state_dim}
+- Observation dimension = {result.observation_dim}
+- Trend term = {'included' if result.trend else 'not included'}
+- Seasonal term = {'included' if result.seasonal else 'not included'}
+- Seasonal period = {result.period if result.seasonal else 'N/A'}
+- AIC = {result.aic:.2f}
+- BIC = {getattr(result, 'bic', 'N/A')}
+
+📈 Model fit:"""
+
+    # Append the model fit information
+    if hasattr(result, 'log_likelihood'):
+        result_text += f"\n- Log likelihood: {result.log_likelihood:.2f}"
+    if hasattr(result, 'converged'):
+        result_text += f"\n- Convergence: {'converged' if result.converged else 'not converged'}"
+    if hasattr(result, 'smoothing_error'):
+        result_text += f"\n- Smoothing error: {result.smoothing_error:.4f}"
+
+    result_text += f"\n\n💡 Model notes: state space models analyze the latent states of a time series and their relation to the observations, handling complex dynamic systems."
+    result_text += f"\n\n⚠️ Caveat: state space models suit series with latent states; parameter estimates may be sensitive to initial values."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_variance_decomposition(ctx, data, periods=10, max_lags=5, **kwargs):
+    """Handle variance decomposition analysis - unified output format"""
     result = variance_decomposition(data, periods=periods, max_lags=max_lags)
+
+    # Build the detailed result text
+    result_text = f"""📊 Variance decomposition results
+
+🔍 Analysis settings:
+- Decomposition horizon = {periods}
+- Maximum lag order = {max_lags}
+- Number of variables = {len(result) if isinstance(result, dict) else 'unknown'}
+
+📈 Variance decomposition:"""
+
+    # Append the decomposition results
+    if isinstance(result, dict):
+        for var_name, decomposition in result.items():
+            if isinstance(decomposition, dict):
+                result_text += f"\n\n🔬 Variance sources for variable '{var_name}':"
+                for source, percentage in decomposition.items():
+                    result_text += f"\n- {source}: {percentage:.1f}%"
+            else:
+                result_text += f"\n- {var_name}: {decomposition}"
+    else:
+        result_text += f"\n- Result: {result}"
+
+    result_text += f"\n\n💡 Analysis notes: variance decomposition measures how much each variable contributes to the forecast error variance in a multivariate system."
+    result_text += f"\n\n⚠️ Caveat: the decomposition depends on the chosen lag order; results may differ across horizons."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result
     )
 
 
 # Machine learning handlers
 async def handle_random_forest(ctx, y_data, x_data, feature_names=None, n_estimators=100, max_depth=None, **kwargs):
+    """Handle random forest regression - unified output format"""
     result = random_forest_regression(y_data, x_data, feature_names, n_estimators, max_depth)
+
+    # Check whether R² is negative
+    r2_warning = ""
+    if result.r2_score < 0:
+        r2_warning = f"\n⚠️ Warning: R² is negative ({result.r2_score:.4f}), meaning the model performs worse than predicting the mean. Suggestions: 1) check data quality 2) increase the sample size 3) adjust the model parameters"
+
+    # Build the detailed result text
+    result_text = f"""📊 Random forest regression results
+
+🔍 Model fit:
+- R² = {result.r2_score:.4f}
+- Mean squared error (MSE) = {result.mse:.4f}
+- Mean absolute error (MAE) = {result.mae:.4f}
+- Number of samples = {result.n_obs}
+- Number of trees = {result.n_estimators}
+- Maximum depth = {result.max_depth if result.max_depth else 'unlimited'}
+- Out-of-bag score = {f"{result.oob_score:.4f}" if result.oob_score else 'not computed'}
+{r2_warning}
+
+📈 Feature importance (top 10):"""
+
+    # Append the feature importances, sorted by importance
+    if result.feature_importance:
+        sorted_features = sorted(result.feature_importance.items(), key=lambda x: x[1], reverse=True)
+        for i, (feature, importance) in enumerate(sorted_features[:10]):
+            result_text += f"\n- {feature}: {importance:.4f}"
+        if len(sorted_features) > 10:
+            result_text += f"\n- ... {len(sorted_features) - 10} more features"
+    else:
+        result_text += "\n- Feature importance not computed"
+
+    result_text += f"\n\n💡 Model notes: random forests build many decision trees and ensemble their results, handling nonlinearity and feature interactions; they are robust to outliers and resistant to overfitting."
+    result_text += f"\n\n⚠️ Caveat: random forests are black-box models with limited interpretability, but predictive performance is usually good."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
-async def handle_gradient_boosting(ctx, y_data, x_data, feature_names=None,
+async def handle_gradient_boosting(ctx, y_data, x_data, feature_names=None,
                                    n_estimators=100, learning_rate=0.1, max_depth=3, **kwargs):
+    """Handle gradient boosting regression - unified output format"""
     result = gradient_boosting_regression(y_data, x_data, feature_names, n_estimators, learning_rate, max_depth)
+
+    # Check whether R² is negative
+    r2_warning = ""
+    if result.r2_score < 0:
+        r2_warning = f"\n⚠️ Warning: R² is negative ({result.r2_score:.4f}), meaning the model performs worse than predicting the mean. Suggestions: 1) check data quality 2) increase the sample size 3) adjust the model parameters"
+
+    # Build the detailed result text
+    result_text = f"""📊 Gradient boosting regression results
+
+🔍 Model fit:
+- R² = {result.r2_score:.4f}
+- Mean squared error (MSE) = {result.mse:.4f}
+- Mean absolute error (MAE) = {result.mae:.4f}
+- Number of samples = {result.n_obs}
+- Number of trees = {result.n_estimators}
+- Learning rate = {result.learning_rate}
+- Maximum depth = {result.max_depth}
+{r2_warning}
+
+📈 Feature importance (top 10):"""
+
+    # Append the feature importances, sorted by importance
+    if result.feature_importance:
+        sorted_features = sorted(result.feature_importance.items(), key=lambda x: x[1], reverse=True)
+        for i, (feature, importance) in enumerate(sorted_features[:10]):
+            result_text += f"\n- {feature}: {importance:.4f}"
+        if len(sorted_features) > 10:
+            result_text += f"\n- ... {len(sorted_features) - 10} more features"
+    else:
+        result_text += "\n- Feature importance not computed"
+
+    result_text += f"\n\n💡 Model notes: gradient boosting builds decision trees sequentially, each tree correcting the previous one's errors; it handles complex nonlinear relationships and usually achieves high predictive accuracy."
+    result_text += f"\n\n⚠️ Caveat: gradient boosting is parameter-sensitive and needs careful tuning; training is slow but predictive performance is excellent."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_lasso_regression(ctx, y_data, x_data, feature_names=None, alpha=1.0, **kwargs):
+    """Handle Lasso regression - unified output format"""
     result = lasso_regression(y_data, x_data, feature_names, alpha)
+
+    # Check whether R² is negative
+    r2_warning = ""
+    if result.r2_score < 0:
+        r2_warning = f"\n⚠️ Warning: R² is negative ({result.r2_score:.4f}), meaning the model performs worse than predicting the mean. Suggestions: 1) check data quality 2) try a smaller alpha 3) increase the sample size"
+
+    # Check whether all coefficients are zero
+    coef_warning = ""
+    if all(abs(coef) < 1e-10 for coef in result.coefficients.values()):
+        coef_warning = f"\n⚠️ Warning: all coefficients were shrunk to 0; the regularization parameter alpha={alpha} may be too large, consider decreasing it"
+
+    # Build the detailed result text
+    result_text = f"""📊 Lasso regression results
+
+🔍 Model fit:
+- R² = {result.r2_score:.4f}
+- Mean squared error (MSE) = {result.mse:.4f}
+- Mean absolute error (MAE) = {result.mae:.4f}
+- Number of samples = {result.n_obs}
+- Regularization parameter (alpha) = {result.alpha}
+{r2_warning}{coef_warning}
+
+📈 Coefficient details:"""
+
+    # Append the coefficients, sorted by absolute value
+    sorted_coefficients = sorted(result.coefficients.items(), key=lambda x: abs(x[1]), reverse=True)
+    for var_name, coef in sorted_coefficients:
+        if abs(coef) > 1e-10:  # show only nonzero coefficients
+            result_text += f"\n- {var_name}: {coef:.4f}"
+        else:
+            result_text += f"\n- {var_name}: 0.0000 (shrunk to zero)"
+
+    result_text += f"\n\n💡 Model notes: Lasso regression uses L1 regularization for feature selection, automatically shrinking unimportant coefficients to 0; it suits high-dimensional data and feature selection scenarios."
+    result_text += f"\n\n⚠️ Caveat: because the data are standardized, interpret coefficient magnitudes with care."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_ridge_regression(ctx, y_data, x_data, feature_names=None, alpha=1.0, **kwargs):
+    """Handle Ridge regression - unified output format"""
     result = ridge_regression(y_data, x_data, feature_names, alpha)
+
+    # Check whether R² is negative
+    r2_warning = ""
+    if result.r2_score < 0:
+        r2_warning = f"\n⚠️ Warning: R² is negative ({result.r2_score:.4f}), meaning the model performs worse than predicting the mean. Suggestions: 1) check data quality 2) try a smaller alpha 3) increase the sample size"
+
+    # Build the detailed result text
+    result_text = f"""📊 Ridge regression results
+
+🔍 Model fit:
+- R² = {result.r2_score:.4f}
+- Mean squared error (MSE) = {result.mse:.4f}
+- Mean absolute error (MAE) = {result.mae:.4f}
+- Number of samples = {result.n_obs}
+- Regularization parameter (alpha) = {result.alpha}
+{r2_warning}
+
+📈 Coefficient details:"""
+
+    # Append the coefficients, sorted by absolute value
+    sorted_coefficients = sorted(result.coefficients.items(), key=lambda x: abs(x[1]), reverse=True)
+    for var_name, coef in sorted_coefficients:
+        result_text += f"\n- {var_name}: {coef:.4f}"
+
+    result_text += f"\n\n💡 Model notes: Ridge regression uses L2 regularization to handle multicollinearity, shrinking all coefficients without performing feature selection; it suits scenarios that need stable estimates."
+    result_text += f"\n\n⚠️ Caveat: because the data are standardized, interpret coefficient magnitudes with care."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_cross_validation(ctx, y_data, x_data, model_type="random_forest", cv_folds=5, scoring="r2", **kwargs):
+    """Handle cross-validation - unified output format"""
     result = cross_validation(y_data, x_data, model_type, cv_folds, scoring)
+
+    # Build the detailed result text
+    result_text = f"""📊 Cross-validation results
+
+🔍 Validation information:
+- Model type = {result.model_type}
+- Number of folds = {result.n_splits}
+- Scoring metric = {scoring}
+- Mean score = {result.mean_score:.4f}
+- Score standard deviation = {result.std_score:.4f}
+- Coefficient of variation = {(result.std_score / abs(result.mean_score)) * 100 if result.mean_score != 0 else 0:.2f}%
+
+📈 Per-fold scores:"""
+
+    # Append the per-fold scores
+    for i, score in enumerate(result.cv_scores, 1):
+        result_text += f"\n- Fold {i}: {score:.4f}"
+
+    # Assess model stability
+    stability_assessment = ""
+    cv_threshold = 0.1  # 10% coefficient-of-variation threshold
+    cv_value = (result.std_score / abs(result.mean_score)) if result.mean_score != 0 else 0
+
+    if cv_value < cv_threshold:
+        stability_assessment = f"\n\n✅ Model stability: excellent (coefficient of variation {cv_value*100:.2f}% < {cv_threshold*100:.0f}%)"
+    elif cv_value < cv_threshold * 2:
+        stability_assessment = f"\n\n⚠️ Model stability: fair (coefficient of variation {cv_value*100:.2f}% between {cv_threshold*100:.0f}% and {cv_threshold*2*100:.0f}%)"
+    else:
+        stability_assessment = f"\n\n❌ Model stability: poor (coefficient of variation {cv_value*100:.2f}% > {cv_threshold*2*100:.0f}%)"
+
+    result_text += stability_assessment
+    result_text += f"\n\n💡 Notes: cross-validation splits the data into several subsets for training and testing, assessing the model's generalization and stability."
+    result_text += f"\n\n⚠️ Caveat: a smaller coefficient of variation means a more stable model; prefer models with a coefficient of variation below 10%."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
 
 
 async def handle_feature_importance(ctx, y_data, x_data, feature_names=None, method="random_forest", top_k=5, **kwargs):
+    """Handle feature importance analysis - unified output format"""
     result = feature_importance_analysis(y_data, x_data, feature_names, method, top_k)
+
+    # Build the detailed result text
+    result_text = f"""📊 Feature importance results
+
+🔍 Analysis information:
+- Method = {method}
+- Number of top features shown = {top_k}
+- Total number of features = {len(result.feature_importance)}
+
+📈 Feature importance ranking:"""
+
+    # Append the feature importance information
+    for i, (feature, importance) in enumerate(result.sorted_features[:top_k], 1):
+        percentage = (importance / sum(result.feature_importance.values())) * 100 if sum(result.feature_importance.values()) > 0 else 0
+        result_text += f"\n{i}. {feature}: {importance:.4f} ({percentage:.1f}%)"
+
+    # Append the importance distribution information
+    if len(result.sorted_features) > 0:
+        top_k_importance = sum(imp for _, imp in result.sorted_features[:top_k])
+        total_importance = sum(result.feature_importance.values())
+        top_k_percentage = (top_k_importance / total_importance) * 100 if total_importance > 0 else 0
+
+        result_text += f"\n\n📊 Importance distribution:"
+        result_text += f"\n- Cumulative importance of top {top_k} features: {top_k_percentage:.1f}%"
+        result_text += f"\n- Importance of remaining features: {100 - top_k_percentage:.1f}%"
+
+    result_text += f"\n\n💡 Notes: feature importance analysis identifies the variables that matter most for the prediction target; use it for feature selection and model interpretation."
+    result_text += f"\n\n⚠️ Caveat: different methods may rank importance differently; interpret the results alongside domain knowledge."
+
     return CallToolResult(
-        content=[TextContent(type="text", text=
+        content=[TextContent(type="text", text=result_text)],
         structuredContent=result.model_dump()
     )
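Every handler in this file now follows one shape: build a human-readable result_text, then return it next to the machine-readable payload. A distilled sketch of that pattern; the mcp.types import and the structuredContent field are taken from the diff itself, while the helper name and its arguments are hypothetical:

```python
from mcp.types import CallToolResult, TextContent

def unified_result(title: str, fit_stats: dict, notes: str, payload: dict) -> CallToolResult:
    """Build the unified output shape used throughout tool_handlers.py (sketch)."""
    text = f"📊 {title}\n\n🔍 Model fit:"
    for name, value in fit_stats.items():
        text += f"\n- {name} = {value}"
    text += f"\n\n💡 {notes}"
    return CallToolResult(
        content=[TextContent(type="text", text=text)],
        structuredContent=payload,   # field name as used in the diff
    )
```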
{aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/METADATA CHANGED

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: aigroup-econ-mcp
-Version: 0.6.0
-Summary: Professional econometrics MCP tool -
+Version: 0.8.0
+Summary: Professional econometrics MCP tool - lets large language models run data analysis directly (optimized: unified output format, enhanced model notes)
 Project-URL: Homepage, https://github.com/aigroup/aigroup-econ-mcp
 Project-URL: Repository, https://github.com/aigroup/aigroup-econ-mcp.git
 Project-URL: Issues, https://github.com/aigroup/aigroup-econ-mcp/issues
{aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/RECORD CHANGED

@@ -1,4 +1,4 @@
-aigroup_econ_mcp/__init__.py,sha256=
+aigroup_econ_mcp/__init__.py,sha256=-mhTcro4v3VCeyT-6G6yFwfwcW15xLnKLigWSUdN7Ws,511
 aigroup_econ_mcp/cli.py,sha256=7yeNXWNwMdpUswAO4LsqAvb0EmCO3S6Bs6sl483uSXI,3363
 aigroup_econ_mcp/config.py,sha256=ab5X4-H8isIe2nma0c0AOqlyYgwhf5kfe9Zx5XRrzIo,18876
 aigroup_econ_mcp/server.py,sha256=pmE-n8NwU3xqzJBVdgXBTSSHmt_RDIG9lYMYi8rL9fM,30899

@@ -11,20 +11,20 @@ aigroup_econ_mcp/tools/machine_learning.py,sha256=PpxrJVJw4eND95Wl0uGPEqUHXrIwTU
 aigroup_econ_mcp/tools/ml_ensemble.py,sha256=XOL0PzCsx9LY_pFbKCAxjYdGny-HqEhlZyov2r1l3ww,6475
 aigroup_econ_mcp/tools/ml_evaluation.py,sha256=hiwVW3-N0hnSAJfZW4luOCXt3sTh1W9Hj3CwZLRVaJk,8900
 aigroup_econ_mcp/tools/ml_models.py,sha256=hJEUgARxkqYgJqu6_7eRc1WnD2HcTGxtXf8Jre_XO1U,2137
-aigroup_econ_mcp/tools/ml_regularization.py,sha256=
+aigroup_econ_mcp/tools/ml_regularization.py,sha256=qOWXiOZqB-IUvjHLrgKWWjNhuuxZm_both2aR-OBs1U,6124
 aigroup_econ_mcp/tools/monitoring.py,sha256=-hcw5nu5Q91FmDz39mRBsKavrTmEqXsKfGzlXr_5f0c,16708
 aigroup_econ_mcp/tools/optimized_example.py,sha256=tZVQ2jTzHY_zixTynm4Sq8gj5hz6eWg7MKqNwsxrPoQ,6784
-aigroup_econ_mcp/tools/panel_data.py,sha256=
+aigroup_econ_mcp/tools/panel_data.py,sha256=qFZICvt9Plt2bOvCCgAveVncb_QpHvWzDssdQntKf5M,22696
 aigroup_econ_mcp/tools/regression.py,sha256=uMGRGUQo4mU1sb8fwpP2FpkCqt_e9AtqEtUpInACtJo,6443
 aigroup_econ_mcp/tools/statistics.py,sha256=2cHgNSUXwPYPLxntVOEOL8yF-x92mrgjK-R8kkxDihg,4239
 aigroup_econ_mcp/tools/time_series.py,sha256=LNCO0bYXLPilQ2kSVXA3woNp8ERVq7n3jaoQhWgTCJQ,21763
 aigroup_econ_mcp/tools/timeout.py,sha256=vNnGsR0sXW1xvIbKCF-qPUU3QNDAn_MaQgSxbGxkfW4,8404
-aigroup_econ_mcp/tools/tool_descriptions.py,sha256=
-aigroup_econ_mcp/tools/tool_handlers.py,sha256=
+aigroup_econ_mcp/tools/tool_descriptions.py,sha256=Oj_14_79AB8Ku64mV0cdoV5f2-UFx-0NY3Xxjj6L-1A,32506
+aigroup_econ_mcp/tools/tool_handlers.py,sha256=RUXCB8dYkS2sbn7pKl3WPI70HQHwCDoy0hEmQMJ8rbs,34399
 aigroup_econ_mcp/tools/tool_registry.py,sha256=4SFpMnReZyGfEHCCDnojwHIUEpuQICS9M2u_9xuoUck,4413
 aigroup_econ_mcp/tools/validation.py,sha256=F7LHwog5xtFIMjD9D48kd8jAF5MsZb7wjdrgaOg8EKo,16657
-aigroup_econ_mcp-0.6.0.dist-info/METADATA,sha256=
-aigroup_econ_mcp-0.6.0.dist-info/WHEEL,sha256=
-aigroup_econ_mcp-0.6.0.dist-info/entry_points.txt,sha256=
-aigroup_econ_mcp-0.6.0.dist-info/licenses/LICENSE,sha256=
-aigroup_econ_mcp-0.6.0.dist-info/RECORD,,
+aigroup_econ_mcp-0.8.0.dist-info/METADATA,sha256=7ByVxeiktZPL809uJSH7zKG59f6-1zAzb7uSpxT-Usc,10857
+aigroup_econ_mcp-0.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+aigroup_econ_mcp-0.8.0.dist-info/entry_points.txt,sha256=j5ZJYOc4lAZV-X3XkAuGhzHtIRcJtZ6Gz8ZKPY_QTrM,62
+aigroup_econ_mcp-0.8.0.dist-info/licenses/LICENSE,sha256=DoyCJUWlDzKbqc5KRbFpsGYLwLh-XJRHKQDoITjb1yc,1083
+aigroup_econ_mcp-0.8.0.dist-info/RECORD,,
{aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/WHEEL: file without changes

{aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/entry_points.txt: file without changes

{aigroup_econ_mcp-0.6.0.dist-info → aigroup_econ_mcp-0.8.0.dist-info}/licenses/LICENSE: file without changes