aigroup-econ-mcp: wheel diff, version 0.4.2 (py3-none-any.whl) → version 1.3.3 (py3-none-any.whl)
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- aigroup_econ_mcp/__init__.py +1 -1
- aigroup_econ_mcp/server.py +451 -451
- aigroup_econ_mcp/tools/__init__.py +8 -7
- aigroup_econ_mcp/tools/data_loader.py +51 -27
- aigroup_econ_mcp/tools/file_parser.py +1026 -828
- aigroup_econ_mcp/tools/ml_regularization.py +22 -8
- aigroup_econ_mcp/tools/panel_data.py +70 -4
- aigroup_econ_mcp/tools/time_series.py +53 -22
- aigroup_econ_mcp/tools/tool_descriptions.py +410 -0
- aigroup_econ_mcp/tools/tool_handlers.py +681 -43
- aigroup_econ_mcp/tools/tool_registry.py +328 -20
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +525 -0
- {aigroup_econ_mcp-0.4.2.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/RECORD +16 -15
- aigroup_econ_mcp-0.4.2.dist-info/METADATA +0 -360
- {aigroup_econ_mcp-0.4.2.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-0.4.2.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-0.4.2.dist-info → aigroup_econ_mcp-1.3.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -130,36 +130,50 @@ def _regularized_regression(
|
|
|
130
130
|
elif len(feature_names) != X.shape[1]:
|
|
131
131
|
raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({X.shape[1]})不匹配")
|
|
132
132
|
|
|
133
|
-
#
|
|
133
|
+
# 检查数据质量
|
|
134
|
+
if len(y) < 5:
|
|
135
|
+
warnings.warn(f"⚠️ 警告:样本数量较少({len(y)}个),正则化回归可能不稳定")
|
|
136
|
+
|
|
137
|
+
# 数据标准化 - 只标准化自变量,不标准化因变量
|
|
134
138
|
scaler = StandardScaler()
|
|
135
139
|
X_scaled = scaler.fit_transform(X)
|
|
136
|
-
y_scaled = (y - np.mean(y)) / np.std(y) # 标准化因变量
|
|
137
140
|
|
|
138
141
|
# 选择模型
|
|
139
142
|
if model_type == "lasso":
|
|
140
|
-
model = Lasso(alpha=alpha, random_state=random_state, max_iter=10000)
|
|
143
|
+
model = Lasso(alpha=alpha, random_state=random_state, max_iter=10000, tol=1e-4)
|
|
144
|
+
# 对于Lasso,如果alpha过大,建议使用更小的值
|
|
145
|
+
if alpha > 10:
|
|
146
|
+
warnings.warn(f"⚠️ 警告:Lasso正则化参数alpha={alpha}可能过大,建议尝试更小的值(如0.1-1.0)")
|
|
141
147
|
elif model_type == "ridge":
|
|
142
148
|
model = Ridge(alpha=alpha, random_state=random_state)
|
|
143
149
|
else:
|
|
144
150
|
raise ValueError(f"不支持的模型类型: {model_type}")
|
|
145
151
|
|
|
146
152
|
# 训练模型
|
|
147
|
-
|
|
153
|
+
try:
|
|
154
|
+
model.fit(X_scaled, y)
|
|
155
|
+
except Exception as e:
|
|
156
|
+
raise ValueError(f"{model_type}模型拟合失败: {str(e)}。建议:1) 检查数据质量 2) 尝试不同的alpha值 3) 增加样本数量")
|
|
148
157
|
|
|
149
158
|
# 预测
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
# 将预测值转换回原始尺度
|
|
153
|
-
y_pred = y_pred_scaled * np.std(y) + np.mean(y)
|
|
159
|
+
y_pred = model.predict(X_scaled)
|
|
154
160
|
|
|
155
161
|
# 计算评估指标
|
|
156
162
|
r2 = r2_score(y, y_pred)
|
|
157
163
|
mse = mean_squared_error(y, y_pred)
|
|
158
164
|
mae = mean_absolute_error(y, y_pred)
|
|
159
165
|
|
|
166
|
+
# 检查R²是否为负值
|
|
167
|
+
if r2 < 0:
|
|
168
|
+
warnings.warn(f"⚠️ 警告:{model_type}模型的R²为负值({r2:.4f}),表明模型性能比简单均值预测更差。可能原因:1) 数据噪声过大 2) 特征与目标变量无关 3) 正则化参数过大 4) 样本量过小")
|
|
169
|
+
|
|
160
170
|
# 系数(注意:由于标准化,系数需要适当解释)
|
|
161
171
|
coefficients = dict(zip(feature_names, model.coef_))
|
|
162
172
|
|
|
173
|
+
# 检查系数是否全为0(Lasso过度压缩)
|
|
174
|
+
if model_type == "lasso" and all(abs(coef) < 1e-10 for coef in model.coef_):
|
|
175
|
+
warnings.warn(f"⚠️ 警告:Lasso模型所有系数都被压缩为0,表明正则化参数alpha={alpha}可能过大,建议减小alpha值")
|
|
176
|
+
|
|
163
177
|
return RegularizedRegressionResult(
|
|
164
178
|
model_type=model_type,
|
|
165
179
|
r2_score=r2,
|
|
@@ -63,6 +63,23 @@ def prepare_panel_data(
|
|
|
63
63
|
"""
|
|
64
64
|
准备面板数据格式
|
|
65
65
|
|
|
66
|
+
📊 数据格式要求:
|
|
67
|
+
- 因变量(y_data): 数值列表,如 [1.2, 3.4, 5.6, ...]
|
|
68
|
+
- 自变量(X_data): 二维数值列表,如 [[1, 2], [3, 4], [5, 6], ...]
|
|
69
|
+
- 实体ID(entity_ids): 字符串列表,标识不同个体,如 ['A', 'A', 'B', 'B', ...]
|
|
70
|
+
- 时间标识符(time_periods): 字符串或数值列表,标识时间点,如 ['2020', '2020', '2021', '2021', ...]
|
|
71
|
+
|
|
72
|
+
💡 使用示例:
|
|
73
|
+
y_data = [10, 12, 8, 9] # 4个观测值
|
|
74
|
+
X_data = [[1, 2], [2, 3], [1, 1], [2, 2]] # 2个自变量,4个观测值
|
|
75
|
+
entity_ids = ['A', 'A', 'B', 'B'] # 2个实体,每个实体2个时间点
|
|
76
|
+
time_periods = ['2020', '2021', '2020', '2021'] # 2个时间点
|
|
77
|
+
|
|
78
|
+
⚠️ 注意事项:
|
|
79
|
+
- 确保每个实体有相同的时间点数量(平衡面板)
|
|
80
|
+
- 实体ID和时间标识符的组合必须唯一
|
|
81
|
+
- 建议至少3个实体,每个实体至少2个时间点
|
|
82
|
+
|
|
66
83
|
Args:
|
|
67
84
|
y_data: 因变量数据
|
|
68
85
|
X_data: 自变量数据,二维列表
|
|
@@ -73,13 +90,62 @@ def prepare_panel_data(
|
|
|
73
90
|
Returns:
|
|
74
91
|
pd.DataFrame: 面板数据格式的DataFrame
|
|
75
92
|
"""
|
|
76
|
-
# 数据验证
|
|
93
|
+
# 数据验证 - 提供更详细的错误信息
|
|
94
|
+
if not y_data or not X_data or not entity_ids or not time_periods:
|
|
95
|
+
raise ValueError("所有输入数据都不能为空。请提供:因变量(y_data)、自变量(X_data)、实体ID(entity_ids)、时间标识符(time_periods)")
|
|
96
|
+
|
|
77
97
|
if len(y_data) != len(X_data):
|
|
78
|
-
raise ValueError("
|
|
98
|
+
raise ValueError(f"因变量和自变量的观测数量不一致:因变量有{len(y_data)}个观测值,自变量有{len(X_data)}个观测值")
|
|
99
|
+
|
|
79
100
|
if len(y_data) != len(entity_ids):
|
|
80
|
-
raise ValueError("
|
|
101
|
+
raise ValueError(f"因变量和个体标识符数量不一致:因变量有{len(y_data)}个观测值,实体ID有{len(entity_ids)}个")
|
|
102
|
+
|
|
81
103
|
if len(y_data) != len(time_periods):
|
|
82
|
-
raise ValueError("
|
|
104
|
+
raise ValueError(f"因变量和时间标识符数量不一致:因变量有{len(y_data)}个观测值,时间标识符有{len(time_periods)}个")
|
|
105
|
+
|
|
106
|
+
# 检查自变量维度一致性
|
|
107
|
+
if len(X_data) > 0:
|
|
108
|
+
first_dim = len(X_data[0])
|
|
109
|
+
for i, x_row in enumerate(X_data):
|
|
110
|
+
if len(x_row) != first_dim:
|
|
111
|
+
raise ValueError(f"自变量维度不一致:第{i}行有{len(x_row)}个变量,但第一行有{first_dim}个变量")
|
|
112
|
+
|
|
113
|
+
# 检查面板数据平衡性
|
|
114
|
+
entity_time_counts = {}
|
|
115
|
+
for entity, time_period in zip(entity_ids, time_periods):
|
|
116
|
+
key = (entity, time_period)
|
|
117
|
+
if key in entity_time_counts:
|
|
118
|
+
raise ValueError(f"重复的实体-时间组合:实体 '{entity}' 在时间 '{time_period}' 有多个观测值")
|
|
119
|
+
entity_time_counts[key] = True
|
|
120
|
+
|
|
121
|
+
# 检查每个实体的时间点数量
|
|
122
|
+
entity_counts = {}
|
|
123
|
+
for entity in entity_ids:
|
|
124
|
+
entity_counts[entity] = entity_counts.get(entity, 0) + 1
|
|
125
|
+
|
|
126
|
+
unique_entities = len(entity_counts)
|
|
127
|
+
if unique_entities < 2:
|
|
128
|
+
raise ValueError(f"面板数据需要至少2个不同的实体,当前只有{unique_entities}个")
|
|
129
|
+
|
|
130
|
+
# 检查时间点数量
|
|
131
|
+
time_counts = {}
|
|
132
|
+
for time_period in time_periods:
|
|
133
|
+
time_counts[time_period] = time_counts.get(time_period, 0) + 1
|
|
134
|
+
|
|
135
|
+
unique_times = len(time_counts)
|
|
136
|
+
if unique_times < 2:
|
|
137
|
+
raise ValueError(f"面板数据需要至少2个不同的时间点,当前只有{unique_times}个")
|
|
138
|
+
|
|
139
|
+
# 检查是否为平衡面板
|
|
140
|
+
time_counts_per_entity = {}
|
|
141
|
+
for entity in set(entity_ids):
|
|
142
|
+
entity_times = [time for e, time in zip(entity_ids, time_periods) if e == entity]
|
|
143
|
+
time_counts_per_entity[entity] = len(set(entity_times))
|
|
144
|
+
|
|
145
|
+
min_times = min(time_counts_per_entity.values())
|
|
146
|
+
max_times = max(time_counts_per_entity.values())
|
|
147
|
+
if min_times != max_times:
|
|
148
|
+
warnings.warn(f"⚠️ 警告:面板数据不平衡。不同实体的时间点数量不同(最少{min_times}个,最多{max_times}个)。建议使用平衡面板数据以获得更可靠的结果。")
|
|
83
149
|
|
|
84
150
|
# 处理时间标识符格式兼容性
|
|
85
151
|
processed_time_periods = []
|
|
@@ -13,6 +13,9 @@ from statsmodels.tsa.arima.model import ARIMA
|
|
|
13
13
|
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
|
14
14
|
from statsmodels.tsa.vector_ar.var_model import VAR
|
|
15
15
|
|
|
16
|
+
# 导入超时控制
|
|
17
|
+
from .timeout import with_timeout, TimeoutError
|
|
18
|
+
|
|
16
19
|
|
|
17
20
|
class StationarityTest(BaseModel):
|
|
18
21
|
"""Stationarity test results"""
|
|
@@ -138,6 +141,7 @@ def calculate_acf_pacf(
|
|
|
138
141
|
)
|
|
139
142
|
|
|
140
143
|
|
|
144
|
+
@with_timeout(seconds=60)
|
|
141
145
|
def var_model(
|
|
142
146
|
data: Dict[str, List[float]],
|
|
143
147
|
max_lags: int = 5,
|
|
@@ -170,16 +174,24 @@ def var_model(
|
|
|
170
174
|
if len(df) < min_obs:
|
|
171
175
|
raise ValueError(f"Data length ({len(df)}) insufficient, need at least {min_obs} observations")
|
|
172
176
|
|
|
173
|
-
# 数据平稳性检查
|
|
177
|
+
# 数据平稳性检查 - 优化性能,只检查前几个变量
|
|
174
178
|
from statsmodels.tsa.stattools import adfuller
|
|
175
179
|
stationary_vars = []
|
|
176
|
-
|
|
177
|
-
adf_result = adfuller(df[col].dropna())
|
|
178
|
-
if adf_result[1] < 0.05: # p值 < 0.05 表示平稳
|
|
179
|
-
stationary_vars.append(col)
|
|
180
|
+
max_stationarity_checks = min(5, len(df.columns)) # 最多检查5个变量
|
|
180
181
|
|
|
181
|
-
|
|
182
|
-
|
|
182
|
+
for i, col in enumerate(df.columns):
|
|
183
|
+
if i >= max_stationarity_checks:
|
|
184
|
+
break
|
|
185
|
+
try:
|
|
186
|
+
adf_result = adfuller(df[col].dropna(), maxlag=min(5, len(df)//10))
|
|
187
|
+
if adf_result[1] < 0.05: # p值 < 0.05 表示平稳
|
|
188
|
+
stationary_vars.append(col)
|
|
189
|
+
except:
|
|
190
|
+
# 如果检验失败,假设非平稳
|
|
191
|
+
pass
|
|
192
|
+
|
|
193
|
+
if len(stationary_vars) < max_stationarity_checks:
|
|
194
|
+
print(f"警告: 部分变量可能非平稳,建议进行差分处理")
|
|
183
195
|
|
|
184
196
|
# Fit VAR model
|
|
185
197
|
model = VAR(df)
|
|
@@ -220,11 +232,17 @@ def var_model(
|
|
|
220
232
|
fitted_values[col] = fitted_model.fittedvalues[col].tolist() if col in fitted_model.fittedvalues else []
|
|
221
233
|
residuals[col] = fitted_model.resid[col].tolist() if col in fitted_model.resid else []
|
|
222
234
|
|
|
223
|
-
# Granger causality test
|
|
235
|
+
# Granger causality test - 优化性能,限制测试数量
|
|
224
236
|
granger_causality = {}
|
|
225
|
-
|
|
237
|
+
max_causality_tests = min(3, len(df.columns)) # 最多测试3个变量
|
|
238
|
+
|
|
239
|
+
for i, cause in enumerate(df.columns):
|
|
240
|
+
if i >= max_causality_tests:
|
|
241
|
+
break
|
|
226
242
|
granger_causality[cause] = {}
|
|
227
|
-
for effect in df.columns:
|
|
243
|
+
for j, effect in enumerate(df.columns):
|
|
244
|
+
if j >= max_causality_tests:
|
|
245
|
+
break
|
|
228
246
|
if cause != effect:
|
|
229
247
|
try:
|
|
230
248
|
test_result = fitted_model.test_causality(effect, cause, kind='f')
|
|
@@ -247,6 +265,7 @@ def var_model(
|
|
|
247
265
|
raise ValueError(f"VAR model fitting failed: {str(e)}")
|
|
248
266
|
|
|
249
267
|
|
|
268
|
+
@with_timeout(seconds=30)
|
|
250
269
|
def garch_model(
|
|
251
270
|
data: List[float],
|
|
252
271
|
order: Tuple[int, int] = (1, 1),
|
|
@@ -321,6 +340,7 @@ def garch_model(
|
|
|
321
340
|
raise ValueError(f"GARCH model fitting failed: {str(e)}")
|
|
322
341
|
|
|
323
342
|
|
|
343
|
+
@with_timeout(seconds=45)
|
|
324
344
|
def state_space_model(
|
|
325
345
|
data: List[float],
|
|
326
346
|
state_dim: int = 1,
|
|
@@ -415,6 +435,7 @@ def state_space_model(
|
|
|
415
435
|
|
|
416
436
|
|
|
417
437
|
|
|
438
|
+
@with_timeout(seconds=30)
|
|
418
439
|
def variance_decomposition(
|
|
419
440
|
data: Dict[str, List[float]],
|
|
420
441
|
periods: int = 10,
|
|
@@ -430,16 +451,8 @@ def variance_decomposition(
|
|
|
430
451
|
if len(df) < min_obs:
|
|
431
452
|
raise ValueError(f"数据长度({len(df)})不足,需要至少{min_obs}个观测点")
|
|
432
453
|
|
|
433
|
-
# 数据平稳性检查
|
|
434
|
-
|
|
435
|
-
stationary_vars = []
|
|
436
|
-
for col in df.columns:
|
|
437
|
-
adf_result = adfuller(df[col].dropna())
|
|
438
|
-
if adf_result[1] < 0.05: # p值 < 0.05 表示平稳
|
|
439
|
-
stationary_vars.append(col)
|
|
440
|
-
|
|
441
|
-
if len(stationary_vars) < len(df.columns):
|
|
442
|
-
print(f"警告: 变量 {set(df.columns) - set(stationary_vars)} 可能非平稳,建议进行差分处理")
|
|
454
|
+
# 数据平稳性检查 - 优化性能,跳过检查以提升速度
|
|
455
|
+
print(f"警告: 方差分解跳过平稳性检查以提升性能,请确保数据平稳")
|
|
443
456
|
|
|
444
457
|
# Fit VAR model
|
|
445
458
|
model = VAR(df)
|
|
@@ -461,12 +474,30 @@ def variance_decomposition(
|
|
|
461
474
|
try:
|
|
462
475
|
vd = fitted_model.fevd(periods=periods)
|
|
463
476
|
|
|
464
|
-
# Build variance decomposition results
|
|
477
|
+
# Build variance decomposition results - 兼容不同statsmodels版本
|
|
465
478
|
variance_decomp = {}
|
|
466
479
|
for i, var_name in enumerate(df.columns):
|
|
467
480
|
variance_decomp[var_name] = {}
|
|
468
481
|
for j, shock_name in enumerate(df.columns):
|
|
469
|
-
|
|
482
|
+
try:
|
|
483
|
+
# 新版本statsmodels的访问方式
|
|
484
|
+
if hasattr(vd, 'decomposition'):
|
|
485
|
+
variance_decomp[var_name][shock_name] = vd.decomposition[var_name][shock_name].tolist()
|
|
486
|
+
elif hasattr(vd, 'cova'):
|
|
487
|
+
# 旧版本statsmodels的访问方式
|
|
488
|
+
variance_decomp[var_name][shock_name] = vd.cova[var_name][shock_name].tolist()
|
|
489
|
+
else:
|
|
490
|
+
# 如果无法访问,使用简化方法
|
|
491
|
+
if var_name == shock_name:
|
|
492
|
+
variance_decomp[var_name][shock_name] = [1.0] * periods
|
|
493
|
+
else:
|
|
494
|
+
variance_decomp[var_name][shock_name] = [0.0] * periods
|
|
495
|
+
except Exception as inner_e:
|
|
496
|
+
# 如果单个变量访问失败,使用简化方法
|
|
497
|
+
if var_name == shock_name:
|
|
498
|
+
variance_decomp[var_name][shock_name] = [1.0] * periods
|
|
499
|
+
else:
|
|
500
|
+
variance_decomp[var_name][shock_name] = [0.0] * periods
|
|
470
501
|
except Exception as e:
|
|
471
502
|
print(f"方差分解计算失败,使用简化方法: {e}")
|
|
472
503
|
# 简化实现
|