aigroup-econ-mcp 1.3.3__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .gitignore +253 -0
- PKG-INFO +710 -0
- README.md +672 -0
- __init__.py +14 -0
- aigroup_econ_mcp-1.4.3.dist-info/METADATA +710 -0
- aigroup_econ_mcp-1.4.3.dist-info/RECORD +92 -0
- aigroup_econ_mcp-1.4.3.dist-info/entry_points.txt +2 -0
- aigroup_econ_mcp-1.4.3.dist-info/licenses/LICENSE +21 -0
- cli.py +28 -0
- econometrics/README.md +18 -0
- econometrics/__init__.py +191 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +0 -0
- econometrics/basic_parametric_estimation/__init__.py +31 -0
- econometrics/basic_parametric_estimation/gmm/__init__.py +13 -0
- econometrics/basic_parametric_estimation/gmm/gmm_model.py +256 -0
- econometrics/basic_parametric_estimation/mle/__init__.py +13 -0
- econometrics/basic_parametric_estimation/mle/mle_model.py +241 -0
- econometrics/basic_parametric_estimation/ols/__init__.py +13 -0
- econometrics/basic_parametric_estimation/ols/ols_model.py +141 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +0 -0
- econometrics/missing_data/missing_data_measurement_error/__init__.py +0 -0
- econometrics/model_specification_diagnostics_robust_inference/README.md +173 -0
- econometrics/model_specification_diagnostics_robust_inference/__init__.py +78 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/__init__.py +20 -0
- econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/diagnostic_tests_model.py +149 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/gls_model.py +130 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/__init__.py +18 -0
- econometrics/model_specification_diagnostics_robust_inference/model_selection/model_selection_model.py +286 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/regularization/regularization_model.py +177 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/robust_errors/robust_errors_model.py +122 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/simultaneous_equations_model.py +246 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/__init__.py +15 -0
- econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/wls_model.py +127 -0
- econometrics/nonparametric/nonparametric_semiparametric_methods/__init__.py +0 -0
- econometrics/spatial_econometrics/spatial_econometrics_new/__init__.py +0 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +0 -0
- econometrics/specific_data_modeling/survival_duration_data/__init__.py +0 -0
- econometrics/specific_data_modeling/time_series_panel_data/__init__.py +143 -0
- econometrics/specific_data_modeling/time_series_panel_data/arima_model.py +104 -0
- econometrics/specific_data_modeling/time_series_panel_data/cointegration_vecm.py +334 -0
- econometrics/specific_data_modeling/time_series_panel_data/dynamic_panel_models.py +653 -0
- econometrics/specific_data_modeling/time_series_panel_data/exponential_smoothing.py +176 -0
- econometrics/specific_data_modeling/time_series_panel_data/garch_model.py +198 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_diagnostics.py +125 -0
- econometrics/specific_data_modeling/time_series_panel_data/panel_var.py +60 -0
- econometrics/specific_data_modeling/time_series_panel_data/structural_break_tests.py +87 -0
- econometrics/specific_data_modeling/time_series_panel_data/time_varying_parameter_models.py +106 -0
- econometrics/specific_data_modeling/time_series_panel_data/unit_root_tests.py +204 -0
- econometrics/specific_data_modeling/time_series_panel_data/var_svar_model.py +372 -0
- econometrics/statistical_inference/statistical_inference_techniques/__init__.py +0 -0
- econometrics/statistics/distribution_decomposition_methods/__init__.py +0 -0
- econometrics/tests/basic_parametric_estimation_tests/__init__.py +3 -0
- econometrics/tests/basic_parametric_estimation_tests/test_gmm.py +128 -0
- econometrics/tests/basic_parametric_estimation_tests/test_mle.py +127 -0
- econometrics/tests/basic_parametric_estimation_tests/test_ols.py +100 -0
- econometrics/tests/model_specification_diagnostics_tests/__init__.py +3 -0
- econometrics/tests/model_specification_diagnostics_tests/test_diagnostic_tests.py +86 -0
- econometrics/tests/model_specification_diagnostics_tests/test_robust_errors.py +89 -0
- econometrics/tests/specific_data_modeling_tests/__init__.py +3 -0
- econometrics/tests/specific_data_modeling_tests/test_arima.py +98 -0
- econometrics/tests/specific_data_modeling_tests/test_dynamic_panel.py +198 -0
- econometrics/tests/specific_data_modeling_tests/test_exponential_smoothing.py +105 -0
- econometrics/tests/specific_data_modeling_tests/test_garch.py +118 -0
- econometrics/tests/specific_data_modeling_tests/test_unit_root.py +156 -0
- econometrics/tests/specific_data_modeling_tests/test_var.py +124 -0
- prompts/__init__.py +0 -0
- prompts/analysis_guides.py +43 -0
- pyproject.toml +78 -0
- resources/MCP_MASTER_GUIDE.md +422 -0
- resources/MCP_TOOLS_DATA_FORMAT_GUIDE.md +185 -0
- resources/__init__.py +0 -0
- server.py +83 -0
- tools/README.md +88 -0
- tools/__init__.py +45 -0
- tools/data_loader.py +213 -0
- tools/decorators.py +38 -0
- tools/econometrics_adapter.py +286 -0
- tools/mcp_tool_groups/__init__.py +1 -0
- tools/mcp_tool_groups/basic_parametric_tools.py +173 -0
- tools/mcp_tool_groups/model_specification_tools.py +402 -0
- tools/mcp_tool_groups/time_series_tools.py +494 -0
- tools/mcp_tools_registry.py +114 -0
- tools/model_specification_adapter.py +369 -0
- tools/output_formatter.py +563 -0
- tools/time_series_panel_data_adapter.py +858 -0
- tools/time_series_panel_data_tools.py +65 -0
- aigroup_econ_mcp/__init__.py +0 -19
- aigroup_econ_mcp/cli.py +0 -82
- aigroup_econ_mcp/config.py +0 -561
- aigroup_econ_mcp/server.py +0 -452
- aigroup_econ_mcp/tools/__init__.py +0 -19
- aigroup_econ_mcp/tools/base.py +0 -470
- aigroup_econ_mcp/tools/cache.py +0 -533
- aigroup_econ_mcp/tools/data_loader.py +0 -195
- aigroup_econ_mcp/tools/file_parser.py +0 -1027
- aigroup_econ_mcp/tools/machine_learning.py +0 -60
- aigroup_econ_mcp/tools/ml_ensemble.py +0 -210
- aigroup_econ_mcp/tools/ml_evaluation.py +0 -272
- aigroup_econ_mcp/tools/ml_models.py +0 -54
- aigroup_econ_mcp/tools/ml_regularization.py +0 -186
- aigroup_econ_mcp/tools/monitoring.py +0 -555
- aigroup_econ_mcp/tools/optimized_example.py +0 -229
- aigroup_econ_mcp/tools/panel_data.py +0 -619
- aigroup_econ_mcp/tools/regression.py +0 -214
- aigroup_econ_mcp/tools/statistics.py +0 -154
- aigroup_econ_mcp/tools/time_series.py +0 -698
- aigroup_econ_mcp/tools/timeout.py +0 -283
- aigroup_econ_mcp/tools/tool_descriptions.py +0 -410
- aigroup_econ_mcp/tools/tool_handlers.py +0 -1016
- aigroup_econ_mcp/tools/tool_registry.py +0 -478
- aigroup_econ_mcp/tools/validation.py +0 -482
- aigroup_econ_mcp-1.3.3.dist-info/METADATA +0 -525
- aigroup_econ_mcp-1.3.3.dist-info/RECORD +0 -30
- aigroup_econ_mcp-1.3.3.dist-info/entry_points.txt +0 -2
- /aigroup_econ_mcp-1.3.3.dist-info/licenses/LICENSE → /LICENSE +0 -0
- {aigroup_econ_mcp-1.3.3.dist-info → aigroup_econ_mcp-1.4.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
模型诊断测试 (Diagnostic Tests) 模块实现
|
|
3
|
+
|
|
4
|
+
包括各种统计检验方法:
|
|
5
|
+
- 异方差检验(White、Breusch-Pagan)
|
|
6
|
+
- 自相关检验(Durbin-Watson、Ljung-Box)
|
|
7
|
+
- 正态性检验(Jarque-Bera)
|
|
8
|
+
- 多重共线性诊断(VIF)
|
|
9
|
+
- 内生性检验(Durbin-Wu-Hausman)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
from scipy import stats
|
|
18
|
+
import statsmodels.api as sm
|
|
19
|
+
from statsmodels.stats.diagnostic import het_breuschpagan, het_white, acorr_ljungbox
|
|
20
|
+
from statsmodels.stats.stattools import jarque_bera
|
|
21
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
22
|
+
|
|
23
|
+
from tools.decorators import with_file_support_decorator as econometric_tool, validate_input
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DiagnosticTestsResult(BaseModel):
    """Result container for the regression diagnostic test battery.

    Every field is optional and defaults to ``None``.
    """

    # Heteroskedasticity: Breusch-Pagan statistic and p-value.
    het_breuschpagan_stat: Optional[float] = Field(
        default=None, description="Breusch-Pagan异方差检验统计量"
    )
    het_breuschpagan_pvalue: Optional[float] = Field(
        default=None, description="Breusch-Pagan异方差检验p值"
    )

    # Heteroskedasticity: White statistic and p-value.
    het_white_stat: Optional[float] = Field(
        default=None, description="White异方差检验统计量"
    )
    het_white_pvalue: Optional[float] = Field(
        default=None, description="White异方差检验p值"
    )

    # Autocorrelation: Durbin-Watson statistic.
    dw_statistic: Optional[float] = Field(
        default=None, description="Durbin-Watson自相关检验统计量"
    )

    # Normality: Jarque-Bera statistic and p-value.
    jb_statistic: Optional[float] = Field(
        default=None, description="Jarque-Bera正态性检验统计量"
    )
    jb_pvalue: Optional[float] = Field(
        default=None, description="Jarque-Bera正态性检验p值"
    )

    # Multicollinearity: variance inflation factors and the matching names.
    vif_values: Optional[List[float]] = Field(
        default=None, description="方差膨胀因子(VIF)"
    )
    feature_names: Optional[List[str]] = Field(
        default=None, description="特征名称"
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@econometric_tool("diagnostic_tests")
@validate_input(data_type="econometric")
def diagnostic_tests(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    constant: bool = True
) -> DiagnosticTestsResult:
    """Fit an OLS regression and run a battery of diagnostic tests on it.

    The battery consists of the Breusch-Pagan and White heteroskedasticity
    tests, the Durbin-Watson statistic, the Jarque-Bera normality test and
    variance inflation factors (VIF). Each test is attempted independently:
    a test that raises leaves its own fields as ``None`` and does not prevent
    the remaining tests from running.

    Args:
        y_data: Dependent-variable observations.
        x_data: Regressor observations, one inner list per observation.
        feature_names: Optional regressor names. When omitted, names of the
            form ``x{i}`` are generated.
        constant: Whether to add an intercept column before fitting.

    Returns:
        DiagnosticTestsResult: The collected statistics. ``feature_names`` in
        the result excludes the intercept so it lines up with ``vif_values``.
        If the OLS fit itself fails, every statistic is ``None``.
    """
    # Force float64 so integer or mixed inputs do not reach statsmodels.
    y = np.asarray(y_data, dtype=np.float64)
    X = np.asarray(x_data, dtype=np.float64)

    if constant:
        # NOTE(review): sm.add_constant defaults to has_constant='skip', so no
        # column is added when X already contains a constant; the "skip
        # index 0" logic below would then drop a real regressor - confirm
        # that callers never pass a constant column together with constant=True.
        X = sm.add_constant(X)
        if feature_names:
            feature_names = ["const"] + feature_names
        else:
            feature_names = [f"x{i}" for i in range(X.shape[1])]
    elif not feature_names:
        feature_names = [f"x{i}" for i in range(X.shape[1])]

    # Names reported to the caller: drop the intercept label only when an
    # intercept was actually added. The same rule is used on the failure path
    # so that constant=False never loses its first regressor name.
    if constant and feature_names and len(feature_names) > 1:
        reported_names = feature_names[1:]
    else:
        reported_names = feature_names

    try:
        ols_results = sm.OLS(y, X).fit()
    except Exception:
        # Best effort: without a fitted model no diagnostic can be computed,
        # so return a result whose statistics are all None.
        return DiagnosticTestsResult(feature_names=reported_names)

    residuals = ols_results.resid

    # Breusch-Pagan heteroskedasticity test (LM statistic and its p-value).
    try:
        bp_stat, bp_pvalue, _, _ = het_breuschpagan(residuals, X)
        bp_stat = float(bp_stat)
        bp_pvalue = float(bp_pvalue)
    except Exception:
        bp_stat = None
        bp_pvalue = None

    # White heteroskedasticity test (LM statistic and its p-value).
    try:
        white_stat, white_pvalue, _, _ = het_white(residuals, X)
        white_stat = float(white_stat)
        white_pvalue = float(white_pvalue)
    except Exception:
        white_stat = None
        white_pvalue = None

    # Durbin-Watson autocorrelation statistic.
    try:
        dw_stat = float(sm.stats.durbin_watson(residuals))
    except Exception:
        dw_stat = None

    # Jarque-Bera normality test on the residuals.
    try:
        jb_stat, jb_pvalue, _, _ = jarque_bera(residuals)
        jb_stat = float(jb_stat)
        jb_pvalue = float(jb_pvalue)
    except Exception:
        jb_stat = None
        jb_pvalue = None

    # VIF for each regressor; the intercept column (index 0) is skipped.
    try:
        first_regressor = 1 if constant else 0
        vif_values = [
            float(variance_inflation_factor(X, i))
            for i in range(first_regressor, X.shape[1])
        ]
    except Exception:
        vif_values = None

    return DiagnosticTestsResult(
        het_breuschpagan_stat=bp_stat,
        het_breuschpagan_pvalue=bp_pvalue,
        het_white_stat=white_stat,
        het_white_pvalue=white_pvalue,
        dw_statistic=dw_stat,
        jb_statistic=jb_stat,
        jb_pvalue=jb_pvalue,
        vif_values=vif_values,
        feature_names=reported_names
    )
|
econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/gls_model.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""
|
|
2
|
+
广义最小二乘法 (Generalized Least Squares, GLS) 模型实现
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import List, Dict, Any, Optional
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from scipy import stats
|
|
11
|
+
import statsmodels.api as sm
|
|
12
|
+
|
|
13
|
+
from tools.decorators import with_file_support_decorator as econometric_tool, validate_input
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GLSResult(BaseModel):
    """Result container for a generalized least squares (GLS) regression.

    All fields are required.
    """

    # Per-coefficient estimates and inference.
    coefficients: List[float] = Field(default=..., description="回归系数")
    std_errors: List[float] = Field(default=..., description="系数标准误")
    t_values: List[float] = Field(default=..., description="t统计量")
    p_values: List[float] = Field(default=..., description="p值")
    conf_int_lower: List[float] = Field(default=..., description="置信区间下界")
    conf_int_upper: List[float] = Field(default=..., description="置信区间上界")

    # Model-level fit statistics.
    r_squared: float = Field(default=..., description="R方")
    adj_r_squared: float = Field(default=..., description="调整R方")
    f_statistic: float = Field(default=..., description="F统计量")
    f_p_value: float = Field(default=..., description="F统计量p值")

    # Sample size, coefficient labels and log-likelihood.
    n_obs: int = Field(default=..., description="观测数量")
    feature_names: List[str] = Field(default=..., description="特征名称")
    log_likelihood: float = Field(default=..., description="对数似然值")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@econometric_tool("gls_regression")
@validate_input(data_type="econometric")
def gls_regression(
    y_data: List[float],
    x_data: List[List[float]],
    sigma: Optional[List[List[float]]] = None,
    feature_names: Optional[List[str]] = None,
    constant: bool = True,
    confidence_level: float = 0.95
) -> GLSResult:
    """Fit a generalized least squares (GLS) regression.

    Args:
        y_data: Dependent-variable observations.
        x_data: Regressor observations, one inner list per observation.
        sigma: Optional ``n x n`` error covariance matrix. When omitted the
            model is built without one (the original code notes this is
            equivalent to OLS).
        feature_names: Optional regressor names. When omitted, names of the
            form ``x{i}`` are generated; with ``constant=True`` a leading
            ``"const"`` label is prepended to supplied names.
        constant: Whether to add an intercept column before fitting.
        confidence_level: Confidence level for the coefficient intervals;
            must lie strictly between 0 and 1.

    Returns:
        GLSResult: Coefficients, inference statistics and fit measures.

    Raises:
        ValueError: If ``confidence_level`` is outside (0, 1), if there are
            not more observations than columns, if ``sigma`` does not have
            shape ``(n, n)``, or if the model cannot be built or fitted.
    """
    # Reject e.g. 95 (percent) early: it would make alpha negative and the
    # resulting intervals meaningless instead of failing loudly.
    if not 0.0 < confidence_level < 1.0:
        raise ValueError(
            f"置信水平confidence_level必须在0和1之间,当前是{confidence_level}"
        )

    y = np.asarray(y_data, dtype=np.float64)
    X = np.asarray(x_data, dtype=np.float64)

    if constant:
        X = sm.add_constant(X)
        if feature_names:
            feature_names = ["const"] + feature_names
        else:
            feature_names = [f"x{i}" for i in range(X.shape[1])]
    elif not feature_names:
        feature_names = [f"x{i}" for i in range(X.shape[1])]

    # The regression needs strictly more observations than columns.
    n, k = X.shape
    if n <= k:
        raise ValueError(f"观测数量({n})必须大于变量数量({k})")

    sigma_array = None
    if sigma is not None:
        sigma_array = np.asarray(sigma, dtype=np.float64)
        if sigma_array.shape != (n, n):
            raise ValueError(f"协方差矩阵sigma的维度必须是({n}, {n}),当前是{sigma_array.shape}")

    # Build and fit inside one guarded region so that construction errors are
    # reported the same way as fitting errors, with the cause chained.
    try:
        if sigma_array is None:
            model = sm.GLS(y, X)
        else:
            model = sm.GLS(y, X, sigma=sigma_array)
        results = model.fit()
    except Exception as e:
        raise ValueError(f"无法拟合GLS模型: {str(e)}") from e

    # Two-sided intervals at the requested level.
    conf_int = results.conf_int(alpha=1 - confidence_level)

    # A NaN F statistic is reported as F=0, p=1.
    f_statistic = float(results.fvalue) if not np.isnan(results.fvalue) else 0.0
    f_p_value = float(results.f_pvalue) if not np.isnan(results.f_pvalue) else 1.0

    return GLSResult(
        coefficients=results.params.tolist(),
        std_errors=results.bse.tolist(),
        t_values=results.tvalues.tolist(),
        p_values=results.pvalues.tolist(),
        conf_int_lower=conf_int[:, 0].tolist(),
        conf_int_upper=conf_int[:, 1].tolist(),
        r_squared=float(results.rsquared),
        adj_r_squared=float(results.rsquared_adj),
        f_statistic=f_statistic,
        f_p_value=f_p_value,
        n_obs=int(results.nobs),
        feature_names=feature_names,
        log_likelihood=float(results.llf)
    )
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
模型选择 (Model Selection) 模块
|
|
3
|
+
|
|
4
|
+
包括:
|
|
5
|
+
- 信息准则(AIC/BIC/HQIC)
|
|
6
|
+
- 交叉验证(K折、留一法)
|
|
7
|
+
- 格兰杰因果检验
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .model_selection_model import (
    GrangerCausalityResult,
    ModelSelectionResult,
    granger_causality_test,
    model_selection_criteria,
)

# Public API of the model-selection package. The Granger causality helpers
# are exported alongside the information-criteria helpers so that everything
# the package docstring lists is importable from the package itself.
__all__ = [
    "GrangerCausalityResult",
    "ModelSelectionResult",
    "granger_causality_test",
    "model_selection_criteria",
]
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""
|
|
2
|
+
模型选择 (Model Selection) 模块实现
|
|
3
|
+
|
|
4
|
+
包括:
|
|
5
|
+
- 信息准则(AIC/BIC/HQIC)
|
|
6
|
+
- 交叉验证(K折、留一法)
|
|
7
|
+
- 格兰杰因果检验
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
from scipy import stats
|
|
16
|
+
import statsmodels.api as sm
|
|
17
|
+
from statsmodels.tsa.stattools import grangercausalitytests
|
|
18
|
+
|
|
19
|
+
from tools.decorators import with_file_support_decorator as econometric_tool, validate_input
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class GrangerCausalityResult(BaseModel):
    """Result container for a Granger causality test.

    All fields are required.
    """

    # Test statistic and its p-value.
    f_statistic: float = Field(default=..., description="F统计量")
    p_value: float = Field(default=..., description="p值")

    # Lag order tested and the observation count reported with it.
    lag_order: int = Field(default=..., description="滞后阶数")
    n_obs: int = Field(default=..., description="观测数量")

    # Labels of the dependent variable and of the candidate cause.
    dependent_variable: str = Field(default=..., description="因变量")
    independent_variable: str = Field(default=..., description="格兰杰原因变量")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ModelSelectionResult(BaseModel):
    """Result container for model-selection criteria.

    Every field is required except ``cv_score``, which defaults to ``None``.
    """

    # Information criteria.
    aic: float = Field(default=..., description="赤池信息准则 (AIC)")
    bic: float = Field(default=..., description="贝叶斯信息准则 (BIC)")
    hqic: float = Field(default=..., description="汉南-奎因信息准则 (HQIC)")

    # Goodness of fit.
    r_squared: float = Field(default=..., description="R方")
    adj_r_squared: float = Field(default=..., description="调整R方")
    log_likelihood: float = Field(default=..., description="对数似然值")

    # Sample size and number of estimated parameters.
    n_obs: int = Field(default=..., description="观测数量")
    n_params: int = Field(default=..., description="参数数量")

    # Optional cross-validation score.
    cv_score: Optional[float] = Field(default=None, description="交叉验证得分")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@econometric_tool("granger_causality_test")
@validate_input(data_type="timeseries")
def granger_causality_test(
    x_data: List[float],
    y_data: List[float],
    max_lag: int = 1,
    add_constant: bool = True
) -> GrangerCausalityResult:
    """Test whether ``x_data`` Granger-causes ``y_data``.

    The test is delegated to statsmodels' ``grangercausalitytests`` and the
    SSR-based F test at lag ``max_lag`` is reported.

    Args:
        x_data: Candidate cause series.
        y_data: Dependent series.
        max_lag: Lag order whose test result is reported.
        add_constant: Forwarded to statsmodels as ``addconst``.
            NOTE(review): statsmodels appears to implement only
            ``addconst=True`` - confirm; any other value would end in the
            fallback described below.

    Returns:
        GrangerCausalityResult: F statistic, p-value, lag order and the
        observation count ``len(y_data) - max_lag``. If the underlying test
        raises, the fallback values ``f_statistic=0.0`` and ``p_value=1.0``
        are returned instead of an error.

    Raises:
        ValueError: If the two series differ in length, or if the series is
            not longer than ``max_lag``.
    """
    x = np.asarray(x_data, dtype=np.float64)
    y = np.asarray(y_data, dtype=np.float64)

    if len(x) != len(y):
        raise ValueError("x_data和y_data的长度必须相同")

    if len(x) <= max_lag:
        raise ValueError("数据长度必须大于滞后阶数")

    # statsmodels tests whether the SECOND column Granger-causes the FIRST,
    # so the dependent series must come first.
    data = pd.DataFrame({'y': y, 'x': x})
    lag_order = max_lag

    try:
        # The result is a dict keyed by lag; each value is a
        # (tests_dict, fitted_models) pair.
        test_result = grangercausalitytests(data, max_lag, addconst=add_constant, verbose=False)
        test_stats = test_result[lag_order][0]

        # The SSR-based F test is stored under 'ssr_ftest' as
        # (F, p-value, df_denom, df_num). The previous key 'F test' does not
        # exist, which made every call fall through to the fallback below.
        f_stat, p_value = test_stats['ssr_ftest'][:2]
    except Exception:
        # Deliberate best effort: report "no evidence of causality" rather
        # than failing the whole tool call.
        f_stat = 0.0
        p_value = 1.0

    return GrangerCausalityResult(
        f_statistic=float(f_stat),
        p_value=float(p_value),
        lag_order=lag_order,
        n_obs=len(y) - lag_order,  # observations left after lagging
        dependent_variable="y",
        independent_variable="x"
    )
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@econometric_tool("model_selection_criteria")
@validate_input(data_type="econometric")
def model_selection_criteria(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    constant: bool = True,
    cv_folds: Optional[int] = None
) -> ModelSelectionResult:
    """Fit an OLS model and report information criteria for model selection.

    Args:
        y_data: Dependent-variable observations.
        x_data: Regressor observations, one inner list per observation.
        feature_names: Accepted for interface compatibility; the result
            carries no feature names, so the value is not used.
        constant: Whether to add an intercept column before fitting.
        cv_folds: Number of cross-validation folds. ``None`` disables
            cross-validation and ``-1`` requests leave-one-out.

    Returns:
        ModelSelectionResult: AIC, BIC, HQIC, R-squared measures,
        log-likelihood, sample size, parameter count and, when requested and
        computable, the cross-validation score.

    Raises:
        ValueError: If the OLS model cannot be built or fitted.
    """
    # float64 keeps this function consistent with the sibling estimators.
    y = np.asarray(y_data, dtype=np.float64)
    X = np.asarray(x_data, dtype=np.float64)

    if constant:
        X = sm.add_constant(X)

    try:
        results = sm.OLS(y, X).fit()
    except Exception as e:
        raise ValueError(f"无法拟合模型: {str(e)}") from e

    n = int(results.nobs)
    k = len(results.params)
    log_likelihood = float(results.llf)

    # HQIC = -2*llf + 2*k*ln(ln(n)), computed here by hand; defined only
    # for n > 1.
    if n > 1:
        hqic = -2 * log_likelihood + 2 * k * np.log(np.log(n))
    else:
        hqic = np.inf
    # A non-finite value (e.g. from a NaN log-likelihood) is reported as +inf.
    hqic = float(hqic) if np.isfinite(hqic) else float("inf")

    cv_score = None
    if cv_folds is not None:
        # X already includes the intercept column when constant=True.
        cv_score = _cross_validation(y, X, cv_folds)

    return ModelSelectionResult(
        aic=float(results.aic),
        bic=float(results.bic),
        hqic=hqic,
        r_squared=float(results.rsquared),
        adj_r_squared=float(results.rsquared_adj),
        log_likelihood=log_likelihood,
        n_obs=n,
        n_params=k,
        cv_score=float(cv_score) if cv_score is not None else None
    )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _cross_validation(y: np.ndarray, X: np.ndarray, folds: Optional[int]) -> Optional[float]:
    """Estimate out-of-sample error of an OLS fit by K-fold cross-validation.

    Observations are shuffled with a fixed seed (42), split into ``folds``
    contiguous groups, and each group is predicted from a model fitted on the
    remaining observations. Folds that cannot be fitted or that yield a
    non-finite error are skipped.

    Args:
        y: Dependent variable, one value per observation.
        X: Design matrix with one row per observation.
        folds: Number of folds. ``-1`` or any value ``>= len(y)`` selects
            leave-one-out; ``None``, ``0`` and values ``<= 1`` disable
            cross-validation.

    Returns:
        The mean of the per-fold mean squared errors, or ``None`` when
        cross-validation is disabled, the shapes are unusable, or no fold
        produced a finite score.
    """
    n = len(y)

    if folds is None or folds == 0:
        return None

    if folds == -1 or folds >= n:
        # Leave-one-out: one fold per observation.
        folds = n

    if folds <= 1 or X.shape[0] != n:
        return None

    # Fewer observations than columns: not even the full sample can be fitted.
    if X.shape[0] < X.shape[1]:
        return None

    # A private RandomState yields the same permutation as seeding the global
    # generator with 42, without clobbering the caller's random state.
    indices = np.arange(n)
    np.random.RandomState(42).shuffle(indices)

    # The first n % folds folds receive one extra observation.
    fold_sizes = np.full(folds, n // folds)
    fold_sizes[:n % folds] += 1

    # Boundaries are computed up front so that skipping a fold can never
    # shift the boundaries of the folds that follow it.
    stops = np.cumsum(fold_sizes)
    starts = stops - fold_sizes

    mse_scores = []
    for start, stop in zip(starts, stops):
        test_idx = indices[start:stop]
        train_idx = np.concatenate([indices[:start], indices[stop:]])

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Skip folds without enough data to train on or nothing to test.
        if (
            X_train.shape[0] == 0
            or X_test.shape[0] == 0
            or X_train.shape[0] < X_train.shape[1]
        ):
            continue

        try:
            beta_train = _fit_fold_coefficients(X_train, y_train)
            if beta_train is None:
                continue
            y_pred = X_test @ beta_train
            mse = np.mean((y_test - y_pred) ** 2)
        except Exception:
            # Best effort: a numerically broken fold is dropped, not fatal.
            continue

        # A non-finite prediction always produces a non-finite MSE, so this
        # single check also rejects invalid predictions.
        if np.isfinite(mse):
            mse_scores.append(mse)

    return float(np.mean(mse_scores)) if mse_scores else None


def _fit_fold_coefficients(X_train: np.ndarray, y_train: np.ndarray) -> Optional[np.ndarray]:
    """Return OLS coefficients for one training fold, or None without columns."""
    try:
        return sm.OLS(y_train, X_train).fit().params
    except Exception:
        # Fall through to the ridge-stabilised normal equations below.
        pass

    XtX = X_train.T @ X_train
    n_cols = XtX.shape[0]
    if n_cols == 0:
        return None

    # A tiny ridge term, scaled to the average diagonal entry, keeps the
    # normal equations solvable when XtX is singular.
    trace = np.trace(XtX)
    reg_param = 1e-10 * trace / n_cols if trace > 0 else 1e-10
    XtX_reg = XtX + reg_param * np.eye(n_cols)
    Xty = X_train.T @ y_train

    try:
        return np.linalg.solve(XtX_reg, Xty)
    except np.linalg.LinAlgError:
        # Last resort: the pseudo-inverse.
        return np.linalg.pinv(XtX_reg) @ Xty
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
正则化方法 (Regularization Methods) 模块
|
|
3
|
+
|
|
4
|
+
包括岭回归、LASSO和弹性网络等方法,用于处理多重共线性/高维数据
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .regularization_model import RegularizationResult, regularized_regression

# Names re-exported as the public API of the regularization package.
__all__ = ["RegularizationResult", "regularized_regression"]
|