aigroup-econ-mcp 1.3.3__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. .gitignore +253 -0
  2. PKG-INFO +710 -0
  3. README.md +672 -0
  4. __init__.py +14 -0
  5. aigroup_econ_mcp-1.4.3.dist-info/METADATA +710 -0
  6. aigroup_econ_mcp-1.4.3.dist-info/RECORD +92 -0
  7. aigroup_econ_mcp-1.4.3.dist-info/entry_points.txt +2 -0
  8. aigroup_econ_mcp-1.4.3.dist-info/licenses/LICENSE +21 -0
  9. cli.py +28 -0
  10. econometrics/README.md +18 -0
  11. econometrics/__init__.py +191 -0
  12. econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +0 -0
  13. econometrics/basic_parametric_estimation/__init__.py +31 -0
  14. econometrics/basic_parametric_estimation/gmm/__init__.py +13 -0
  15. econometrics/basic_parametric_estimation/gmm/gmm_model.py +256 -0
  16. econometrics/basic_parametric_estimation/mle/__init__.py +13 -0
  17. econometrics/basic_parametric_estimation/mle/mle_model.py +241 -0
  18. econometrics/basic_parametric_estimation/ols/__init__.py +13 -0
  19. econometrics/basic_parametric_estimation/ols/ols_model.py +141 -0
  20. econometrics/causal_inference/causal_identification_strategy/__init__.py +0 -0
  21. econometrics/missing_data/missing_data_measurement_error/__init__.py +0 -0
  22. econometrics/model_specification_diagnostics_robust_inference/README.md +173 -0
  23. econometrics/model_specification_diagnostics_robust_inference/__init__.py +78 -0
  24. econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/__init__.py +20 -0
  25. econometrics/model_specification_diagnostics_robust_inference/diagnostic_tests/diagnostic_tests_model.py +149 -0
  26. econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/__init__.py +15 -0
  27. econometrics/model_specification_diagnostics_robust_inference/generalized_least_squares/gls_model.py +130 -0
  28. econometrics/model_specification_diagnostics_robust_inference/model_selection/__init__.py +18 -0
  29. econometrics/model_specification_diagnostics_robust_inference/model_selection/model_selection_model.py +286 -0
  30. econometrics/model_specification_diagnostics_robust_inference/regularization/__init__.py +15 -0
  31. econometrics/model_specification_diagnostics_robust_inference/regularization/regularization_model.py +177 -0
  32. econometrics/model_specification_diagnostics_robust_inference/robust_errors/__init__.py +15 -0
  33. econometrics/model_specification_diagnostics_robust_inference/robust_errors/robust_errors_model.py +122 -0
  34. econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/__init__.py +15 -0
  35. econometrics/model_specification_diagnostics_robust_inference/simultaneous_equations/simultaneous_equations_model.py +246 -0
  36. econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/__init__.py +15 -0
  37. econometrics/model_specification_diagnostics_robust_inference/weighted_least_squares/wls_model.py +127 -0
  38. econometrics/nonparametric/nonparametric_semiparametric_methods/__init__.py +0 -0
  39. econometrics/spatial_econometrics/spatial_econometrics_new/__init__.py +0 -0
  40. econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +0 -0
  41. econometrics/specific_data_modeling/survival_duration_data/__init__.py +0 -0
  42. econometrics/specific_data_modeling/time_series_panel_data/__init__.py +143 -0
  43. econometrics/specific_data_modeling/time_series_panel_data/arima_model.py +104 -0
  44. econometrics/specific_data_modeling/time_series_panel_data/cointegration_vecm.py +334 -0
  45. econometrics/specific_data_modeling/time_series_panel_data/dynamic_panel_models.py +653 -0
  46. econometrics/specific_data_modeling/time_series_panel_data/exponential_smoothing.py +176 -0
  47. econometrics/specific_data_modeling/time_series_panel_data/garch_model.py +198 -0
  48. econometrics/specific_data_modeling/time_series_panel_data/panel_diagnostics.py +125 -0
  49. econometrics/specific_data_modeling/time_series_panel_data/panel_var.py +60 -0
  50. econometrics/specific_data_modeling/time_series_panel_data/structural_break_tests.py +87 -0
  51. econometrics/specific_data_modeling/time_series_panel_data/time_varying_parameter_models.py +106 -0
  52. econometrics/specific_data_modeling/time_series_panel_data/unit_root_tests.py +204 -0
  53. econometrics/specific_data_modeling/time_series_panel_data/var_svar_model.py +372 -0
  54. econometrics/statistical_inference/statistical_inference_techniques/__init__.py +0 -0
  55. econometrics/statistics/distribution_decomposition_methods/__init__.py +0 -0
  56. econometrics/tests/basic_parametric_estimation_tests/__init__.py +3 -0
  57. econometrics/tests/basic_parametric_estimation_tests/test_gmm.py +128 -0
  58. econometrics/tests/basic_parametric_estimation_tests/test_mle.py +127 -0
  59. econometrics/tests/basic_parametric_estimation_tests/test_ols.py +100 -0
  60. econometrics/tests/model_specification_diagnostics_tests/__init__.py +3 -0
  61. econometrics/tests/model_specification_diagnostics_tests/test_diagnostic_tests.py +86 -0
  62. econometrics/tests/model_specification_diagnostics_tests/test_robust_errors.py +89 -0
  63. econometrics/tests/specific_data_modeling_tests/__init__.py +3 -0
  64. econometrics/tests/specific_data_modeling_tests/test_arima.py +98 -0
  65. econometrics/tests/specific_data_modeling_tests/test_dynamic_panel.py +198 -0
  66. econometrics/tests/specific_data_modeling_tests/test_exponential_smoothing.py +105 -0
  67. econometrics/tests/specific_data_modeling_tests/test_garch.py +118 -0
  68. econometrics/tests/specific_data_modeling_tests/test_unit_root.py +156 -0
  69. econometrics/tests/specific_data_modeling_tests/test_var.py +124 -0
  70. prompts/__init__.py +0 -0
  71. prompts/analysis_guides.py +43 -0
  72. pyproject.toml +78 -0
  73. resources/MCP_MASTER_GUIDE.md +422 -0
  74. resources/MCP_TOOLS_DATA_FORMAT_GUIDE.md +185 -0
  75. resources/__init__.py +0 -0
  76. server.py +83 -0
  77. tools/README.md +88 -0
  78. tools/__init__.py +45 -0
  79. tools/data_loader.py +213 -0
  80. tools/decorators.py +38 -0
  81. tools/econometrics_adapter.py +286 -0
  82. tools/mcp_tool_groups/__init__.py +1 -0
  83. tools/mcp_tool_groups/basic_parametric_tools.py +173 -0
  84. tools/mcp_tool_groups/model_specification_tools.py +402 -0
  85. tools/mcp_tool_groups/time_series_tools.py +494 -0
  86. tools/mcp_tools_registry.py +114 -0
  87. tools/model_specification_adapter.py +369 -0
  88. tools/output_formatter.py +563 -0
  89. tools/time_series_panel_data_adapter.py +858 -0
  90. tools/time_series_panel_data_tools.py +65 -0
  91. aigroup_econ_mcp/__init__.py +0 -19
  92. aigroup_econ_mcp/cli.py +0 -82
  93. aigroup_econ_mcp/config.py +0 -561
  94. aigroup_econ_mcp/server.py +0 -452
  95. aigroup_econ_mcp/tools/__init__.py +0 -19
  96. aigroup_econ_mcp/tools/base.py +0 -470
  97. aigroup_econ_mcp/tools/cache.py +0 -533
  98. aigroup_econ_mcp/tools/data_loader.py +0 -195
  99. aigroup_econ_mcp/tools/file_parser.py +0 -1027
  100. aigroup_econ_mcp/tools/machine_learning.py +0 -60
  101. aigroup_econ_mcp/tools/ml_ensemble.py +0 -210
  102. aigroup_econ_mcp/tools/ml_evaluation.py +0 -272
  103. aigroup_econ_mcp/tools/ml_models.py +0 -54
  104. aigroup_econ_mcp/tools/ml_regularization.py +0 -186
  105. aigroup_econ_mcp/tools/monitoring.py +0 -555
  106. aigroup_econ_mcp/tools/optimized_example.py +0 -229
  107. aigroup_econ_mcp/tools/panel_data.py +0 -619
  108. aigroup_econ_mcp/tools/regression.py +0 -214
  109. aigroup_econ_mcp/tools/statistics.py +0 -154
  110. aigroup_econ_mcp/tools/time_series.py +0 -698
  111. aigroup_econ_mcp/tools/timeout.py +0 -283
  112. aigroup_econ_mcp/tools/tool_descriptions.py +0 -410
  113. aigroup_econ_mcp/tools/tool_handlers.py +0 -1016
  114. aigroup_econ_mcp/tools/tool_registry.py +0 -478
  115. aigroup_econ_mcp/tools/validation.py +0 -482
  116. aigroup_econ_mcp-1.3.3.dist-info/METADATA +0 -525
  117. aigroup_econ_mcp-1.3.3.dist-info/RECORD +0 -30
  118. aigroup_econ_mcp-1.3.3.dist-info/entry_points.txt +0 -2
  119. /aigroup_econ_mcp-1.3.3.dist-info/licenses/LICENSE → /LICENSE +0 -0
  120. {aigroup_econ_mcp-1.3.3.dist-info → aigroup_econ_mcp-1.4.3.dist-info}/WHEEL +0 -0
@@ -1,60 +0,0 @@
1
- """
2
- 机器学习集成模块 - 统一导出接口
3
- 提供基于scikit-learn的机器学习算法,用于经济数据分析
4
-
5
- 此模块作为统一入口,导出所有机器学习相关功能:
6
- - ml_models: 数据模型定义
7
- - ml_ensemble: 集成学习方法(随机森林、梯度提升树)
8
- - ml_regularization: 正则化回归(Lasso、Ridge)
9
- - ml_evaluation: 评估和比较功能
10
- """
11
-
12
- # 导入数据模型
13
- from .ml_models import (
14
- MLModelResult,
15
- RandomForestResult,
16
- GradientBoostingResult,
17
- RegularizedRegressionResult,
18
- CrossValidationResult,
19
- FeatureImportanceResult
20
- )
21
-
22
- # 导入集成学习方法
23
- from .ml_ensemble import (
24
- random_forest_regression,
25
- gradient_boosting_regression
26
- )
27
-
28
- # 导入正则化回归
29
- from .ml_regularization import (
30
- lasso_regression,
31
- ridge_regression
32
- )
33
-
34
- # 导入评估和比较功能
35
- from .ml_evaluation import (
36
- cross_validation,
37
- feature_importance_analysis,
38
- compare_ml_models
39
- )
40
-
41
- # 导出所有公共接口
42
- __all__ = [
43
- # 数据模型
44
- "MLModelResult",
45
- "RandomForestResult",
46
- "GradientBoostingResult",
47
- "RegularizedRegressionResult",
48
- "CrossValidationResult",
49
- "FeatureImportanceResult",
50
- # 集成学习
51
- "random_forest_regression",
52
- "gradient_boosting_regression",
53
- # 正则化回归
54
- "lasso_regression",
55
- "ridge_regression",
56
- # 评估和比较
57
- "cross_validation",
58
- "feature_importance_analysis",
59
- "compare_ml_models"
60
- ]
@@ -1,210 +0,0 @@
1
- """
2
- 集成学习方法模块
3
- 包含随机森林和梯度提升树回归算法
4
- """
5
-
6
- import numpy as np
7
- from typing import List, Optional
8
- from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
9
- from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
10
- from sklearn.preprocessing import StandardScaler
11
- import warnings
12
- warnings.filterwarnings('ignore')
13
-
14
- from .ml_models import RandomForestResult, GradientBoostingResult
15
-
16
-
17
def random_forest_regression(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    n_estimators: int = 100,
    max_depth: Optional[int] = None,
    random_state: int = 42
) -> RandomForestResult:
    """Fit a random-forest regressor and summarise its in-sample fit.

    The features are standardised with ``StandardScaler``, a
    ``RandomForestRegressor`` (out-of-bag scoring enabled) is trained on the
    full sample, and R², MSE and MAE are computed from predictions on that
    same training sample.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one inner list per
            observation.
        feature_names: Optional column names; defaults to ``x0``, ``x1``, ...
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth; ``None`` leaves the depth unrestricted.
        random_state: Seed forwarded to the estimator.

    Returns:
        RandomForestResult: Fit metrics, per-feature importances, the
        hyper-parameters used and the out-of-bag score.  ``max_depth`` is
        reported as ``0`` when no limit was requested.

    Raises:
        ValueError: If either input is empty, if ``y_data`` and ``x_data``
            differ in length, or if ``feature_names`` does not match the
            number of columns in ``x_data``.
    """
    # --- input validation -------------------------------------------------
    if not (y_data and x_data):
        raise ValueError("因变量和自变量数据不能为空")

    n_targets, n_rows = len(y_data), len(x_data)
    if n_targets != n_rows:
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={n_targets}, x_data={n_rows}")

    features = np.array(x_data)
    target = np.array(y_data)

    # --- feature names: generate defaults or check the supplied ones ------
    n_columns = features.shape[1]
    if feature_names is None:
        feature_names = [f"x{idx}" for idx in range(n_columns)]
    elif len(feature_names) != n_columns:
        raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({n_columns})不匹配")

    # --- standardise, fit, predict on the training sample -----------------
    features_std = StandardScaler().fit_transform(features)

    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        oob_score=True
    )
    forest.fit(features_std, target)
    fitted = forest.predict(features_std)

    importances = {
        name: weight
        for name, weight in zip(feature_names, forest.feature_importances_)
    }

    return RandomForestResult(
        model_type="random_forest",
        r2_score=r2_score(target, fitted),
        mse=mean_squared_error(target, fitted),
        mae=mean_absolute_error(target, fitted),
        n_obs=len(target),
        feature_names=feature_names,
        feature_importance=importances,
        n_estimators=n_estimators,
        # The result model types max_depth as int, so 0 stands for "no limit".
        max_depth=0 if max_depth is None else max_depth,
        oob_score=getattr(forest, 'oob_score_', None)
    )
112
-
113
-
114
def gradient_boosting_regression(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    n_estimators: int = 100,
    learning_rate: float = 0.1,
    max_depth: int = 3,
    random_state: int = 42
) -> GradientBoostingResult:
    """Fit a gradient-boosted tree regressor and summarise its in-sample fit.

    The features are standardised with ``StandardScaler``, a
    ``GradientBoostingRegressor`` is trained on the full sample, and R², MSE
    and MAE are computed from predictions on that same training sample.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one inner list per
            observation.
        feature_names: Optional column names; defaults to ``x0``, ``x1``, ...
        n_estimators: Number of boosting stages.
        learning_rate: Learning rate forwarded to the estimator.
        max_depth: Maximum depth of the individual trees.
        random_state: Seed forwarded to the estimator.

    Returns:
        GradientBoostingResult: Fit metrics, per-feature importances and the
        hyper-parameters used.

    Raises:
        ValueError: If either input is empty, if ``y_data`` and ``x_data``
            differ in length, or if ``feature_names`` does not match the
            number of columns in ``x_data``.
    """
    # --- input validation -------------------------------------------------
    if not (y_data and x_data):
        raise ValueError("因变量和自变量数据不能为空")

    n_targets, n_rows = len(y_data), len(x_data)
    if n_targets != n_rows:
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={n_targets}, x_data={n_rows}")

    features = np.array(x_data)
    target = np.array(y_data)

    # --- feature names: generate defaults or check the supplied ones ------
    n_columns = features.shape[1]
    if feature_names is None:
        feature_names = [f"x{idx}" for idx in range(n_columns)]
    elif len(feature_names) != n_columns:
        raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({n_columns})不匹配")

    # --- standardise, fit, predict on the training sample -----------------
    features_std = StandardScaler().fit_transform(features)

    booster = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state
    )
    booster.fit(features_std, target)
    fitted = booster.predict(features_std)

    importances = {
        name: weight
        for name, weight in zip(feature_names, booster.feature_importances_)
    }

    return GradientBoostingResult(
        model_type="gradient_boosting",
        r2_score=r2_score(target, fitted),
        mse=mean_squared_error(target, fitted),
        mae=mean_absolute_error(target, fitted),
        n_obs=len(target),
        feature_names=feature_names,
        feature_importance=importances,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth
    )
@@ -1,272 +0,0 @@
1
- """
2
- 机器学习评估和比较模块
3
- 包含交叉验证、特征重要性分析和模型比较功能
4
- """
5
-
6
- import numpy as np
7
- from typing import List, Dict, Any, Optional
8
- from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
9
- from sklearn.model_selection import cross_val_score, KFold
10
- from sklearn.preprocessing import StandardScaler
11
- from sklearn.linear_model import Lasso, Ridge
12
- import warnings
13
- warnings.filterwarnings('ignore')
14
-
15
- from .ml_models import CrossValidationResult, FeatureImportanceResult
16
- from .ml_ensemble import random_forest_regression, gradient_boosting_regression
17
- from .ml_regularization import lasso_regression, ridge_regression
18
-
19
-
20
def cross_validation(
    y_data: List[float],
    x_data: List[List[float]],
    model_type: str = "random_forest",
    cv_folds: int = 5,
    scoring: str = "r2",
    **model_params
) -> CrossValidationResult:
    """Score a regression model with shuffled K-fold cross-validation.

    The chosen estimator is wrapped in a pipeline together with a
    ``StandardScaler`` so that the scaler is re-fitted on the training part
    of every fold.  Fitting the scaler once on the whole sample (as an
    earlier version did) lets statistics of the held-out fold leak into the
    preprocessing step and biases the reported scores.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one inner list per
            observation.
        model_type: One of ``"random_forest"``, ``"gradient_boosting"``,
            ``"lasso"`` or ``"ridge"``.
        cv_folds: Number of folds; must lie between 2 and the sample size.
        scoring: Scoring name forwarded to ``cross_val_score``.
        **model_params: Keyword arguments forwarded to the estimator's
            constructor.

    Returns:
        CrossValidationResult: Per-fold scores with their mean and standard
        deviation, plus the model type and number of folds.

    Raises:
        ValueError: If either input is empty, if ``y_data`` and ``x_data``
            differ in length, if ``cv_folds`` is out of range, or if
            ``model_type`` is not supported.
    """
    # Imported locally because the module-level imports do not include
    # sklearn.pipeline.
    from sklearn.pipeline import make_pipeline

    # --- input validation -------------------------------------------------
    if not y_data or not x_data:
        raise ValueError("因变量和自变量数据不能为空")

    if len(y_data) != len(x_data):
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={len(y_data)}, x_data={len(x_data)}")

    if cv_folds < 2 or cv_folds > len(y_data):
        raise ValueError(f"交叉验证折数应在2到样本数量之间: cv_folds={cv_folds}, n_obs={len(y_data)}")

    X = np.array(x_data)
    y = np.array(y_data)

    # --- estimator selection ----------------------------------------------
    if model_type == "random_forest":
        model = RandomForestRegressor(**model_params)
    elif model_type == "gradient_boosting":
        model = GradientBoostingRegressor(**model_params)
    elif model_type == "lasso":
        model = Lasso(**model_params)
    elif model_type == "ridge":
        model = Ridge(**model_params)
    else:
        raise ValueError(f"不支持的模型类型: {model_type}")

    # Scaling lives inside the pipeline so each fold standardises with its
    # own training statistics only.
    estimator = make_pipeline(StandardScaler(), model)

    # Fixed seed keeps the fold assignment reproducible between calls.
    cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)

    return CrossValidationResult(
        model_type=model_type,
        cv_scores=cv_scores.tolist(),
        # Plain floats rather than numpy scalars in the result model.
        mean_score=float(np.mean(cv_scores)),
        std_score=float(np.std(cv_scores)),
        n_splits=cv_folds
    )
102
-
103
-
104
def feature_importance_analysis(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    method: str = "random_forest",
    top_k: int = 5
) -> FeatureImportanceResult:
    """Rank features by the importance a tree ensemble assigns to them.

    The features are standardised with ``StandardScaler``, a 100-tree
    ``RandomForestRegressor`` or ``GradientBoostingRegressor`` (seed 42) is
    fitted on the full sample, and its ``feature_importances_`` are returned
    both as a mapping and as a list sorted in descending order.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one inner list per
            observation.
        feature_names: Optional column names; defaults to ``x0``, ``x1``, ...
        method: ``"random_forest"`` or ``"gradient_boosting"``.
        top_k: How many of the highest-ranked feature names to report.

    Returns:
        FeatureImportanceResult: The importance mapping, the sorted
        ``(name, score)`` pairs and the names of the first ``top_k`` pairs.

    Raises:
        ValueError: If either input is empty, if ``y_data`` and ``x_data``
            differ in length, if ``feature_names`` does not match the number
            of columns, or if ``method`` is not supported.
    """
    # --- input validation -------------------------------------------------
    if not (y_data and x_data):
        raise ValueError("因变量和自变量数据不能为空")

    n_targets, n_rows = len(y_data), len(x_data)
    if n_targets != n_rows:
        raise ValueError(f"因变量和自变量的观测数量不一致: y_data={n_targets}, x_data={n_rows}")

    features = np.array(x_data)
    target = np.array(y_data)

    # --- feature names: generate defaults or check the supplied ones ------
    n_columns = features.shape[1]
    if feature_names is None:
        feature_names = [f"x{idx}" for idx in range(n_columns)]
    elif len(feature_names) != n_columns:
        raise ValueError(f"特征名称数量({len(feature_names)})与自变量数量({n_columns})不匹配")

    features_std = StandardScaler().fit_transform(features)

    # --- pick the ensemble that supplies the importances ------------------
    if method == "random_forest":
        ensemble = RandomForestRegressor(n_estimators=100, random_state=42)
    elif method == "gradient_boosting":
        ensemble = GradientBoostingRegressor(n_estimators=100, random_state=42)
    else:
        raise ValueError(f"不支持的特征重要性分析方法: {method}")

    ensemble.fit(features_std, target)

    importance_by_name = dict(zip(feature_names, ensemble.feature_importances_))

    # Highest importance first.
    ranked = sorted(
        importance_by_name.items(),
        key=lambda pair: pair[1],
        reverse=True
    )
    leading_names = [name for name, _ in ranked[:top_k]]

    return FeatureImportanceResult(
        feature_importance=importance_by_name,
        sorted_features=ranked,
        top_features=leading_names
    )
190
-
191
-
192
def compare_ml_models(
    y_data: List[float],
    x_data: List[List[float]],
    feature_names: Optional[List[str]] = None,
    models: Optional[List[str]] = None
) -> Dict[str, Any]:
    """Run several regression models on the same data and pick the best R².

    Each requested model is fitted with its default hyper-parameters.  A
    model that raises, or whose name is not recognised, does not abort the
    comparison: it is logged, recorded under ``failed_models`` and skipped.

    Args:
        y_data: Dependent-variable observations.
        x_data: Explanatory variables as a 2-D list, one inner list per
            observation.
        feature_names: Optional column names forwarded to every model.
        models: Names of the models to compare; ``None`` compares
            ``random_forest``, ``gradient_boosting``, ``lasso`` and ``ridge``.

    Returns:
        Dict[str, Any]: A dictionary with

        * ``model_results``: dumped result of every model that succeeded,
        * ``best_model`` / ``best_r2``: the winner by R² (``None`` and
          ``-inf`` when no model succeeded),
        * ``failed_models``: mapping of model name to failure reason,
        * ``comparison_summary``: requested, successful and failed counts
          plus the name of the best model.
    """
    # Imported locally: diagnostics go through logging instead of print()
    # so that library code does not write to stdout.
    import logging
    logger = logging.getLogger(__name__)

    if models is None:
        models = ["random_forest", "gradient_boosting", "lasso", "ridge"]

    results = {}
    failed_models = {}

    for model_name in models:
        try:
            if model_name == "random_forest":
                result = random_forest_regression(y_data, x_data, feature_names)
            elif model_name == "gradient_boosting":
                result = gradient_boosting_regression(y_data, x_data, feature_names)
            elif model_name == "lasso":
                result = lasso_regression(y_data, x_data, feature_names)
            elif model_name == "ridge":
                result = ridge_regression(y_data, x_data, feature_names)
            else:
                # Unknown names used to vanish silently; report them instead.
                reason = f"不支持的模型类型: {model_name}"
                logger.warning(reason)
                failed_models[model_name] = reason
                continue

            results[model_name] = result.model_dump()

        except Exception as e:
            # Deliberately best-effort: one failing model must not prevent
            # the remaining ones from being compared.
            logger.warning("模型 %s 运行失败: %s", model_name, e)
            failed_models[model_name] = str(e)
            continue

    # Select the best model by R²; on ties the first one wins.
    best_model = None
    best_r2 = -float('inf')

    for model_name, result in results.items():
        if result['r2_score'] > best_r2:
            best_r2 = result['r2_score']
            best_model = model_name

    return {
        "model_results": results,
        "best_model": best_model,
        "best_r2": best_r2,
        "failed_models": failed_models,
        "comparison_summary": {
            # Number of models requested, not merely the ones that succeeded.
            "total_models": len(models),
            "successful_models": len(results),
            "failed_models": len(failed_models),
            "best_performing": best_model
        }
    }
@@ -1,54 +0,0 @@
1
- """
2
- 机器学习模型数据类定义
3
- 定义各种机器学习算法的结果数据结构
4
- """
5
-
6
- from typing import List, Dict, Any, Optional, Tuple
7
- from pydantic import BaseModel, Field
8
-
9
-
10
class MLModelResult(BaseModel):
    """Base result model holding the fields common to every ML regression."""

    # Name of the fitted algorithm.
    model_type: str = Field(
        description="模型类型",
    )
    # Goodness-of-fit and error metrics.
    r2_score: float = Field(
        description="R²得分",
    )
    mse: float = Field(
        description="均方误差",
    )
    mae: float = Field(
        description="平均绝对误差",
    )
    # Number of observations used.
    n_obs: int = Field(
        description="样本数量",
    )
    feature_names: List[str] = Field(
        description="特征名称",
    )
    # Optional mapping of feature name to importance score.
    feature_importance: Optional[Dict[str, float]] = Field(
        default=None,
        description="特征重要性",
    )
19
-
20
-
21
class RandomForestResult(MLModelResult):
    """Result of a random-forest regression."""

    # Number of trees in the forest.
    n_estimators: int = Field(
        description="树的数量",
    )
    # Maximum tree depth (required int).
    max_depth: int = Field(
        description="最大深度",
    )
    # Out-of-bag score; optional.
    oob_score: Optional[float] = Field(
        default=None,
        description="袋外得分",
    )
26
-
27
-
28
class GradientBoostingResult(MLModelResult):
    """Result of a gradient-boosted tree regression."""

    # Number of boosting stages.
    n_estimators: int = Field(
        description="树的数量",
    )
    learning_rate: float = Field(
        description="学习率",
    )
    # Maximum depth of the individual trees.
    max_depth: int = Field(
        description="最大深度",
    )
33
-
34
-
35
class RegularizedRegressionResult(MLModelResult):
    """Result of a regularised regression."""

    # Regularisation strength.
    alpha: float = Field(
        description="正则化强度",
    )
    # Mapping of name to regression coefficient.
    coefficients: Dict[str, float] = Field(
        description="回归系数",
    )
39
-
40
-
41
class CrossValidationResult(BaseModel):
    """Result of a cross-validation run."""

    # Name of the evaluated algorithm.
    model_type: str = Field(
        description="模型类型",
    )
    # Cross-validation scores with their mean and standard deviation.
    cv_scores: List[float] = Field(
        description="交叉验证得分",
    )
    mean_score: float = Field(
        description="平均得分",
    )
    std_score: float = Field(
        description="标准差",
    )
    # Number of folds used.
    n_splits: int = Field(
        description="交叉验证折数",
    )
48
-
49
-
50
class FeatureImportanceResult(BaseModel):
    """Result of a feature-importance analysis."""

    # Mapping of feature name to importance score.
    feature_importance: Dict[str, float] = Field(
        description="特征重要性分数",
    )
    # (name, score) pairs ordered by importance.
    sorted_features: List[Tuple[str, float]] = Field(
        description="按重要性排序的特征",
    )
    # Names of the most important features.
    top_features: List[str] = Field(
        description="最重要的特征",
    )