aigroup-econ-mcp 1.4.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PKG-INFO +344 -322
- README.md +335 -320
- __init__.py +1 -1
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- cli.py +4 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics/未开发大类优先级分析.md +544 -0
- pyproject.toml +9 -2
- server.py +15 -1
- tools/__init__.py +75 -1
- tools/causal_inference_adapter.py +658 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -1
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tools_registry.py +13 -3
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/nonparametric_adapter.py +190 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- aigroup_econ_mcp-1.4.3.dist-info/METADATA +0 -710
- aigroup_econ_mcp-1.4.3.dist-info/RECORD +0 -92
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/licenses/LICENSE +0 -0
econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py
ADDED
@@ -0,0 +1,282 @@
```python
"""
Limited dependent variable models module
Implemented on top of existing libraries such as statsmodels
"""

import numpy as np
import pandas as pd
from scipy import stats
try:
    import statsmodels.api as sm
    from statsmodels.regression.linear_model import OLS
    HAS_STATSMODELS = True
except ImportError:
    HAS_STATSMODELS = False
    OLS = None

try:
    from statsmodels.base.model import GenericLikelihoodModel
    HAS_GENERIC_MODEL = True
except ImportError:
    HAS_GENERIC_MODEL = False


class _PlaceholderModel:
    def __init__(self, *args, **kwargs):
        if not HAS_STATSMODELS:
            raise ImportError("statsmodels is required: pip install statsmodels")

    def fit(self, *args, **kwargs):
        pass


class TobitModel:
    """
    Tobit model (censored regression model)
    statsmodels has no built-in Tobit model, so this is an implementation
    based on GenericLikelihoodModel
    """

    def __init__(self, lower_bound=0, upper_bound=None):
        """
        Initialize the Tobit model

        Parameters:
            lower_bound: lower censoring threshold, default 0
            upper_bound: upper censoring threshold, default None (no upper bound)
        """
        if not HAS_STATSMODELS:
            raise ImportError("statsmodels is required: pip install statsmodels")

        if not HAS_GENERIC_MODEL:
            raise ImportError("statsmodels GenericLikelihoodModel support is required")

        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.model_ = None
        self.results_ = None
        self.fitted_ = False

    def fit(self, X, y):
        """Fit the Tobit model"""
        X = np.array(X)
        y = np.array(y)

        # Add a constant term
        X_with_const = sm.add_constant(X)

        # Define the Tobit likelihood
        class TobitLikelihoodModel(GenericLikelihoodModel):
            def __init__(self, endog, exog, lower_bound=0, upper_bound=None, **kwds):
                self.lower_bound = lower_bound
                self.upper_bound = upper_bound
                super(TobitLikelihoodModel, self).__init__(endog, exog, **kwds)

            def loglikeobs(self, params):
                # Split coefficients and sigma
                beta = params[:-1]
                sigma = params[-1]

                if sigma <= 0:
                    return np.full_like(self.endog, -np.inf)

                # Linear prediction
                xb = np.dot(self.exog, beta)
                z = (self.endog - xb) / sigma

                # Log-likelihood
                if self.upper_bound is None:
                    # Lower censoring only
                    censored = self.endog <= self.lower_bound
                    uncensored = ~censored

                    ll = np.zeros_like(self.endog)
                    # Log-likelihood of censored observations
                    ll[censored] = stats.norm.logcdf((self.lower_bound - xb[censored]) / sigma)
                    # Log-likelihood of uncensored observations
                    ll[uncensored] = -0.5 * np.log(2 * np.pi * sigma**2) - 0.5 * z[uncensored]**2
                else:
                    # Censoring on both sides
                    left_censored = self.endog <= self.lower_bound
                    right_censored = self.endog >= self.upper_bound
                    uncensored = ~(left_censored | right_censored)

                    ll = np.zeros_like(self.endog)
                    # Log-likelihood of left-censored observations
                    ll[left_censored] = stats.norm.logcdf((self.lower_bound - xb[left_censored]) / sigma)
                    # Log-likelihood of right-censored observations
                    ll[right_censored] = stats.norm.logsf((self.upper_bound - xb[right_censored]) / sigma)
                    # Log-likelihood of uncensored observations
                    ll[uncensored] = -0.5 * np.log(2 * np.pi * sigma**2) - 0.5 * z[uncensored]**2

                return ll

        # Create and fit the model
        self.model_ = TobitLikelihoodModel(
            endog=y,
            exog=X_with_const,
            lower_bound=self.lower_bound,
            upper_bound=self.upper_bound
        )

        # Initial parameters
        n_features = X_with_const.shape[1]
        initial_params = np.concatenate([
            np.zeros(n_features),  # beta
            [np.std(y[y > self.lower_bound]) if self.upper_bound is None else np.std(y)]  # sigma
        ])

        self.results_ = self.model_.fit(start_params=initial_params, method='bfgs', disp=0)
        self.fitted_ = True
        return self

    def predict(self, X):
        """Predict the expected value"""
        if not self.fitted_:
            raise ValueError("Model has not been fitted yet")
        X = np.array(X)
        X_with_const = sm.add_constant(X)

        # Compute predictions manually
        beta = self.results_.params[:-1]  # exclude the sigma parameter
        sigma = self.results_.params[-1]

        xb = np.dot(X_with_const, beta)

        if self.upper_bound is None:
            # Lower censoring only
            z = (self.lower_bound - xb) / sigma
            lambda_val = stats.norm.pdf(z) / np.clip(1 - stats.norm.cdf(z), 1e-10, 1)
            return xb + sigma * lambda_val
        else:
            # Censoring on both sides
            z_lower = (self.lower_bound - xb) / sigma
            z_upper = (self.upper_bound - xb) / sigma

            lambda_lower = stats.norm.pdf(z_lower) / np.clip(stats.norm.cdf(z_upper) - stats.norm.cdf(z_lower), 1e-10, 1)
            lambda_upper = stats.norm.pdf(z_upper) / np.clip(stats.norm.cdf(z_upper) - stats.norm.cdf(z_lower), 1e-10, 1)

            return xb + sigma * (lambda_lower - lambda_upper)

    def predict_linear(self, X):
        """Predict the linear index"""
        if not self.fitted_:
            raise ValueError("Model has not been fitted yet")
        X = np.array(X)
        X_with_const = sm.add_constant(X)
        xb = np.dot(X_with_const, self.results_.params[:-1])  # exclude the sigma parameter
        return xb

    def summary(self):
        """Return the model summary"""
        if not self.fitted_:
            raise ValueError("Model has not been fitted yet")
        return self.results_.summary()


class HeckmanModel:
    """
    Heckman two-stage selection model (implemented with statsmodels)
    """

    def __init__(self):
        self.selection_model_ = None
        self.selection_results_ = None
        self.outcome_model_ = None
        self.outcome_results_ = None
        self.fitted_ = False

    def fit(self, X_select, Z, y, s):
        """
        Fit the Heckman model

        Parameters:
            X_select: regressor matrix for the selection equation
            Z: regressor matrix for the outcome equation
            y: outcome vector (observed only for selected samples)
            s: selection indicator vector (1 = selected, 0 = not selected)
        """
        if not HAS_STATSMODELS:
            raise ImportError("statsmodels is required: pip install statsmodels")

        X_select = np.array(X_select)
        Z = np.array(Z)
        y = np.array(y)
        s = np.array(s)

        # Stage 1: probit estimation of the selection equation
        X_select_with_const = sm.add_constant(X_select)
        self.selection_model_ = sm.Probit(s, X_select_with_const)
        self.selection_results_ = self.selection_model_.fit(disp=0)

        # Compute the inverse Mills ratio
        X_select_linpred = np.dot(X_select_with_const, self.selection_results_.params)
        mills_ratio = stats.norm.pdf(X_select_linpred) / np.clip(stats.norm.cdf(X_select_linpred), 1e-10, 1-1e-10)
        # The Mills ratio is zero for unselected samples
        mills_ratio = mills_ratio * s

        # Stage 2: OLS on the outcome equation augmented with the inverse Mills ratio
        Z_with_mills = np.column_stack([Z, mills_ratio])
        Z_with_mills_const = sm.add_constant(Z_with_mills)

        # Regress on the selected samples only
        selected_mask = s == 1
        Z_selected = Z_with_mills_const[selected_mask]
        y_selected = y[selected_mask]

        self.outcome_model_ = OLS(y_selected, Z_selected)
        self.outcome_results_ = self.outcome_model_.fit()

        self.fitted_ = True
        return self

    def predict(self, X_select, Z):
        """Predict outcome values"""
        if not self.fitted_:
            raise ValueError("Model has not been fitted yet")

        X_select = np.array(X_select)
        Z = np.array(Z)

        # Add a constant term
        X_select_with_const = sm.add_constant(X_select)

        # Compute the inverse Mills ratio
        X_select_linpred = np.dot(X_select_with_const, self.selection_results_.params)
        mills_ratio = stats.norm.pdf(X_select_linpred) / np.clip(stats.norm.cdf(X_select_linpred), 1e-10, 1-1e-10)

        # Build the prediction matrix: Z + inverse Mills ratio + constant
        Z_with_mills = np.column_stack([Z, mills_ratio])
        Z_with_mills_const = sm.add_constant(Z_with_mills)

        # Outcome-equation predictions
        outcome_pred = self.outcome_results_.predict(Z_with_mills_const)

        return outcome_pred

    def summary(self):
        """Return the model summaries"""
        if not self.fitted_:
            raise ValueError("Model has not been fitted yet")
        return {
            'selection_summary': self.selection_results_.summary(),
            'outcome_summary': self.outcome_results_.summary()
        }


# Fall back to placeholder classes when statsmodels is unavailable
if not HAS_STATSMODELS:
    TobitModel = _PlaceholderModel
    HeckmanModel = _PlaceholderModel

def multinomial_logit():
    """
    Placeholder for a multinomial logit model
    """
    pass


def nested_logit():
    """
    Placeholder for a nested logit model
    """
    pass
```
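A minimal usage sketch for the two estimators above, assuming statsmodels is installed. The simulated data, coefficients, and censoring threshold are illustrative assumptions, not part of the package:

```python
import numpy as np
from econometrics.specific_data_modeling.micro_discrete_limited_data.limited_dependent_variable_models import (
    TobitModel,
    HeckmanModel,
)

rng = np.random.default_rng(0)

# Tobit: a latent linear outcome left-censored at 0 (hypothetical data).
X = rng.normal(size=(500, 2))
y_latent = 1.0 + 0.5 * X[:, 0] - 0.8 * X[:, 1] + rng.normal(size=500)
y = np.maximum(y_latent, 0.0)
tobit = TobitModel(lower_bound=0).fit(X, y)
print(tobit.predict(X[:5]))         # expected outcome under censoring
print(tobit.predict_linear(X[:5]))  # linear index x'beta

# Heckman: the outcome is observed only where the selection indicator s == 1.
X_sel = rng.normal(size=(500, 2))
s = (0.5 + X_sel[:, 0] + rng.normal(size=500) > 0).astype(int)
Z = rng.normal(size=(500, 1))
y_out = 2.0 + 1.5 * Z[:, 0] + rng.normal(size=500)
heckman = HeckmanModel().fit(X_sel, Z, y_out, s)
print(heckman.predict(X_sel[:5], Z[:5]))
```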
econometrics/statistical_inference/__init__.py
ADDED
@@ -0,0 +1,21 @@
```python
"""
Statistical inference techniques module
Provides resampling, simulation, and asymptotic inference methods
"""

from .bootstrap_methods import (
    bootstrap_inference,
    BootstrapResult
)

from .permutation_test import (
    permutation_test,
    PermutationTestResult
)

__all__ = [
    'bootstrap_inference',
    'BootstrapResult',
    'permutation_test',
    'PermutationTestResult'
]
```
econometrics/statistical_inference/bootstrap_methods.py
ADDED
@@ -0,0 +1,162 @@
```python
"""
Bootstrap resampling inference methods
Implements several bootstrap methods on top of scipy.stats
"""

from typing import List, Optional, Callable, Tuple, Dict
from pydantic import BaseModel, Field
import numpy as np

try:
    from scipy import stats
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
    stats = None


class BootstrapResult(BaseModel):
    """Bootstrap inference result"""
    statistic: float = Field(..., description="Point estimate of the statistic")
    bootstrap_mean: float = Field(..., description="Bootstrap mean")
    bootstrap_std: float = Field(..., description="Bootstrap standard error")
    confidence_interval: Tuple[float, float] = Field(..., description="Confidence interval")
    bias: float = Field(..., description="Bias estimate")
    confidence_level: float = Field(..., description="Confidence level")
    n_bootstrap: int = Field(..., description="Number of bootstrap resamples")
    method: str = Field(..., description="Bootstrap method")
    bootstrap_distribution: List[float] = Field(..., description="Bootstrap statistic distribution (first 100 values)")
    summary: str = Field(..., description="Summary text")


def bootstrap_inference(
    data: List[float],
    statistic_func: Optional[str] = "mean",
    n_bootstrap: int = 1000,
    confidence_level: float = 0.95,
    method: str = "percentile",
    random_state: Optional[int] = None
) -> BootstrapResult:
    """
    Bootstrap confidence interval estimation

    Args:
        data: sample data
        statistic_func: statistic to compute - "mean", "median",
                        "std" (standard deviation), "var" (variance),
                        or a custom callable
        n_bootstrap: number of bootstrap resamples
        confidence_level: confidence level
        method: confidence interval method - "percentile", "normal"
                (normal approximation), "basic"
        random_state: random seed

    Returns:
        BootstrapResult: bootstrap inference result

    Raises:
        ImportError: scipy is not installed
        ValueError: invalid input data
    """
    if not SCIPY_AVAILABLE:
        raise ImportError("scipy is not installed. Run: pip install scipy")

    # Input validation
    if not data:
        raise ValueError("data must not be empty")

    # Data preparation
    data_arr = np.array(data, dtype=np.float64)
    n = len(data_arr)

    # Set the random seed
    if random_state is not None:
        np.random.seed(random_state)

    # Choose the statistic function
    if statistic_func == "mean":
        stat_fn = np.mean
    elif statistic_func == "median":
        stat_fn = np.median
    elif statistic_func == "std":
        stat_fn = lambda x: np.std(x, ddof=1)
    elif statistic_func == "var":
        stat_fn = lambda x: np.var(x, ddof=1)
    elif callable(statistic_func):
        stat_fn = statistic_func
    else:
        raise ValueError(f"Unsupported statistic: {statistic_func}")

    # Statistic on the original sample
    original_stat = float(stat_fn(data_arr))

    # Bootstrap resampling
    bootstrap_stats = []
    for _ in range(n_bootstrap):
        # Sample with replacement
        bootstrap_sample = np.random.choice(data_arr, size=n, replace=True)
        bootstrap_stat = stat_fn(bootstrap_sample)
        bootstrap_stats.append(bootstrap_stat)

    bootstrap_stats = np.array(bootstrap_stats)

    # Bootstrap summary statistics
    bootstrap_mean = float(bootstrap_stats.mean())
    bootstrap_std = float(bootstrap_stats.std(ddof=1))
    bias = bootstrap_mean - original_stat

    # Confidence interval
    alpha = 1 - confidence_level

    if method == "percentile":
        # Percentile method
        lower_percentile = alpha / 2 * 100
        upper_percentile = (1 - alpha / 2) * 100
        ci_lower = float(np.percentile(bootstrap_stats, lower_percentile))
        ci_upper = float(np.percentile(bootstrap_stats, upper_percentile))
    elif method == "normal":
        # Normal approximation
        z_score = stats.norm.ppf(1 - alpha / 2)
        ci_lower = original_stat - z_score * bootstrap_std
        ci_upper = original_stat + z_score * bootstrap_std
    elif method == "basic":
        # Basic bootstrap
        lower_percentile = alpha / 2 * 100
        upper_percentile = (1 - alpha / 2) * 100
        ci_lower = 2 * original_stat - float(np.percentile(bootstrap_stats, upper_percentile))
        ci_upper = 2 * original_stat - float(np.percentile(bootstrap_stats, lower_percentile))
    else:
        raise ValueError(f"Unsupported confidence interval method: {method}")

    # Keep the first 100 bootstrap statistics (for display)
    bootstrap_dist_sample = bootstrap_stats[:min(100, len(bootstrap_stats))].tolist()

    # Build the summary
    summary = f"""Bootstrap inference:
- Sample size: {n}
- Bootstrap resamples: {n_bootstrap}
- Statistic: {statistic_func}
- Confidence interval method: {method}

Estimates:
- Statistic estimate: {original_stat:.4f}
- Bootstrap mean: {bootstrap_mean:.4f}
- Bootstrap standard error: {bootstrap_std:.4f}
- Bias: {bias:.4f}

{int(confidence_level*100)}% confidence interval:
- Lower bound: {ci_lower:.4f}
- Upper bound: {ci_upper:.4f}
- Interval width: {ci_upper - ci_lower:.4f}
"""

    return BootstrapResult(
        statistic=original_stat,
        bootstrap_mean=bootstrap_mean,
        bootstrap_std=bootstrap_std,
        confidence_interval=(ci_lower, ci_upper),
        bias=bias,
        confidence_level=confidence_level,
        n_bootstrap=n_bootstrap,
        method=method,
        bootstrap_distribution=bootstrap_dist_sample,
        summary=summary
    )
```
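A short usage sketch for `bootstrap_inference` as defined above; the sample values are made up for illustration:

```python
from econometrics.statistical_inference import bootstrap_inference

data = [12.1, 9.8, 11.4, 10.2, 13.0, 9.5, 10.9, 11.7, 12.3, 10.5]  # hypothetical sample

result = bootstrap_inference(
    data,
    statistic_func="median",
    n_bootstrap=5000,
    confidence_level=0.95,
    method="percentile",
    random_state=0,
)
print(result.confidence_interval)
print(result.summary)
```

With `method="basic"` the same percentiles are reflected around the point estimate, so the two choices differ whenever the bootstrap distribution is skewed.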
econometrics/statistical_inference/permutation_test.py
ADDED
@@ -0,0 +1,177 @@
```python
"""
Permutation test
Nonparametric hypothesis testing method
Implemented on top of scipy.stats
"""

from typing import List, Optional
from pydantic import BaseModel, Field
import numpy as np

try:
    from scipy import stats
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
    stats = None


class PermutationTestResult(BaseModel):
    """Permutation test result"""
    statistic: float = Field(..., description="Observed statistic")
    p_value: float = Field(..., description="P-value")
    null_distribution_mean: float = Field(..., description="Mean of the null distribution")
    null_distribution_std: float = Field(..., description="Standard deviation of the null distribution")
    n_permutations: int = Field(..., description="Number of permutations")
    alternative: str = Field(..., description="Alternative hypothesis")
    test_type: str = Field(..., description="Test type")
    n_sample_a: int = Field(..., description="Size of sample A")
    n_sample_b: int = Field(..., description="Size of sample B")
    permutation_distribution: List[float] = Field(..., description="Permutation distribution (first 100 values)")
    summary: str = Field(..., description="Summary text")


def permutation_test(
    sample_a: List[float],
    sample_b: List[float],
    test_type: str = "mean_difference",
    alternative: str = "two-sided",
    n_permutations: int = 10000,
    random_state: Optional[int] = None
) -> PermutationTestResult:
    """
    Two-sample permutation test

    Args:
        sample_a: sample A
        sample_b: sample B
        test_type: test type - "mean_difference",
                   "median_difference",
                   "variance_ratio"
        alternative: alternative hypothesis - "two-sided", "less", "greater"
        n_permutations: number of permutations
        random_state: random seed

    Returns:
        PermutationTestResult: permutation test result

    Raises:
        ImportError: scipy is not installed
        ValueError: invalid input data
    """
    if not SCIPY_AVAILABLE:
        raise ImportError("scipy is not installed. Run: pip install scipy")

    # Input validation
    if not sample_a or not sample_b:
        raise ValueError("Neither sample may be empty")

    # Data preparation
    a = np.array(sample_a, dtype=np.float64)
    b = np.array(sample_b, dtype=np.float64)

    n_a = len(a)
    n_b = len(b)

    # Set the random seed
    if random_state is not None:
        np.random.seed(random_state)

    # Pool the data
    combined = np.concatenate([a, b])
    n_total = len(combined)

    # Choose the statistic function
    if test_type == "mean_difference":
        def stat_func(x, y):
            return np.mean(x) - np.mean(y)
    elif test_type == "median_difference":
        def stat_func(x, y):
            return np.median(x) - np.median(y)
    elif test_type == "variance_ratio":
        def stat_func(x, y):
            return np.var(x, ddof=1) / np.var(y, ddof=1) if np.var(y, ddof=1) > 0 else 0
    else:
        raise ValueError(f"Unsupported test type: {test_type}")

    # Observed statistic
    observed_stat = stat_func(a, b)

    # Run the permutations
    perm_stats = []
    for _ in range(n_permutations):
        # Randomly permute the pooled data and re-split it
        perm = np.random.permutation(combined)
        perm_a = perm[:n_a]
        perm_b = perm[n_a:]
        perm_stat = stat_func(perm_a, perm_b)
        perm_stats.append(perm_stat)

    perm_stats = np.array(perm_stats)

    # P-value
    if alternative == "two-sided":
        p_value = np.mean(np.abs(perm_stats) >= np.abs(observed_stat))
    elif alternative == "greater":
        p_value = np.mean(perm_stats >= observed_stat)
    elif alternative == "less":
        p_value = np.mean(perm_stats <= observed_stat)
    else:
        raise ValueError(f"Unsupported alternative hypothesis: {alternative}")

    # Summary statistics of the null distribution
    null_mean = float(perm_stats.mean())
    null_std = float(perm_stats.std(ddof=1))

    # Keep the first 100 permutation statistics
    perm_dist_sample = perm_stats[:min(100, len(perm_stats))].tolist()

    # Classify significance
    if p_value < 0.01:
        significance = "highly significant"
    elif p_value < 0.05:
        significance = "significant"
    elif p_value < 0.10:
        significance = "marginally significant"
    else:
        significance = "not significant"

    # Build the summary
    test_names = {
        "mean_difference": "mean difference",
        "median_difference": "median difference",
        "variance_ratio": "variance ratio"
    }

    summary = f"""Permutation test:
- Test type: {test_names.get(test_type, test_type)}
- Alternative hypothesis: {alternative}
- Permutations: {n_permutations}

Sample information:
- Sample A: n={n_a}, mean={a.mean():.4f}
- Sample B: n={n_b}, mean={b.mean():.4f}

Test results:
- Observed statistic: {observed_stat:.4f}
- P-value: {p_value:.4f}
- Significance: {significance}

Null distribution:
- Mean: {null_mean:.4f}
- Standard deviation: {null_std:.4f}
"""

    return PermutationTestResult(
        statistic=float(observed_stat),
        p_value=float(p_value),
        null_distribution_mean=null_mean,
        null_distribution_std=null_std,
        n_permutations=n_permutations,
        alternative=alternative,
        test_type=test_type,
        n_sample_a=n_a,
        n_sample_b=n_b,
        permutation_distribution=perm_dist_sample,
        summary=summary
    )
```
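A matching sketch for `permutation_test`; the two samples are invented for illustration:

```python
from econometrics.statistical_inference import permutation_test

treatment = [5.2, 6.1, 5.8, 6.4, 5.9, 6.3]  # hypothetical sample A
control = [4.8, 5.0, 5.3, 4.9, 5.1, 5.4]    # hypothetical sample B

result = permutation_test(
    treatment,
    control,
    test_type="mean_difference",
    alternative="greater",
    n_permutations=10000,
    random_state=42,
)
print(result.statistic, result.p_value)
print(result.summary)
```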
econometrics/survival_analysis/__init__.py
ADDED
@@ -0,0 +1,18 @@
```python
"""
Survival analysis module
Analyzes time-to-event data
"""

from .survival_models import (
    kaplan_meier_estimation_simple,
    cox_regression_simple,
    KaplanMeierResult,
    CoxRegressionResult
)

__all__ = [
    'kaplan_meier_estimation_simple',
    'cox_regression_simple',
    'KaplanMeierResult',
    'CoxRegressionResult'
]
```