aigroup-econ-mcp 1.4.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PKG-INFO +344 -322
- README.md +335 -320
- __init__.py +1 -1
- aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
- aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
- cli.py +4 -0
- econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
- econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
- econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
- econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
- econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
- econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
- econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
- econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
- econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
- econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
- econometrics/causal_inference/__init__.py +66 -0
- econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
- econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
- econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
- econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
- econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
- econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
- econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
- econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
- econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
- econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
- econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
- econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
- econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
- econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
- econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
- econometrics/distribution_analysis/__init__.py +28 -0
- econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
- econometrics/distribution_analysis/time_series_decomposition.py +152 -0
- econometrics/distribution_analysis/variance_decomposition.py +179 -0
- econometrics/missing_data/__init__.py +18 -0
- econometrics/missing_data/imputation_methods.py +219 -0
- econometrics/nonparametric/__init__.py +35 -0
- econometrics/nonparametric/gam_model.py +117 -0
- econometrics/nonparametric/kernel_regression.py +161 -0
- econometrics/nonparametric/quantile_regression.py +249 -0
- econometrics/nonparametric/spline_regression.py +100 -0
- econometrics/spatial_econometrics/__init__.py +68 -0
- econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
- econometrics/spatial_econometrics/gwr_simple.py +154 -0
- econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
- econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
- econometrics/spatial_econometrics/spatial_regression.py +315 -0
- econometrics/spatial_econometrics/spatial_weights.py +226 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
- econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
- econometrics/statistical_inference/__init__.py +21 -0
- econometrics/statistical_inference/bootstrap_methods.py +162 -0
- econometrics/statistical_inference/permutation_test.py +177 -0
- econometrics/survival_analysis/__init__.py +18 -0
- econometrics/survival_analysis/survival_models.py +259 -0
- econometrics/tests/causal_inference_tests/__init__.py +3 -0
- econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
- econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
- econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
- econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
- econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
- econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
- econometrics//321/206/320/254/320/272/321/205/342/225/235/320/220/321/205/320/237/320/241/321/205/320/264/320/267/321/207/342/226/222/342/225/227/321/204/342/225/235/320/250/321/205/320/225/320/230/321/207/342/225/221/320/267/321/205/320/230/320/226/321/206/320/256/320/240.md +544 -0
- pyproject.toml +9 -2
- server.py +15 -1
- tools/__init__.py +75 -1
- tools/causal_inference_adapter.py +658 -0
- tools/distribution_analysis_adapter.py +121 -0
- tools/gwr_simple_adapter.py +54 -0
- tools/machine_learning_adapter.py +567 -0
- tools/mcp_tool_groups/__init__.py +15 -1
- tools/mcp_tool_groups/causal_inference_tools.py +643 -0
- tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
- tools/mcp_tool_groups/machine_learning_tools.py +422 -0
- tools/mcp_tool_groups/microecon_tools.py +325 -0
- tools/mcp_tool_groups/missing_data_tools.py +117 -0
- tools/mcp_tool_groups/nonparametric_tools.py +225 -0
- tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
- tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
- tools/mcp_tools_registry.py +13 -3
- tools/microecon_adapter.py +412 -0
- tools/missing_data_adapter.py +73 -0
- tools/nonparametric_adapter.py +190 -0
- tools/spatial_econometrics_adapter.py +318 -0
- tools/statistical_inference_adapter.py +90 -0
- tools/survival_analysis_adapter.py +46 -0
- aigroup_econ_mcp-1.4.3.dist-info/METADATA +0 -710
- aigroup_econ_mcp-1.4.3.dist-info/RECORD +0 -92
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/entry_points.txt +0 -0
- {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/licenses/LICENSE +0 -0
econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py
@@ -0,0 +1,98 @@
"""
Regression discontinuity design (RDD) implementation
"""

from typing import List, Optional
import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
from scipy import stats
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures


class RDDResult(BaseModel):
    """Regression discontinuity design result"""
    method: str = Field(default="Regression Discontinuity Design", description="Causal identification method used")
    estimate: float = Field(..., description="Estimated causal effect")
    std_error: float = Field(..., description="Standard error")
    t_statistic: float = Field(..., description="t-statistic")
    p_value: float = Field(..., description="p-value")
    confidence_interval: List[float] = Field(..., description="Confidence interval")
    n_observations: int = Field(..., description="Number of observations")
    bandwidth: Optional[float] = Field(None, description="Bandwidth used")
    polynomial_order: Optional[int] = Field(None, description="Polynomial order")
    discontinuity_location: Optional[float] = Field(None, description="Cutoff location")


def regression_discontinuity(
    running_variable: List[float],
    outcome: List[float],
    cutoff: float,
    bandwidth: Optional[float] = None,
    polynomial_order: int = 1
) -> RDDResult:
    """
    Regression discontinuity design (RDD)

    Estimates the treatment effect at the cutoff via an OLS regression implemented with statsmodels.

    Args:
        running_variable: Running variable
        outcome: Outcome variable
        cutoff: Cutoff value
        bandwidth: Bandwidth
        polynomial_order: Polynomial order

    Returns:
        RDDResult: Regression discontinuity design result
    """
    # Convert to numpy arrays
    running_array = np.array(running_variable)
    outcome_array = np.array(outcome)

    # If no bandwidth is given, fall back to a default
    if bandwidth is None:
        # Use one quarter of the running variable's standard deviation as the default bandwidth
        bandwidth = 0.25 * np.std(running_array)

    # Keep only observations within the bandwidth
    mask = np.abs(running_array - cutoff) <= bandwidth
    running_local = running_array[mask]
    outcome_local = outcome_array[mask]

    # Construct the treatment indicator (running variable at or above the cutoff)
    treatment = (running_local >= cutoff).astype(int)

    # Construct the polynomial terms
    poly = PolynomialFeatures(degree=polynomial_order, include_bias=False)
    running_poly = poly.fit_transform(running_local.reshape(-1, 1))

    # Build the design matrix
    X = np.column_stack([np.ones(len(running_local)), treatment, running_poly])

    # OLS regression via statsmodels
    model = sm.OLS(outcome_local, X)
    results = model.fit()

    # Extract the treatment effect (coefficient on the treatment indicator)
    coef = results.params[1]  # the treatment variable is the second column (index 1)
    stderr = results.bse[1]
    tstat = results.tvalues[1]
    pval = results.pvalues[1]

    # Compute the confidence interval
    ci_lower = coef - 1.96 * stderr
    ci_upper = coef + 1.96 * stderr

    return RDDResult(
        estimate=float(coef),
        std_error=float(stderr),
        t_statistic=float(tstat),
        p_value=float(pval),
        confidence_interval=[float(ci_lower), float(ci_upper)],
        n_observations=len(running_local),
        bandwidth=bandwidth,
        polynomial_order=polynomial_order,
        discontinuity_location=cutoff
    )
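A minimal usage sketch of the regression_discontinuity tool above (not part of the package diff): the simulated data, the cutoff at zero, and the import path inferred from the file layout in this diff are illustrative assumptions.

# Hypothetical usage sketch; the simulated data are for illustration only.
import numpy as np
from econometrics.causal_inference.causal_identification_strategy.regression_discontinuity import (
    regression_discontinuity,
)

rng = np.random.default_rng(42)
running = rng.uniform(-1.0, 1.0, size=500)                     # running variable, cutoff at 0
outcome = 1.0 + 2.0 * running + 0.8 * (running >= 0) + rng.normal(0, 0.5, size=500)

result = regression_discontinuity(
    running_variable=running.tolist(),
    outcome=outcome.tolist(),
    cutoff=0.0,
    bandwidth=0.5,          # only observations within +/-0.5 of the cutoff are used
    polynomial_order=1,
)
print(result.estimate, result.confidence_interval, result.n_observations)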
econometrics/causal_inference/causal_identification_strategy/synthetic_control.py
@@ -0,0 +1,111 @@
"""
Synthetic control method implementation
"""

from typing import List, Optional, Dict
import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression


class SyntheticControlResult(BaseModel):
    """Synthetic control method result"""
    method: str = Field(default="Synthetic Control Method", description="Causal identification method used")
    treatment_effect: float = Field(..., description="Estimated treatment effect")
    synthetic_weights: List[float] = Field(..., description="Synthetic control weights")
    n_observations: int = Field(..., description="Number of observations")
    donor_units: List[str] = Field(..., description="List of donor units")
    pre_treatment_fit: Dict[str, float] = Field(..., description="Pre-treatment fit measures")


def synthetic_control_method(
    outcome: List[float],
    treatment_period: int,
    treated_unit: str,
    donor_units: List[str],
    time_periods: List[str]
) -> SyntheticControlResult:
    """
    Synthetic control method

    Estimates the treatment effect by constructing a "synthetic" control unit as a weighted combination of untreated donor units.

    Args:
        outcome: Outcome variable (time series for all units)
        treatment_period: Period in which treatment starts
        treated_unit: Name of the treated unit
        donor_units: Names of the donor units
        time_periods: List of time periods

    Returns:
        SyntheticControlResult: Synthetic control method result
    """
    # Data are assumed to be stacked by unit, each unit's time series stored contiguously
    n_units = len(donor_units) + 1  # including the treated unit
    n_time = len(time_periods)

    if len(outcome) != n_units * n_time:
        raise ValueError("Length of the outcome variable must equal the number of units times the number of periods")

    # Reshape the data into a (unit, time) matrix
    outcome_matrix = np.array(outcome).reshape(n_units, n_time)

    # Index of the treated unit
    treated_idx = 0  # the treated unit is assumed to come first

    # Extract the pre-treatment data
    pre_treatment_periods = treatment_period
    treated_pre = outcome_matrix[treated_idx, :pre_treatment_periods]
    donors_pre = outcome_matrix[1:, :pre_treatment_periods]  # exclude the treated unit

    # Optimization objective (minimize the mean squared prediction error)
    def objective(weights):
        synthetic = donors_pre.T @ weights
        mse = np.mean((treated_pre - synthetic) ** 2)
        return mse

    # Constraints: weights are non-negative and sum to one
    constraints = [
        {"type": "eq", "fun": lambda w: np.sum(w) - 1}  # weights sum to 1
    ]
    bounds = [(0, 1) for _ in range(len(donor_units))]  # each weight lies in [0, 1]

    # Initial weights
    initial_weights = np.ones(len(donor_units)) / len(donor_units)

    # Solve the optimization problem
    result = minimize(objective, initial_weights, method='SLSQP',
                      bounds=bounds, constraints=constraints)

    optimal_weights = result.x

    # Compute the synthetic control outcome - fixes the matrix multiplication dimensions
    # outcome_matrix[1:] has shape (n_units-1, n_time)
    # optimal_weights has shape (n_units-1,)
    # we need (n_time, n_units-1) @ (n_units-1,) = (n_time,)
    synthetic_control = outcome_matrix[1:].T @ optimal_weights  # all periods

    # Treatment effect (post-treatment periods)
    post_treatment_outcome = outcome_matrix[treated_idx, treatment_period:]
    post_treatment_synthetic = synthetic_control[treatment_period:]
    treatment_effect = np.mean(post_treatment_outcome - post_treatment_synthetic)

    # Pre-treatment fit measures
    pre_treatment_synthetic = synthetic_control[:treatment_period]
    pre_treatment_r2 = 1 - np.sum((treated_pre - pre_treatment_synthetic) ** 2) / \
                       np.sum((treated_pre - np.mean(treated_pre)) ** 2)

    pre_treatment_fit = {
        "R-squared": float(pre_treatment_r2),
        "RMSE": float(np.sqrt(np.mean((treated_pre - pre_treatment_synthetic) ** 2)))
    }

    return SyntheticControlResult(
        treatment_effect=float(treatment_effect),
        synthetic_weights=optimal_weights.tolist(),
        n_observations=len(outcome),
        donor_units=donor_units,
        pre_treatment_fit=pre_treatment_fit
    )
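A minimal usage sketch of synthetic_control_method (not part of the package diff). The data layout follows the reshape in the implementation: the treated unit's series comes first, followed by each donor's series, all of equal length; the numbers and import path are assumptions for illustration.

# Hypothetical usage sketch with one treated unit and two donors over 10 periods.
from econometrics.causal_inference.causal_identification_strategy.synthetic_control import (
    synthetic_control_method,
)

time_periods = [str(year) for year in range(2000, 2010)]          # 10 periods
treated = [1.0, 1.1, 1.2, 1.3, 1.4, 2.5, 2.6, 2.7, 2.8, 2.9]      # jump after period 5
donor_1 = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]
donor_2 = [0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8]

result = synthetic_control_method(
    outcome=treated + donor_1 + donor_2,    # (n_units x n_time) flattened row by row, treated unit first
    treatment_period=5,                     # treatment starts at the 6th period (index 5)
    treated_unit="unit_0",
    donor_units=["unit_1", "unit_2"],
    time_periods=time_periods,
)
print(result.treatment_effect, result.synthetic_weights, result.pre_treatment_fit)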
econometrics/causal_inference/causal_identification_strategy/triple_difference.py
@@ -0,0 +1,86 @@
"""
Triple difference (DDD) implementation
"""

from typing import List, Optional
import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
import statsmodels.api as sm
from scipy import stats


class TripeDifferenceResult(BaseModel):
    """Triple difference result"""
    method: str = Field(default="Triple Difference", description="Causal identification method used")
    estimate: float = Field(..., description="Estimated causal effect")
    std_error: float = Field(..., description="Standard error")
    t_statistic: float = Field(..., description="t-statistic")
    p_value: float = Field(..., description="p-value")
    confidence_interval: List[float] = Field(..., description="Confidence interval")
    n_observations: int = Field(..., description="Number of observations")


def triple_difference(
    outcome: List[float],
    treatment_group: List[int],
    time_period: List[int],
    cohort_group: List[int]
) -> TripeDifferenceResult:
    """
    Triple difference (DDD)

    The triple difference estimator adds a third dimension (e.g. different cohort groups) to control
    for additional confounders, giving stronger causal identification than difference-in-differences.

    Args:
        outcome: Outcome variable
        treatment_group: Treatment group dummy (0/1)
        time_period: Time dummy (0/1)
        cohort_group: Cohort group dummy (0/1)

    Returns:
        TripeDifferenceResult: Triple difference result
    """
    # Assemble the data
    df = pd.DataFrame({
        'outcome': outcome,
        'treatment': treatment_group,
        'time': time_period,
        'cohort': cohort_group
    })

    # Construct the interaction terms
    df['treatment_time'] = df['treatment'] * df['time']
    df['treatment_cohort'] = df['treatment'] * df['cohort']
    df['time_cohort'] = df['time'] * df['cohort']
    df['treatment_time_cohort'] = df['treatment'] * df['time'] * df['cohort']

    # Build the regression design matrix
    X_vars = ['treatment', 'time', 'cohort', 'treatment_time', 'treatment_cohort', 'time_cohort', 'treatment_time_cohort']
    X = df[X_vars]
    X = sm.add_constant(X)  # add the constant term
    y = df['outcome']

    # OLS regression
    model = sm.OLS(y, X)
    results = model.fit()

    # Extract the DDD estimate (coefficient on the triple interaction)
    coef = results.params['treatment_time_cohort']
    stderr = results.bse['treatment_time_cohort']
    tstat = results.tvalues['treatment_time_cohort']
    pval = results.pvalues['treatment_time_cohort']

    # Compute the confidence interval
    ci_lower = coef - 1.96 * stderr
    ci_upper = coef + 1.96 * stderr

    return TripeDifferenceResult(
        estimate=float(coef),
        std_error=float(stderr),
        t_statistic=float(tstat),
        p_value=float(pval),
        confidence_interval=[float(ci_lower), float(ci_upper)],
        n_observations=len(df)
    )
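A minimal usage sketch of triple_difference (not part of the package diff). The toy data cover all eight treatment x time x cohort cells so the full interaction model is identified; the data-generating process and import path are assumptions for illustration.

# Hypothetical usage sketch; the true DDD effect of 1.5 sits on the triple interaction.
import itertools
import numpy as np
from econometrics.causal_inference.causal_identification_strategy.triple_difference import (
    triple_difference,
)

rng = np.random.default_rng(0)
cells = list(itertools.product([0, 1], [0, 1], [0, 1]))    # (treatment, time, cohort)
treatment, time, cohort, outcome = [], [], [], []
for d, t, g in cells * 5:                                   # 5 observations per cell
    treatment.append(d)
    time.append(t)
    cohort.append(g)
    outcome.append(0.5 * d + 0.3 * t + 0.2 * g + 1.5 * d * t * g + rng.normal(0, 0.1))

result = triple_difference(
    outcome=outcome,
    treatment_group=treatment,
    time_period=time,
    cohort_group=cohort,
)
print(result.estimate, result.p_value)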
econometrics/distribution_analysis/__init__.py
@@ -0,0 +1,28 @@
"""
Distribution analysis and decomposition methods module
Analyzes the conditional distribution of the dependent variable and performs various decompositions
"""

from .oaxaca_blinder import (
    oaxaca_blinder_decomposition,
    OaxacaResult
)

from .variance_decomposition import (
    variance_decomposition,
    VarianceDecompositionResult
)

from .time_series_decomposition import (
    time_series_decomposition,
    TimeSeriesDecompositionResult
)

__all__ = [
    'oaxaca_blinder_decomposition',
    'OaxacaResult',
    'variance_decomposition',
    'VarianceDecompositionResult',
    'time_series_decomposition',
    'TimeSeriesDecompositionResult'
]
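The __init__ module above re-exports the three decomposition entry points; a hypothetical import sketch, assuming econometrics is importable as a top-level package from the installed wheel:

# Hypothetical import of the public API exposed by distribution_analysis/__init__.py
from econometrics.distribution_analysis import (
    oaxaca_blinder_decomposition,
    variance_decomposition,
    time_series_decomposition,
)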
econometrics/distribution_analysis/oaxaca_blinder.py
@@ -0,0 +1,184 @@
"""
Oaxaca-Blinder decomposition
Decomposes the mean difference between two groups (e.g. a wage gap)
Based on statsmodels and a custom implementation
"""

from typing import List, Optional, Tuple
from pydantic import BaseModel, Field
import numpy as np

try:
    import statsmodels.api as sm
    STATSMODELS_AVAILABLE = True
except ImportError:
    STATSMODELS_AVAILABLE = False
    sm = None


class OaxacaResult(BaseModel):
    """Oaxaca-Blinder decomposition result"""
    total_difference: float = Field(..., description="Total difference")
    explained_part: float = Field(..., description="Explained part (endowment effect)")
    unexplained_part: float = Field(..., description="Unexplained part (coefficient effect)")
    explained_pct: float = Field(..., description="Explained share (percent)")
    unexplained_pct: float = Field(..., description="Unexplained share (percent)")
    group_a_mean: float = Field(..., description="Group A mean")
    group_b_mean: float = Field(..., description="Group B mean")
    group_a_coefficients: List[float] = Field(..., description="Group A regression coefficients")
    group_b_coefficients: List[float] = Field(..., description="Group B regression coefficients")
    detailed_explained: List[float] = Field(..., description="Explained part for each variable")
    detailed_unexplained: List[float] = Field(..., description="Unexplained part for each variable")
    feature_names: List[str] = Field(..., description="Feature names")
    n_obs_a: int = Field(..., description="Number of observations in group A")
    n_obs_b: int = Field(..., description="Number of observations in group B")
    summary: str = Field(..., description="Summary")


def oaxaca_blinder_decomposition(
    y_a: List[float],
    x_a: List[List[float]],
    y_b: List[float],
    x_b: List[List[float]],
    feature_names: Optional[List[str]] = None,
    weight_matrix: str = "pooled"
) -> OaxacaResult:
    """
    Oaxaca-Blinder decomposition
    Decomposes the mean difference between two groups into an explained and an unexplained part

    Args:
        y_a: Dependent variable for group A (e.g. male wages)
        x_a: Regressors for group A
        y_b: Dependent variable for group B (e.g. female wages)
        x_b: Regressors for group B
        feature_names: Feature names
        weight_matrix: Weighting scheme - "pooled" (pooled weights), "group_a", "group_b"

    Returns:
        OaxacaResult: Oaxaca-Blinder decomposition result

    Raises:
        ImportError: statsmodels is not installed
        ValueError: invalid input data
    """
    if not STATSMODELS_AVAILABLE:
        raise ImportError("statsmodels is not installed. Please run: pip install statsmodels")

    # Input validation
    if not y_a or not x_a or not y_b or not x_b:
        raise ValueError("None of the input data may be empty")

    # Prepare the data
    y_a_arr = np.array(y_a, dtype=np.float64)
    X_a_arr = np.array(x_a, dtype=np.float64)
    y_b_arr = np.array(y_b, dtype=np.float64)
    X_b_arr = np.array(x_b, dtype=np.float64)

    # Ensure X is two-dimensional
    if X_a_arr.ndim == 1:
        X_a_arr = X_a_arr.reshape(-1, 1)
    if X_b_arr.ndim == 1:
        X_b_arr = X_b_arr.reshape(-1, 1)

    n_a = len(y_a_arr)
    n_b = len(y_b_arr)
    k = X_a_arr.shape[1]

    # Data validation
    if X_a_arr.shape[1] != X_b_arr.shape[1]:
        raise ValueError("Both groups must have the same number of regressors")

    # Add the constant term
    X_a_const = sm.add_constant(X_a_arr)
    X_b_const = sm.add_constant(X_b_arr)

    # Feature names
    if feature_names is None:
        feature_names = [f"X{i+1}" for i in range(k)]
    all_feature_names = ["const"] + feature_names

    # Separate OLS regressions for the two groups
    model_a = sm.OLS(y_a_arr, X_a_const).fit()
    model_b = sm.OLS(y_b_arr, X_b_const).fit()

    # Extract the coefficients
    beta_a = model_a.params
    beta_b = model_b.params

    # Mean characteristics of the two groups
    X_a_mean = X_a_const.mean(axis=0)
    X_b_mean = X_b_const.mean(axis=0)

    # Mean dependent variable of the two groups
    y_a_mean = float(y_a_arr.mean())
    y_b_mean = float(y_b_arr.mean())

    # Total difference
    total_diff = y_a_mean - y_b_mean

    # Choose reference coefficients according to the weighting scheme
    if weight_matrix == "pooled":
        # Use the pooled regression coefficients as the reference
        y_pooled = np.concatenate([y_a_arr, y_b_arr])
        X_pooled = np.vstack([X_a_const, X_b_const])
        model_pooled = sm.OLS(y_pooled, X_pooled).fit()
        beta_ref = model_pooled.params
    elif weight_matrix == "group_a":
        beta_ref = beta_a
    elif weight_matrix == "group_b":
        beta_ref = beta_b
    else:
        raise ValueError(f"Unsupported weight matrix type: {weight_matrix}")

    # Oaxaca decomposition
    # Explained part (endowment effect): (X̄ₐ - X̄ᵦ)' β*
    explained = (X_a_mean - X_b_mean) @ beta_ref

    # Unexplained part (coefficient effect): X̄ₐ'(βₐ - β*) + X̄ᵦ'(β* - βᵦ)
    unexplained = X_a_mean @ (beta_a - beta_ref) + X_b_mean @ (beta_ref - beta_b)

    # Detailed decomposition (contribution of each variable)
    detailed_explained = ((X_a_mean - X_b_mean) * beta_ref).tolist()
    detailed_unexplained = (
        X_a_mean * (beta_a - beta_ref) + X_b_mean * (beta_ref - beta_b)
    ).tolist()

    # Percentages
    explained_pct = (explained / total_diff * 100) if total_diff != 0 else 0.0
    unexplained_pct = (unexplained / total_diff * 100) if total_diff != 0 else 0.0

    # Build the summary
    summary = f"""Oaxaca-Blinder decomposition:
- Total difference: {total_diff:.4f}
- Group A mean: {y_a_mean:.4f} (n={n_a})
- Group B mean: {y_b_mean:.4f} (n={n_b})

Decomposition results:
- Explained part (endowment effect): {explained:.4f} ({explained_pct:.1f}%)
- Unexplained part (coefficient effect): {unexplained:.4f} ({unexplained_pct:.1f}%)

Contribution of each variable:
"""
    for i, name in enumerate(all_feature_names):
        summary += f"  {name}:\n"
        summary += f"    - Endowment effect: {detailed_explained[i]:.4f}\n"
        summary += f"    - Coefficient effect: {detailed_unexplained[i]:.4f}\n"

    return OaxacaResult(
        total_difference=float(total_diff),
        explained_part=float(explained),
        unexplained_part=float(unexplained),
        explained_pct=float(explained_pct),
        unexplained_pct=float(unexplained_pct),
        group_a_mean=y_a_mean,
        group_b_mean=y_b_mean,
        group_a_coefficients=beta_a.tolist(),
        group_b_coefficients=beta_b.tolist(),
        detailed_explained=detailed_explained,
        detailed_unexplained=detailed_unexplained,
        feature_names=all_feature_names,
        n_obs_a=n_a,
        n_obs_b=n_b,
        summary=summary
    )
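A minimal usage sketch of oaxaca_blinder_decomposition (not part of the package diff): the two toy groups differ both in mean characteristics and in returns to them; the simulated data, feature name, and import path are assumptions for illustration.

# Hypothetical usage sketch with a single regressor and the pooled weighting scheme.
import numpy as np
from econometrics.distribution_analysis.oaxaca_blinder import oaxaca_blinder_decomposition

rng = np.random.default_rng(1)
x_a = rng.normal(12, 2, size=(200, 1))                      # e.g. years of education, group A
x_b = rng.normal(11, 2, size=(200, 1))                      # group B has slightly less on average
y_a = 1.0 + 0.08 * x_a[:, 0] + rng.normal(0, 0.1, 200)      # higher returns for group A
y_b = 0.9 + 0.06 * x_b[:, 0] + rng.normal(0, 0.1, 200)

result = oaxaca_blinder_decomposition(
    y_a=y_a.tolist(),
    x_a=x_a.tolist(),
    y_b=y_b.tolist(),
    x_b=x_b.tolist(),
    feature_names=["education"],
    weight_matrix="pooled",
)
print(result.summary)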
econometrics/distribution_analysis/time_series_decomposition.py
@@ -0,0 +1,152 @@
"""
Time series decomposition
Decomposes a time series into trend, seasonal, and residual components
Based on statsmodels
"""

from typing import List, Optional
from pydantic import BaseModel, Field
import numpy as np

try:
    from statsmodels.tsa.seasonal import seasonal_decompose, STL
    STATSMODELS_AVAILABLE = True
except ImportError:
    STATSMODELS_AVAILABLE = False
    seasonal_decompose = None
    STL = None


class TimeSeriesDecompositionResult(BaseModel):
    """Time series decomposition result"""
    trend: List[float] = Field(..., description="Trend component")
    seasonal: List[float] = Field(..., description="Seasonal component")
    residual: List[float] = Field(..., description="Residual component")
    observed: List[float] = Field(..., description="Original observations")
    decomposition_type: str = Field(..., description="Decomposition type (additive/multiplicative)")
    method: str = Field(..., description="Decomposition method")
    period: int = Field(..., description="Seasonal period")
    trend_strength: float = Field(..., description="Trend strength")
    seasonal_strength: float = Field(..., description="Seasonal strength")
    n_observations: int = Field(..., description="Number of observations")
    summary: str = Field(..., description="Summary")


def time_series_decomposition(
    data: List[float],
    period: int = 12,
    model: str = "additive",
    method: str = "classical",
    extrapolate_trend: str = "freq"
) -> TimeSeriesDecompositionResult:
    """
    Time series decomposition

    Args:
        data: Time series data
        period: Seasonal period (e.g. 12 for an annual cycle in monthly data)
        model: Decomposition model - "additive" or "multiplicative"
        method: Decomposition method - "classical" (classical decomposition) or "stl" (STL decomposition)
        extrapolate_trend: Trend extrapolation method

    Returns:
        TimeSeriesDecompositionResult: Time series decomposition result

    Raises:
        ImportError: statsmodels is not installed
        ValueError: invalid input data
    """
    if not STATSMODELS_AVAILABLE:
        raise ImportError("statsmodels is not installed. Please run: pip install statsmodels")

    # Input validation
    if not data:
        raise ValueError("data must not be empty")

    if len(data) < 2 * period:
        raise ValueError(f"Number of data points ({len(data)}) should be at least twice the seasonal period ({period})")

    # Prepare the data
    y = np.array(data, dtype=np.float64)
    n = len(y)

    # Check for missing values
    if np.isnan(y).any():
        raise ValueError("Data contain missing values")

    # Run the decomposition
    if method == "classical":
        # Classical decomposition
        decomposition = seasonal_decompose(
            y,
            model=model,
            period=period,
            extrapolate_trend=extrapolate_trend
        )
    elif method == "stl":
        # STL decomposition (additive model only)
        if model != "additive":
            raise ValueError("STL decomposition only supports the additive model")
        decomposition = STL(y, period=period).fit()
    else:
        raise ValueError(f"Unsupported decomposition method: {method}")

    # Extract the components
    trend = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

    # Handle NaN values (may arise when estimating the trend)
    # Fill with linear interpolation
    if np.isnan(trend).any():
        mask = ~np.isnan(trend)
        indices = np.arange(len(trend))
        trend = np.interp(indices, indices[mask], trend[mask])

    # Compute trend and seasonal strength
    # Trend strength = 1 - Var(residual) / Var(deseasonalized series)
    deseasonalized = y - seasonal
    var_resid = np.var(residual[~np.isnan(residual)])
    var_deseas = np.var(deseasonalized[~np.isnan(deseasonalized)])
    trend_strength = 1 - (var_resid / var_deseas) if var_deseas > 0 else 0.0
    trend_strength = max(0.0, min(1.0, trend_strength))

    # Seasonal strength = 1 - Var(residual) / Var(detrended series)
    detrended = y - trend
    var_detrend = np.var(detrended[~np.isnan(detrended)])
    seasonal_strength = 1 - (var_resid / var_detrend) if var_detrend > 0 else 0.0
    seasonal_strength = max(0.0, min(1.0, seasonal_strength))

    # Build the summary
    summary = f"""Time series decomposition:
- Number of observations: {n}
- Seasonal period: {period}
- Decomposition model: {model}
- Decomposition method: {method}

Component variances:
- Trend variance: {np.var(trend[~np.isnan(trend)]):.4f}
- Seasonal variance: {np.var(seasonal[~np.isnan(seasonal)]):.4f}
- Residual variance: {var_resid:.4f}

Strength indicators:
- Trend strength: {trend_strength:.4f} ({'strong' if trend_strength > 0.6 else 'moderate' if trend_strength > 0.3 else 'weak'})
- Seasonal strength: {seasonal_strength:.4f} ({'strong' if seasonal_strength > 0.6 else 'moderate' if seasonal_strength > 0.3 else 'weak'})

Interpretation:
- {model} model: y = {'trend + seasonal + residual' if model == 'additive' else 'trend × seasonal × residual'}
"""

    return TimeSeriesDecompositionResult(
        trend=trend.tolist(),
        seasonal=seasonal.tolist(),
        residual=residual.tolist(),
        observed=y.tolist(),
        decomposition_type=model,
        method=method,
        period=period,
        trend_strength=float(trend_strength),
        seasonal_strength=float(seasonal_strength),
        n_observations=n,
        summary=summary
    )
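A minimal usage sketch of time_series_decomposition (not part of the package diff) on a synthetic monthly series with a linear trend and an annual seasonal pattern; the simulated series and import path are assumptions for illustration.

# Hypothetical usage sketch: 4 years of monthly data decomposed with STL.
import numpy as np
from econometrics.distribution_analysis.time_series_decomposition import time_series_decomposition

rng = np.random.default_rng(7)
t = np.arange(48)                                                   # 4 years of monthly observations
series = 10 + 0.2 * t + 2.0 * np.sin(2 * np.pi * t / 12) + rng.normal(0, 0.3, 48)

result = time_series_decomposition(
    data=series.tolist(),
    period=12,
    model="additive",
    method="stl",            # or "classical" to use seasonal_decompose
)
print(result.trend_strength, result.seasonal_strength)
print(result.summary)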