aigroup-econ-mcp 1.4.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. PKG-INFO +344 -322
  2. README.md +335 -320
  3. __init__.py +1 -1
  4. aigroup_econ_mcp-2.0.1.dist-info/METADATA +732 -0
  5. aigroup_econ_mcp-2.0.1.dist-info/RECORD +170 -0
  6. cli.py +4 -0
  7. econometrics/advanced_methods/modern_computing_machine_learning/__init__.py +30 -0
  8. econometrics/advanced_methods/modern_computing_machine_learning/causal_forest.py +253 -0
  9. econometrics/advanced_methods/modern_computing_machine_learning/double_ml.py +268 -0
  10. econometrics/advanced_methods/modern_computing_machine_learning/gradient_boosting.py +249 -0
  11. econometrics/advanced_methods/modern_computing_machine_learning/hierarchical_clustering.py +243 -0
  12. econometrics/advanced_methods/modern_computing_machine_learning/kmeans_clustering.py +293 -0
  13. econometrics/advanced_methods/modern_computing_machine_learning/neural_network.py +264 -0
  14. econometrics/advanced_methods/modern_computing_machine_learning/random_forest.py +195 -0
  15. econometrics/advanced_methods/modern_computing_machine_learning/support_vector_machine.py +226 -0
  16. econometrics/advanced_methods/modern_computing_machine_learning/test_all_modules.py +329 -0
  17. econometrics/advanced_methods/modern_computing_machine_learning/test_report.md +107 -0
  18. econometrics/causal_inference/__init__.py +66 -0
  19. econometrics/causal_inference/causal_identification_strategy/__init__.py +104 -0
  20. econometrics/causal_inference/causal_identification_strategy/control_function.py +112 -0
  21. econometrics/causal_inference/causal_identification_strategy/difference_in_differences.py +107 -0
  22. econometrics/causal_inference/causal_identification_strategy/event_study.py +119 -0
  23. econometrics/causal_inference/causal_identification_strategy/first_difference.py +89 -0
  24. econometrics/causal_inference/causal_identification_strategy/fixed_effects.py +103 -0
  25. econometrics/causal_inference/causal_identification_strategy/hausman_test.py +69 -0
  26. econometrics/causal_inference/causal_identification_strategy/instrumental_variables.py +145 -0
  27. econometrics/causal_inference/causal_identification_strategy/mediation_analysis.py +121 -0
  28. econometrics/causal_inference/causal_identification_strategy/moderation_analysis.py +109 -0
  29. econometrics/causal_inference/causal_identification_strategy/propensity_score_matching.py +140 -0
  30. econometrics/causal_inference/causal_identification_strategy/random_effects.py +100 -0
  31. econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py +98 -0
  32. econometrics/causal_inference/causal_identification_strategy/synthetic_control.py +111 -0
  33. econometrics/causal_inference/causal_identification_strategy/triple_difference.py +86 -0
  34. econometrics/distribution_analysis/__init__.py +28 -0
  35. econometrics/distribution_analysis/oaxaca_blinder.py +184 -0
  36. econometrics/distribution_analysis/time_series_decomposition.py +152 -0
  37. econometrics/distribution_analysis/variance_decomposition.py +179 -0
  38. econometrics/missing_data/__init__.py +18 -0
  39. econometrics/missing_data/imputation_methods.py +219 -0
  40. econometrics/nonparametric/__init__.py +35 -0
  41. econometrics/nonparametric/gam_model.py +117 -0
  42. econometrics/nonparametric/kernel_regression.py +161 -0
  43. econometrics/nonparametric/quantile_regression.py +249 -0
  44. econometrics/nonparametric/spline_regression.py +100 -0
  45. econometrics/spatial_econometrics/__init__.py +68 -0
  46. econometrics/spatial_econometrics/geographically_weighted_regression.py +211 -0
  47. econometrics/spatial_econometrics/gwr_simple.py +154 -0
  48. econometrics/spatial_econometrics/spatial_autocorrelation.py +356 -0
  49. econometrics/spatial_econometrics/spatial_durbin_model.py +177 -0
  50. econometrics/spatial_econometrics/spatial_regression.py +315 -0
  51. econometrics/spatial_econometrics/spatial_weights.py +226 -0
  52. econometrics/specific_data_modeling/micro_discrete_limited_data/README.md +164 -0
  53. econometrics/specific_data_modeling/micro_discrete_limited_data/__init__.py +40 -0
  54. econometrics/specific_data_modeling/micro_discrete_limited_data/count_data_models.py +311 -0
  55. econometrics/specific_data_modeling/micro_discrete_limited_data/discrete_choice_models.py +294 -0
  56. econometrics/specific_data_modeling/micro_discrete_limited_data/limited_dependent_variable_models.py +282 -0
  57. econometrics/statistical_inference/__init__.py +21 -0
  58. econometrics/statistical_inference/bootstrap_methods.py +162 -0
  59. econometrics/statistical_inference/permutation_test.py +177 -0
  60. econometrics/survival_analysis/__init__.py +18 -0
  61. econometrics/survival_analysis/survival_models.py +259 -0
  62. econometrics/tests/causal_inference_tests/__init__.py +3 -0
  63. econometrics/tests/causal_inference_tests/detailed_test.py +441 -0
  64. econometrics/tests/causal_inference_tests/test_all_methods.py +418 -0
  65. econometrics/tests/causal_inference_tests/test_causal_identification_strategy.py +202 -0
  66. econometrics/tests/causal_inference_tests/test_difference_in_differences.py +53 -0
  67. econometrics/tests/causal_inference_tests/test_instrumental_variables.py +44 -0
  68. econometrics/tests/specific_data_modeling_tests/test_micro_discrete_limited_data.py +189 -0
  69. econometrics//321/206/320/254/320/272/321/205/342/225/235/320/220/321/205/320/237/320/241/321/205/320/264/320/267/321/207/342/226/222/342/225/227/321/204/342/225/235/320/250/321/205/320/225/320/230/321/207/342/225/221/320/267/321/205/320/230/320/226/321/206/320/256/320/240.md +544 -0
  70. pyproject.toml +9 -2
  71. server.py +15 -1
  72. tools/__init__.py +75 -1
  73. tools/causal_inference_adapter.py +658 -0
  74. tools/distribution_analysis_adapter.py +121 -0
  75. tools/gwr_simple_adapter.py +54 -0
  76. tools/machine_learning_adapter.py +567 -0
  77. tools/mcp_tool_groups/__init__.py +15 -1
  78. tools/mcp_tool_groups/causal_inference_tools.py +643 -0
  79. tools/mcp_tool_groups/distribution_analysis_tools.py +169 -0
  80. tools/mcp_tool_groups/machine_learning_tools.py +422 -0
  81. tools/mcp_tool_groups/microecon_tools.py +325 -0
  82. tools/mcp_tool_groups/missing_data_tools.py +117 -0
  83. tools/mcp_tool_groups/nonparametric_tools.py +225 -0
  84. tools/mcp_tool_groups/spatial_econometrics_tools.py +323 -0
  85. tools/mcp_tool_groups/statistical_inference_tools.py +131 -0
  86. tools/mcp_tools_registry.py +13 -3
  87. tools/microecon_adapter.py +412 -0
  88. tools/missing_data_adapter.py +73 -0
  89. tools/nonparametric_adapter.py +190 -0
  90. tools/spatial_econometrics_adapter.py +318 -0
  91. tools/statistical_inference_adapter.py +90 -0
  92. tools/survival_analysis_adapter.py +46 -0
  93. aigroup_econ_mcp-1.4.3.dist-info/METADATA +0 -710
  94. aigroup_econ_mcp-1.4.3.dist-info/RECORD +0 -92
  95. {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/WHEEL +0 -0
  96. {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/entry_points.txt +0 -0
  97. {aigroup_econ_mcp-1.4.3.dist-info → aigroup_econ_mcp-2.0.1.dist-info}/licenses/LICENSE +0 -0
econometrics/causal_inference/causal_identification_strategy/regression_discontinuity.py
@@ -0,0 +1,98 @@
+ """
+ Regression discontinuity design (RDD) implementation
+ """
+
+ from typing import List, Optional
+ import numpy as np
+ import pandas as pd
+ from pydantic import BaseModel, Field
+ from scipy import stats
+ import statsmodels.api as sm
+ from sklearn.preprocessing import PolynomialFeatures
+
+
+ class RDDResult(BaseModel):
+     """Regression discontinuity design results"""
+     method: str = Field(default="Regression Discontinuity Design", description="Causal identification method used")
+     estimate: float = Field(..., description="Estimated causal effect")
+     std_error: float = Field(..., description="Standard error")
+     t_statistic: float = Field(..., description="t-statistic")
+     p_value: float = Field(..., description="p-value")
+     confidence_interval: List[float] = Field(..., description="Confidence interval")
+     n_observations: int = Field(..., description="Number of observations")
+     bandwidth: Optional[float] = Field(None, description="Bandwidth used")
+     polynomial_order: Optional[int] = Field(None, description="Polynomial order")
+     discontinuity_location: Optional[float] = Field(None, description="Cutoff location")
+
+
+ def regression_discontinuity(
+     running_variable: List[float],
+     outcome: List[float],
+     cutoff: float,
+     bandwidth: Optional[float] = None,
+     polynomial_order: int = 1
+ ) -> RDDResult:
+     """
+     Regression discontinuity design (RDD)
+
+     Estimates the treatment effect at the cutoff via a local OLS regression (statsmodels).
+
+     Args:
+         running_variable: Running variable
+         outcome: Outcome variable
+         cutoff: Cutoff value
+         bandwidth: Bandwidth
+         polynomial_order: Polynomial order
+
+     Returns:
+         RDDResult: Regression discontinuity design results
+     """
+     # Convert to numpy arrays
+     running_array = np.array(running_variable)
+     outcome_array = np.array(outcome)
+
+     # If no bandwidth is given, fall back to a simple default
+     if bandwidth is None:
+         # Use one quarter of the running variable's standard deviation
+         bandwidth = 0.25 * np.std(running_array)
+
+     # Keep only observations inside the bandwidth
+     mask = np.abs(running_array - cutoff) <= bandwidth
+     running_local = running_array[mask]
+     outcome_local = outcome_array[mask]
+
+     # Treatment indicator (running variable at or above the cutoff)
+     treatment = (running_local >= cutoff).astype(int)
+
+     # Polynomial terms of the running variable
+     poly = PolynomialFeatures(degree=polynomial_order, include_bias=False)
+     running_poly = poly.fit_transform(running_local.reshape(-1, 1))
+
+     # Design matrix: constant, treatment dummy, polynomial terms
+     X = np.column_stack([np.ones(len(running_local)), treatment, running_poly])
+
+     # OLS regression with statsmodels
+     model = sm.OLS(outcome_local, X)
+     results = model.fit()
+
+     # Treatment effect is the coefficient on the treatment dummy
+     coef = results.params[1]  # treatment is the second column (index 1)
+     stderr = results.bse[1]
+     tstat = results.tvalues[1]
+     pval = results.pvalues[1]
+
+     # 95% confidence interval
+     ci_lower = coef - 1.96 * stderr
+     ci_upper = coef + 1.96 * stderr
+
+     return RDDResult(
+         estimate=float(coef),
+         std_error=float(stderr),
+         t_statistic=float(tstat),
+         p_value=float(pval),
+         confidence_interval=[float(ci_lower), float(ci_upper)],
+         n_observations=len(running_local),
+         bandwidth=bandwidth,
+         polynomial_order=polynomial_order,
+         discontinuity_location=cutoff
+     )
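A minimal usage sketch for the new RDD helper. The import path below is assumed from the file list above, and the simulated data, seed, and bandwidth value are illustrative only, not taken from the package's own tests:

# Illustrative data: a sharp jump of +2.0 at cutoff 0.5
import numpy as np
from econometrics.causal_inference.causal_identification_strategy.regression_discontinuity import regression_discontinuity

rng = np.random.default_rng(0)
x = rng.uniform(0.0, 1.0, 500)
y = 1.0 + 0.5 * x + 2.0 * (x >= 0.5) + rng.normal(0.0, 0.3, 500)

result = regression_discontinuity(
    running_variable=x.tolist(),
    outcome=y.tolist(),
    cutoff=0.5,
    bandwidth=0.2,          # optional; defaults to 0.25 * std of the running variable
    polynomial_order=1,
)
print(result.estimate, result.confidence_interval)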
econometrics/causal_inference/causal_identification_strategy/synthetic_control.py
@@ -0,0 +1,111 @@
+ """
+ Synthetic control method implementation
+ """
+
+ from typing import List, Optional, Dict
+ import numpy as np
+ import pandas as pd
+ from pydantic import BaseModel, Field
+ from scipy.optimize import minimize
+ from sklearn.linear_model import LinearRegression
+
+
+ class SyntheticControlResult(BaseModel):
+     """Synthetic control method results"""
+     method: str = Field(default="Synthetic Control Method", description="Causal identification method used")
+     treatment_effect: float = Field(..., description="Estimated treatment effect")
+     synthetic_weights: List[float] = Field(..., description="Synthetic control weights")
+     n_observations: int = Field(..., description="Number of observations")
+     donor_units: List[str] = Field(..., description="Donor (control) units")
+     pre_treatment_fit: Dict[str, float] = Field(..., description="Pre-treatment fit measures")
+
+
+ def synthetic_control_method(
+     outcome: List[float],
+     treatment_period: int,
+     treated_unit: str,
+     donor_units: List[str],
+     time_periods: List[str]
+ ) -> SyntheticControlResult:
+     """
+     Synthetic control method
+
+     Estimates the treatment effect by constructing a "synthetic" control unit as a
+     weighted combination of untreated donor units.
+
+     Args:
+         outcome: Outcome variable (time series for all units, stacked unit by unit)
+         treatment_period: Period in which treatment starts
+         treated_unit: Name of the treated unit
+         donor_units: Names of the donor units
+         time_periods: List of time periods
+
+     Returns:
+         SyntheticControlResult: Synthetic control method results
+     """
+     # The data are assumed to be stacked by unit, each unit's time series in order
+     n_units = len(donor_units) + 1  # including the treated unit
+     n_time = len(time_periods)
+
+     if len(outcome) != n_units * n_time:
+         raise ValueError("Length of outcome must equal number of units times number of periods")
+
+     # Reshape into a (unit, time) matrix
+     outcome_matrix = np.array(outcome).reshape(n_units, n_time)
+
+     # Index of the treated unit
+     treated_idx = 0  # the treated unit is assumed to come first
+
+     # Pre-treatment data
+     pre_treatment_periods = treatment_period
+     treated_pre = outcome_matrix[treated_idx, :pre_treatment_periods]
+     donors_pre = outcome_matrix[1:, :pre_treatment_periods]  # exclude the treated unit
+
+     # Objective: minimize mean squared pre-treatment prediction error
+     def objective(weights):
+         synthetic = donors_pre.T @ weights
+         mse = np.mean((treated_pre - synthetic) ** 2)
+         return mse
+
+     # Constraints: weights are non-negative and sum to one
+     constraints = [
+         {"type": "eq", "fun": lambda w: np.sum(w) - 1}  # weights sum to 1
+     ]
+     bounds = [(0, 1) for _ in range(len(donor_units))]  # each weight in [0, 1]
+
+     # Initial weights
+     initial_weights = np.ones(len(donor_units)) / len(donor_units)
+
+     # Solve the optimization problem
+     result = minimize(objective, initial_weights, method='SLSQP',
+                       bounds=bounds, constraints=constraints)
+
+     optimal_weights = result.x
+
+     # Compute the synthetic control outcome (transposed so the matrix product is conformable)
+     # outcome_matrix[1:] has shape (n_units-1, n_time)
+     # optimal_weights has shape (n_units-1,)
+     # we need (n_time, n_units-1) @ (n_units-1,) = (n_time,)
+     synthetic_control = outcome_matrix[1:].T @ optimal_weights  # all periods
+
+     # Treatment effect (post-treatment periods)
+     post_treatment_outcome = outcome_matrix[treated_idx, treatment_period:]
+     post_treatment_synthetic = synthetic_control[treatment_period:]
+     treatment_effect = np.mean(post_treatment_outcome - post_treatment_synthetic)
+
+     # Pre-treatment fit measures
+     pre_treatment_synthetic = synthetic_control[:treatment_period]
+     pre_treatment_r2 = 1 - np.sum((treated_pre - pre_treatment_synthetic) ** 2) / \
+                        np.sum((treated_pre - np.mean(treated_pre)) ** 2)
+
+     pre_treatment_fit = {
+         "R-squared": float(pre_treatment_r2),
+         "RMSE": float(np.sqrt(np.mean((treated_pre - pre_treatment_synthetic) ** 2)))
+     }
+
+     return SyntheticControlResult(
+         treatment_effect=float(treatment_effect),
+         synthetic_weights=optimal_weights.tolist(),
+         n_observations=len(outcome),
+         donor_units=donor_units,
+         pre_treatment_fit=pre_treatment_fit
+     )
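A usage sketch for the synthetic control helper. The key assumption to get right is the flat outcome layout the function expects (treated unit's series first, then each donor's series); the import path, seed, and numbers below are illustrative assumptions:

# Illustrative data: one treated unit plus three donors over 10 periods; treatment starts at period 6
import numpy as np
from econometrics.causal_inference.causal_identification_strategy.synthetic_control import synthetic_control_method

rng = np.random.default_rng(1)
time_periods = [f"t{i}" for i in range(10)]
donors = ["B", "C", "D"]

base = np.linspace(10.0, 19.0, 10)
treated = base + rng.normal(0.0, 0.2, 10)
treated[6:] += 3.0                                   # true post-treatment effect of +3
donor_series = [base + rng.normal(0.0, 0.2, 10) for _ in donors]

# Flat layout: treated unit's series first, then each donor's series
outcome = np.concatenate([treated] + donor_series).tolist()

result = synthetic_control_method(
    outcome=outcome,
    treatment_period=6,
    treated_unit="A",
    donor_units=donors,
    time_periods=time_periods,
)
print(result.treatment_effect, result.synthetic_weights)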
econometrics/causal_inference/causal_identification_strategy/triple_difference.py
@@ -0,0 +1,86 @@
+ """
+ Triple difference (DDD) implementation
+ """
+
+ from typing import List, Optional
+ import numpy as np
+ import pandas as pd
+ from pydantic import BaseModel, Field
+ import statsmodels.api as sm
+ from scipy import stats
+
+
+ class TripeDifferenceResult(BaseModel):
+     """Triple difference results"""
+     method: str = Field(default="Triple Difference", description="Causal identification method used")
+     estimate: float = Field(..., description="Estimated causal effect")
+     std_error: float = Field(..., description="Standard error")
+     t_statistic: float = Field(..., description="t-statistic")
+     p_value: float = Field(..., description="p-value")
+     confidence_interval: List[float] = Field(..., description="Confidence interval")
+     n_observations: int = Field(..., description="Number of observations")
+
+
+ def triple_difference(
+     outcome: List[float],
+     treatment_group: List[int],
+     time_period: List[int],
+     cohort_group: List[int]
+ ) -> TripeDifferenceResult:
+     """
+     Triple difference (DDD)
+
+     The triple difference design adds a third dimension (such as a cohort group) to
+     control for additional confounders, giving stronger identification than a
+     standard difference-in-differences design.
+
+     Args:
+         outcome: Outcome variable
+         treatment_group: Treatment group dummy (0/1)
+         time_period: Time dummy (0/1)
+         cohort_group: Cohort group dummy (0/1)
+
+     Returns:
+         TripeDifferenceResult: Triple difference results
+     """
+     # Assemble the data
+     df = pd.DataFrame({
+         'outcome': outcome,
+         'treatment': treatment_group,
+         'time': time_period,
+         'cohort': cohort_group
+     })
+
+     # Interaction terms
+     df['treatment_time'] = df['treatment'] * df['time']
+     df['treatment_cohort'] = df['treatment'] * df['cohort']
+     df['time_cohort'] = df['time'] * df['cohort']
+     df['treatment_time_cohort'] = df['treatment'] * df['time'] * df['cohort']
+
+     # Design matrix
+     X_vars = ['treatment', 'time', 'cohort', 'treatment_time', 'treatment_cohort', 'time_cohort', 'treatment_time_cohort']
+     X = df[X_vars]
+     X = sm.add_constant(X)  # add the intercept
+     y = df['outcome']
+
+     # OLS regression
+     model = sm.OLS(y, X)
+     results = model.fit()
+
+     # The DDD estimate is the coefficient on the triple interaction
+     coef = results.params['treatment_time_cohort']
+     stderr = results.bse['treatment_time_cohort']
+     tstat = results.tvalues['treatment_time_cohort']
+     pval = results.pvalues['treatment_time_cohort']
+
+     # 95% confidence interval
+     ci_lower = coef - 1.96 * stderr
+     ci_upper = coef + 1.96 * stderr
+
+     return TripeDifferenceResult(
+         estimate=float(coef),
+         std_error=float(stderr),
+         t_statistic=float(tstat),
+         p_value=float(pval),
+         confidence_interval=[float(ci_lower), float(ci_upper)],
+         n_observations=len(df)
+     )
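A usage sketch for the triple-difference helper on synthetic data. The import path follows the file list above; the data-generating values (including the true triple-interaction effect of 1.5) are made up for illustration:

# Illustrative data: a saturated 2x2x2 design with a triple-interaction effect of 1.5
import numpy as np
from itertools import product
from econometrics.causal_inference.causal_identification_strategy.triple_difference import triple_difference

rng = np.random.default_rng(2)
cells = list(product([0, 1], [0, 1], [0, 1])) * 50   # (treatment, time, cohort) cells
treat = [t for t, _, _ in cells]
time = [p for _, p, _ in cells]
cohort = [c for _, _, c in cells]
outcome = [
    2.0 + 0.3 * t + 0.2 * p + 0.1 * c + 1.5 * t * p * c + rng.normal(0.0, 0.5)
    for t, p, c in cells
]

result = triple_difference(
    outcome=outcome,
    treatment_group=treat,
    time_period=time,
    cohort_group=cohort,
)
print(result.estimate, result.p_value)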
econometrics/distribution_analysis/__init__.py
@@ -0,0 +1,28 @@
+ """
+ Distribution analysis and decomposition module
+ Analyzes the conditional distribution of the dependent variable and performs various decompositions
+ """
+
+ from .oaxaca_blinder import (
+     oaxaca_blinder_decomposition,
+     OaxacaResult
+ )
+
+ from .variance_decomposition import (
+     variance_decomposition,
+     VarianceDecompositionResult
+ )
+
+ from .time_series_decomposition import (
+     time_series_decomposition,
+     TimeSeriesDecompositionResult
+ )
+
+ __all__ = [
+     'oaxaca_blinder_decomposition',
+     'OaxacaResult',
+     'variance_decomposition',
+     'VarianceDecompositionResult',
+     'time_series_decomposition',
+     'TimeSeriesDecompositionResult'
+ ]
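Because the three decomposition entry points are re-exported at the package level, downstream callers can (assuming the modules resolve at the path shown in the file list) import them directly:

from econometrics.distribution_analysis import (
    oaxaca_blinder_decomposition,
    variance_decomposition,
    time_series_decomposition,
)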
econometrics/distribution_analysis/oaxaca_blinder.py
@@ -0,0 +1,184 @@
+ """
+ Oaxaca-Blinder decomposition
+ Decomposes the mean difference between two groups (e.g. a wage gap)
+ Based on statsmodels plus a custom implementation
+ """
+
+ from typing import List, Optional, Tuple
+ from pydantic import BaseModel, Field
+ import numpy as np
+
+ try:
+     import statsmodels.api as sm
+     STATSMODELS_AVAILABLE = True
+ except ImportError:
+     STATSMODELS_AVAILABLE = False
+     sm = None
+
+
+ class OaxacaResult(BaseModel):
+     """Oaxaca-Blinder decomposition results"""
+     total_difference: float = Field(..., description="Total difference")
+     explained_part: float = Field(..., description="Explained part (endowment effect)")
+     unexplained_part: float = Field(..., description="Unexplained part (coefficient effect)")
+     explained_pct: float = Field(..., description="Explained share (percent)")
+     unexplained_pct: float = Field(..., description="Unexplained share (percent)")
+     group_a_mean: float = Field(..., description="Group A mean")
+     group_b_mean: float = Field(..., description="Group B mean")
+     group_a_coefficients: List[float] = Field(..., description="Group A regression coefficients")
+     group_b_coefficients: List[float] = Field(..., description="Group B regression coefficients")
+     detailed_explained: List[float] = Field(..., description="Explained part by variable")
+     detailed_unexplained: List[float] = Field(..., description="Unexplained part by variable")
+     feature_names: List[str] = Field(..., description="Feature names")
+     n_obs_a: int = Field(..., description="Number of observations, group A")
+     n_obs_b: int = Field(..., description="Number of observations, group B")
+     summary: str = Field(..., description="Summary text")
+
+
+ def oaxaca_blinder_decomposition(
+     y_a: List[float],
+     x_a: List[List[float]],
+     y_b: List[float],
+     x_b: List[List[float]],
+     feature_names: Optional[List[str]] = None,
+     weight_matrix: str = "pooled"
+ ) -> OaxacaResult:
+     """
+     Oaxaca-Blinder decomposition
+     Splits the mean difference between two groups into an explained and an unexplained part
+
+     Args:
+         y_a: Group A dependent variable (e.g. male wages)
+         x_a: Group A regressors
+         y_b: Group B dependent variable (e.g. female wages)
+         x_b: Group B regressors
+         feature_names: Feature names
+         weight_matrix: Reference coefficient choice - "pooled", "group_a", or "group_b"
+
+     Returns:
+         OaxacaResult: Oaxaca-Blinder decomposition results
+
+     Raises:
+         ImportError: statsmodels is not installed
+         ValueError: invalid input data
+     """
+     if not STATSMODELS_AVAILABLE:
+         raise ImportError("statsmodels is not installed. Run: pip install statsmodels")
+
+     # Input validation
+     if not y_a or not x_a or not y_b or not x_b:
+         raise ValueError("None of the inputs may be empty")
+
+     # Data preparation
+     y_a_arr = np.array(y_a, dtype=np.float64)
+     X_a_arr = np.array(x_a, dtype=np.float64)
+     y_b_arr = np.array(y_b, dtype=np.float64)
+     X_b_arr = np.array(x_b, dtype=np.float64)
+
+     # Ensure X is two-dimensional
+     if X_a_arr.ndim == 1:
+         X_a_arr = X_a_arr.reshape(-1, 1)
+     if X_b_arr.ndim == 1:
+         X_b_arr = X_b_arr.reshape(-1, 1)
+
+     n_a = len(y_a_arr)
+     n_b = len(y_b_arr)
+     k = X_a_arr.shape[1]
+
+     # Data validation
+     if X_a_arr.shape[1] != X_b_arr.shape[1]:
+         raise ValueError("Both groups must have the same number of regressors")
+
+     # Add the constant term
+     X_a_const = sm.add_constant(X_a_arr)
+     X_b_const = sm.add_constant(X_b_arr)
+
+     # Feature names
+     if feature_names is None:
+         feature_names = [f"X{i+1}" for i in range(k)]
+     all_feature_names = ["const"] + feature_names
+
+     # Separate OLS regressions for the two groups
+     model_a = sm.OLS(y_a_arr, X_a_const).fit()
+     model_b = sm.OLS(y_b_arr, X_b_const).fit()
+
+     # Coefficients
+     beta_a = model_a.params
+     beta_b = model_b.params
+
+     # Group means of the regressors
+     X_a_mean = X_a_const.mean(axis=0)
+     X_b_mean = X_b_const.mean(axis=0)
+
+     # Group means of the dependent variable
+     y_a_mean = float(y_a_arr.mean())
+     y_b_mean = float(y_b_arr.mean())
+
+     # Total difference
+     total_diff = y_a_mean - y_b_mean
+
+     # Reference coefficients according to the chosen weighting
+     if weight_matrix == "pooled":
+         # Use the pooled regression coefficients as the reference
+         y_pooled = np.concatenate([y_a_arr, y_b_arr])
+         X_pooled = np.vstack([X_a_const, X_b_const])
+         model_pooled = sm.OLS(y_pooled, X_pooled).fit()
+         beta_ref = model_pooled.params
+     elif weight_matrix == "group_a":
+         beta_ref = beta_a
+     elif weight_matrix == "group_b":
+         beta_ref = beta_b
+     else:
+         raise ValueError(f"Unsupported weight_matrix type: {weight_matrix}")
+
+     # Oaxaca decomposition
+     # Explained part (endowment effect): (X̄ₐ - X̄ᵦ)' β*
+     explained = (X_a_mean - X_b_mean) @ beta_ref
+
+     # Unexplained part (coefficient effect): X̄ₐ'(βₐ - β*) + X̄ᵦ'(β* - βᵦ)
+     unexplained = X_a_mean @ (beta_a - beta_ref) + X_b_mean @ (beta_ref - beta_b)
+
+     # Detailed decomposition (contribution of each variable)
+     detailed_explained = ((X_a_mean - X_b_mean) * beta_ref).tolist()
+     detailed_unexplained = (
+         X_a_mean * (beta_a - beta_ref) + X_b_mean * (beta_ref - beta_b)
+     ).tolist()
+
+     # Percentages
+     explained_pct = (explained / total_diff * 100) if total_diff != 0 else 0.0
+     unexplained_pct = (unexplained / total_diff * 100) if total_diff != 0 else 0.0
+
+     # Summary text
+     summary = f"""Oaxaca-Blinder decomposition:
+ - Total difference: {total_diff:.4f}
+ - Group A mean: {y_a_mean:.4f} (n={n_a})
+ - Group B mean: {y_b_mean:.4f} (n={n_b})
+
+ Decomposition:
+ - Explained part (endowment effect): {explained:.4f} ({explained_pct:.1f}%)
+ - Unexplained part (coefficient effect): {unexplained:.4f} ({unexplained_pct:.1f}%)
+
+ Contribution by variable:
+ """
+     for i, name in enumerate(all_feature_names):
+         summary += f"  {name}:\n"
+         summary += f"    - endowment effect: {detailed_explained[i]:.4f}\n"
+         summary += f"    - coefficient effect: {detailed_unexplained[i]:.4f}\n"
+
+     return OaxacaResult(
+         total_difference=float(total_diff),
+         explained_part=float(explained),
+         unexplained_part=float(unexplained),
+         explained_pct=float(explained_pct),
+         unexplained_pct=float(unexplained_pct),
+         group_a_mean=y_a_mean,
+         group_b_mean=y_b_mean,
+         group_a_coefficients=beta_a.tolist(),
+         group_b_coefficients=beta_b.tolist(),
+         detailed_explained=detailed_explained,
+         detailed_unexplained=detailed_unexplained,
+         feature_names=all_feature_names,
+         n_obs_a=n_a,
+         n_obs_b=n_b,
+         summary=summary
+     )
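A usage sketch for the Oaxaca-Blinder helper. The package-level import path is assumed from the __init__.py above, and the simulated "schooling" data, returns, and seed are purely illustrative:

# Illustrative data: group A has both a higher mean regressor and a higher return to it
import numpy as np
from econometrics.distribution_analysis import oaxaca_blinder_decomposition

rng = np.random.default_rng(3)
x_a = rng.normal(12.0, 2.0, 300).reshape(-1, 1)          # e.g. years of schooling, group A
x_b = rng.normal(11.0, 2.0, 300).reshape(-1, 1)          # group B averages one year less
y_a = 1.0 + 0.8 * x_a[:, 0] + rng.normal(0.0, 1.0, 300)  # group A return: 0.8
y_b = 1.0 + 0.6 * x_b[:, 0] + rng.normal(0.0, 1.0, 300)  # group B return: 0.6

result = oaxaca_blinder_decomposition(
    y_a=y_a.tolist(),
    x_a=x_a.tolist(),
    y_b=y_b.tolist(),
    x_b=x_b.tolist(),
    feature_names=["schooling"],
    weight_matrix="pooled",       # reference coefficients from the pooled regression
)
print(result.summary)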
econometrics/distribution_analysis/time_series_decomposition.py
@@ -0,0 +1,152 @@
+ """
+ Time series decomposition
+ Splits a time series into trend, seasonal, and residual components
+ Based on statsmodels
+ """
+
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+ import numpy as np
+
+ try:
+     from statsmodels.tsa.seasonal import seasonal_decompose, STL
+     STATSMODELS_AVAILABLE = True
+ except ImportError:
+     STATSMODELS_AVAILABLE = False
+     seasonal_decompose = None
+     STL = None
+
+
+ class TimeSeriesDecompositionResult(BaseModel):
+     """Time series decomposition results"""
+     trend: List[float] = Field(..., description="Trend component")
+     seasonal: List[float] = Field(..., description="Seasonal component")
+     residual: List[float] = Field(..., description="Residual component")
+     observed: List[float] = Field(..., description="Original observations")
+     decomposition_type: str = Field(..., description="Decomposition type (additive/multiplicative)")
+     method: str = Field(..., description="Decomposition method")
+     period: int = Field(..., description="Seasonal period")
+     trend_strength: float = Field(..., description="Trend strength")
+     seasonal_strength: float = Field(..., description="Seasonal strength")
+     n_observations: int = Field(..., description="Number of observations")
+     summary: str = Field(..., description="Summary text")
+
+
+ def time_series_decomposition(
+     data: List[float],
+     period: int = 12,
+     model: str = "additive",
+     method: str = "classical",
+     extrapolate_trend: str = "freq"
+ ) -> TimeSeriesDecompositionResult:
+     """
+     Time series decomposition
+
+     Args:
+         data: Time series data
+         period: Seasonal period (e.g. 12 for the yearly cycle of monthly data)
+         model: Decomposition model - "additive" or "multiplicative"
+         method: Decomposition method - "classical" or "stl"
+         extrapolate_trend: Trend extrapolation method
+
+     Returns:
+         TimeSeriesDecompositionResult: Time series decomposition results
+
+     Raises:
+         ImportError: statsmodels is not installed
+         ValueError: invalid input data
+     """
+     if not STATSMODELS_AVAILABLE:
+         raise ImportError("statsmodels is not installed. Run: pip install statsmodels")
+
+     # Input validation
+     if not data:
+         raise ValueError("data must not be empty")
+
+     if len(data) < 2 * period:
+         raise ValueError(f"Number of data points ({len(data)}) should be at least twice the seasonal period ({period})")
+
+     # Data preparation
+     y = np.array(data, dtype=np.float64)
+     n = len(y)
+
+     # Check for missing values
+     if np.isnan(y).any():
+         raise ValueError("The data contain missing values")
+
+     # Run the decomposition
+     if method == "classical":
+         # Classical moving-average decomposition
+         decomposition = seasonal_decompose(
+             y,
+             model=model,
+             period=period,
+             extrapolate_trend=extrapolate_trend
+         )
+     elif method == "stl":
+         # STL decomposition (additive model only)
+         if model != "additive":
+             raise ValueError("STL decomposition only supports the additive model")
+         decomposition = STL(y, period=period).fit()
+     else:
+         raise ValueError(f"Unsupported decomposition method: {method}")
+
+     # Extract the components
+     trend = decomposition.trend
+     seasonal = decomposition.seasonal
+     residual = decomposition.resid
+
+     # Handle NaN values (the trend estimate can produce them at the edges)
+     # Fill by linear interpolation
+     if np.isnan(trend).any():
+         mask = ~np.isnan(trend)
+         indices = np.arange(len(trend))
+         trend = np.interp(indices, indices[mask], trend[mask])
+
+     # Trend and seasonal strength
+     # Trend strength = 1 - Var(residual) / Var(deseasonalized series)
+     deseasonalized = y - seasonal
+     var_resid = np.var(residual[~np.isnan(residual)])
+     var_deseas = np.var(deseasonalized[~np.isnan(deseasonalized)])
+     trend_strength = 1 - (var_resid / var_deseas) if var_deseas > 0 else 0.0
+     trend_strength = max(0.0, min(1.0, trend_strength))
+
+     # Seasonal strength = 1 - Var(residual) / Var(detrended series)
+     detrended = y - trend
+     var_detrend = np.var(detrended[~np.isnan(detrended)])
+     seasonal_strength = 1 - (var_resid / var_detrend) if var_detrend > 0 else 0.0
+     seasonal_strength = max(0.0, min(1.0, seasonal_strength))
+
+     # Summary text
+     summary = f"""Time series decomposition:
+ - Observations: {n}
+ - Seasonal period: {period}
+ - Model: {model}
+ - Method: {method}
+
+ Component variances:
+ - Trend variance: {np.var(trend[~np.isnan(trend)]):.4f}
+ - Seasonal variance: {np.var(seasonal[~np.isnan(seasonal)]):.4f}
+ - Residual variance: {var_resid:.4f}
+
+ Strength measures:
+ - Trend strength: {trend_strength:.4f} ({'strong' if trend_strength > 0.6 else 'moderate' if trend_strength > 0.3 else 'weak'})
+ - Seasonal strength: {seasonal_strength:.4f} ({'strong' if seasonal_strength > 0.6 else 'moderate' if seasonal_strength > 0.3 else 'weak'})
+
+ Interpretation:
+ - {model} model: y = {'trend + seasonal + residual' if model == 'additive' else 'trend × seasonal × residual'}
+ """
+
+     return TimeSeriesDecompositionResult(
+         trend=trend.tolist(),
+         seasonal=seasonal.tolist(),
+         residual=residual.tolist(),
+         observed=y.tolist(),
+         decomposition_type=model,
+         method=method,
+         period=period,
+         trend_strength=float(trend_strength),
+         seasonal_strength=float(seasonal_strength),
+         n_observations=n,
+         summary=summary
+     )
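A usage sketch for the time series decomposition helper. The package-level import path is assumed from the __init__.py above; the simulated monthly-style series (trend plus a period-12 seasonal cycle) is illustrative only:

# Illustrative data: 6 "years" of monthly-style observations with trend and yearly seasonality
import numpy as np
from econometrics.distribution_analysis import time_series_decomposition

rng = np.random.default_rng(4)
t = np.arange(72)
data = 0.5 * t + 10.0 * np.sin(2.0 * np.pi * t / 12.0) + rng.normal(0.0, 1.0, 72)

result = time_series_decomposition(
    data=data.tolist(),
    period=12,
    model="additive",
    method="stl",                 # or "classical" for the moving-average decomposition
)
print(result.trend_strength, result.seasonal_strength)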