ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,2759 @@
+ # Moving data between CPU and GPU carries significant overhead, but multiple
+ # CUDA streams can overlap transfers with compute, which makes larger
+ # datasets workable.
+
+ import copy
+ import gc
+ import math
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import joblib
+ import matplotlib.pyplot as plt
+ import numpy as np  # 1.26.2
+ import optuna  # 4.3.0
+ import pandas as pd  # 2.2.3
+ import shap
+ import statsmodels.api as sm
+
+ import torch  # version: 1.10.1+cu111
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import xgboost as xgb  # 1.7.0
+
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
+ from torch.cuda.amp import autocast, GradScaler
+ from torch.nn.utils import clip_grad_norm_
+ from sklearn.model_selection import ShuffleSplit, cross_val_score  # 1.2.2
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import log_loss, make_scorer, mean_tweedie_deviance
+
+ # =============================================================================
+ # Constants & utilities
+ # =============================================================================
+ EPS = 1e-8
+
+
+ def ensure_parent_dir(file_path: str) -> None:
+     # Create the target file's parent directory if it does not already exist.
+     directory = os.path.dirname(file_path)
+     if directory:
+         os.makedirs(directory, exist_ok=True)
+
+
+ def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
+     # Estimate a batch size from the learning rate and sample count, then
+     # clamp it to [max(1, minimum), data_size].
+     estimated = int((learning_rate / 1e-4) ** 0.5 *
+                     (data_size / max(batch_num, 1)))
+     return max(1, min(data_size, max(minimum, estimated)))
+
+
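+ # Worked example (illustrative numbers): with data_size=100_000,
+ # learning_rate=1e-3 and batch_num=100 the estimate is
+ # sqrt(1e-3 / 1e-4) * (100_000 / 100) = sqrt(10) * 1000 ≈ 3162,
+ # which already lies inside [64, 100_000]:
+ #   compute_batch_size(100_000, 1e-3, 100, minimum=64)  # -> 3162
+
+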
+ # Tweedie deviance loss for PyTorch.
+ # Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances
+
+
+ def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
+     # Clamp predictions to be strictly positive for numerical stability.
+     pred_clamped = torch.clamp(pred, min=eps)
+     # Assemble the pieces of the Tweedie deviance.
+     if p == 1:
+         # Poisson case: 2 * (y * log(y / mu) - y + mu)
+         term1 = target * torch.log(target / pred_clamped + eps)
+         term2 = target - pred_clamped
+         term3 = 0
+     elif p == 0:
+         # Gaussian case: squared error
+         term1 = 0.5 * torch.pow(target - pred_clamped, 2)
+         term2 = 0
+         term3 = 0
+     elif p == 2:
+         # Gamma case: 2 * (y / mu + log(mu / y) - 1)
+         term1 = torch.log(pred_clamped / target + eps)
+         term2 = -target / pred_clamped + 1
+         term3 = 0
+     else:
+         term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
+         term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
+         term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
+     # Tweedie negative log-likelihood up to a constant.
+     return torch.nan_to_num(2 * (term1 - term2 + term3), nan=eps, posinf=max_clip, neginf=-max_clip)
+
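+
+ # Illustrative sanity check (arbitrary values): the generic branch should
+ # agree with sklearn's mean_tweedie_deviance for 1 < p < 2.
+ def _demo_tweedie_loss_check():
+     pred = torch.tensor([1.0, 2.0, 0.5])
+     target = torch.tensor([0.0, 3.0, 0.7])
+     ours = tweedie_loss(pred, target, p=1.5).mean().item()
+     sk = mean_tweedie_deviance(target.numpy(), pred.numpy(), power=1.5)
+     print(f"torch: {ours:.6f}  sklearn: {sk:.6f}")
+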
+ # Helper that releases CUDA memory.
+
+
+ def free_cuda():
+     print(">>> Moving all models to CPU...")
+     for obj in gc.get_objects():
+         try:
+             if hasattr(obj, "to") and callable(obj.to):
+                 obj.to("cpu")
+         except Exception:
+             # Skip objects such as torch.device that cannot be moved.
+             pass
+
+     print(">>> Deleting tensors, optimizers, dataloaders...")
+     gc.collect()
+
+     print(">>> Emptying CUDA cache...")
+     torch.cuda.empty_cache()
+     torch.cuda.synchronize()
+
+     print(">>> CUDA memory freed.")
+
+
+ # =============================================================================
+ # Plotting helpers
+ # =============================================================================
+
+ # Weighted binning helper.
+
+
+ def split_data(data, col_nme, wgt_nme, n_bins=10):
+     # Work on a sorted copy so the original frame is left untouched.
+     data_sorted = data.sort_values(by=col_nme, ascending=True).copy()
+     data_sorted['cum_weight'] = data_sorted[wgt_nme].cumsum()
+     w_sum = data_sorted[wgt_nme].sum()
+     if w_sum <= EPS:
+         data_sorted.loc[:, 'bins'] = 0
+     else:
+         data_sorted.loc[:, 'bins'] = np.floor(
+             data_sorted['cum_weight'] * float(n_bins) / w_sum
+         )
+     data_sorted.loc[(data_sorted['bins'] == n_bins), 'bins'] = n_bins - 1
+     return data_sorted.groupby(['bins'], observed=True).sum(numeric_only=True)
+
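+ # Illustrative usage (toy data): bin 100 equally weighted rows into weighted
+ # quintiles of 'pred'; each bin then carries about 20 units of weight.
+ def _demo_split_data():
+     df = pd.DataFrame({'pred': np.random.rand(100), 'w': np.ones(100)})
+     print(split_data(df, 'pred', 'w', n_bins=5)['w'])
+
+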
126
+ # 定义提纯曲线(Lift)绘制函数
127
+
128
+
129
+ def plot_lift_list(pred_model, w_pred_list, w_act_list,
130
+ weight_list, tgt_nme, n_bins=10,
131
+ fig_nme='Lift Chart'):
132
+ lift_data = pd.DataFrame()
133
+ lift_data.loc[:, 'pred'] = pred_model
134
+ lift_data.loc[:, 'w_pred'] = w_pred_list
135
+ lift_data.loc[:, 'act'] = w_act_list
136
+ lift_data.loc[:, 'weight'] = weight_list
137
+ plot_data = split_data(lift_data, 'pred', 'weight', n_bins)
138
+ plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
139
+ plot_data['act_v'] = plot_data['act'] / plot_data['weight']
140
+ plot_data.reset_index(inplace=True)
141
+ fig = plt.figure(figsize=(7, 5))
142
+ ax = fig.add_subplot(111)
143
+ ax.plot(plot_data.index, plot_data['act_v'],
144
+ label='Actual', color='red')
145
+ ax.plot(plot_data.index, plot_data['exp_v'],
146
+ label='Predicted', color='blue')
147
+ ax.set_title(
148
+ 'Lift Chart of %s' % tgt_nme, fontsize=8)
149
+ plt.xticks(plot_data.index,
150
+ plot_data.index,
151
+ rotation=90, fontsize=6)
152
+ plt.yticks(fontsize=6)
153
+ plt.legend(loc='upper left',
154
+ fontsize=5, frameon=False)
155
+ plt.margins(0.05)
156
+ ax2 = ax.twinx()
157
+ ax2.bar(plot_data.index, plot_data['weight'],
158
+ alpha=0.5, color='seagreen',
159
+ label='Earned Exposure')
160
+ plt.yticks(fontsize=6)
161
+ plt.legend(loc='upper right',
162
+ fontsize=5, frameon=False)
163
+ plt.subplots_adjust(wspace=0.3)
164
+ save_path = os.path.join(
165
+ os.getcwd(), 'plot', f'05_{tgt_nme}_{fig_nme}.png')
166
+ ensure_parent_dir(save_path)
167
+ plt.savefig(save_path, dpi=300)
168
+ plt.close(fig)
169
+
+ # Double lift chart plotting helper.
+
+
+ def plot_dlift_list(pred_model_1, pred_model_2,
+                     model_nme_1, model_nme_2,
+                     tgt_nme,
+                     w_list, w_act_list, n_bins=10,
+                     fig_nme='Double Lift Chart'):
+     lift_data = pd.DataFrame({
+         'pred1': pred_model_1,
+         'pred2': pred_model_2,
+         'act': w_act_list,
+         'weight': w_list,
+     })
+     # Sort key: ratio of the two model predictions.
+     lift_data['diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+     lift_data['w_pred1'] = lift_data['pred1'] * lift_data['weight']
+     lift_data['w_pred2'] = lift_data['pred2'] * lift_data['weight']
+     plot_data = split_data(lift_data, 'diff_ly', 'weight', n_bins)
+     plot_data['exp_v1'] = plot_data['w_pred1'] / plot_data['act']
+     plot_data['exp_v2'] = plot_data['w_pred2'] / plot_data['act']
+     plot_data['act_v'] = plot_data['act'] / plot_data['act']
+     plot_data.reset_index(inplace=True)
+     fig = plt.figure(figsize=(7, 5))
+     ax = fig.add_subplot(111)
+     ax.plot(plot_data.index, plot_data['act_v'],
+             label='Actual', color='red')
+     ax.plot(plot_data.index, plot_data['exp_v1'],
+             label=model_nme_1, color='blue')
+     ax.plot(plot_data.index, plot_data['exp_v2'],
+             label=model_nme_2, color='black')
+     ax.set_title('Double Lift Chart of %s' % tgt_nme, fontsize=8)
+     plt.xticks(plot_data.index,
+                plot_data.index,
+                rotation=90, fontsize=6)
+     plt.xlabel('%s / %s' % (model_nme_1, model_nme_2), fontsize=6)
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper left',
+                fontsize=5, frameon=False)
+     plt.margins(0.1)
+     plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
+     ax2 = ax.twinx()
+     ax2.bar(plot_data.index, plot_data['weight'],
+             alpha=0.5, color='seagreen',
+             label='Earned Exposure')
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper right',
+                fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'06_{tgt_nme}_{fig_nme}.png')
+     ensure_parent_dir(save_path)
+     plt.savefig(save_path, dpi=300)
+     plt.close(fig)
+
+
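+ # Illustrative calls (placeholder frame `df` holding model predictions,
+ # weighted actuals and exposure); both helpers write PNGs under <cwd>/plot/:
+ #   plot_lift_list(df['pred_xgb'], df['w_pred_xgb'], df['w_act'],
+ #                  df['exposure'], tgt_nme='freq')
+ #   plot_dlift_list(df['pred_xgb'], df['pred_glm'], 'XGB', 'GLM', 'freq',
+ #                   df['exposure'], df['w_act'])
+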
+
+ # =============================================================================
+ # ResNet model & sklearn-style wrapper
+ # =============================================================================
+
+ # ResNet model structure.
+ # Residual block: two linear layers + ReLU + a residual connection.
+ # ResBlock subclasses nn.Module.
+ class ResBlock(nn.Module):
+     def __init__(self, dim: int, dropout: float = 0.1,
+                  use_layernorm: bool = False, residual_scale: float = 0.1
+                  ):
+         super().__init__()
+         self.use_layernorm = use_layernorm
+
+         if use_layernorm:
+             Norm = nn.LayerNorm  # normalizes over the last dimension
+         else:
+             def Norm(d): return nn.BatchNorm1d(d)  # keep a switch for trying BatchNorm
+
+         self.norm1 = Norm(dim)
+         self.fc1 = nn.Linear(dim, dim, bias=True)
+         self.act = nn.ReLU(inplace=True)
+         self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+         self.norm2 = Norm(dim)
+         self.fc2 = nn.Linear(dim, dim, bias=True)
+
+         # Residual scaling keeps the trunk from blowing up early in training.
+         self.res_scale = nn.Parameter(
+             torch.tensor(residual_scale, dtype=torch.float32)
+         )
+
+     def forward(self, x):
+         # Pre-activation layout.
+         out = self.norm1(x)
+         out = self.fc1(out)
+         out = self.act(out)
+         out = self.dropout(out)
+         out = self.norm2(out)
+         out = self.fc2(out)
+         # Scale the residual branch, then add and activate.
+         return F.relu(x + self.res_scale * out)
+
+ # ResNetSequential subclasses nn.Module and defines the full network.
+
+
+ class ResNetSequential(nn.Module):
+     # Input tensor shape: (batch, input_dim)
+     # Layout: linear + norm + ReLU, a stack of residual blocks, then a
+     # Softplus output head.
+
+     def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSequential, self).__init__()
+
+         self.net = nn.Sequential()
+         self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
+
+         if use_layernorm:
+             self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
+         else:
+             self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
+
+         self.net.add_module('relu1', nn.ReLU(inplace=True))
+
+         # Stack of residual blocks.
+         for i in range(block_num):
+             self.net.add_module(
+                 f'ResBlk_{i+1}',
+                 ResBlock(
+                     hidden_dim,
+                     dropout=dropout,
+                     use_layernorm=use_layernorm,
+                     residual_scale=residual_scale)
+             )
+
+         self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
+         self.net.add_module('softplus', nn.Softplus())
+
+     def forward(self, x):
+         return self.net(x)
+
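+ # Shape sketch (illustrative): 32 rows with 10 features map to a positive
+ # (32, 1) output via fc1 -> residual blocks -> fc_out -> Softplus.
+ def _demo_resnet_shapes():
+     net = ResNetSequential(input_dim=10, hidden_dim=64, block_num=2)
+     print(net(torch.randn(32, 10)).shape)  # torch.Size([32, 1])
+
+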
+ # Scikit-learn-style interface for the ResNet model.
+
+
+ class ResNetSklearn(nn.Module):
+     def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
+                  block_num: int = 2, batch_num: int = 100, epochs: int = 100,
+                  task_type: str = 'regression',
+                  tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSklearn, self).__init__()
+
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.block_num = block_num
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.task_type = task_type
+         self.model_nme = model_nme
+         self.learning_rate = learning_rate
+         self.patience = patience
+         self.use_layernorm = use_layernorm
+         self.dropout = dropout
+         self.residual_scale = residual_scale
+
+         # Device selection: cuda > mps > cpu.
+         if torch.cuda.is_available():
+             self.device = torch.device('cuda')
+         elif torch.backends.mps.is_available():
+             self.device = torch.device('mps')
+         else:
+             self.device = torch.device('cpu')
+
+         # Tweedie power from the model name: 'f' (frequency) -> 1,
+         # 's' (severity) -> 2, otherwise the supplied tweedie_power.
+         if 'f' in self.model_nme:
+             self.tw_power = 1
+         elif 's' in self.model_nme:
+             self.tw_power = 2
+         else:
+             self.tw_power = tweedie_power
+
+         # Build the network.
+         self.resnet = ResNetSequential(
+             self.input_dim,
+             self.hidden_dim,
+             self.block_num,
+             use_layernorm=self.use_layernorm,
+             dropout=self.dropout,
+             residual_scale=self.residual_scale
+         ).to(self.device)
+
+         # For classification, drop the final Softplus so the net emits logits.
+         if self.task_type == 'classification':
+             self.resnet.net.softplus = nn.Identity()
+
+     def forward(self, x):
+         # forward is overridden to handle SHAP inputs:
+         # SHAP (KernelExplainer) passes a NumPy array.
+         if isinstance(x, np.ndarray):
+             # 1. Build a tensor from the NumPy array.
+             x_tensor = torch.tensor(x, dtype=torch.float32)
+         else:
+             # 2. Stay compatible with plain tensor inputs.
+             x_tensor = x
+
+         # 3. Make sure the input tensor lives on the right device.
+         x_tensor = x_tensor.to(self.device)
+
+         # 4. Predict with the underlying ResNet
+         #    (self.resnet was moved to self.device at construction time).
+         y_pred = self.resnet(x_tensor)
+         return y_pred
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # === 1. Training set: keep on CPU; the DataLoader moves batches to the GPU ===
+         # Note: converting from a pandas DataFrame copies the data, so later
+         # view modifications cannot leak back.
+         X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
+         y_tensor = torch.tensor(
+             y_train.values, dtype=torch.float32).view(-1, 1)
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values, dtype=torch.float32).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # === 2. Validation set: build on CPU, move to the device in one shot later ===
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
+             y_val_tensor = torch.tensor(
+                 y_val.values, dtype=torch.float32).view(-1, 1)
+             if w_val is not None:
+                 w_val_tensor = torch.tensor(
+                     w_val.values, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+         else:
+             X_val_tensor = y_val_tensor = w_val_tensor = None
+
+         # === 3. DataLoader ===
+         dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
+         batch_size = compute_batch_size(
+             data_size=len(dataset),
+             learning_rate=self.learning_rate,
+             batch_num=self.batch_num,
+             minimum=64
+         )
+         N = X_tensor.shape[0]
+
+         if self.device.type == 'cuda':
+             if N > 200_000:
+                 base_bs = 4096
+             elif N > 50_000:
+                 base_bs = 2048
+             else:
+                 base_bs = 1024
+         else:
+             if N > 50_000:
+                 base_bs = 1024
+             else:
+                 base_bs = 512
+
+         batch_size = min(batch_size, base_bs)
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,  # 0-1 workers are usually enough for tabular data
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # === 4. Optimizer and AMP ===
+         # Adam + AMP keeps the loss stable while preserving GPU throughput.
+         self.optimizer = torch.optim.Adam(
+             self.resnet.parameters(), lr=self.learning_rate)
+         self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # === 5. Early stopping ===
+         best_loss, patience_counter = float('inf'), 0
+         best_model_state = None
+
+         # Move the validation set to the target device once, if present.
+         if has_val:
+             X_val_dev = X_val_tensor.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # === 6. Training loop ===
+         for epoch in range(1, self.epochs + 1):
+             self.resnet.train()
+             for X_batch, y_batch, w_batch in dataloader:
+                 self.optimizer.zero_grad()
+
+                 X_batch = X_batch.to(self.device, non_blocking=True)
+                 y_batch = y_batch.to(self.device, non_blocking=True)
+                 w_batch = w_batch.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.resnet(X_batch)
+
+                     if self.task_type == 'classification':
+                         loss_fn = nn.BCEWithLogitsLoss(reduction='none')
+                         losses = loss_fn(y_pred, y_batch).view(-1)
+                     else:
+                         y_pred = torch.clamp(y_pred, min=1e-6)
+                         losses = tweedie_loss(
+                             y_pred, y_batch, p=self.tw_power
+                         ).view(-1)
+
+                     weighted_loss = (
+                         losses * w_batch.view(-1)
+                     ).sum() / torch.clamp(w_batch.sum(), min=EPS)
+
+                 self.scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     self.scaler.unscale_(self.optimizer)
+                     clip_grad_norm_(self.resnet.parameters(), max_norm=1.0)
+
+                 self.scaler.step(self.optimizer)
+                 self.scaler.update()
+
+             # === 7. Validation loss and early-stopping check ===
+             if has_val:
+                 self.resnet.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.resnet(X_val_dev)
+
+                     if self.task_type == 'classification':
+                         val_loss_fn = nn.BCEWithLogitsLoss(reduction='none')
+                         val_loss_values = val_loss_fn(
+                             y_val_pred, y_val_dev).view(-1)
+                     else:
+                         y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+                         val_loss_values = tweedie_loss(
+                             y_val_pred, y_val_dev, p=self.tw_power
+                         ).view(-1)
+
+                     val_weighted_loss = (
+                         val_loss_values * w_val_dev.view(-1)
+                     ).sum() / torch.clamp(w_val_dev.sum(), min=EPS)
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.resnet.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.resnet.load_state_dict(best_model_state)
+                     break
+         if has_val and best_model_state is not None:
+             self.resnet.load_state_dict(best_model_state)
+
+     # ---------------- Prediction ----------------
+
+     def predict(self, X_test):
+         self.resnet.eval()
+         # Convert DataFrame input to a NumPy array first.
+         if isinstance(X_test, pd.DataFrame):
+             X_np = X_test.values.astype(np.float32)
+         else:
+             X_np = X_test
+
+         with torch.no_grad():
+             # Calling self (the ResNetSklearn instance) triggers forward().
+             y_pred = self(X_np).cpu().numpy()
+
+         if self.task_type == 'classification':
+             y_pred = 1 / (1 + np.exp(-y_pred))  # sigmoid on the logits
+         else:
+             y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.flatten()
+
+     # ---------------- Parameter updates ----------------
+
+     def set_params(self, params):
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         return self
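+
+
+ # Illustrative end-to-end run on synthetic data (all names below are
+ # placeholders, not part of the package):
+ def _demo_resnet_sklearn():
+     rng = np.random.default_rng(0)
+     X = pd.DataFrame(rng.normal(size=(256, 5)),
+                      columns=[f'col{i}' for i in range(5)])
+     y = pd.Series(np.exp(X['col0']) + rng.gamma(1.0, 1.0, size=256))
+     model = ResNetSklearn('demo_tweedie', input_dim=5, epochs=5)
+     model.fit(X, y)          # weights default to 1, no validation split
+     return model.predict(X)  # positive predictions, shape (256,)
+
+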
+ # =============================================================================
+ # FT-Transformer model & sklearn-style wrapper
+ # =============================================================================
+ # FT-Transformer model structure.
+
+
+ class FeatureTokenizer(nn.Module):
+     # Maps numeric and categorical features to tokens of shape
+     # (batch, num tokens, d_model).
+     # Conventions:
+     # - X_num holds the numeric features, shape (batch, num_numeric)
+     # - X_cat holds the categorical features, shape (batch, num_categorical),
+     #   each column an integer code in [0, card-1]
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int):
+         super().__init__()
+
+         self.num_numeric = num_numeric
+         self.has_numeric = num_numeric > 0
+
+         if self.has_numeric:
+             self.num_linear = nn.Linear(num_numeric, d_model)
+
+         self.embeddings = nn.ModuleList([
+             nn.Embedding(card, d_model) for card in cat_cardinalities
+         ])
+
+     def forward(self, X_num, X_cat):
+         tokens = []
+
+         if self.has_numeric:
+             # All numeric features map to a single token.
+             num_token = self.num_linear(X_num)  # shape (batch, d_model)
+             tokens.append(num_token)
+
+         # Each categorical feature contributes one embedding token.
+         for i, emb in enumerate(self.embeddings):
+             tok = emb(X_cat[:, i])  # shape (batch, d_model)
+             tokens.append(tok)
+
+         # Stack into (batch, num tokens, d_model).
+         x = torch.stack(tokens, dim=1)
+         return x
+
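+ # Shape sketch (illustrative): the 3 numeric features collapse into a single
+ # token and each of the 2 categorical features adds one embedding token,
+ # giving 3 tokens in total:
+ def _demo_tokenizer_shapes():
+     tok = FeatureTokenizer(num_numeric=3, cat_cardinalities=[4, 7], d_model=16)
+     X_num = torch.randn(8, 3)
+     X_cat = torch.randint(0, 4, (8, 2))  # codes stay below each cardinality
+     print(tok(X_num, X_cat).shape)  # torch.Size([8, 3, 16])
+
+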
+ # Encoder layer with residual scaling.
+
+
+ class ScaledTransformerEncoderLayer(nn.Module):
+     def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048,
+                  dropout: float = 0.1, residual_scale_attn: float = 1.0,
+                  residual_scale_ffn: float = 1.0, norm_first: bool = True,
+                  ):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(
+             embed_dim=d_model,
+             num_heads=nhead,
+             dropout=dropout,
+             batch_first=True
+         )
+
+         # Feed-forward block.
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         # Normalization and dropout.
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = nn.GELU()
+         # self.activation = nn.ReLU()
+         self.norm_first = norm_first
+
+         # Residual scaling coefficients.
+         self.res_scale_attn = residual_scale_attn
+         self.res_scale_ffn = residual_scale_ffn
+
+     def forward(self, src, src_mask=None, src_key_padding_mask=None):
+         # Input tensor shape: (batch, sequence length, d_model).
+         x = src
+
+         if self.norm_first:
+             # Pre-norm: normalize before attention.
+             x = x + self._sa_block(self.norm1(x), src_mask,
+                                    src_key_padding_mask)
+             x = x + self._ff_block(self.norm2(x))
+         else:
+             # Post-norm (normally left disabled).
+             x = self.norm1(
+                 x + self._sa_block(x, src_mask, src_key_padding_mask))
+             x = self.norm2(x + self._ff_block(x))
+
+         return x
+
+     def _sa_block(self, x, attn_mask, key_padding_mask):
+         # Self-attention with residual scaling.
+         attn_out, _ = self.self_attn(
+             x, x, x,
+             attn_mask=attn_mask,
+             key_padding_mask=key_padding_mask,
+             need_weights=False
+         )
+         return self.res_scale_attn * self.dropout1(attn_out)
+
+     def _ff_block(self, x):
+         # Feed-forward with residual scaling.
+         x2 = self.linear2(self.dropout(self.activation(self.linear1(x))))
+         return self.res_scale_ffn * self.dropout2(x2)
+
+ # FT-Transformer core model.
+
+
+ class FTTransformerCore(nn.Module):
+     # Minimal working FT-Transformer:
+     # - FeatureTokenizer: turns numeric and categorical features into tokens
+     # - TransformerEncoder: captures interactions between features
+     # - pooling + MLP head: emits one raw score per row; sigmoid
+     #   (classification) or softplus (Tweedie/gamma regression) is applied
+     #   downstream
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int = 64,
+                  n_heads: int = 8, n_layers: int = 4, dropout: float = 0.1,
+                  ):
+         super().__init__()
+
+         self.tokenizer = FeatureTokenizer(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=d_model
+         )
+         scale = 1.0 / math.sqrt(n_layers)  # a recommended default
+         encoder_layer = ScaledTransformerEncoderLayer(
+             d_model=d_model,
+             nhead=n_heads,
+             dim_feedforward=d_model * 4,
+             dropout=dropout,
+             residual_scale_attn=scale,
+             residual_scale_ffn=scale,
+             norm_first=True,
+         )
+         self.encoder = nn.TransformerEncoder(
+             encoder_layer,
+             num_layers=n_layers
+         )
+         self.n_layers = n_layers
+
+         self.head = nn.Sequential(
+             nn.LayerNorm(d_model),
+             nn.Linear(d_model, d_model),
+             nn.GELU(),
+             # nn.ReLU(),
+             nn.Linear(d_model, 1),
+             # nn.Softplus()  # would force positive outputs (Tweedie / gamma)
+             # Softplus is deliberately omitted so the model emits logits.
+             # Training and inference decide per task what to apply:
+             # - classification: keep logits and use BCEWithLogitsLoss
+             # - regression: apply softplus at loss/inference time
+         )
+
+     def forward(self, X_num, X_cat):
+
+         # X_num: (batch, num numeric features), float32
+         # X_cat: (batch, num categorical features), long
+
+         tokens = self.tokenizer(X_num, X_cat)  # (batch, num tokens, d_model)
+         x = self.encoder(tokens)               # (batch, num tokens, d_model)
+
+         # Mean-pool over tokens.
+         x = x.mean(dim=1)  # (batch, d_model)
+
+         out = self.head(x)  # (batch, 1) raw score / logit
+         return out
+
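+ # Shape sketch (illustrative, under the torch version pinned above; newer
+ # torch releases pass extra kwargs to custom encoder layers):
+ def _demo_ft_core_shapes():
+     core = FTTransformerCore(num_numeric=3, cat_cardinalities=[4, 7],
+                              d_model=16, n_heads=4, n_layers=2)
+     X_num = torch.randn(8, 3)
+     X_cat = torch.randint(0, 4, (8, 2))
+     print(core(X_num, X_cat).shape)  # torch.Size([8, 1]) raw scores
+
+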
+ # TabularDataset definition.
+
+
+ class TabularDataset(Dataset):
+     def __init__(self, X_num, X_cat, y, w):
+
+         # X_num: torch.float32, shape (N, num numeric features)
+         # X_cat: torch.long, shape (N, num categorical features)
+         # y: torch.float32, shape (N, 1)
+         # w: torch.float32, shape (N, 1)
+
+         self.X_num = X_num
+         self.X_cat = X_cat
+         self.y = y
+         self.w = w
+
+     def __len__(self):
+         return self.y.shape[0]
+
+     def __getitem__(self, idx):
+         return (
+             self.X_num[idx],
+             self.X_cat[idx],
+             self.y[idx],
+             self.w[idx],
+         )
+
+ # Scikit-learn-style interface for the FT-Transformer.
+
+
+ class FTTransformerSklearn(nn.Module):
+
+     # sklearn-style wrapper:
+     # - num_cols: names of the numeric feature columns
+     # - cat_cols: names of the categorical feature columns (encoded
+     #   internally at fit time from the categories observed in training)
+
+     def __init__(self, model_nme: str, num_cols, cat_cols, d_model: int = 64, n_heads: int = 8,
+                  n_layers: int = 4, dropout: float = 0.1, batch_num: int = 100, epochs: int = 100,
+                  task_type: str = 'regression',
+                  tweedie_power: float = 1.5, learning_rate: float = 1e-3, patience: int = 10,
+                  ):
+         super().__init__()
+
+         self.model_nme = model_nme
+         self.num_cols = list(num_cols)
+         self.cat_cols = list(cat_cols)
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.dropout = dropout
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.task_type = task_type
+         self.patience = patience
+         # Tweedie power from the model name: 'f' -> Poisson, 's' -> gamma.
+         if 'f' in self.model_nme:
+             self.tw_power = 1.0
+         elif 's' in self.model_nme:
+             self.tw_power = 2.0
+         else:
+             self.tw_power = tweedie_power
+         if torch.cuda.is_available():
+             self.device = torch.device("cuda")
+         elif torch.backends.mps.is_available():
+             self.device = torch.device("mps")
+         else:
+             self.device = torch.device("cpu")
+         self.cat_cardinalities = None
+         self.cat_categories = {}
+         self.ft = None
+
+     def _build_model(self, X_train):
+         num_numeric = len(self.num_cols)
+         cat_cardinalities = []
+
+         for col in self.cat_cols:
+             cats = X_train[col].astype('category')
+             categories = cats.cat.categories
+             self.cat_categories[col] = categories  # remember the full training category set
+
+             card = len(categories) + 1  # reserve one extra class for "unknown/missing"
+             cat_cardinalities.append(card)
+
+         self.cat_cardinalities = cat_cardinalities
+
+         self.ft = FTTransformerCore(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=self.d_model,
+             n_heads=self.n_heads,
+             n_layers=self.n_layers,
+             dropout=self.dropout,
+         ).to(self.device)
+
+     def _encode_cats(self, X):
+         # The input DataFrame must contain at least all categorical columns.
+         # Returns an int64 array of shape (N, num categorical features).
+
+         if not self.cat_cols:
+             return np.zeros((len(X), 0), dtype='int64')
+
+         X_cat_list = []
+         for col in self.cat_cols:
+             # Use the category set recorded during training.
+             categories = self.cat_categories[col]
+             # Build a Categorical against the fixed categories.
+             cats = pd.Categorical(X[col], categories=categories)
+             codes = cats.codes.astype('int64', copy=True)  # -1 marks unknown or missing
+             # Map unknown/missing to the extra "unknown" index len(categories).
+             codes[codes < 0] = len(categories)
+             X_cat_list.append(codes)
+
+         X_cat_np = np.stack(X_cat_list, axis=1)  # shape (N, num categorical features)
+         return X_cat_np
+
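+ # Illustrative behaviour of the unknown-category mapping used above:
+ def _demo_unknown_category_codes():
+     categories = pd.Index(['a', 'b'])
+     cats = pd.Categorical(['a', 'c', None], categories=categories)
+     codes = cats.codes.astype('int64', copy=True)  # [0, -1, -1]
+     codes[codes < 0] = len(categories)             # unknown/missing -> 2
+     print(codes)  # [0 2 2]
+
+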
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # Build the underlying model on the first fit.
+         if self.ft is None:
+             self._build_model(X_train)
+
+         # --- Training tensors (kept on CPU; batches are moved later) ---
+         # Copy so the tensors are decoupled from the original DataFrame and
+         # later scaling or sampling cannot contaminate the source data.
+         X_num_train = X_train[self.num_cols].to_numpy(
+             dtype=np.float32, copy=True)
+         X_num_train = torch.tensor(
+             X_num_train,
+             dtype=torch.float32
+         )
+
+         if self.cat_cols:
+             X_cat_train_np = self._encode_cats(X_train)
+             X_cat_train = torch.tensor(X_cat_train_np, dtype=torch.long)
+         else:
+             X_cat_train = torch.zeros(
+                 (X_num_train.shape[0], 0), dtype=torch.long)
+
+         y_tensor = torch.tensor(
+             y_train.values,
+             dtype=torch.float32
+         ).view(-1, 1)
+
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values,
+                 dtype=torch.float32
+             ).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # --- Validation tensors (moved to the target device in one shot) ---
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             # ---------- numeric features ----------
+             X_num_val_np = X_val[self.num_cols].to_numpy(
+                 dtype=np.float32, copy=True)
+             X_num_val = torch.tensor(X_num_val_np, dtype=torch.float32)
+
+             # ---------- categorical features ----------
+             if self.cat_cols:
+                 X_cat_val_np = self._encode_cats(X_val)
+                 X_cat_val = torch.tensor(X_cat_val_np, dtype=torch.long)
+             else:
+                 X_cat_val = torch.zeros(
+                     (X_num_val.shape[0], 0), dtype=torch.long)
+
+             # ---------- target & weights ----------
+             y_val_np = y_val.values.astype(np.float32, copy=True)
+             y_val_tensor = torch.tensor(
+                 y_val_np, dtype=torch.float32).view(-1, 1)
+
+             if w_val is not None:
+                 w_val_np = w_val.values.astype(np.float32, copy=True)
+                 w_val_tensor = torch.tensor(
+                     w_val_np, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+
+         else:
+             X_num_val = X_cat_val = y_val_tensor = w_val_tensor = None
+
+         # --- DataLoader ---
+         dataset = TabularDataset(
+             X_num_train, X_cat_train, y_tensor, w_tensor
+         )
+
+         batch_size = compute_batch_size(
+             data_size=len(dataset),
+             learning_rate=self.learning_rate,
+             batch_num=self.batch_num,
+             minimum=64
+         )
+         N = X_num_train.shape[0]
+         if self.device.type == 'cuda':
+             if N > 200_000:
+                 base_bs = 32768
+             elif N > 50_000:
+                 base_bs = 16384
+             else:
+                 base_bs = 8192
+         else:
+             if N > 50_000:
+                 base_bs = 16384
+             else:
+                 base_bs = 8192
+         batch_size = min(batch_size, base_bs)
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # --- Optimizer and AMP ---
+         # Same as the ResNet: Adam + AMP keeps the loss numerically stable
+         # without giving up GPU throughput.
+         optimizer = torch.optim.Adam(
+             self.ft.parameters(),
+             lr=self.learning_rate
+         )
+         scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # --- Early stopping ---
+         best_loss = float('inf')
+         patience_counter = 0
+         best_model_state = None
+
+         # Move the validation set to the target device once, if present.
+         if has_val:
+             X_num_val_dev = X_num_val.to(self.device, non_blocking=True)
+             X_cat_val_dev = X_cat_val.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # --- Training loop ---
+         for epoch in range(1, self.epochs + 1):
+             self.ft.train()
+             for X_num_b, X_cat_b, y_b, w_b in dataloader:
+                 optimizer.zero_grad()
+
+                 X_num_b = X_num_b.to(self.device, non_blocking=True)
+                 X_cat_b = X_cat_b.to(self.device, non_blocking=True)
+                 y_b = y_b.to(self.device, non_blocking=True)
+                 w_b = w_b.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.ft(X_num_b, X_cat_b)
+
+                     if self.task_type == 'classification':
+                         loss_fn = nn.BCEWithLogitsLoss(reduction='none')
+                         losses = loss_fn(y_pred, y_b).view(-1)
+                     else:
+                         # Regression needs strictly positive predictions.
+                         y_pred = F.softplus(y_pred)
+                         y_pred = torch.clamp(y_pred, min=1e-6)
+                         losses = tweedie_loss(
+                             y_pred, y_b, p=self.tw_power).view(-1)
+
+                     weighted_loss = (
+                         losses * w_b.view(-1)
+                     ).sum() / \
+                         torch.clamp(w_b.sum(), min=EPS)
+
+                 scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     scaler.unscale_(optimizer)
+                     clip_grad_norm_(self.ft.parameters(), max_norm=1.0)
+
+                 scaler.step(optimizer)
+                 scaler.update()
+
+             # --- Validation and early-stopping check ---
+             if has_val:
+                 self.ft.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.ft(X_num_val_dev, X_cat_val_dev)
+
+                     if self.task_type == 'classification':
+                         val_loss_fn = nn.BCEWithLogitsLoss(reduction='none')
+                         val_losses = val_loss_fn(
+                             y_val_pred, y_val_dev).view(-1)
+                     else:
+                         y_val_pred = F.softplus(y_val_pred)
+                         y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+                         val_losses = tweedie_loss(
+                             y_val_pred, y_val_dev, p=self.tw_power).view(-1)
+
+                     val_weighted_loss = (
+                         val_losses * w_val_dev.view(-1)
+                     ).sum() / torch.clamp(w_val_dev.sum(), min=EPS)
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.ft.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.ft.load_state_dict(best_model_state)
+                     break
+         if has_val and best_model_state is not None:
+             self.ft.load_state_dict(best_model_state)
+
+     def predict(self, X_test):
+         # X_test must contain all numeric and categorical columns.
+
+         self.ft.eval()
+         X_num = X_test[self.num_cols].to_numpy(dtype=np.float32, copy=True)
+         X_num = torch.tensor(
+             X_num,
+             dtype=torch.float32
+         )
+         if self.cat_cols:
+             X_cat_np = self._encode_cats(X_test)
+             X_cat = torch.tensor(X_cat_np, dtype=torch.long)
+         else:
+             X_cat = torch.zeros((X_num.size(0), 0), dtype=torch.long)
+
+         with torch.no_grad():
+             X_num = X_num.to(self.device, non_blocking=True)
+             X_cat = X_cat.to(self.device, non_blocking=True)
+             y_pred = self.ft(X_num, X_cat).cpu().numpy()
+
+         if self.task_type == 'classification':
+             # Logits to probabilities.
+             y_pred = 1 / (1 + np.exp(-y_pred))
+         else:
+             y_pred = np.logaddexp(0.0, y_pred)  # numerically stable softplus
+             y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.ravel()
+
+     def set_params(self, params: dict):
+
+         # Mirrors the sklearn convention.
+         # Note: structural parameters (e.g. d_model / n_heads) only take
+         # effect after a fresh fit().
+
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         return self
+
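+
+ # Illustrative usage on synthetic data (column names are placeholders):
+ def _demo_ft_sklearn():
+     rng = np.random.default_rng(0)
+     X = pd.DataFrame({
+         'num0': rng.normal(size=300),
+         'num1': rng.normal(size=300),
+         'region': rng.choice(['north', 'south', 'east'], size=300),
+     })
+     y = pd.Series(rng.gamma(2.0, 1.0, size=300))
+     model = FTTransformerSklearn('demo_tw', num_cols=['num0', 'num1'],
+                                  cat_cols=['region'], d_model=16,
+                                  n_heads=4, n_layers=2, epochs=3)
+     model.fit(X, y)
+     return model.predict(X)  # positive predictions, shape (300,)
+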
+
+ # ===== Core components and training wrappers =================================
+
+ # =============================================================================
+ # Config, preprocessing, and trainer base
+ # =============================================================================
+ @dataclass
+ class BayesOptConfig:
+     model_nme: str
+     resp_nme: str
+     weight_nme: str
+     factor_nmes: List[str]
+     task_type: str = 'regression'
+     binary_resp_nme: Optional[str] = None
+     cate_list: Optional[List[str]] = None
+     prop_test: float = 0.25
+     rand_seed: Optional[int] = None
+     epochs: int = 100
+     use_gpu: bool = True
+
+
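+ # Illustrative configuration (all values are placeholders):
+ def _demo_config() -> BayesOptConfig:
+     return BayesOptConfig(
+         model_nme='demo_f',  # the 'f' marker selects the Poisson-style power
+         resp_nme='claim_freq',
+         weight_nme='exposure',
+         factor_nmes=['drv_age', 'veh_age', 'region'],
+         cate_list=['region'],
+         rand_seed=42,
+     )
+
+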
+ class OutputManager:
+     # Central manager for result, plot, and model output paths.
+
+     def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
+         self.root = Path(root or os.getcwd())
+         self.model_name = model_name
+         self.plot_dir = self.root / 'plot'
+         self.result_dir = self.root / 'Results'
+         self.model_dir = self.root / 'model'
+
+     def _prepare(self, path: Path) -> str:
+         ensure_parent_dir(str(path))
+         return str(path)
+
+     def plot_path(self, filename: str) -> str:
+         return self._prepare(self.plot_dir / filename)
+
+     def result_path(self, filename: str) -> str:
+         return self._prepare(self.result_dir / filename)
+
+     def model_path(self, filename: str) -> str:
+         return self._prepare(self.model_dir / filename)
+
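+ # Illustrative usage: paths resolve under the working directory and the
+ # parent directory is created on first use.
+ #   om = OutputManager(model_name='demo')
+ #   om.plot_path('lift.png')      # -> <cwd>/plot/lift.png
+ #   om.result_path('scores.csv')  # -> <cwd>/Results/scores.csv
+
+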
+
+ class DatasetPreprocessor:
+     # Prepares the shared train/test data views used by the trainers.
+
+     def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
+                  config: BayesOptConfig) -> None:
+         self.config = config
+         self.train_data = train_df.copy(deep=True)
+         self.test_data = test_df.copy(deep=True)
+         self.num_features: List[str] = []
+         self.train_oht_scl_data: Optional[pd.DataFrame] = None
+         self.test_oht_scl_data: Optional[pd.DataFrame] = None
+         self.var_nmes: List[str] = []
+         self.cat_categories_for_shap: Dict[str, List[Any]] = {}
+
+     def run(self) -> "DatasetPreprocessor":
+         cfg = self.config
+         # Precompute weighted actuals; plotting and validation rely on them.
+         self.train_data.loc[:, 'w_act'] = self.train_data[cfg.resp_nme] * \
+             self.train_data[cfg.weight_nme]
+         self.test_data.loc[:, 'w_act'] = self.test_data[cfg.resp_nme] * \
+             self.test_data[cfg.weight_nme]
+         if cfg.binary_resp_nme:
+             self.train_data.loc[:, 'w_binary_act'] = self.train_data[cfg.binary_resp_nme] * \
+                 self.train_data[cfg.weight_nme]
+             self.test_data.loc[:, 'w_binary_act'] = self.test_data[cfg.binary_resp_nme] * \
+                 self.test_data[cfg.weight_nme]
+         # Clip at the 99.9th percentile to absorb outliers; without this,
+         # extreme points dominate the loss.
+         q999 = self.train_data[cfg.resp_nme].quantile(0.999)
+         self.train_data[cfg.resp_nme] = self.train_data[cfg.resp_nme].clip(
+             upper=q999)
+         cate_list = list(cfg.cate_list or [])
+         if cate_list:
+             for cate in cate_list:
+                 self.train_data[cate] = self.train_data[cate].astype(
+                     'category')
+                 self.test_data[cate] = self.test_data[cate].astype('category')
+                 cats = self.train_data[cate].cat.categories
+                 self.cat_categories_for_shap[cate] = list(cats)
+         self.num_features = [
+             nme for nme in cfg.factor_nmes if nme not in cate_list]
+         train_oht = self.train_data[cfg.factor_nmes +
+                                     [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         test_oht = self.test_data[cfg.factor_nmes +
+                                   [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         train_oht = pd.get_dummies(
+             train_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         test_oht = pd.get_dummies(
+             test_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         for num_chr in self.num_features:
+             # Standardize column by column so every feature sits on the same
+             # scale; otherwise the neural nets struggle to converge.
+             scaler = StandardScaler()
+             train_oht[num_chr] = scaler.fit_transform(
+                 train_oht[num_chr].values.reshape(-1, 1))
+             test_oht[num_chr] = scaler.transform(
+                 test_oht[num_chr].values.reshape(-1, 1))
+         # Reindex fills missing dummy columns with zeros so the test set
+         # matches the training columns.
+         test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)
+         self.train_oht_scl_data = train_oht
+         self.test_oht_scl_data = test_oht
+         self.var_nmes = [
+             col for col in train_oht.columns
+             if col not in (cfg.weight_nme, cfg.resp_nme)
+         ]
+         return self
+
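+ # Illustrative wiring (train_df/test_df and cfg as in the examples above):
+ #   prep = DatasetPreprocessor(train_df, test_df, _demo_config()).run()
+ #   prep.train_oht_scl_data  # one-hot encoded, standardized training view
+ #   prep.var_nmes            # feature columns (response/weight excluded)
+
+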
+ # =============================================================================
+ # Trainers
+ # =============================================================================
+
+
+ class TrainerBase:
+     def __init__(self, context: "BayesOptModel", label: str) -> None:
+         self.ctx = context
+         self.label = label
+
+     @property
+     def config(self) -> BayesOptConfig:
+         return self.ctx.config
+
+     @property
+     def output(self) -> OutputManager:
+         return self.ctx.output_manager
+
+     def tune(self, max_evals: int) -> None:  # pragma: no cover - overridden by subclasses
+         raise NotImplementedError
+
+     def train(self) -> None:  # pragma: no cover - overridden by subclasses
+         raise NotImplementedError
+
+     def save(self) -> None:
+         pass
+
+     def load(self) -> None:
+         pass
+
+
+ class XGBTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'Xgboost')
+         self.model: Optional[xgb.XGBRegressor] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def _build_estimator(self) -> xgb.XGBRegressor:
+         params = dict(
+             objective=self.ctx.obj,
+             random_state=self.ctx.rand_seed,
+             subsample=0.9,
+             tree_method='gpu_hist' if self.ctx.use_gpu else 'hist',
+             enable_categorical=True,
+             predictor='gpu_predictor' if self.ctx.use_gpu else 'cpu_predictor'
+         )
+         if self.ctx.use_gpu:
+             params['gpu_id'] = 0
+         return xgb.XGBRegressor(**params)
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-5, 1e-1, log=True)
+         gamma = trial.suggest_float('gamma', 0, 10000)
+         max_depth = trial.suggest_int('max_depth', 3, 25)
+         n_estimators = trial.suggest_int('n_estimators', 10, 500, step=10)
+         min_child_weight = trial.suggest_int(
+             'min_child_weight', 100, 10000, step=100)
+         reg_alpha = trial.suggest_float('reg_alpha', 1e-10, 1, log=True)
+         reg_lambda = trial.suggest_float('reg_lambda', 1e-10, 1, log=True)
+         if self.ctx.obj == 'reg:tweedie':
+             tweedie_variance_power = trial.suggest_float(
+                 'tweedie_variance_power', 1, 2)
+         elif self.ctx.obj == 'count:poisson':
+             tweedie_variance_power = 1
+         elif self.ctx.obj == 'reg:gamma':
+             tweedie_variance_power = 2
+         else:
+             tweedie_variance_power = 1.5
+         clf = self._build_estimator()
+         params = {
+             'learning_rate': learning_rate,
+             'gamma': gamma,
+             'max_depth': max_depth,
+             'n_estimators': n_estimators,
+             'min_child_weight': min_child_weight,
+             'reg_alpha': reg_alpha,
+             'reg_lambda': reg_lambda
+         }
+         if self.ctx.obj == 'reg:tweedie':
+             params['tweedie_variance_power'] = tweedie_variance_power
+         clf.set_params(**params)
+         n_jobs = 1 if self.ctx.use_gpu else int(1 / self.ctx.prop_test)
+         acc = cross_val_score(
+             clf,
+             self.ctx.train_data[self.ctx.factor_nmes],
+             self.ctx.train_data[self.ctx.resp_nme].values,
+             fit_params=self.ctx.fit_params,
+             cv=self.ctx.cv,
+             scoring=make_scorer(
+                 mean_tweedie_deviance,
+                 power=tweedie_variance_power,
+                 greater_is_better=False),
+             error_score='raise',
+             n_jobs=n_jobs
+         ).mean()
+         return -acc
+
+     def tune(self, max_evals: int = 100) -> None:
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+         )
+         study.optimize(self.cross_val, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_xgb.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError('Run tune() first to obtain the best XGB parameters.')
+         self.model = self._build_estimator()
+         self.model.set_params(**self.best_params)
+         self.model.fit(self.ctx.train_data[self.ctx.factor_nmes],
+                        self.ctx.train_data[self.ctx.resp_nme].values,
+                        **self.ctx.fit_params)
+         self.ctx.model_label += [self.label]
+         self.ctx.train_data['pred_xgb'] = self.model.predict(
+             self.ctx.train_data[self.ctx.factor_nmes])
+         self.ctx.test_data['pred_xgb'] = self.model.predict(
+             self.ctx.test_data[self.ctx.factor_nmes])
+         self.ctx.train_data.loc[:, 'w_pred_xgb'] = self.ctx.train_data['pred_xgb'] * \
+             self.ctx.train_data[self.ctx.weight_nme]
+         self.ctx.test_data.loc[:, 'w_pred_xgb'] = self.ctx.test_data['pred_xgb'] * \
+             self.ctx.test_data[self.ctx.weight_nme]
+         self.ctx.xgb_best = self.model
+
+     def save(self) -> None:
+         if self.model is not None:
+             joblib.dump(self.model, self.output.model_path(
+                 f'01_{self.ctx.model_nme}_Xgboost.pkl'))
+
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_Xgboost.pkl')
+         if os.path.exists(path):
+             self.model = joblib.load(path)
+             self.ctx.xgb_best = self.model
+         else:
+             print(f"[load_model] Warning: Xgboost model file not found: {path}")
+
+
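+ # Illustrative trainer lifecycle (`ctx` stands for the BayesOptModel context
+ # the trainers expect; its construction is outside this excerpt):
+ #   trainer = XGBTrainer(ctx)
+ #   trainer.tune(max_evals=50)  # Optuna TPE search over the space above
+ #   trainer.train()             # refit with study.best_params
+ #   trainer.save()              # -> model/01_<model_nme>_Xgboost.pkl
+
+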
+ class GLMTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'GLM')
+         self.model = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def _select_family(self, tweedie_power: Optional[float] = None):
+         if self.ctx.task_type == 'classification':
+             return sm.families.Binomial()
+         if self.ctx.obj == 'count:poisson':
+             return sm.families.Poisson()
+         if self.ctx.obj == 'reg:gamma':
+             return sm.families.Gamma()
+         power = tweedie_power if tweedie_power is not None else 1.5
+         return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
+
+     def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
+         # Add an intercept column for statsmodels.
+         X = data[self.ctx.var_nmes]
+         return sm.add_constant(X, has_constant='add')
+
+     def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
+         if isinstance(family, sm.families.Poisson):
+             return 1.0
+         if isinstance(family, sm.families.Gamma):
+             return 2.0
+         if isinstance(family, sm.families.Tweedie):
+             return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
+         return 1.5
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         alpha = trial.suggest_float('alpha', 1e-6, 1e2, log=True)
+         l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
+         tweedie_power = None
+         if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie':
+             tweedie_power = trial.suggest_float('tweedie_power', 1.01, 1.99)
+
+         X_all = self._prepare_design(self.ctx.train_oht_scl_data)
+         y_all = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
+         w_all = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+
+         scores = []
+         for train_idx, val_idx in self.ctx.cv.split(X_all):
+             X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
+             y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]
+             w_train, w_val = w_all.iloc[train_idx], w_all.iloc[val_idx]
+
+             family = self._select_family(tweedie_power)
+             glm = sm.GLM(y_train, X_train, family=family,
+                          freq_weights=w_train)
+             result = glm.fit_regularized(
+                 alpha=alpha, L1_wt=l1_ratio, maxiter=200)
+
+             y_pred = result.predict(X_val)
+             if self.ctx.task_type == 'classification':
+                 y_pred = np.clip(y_pred, EPS, 1 - EPS)
+                 fold_score = log_loss(
+                     y_val, y_pred, sample_weight=w_val)
+             else:
+                 y_pred = np.maximum(y_pred, EPS)
+                 fold_score = mean_tweedie_deviance(
+                     y_val,
+                     y_pred,
+                     sample_weight=w_val,
+                     power=self._metric_power(family, tweedie_power)
+                 )
+             scores.append(fold_score)
+
+         return float(np.mean(scores))
+
+     def tune(self, max_evals: int = 50) -> None:
+         study = optuna.create_study(
+             direction='minimize', sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed))
+         study.optimize(self.cross_val, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_glm.csv')
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError('Run tune() first to obtain the best GLM parameters.')
+         tweedie_power = self.best_params.get('tweedie_power')
+         family = self._select_family(tweedie_power)
+
+         X_train = self._prepare_design(self.ctx.train_oht_scl_data)
+         y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
+         w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+
+         glm = sm.GLM(y_train, X_train, family=family,
+                      freq_weights=w_train)
+         self.model = glm.fit_regularized(
+             alpha=self.best_params['alpha'],
+             L1_wt=self.best_params['l1_ratio'],
+             maxiter=300
+         )
+
+         self.ctx.glm_best = self.model
+         self.ctx.model_label += [self.label]
+         self._attach_predictions(tweedie_power)
+
+     def _attach_predictions(self, tweedie_power: Optional[float]) -> None:
+         for src, target in (
+             (self.ctx.train_oht_scl_data, self.ctx.train_data),
+             (self.ctx.test_oht_scl_data, self.ctx.test_data),
+         ):
+             design = self._prepare_design(src)
+             preds = self.model.predict(design)
+             if self.ctx.task_type == 'classification':
+                 preds = np.clip(preds, EPS, 1 - EPS)
+             else:
+                 preds = np.maximum(preds, EPS)
+             target['pred_glm'] = preds
+             target['w_pred_glm'] = target['pred_glm'] * \
+                 target[self.ctx.weight_nme]
+
+     def save(self) -> None:
+         if self.model is not None:
+             joblib.dump(self.model, self.output.model_path(
+                 f'01_{self.ctx.model_nme}_GLM.pkl'))
+
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_GLM.pkl')
+         if os.path.exists(path):
+             self.model = joblib.load(path)
+             self.ctx.glm_best = self.model
+         else:
+             print(f"[load_model] Warning: GLM model file not found: {path}")
+
1466
+
1467
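+# --- Editor's example (not part of the package): a minimal standalone sketch of
+# the elastic-net Tweedie GLM recipe GLMTrainer uses above. Column names and
+# data are invented; relies only on the numpy/pandas/statsmodels imports of
+# this module.
+def _demo_glm_sketch():
+    rng = np.random.default_rng(0)
+    df = pd.DataFrame({'x1': rng.normal(size=500), 'x2': rng.normal(size=500)})
+    mu = np.exp(0.3 * df['x1'] - 0.2 * df['x2'])
+    df['y'] = rng.poisson(mu)                      # toy non-negative response
+    X = sm.add_constant(df[['x1', 'x2']], has_constant='add')
+    family = sm.families.Tweedie(var_power=1.5, link=sm.families.links.log())
+    # fit_regularized: alpha = overall penalty strength, L1_wt = lasso/ridge mix
+    res = sm.GLM(df['y'], X, family=family).fit_regularized(alpha=1e-3, L1_wt=0.5)
+    return res.params
+
+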
+class ResNetTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        if context.task_type == 'classification':
+            super().__init__(context, 'ResNetClassifier')
+        else:
+            super().__init__(context, 'ResNet')
+        self.model: Optional[ResNetSklearn] = None
+        self.best_params: Optional[Dict[str, Any]] = None
+        self.best_trial = None
+
+    # ========= Cross-validation (used by BayesOpt) =========
+    def cross_val(self, trial: optuna.trial.Trial) -> float:
+        """
+        Cross-validate the ResNet.
+        To guard against GPU OOM:
+        - each fold builds its own ResNetSklearn
+        - when a fold ends, the model is moved to CPU, deleted, gc'd, and the
+          CUDA cache is emptied
+        - optionally, the BayesOpt stage uses only a subsample of the training set
+        """
+
+        # 1. Hyperparameter space (largely the earlier settings)
+        learning_rate = trial.suggest_float(
+            'learning_rate', 1e-6, 1e-2, log=True
+        )
+        # hidden_dim = trial.suggest_int('hidden_dim', 32, 256, step=32)  # should not be too large
+        hidden_dim = trial.suggest_int('hidden_dim', 8, 32, step=2)
+        block_num = trial.suggest_int('block_num', 2, 10)
+        # batch_num = trial.suggest_int(
+        #     'batch_num',
+        #     10 if self.ctx.obj == 'reg:gamma' else 100,
+        #     100 if self.ctx.obj == 'reg:gamma' else 1000,
+        #     step=10 if self.ctx.obj == 'reg:gamma' else 100
+        # )
+
+        if self.ctx.task_type == 'regression':
+            if self.ctx.obj == 'reg:tweedie':
+                tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
+            elif self.ctx.obj == 'count:poisson':
+                tw_power = 1.0
+            elif self.ctx.obj == 'reg:gamma':
+                tw_power = 2.0
+            else:
+                tw_power = 1.5
+        else:  # classification
+            tw_power = None  # not used
+
+        fold_losses = []
+
+        # 2. (Optional) run the BayesOpt CV on a subsample to ease GPU memory
+        #    and wall-clock pressure
+        data_for_cv = self.ctx.train_oht_scl_data
+        max_rows_for_resnet_bo = min(100000, int(
+            len(data_for_cv) / 5))  # tune down for your GPU, e.g. 50_000 on an A30
+        if len(data_for_cv) > max_rows_for_resnet_bo:
+            data_for_cv = data_for_cv.sample(
+                max_rows_for_resnet_bo,
+                random_state=self.ctx.rand_seed
+            )
+
+        X_all = data_for_cv[self.ctx.var_nmes]
+        y_all = data_for_cv[self.ctx.resp_nme]
+        w_all = data_for_cv[self.ctx.weight_nme]
+
+        # Use a local ShuffleSplit so indices stay consistent on the subsample
+        cv_local = ShuffleSplit(
+            n_splits=int(1 / self.ctx.prop_test),
+            test_size=self.ctx.prop_test,
+            random_state=self.ctx.rand_seed
+        )
+
+        for fold, (train_idx, val_idx) in enumerate(cv_local.split(X_all)):
+            X_train_fold = X_all.iloc[train_idx]
+            y_train_fold = y_all.iloc[train_idx]
+            w_train_fold = w_all.iloc[train_idx]
+
+            X_val_fold = X_all.iloc[val_idx]
+            y_val_fold = y_all.iloc[val_idx]
+            w_val_fold = w_all.iloc[val_idx]
+
+            # 3. Build a temporary ResNet for this fold
+            cv_net = ResNetSklearn(
+                model_nme=self.ctx.model_nme,
+                input_dim=X_all.shape[1],
+                hidden_dim=hidden_dim,
+                block_num=block_num,
+                task_type=self.ctx.task_type,
+                # batch_num=batch_num,
+                epochs=self.ctx.epochs,
+                tweedie_power=tw_power,
+                learning_rate=learning_rate,
+                patience=5
+            )
+
+            try:
+                # 4. Fit (internally still uses the custom tweedie_loss)
+                cv_net.fit(
+                    X_train_fold,
+                    y_train_fold,
+                    w_train_fold,
+                    X_val_fold,
+                    y_val_fold,
+                    w_val_fold
+                )
+
+                # 5. Predict on the validation fold
+                y_pred_fold = cv_net.predict(X_val_fold)
+
+                # 6. Score with Tweedie deviance (evaluation only; the training
+                #    loss is untouched)
+                if self.ctx.task_type == 'regression':
+                    loss = mean_tweedie_deviance(
+                        y_val_fold,
+                        y_pred_fold,
+                        sample_weight=w_val_fold,
+                        power=tw_power
+                    )
+                else:  # classification
+                    from sklearn.metrics import log_loss
+                    loss = log_loss(
+                        y_val_fold,
+                        y_pred_fold,
+                        sample_weight=w_val_fold,
+                    )
+                fold_losses.append(loss)
+            finally:
+                # 7. Release GPU resources at the end of every fold
+                try:
+                    if hasattr(cv_net, "resnet"):
+                        cv_net.resnet.to("cpu")
+                except Exception:
+                    pass
+                del cv_net
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        return np.mean(fold_losses)
+
+    # ========= Optuna tuning =========
+    def tune(self, max_evals: int = 50) -> None:
+        """
+        Bayesian-optimize the ResNet with Optuna.
+        After each trial finishes, do one more global GPU memory cleanup.
+        """
+        def objective(trial: optuna.trial.Trial) -> float:
+            result = self.cross_val(trial)
+            # Trial-level fallback cleanup
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            return result
+
+        study = optuna.create_study(
+            direction='minimize',
+            sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+        )
+        study.optimize(objective, n_trials=max_evals)
+
+        self.best_params = study.best_params
+        self.best_trial = study.best_trial
+
+        params_path = self.output.result_path(
+            f'{self.ctx.model_nme}_bestparams_resn.csv'
+        )
+        pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+    # ========= Train the final ResNet with the best hyperparameters =========
+    def train(self) -> None:
+        if not self.best_params:
+            raise RuntimeError('Run tune() first to obtain the best ResNet parameters.')
+
+        self.model = ResNetSklearn(
+            model_nme=self.ctx.model_nme,
+            input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1],
+            task_type=self.ctx.task_type
+        )
+        self.model.set_params(self.best_params)
+
+        # Fit the final model on the full one-hot + standardized data
+        self.model.fit(
+            self.ctx.train_oht_scl_data[self.ctx.var_nmes],
+            self.ctx.train_oht_scl_data[self.ctx.resp_nme],
+            self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+        )
+
+        # Record the model label
+        self.ctx.model_label += [self.label]
+
+        # Train / test predictions
+        self.ctx.train_data['pred_resn'] = self.model.predict(
+            self.ctx.train_oht_scl_data[self.ctx.var_nmes]
+        )
+        self.ctx.test_data['pred_resn'] = self.model.predict(
+            self.ctx.test_oht_scl_data[self.ctx.var_nmes]
+        )
+
+        # Exposure-weighted predictions
+        self.ctx.train_data.loc[:, 'w_pred_resn'] = (
+            self.ctx.train_data['pred_resn'] *
+            self.ctx.train_data[self.ctx.weight_nme]
+        )
+        self.ctx.test_data.loc[:, 'w_pred_resn'] = (
+            self.ctx.test_data['pred_resn'] *
+            self.ctx.test_data[self.ctx.weight_nme]
+        )
+
+        # Expose for external use
+        self.ctx.resn_best = self.model
+
+    # ========= Save / load =========
+    def save(self) -> None:
+        """
+        Save only the ResNet state_dict (lightweight; no optimizer state).
+        """
+        if self.model is not None:
+            path = self.output.model_path(
+                f'01_{self.ctx.model_nme}_ResNet.pth'
+            )
+            torch.save(self.model.resnet.state_dict(), path)
+
+    def load(self) -> None:
+        """
+        Load the ResNet from file onto the appropriate device.
+        """
+        path = self.output.model_path(
+            f'01_{self.ctx.model_nme}_ResNet.pth'
+        )
+        if os.path.exists(path):
+            resn_loaded = ResNetSklearn(
+                model_nme=self.ctx.model_nme,
+                input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1],
+                task_type=self.ctx.task_type
+            )
+            state_dict = torch.load(path, map_location='cpu')
+            resn_loaded.resnet.load_state_dict(state_dict)
+
+            # Pick the device for the current environment
+            if torch.cuda.is_available():
+                resn_loaded.device = torch.device('cuda')
+            elif torch.backends.mps.is_available():
+                resn_loaded.device = torch.device('mps')
+            else:
+                resn_loaded.device = torch.device('cpu')
+
+            resn_loaded.resnet.to(resn_loaded.device)
+            self.model = resn_loaded
+            self.ctx.resn_best = self.model
+        else:
+            print(f"[ResNetTrainer.load] Model file not found: {path}")
+
+
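+# --- Editor's example (not part of the package): the per-fold GPU cleanup
+# pattern ResNetTrainer.cross_val relies on, shown standalone. `model` is any
+# torch.nn.Module; the try/except mirrors the defensive style used above.
+def _demo_release_gpu(model):
+    try:
+        model.to("cpu")            # move weights off the GPU first
+    except Exception:
+        pass
+    del model                      # drop the reference ...
+    gc.collect()                   # ... so gc can actually free it
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()   # return cached blocks to the driver
+
+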
+class FTTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        if context.task_type == 'classification':
+            super().__init__(context, 'FTTransformerClassifier')
+        else:
+            super().__init__(context, 'FTTransformer')
+        self.model: Optional[FTTransformerSklearn] = None
+        self.best_params: Optional[Dict[str, Any]] = None
+        self.best_trial = None
+
+    def cross_val(self, trial: optuna.trial.Trial) -> float:
+        """
+        Cross-validate the FT-Transformer.
+        This is where GPU memory is most likely to blow up, hence:
+        - a fairly conservative hyperparameter search space
+        - a forced GPU memory release after every fold
+        """
+        # Keep the search space modest to avoid extremely large models
+        learning_rate = trial.suggest_float(
+            'learning_rate', 1e-5, 5e-4, log=True
+        )
+        d_model = trial.suggest_int('d_model', 32, 256, step=32)
+        # n_heads = trial.suggest_categorical('n_heads', [2, 4])  # widened below to avoid underfitting
+        n_heads = trial.suggest_categorical('n_heads', [2, 4, 8])
+        # n_layers = trial.suggest_int('n_layers', 2, 4)  # widened below to avoid underfitting
+        n_layers = trial.suggest_int('n_layers', 2, 8)
+        dropout = trial.suggest_float('dropout', 0.0, 0.2)
+        # batch_num = trial.suggest_int(
+        #     'batch_num',
+        #     5 if self.ctx.obj == 'reg:gamma' else 10,
+        #     10 if self.ctx.obj == 'reg:gamma' else 50,
+        #     step=1 if self.ctx.obj == 'reg:gamma' else 10
+        # )
+
+        if self.ctx.task_type == 'regression':
+            if self.ctx.obj == 'reg:tweedie':
+                tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
+            elif self.ctx.obj == 'count:poisson':
+                tw_power = 1.0
+            elif self.ctx.obj == 'reg:gamma':
+                tw_power = 2.0
+            else:
+                tw_power = 1.5
+        else:  # classification
+            tw_power = None  # not used
+
+        fold_losses = []
+
+        # Optional: run the BO on a subsample so large datasets do not crush
+        # GPU memory outright
+        data_for_cv = self.ctx.train_data
+        max_rows_for_ft_bo = min(1000000, int(
+            len(data_for_cv) / 2))  # adjust up or down for your GPU memory
+        if len(data_for_cv) > max_rows_for_ft_bo:
+            data_for_cv = data_for_cv.sample(
+                max_rows_for_ft_bo,
+                random_state=self.ctx.rand_seed
+            )
+
+        for _, (train_idx, test_idx) in enumerate(
+            self.ctx.cv.split(data_for_cv[self.ctx.factor_nmes])
+        ):
+            X_train_fold = data_for_cv.iloc[train_idx][self.ctx.factor_nmes]
+            y_train_fold = data_for_cv.iloc[train_idx][self.ctx.resp_nme]
+            w_train_fold = data_for_cv.iloc[train_idx][self.ctx.weight_nme]
+            X_val_fold = data_for_cv.iloc[test_idx][self.ctx.factor_nmes]
+            y_val_fold = data_for_cv.iloc[test_idx][self.ctx.resp_nme]
+            w_val_fold = data_for_cv.iloc[test_idx][self.ctx.weight_nme]
+
+            cv_ft = FTTransformerSklearn(
+                model_nme=self.ctx.model_nme,
+                num_cols=self.ctx.num_features,
+                cat_cols=self.ctx.cate_list,
+                d_model=d_model,
+                n_heads=n_heads,
+                n_layers=n_layers,
+                dropout=dropout,
+                task_type=self.ctx.task_type,
+                # batch_num=batch_num,
+                epochs=self.ctx.epochs,
+                tweedie_power=tw_power,
+                learning_rate=learning_rate,
+                patience=5
+            )
+
+            try:
+                cv_ft.fit(
+                    X_train_fold, y_train_fold, w_train_fold,
+                    X_val_fold, y_val_fold, w_val_fold
+                )
+                y_pred_fold = cv_ft.predict(X_val_fold)
+                if self.ctx.task_type == 'regression':
+                    loss = mean_tweedie_deviance(
+                        y_val_fold,
+                        y_pred_fold,
+                        sample_weight=w_val_fold,
+                        power=tw_power
+                    )
+                else:  # classification
+                    from sklearn.metrics import log_loss
+                    loss = log_loss(
+                        y_val_fold,
+                        y_pred_fold,
+                        sample_weight=w_val_fold,
+                    )
+                fold_losses.append(loss)
+            finally:
+                # Release GPU resources as soon as each fold is done
+                try:
+                    # If the model sits on the GPU, move it back to the CPU first
+                    if hasattr(cv_ft, "ft"):
+                        cv_ft.ft.to("cpu")
+                except Exception:
+                    pass
+                del cv_ft
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        return np.mean(fold_losses)
+
+    def tune(self, max_evals: int = 50) -> None:
+        """
+        Hyperparameter search with Optuna.
+        Run another memory cleanup after each trial so GPU memory fragments
+        do not pile up across trials.
+        """
+        def objective(trial: optuna.trial.Trial) -> float:
+            result = self.cross_val(trial)
+            # Trial-level fallback cleanup
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            return result
+
+        study = optuna.create_study(
+            direction='minimize',
+            sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+        )
+        study.optimize(objective, n_trials=max_evals)
+        self.best_params = study.best_params
+        self.best_trial = study.best_trial
+        params_path = self.output.result_path(
+            f'{self.ctx.model_nme}_bestparams_ft.csv'
+        )
+        pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+    def train(self) -> None:
+        if not self.best_params:
+            raise RuntimeError('Run tune() first to obtain the best FT-Transformer parameters.')
+        self.model = FTTransformerSklearn(
+            model_nme=self.ctx.model_nme,
+            num_cols=self.ctx.num_features,
+            cat_cols=self.ctx.cate_list,
+            task_type=self.ctx.task_type
+        )
+        self.model.set_params(self.best_params)
+        self.model.fit(
+            self.ctx.train_data[self.ctx.factor_nmes],
+            self.ctx.train_data[self.ctx.resp_nme],
+            self.ctx.train_data[self.ctx.weight_nme]
+        )
+        self.ctx.model_label += [self.label]
+        self.ctx.train_data['pred_ft'] = self.model.predict(
+            self.ctx.train_data[self.ctx.factor_nmes]
+        )
+        self.ctx.test_data['pred_ft'] = self.model.predict(
+            self.ctx.test_data[self.ctx.factor_nmes]
+        )
+        self.ctx.train_data.loc[:, 'w_pred_ft'] = (
+            self.ctx.train_data['pred_ft'] *
+            self.ctx.train_data[self.ctx.weight_nme]
+        )
+        self.ctx.test_data.loc[:, 'w_pred_ft'] = (
+            self.ctx.test_data['pred_ft'] *
+            self.ctx.test_data[self.ctx.weight_nme]
+        )
+        self.ctx.ft_best = self.model
+
+    def save(self) -> None:
+        if self.model is not None:
+            torch.save(
+                self.model,
+                self.output.model_path(
+                    f'01_{self.ctx.model_nme}_FTTransformer.pth')
+            )
+
+    def load(self) -> None:
+        path = self.output.model_path(
+            f'01_{self.ctx.model_nme}_FTTransformer.pth')
+        if os.path.exists(path):
+            ft_loaded = torch.load(path, map_location='cpu')
+            if torch.cuda.is_available():
+                ft_loaded.device = torch.device('cuda')
+            elif torch.backends.mps.is_available():
+                ft_loaded.device = torch.device('mps')
+            else:
+                ft_loaded.device = torch.device('cpu')
+            ft_loaded.ft.to(ft_loaded.device)
+            self.model = ft_loaded
+            self.ctx.ft_best = self.model
+        else:
+            print(f"[load_model] Warning: FT-Transformer model file not found: {path}")
+
+
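+# --- Editor's note (not part of the package): every trainer above follows the
+# same hypothetical lifecycle; `bom` stands for a constructed BayesOptModel.
+#     trainer = bom.trainers['ft']
+#     trainer.tune(max_evals=20)   # Optuna search; writes *_bestparams_ft.csv
+#     trainer.train()              # refit on the full training data
+#     trainer.save()               # writes 01_<model_nme>_FTTransformer.pth
+
+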
+# =============================================================================
+# BayesOpt orchestration & SHAP utilities
+# =============================================================================
+class BayesOptModel:
+    def __init__(self, train_data, test_data,
+                 model_nme, resp_nme, weight_nme, factor_nmes, task_type='regression',
+                 binary_resp_nme=None,
+                 cate_list=None, prop_test=0.25, rand_seed=None,
+                 epochs=100, use_gpu=True):
+        cfg = BayesOptConfig(
+            model_nme=model_nme,
+            task_type=task_type,
+            resp_nme=resp_nme,
+            weight_nme=weight_nme,
+            factor_nmes=list(factor_nmes),
+            binary_resp_nme=binary_resp_nme,
+            cate_list=list(cate_list) if cate_list else None,
+            prop_test=prop_test,
+            rand_seed=rand_seed,
+            epochs=epochs,
+            use_gpu=use_gpu
+        )
+        self.config = cfg
+        self.model_nme = cfg.model_nme
+        self.task_type = cfg.task_type
+        self.resp_nme = cfg.resp_nme
+        self.weight_nme = cfg.weight_nme
+        self.factor_nmes = cfg.factor_nmes
+        self.binary_resp_nme = cfg.binary_resp_nme
+        self.cate_list = list(cfg.cate_list or [])
+        self.prop_test = cfg.prop_test
+        self.epochs = cfg.epochs
+        self.rand_seed = cfg.rand_seed if cfg.rand_seed is not None else np.random.randint(
+            1, 10000)
+        self.use_gpu = bool(cfg.use_gpu and torch.cuda.is_available())
+        self.output_manager = OutputManager(os.getcwd(), self.model_nme)
+
+        preprocessor = DatasetPreprocessor(train_data, test_data, cfg).run()
+        self.train_data = preprocessor.train_data
+        self.test_data = preprocessor.test_data
+        self.train_oht_scl_data = preprocessor.train_oht_scl_data
+        self.test_oht_scl_data = preprocessor.test_oht_scl_data
+        self.var_nmes = preprocessor.var_nmes
+        self.num_features = preprocessor.num_features
+        self.cat_categories_for_shap = preprocessor.cat_categories_for_shap
+
+        self.cv = ShuffleSplit(n_splits=int(1 / self.prop_test),
+                               test_size=self.prop_test,
+                               random_state=self.rand_seed)
+        if self.task_type == 'classification':
+            self.obj = 'binary:logistic'
+        else:  # regression: infer the objective from the model name
+            # ('f' ~ frequency/Poisson, 's' ~ severity/Gamma, 'bc' ~ burning cost/Tweedie)
+            if 'f' in self.model_nme:
+                self.obj = 'count:poisson'
+            elif 's' in self.model_nme:
+                self.obj = 'reg:gamma'
+            elif 'bc' in self.model_nme:
+                self.obj = 'reg:tweedie'
+            else:
+                self.obj = 'reg:tweedie'
+        self.fit_params = {
+            'sample_weight': self.train_data[self.weight_nme].values
+        }
+        self.model_label: List[str] = []
+
+        # Register the per-model trainers; everything downstream goes through
+        # these labels, which keeps adding new models straightforward
+        self.trainers: Dict[str, TrainerBase] = {
+            'xgb': XGBTrainer(self),
+            'resn': ResNetTrainer(self),
+            'ft': FTTrainer(self),
+            'glm': GLMTrainer(self)
+        }
+        self.xgb_best = None
+        self.resn_best = None
+        self.glm_best = None
+        self.ft_best = None
+        self.best_xgb_params = None
+        self.best_resn_params = None
+        self.best_ft_params = None
+        self.best_xgb_trial = None
+        self.best_resn_trial = None
+        self.best_ft_trial = None
+        self.best_glm_params = None
+        self.best_glm_trial = None
+        self.xgb_load = None
+        self.resn_load = None
+        self.ft_load = None
+
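+    # --- Editor's example (not part of the package): hypothetical construction;
+    # dataframes and column names are invented for illustration.
+    #     bom = BayesOptModel(
+    #         train_df, test_df,
+    #         model_nme='demo_f',            # 'f' in the name -> count:poisson
+    #         resp_nme='claim_freq', weight_nme='exposure',
+    #         factor_nmes=['age', 'region', 'veh_power'],
+    #         cate_list=['region'], task_type='regression',
+    #         epochs=50, use_gpu=False)
+    #     bom.bayesopt_xgb(max_evals=30)
+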
+    # One-way (single-factor) plots
+    def plot_oneway(self, n_bins=10):
+        for c in self.factor_nmes:
+            fig = plt.figure(figsize=(7, 5))
+            if c in self.cate_list:
+                group_col = c
+                plot_source = self.train_data
+            else:
+                group_col = f'{c}_bins'
+                bins = pd.qcut(
+                    self.train_data[c],
+                    n_bins,
+                    duplicates='drop'  # note: duplicate quantile edges drop bins rather than aborting
+                )
+                plot_source = self.train_data.assign(**{group_col: bins})
+            plot_data = plot_source.groupby(
+                [group_col], observed=True).sum(numeric_only=True)
+            plot_data.reset_index(inplace=True)
+            plot_data['act_v'] = plot_data['w_act'] / \
+                plot_data[self.weight_nme]
+            ax = fig.add_subplot(111)
+            ax.plot(plot_data.index, plot_data['act_v'],
+                    label='Actual', color='red')
+            ax.set_title(
+                'Analysis of %s : Train Data' % group_col,
+                fontsize=8)
+            plt.xticks(plot_data.index,
+                       list(plot_data[group_col].astype(str)),
+                       rotation=90)
+            if len(list(plot_data[group_col].astype(str))) > 50:
+                plt.xticks(fontsize=3)
+            else:
+                plt.xticks(fontsize=6)
+            plt.yticks(fontsize=6)
+            ax2 = ax.twinx()
+            ax2.bar(plot_data.index,
+                    plot_data[self.weight_nme],
+                    alpha=0.5, color='seagreen')
+            plt.yticks(fontsize=6)
+            plt.margins(0.05)
+            plt.subplots_adjust(wspace=0.3)
+            save_path = self.output_manager.plot_path(
+                f'00_{self.model_nme}_{group_col}_oneway.png')
+            plt.savefig(save_path, dpi=300)
+            plt.close(fig)
+
+    # Bayesian optimization for XGBoost
+    def bayesopt_xgb(self, max_evals=100):
+        trainer = self.trainers['xgb']
+        trainer.tune(max_evals)
+        trainer.train()
+        self.xgb_best = trainer.model
+        # Keep the best params and trial for debugging and reproducibility
+        self.best_xgb_params = trainer.best_params
+        self.best_xgb_trial = trainer.best_trial
+
+    # Bayesian optimization for the GLM
+    def bayesopt_glm(self, max_evals=50):
+        trainer = self.trainers['glm']
+        trainer.tune(max_evals)
+        trainer.train()
+        self.glm_best = trainer.model
+        self.best_glm_params = trainer.best_params
+        self.best_glm_trial = trainer.best_trial
+
+    # Bayesian optimization for the ResNet
+    def bayesopt_resnet(self, max_evals=100):
+        trainer = self.trainers['resn']
+        trainer.tune(max_evals)
+        trainer.train()
+        self.resn_best = trainer.model
+        # Store the best-trial details for later tuning analysis
+        self.best_resn_params = trainer.best_params
+        self.best_resn_trial = trainer.best_trial
+
+    # Bayesian optimization for the FT-Transformer
+    def bayesopt_ft(self, max_evals=50):
+        trainer = self.trainers['ft']
+        trainer.tune(max_evals)
+        trainer.train()
+        self.ft_best = trainer.model
+        # The FT-Transformer has many knobs, so retaining its config matters even more
+        self.best_ft_params = trainer.best_params
+        self.best_ft_trial = trainer.best_trial
+
+    # Binning helper
+    def _split_data(self, data, col_nme, wgt_nme, n_bins=10):
+        # Sort by score, then split on cumulative weight so every bin carries
+        # a similar amount of exposure
+        sorted_data = data.sort_values(by=col_nme, ascending=True).copy()
+        sorted_data['cum_weight'] = sorted_data[wgt_nme].cumsum()
+        w_sum = sorted_data[wgt_nme].sum()
+        if w_sum <= EPS:
+            sorted_data.loc[:, 'bins'] = 0
+        else:
+            sorted_data.loc[:, 'bins'] = np.floor(
+                sorted_data['cum_weight'] * float(n_bins) / w_sum
+            )
+            sorted_data.loc[(sorted_data['bins'] == n_bins),
+                            'bins'] = n_bins - 1
+        return sorted_data.groupby(['bins'], observed=True).sum(numeric_only=True)
+
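+    # --- Editor's note (not part of the package): worked toy case. With weights
+    # all 1.0, scores [0.1, 0.3, 0.5, 0.7, 0.9] and n_bins=2, cum_weight is
+    # [1..5] and floor(cum_weight * 2 / 5) = [0, 0, 1, 1, 2]; the final value is
+    # clamped to n_bins - 1, giving bins [0, 0, 1, 1, 1]: the lowest two scores
+    # land in bin 0, the top three in bin 1.
+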
+    # Build the data behind the lift chart
+    def _plot_data_lift(self,
+                        pred_list, w_pred_list,
+                        w_act_list, weight_list, n_bins=10):
+        lift_data = pd.DataFrame()
+        lift_data.loc[:, 'pred'] = pred_list
+        lift_data.loc[:, 'w_pred'] = w_pred_list
+        lift_data.loc[:, 'act'] = w_act_list
+        lift_data.loc[:, 'weight'] = weight_list
+        plot_data = self._split_data(
+            lift_data, 'pred', 'weight', n_bins)
+        denom = np.maximum(plot_data['weight'], EPS)
+        plot_data['exp_v'] = plot_data['w_pred'] / denom
+        plot_data['act_v'] = plot_data['act'] / denom
+        plot_data.reset_index(inplace=True)
+        return plot_data
+
+    # Plot the lift chart
+    def plot_lift(self, model_label, pred_nme, n_bins=10):
+        # Plot results on both the train and test sets
+        figpos_list = [121, 122]
+        plot_dict = {
+            121: self.train_data,
+            122: self.test_data
+        }
+        name_list = {
+            121: 'Train Data',
+            122: 'Test Data'
+        }
+        if model_label == 'Xgboost':
+            pred_nme = 'pred_xgb'
+        elif model_label == 'ResNet':
+            pred_nme = 'pred_resn'
+        elif model_label == 'FTTransformer':
+            pred_nme = 'pred_ft'
+        elif model_label.startswith('ResNetClassifier'):
+            pred_nme = 'pred_resn'
+        elif model_label.startswith('FTTransformerClassifier'):
+            pred_nme = 'pred_ft'
+        elif model_label == 'GLM':
+            pred_nme = 'pred_glm'
+        # The pred_nme mapping keeps column access uniform; new models must be
+        # added here in step
+
+        fig = plt.figure(figsize=(11, 5))
+        for figpos in figpos_list:
+            plot_data = self._plot_data_lift(
+                plot_dict[figpos][pred_nme].values,
+                plot_dict[figpos]['w_' + pred_nme].values,
+                plot_dict[figpos]['w_act'].values,
+                plot_dict[figpos][self.weight_nme].values,
+                n_bins)
+            ax = fig.add_subplot(figpos)
+            ax.plot(plot_data.index, plot_data['act_v'],
+                    label='Actual', color='red')
+            ax.plot(plot_data.index, plot_data['exp_v'],
+                    label='Predicted', color='blue')
+            ax.set_title(
+                'Lift Chart on %s' % name_list[figpos], fontsize=8)
+            plt.xticks(plot_data.index,
+                       plot_data.index,
+                       rotation=90, fontsize=6)
+            plt.yticks(fontsize=6)
+            plt.legend(loc='upper left',
+                       fontsize=5, frameon=False)
+            plt.margins(0.05)
+            ax2 = ax.twinx()
+            ax2.bar(plot_data.index, plot_data['weight'],
+                    alpha=0.5, color='seagreen',
+                    label='Earned Exposure')
+            plt.yticks(fontsize=6)
+            plt.legend(loc='upper right',
+                       fontsize=5, frameon=False)
+        plt.subplots_adjust(wspace=0.3)
+        save_path = self.output_manager.plot_path(
+            f'01_{self.model_nme}_{model_label}_lift.png')
+        plt.savefig(save_path, dpi=300)
+        plt.show()
+        plt.close(fig)
+
+    # Build the data behind the double lift chart
+    def _plot_data_dlift(self,
+                         pred_list_model1, pred_list_model2,
+                         w_list, w_act_list, n_bins=10):
+        lift_data = pd.DataFrame()
+        lift_data.loc[:, 'pred1'] = pred_list_model1
+        lift_data.loc[:, 'pred2'] = pred_list_model2
+        lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+        lift_data.loc[:, 'act'] = w_act_list
+        lift_data.loc[:, 'weight'] = w_list
+        plot_data = self._split_data(lift_data, 'diff_ly', 'weight', n_bins)
+        denom = np.maximum(plot_data['act'], EPS)
+        plot_data['exp_v1'] = plot_data['pred1'] / denom
+        plot_data['exp_v2'] = plot_data['pred2'] / denom
+        plot_data['act_v'] = plot_data['act'] / denom
+        plot_data.reset_index(inplace=True)
+        return plot_data
+
+    # Plot the double lift chart
+    def plot_dlift(self, model_comp: List[str] = ['xgb', 'resn'], n_bins: int = 10) -> None:
+        """
+        Plot a double lift chart comparing the predictions of two models.
+
+        Args:
+            model_comp: list of two model short names, e.g. ['xgb', 'resn'].
+                Supported: 'xgb', 'resn', 'ft', 'glm'.
+            n_bins: number of bins.
+        """
+        if len(model_comp) != 2:
+            raise ValueError("`model_comp` must contain exactly two models to compare.")
+
+        model_name_map = {
+            'xgb': 'Xgboost',
+            'resn': 'ResNet',
+            'ft': 'FTTransformer',
+            'glm': 'GLM'
+        }
+
+        name1, name2 = model_comp
+        if name1 not in model_name_map or name2 not in model_name_map:
+            raise ValueError(
+                f"Unsupported model short name. Choose from {list(model_name_map.keys())}.")
+
+        fig, axes = plt.subplots(1, 2, figsize=(11, 5))
+        datasets = {
+            'Train Data': self.train_data,
+            'Test Data': self.test_data
+        }
+
+        for ax, (data_name, data) in zip(axes, datasets.items()):
+            pred1_col = f'w_pred_{name1}'
+            pred2_col = f'w_pred_{name2}'
+
+            if pred1_col not in data.columns or pred2_col not in data.columns:
+                print(
+                    f"Warning: prediction column {pred1_col} or {pred2_col} "
+                    f"not found in {data_name}. Skipping plot.")
+                continue
+
+            plot_data = self._plot_data_dlift(
+                data[pred1_col].values,
+                data[pred2_col].values,
+                data[self.weight_nme].values,
+                data['w_act'].values,
+                n_bins
+            )
+
+            label1 = model_name_map[name1]
+            label2 = model_name_map[name2]
+
+            ax.plot(plot_data.index,
+                    plot_data['act_v'], label='Actual', color='red')
+            ax.plot(plot_data.index,
+                    plot_data['exp_v1'], label=label1, color='blue')
+            ax.plot(plot_data.index,
+                    plot_data['exp_v2'], label=label2, color='black')
+
+            ax.set_title(f'Double Lift Chart on {data_name}', fontsize=8)
+            ax.set_xticks(plot_data.index)
+            ax.set_xticklabels(plot_data.index, rotation=90, fontsize=6)
+            ax.set_xlabel(f'{label1} / {label2}', fontsize=6)
+            ax.tick_params(axis='y', labelsize=6)
+            ax.legend(loc='upper left', fontsize=5, frameon=False)
+            ax.margins(0.1)
+
+            ax2 = ax.twinx()
+            ax2.bar(plot_data.index, plot_data['weight'],
+                    alpha=0.5, color='seagreen', label='Earned Exposure')
+            ax2.tick_params(axis='y', labelsize=6)
+            ax2.legend(loc='upper right', fontsize=5, frameon=False)
+
+        plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8, wspace=0.3)
+        save_path = self.output_manager.plot_path(
+            f'02_{self.model_nme}_dlift_{name1}_vs_{name2}.png')
+        plt.savefig(save_path, dpi=300)
+        plt.show()
+        plt.close(fig)
+
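+    # --- Editor's example (not part of the package): hypothetical call, assuming
+    # both models were trained so w_pred_xgb / w_pred_glm exist on train and test:
+    #     bom.plot_dlift(['xgb', 'glm'], n_bins=10)
+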
+    # Plot the conversion-rate lift chart
+    def plot_conversion_lift(self, model_pred_col: str, n_bins: int = 20):
+        if not self.binary_resp_nme:
+            print("Error: `binary_resp_nme` was not provided when BayesOptModel "
+                  "was initialized; cannot plot the conversion-rate chart.")
+            return
+
+        fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
+        datasets = {
+            'Train Data': self.train_data,
+            'Test Data': self.test_data
+        }
+
+        for ax, (data_name, data) in zip(axes, datasets.items()):
+            if model_pred_col not in data.columns:
+                print(f"Warning: prediction column '{model_pred_col}' not found "
+                      f"in {data_name}. Skipping plot.")
+                continue
+
+            # Sort by model score and assign bins
+            plot_data = data.sort_values(by=model_pred_col).copy()
+            plot_data['cum_weight'] = plot_data[self.weight_nme].cumsum()
+            total_weight = plot_data[self.weight_nme].sum()
+
+            if total_weight > EPS:
+                plot_data['bin'] = pd.cut(
+                    plot_data['cum_weight'],
+                    bins=n_bins,
+                    labels=False,
+                    right=False
+                )
+            else:
+                plot_data['bin'] = 0
+
+            # Aggregate by bin
+            lift_agg = plot_data.groupby('bin').agg(
+                total_weight=(self.weight_nme, 'sum'),
+                actual_conversions=(self.binary_resp_nme, 'sum'),
+                weighted_conversions=('w_binary_act', 'sum'),
+                avg_pred=(model_pred_col, 'mean')
+            ).reset_index()
+
+            # Conversion rate per bin
+            lift_agg['conversion_rate'] = lift_agg['weighted_conversions'] / \
+                lift_agg['total_weight']
+
+            # Overall average conversion rate
+            overall_conversion_rate = data['w_binary_act'].sum(
+            ) / data[self.weight_nme].sum()
+            ax.axhline(y=overall_conversion_rate, color='gray', linestyle='--',
+                       label=f'Overall Avg Rate ({overall_conversion_rate:.2%})')
+
+            ax.plot(lift_agg['bin'], lift_agg['conversion_rate'],
+                    marker='o', linestyle='-', label='Actual Conversion Rate')
+            ax.set_title(f'Conversion Rate Lift Chart on {data_name}')
+            ax.set_xlabel(f'Model Score Bin (based on {model_pred_col})')
+            ax.set_ylabel('Conversion Rate')
+            ax.grid(True, linestyle='--', alpha=0.6)
+            ax.legend()
+
+        plt.tight_layout()
+        plt.show()
+
+    # Save models
+    def save_model(self, model_name=None):
+        # model_name may be:
+        # - None: save every available model
+        # - 'xgb': save only Xgboost
+        # - 'resn': save only ResNet
+        # - 'ft': save only FT-Transformer
+        # - 'glm': save only GLM
+        if model_name in (None, 'xgb'):
+            trainer = self.trainers['xgb']
+            if trainer.model is not None:
+                trainer.save()
+            else:
+                print("[save_model] Warning: xgb_best does not exist; Xgboost model not saved.")
+
+        if model_name in (None, 'resn'):
+            trainer = self.trainers['resn']
+            if trainer.model is not None:
+                trainer.save()
+            else:
+                print("[save_model] Warning: resn_best does not exist; ResNet model not saved.")
+
+        if model_name in (None, 'ft'):
+            trainer = self.trainers['ft']
+            if trainer.model is not None:
+                trainer.save()
+            else:
+                print("[save_model] Warning: ft_best does not exist; FT-Transformer model not saved.")
+
+        if model_name in (None, 'glm'):
+            trainer = self.trainers['glm']
+            if trainer.model is not None:
+                trainer.save()
+            else:
+                print("[save_model] Warning: glm_best does not exist; GLM model not saved.")
+
+    def load_model(self, model_name=None):
+        # model_name may be:
+        # - None: load every model that can be found
+        # - 'xgb': load only Xgboost
+        # - 'resn': load only ResNet
+        # - 'ft': load only FT-Transformer
+        # - 'glm': load only GLM
+        if model_name in (None, 'xgb'):
+            trainer = self.trainers['xgb']
+            trainer.load()
+            self.xgb_best = trainer.model
+            self.xgb_load = trainer.model
+
+        if model_name in (None, 'resn'):
+            trainer = self.trainers['resn']
+            trainer.load()
+            self.resn_best = trainer.model
+            self.resn_load = trainer.model
+
+        if model_name in (None, 'ft'):
+            trainer = self.trainers['ft']
+            trainer.load()
+            self.ft_best = trainer.model
+            self.ft_load = trainer.model
+
+        if model_name in (None, 'glm'):
+            trainer = self.trainers['glm']
+            trainer.load()
+            self.glm_best = trainer.model
+
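+    # --- Editor's example (not part of the package): hypothetical round trip.
+    #     bom.save_model()         # persist every trained model
+    #     bom.load_model('xgb')    # later session: restore only XGBoost
+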
+    def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
+        # Convert the raw-feature DataFrame (containing self.factor_nmes) into
+        # a purely numeric matrix: numeric columns as float64, categorical
+        # columns as integer codes (stored as float64).
+        # Column order matches self.factor_nmes.
+        matrices = []
+
+        for col in self.factor_nmes:
+            s = data[col]
+
+            if col in self.cate_list:
+                # Categorical column: encode against the full category set
+                # seen at training time
+                cats = pd.Categorical(
+                    s,
+                    categories=self.cat_categories_for_shap[col]
+                )
+                # cats.codes is an Index / ndarray; wrap in np.asarray, then reshape
+                codes = np.asarray(cats.codes, dtype=np.float64).reshape(-1, 1)
+                matrices.append(codes)
+            else:
+                # Numeric column: Series -> numpy -> reshape
+                vals = pd.to_numeric(s, errors="coerce")
+                arr = vals.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
+                matrices.append(arr)
+
+        X_mat = np.concatenate(matrices, axis=1)  # (N, F)
+        return X_mat
+
+    def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
+        # Restore SHAP's numeric matrix (N, F) to a raw-feature DataFrame:
+        # numeric columns as float, categorical columns back to pandas
+        # category dtype, so the result feeds both enable_categorical=True
+        # XGBoost and the FT-Transformer. Column order = self.factor_nmes.
+        data_dict = {}
+
+        for j, col in enumerate(self.factor_nmes):
+            col_vals = X_mat[:, j]
+
+            if col in self.cate_list:
+                cats = self.cat_categories_for_shap[col]
+
+                # SHAP perturbs the codes into fractions; round back to integer codes
+                codes = np.round(col_vals).astype(int)
+                # Clamp to [-1, len(cats) - 1]
+                codes = np.clip(codes, -1, len(cats) - 1)
+
+                # pandas.Categorical.from_codes:
+                # - code -1 is treated as missing (NaN)
+                # - other indices map to the matching category in cats
+                cat_series = pd.Categorical.from_codes(
+                    codes,
+                    categories=cats
+                )
+                # Stored as a Categorical, not object
+                data_dict[col] = cat_series
+            else:
+                # Numeric column: plain float
+                data_dict[col] = col_vals.astype(float)
+
+        df = pd.DataFrame(data_dict, columns=self.factor_nmes)
+
+        # Belt and braces: make sure every categorical column really has
+        # category dtype
+        for col in self.cate_list:
+            if col in df.columns:
+                df[col] = df[col].astype("category")
+        return df
+
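+    # --- Editor's note (not part of the package): the encode/decode round trip
+    # implemented by the two helpers above, on a toy column. Categories invented:
+    #     cats = ['north', 'south']
+    #     codes = pd.Categorical(['south', 'north'], categories=cats).codes  # [1, 0]
+    #     back = pd.Categorical.from_codes(np.round(codes.astype(float)).astype(int),
+    #                                      categories=cats)  # ['south', 'north']
+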
+    def _build_glm_design(self, data: pd.DataFrame) -> pd.DataFrame:
+        # Mirror the GLM training stage: add the intercept on top of the
+        # one-hot + standardized features
+        X = data[self.var_nmes]
+        return sm.add_constant(X, has_constant='add')
+
+    # ========= XGBoost SHAP =========
+
+    def compute_shap_xgb(self, n_background: int = 500,
+                         n_samples: int = 200,
+                         on_train: bool = True):
+        # Compute SHAP values for XGBoost with KernelExplainer (black-box mode).
+        #
+        # - For SHAP, the input is a purely numeric matrix:
+        #   * numeric features: float64
+        #   * categorical features: integer codes from _build_ft_shap_matrix (float64)
+        # - For the model, still the raw DataFrame + xgb_best.predict(...)
+        if getattr(self, "xgb_best", None) is None:
+            raise RuntimeError("Run bayesopt_xgb() first to train self.xgb_best")
+
+        # 1) Pick the data source: train or test set (raw feature space)
+        data = self.train_data if on_train else self.test_data
+        X_raw = data[self.factor_nmes]
+
+        # 2) Build the background matrix (same numeric encoding as for FT)
+        background_raw = X_raw.sample(
+            min(len(X_raw), n_background),
+            random_state=self.rand_seed
+        )
+        # KernelExplainer is very expensive; keep the background sample small
+        # or debugging slows to a crawl
+        background_mat = self._build_ft_shap_matrix(
+            background_raw
+        ).astype(np.float64, copy=True)
+
+        # 3) Black-box prediction: numeric matrix -> DataFrame -> xgb_best.predict
+        def f_predict(x_mat: np.ndarray) -> np.ndarray:
+            # Restore the encoded matrix to the raw DataFrame (numeric + categorical)
+            df_input = self._decode_ft_shap_matrix_to_df(x_mat)
+            # Note: this is self.xgb_best.predict, the same call used for
+            # training and scoring
+            y_pred = self.xgb_best.predict(df_input)
+            return y_pred
+
+        explainer = shap.KernelExplainer(f_predict, background_mat)
+
+        # 4) Samples to explain: raw features + numeric encoding
+        X_explain_raw = X_raw.sample(
+            min(len(X_raw), n_samples),
+            random_state=self.rand_seed
+        )
+        X_explain_mat = self._build_ft_shap_matrix(
+            X_explain_raw
+        ).astype(np.float64, copy=True)
+
+        # 5) Compute SHAP values (nsamples='auto' keeps the cost in check)
+        shap_values = explainer.shap_values(X_explain_mat, nsamples="auto")
+
+        # 6) Store the results:
+        #    - shap_values: in the numeric-encoding space, one column per factor_nmes entry
+        #    - X_explain_raw: the raw DataFrame, so plots can show real category names
+        self.shap_xgb = {
+            "explainer": explainer,
+            "X_explain": X_explain_raw,
+            "shap_values": shap_values,
+            "base_value": explainer.expected_value,
+        }
+        return self.shap_xgb
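+
+    # --- Editor's example (not part of the package): plotting the result with
+    # the shap library, assuming compute_shap_xgb() has been run on `bom`:
+    #     res = bom.compute_shap_xgb(n_background=100, n_samples=50)
+    #     shap.summary_plot(res['shap_values'], res['X_explain'],
+    #                       feature_names=bom.factor_nmes)
+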
+    # ========= ResNet SHAP =========
+    def _resn_predict_wrapper(self, X_np):
+        # Make sure inference runs on the CPU
+        model = self.resn_best.resnet.to("cpu")
+        with torch.no_grad():
+            # Deliberately no .to(self.device)
+            X_tensor = torch.tensor(X_np, dtype=torch.float32)
+            y_pred = model(X_tensor).cpu().numpy()
+        y_pred = np.clip(y_pred, 1e-6, None)
+        return y_pred.reshape(-1)
+
+    def compute_shap_resn(self, n_background: int = 500,
+                          n_samples: int = 200,
+                          on_train: bool = True):
+        # Compute SHAP values for the ResNet with KernelExplainer.
+        # Explanation space: the one-hot & standardized features self.var_nmes.
+        if getattr(self, 'resn_best', None) is None:
+            raise RuntimeError("Run bayesopt_resnet() first to train resn_best")
+
+        self.resn_best.device = torch.device("cpu")  # force the CPU path
+        self.resn_best.resnet.to("cpu")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Pick the dataset (already one-hot & standardized)
+        data = self.train_oht_scl_data if on_train else self.test_oht_scl_data
+        X = data[self.var_nmes]
+        if len(X) == 0:
+            raise ValueError(
+                "compute_shap_resn: the selected dataset is empty (len(X)==0); cannot compute SHAP.")
+
+        # Background sample: float64 numpy
+        background_df = X.sample(
+            min(len(X), n_background),
+            random_state=self.rand_seed
+        )
+        background_np = background_df.to_numpy(dtype=np.float64, copy=True)
+
+        # Black-box prediction function
+        def f_predict(x):
+            y = self._resn_predict_wrapper(x)
+            # Guarantee a 1-D array
+            y = np.asarray(y, dtype=np.float64).reshape(-1)
+            return y
+
+        explainer = shap.KernelExplainer(f_predict, background_np)
+
+        # Samples to explain
+        X_explain_df = X.sample(
+            min(len(X), n_samples),
+            random_state=self.rand_seed
+        )
+        X_explain_np = X_explain_df.to_numpy(dtype=np.float64, copy=True)
+
+        max_nsamples = 300
+        min_needed = X_explain_np.shape[1] + 2
+        nsample_eff = max(min_needed, min(max_nsamples,
+                                          X_explain_np.shape[0] * X_explain_np.shape[1]))
+        shap_values = explainer.shap_values(X_explain_np, nsamples=nsample_eff)
+        # Compute base_value manually to avoid NotOneValueFound
+        bg_pred = f_predict(background_np)
+        if bg_pred.size == 0:
+            raise ValueError(
+                "compute_shap_resn: background predictions are empty; cannot compute base_value.")
+        base_value = float(bg_pred.mean())
+
+        self.shap_resn = {
+            "explainer": explainer,
+            "X_explain": X_explain_df,   # DataFrame: used for plotting (has column names)
+            "shap_values": shap_values,  # numpy: (n_samples, n_features)
+            # "base_value": explainer.expected_value,
+            "base_value": base_value,
+        }
+        return self.shap_resn
+
+    # ========= FT-Transformer SHAP =========
+    def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
+        # SHAP prediction wrapper:
+        # numeric matrix -> restore the raw-feature DataFrame -> call ft_best.predict
+        df_input = self._decode_ft_shap_matrix_to_df(X_mat)
+        y_pred = self.ft_best.predict(df_input)
+        return np.asarray(y_pred, dtype=np.float64).reshape(-1)
+
+    def compute_shap_ft(self, n_background: int = 500,
+                        n_samples: int = 200,
+                        on_train: bool = True):
+        # Compute SHAP values for the FT-Transformer with KernelExplainer.
+        # Explanation space: the mixed numeric matrix of values + category
+        # codes (float64); plots still use the raw feature names and values
+        # (X_explain).
+        if getattr(self, "ft_best", None) is None:
+            raise RuntimeError("Run bayesopt_ft() first to train ft_best")
+
+        self.ft_best.device = torch.device("cpu")  # force the CPU path
+        self.ft_best.ft.to("cpu")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Pick the data source (raw feature space)
+        data = self.train_data if on_train else self.test_data
+        X_raw = data[self.factor_nmes]
+
+        # Background matrix
+        background_raw = X_raw.sample(
+            min(len(X_raw), n_background),
+            random_state=self.rand_seed
+        )
+        background_mat = self._build_ft_shap_matrix(
+            background_raw
+        ).astype(np.float64, copy=True)
+
+        # Black-box prediction (numeric matrix -> DataFrame -> FT model)
+        def f_predict(x):
+            return self._ft_shap_predict_wrapper(x)
+
+        explainer = shap.KernelExplainer(f_predict, background_mat)
+
+        # Samples to explain (raw feature space)
+        X_explain_raw = X_raw.sample(
+            min(len(X_raw), n_samples),
+            random_state=self.rand_seed
+        )
+        X_explain_mat = self._build_ft_shap_matrix(
+            X_explain_raw
+        ).astype(np.float64, copy=True)
+
+        max_nsamples = 300
+        min_needed = X_explain_mat.shape[1] + 2
+        nsample_eff = max(min_needed, min(max_nsamples,
+                                          X_explain_mat.shape[0] * X_explain_mat.shape[1]))
+        shap_values = explainer.shap_values(
+            X_explain_mat, nsamples=nsample_eff)
+        bg_pred = self._ft_shap_predict_wrapper(background_mat)
+        bg_pred = np.asarray(bg_pred, dtype=np.float64).reshape(-1)
+        base_value = float(bg_pred.mean())
+
+        self.shap_ft = {
+            "explainer": explainer,
+            "X_explain": X_explain_raw,  # raw-feature DataFrame, used for plotting
+            "shap_values": shap_values,  # numpy: (n_samples, n_features)
+            # "base_value": explainer.expected_value,
+            "base_value": base_value,
+        }
+        return self.shap_ft
+
+    # ========= GLM SHAP =========
+    def compute_shap_glm(self, n_background: int = 500,
+                         n_samples: int = 200,
+                         on_train: bool = True):
+        """
+        Compute SHAP values for the GLM with KernelExplainer.
+        Explanation space: one-hot + standardized features + intercept
+        (matching GLM training).
+        """
+        if getattr(self, "glm_best", None) is None:
+            raise RuntimeError("Run bayesopt_glm() first to train glm_best")
+
+        data = self.train_oht_scl_data if on_train else self.test_oht_scl_data
+        if len(data) == 0:
+            raise ValueError(
+                "compute_shap_glm: the selected dataset is empty; cannot compute SHAP.")
+
+        # Build the design matrix
+        design_all = self._build_glm_design(data)
+
+        # Background sample
+        background_df = design_all.sample(
+            min(len(design_all), n_background),
+            random_state=self.rand_seed
+        )
+        background_np = background_df.to_numpy(dtype=np.float64, copy=True)
+        design_cols = list(design_all.columns)
+
+        def f_predict(x_np: np.ndarray) -> np.ndarray:
+            # Back to a DataFrame to preserve column order and the intercept
+            x_df = pd.DataFrame(x_np, columns=design_cols)
+            y_pred = self.glm_best.predict(x_df)
+            return np.asarray(y_pred, dtype=np.float64).reshape(-1)
+
+        explainer = shap.KernelExplainer(f_predict, background_np)
+
+        # Samples to explain
+        explain_df = design_all.sample(
+            min(len(design_all), n_samples),
+            random_state=self.rand_seed
+        )
+        explain_np = explain_df.to_numpy(dtype=np.float64, copy=True)
+
+        max_nsamples = 300
+        min_needed = explain_np.shape[1] + 2
+        nsample_eff = max(min_needed, min(max_nsamples,
+                                          explain_np.shape[0] * explain_np.shape[1]))
+        shap_values = explainer.shap_values(explain_np, nsamples=nsample_eff)
+        bg_pred = f_predict(background_np)
+        base_value = float(np.asarray(bg_pred, dtype=np.float64).mean())
+
+        self.shap_glm = {
+            "explainer": explainer,
+            "X_explain": explain_df,  # const + features, used for plotting
+            "shap_values": shap_values,
+            "base_value": base_value,
+            "design_columns": design_cols
+        }
+        return self.shap_glm
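+
+    # --- Editor's example (not part of the package): hypothetical end-to-end
+    # explainability pass once the models are trained or loaded:
+    #     bom.load_model()
+    #     for fn in (bom.compute_shap_glm, bom.compute_shap_resn, bom.compute_shap_ft):
+    #         res = fn(n_background=100, n_samples=50, on_train=False)
+    #         print(res['base_value'])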