ins_pricing-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,2372 @@
+ # Moving data between the CPU and GPU carries a sizeable overhead, but with
+ # multiple CUDA streams the transfers can overlap with computation, which
+ # makes larger datasets workable.
+
+ import copy
+ import gc
+ import math
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import joblib
+ import matplotlib.pyplot as plt
+ import numpy as np  # 1.26.2
+ import optuna  # 4.3.0
+ import pandas as pd  # 2.2.3
+ import shap
+
+ import torch  # version: 1.10.1+cu111
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import xgboost as xgb  # 1.7.0
+
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
+ from torch.cuda.amp import autocast, GradScaler
+ from torch.nn.utils import clip_grad_norm_
+ from sklearn.model_selection import ShuffleSplit, cross_val_score  # 1.2.2
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import make_scorer, mean_tweedie_deviance
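The opening comment alludes to overlapping host-to-device transfers with computation. As a minimal editorial sketch of that idea (not part of the package; `model` and `batches` are hypothetical stand-ins, and `batches` is assumed to yield pinned CPU tensors):

    copy_stream = torch.cuda.Stream()
    for batch_cpu in batches:
        with torch.cuda.stream(copy_stream):
            # pinned memory + non_blocking=True makes this copy asynchronous
            batch_gpu = batch_cpu.to('cuda', non_blocking=True)
        # the compute stream waits for the copy before touching the batch
        torch.cuda.current_stream().wait_stream(copy_stream)
        out = model(batch_gpu)  # compute can overlap the next batch's transfer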
+
+ # =============================================================================
+ # Constants & utilities
+ # =============================================================================
+ EPS = 1e-8
+
+
+ def ensure_parent_dir(file_path: str) -> None:
+     # Create the target file's parent directory if it does not exist yet.
+     directory = os.path.dirname(file_path)
+     if directory:
+         os.makedirs(directory, exist_ok=True)
+
+
+ def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
+     # Estimate a batch size from the learning rate and sample count, then
+     # clamp the estimate to the range [1, data_size].
+     estimated = int((learning_rate / 1e-4) ** 0.5 *
+                     (data_size / max(batch_num, 1)))
+     return max(1, min(data_size, max(minimum, estimated)))
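A quick worked example of the heuristic (illustrative numbers): with learning_rate=1e-3, data_size=100_000, batch_num=100 and minimum=64,

    estimated = int((1e-3 / 1e-4) ** 0.5 * (100_000 / 100))  # int(sqrt(10) * 1000) = 3162
    # max(1, min(100_000, max(64, 3162))) -> 3162

so a higher learning rate yields a larger batch, and the floor of 64 only binds for small datasets.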
+
+
+ # Tweedie deviance loss implemented for the PyTorch side.
+ # Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances
+
+
+ def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
+     # Clamp predictions to be strictly positive for numerical stability.
+     pred_clamped = torch.clamp(pred, min=eps)
+     # Assemble the pieces of the Tweedie deviance.
+     if p == 1:
+         # Poisson case: 2 * (y * log(y / mu) - y + mu).
+         term1 = target * torch.log(target / pred_clamped + eps)
+         term2 = target - pred_clamped
+         term3 = 0
+     elif p == 0:
+         # Gaussian case.
+         term1 = 0.5 * torch.pow(target - pred_clamped, 2)
+         term2 = 0
+         term3 = 0
+     elif p == 2:
+         # Gamma case.
+         term1 = torch.log(pred_clamped / target + eps)
+         term2 = -target / pred_clamped + 1
+         term3 = 0
+     else:
+         term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
+         term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
+         term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
+     # Tweedie negative log-likelihood (constant terms dropped).
+     return torch.nan_to_num(2 * (term1 - term2 + term3), nan=eps, posinf=max_clip, neginf=-max_clip)
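For reference, the general branch implements the unit Tweedie deviance from the scikit-learn page linked above:

    d_p(y, \hat{y}) = 2\left(\frac{y^{2-p}}{(1-p)(2-p)} - \frac{y\,\hat{y}^{1-p}}{1-p} + \frac{\hat{y}^{2-p}}{2-p}\right)

whose limits are exactly the three explicit branches: d_1 = 2(y log(y/ŷ) - y + ŷ) (Poisson), d_2 = 2(log(ŷ/y) + y/ŷ - 1) (gamma), and d_0 = (y - ŷ)^2 (squared error).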
+
+ # Helper that frees CUDA memory.
+
+
+ def free_cuda():
+     print(">>> Moving all models to CPU...")
+     for obj in gc.get_objects():
+         try:
+             if hasattr(obj, "to") and callable(obj.to):
+                 # Skip objects such as torch.device that cannot be moved.
+                 obj.to("cpu")
+         except Exception:
+             pass
+
+     print(">>> Deleting tensors, optimizers, dataloaders...")
+     gc.collect()
+
+     print(">>> Emptying CUDA cache...")
+     torch.cuda.empty_cache()
+     torch.cuda.synchronize()
+
+     print(">>> CUDA memory freed.")
+
+
+ # =============================================================================
+ # Plotting helpers
+ # =============================================================================
+
+ # Weighted binning helper.
+
+
+ def split_data(data, col_nme, wgt_nme, n_bins=10):
+     # Sort a copy so the original frame is left untouched.
+     data_sorted = data.sort_values(by=col_nme, ascending=True).copy()
+     data_sorted['cum_weight'] = data_sorted[wgt_nme].cumsum()
+     w_sum = data_sorted[wgt_nme].sum()
+     if w_sum <= EPS:
+         data_sorted.loc[:, 'bins'] = 0
+     else:
+         data_sorted.loc[:, 'bins'] = np.floor(
+             data_sorted['cum_weight'] * float(n_bins) / w_sum
+         )
+     data_sorted.loc[(data_sorted['bins'] == n_bins), 'bins'] = n_bins - 1
+     return data_sorted.groupby(['bins'], observed=True).sum(numeric_only=True)
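A small usage sketch (editorial; the column names are made up): bin a score into ten weighted deciles and aggregate every numeric column per bin.

    df = pd.DataFrame({'score': np.random.rand(1000),
                       'exposure': np.random.rand(1000) + 0.5})
    binned = split_data(df, col_nme='score', wgt_nme='exposure', n_bins=10)
    # binned is indexed by bin id 0..9; each bin holds roughly 10% of total
    # exposure, with 'score' and 'exposure' summed within the bin.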
+
+ # Lift-chart plotting helper.
+
+
+ def plot_lift_list(pred_model, w_pred_list, w_act_list,
+                    weight_list, tgt_nme, n_bins=10,
+                    fig_nme='Lift Chart'):
+     lift_data = pd.DataFrame()
+     lift_data.loc[:, 'pred'] = pred_model
+     lift_data.loc[:, 'w_pred'] = w_pred_list
+     lift_data.loc[:, 'act'] = w_act_list
+     lift_data.loc[:, 'weight'] = weight_list
+     plot_data = split_data(lift_data, 'pred', 'weight', n_bins)
+     plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
+     plot_data['act_v'] = plot_data['act'] / plot_data['weight']
+     plot_data.reset_index(inplace=True)
+     fig = plt.figure(figsize=(7, 5))
+     ax = fig.add_subplot(111)
+     ax.plot(plot_data.index, plot_data['act_v'],
+             label='Actual', color='red')
+     ax.plot(plot_data.index, plot_data['exp_v'],
+             label='Predicted', color='blue')
+     ax.set_title(
+         'Lift Chart of %s' % tgt_nme, fontsize=8)
+     plt.xticks(plot_data.index,
+                plot_data.index,
+                rotation=90, fontsize=6)
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper left',
+                fontsize=5, frameon=False)
+     plt.margins(0.05)
+     ax2 = ax.twinx()
+     ax2.bar(plot_data.index, plot_data['weight'],
+             alpha=0.5, color='seagreen',
+             label='Earned Exposure')
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper right',
+                fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'05_{tgt_nme}_{fig_nme}.png')
+     ensure_parent_dir(save_path)
+     plt.savefig(save_path, dpi=300)
+     plt.close(fig)
+
+ # Double-lift-chart plotting helper.
+
+
+ def plot_dlift_list(pred_model_1, pred_model_2,
+                     model_nme_1, model_nme_2,
+                     tgt_nme,
+                     w_list, w_act_list, n_bins=10,
+                     fig_nme='Double Lift Chart'):
+     lift_data = pd.DataFrame()
+     lift_data.loc[:, 'pred1'] = pred_model_1
+     lift_data.loc[:, 'pred2'] = pred_model_2
+     lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+     lift_data.loc[:, 'act'] = w_act_list
+     lift_data.loc[:, 'weight'] = w_list
+     lift_data.loc[:, 'w_pred1'] = lift_data['pred1'] * lift_data['weight']
+     lift_data.loc[:, 'w_pred2'] = lift_data['pred2'] * lift_data['weight']
+     plot_data = split_data(lift_data, 'diff_ly', 'weight', n_bins)
+     plot_data['exp_v1'] = plot_data['w_pred1'] / plot_data['act']
+     plot_data['exp_v2'] = plot_data['w_pred2'] / plot_data['act']
+     plot_data['act_v'] = plot_data['act'] / plot_data['act']
+     plot_data.reset_index(inplace=True)
+     fig = plt.figure(figsize=(7, 5))
+     ax = fig.add_subplot(111)
+     ax.plot(plot_data.index, plot_data['act_v'],
+             label='Actual', color='red')
+     ax.plot(plot_data.index, plot_data['exp_v1'],
+             label=model_nme_1, color='blue')
+     ax.plot(plot_data.index, plot_data['exp_v2'],
+             label=model_nme_2, color='black')
+     ax.set_title(
+         'Double Lift Chart of %s' % tgt_nme, fontsize=8)
+     plt.xticks(plot_data.index,
+                plot_data.index,
+                rotation=90, fontsize=6)
+     plt.xlabel('%s / %s' % (model_nme_1, model_nme_2), fontsize=6)
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper left',
+                fontsize=5, frameon=False)
+     plt.margins(0.1)
+     plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
+     ax2 = ax.twinx()
+     ax2.bar(plot_data.index, plot_data['weight'],
+             alpha=0.5, color='seagreen',
+             label='Earned Exposure')
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper right',
+                fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'06_{tgt_nme}_{fig_nme}.png')
+     ensure_parent_dir(save_path)
+     plt.savefig(save_path, dpi=300)
+     plt.close(fig)
+
+
+ # =============================================================================
+ # ResNet model & sklearn-style wrapper
+ # =============================================================================
+
+ # ResNet model structure starts here.
+ # Residual block: two linear layers + ReLU + a residual connection.
+ # ResBlock subclasses nn.Module.
+ class ResBlock(nn.Module):
+     def __init__(self, dim: int, dropout: float = 0.1,
+                  use_layernorm: bool = False, residual_scale: float = 0.1
+                  ):
+         super().__init__()
+         self.use_layernorm = use_layernorm
+
+         if use_layernorm:
+             Norm = nn.LayerNorm  # normalises over the last dimension
+         else:
+             def Norm(d): return nn.BatchNorm1d(d)  # keep the switch so BN can be tried too
+
+         self.norm1 = Norm(dim)
+         self.fc1 = nn.Linear(dim, dim, bias=True)
+         self.act = nn.ReLU(inplace=True)
+         self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+         self.norm2 = Norm(dim)
+         self.fc2 = nn.Linear(dim, dim, bias=True)
+
+         # Residual scaling keeps the trunk from blowing up early in training.
+         self.res_scale = nn.Parameter(
+             torch.tensor(residual_scale, dtype=torch.float32)
+         )
+
+     def forward(self, x):
+         # Pre-activation layout.
+         out = self.norm1(x)
+         out = self.fc1(out)
+         out = self.act(out)
+         out = self.dropout(out)
+         out = self.norm2(out)
+         out = self.fc2(out)
+         # Scale the residual branch, then add.
+         return F.relu(x + self.res_scale * out)
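In symbols, the block computes ReLU(x + s · f(x)), where f is Norm → Linear → ReLU → Dropout → Norm → Linear and s is the learnable residual_scale. A one-line editorial smoke test:

    out = ResBlock(dim=16, use_layernorm=True)(torch.randn(8, 16))  # shape stays (8, 16)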
+
+ # ResNetSequential subclasses nn.Module and defines the whole network.
+
+
+ class ResNetSequential(nn.Module):
+     # Input tensor shape: (batch, input_dim).
+     # Structure: linear + norm + ReLU, then a stack of residual blocks,
+     # finished with a Softplus output head.
+
+     def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSequential, self).__init__()
+
+         self.net = nn.Sequential()
+         self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
+
+         if use_layernorm:
+             self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
+         else:
+             self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
+
+         self.net.add_module('relu1', nn.ReLU(inplace=True))
+
+         # Stack of residual blocks.
+         for i in range(block_num):
+             self.net.add_module(
+                 f'ResBlk_{i+1}',
+                 ResBlock(
+                     hidden_dim,
+                     dropout=dropout,
+                     use_layernorm=use_layernorm,
+                     residual_scale=residual_scale)
+             )
+
+         self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
+         self.net.add_module('softplus', nn.Softplus())
+
+     def forward(self, x):
+         return self.net(x)
+
+ # Scikit-learn-style interface for the ResNet model.
+
+
+ class ResNetSklearn(nn.Module):
+     def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
+                  block_num: int = 2, batch_num: int = 100, epochs: int = 100,
+                  tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSklearn, self).__init__()
+
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.block_num = block_num
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.model_nme = model_nme
+         self.learning_rate = learning_rate
+         self.patience = patience
+         self.use_layernorm = use_layernorm
+         self.dropout = dropout
+         self.residual_scale = residual_scale
+
+         # Device preference: cuda > mps > cpu.
+         if torch.cuda.is_available():
+             self.device = torch.device('cuda')
+         elif torch.backends.mps.is_available():
+             self.device = torch.device('mps')
+         else:
+             self.device = torch.device('cpu')
+
+         # Tweedie power by model-name convention ('f' -> 1, 's' -> 2).
+         if 'f' in self.model_nme:
+             self.tw_power = 1
+         elif 's' in self.model_nme:
+             self.tw_power = 2
+         else:
+             self.tw_power = tweedie_power
+
+         # Build the network.
+         self.resnet = ResNetSequential(
+             self.input_dim,
+             self.hidden_dim,
+             self.block_num,
+             use_layernorm=self.use_layernorm,
+             dropout=self.dropout,
+             residual_scale=self.residual_scale
+         ).to(self.device)
+
+     def forward(self, x):
+         # forward is overridden so it also handles SHAP inputs:
+         # SHAP (KernelExplainer) passes in a NumPy array.
+         if isinstance(x, np.ndarray):
+             # 1. Build a tensor from the NumPy array.
+             x_tensor = torch.tensor(x, dtype=torch.float32)
+         else:
+             # 2. Stay compatible with plain tensor inputs.
+             x_tensor = x
+
+         # 3. Make sure the input tensor sits on the right device.
+         x_tensor = x_tensor.to(self.device)
+
+         # 4. Predict with the underlying ResNet;
+         #    self.resnet was already moved to self.device at init.
+         y_pred = self.resnet(x_tensor)
+         return y_pred
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # === 1. Training set: keep it on the CPU and let the DataLoader
+         #        ship batches to the GPU ===
+         # Note: converting from a pandas DataFrame copies the data, so later
+         # view modifications cannot leak through.
+         X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
+         y_tensor = torch.tensor(
+             y_train.values, dtype=torch.float32).view(-1, 1)
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values, dtype=torch.float32).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # === 2. Validation set: build on the CPU, move to the target device in one go later ===
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
+             y_val_tensor = torch.tensor(
+                 y_val.values, dtype=torch.float32).view(-1, 1)
+             if w_val is not None:
+                 w_val_tensor = torch.tensor(
+                     w_val.values, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+         else:
+             X_val_tensor = y_val_tensor = w_val_tensor = None
+
+         # === 3. Build the DataLoader ===
+         dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
+         batch_size = compute_batch_size(
+             data_size=len(dataset),
+             learning_rate=self.learning_rate,
+             batch_num=self.batch_num,
+             minimum=64
+         )
+         N = X_tensor.shape[0]
+
+         if self.device.type == 'cuda':
+             if N > 200_000:
+                 base_bs = 4096
+             elif N > 50_000:
+                 base_bs = 2048
+             else:
+                 base_bs = 1024
+         else:
+             if N > 50_000:
+                 base_bs = 1024
+             else:
+                 base_bs = 512
+
+         batch_size = min(batch_size, base_bs)
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,  # tabular data rarely needs more than 0-1 workers
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # === 4. Optimizer & AMP ===
+         # Adam + AMP keeps the loss stable without giving up GPU throughput.
+         self.optimizer = torch.optim.Adam(
+             self.resnet.parameters(), lr=self.learning_rate)
+         self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # === 5. Early stopping ===
+         best_loss, patience_counter = float('inf'), 0
+         best_model_state = None
+
+         # Move the validation set to the target device once, if present.
+         if has_val:
+             X_val_dev = X_val_tensor.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # === 6. Training loop ===
+         for epoch in range(1, self.epochs + 1):
+             self.resnet.train()
+             for X_batch, y_batch, w_batch in dataloader:
+                 self.optimizer.zero_grad()
+
+                 X_batch = X_batch.to(self.device, non_blocking=True)
+                 y_batch = y_batch.to(self.device, non_blocking=True)
+                 w_batch = w_batch.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.resnet(X_batch)
+                     y_pred = torch.clamp(y_pred, min=1e-6)
+
+                     losses = tweedie_loss(
+                         y_pred, y_batch, p=self.tw_power).view(-1)
+                     weighted_loss = (losses * w_batch.view(-1)
+                                      ).sum() / torch.clamp(w_batch.sum(), min=EPS)
+
+                 self.scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     self.scaler.unscale_(self.optimizer)
+                     clip_grad_norm_(self.resnet.parameters(), max_norm=1.0)
+
+                 self.scaler.step(self.optimizer)
+                 self.scaler.update()
+
+             # === 7. Validation loss & early-stopping check ===
+             if has_val:
+                 self.resnet.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.resnet(X_val_dev)
+                     y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+
+                     val_loss_values = tweedie_loss(
+                         y_val_pred, y_val_dev, p=self.tw_power
+                     ).view(-1)
+                     val_weighted_loss = (
+                         val_loss_values * w_val_dev.view(-1)
+                     ).sum() / torch.clamp(w_val_dev.sum(), min=EPS)
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.resnet.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.resnet.load_state_dict(best_model_state)
+                     break
+         if has_val and best_model_state is not None:
+             self.resnet.load_state_dict(best_model_state)
+
+     # ---------------- Prediction ----------------
+
+     def predict(self, X_test):
+         self.resnet.eval()
+         # Convert a DataFrame input to a NumPy array first.
+         if isinstance(X_test, pd.DataFrame):
+             X_np = X_test.values.astype(np.float32)
+         else:
+             X_np = X_test
+
+         with torch.no_grad():
+             # Calling self (the ResNetSklearn instance) triggers forward.
+             y_pred = self(X_np).cpu().numpy()
+         y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.flatten()
+
+     # ---------------- Parameter setting ----------------
+
+     def set_params(self, params):
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+
+ # =============================================================================
+ # FT-Transformer model & sklearn-style wrapper
+ # =============================================================================
+ # FT-Transformer model structure starts here.
+
+
+ class FeatureTokenizer(nn.Module):
+     # Maps numeric and categorical features to tokens; output shape
+     # (batch, n_tokens, d_model).
+     # Conventions:
+     # - X_num holds the numeric features, shape (batch, num_numeric)
+     # - X_cat holds the categorical features, shape (batch, num_categorical),
+     #   each column an encoded integer label in [0, card-1]
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int):
+         super().__init__()
+
+         self.num_numeric = num_numeric
+         self.has_numeric = num_numeric > 0
+
+         if self.has_numeric:
+             self.num_linear = nn.Linear(num_numeric, d_model)
+
+         self.embeddings = nn.ModuleList([
+             nn.Embedding(card, d_model) for card in cat_cardinalities
+         ])
+
+     def forward(self, X_num, X_cat):
+         tokens = []
+
+         if self.has_numeric:
+             # All numeric features map to a single token.
+             num_token = self.num_linear(X_num)  # shape (batch, d_model)
+             tokens.append(num_token)
+
+         # Each categorical feature yields one embedding token.
+         for i, emb in enumerate(self.embeddings):
+             tok = emb(X_cat[:, i])  # shape (batch, d_model)
+             tokens.append(tok)
+
+         # Stack into (batch, n_tokens, d_model).
+         x = torch.stack(tokens, dim=1)
+         return x
+
+ # Encoder layer with residual scaling.
+
+
+ class ScaledTransformerEncoderLayer(nn.Module):
+     def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048,
+                  dropout: float = 0.1, residual_scale_attn: float = 1.0,
+                  residual_scale_ffn: float = 1.0, norm_first: bool = True,
+                  ):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(
+             embed_dim=d_model,
+             num_heads=nhead,
+             dropout=dropout,
+             batch_first=True
+         )
+
+         # Feed-forward part.
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         # Normalisation & dropout.
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = nn.GELU()
+         # self.activation = nn.ReLU()
+         self.norm_first = norm_first
+
+         # Residual scaling coefficients.
+         self.res_scale_attn = residual_scale_attn
+         self.res_scale_ffn = residual_scale_ffn
+
+     def forward(self, src, src_mask=None, src_key_padding_mask=None):
+         # Input tensor shape: (batch, seq_len, d_model).
+         x = src
+
+         if self.norm_first:
+             # Pre-norm: normalise before attention.
+             x = x + self._sa_block(self.norm1(x), src_mask,
+                                    src_key_padding_mask)
+             x = x + self._ff_block(self.norm2(x))
+         else:
+             # Post-norm (normally not enabled here).
+             x = self.norm1(
+                 x + self._sa_block(x, src_mask, src_key_padding_mask))
+             x = self.norm2(x + self._ff_block(x))
+
+         return x
+
+     def _sa_block(self, x, attn_mask, key_padding_mask):
+         # Self-attention with residual scaling applied.
+         attn_out, _ = self.self_attn(
+             x, x, x,
+             attn_mask=attn_mask,
+             key_padding_mask=key_padding_mask,
+             need_weights=False
+         )
+         return self.res_scale_attn * self.dropout1(attn_out)
+
+     def _ff_block(self, x):
+         # Feed-forward with residual scaling applied.
+         x2 = self.linear2(self.dropout(self.activation(self.linear1(x))))
+         return self.res_scale_ffn * self.dropout2(x2)
+
+ # FT-Transformer core model.
+
+
+ class FTTransformerCore(nn.Module):
+     # A minimal workable FT-Transformer:
+     # - FeatureTokenizer: turns numeric and categorical features into tokens
+     # - TransformerEncoder: captures interactions between features
+     # - pooling + MLP + Softplus: keeps the output positive (fits Tweedie/Gamma)
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int = 64,
+                  n_heads: int = 8, n_layers: int = 4, dropout: float = 0.1,
+                  ):
+         super().__init__()
+
+         self.tokenizer = FeatureTokenizer(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=d_model
+         )
+         scale = 1.0 / math.sqrt(n_layers)  # a reasonable default
+         encoder_layer = ScaledTransformerEncoderLayer(
+             d_model=d_model,
+             nhead=n_heads,
+             dim_feedforward=d_model * 4,
+             dropout=dropout,
+             residual_scale_attn=scale,
+             residual_scale_ffn=scale,
+             norm_first=True,
+         )
+         self.encoder = nn.TransformerEncoder(
+             encoder_layer,
+             num_layers=n_layers
+         )
+         self.n_layers = n_layers
+
+         self.head = nn.Sequential(
+             nn.LayerNorm(d_model),
+             nn.Linear(d_model, d_model),
+             nn.GELU(),
+             # nn.ReLU(),
+             nn.Linear(d_model, 1),
+             nn.Softplus()  # keeps the output positive; suits Tweedie / Gamma
+         )
+
+     def forward(self, X_num, X_cat):
+
+         # X_num: (batch, n_numeric), float32
+         # X_cat: (batch, n_categorical), long
+
+         tokens = self.tokenizer(X_num, X_cat)  # (batch, n_tokens, d_model)
+         x = self.encoder(tokens)               # (batch, n_tokens, d_model)
+
+         # Mean-pool over the tokens.
+         x = x.mean(dim=1)                      # (batch, d_model)
+
+         out = self.head(x)                     # (batch, 1), positive via Softplus
+         return out
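A shape walk-through (editorial sketch, under the torch version pinned in the imports): with 3 numeric features and two categorical features of cardinality 5 and 12, the tokenizer emits 1 + 2 = 3 tokens, which the head pools into one positive value per row.

    core = FTTransformerCore(num_numeric=3, cat_cardinalities=[5, 12], d_model=64)
    x_num = torch.randn(32, 3)
    x_cat = torch.stack([torch.randint(0, 5, (32,)),
                         torch.randint(0, 12, (32,))], dim=1)
    core(x_num, x_cat).shape  # torch.Size([32, 1])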
+
+ # Dataset wrapper for tabular tensors.
+
+
+ class TabularDataset(Dataset):
+     def __init__(self, X_num, X_cat, y, w):
+
+         # X_num: torch.float32, shape (N, n_numeric)
+         # X_cat: torch.long, shape (N, n_categorical)
+         # y: torch.float32, shape (N, 1)
+         # w: torch.float32, shape (N, 1)
+
+         self.X_num = X_num
+         self.X_cat = X_cat
+         self.y = y
+         self.w = w
+
+     def __len__(self):
+         return self.y.shape[0]
+
+     def __getitem__(self, idx):
+         return (
+             self.X_num[idx],
+             self.X_cat[idx],
+             self.y[idx],
+             self.w[idx],
+         )
+
+ # Scikit-learn-style interface for the FT-Transformer.
+
+
+ class FTTransformerSklearn(nn.Module):
+
+     # sklearn-style wrapper:
+     # - num_cols: list of numeric feature column names
+     # - cat_cols: list of categorical feature column names (encoded to
+     #   integer labels in [0, n_classes-1] internally; see _encode_cats)
+
+     def __init__(self, model_nme: str, num_cols, cat_cols, d_model: int = 64, n_heads: int = 8,
+                  n_layers: int = 4, dropout: float = 0.1, batch_num: int = 100, epochs: int = 100,
+                  tweedie_power: float = 1.5, learning_rate: float = 1e-3, patience: int = 10,
+                  ):
+         super().__init__()
+
+         self.model_nme = model_nme
+         self.num_cols = list(num_cols)
+         self.cat_cols = list(cat_cols)
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.dropout = dropout
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.patience = patience
+         if 'f' in self.model_nme:
+             self.tw_power = 1.0
+         elif 's' in self.model_nme:
+             self.tw_power = 2.0
+         else:
+             self.tw_power = tweedie_power
+         if torch.cuda.is_available():
+             self.device = torch.device("cuda")
+         elif torch.backends.mps.is_available():
+             self.device = torch.device("mps")
+         else:
+             self.device = torch.device("cpu")
+         self.cat_cardinalities = None
+         self.cat_categories = {}
+         self.ft = None
+
+     def _build_model(self, X_train):
+         num_numeric = len(self.num_cols)
+         cat_cardinalities = []
+
+         for col in self.cat_cols:
+             cats = X_train[col].astype('category')
+             categories = cats.cat.categories
+             self.cat_categories[col] = categories  # remember the full training-set category list
+
+             card = len(categories) + 1  # reserve one extra class for unknown/missing
+             cat_cardinalities.append(card)
+
+         self.cat_cardinalities = cat_cardinalities
+
+         self.ft = FTTransformerCore(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=self.d_model,
+             n_heads=self.n_heads,
+             n_layers=self.n_layers,
+             dropout=self.dropout,
+         ).to(self.device)
+
+     def _encode_cats(self, X):
+         # The input DataFrame must contain at least every categorical column.
+         # Returns an int64 array of shape (N, n_categorical).
+
+         if not self.cat_cols:
+             return np.zeros((len(X), 0), dtype='int64')
+
+         X_cat_list = []
+         for col in self.cat_cols:
+             # Use the category universe recorded during training.
+             categories = self.cat_categories[col]
+             # Build a Categorical over the fixed categories.
+             cats = pd.Categorical(X[col], categories=categories)
+             codes = cats.codes.astype('int64', copy=True)  # -1 marks unknown or missing
+             # Map unknown/missing to the extra "unknown" index len(categories).
+             codes[codes < 0] = len(categories)
+             X_cat_list.append(codes)
+
+         X_cat_np = np.stack(X_cat_list, axis=1)  # shape (N, n_categorical)
+         return X_cat_np
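To make the unknown-category handling concrete, a toy editorial example: values recorded at fit time keep their codes, while unseen or missing values map to the reserved extra index.

    categories = pd.Index(['A', 'B', 'C'])   # as saved by _build_model
    codes = pd.Categorical(['B', 'D', None], categories=categories).codes.astype('int64')
    # codes == array([ 1, -1, -1]); -1 marks the unseen 'D' and the missing value
    codes[codes < 0] = len(categories)       # -> array([1, 3, 3])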
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # Build the underlying model on the first fit.
+         if self.ft is None:
+             self._build_model(X_train)
+
+         # --- Build the training tensors (all on CPU; batches moved later) ---
+         # Copy so we are decoupled from the source DataFrame: scaling or
+         # sampling elsewhere cannot contaminate the original data.
+         X_num_train = X_train[self.num_cols].to_numpy(
+             dtype=np.float32, copy=True)
+         X_num_train = torch.tensor(
+             X_num_train,
+             dtype=torch.float32
+         )
+
+         if self.cat_cols:
+             X_cat_train_np = self._encode_cats(X_train)
+             X_cat_train = torch.tensor(X_cat_train_np, dtype=torch.long)
+         else:
+             X_cat_train = torch.zeros(
+                 (X_num_train.shape[0], 0), dtype=torch.long)
+
+         y_tensor = torch.tensor(
+             y_train.values,
+             dtype=torch.float32
+         ).view(-1, 1)
+
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values,
+                 dtype=torch.float32
+             ).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # --- Validation tensors (moved to the target device in one go) ---
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             # ---------- Numeric features ----------
+             X_num_val_np = X_val[self.num_cols].to_numpy(
+                 dtype=np.float32, copy=True)
+             X_num_val = torch.tensor(X_num_val_np, dtype=torch.float32)
+
+             # ---------- Categorical features ----------
+             if self.cat_cols:
+                 X_cat_val_np = self._encode_cats(X_val)
+                 X_cat_val = torch.tensor(X_cat_val_np, dtype=torch.long)
+             else:
+                 X_cat_val = torch.zeros(
+                     (X_num_val.shape[0], 0), dtype=torch.long)
+
+             # ---------- Target & weights ----------
+             y_val_np = y_val.values.astype(np.float32, copy=True)
+             y_val_tensor = torch.tensor(
+                 y_val_np, dtype=torch.float32).view(-1, 1)
+
+             if w_val is not None:
+                 w_val_np = w_val.values.astype(np.float32, copy=True)
+                 w_val_tensor = torch.tensor(
+                     w_val_np, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+
+         else:
+             X_num_val = X_cat_val = y_val_tensor = w_val_tensor = None
+
+         # --- Build the DataLoader ---
+         dataset = TabularDataset(
+             X_num_train, X_cat_train, y_tensor, w_tensor
+         )
+
+         batch_size = compute_batch_size(
+             data_size=len(dataset),
+             learning_rate=self.learning_rate,
+             batch_num=self.batch_num,
+             minimum=64
+         )
+         N = X_num_train.shape[0]
+         if self.device.type == 'cuda':
+             if N > 200_000:
+                 base_bs = 32768
+             elif N > 50_000:
+                 base_bs = 16384
+             else:
+                 base_bs = 8192
+         else:
+             if N > 50_000:
+                 base_bs = 16384
+             else:
+                 base_bs = 8192
+         batch_size = min(batch_size, base_bs)
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # --- Optimizer & AMP ---
+         # Same reasoning as for the ResNet: Adam + AMP avoids numerical
+         # instability without giving up GPU throughput.
+         optimizer = torch.optim.Adam(
+             self.ft.parameters(),
+             lr=self.learning_rate
+         )
+         scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # --- Early stopping ---
+         best_loss = float('inf')
+         patience_counter = 0
+         best_model_state = None
+
+         # Move the whole validation set to the target device, if present.
+         if has_val:
+             X_num_val_dev = X_num_val.to(self.device, non_blocking=True)
+             X_cat_val_dev = X_cat_val.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # --- Training loop ---
+         for epoch in range(1, self.epochs + 1):
+             self.ft.train()
+             for X_num_b, X_cat_b, y_b, w_b in dataloader:
+                 optimizer.zero_grad()
+
+                 X_num_b = X_num_b.to(self.device, non_blocking=True)
+                 X_cat_b = X_cat_b.to(self.device, non_blocking=True)
+                 y_b = y_b.to(self.device, non_blocking=True)
+                 w_b = w_b.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.ft(X_num_b, X_cat_b)
+                     y_pred = torch.clamp(y_pred, min=1e-6)
+
+                     losses = tweedie_loss(
+                         y_pred, y_b, p=self.tw_power
+                     ).view(-1)
+
+                     weighted_loss = (losses * w_b.view(-1)).sum() / \
+                         torch.clamp(w_b.sum(), min=EPS)
+
+                 scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     scaler.unscale_(optimizer)
+                     clip_grad_norm_(self.ft.parameters(), max_norm=1.0)
+
+                 scaler.step(optimizer)
+                 scaler.update()
+
+             # --- Validation & early-stopping check ---
+             if has_val:
+                 self.ft.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.ft(X_num_val_dev, X_cat_val_dev)
+                     y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+
+                     val_losses = tweedie_loss(
+                         y_val_pred, y_val_dev, p=self.tw_power
+                     ).view(-1)
+
+                     val_weighted_loss = (
+                         val_losses * w_val_dev.view(-1)
+                     ).sum() / torch.clamp(w_val_dev.sum(), min=EPS)
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.ft.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.ft.load_state_dict(best_model_state)
+                     break
+         if has_val and best_model_state is not None:
+             self.ft.load_state_dict(best_model_state)
+
+     def predict(self, X_test):
+         # X_test must contain every numeric and categorical column.
+
+         self.ft.eval()
+         X_num = X_test[self.num_cols].to_numpy(dtype=np.float32, copy=True)
+         X_num = torch.tensor(
+             X_num,
+             dtype=torch.float32
+         )
+         if self.cat_cols:
+             X_cat_np = self._encode_cats(X_test)
+             X_cat = torch.tensor(X_cat_np, dtype=torch.long)
+         else:
+             X_cat = torch.zeros((X_num.size(0), 0), dtype=torch.long)
+
+         with torch.no_grad():
+             X_num = X_num.to(self.device, non_blocking=True)
+             X_cat = X_cat.to(self.device, non_blocking=True)
+             y_pred = self.ft(X_num, X_cat).cpu().numpy()
+
+         y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.ravel()
+
+     def set_params(self, params: dict):
+
+         # Mirrors the sklearn convention.
+         # Note: structural parameters (e.g. d_model/n_heads) only take
+         # effect after the model is fitted again.
+
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         return self
+
+
+ # ===== Basic building blocks & training wrappers ============================
+
+ # =============================================================================
+ # Config, preprocessing, and trainer base
+ # =============================================================================
+ @dataclass
+ class BayesOptConfig:
+     model_nme: str
+     resp_nme: str
+     weight_nme: str
+     factor_nmes: List[str]
+     cate_list: Optional[List[str]] = None
+     prop_test: float = 0.25
+     rand_seed: Optional[int] = None
+     epochs: int = 100
+     use_gpu: bool = True
+
+
+ class OutputManager:
+     # Centralises the output paths for results, plots and models.
+
+     def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
+         self.root = Path(root or os.getcwd())
+         self.model_name = model_name
+         self.plot_dir = self.root / 'plot'
+         self.result_dir = self.root / 'Results'
+         self.model_dir = self.root / 'model'
+
+     def _prepare(self, path: Path) -> str:
+         ensure_parent_dir(str(path))
+         return str(path)
+
+     def plot_path(self, filename: str) -> str:
+         return self._prepare(self.plot_dir / filename)
+
+     def result_path(self, filename: str) -> str:
+         return self._prepare(self.result_dir / filename)
+
+     def model_path(self, filename: str) -> str:
+         return self._prepare(self.model_dir / filename)
+
+
+ class DatasetPreprocessor:
+     # Prepares the shared train/test data views used by every trainer.
+
+     def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
+                  config: BayesOptConfig) -> None:
+         self.config = config
+         self.train_data = train_df.copy(deep=True)
+         self.test_data = test_df.copy(deep=True)
+         self.num_features: List[str] = []
+         self.train_oht_scl_data: Optional[pd.DataFrame] = None
+         self.test_oht_scl_data: Optional[pd.DataFrame] = None
+         self.var_nmes: List[str] = []
+         self.cat_categories_for_shap: Dict[str, List[Any]] = {}
+
+     def run(self) -> "DatasetPreprocessor":
+         cfg = self.config
+         # Precompute the weighted actuals; plotting and validation both
+         # depend on this field later.
+         self.train_data.loc[:, 'w_act'] = self.train_data[cfg.resp_nme] * \
+             self.train_data[cfg.weight_nme]
+         self.test_data.loc[:, 'w_act'] = self.test_data[cfg.resp_nme] * \
+             self.test_data[cfg.weight_nme]
+         # Clip the response at a high quantile (the 99.9th percentile) to
+         # absorb outliers; without it extreme points dominate the loss.
+         q999 = self.train_data[cfg.resp_nme].quantile(0.999)
+         self.train_data[cfg.resp_nme] = self.train_data[cfg.resp_nme].clip(
+             upper=q999)
+         cate_list = list(cfg.cate_list or [])
+         if cate_list:
+             for cate in cate_list:
+                 self.train_data[cate] = self.train_data[cate].astype(
+                     'category')
+                 self.test_data[cate] = self.test_data[cate].astype('category')
+                 cats = self.train_data[cate].cat.categories
+                 self.cat_categories_for_shap[cate] = list(cats)
+         self.num_features = [
+             nme for nme in cfg.factor_nmes if nme not in cate_list]
+         train_oht = self.train_data[cfg.factor_nmes +
+                                     [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         test_oht = self.test_data[cfg.factor_nmes +
+                                   [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         train_oht = pd.get_dummies(
+             train_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         test_oht = pd.get_dummies(
+             test_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         for num_chr in self.num_features:
+             # Standardise column by column so every feature sits on the same
+             # scale; otherwise the neural networks struggle to converge.
+             scaler = StandardScaler()
+             train_oht[num_chr] = scaler.fit_transform(
+                 train_oht[num_chr].values.reshape(-1, 1))
+             test_oht[num_chr] = scaler.transform(
+                 test_oht[num_chr].values.reshape(-1, 1))
+         # reindex back-fills missing dummy columns with zeros, so the test
+         # set keeps exactly the training columns.
+         test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)
+         self.train_oht_scl_data = train_oht
+         self.test_oht_scl_data = test_oht
+         self.var_nmes = [
+             c for c in train_oht.columns
+             if c not in (cfg.weight_nme, cfg.resp_nme)
+         ]  # order-preserving; a set difference would make column order nondeterministic
+         return self
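The reindex step above is what keeps the train and test one-hot matrices aligned; a toy editorial example:

    train = pd.get_dummies(pd.DataFrame({'c': ['a', 'b', 'b']}),
                           columns=['c'], drop_first=True)   # one column: c_b
    test = pd.get_dummies(pd.DataFrame({'c': ['a', 'a', 'a']}),
                          columns=['c'], drop_first=True)    # no dummy columns at all
    test = test.reindex(columns=train.columns, fill_value=0)  # c_b restored as zeros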
+
+ # =============================================================================
+ # Trainers
+ # =============================================================================
+
+
+ class TrainerBase:
+     def __init__(self, context: "BayesOptModel", label: str) -> None:
+         self.ctx = context
+         self.label = label
+
+     @property
+     def config(self) -> BayesOptConfig:
+         return self.ctx.config
+
+     @property
+     def output(self) -> OutputManager:
+         return self.ctx.output_manager
+
+     def tune(self, max_evals: int) -> None:  # pragma: no cover (overridden by subclasses)
+         raise NotImplementedError
+
+     def train(self) -> None:  # pragma: no cover (overridden by subclasses)
+         raise NotImplementedError
+
+     def save(self) -> None:
+         pass
+
+     def load(self) -> None:
+         pass
+
+
+ class XGBTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'Xgboost')
+         self.model: Optional[xgb.XGBRegressor] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def _build_estimator(self) -> xgb.XGBRegressor:
+         params = dict(
+             objective=self.ctx.obj,
+             random_state=self.ctx.rand_seed,
+             subsample=0.9,
+             tree_method='gpu_hist' if self.ctx.use_gpu else 'hist',
+             enable_categorical=True,
+             predictor='gpu_predictor' if self.ctx.use_gpu else 'cpu_predictor'
+         )
+         if self.ctx.use_gpu:
+             params['gpu_id'] = 0
+         return xgb.XGBRegressor(**params)
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-5, 1e-1, log=True)
+         gamma = trial.suggest_float('gamma', 0, 10000)
+         max_depth = trial.suggest_int('max_depth', 3, 25)
+         n_estimators = trial.suggest_int('n_estimators', 10, 500, step=10)
+         min_child_weight = trial.suggest_int(
+             'min_child_weight', 100, 10000, step=100)
+         reg_alpha = trial.suggest_float('reg_alpha', 1e-10, 1, log=True)
+         reg_lambda = trial.suggest_float('reg_lambda', 1e-10, 1, log=True)
+         if self.ctx.obj == 'reg:tweedie':
+             tweedie_variance_power = trial.suggest_float(
+                 'tweedie_variance_power', 1, 2)
+         elif self.ctx.obj == 'count:poisson':
+             tweedie_variance_power = 1
+         elif self.ctx.obj == 'reg:gamma':
+             tweedie_variance_power = 2
+         else:
+             tweedie_variance_power = 1.5
+         clf = self._build_estimator()
+         params = {
+             'learning_rate': learning_rate,
+             'gamma': gamma,
+             'max_depth': max_depth,
+             'n_estimators': n_estimators,
+             'min_child_weight': min_child_weight,
+             'reg_alpha': reg_alpha,
+             'reg_lambda': reg_lambda
+         }
+         if self.ctx.obj == 'reg:tweedie':
+             params['tweedie_variance_power'] = tweedie_variance_power
+         clf.set_params(**params)
+         n_jobs = 1 if self.ctx.use_gpu else int(1 / self.ctx.prop_test)
+         acc = cross_val_score(
+             clf,
+             self.ctx.train_data[self.ctx.factor_nmes],
+             self.ctx.train_data[self.ctx.resp_nme].values,
+             fit_params=self.ctx.fit_params,
+             cv=self.ctx.cv,
+             scoring=make_scorer(
+                 mean_tweedie_deviance,
+                 power=tweedie_variance_power,
+                 greater_is_better=False),
+             error_score='raise',
+             n_jobs=n_jobs
+         ).mean()
+         return -acc
+
+     def tune(self, max_evals: int = 100) -> None:
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+         )
+         study.optimize(self.cross_val, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_xgb.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError('Run tune() first to obtain the best XGB parameters.')
+         self.model = self._build_estimator()
+         self.model.set_params(**self.best_params)
+         self.model.fit(self.ctx.train_data[self.ctx.factor_nmes],
+                        self.ctx.train_data[self.ctx.resp_nme].values,
+                        **self.ctx.fit_params)
+         self.ctx.model_label += [self.label]
+         self.ctx.train_data['pred_xgb'] = self.model.predict(
+             self.ctx.train_data[self.ctx.factor_nmes])
+         self.ctx.test_data['pred_xgb'] = self.model.predict(
+             self.ctx.test_data[self.ctx.factor_nmes])
+         self.ctx.train_data.loc[:, 'w_pred_xgb'] = self.ctx.train_data['pred_xgb'] * \
+             self.ctx.train_data[self.ctx.weight_nme]
+         self.ctx.test_data.loc[:, 'w_pred_xgb'] = self.ctx.test_data['pred_xgb'] * \
+             self.ctx.test_data[self.ctx.weight_nme]
+         self.ctx.xgb_best = self.model
+
+     def save(self) -> None:
+         if self.model is not None:
+             joblib.dump(self.model, self.output.model_path(
+                 f'01_{self.ctx.model_nme}_Xgboost.pkl'))
+
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_Xgboost.pkl')
+         if os.path.exists(path):
+             self.model = joblib.load(path)
+             self.ctx.xgb_best = self.model
+         else:
+             print(f"[load_model] Warning: Xgboost model file not found: {path}")
+
+
+ class ResNetTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'ResNet')
+         self.model: Optional[ResNetSklearn] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     # ========= Cross-validation (used by BayesOpt) =========
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         """
+         Cross-validate the ResNet.
+         To avoid GPU OOM:
+         - build a fresh ResNetSklearn per fold
+         - when a fold ends, move the model to CPU, delete it, gc, empty_cache
+         - optionally run the BayesOpt stage on a training subsample only
+         """
+
+         # 1. Hyperparameter space (largely the earlier settings).
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-6, 1e-2, log=True
+         )
+         # hidden_dim = trial.suggest_int('hidden_dim', 32, 256, step=32)  # should not be too large
+         hidden_dim = trial.suggest_int('hidden_dim', 8, 32, step=2)
+         block_num = trial.suggest_int('block_num', 2, 10)
+         # batch_num = trial.suggest_int(
+         #     'batch_num',
+         #     10 if self.ctx.obj == 'reg:gamma' else 100,
+         #     100 if self.ctx.obj == 'reg:gamma' else 1000,
+         #     step=10 if self.ctx.obj == 'reg:gamma' else 100
+         # )
+
+         if self.ctx.obj == 'reg:tweedie':
+             tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
+         elif self.ctx.obj == 'count:poisson':
+             tw_power = 1.0
+         elif self.ctx.obj == 'reg:gamma':
+             tw_power = 2.0
+         else:
+             tw_power = 1.5
+
+         loss = 0.0
+
+         # 2. (Optional) run the BayesOpt CV on a subsample to ease GPU
+         #    memory and wall-clock pressure.
+         data_for_cv = self.ctx.train_oht_scl_data
+         max_rows_for_resnet_bo = min(100000, int(
+             len(data_for_cv)/5))  # tune this down for your GPU, e.g. 50_000 on an A30
+         if len(data_for_cv) > max_rows_for_resnet_bo:
+             data_for_cv = data_for_cv.sample(
+                 max_rows_for_resnet_bo,
+                 random_state=self.ctx.rand_seed
+             )
+
+         X_all = data_for_cv[self.ctx.var_nmes]
+         y_all = data_for_cv[self.ctx.resp_nme]
+         w_all = data_for_cv[self.ctx.weight_nme]
+
+         # Use a local ShuffleSplit so indices stay consistent when subsampling.
+         cv_local = ShuffleSplit(
+             n_splits=int(1 / self.ctx.prop_test),
+             test_size=self.ctx.prop_test,
+             random_state=self.ctx.rand_seed
+         )
+
+         for fold, (train_idx, val_idx) in enumerate(cv_local.split(X_all)):
+             X_train_fold = X_all.iloc[train_idx]
+             y_train_fold = y_all.iloc[train_idx]
+             w_train_fold = w_all.iloc[train_idx]
+
+             X_val_fold = X_all.iloc[val_idx]
+             y_val_fold = y_all.iloc[val_idx]
+             w_val_fold = w_all.iloc[val_idx]
+
+             # 3. Build a throwaway ResNet for this fold.
+             cv_net = ResNetSklearn(
+                 model_nme=self.ctx.model_nme,
+                 input_dim=X_all.shape[1],
+                 hidden_dim=hidden_dim,
+                 block_num=block_num,
+                 # batch_num=batch_num,
+                 epochs=self.ctx.epochs,
+                 tweedie_power=tw_power,
+                 learning_rate=learning_rate,
+                 patience=5
+             )
+
+             try:
+                 # 4. Train (internally still uses the custom tweedie_loss).
+                 cv_net.fit(
+                     X_train_fold,
+                     y_train_fold,
+                     w_train_fold,
+                     X_val_fold,
+                     y_val_fold,
+                     w_val_fold
+                 )
+
+                 # 5. Predict on the validation fold.
+                 y_pred_fold = cv_net.predict(X_val_fold)
+
+                 # 6. Score with Tweedie deviance (evaluation only; the
+                 #    training loss is untouched).
+                 loss += mean_tweedie_deviance(
+                     y_val_fold,
+                     y_pred_fold,
+                     sample_weight=w_val_fold,
+                     power=tw_power
+                 )
+
+             finally:
+                 # 7. Release GPU resources at the end of every fold.
+                 try:
+                     if hasattr(cv_net, "resnet"):
+                         cv_net.resnet.to("cpu")
+                 except Exception:
+                     pass
+                 del cv_net
+                 gc.collect()
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+
+         return loss / int(1 / self.ctx.prop_test)
+
+     # ========= Optuna tuning =========
+     def tune(self, max_evals: int = 50) -> None:
+         """
+         Bayesian optimisation of the ResNet with Optuna.
+         After each trial, do one more global GPU memory cleanup.
+         """
+         def objective(trial: optuna.trial.Trial) -> float:
+             result = self.cross_val(trial)
+             # Trial-level fallback cleanup.
+             gc.collect()
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             return result
+
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+         )
+         study.optimize(objective, n_trials=max_evals)
+
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_resn.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     # ========= Train the final ResNet with the best hyperparameters =========
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError('Run tune() first to obtain the best ResNet parameters.')
+
+         self.model = ResNetSklearn(
+             model_nme=self.ctx.model_nme,
+             input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1]
+         )
+         self.model.set_params(self.best_params)
+
+         # Train the final model on the full one-hot + standardised data.
+         self.model.fit(
+             self.ctx.train_oht_scl_data[self.ctx.var_nmes],
+             self.ctx.train_oht_scl_data[self.ctx.resp_nme],
+             self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+         )
+
+         # Record the label.
+         self.ctx.model_label += [self.label]
+
+         # Train / test predictions.
+         self.ctx.train_data['pred_resn'] = self.model.predict(
+             self.ctx.train_oht_scl_data[self.ctx.var_nmes]
+         )
+         self.ctx.test_data['pred_resn'] = self.model.predict(
+             self.ctx.test_oht_scl_data[self.ctx.var_nmes]
+         )
+
+         # Weighted predictions.
+         self.ctx.train_data.loc[:, 'w_pred_resn'] = (
+             self.ctx.train_data['pred_resn'] *
+             self.ctx.train_data[self.ctx.weight_nme]
+         )
+         self.ctx.test_data.loc[:, 'w_pred_resn'] = (
+             self.ctx.test_data['pred_resn'] *
+             self.ctx.test_data[self.ctx.weight_nme]
+         )
+
+         # Expose for external callers.
+         self.ctx.resn_best = self.model
+
+     # ========= Save / load =========
+     def save(self) -> None:
+         """
+         Save only the ResNet state_dict (lightweight; no optimizer state).
+         """
+         if self.model is not None:
+             path = self.output.model_path(
+                 f'01_{self.ctx.model_nme}_ResNet.pth'
+             )
+             torch.save(self.model.resnet.state_dict(), path)
+
+     def load(self) -> None:
+         """
+         Load the ResNet from file onto the appropriate device.
+         """
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_ResNet.pth'
+         )
+         if os.path.exists(path):
+             resn_loaded = ResNetSklearn(
+                 model_nme=self.ctx.model_nme,
+                 input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1]
+             )
+             state_dict = torch.load(path, map_location='cpu')
+             resn_loaded.resnet.load_state_dict(state_dict)
+
+             # Pick the device for the current environment.
+             if torch.cuda.is_available():
+                 resn_loaded.device = torch.device('cuda')
+             elif torch.backends.mps.is_available():
+                 resn_loaded.device = torch.device('mps')
+             else:
+                 resn_loaded.device = torch.device('cpu')
+
+             resn_loaded.resnet.to(resn_loaded.device)
+             self.model = resn_loaded
+             self.ctx.resn_best = self.model
+         else:
+             print(f"[ResNetTrainer.load] Model file not found: {path}")
+
1512
+
1513
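Because save() persists only the weights, load() must first rebuild an architecturally identical network before load_state_dict will accept them. The same round trip on a toy module (invented file name; nn.Sequential stands in for the ResNet):

    import torch
    import torch.nn as nn

    def make_net() -> nn.Module:
        return nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1))

    net = make_net()
    torch.save(net.state_dict(), 'demo_resnet.pth')          # weights only

    reloaded = make_net()                                    # identical architecture
    state = torch.load('demo_resnet.pth', map_location='cpu')
    reloaded.load_state_dict(state)
    reloaded.to('cuda' if torch.cuda.is_available() else 'cpu')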
+class FTTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        super().__init__(context, 'FTTransformer')
+        self.model: Optional[FTTransformerSklearn] = None
+        self.best_params: Optional[Dict[str, Any]] = None
+        self.best_trial = None
+
+    def cross_val(self, trial: optuna.trial.Trial) -> float:
+        """
+        Cross-validate the FT-Transformer.
+        This is where GPU memory is most likely to blow up, so we add:
+        - a fairly conservative hyperparameter search space
+        - forced release of GPU memory at the end of every fold
+        """
+        # Keep the search space slightly narrow to avoid very large models
+        learning_rate = trial.suggest_float(
+            'learning_rate', 1e-5, 5e-4, log=True
+        )
+        d_model = trial.suggest_int('d_model', 32, 256, step=32)
+        # n_heads = trial.suggest_categorical('n_heads', [2, 4])  # widened below to avoid underfitting
+        n_heads = trial.suggest_categorical('n_heads', [2, 4, 8])
+        # n_layers = trial.suggest_int('n_layers', 2, 4)  # widened below to avoid underfitting
+        n_layers = trial.suggest_int('n_layers', 2, 8)
+        dropout = trial.suggest_float('dropout', 0.0, 0.2)
+        # batch_num = trial.suggest_int(
+        #     'batch_num',
+        #     5 if self.ctx.obj == 'reg:gamma' else 10,
+        #     10 if self.ctx.obj == 'reg:gamma' else 50,
+        #     step=1 if self.ctx.obj == 'reg:gamma' else 10
+        # )
+
+        # Map the objective onto a Tweedie variance power:
+        # 1.0 = Poisson, 2.0 = gamma, in between = compound Poisson-gamma
+        if self.ctx.obj == 'reg:tweedie':
+            tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
+        elif self.ctx.obj == 'count:poisson':
+            tw_power = 1.0
+        elif self.ctx.obj == 'reg:gamma':
+            tw_power = 2.0
+        else:
+            tw_power = 1.5
+
+        loss = 0.0
+
+        # Optional: run BO on a subsample only, so large datasets do not
+        # exhaust GPU memory outright
+        data_for_cv = self.ctx.train_data
+        max_rows_for_ft_bo = min(1000000, int(
+            len(data_for_cv) / 2))  # tune up or down to fit your GPU memory
+        if len(data_for_cv) > max_rows_for_ft_bo:
+            data_for_cv = data_for_cv.sample(
+                max_rows_for_ft_bo,
+                random_state=self.ctx.rand_seed
+            )
+
+        for _, (train_idx, test_idx) in enumerate(
+            self.ctx.cv.split(data_for_cv[self.ctx.factor_nmes])
+        ):
+            X_train_fold = data_for_cv.iloc[train_idx][self.ctx.factor_nmes]
+            y_train_fold = data_for_cv.iloc[train_idx][self.ctx.resp_nme]
+            w_train_fold = data_for_cv.iloc[train_idx][self.ctx.weight_nme]
+            X_val_fold = data_for_cv.iloc[test_idx][self.ctx.factor_nmes]
+            y_val_fold = data_for_cv.iloc[test_idx][self.ctx.resp_nme]
+            w_val_fold = data_for_cv.iloc[test_idx][self.ctx.weight_nme]
+
+            cv_ft = FTTransformerSklearn(
+                model_nme=self.ctx.model_nme,
+                num_cols=self.ctx.num_features,
+                cat_cols=self.ctx.cate_list,
+                d_model=d_model,
+                n_heads=n_heads,
+                n_layers=n_layers,
+                dropout=dropout,
+                # batch_num=batch_num,
+                epochs=self.ctx.epochs,
+                tweedie_power=tw_power,
+                learning_rate=learning_rate,
+                patience=5
+            )
+
+            try:
+                cv_ft.fit(
+                    X_train_fold, y_train_fold, w_train_fold,
+                    X_val_fold, y_val_fold, w_val_fold
+                )
+                y_pred_fold = cv_ft.predict(X_val_fold)
+                loss += mean_tweedie_deviance(
+                    y_val_fold,
+                    y_pred_fold,
+                    sample_weight=w_val_fold,
+                    power=tw_power
+                )
+            finally:
+                # Release GPU resources as soon as each fold is done
+                try:
+                    # If the model is on the GPU, move it back to the CPU first
+                    if hasattr(cv_ft, "ft"):
+                        cv_ft.ft.to("cpu")
+                except Exception:
+                    pass
+                del cv_ft
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        return loss / int(1 / self.ctx.prop_test)
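Fold scoring uses sklearn's weighted Tweedie deviance rather than each network's own training loss, which keeps the comparison uniform across model families. A minimal call with made-up numbers:

    import numpy as np
    from sklearn.metrics import mean_tweedie_deviance

    y_true = np.array([0.0, 2.0, 1.0, 4.0])   # zeros are fine for 1 < power < 2
    y_pred = np.array([0.5, 1.8, 1.2, 3.5])   # predictions must stay positive
    w = np.array([1.0, 2.0, 1.0, 0.5])

    # power=1.5 sits between Poisson (1.0) and gamma (2.0): the usual
    # compound Poisson-gamma assumption for pure-premium targets.
    dev = mean_tweedie_deviance(y_true, y_pred, sample_weight=w, power=1.5)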
+
+    def tune(self, max_evals: int = 50) -> None:
+        """
+        Hyperparameter search with Optuna.
+        Clean up GPU memory again at the end of every trial so fragmentation
+        does not accumulate across trials.
+        """
+        def objective(trial: optuna.trial.Trial) -> float:
+            result = self.cross_val(trial)
+            # Trial-level fallback cleanup
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            return result
+
+        study = optuna.create_study(
+            direction='minimize',
+            sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+        )
+        study.optimize(objective, n_trials=max_evals)
+        self.best_params = study.best_params
+        self.best_trial = study.best_trial
+        params_path = self.output.result_path(
+            f'{self.ctx.model_nme}_bestparams_ft.csv'
+        )
+        pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+    def train(self) -> None:
+        if not self.best_params:
+            raise RuntimeError('Run tune() first to obtain the best FT-Transformer parameters.')
+        self.model = FTTransformerSklearn(
+            model_nme=self.ctx.model_nme,
+            num_cols=self.ctx.num_features,
+            cat_cols=self.ctx.cate_list
+        )
+        self.model.set_params(self.best_params)
+        self.model.fit(
+            self.ctx.train_data[self.ctx.factor_nmes],
+            self.ctx.train_data[self.ctx.resp_nme],
+            self.ctx.train_data[self.ctx.weight_nme]
+        )
+        self.ctx.model_label += [self.label]
+        self.ctx.train_data['pred_ft'] = self.model.predict(
+            self.ctx.train_data[self.ctx.factor_nmes]
+        )
+        self.ctx.test_data['pred_ft'] = self.model.predict(
+            self.ctx.test_data[self.ctx.factor_nmes]
+        )
+        self.ctx.train_data.loc[:, 'w_pred_ft'] = (
+            self.ctx.train_data['pred_ft'] *
+            self.ctx.train_data[self.ctx.weight_nme]
+        )
+        self.ctx.test_data.loc[:, 'w_pred_ft'] = (
+            self.ctx.test_data['pred_ft'] *
+            self.ctx.test_data[self.ctx.weight_nme]
+        )
+        self.ctx.ft_best = self.model
+
+    def save(self) -> None:
+        if self.model is not None:
+            torch.save(
+                self.model,
+                self.output.model_path(
+                    f'01_{self.ctx.model_nme}_FTTransformer.pth')
+            )
+
+    def load(self) -> None:
+        path = self.output.model_path(
+            f'01_{self.ctx.model_nme}_FTTransformer.pth')
+        if os.path.exists(path):
+            ft_loaded = torch.load(path, map_location='cpu')
+            if torch.cuda.is_available():
+                ft_loaded.device = torch.device('cuda')
+            elif torch.backends.mps.is_available():
+                ft_loaded.device = torch.device('mps')
+            else:
+                ft_loaded.device = torch.device('cpu')
+            ft_loaded.ft.to(ft_loaded.device)
+            self.model = ft_loaded
+            self.ctx.ft_best = self.model
+        else:
+            print(f"[load_model] Warning: FT-Transformer model file not found: {path}")
+
+
+# =============================================================================
+# BayesOpt orchestration & SHAP utilities
+# =============================================================================
+class BayesOptModel:
+    def __init__(self, train_data, test_data,
+                 model_nme, resp_nme, weight_nme, factor_nmes,
+                 cate_list=None, prop_test=0.25, rand_seed=None,
+                 epochs=100, use_gpu=True):
+        cfg = BayesOptConfig(
+            model_nme=model_nme,
+            resp_nme=resp_nme,
+            weight_nme=weight_nme,
+            factor_nmes=list(factor_nmes),
+            cate_list=list(cate_list) if cate_list else None,
+            prop_test=prop_test,
+            rand_seed=rand_seed,
+            epochs=epochs,
+            use_gpu=use_gpu
+        )
+        self.config = cfg
+        self.model_nme = cfg.model_nme
+        self.resp_nme = cfg.resp_nme
+        self.weight_nme = cfg.weight_nme
+        self.factor_nmes = cfg.factor_nmes
+        self.cate_list = list(cfg.cate_list or [])
+        self.prop_test = cfg.prop_test
+        self.epochs = cfg.epochs
+        self.rand_seed = cfg.rand_seed if cfg.rand_seed is not None else np.random.randint(
+            1, 10000)
+        self.use_gpu = bool(cfg.use_gpu and torch.cuda.is_available())
+        self.output_manager = OutputManager(os.getcwd(), self.model_nme)
+
+        preprocessor = DatasetPreprocessor(train_data, test_data, cfg).run()
+        self.train_data = preprocessor.train_data
+        self.test_data = preprocessor.test_data
+        self.train_oht_scl_data = preprocessor.train_oht_scl_data
+        self.test_oht_scl_data = preprocessor.test_oht_scl_data
+        self.var_nmes = preprocessor.var_nmes
+        self.num_features = preprocessor.num_features
+        self.cat_categories_for_shap = preprocessor.cat_categories_for_shap
+
+        self.cv = ShuffleSplit(n_splits=int(1 / self.prop_test),
+                               test_size=self.prop_test,
+                               random_state=self.rand_seed)
+        # Infer the objective from the model name: 'f' -> frequency (Poisson),
+        # 's' -> severity (gamma), otherwise Tweedie
+        if self.model_nme.find('f') != -1:
+            self.obj = 'count:poisson'
+        elif self.model_nme.find('s') != -1:
+            self.obj = 'reg:gamma'
+        elif self.model_nme.find('bc') != -1:
+            self.obj = 'reg:tweedie'
+        else:
+            self.obj = 'reg:tweedie'
+        self.fit_params = {
+            'sample_weight': self.train_data[self.weight_nme].values
+        }
+        self.model_label: List[str] = []
+
+        # Register one trainer per model; everything below reaches them by
+        # label, which keeps new models easy to bolt on
+        self.trainers: Dict[str, TrainerBase] = {
+            'xgb': XGBTrainer(self),
+            'resn': ResNetTrainer(self),
+            'ft': FTTrainer(self)
+        }
+        self.xgb_best = None
+        self.resn_best = None
+        self.ft_best = None
+        self.best_xgb_params = None
+        self.best_resn_params = None
+        self.best_ft_params = None
+        self.best_xgb_trial = None
+        self.best_resn_trial = None
+        self.best_ft_trial = None
+        self.xgb_load = None
+        self.resn_load = None
+        self.ft_load = None
+
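Taken together, a hypothetical end-to-end driver (all data-frame and column names invented; train_df / test_df are assumed pandas DataFrames already containing the response, weight, and factor columns):

    model = BayesOptModel(
        train_df, test_df,
        model_nme='bc_motor',            # 'bc' -> reg:tweedie objective
        resp_nme='burning_cost',
        weight_nme='exposure',
        factor_nmes=['age', 'region', 'vehicle_group'],
        cate_list=['region', 'vehicle_group'],
        prop_test=0.25, epochs=100, use_gpu=True,
    )
    model.bayesopt_xgb(max_evals=50)     # tune + fit + record predictions
    model.plot_lift('Xgboost', 'pred_xgb')
    model.save_model('xgb')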
+    # One-way plotting
+    def plot_oneway(self, n_bins=10):
+        for c in self.factor_nmes:
+            fig = plt.figure(figsize=(7, 5))
+            if c in self.cate_list:
+                group_col = c
+                plot_source = self.train_data
+            else:
+                group_col = f'{c}_bins'
+                bins = pd.qcut(
+                    self.train_data[c],
+                    n_bins,
+                    duplicates='drop'  # duplicated quantiles drop bins rather than raising
+                )
+                plot_source = self.train_data.assign(**{group_col: bins})
+            plot_data = plot_source.groupby(
+                [group_col], observed=True).sum(numeric_only=True)
+            plot_data.reset_index(inplace=True)
+            plot_data['act_v'] = plot_data['w_act'] / \
+                plot_data[self.weight_nme]
+            ax = fig.add_subplot(111)
+            ax.plot(plot_data.index, plot_data['act_v'],
+                    label='Actual', color='red')
+            ax.set_title(
+                'Analysis of %s : Train Data' % group_col,
+                fontsize=8)
+            plt.xticks(plot_data.index,
+                       list(plot_data[group_col].astype(str)),
+                       rotation=90)
+            if len(list(plot_data[group_col].astype(str))) > 50:
+                plt.xticks(fontsize=3)
+            else:
+                plt.xticks(fontsize=6)
+            plt.yticks(fontsize=6)
+            ax2 = ax.twinx()
+            ax2.bar(plot_data.index,
+                    plot_data[self.weight_nme],
+                    alpha=0.5, color='seagreen')
+            plt.yticks(fontsize=6)
+            plt.margins(0.05)
+            plt.subplots_adjust(wspace=0.3)
+            save_path = self.output_manager.plot_path(
+                f'00_{self.model_nme}_{group_col}_oneway.png')
+            plt.savefig(save_path, dpi=300)
+            plt.close(fig)
+
+    # XGBoost Bayesian optimisation
+    def bayesopt_xgb(self, max_evals=100):
+        trainer = self.trainers['xgb']
+        trainer.tune(max_evals)
+        trainer.train()
+        self.xgb_best = trainer.model
+        # Keep the best params and trial around for debugging / reproducibility
+        self.best_xgb_params = trainer.best_params
+        self.best_xgb_trial = trainer.best_trial
+
+    # ResNet Bayesian optimisation
+    def bayesopt_resnet(self, max_evals=100):
+        trainer = self.trainers['resn']
+        trainer.tune(max_evals)
+        trainer.train()
+        self.resn_best = trainer.model
+        # Keep the best-trial details for later tuning analysis
+        self.best_resn_params = trainer.best_params
+        self.best_resn_trial = trainer.best_trial
+
+    # FT-Transformer Bayesian optimisation
+    def bayesopt_ft(self, max_evals=50):
+        trainer = self.trainers['ft']
+        trainer.tune(max_evals)
+        trainer.train()
+        self.ft_best = trainer.model
+        # The FT-Transformer has many parameters, so persisting its
+        # configuration matters all the more
+        self.best_ft_params = trainer.best_params
+        self.best_ft_trial = trainer.best_trial
+
+    # Equal-exposure binning helper
+    def _split_data(self, data, col_nme, wgt_nme, n_bins=10):
+        # Sort by score, then split on cumulative weight, so every bin
+        # carries a similar amount of exposure
+        sorted_data = data.sort_values(by=col_nme, ascending=True).copy()
+        sorted_data['cum_weight'] = sorted_data[wgt_nme].cumsum()
+        w_sum = sorted_data[wgt_nme].sum()
+        if w_sum <= EPS:
+            sorted_data.loc[:, 'bins'] = 0
+        else:
+            sorted_data.loc[:, 'bins'] = np.floor(
+                sorted_data['cum_weight'] * float(n_bins) / w_sum
+            )
+        sorted_data.loc[(sorted_data['bins'] == n_bins),
+                        'bins'] = n_bins - 1
+        return sorted_data.groupby(['bins'], observed=True).sum(numeric_only=True)
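The cumulative-weight trick in _split_data, shown on four toy rows: after sorting by score, bin = floor(cum_weight * n_bins / total_weight), and the row that lands exactly on the upper boundary is folded back into the last bin:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'score': [0.2, 0.9, 0.4, 0.7],
                       'w':     [1.0, 1.0, 2.0, 1.0]})
    s = df.sort_values('score').copy()
    s['cum_w'] = s['w'].cumsum()                         # 1, 3, 4, 5
    s['bin'] = np.floor(s['cum_w'] * 2 / s['w'].sum())   # 0, 1, 1, 2
    s.loc[s['bin'] == 2, 'bin'] = 1                      # clamp the boundary row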
+
+    # Build the data behind the lift chart
+    def _plot_data_lift(self,
+                        pred_list, w_pred_list,
+                        w_act_list, weight_list, n_bins=10):
+        lift_data = pd.DataFrame()
+        lift_data.loc[:, 'pred'] = pred_list
+        lift_data.loc[:, 'w_pred'] = w_pred_list
+        lift_data.loc[:, 'act'] = w_act_list
+        lift_data.loc[:, 'weight'] = weight_list
+        plot_data = self._split_data(
+            lift_data, 'pred', 'weight', n_bins)
+        denom = np.maximum(plot_data['weight'], EPS)
+        plot_data['exp_v'] = plot_data['w_pred'] / denom
+        plot_data['act_v'] = plot_data['act'] / denom
+        plot_data.reset_index(inplace=True)
+        return plot_data
+
+    # Plot the lift chart
+    def plot_lift(self, model_label, pred_nme, n_bins=10):
+        # Plot results on both modelling datasets
+        figpos_list = [121, 122]
+        plot_dict = {
+            121: self.train_data,
+            122: self.test_data
+        }
+        name_list = {
+            121: 'Train Data',
+            122: 'Test Data'
+        }
+        # This mapping keeps column access uniform; extend it in step with
+        # any newly added model
+        if model_label == 'Xgboost':
+            pred_nme = 'pred_xgb'
+        elif model_label == 'ResNet':
+            pred_nme = 'pred_resn'
+        elif model_label == 'FTTransformer':
+            pred_nme = 'pred_ft'
+
+        fig = plt.figure(figsize=(11, 5))
+        for figpos in figpos_list:
+            plot_data = self._plot_data_lift(
+                plot_dict[figpos][pred_nme].values,
+                plot_dict[figpos]['w_' + pred_nme].values,
+                plot_dict[figpos]['w_act'].values,
+                plot_dict[figpos][self.weight_nme].values,
+                n_bins)
+            ax = fig.add_subplot(figpos)
+            ax.plot(plot_data.index, plot_data['act_v'],
+                    label='Actual', color='red')
+            ax.plot(plot_data.index, plot_data['exp_v'],
+                    label='Predicted', color='blue')
+            ax.set_title(
+                'Lift Chart on %s' % name_list[figpos], fontsize=8)
+            plt.xticks(plot_data.index,
+                       plot_data.index,
+                       rotation=90, fontsize=6)
+            plt.yticks(fontsize=6)
+            plt.legend(loc='upper left',
+                       fontsize=5, frameon=False)
+            plt.margins(0.05)
+            ax2 = ax.twinx()
+            ax2.bar(plot_data.index, plot_data['weight'],
+                    alpha=0.5, color='seagreen',
+                    label='Earned Exposure')
+            plt.yticks(fontsize=6)
+            plt.legend(loc='upper right',
+                       fontsize=5, frameon=False)
+        plt.subplots_adjust(wspace=0.3)
+        save_path = self.output_manager.plot_path(
+            f'01_{self.model_nme}_{model_label}_lift.png')
+        plt.savefig(save_path, dpi=300)
+        plt.show()
+        plt.close(fig)
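Every chart in this module leans on the same dual-axis layout: response lines on the left axis, an exposure histogram on a twinx() right axis. The pattern in miniature, with synthetic data:

    import matplotlib.pyplot as plt
    import numpy as np

    x = np.arange(10)
    fig, ax = plt.subplots(figsize=(5.5, 5))
    ax.plot(x, np.linspace(0.8, 1.2, 10), color='red', label='Actual')
    ax2 = ax.twinx()                    # independent right-hand y-axis
    ax2.bar(x, np.random.default_rng(0).uniform(50, 100, 10),
            alpha=0.5, color='seagreen', label='Earned Exposure')
    ax.legend(loc='upper left', frameon=False)
    ax2.legend(loc='upper right', frameon=False)
    fig.savefig('twinx_demo.png', dpi=300)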
+
+    # Build the data behind the double lift chart
+    def _plot_data_dlift(self,
+                         pred_list_model1, pred_list_model2,
+                         w_list, w_act_list, n_bins=10):
+        lift_data = pd.DataFrame()
+        lift_data.loc[:, 'pred1'] = pred_list_model1
+        lift_data.loc[:, 'pred2'] = pred_list_model2
+        lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+        lift_data.loc[:, 'act'] = w_act_list
+        lift_data.loc[:, 'weight'] = w_list
+        plot_data = self._split_data(lift_data, 'diff_ly', 'weight', n_bins)
+        # Normalise by actuals, so the Actual line sits at 1.0 by construction
+        denom = np.maximum(plot_data['act'], EPS)
+        plot_data['exp_v1'] = plot_data['pred1'] / denom
+        plot_data['exp_v2'] = plot_data['pred2'] / denom
+        plot_data['act_v'] = plot_data['act'] / denom
+        plot_data.reset_index(inplace=True)
+        return plot_data
+
+    # Plot the double lift chart
+    def plot_dlift(self, model_comp: List[str] = ['xgb', 'resn'], n_bins: int = 10) -> None:
+        """
+        Plot a double lift chart comparing the predictions of two models.
+
+        Args:
+            model_comp: list with the short names of the two models to
+                compare, e.g. ['xgb', 'resn']. Supported: 'xgb', 'resn', 'ft'.
+            n_bins: number of bins.
+        """
+        if len(model_comp) != 2:
+            raise ValueError("`model_comp` must contain exactly two models to compare.")
+
+        model_name_map = {
+            'xgb': 'Xgboost',
+            'resn': 'ResNet',
+            'ft': 'FTTransformer'
+        }
+
+        name1, name2 = model_comp
+        if name1 not in model_name_map or name2 not in model_name_map:
+            raise ValueError(
+                f"Unsupported model short name. Choose from {list(model_name_map.keys())}.")
+
+        fig, axes = plt.subplots(1, 2, figsize=(11, 5))
+        datasets = {
+            'Train Data': self.train_data,
+            'Test Data': self.test_data
+        }
+
+        for ax, (data_name, data) in zip(axes, datasets.items()):
+            pred1_col = f'w_pred_{name1}'
+            pred2_col = f'w_pred_{name2}'
+
+            if pred1_col not in data.columns or pred2_col not in data.columns:
+                print(
+                    f"Warning: prediction column {pred1_col} or {pred2_col} "
+                    f"not found in {data_name}. Skipping plot.")
+                continue
+
+            plot_data = self._plot_data_dlift(
+                data[pred1_col].values,
+                data[pred2_col].values,
+                data[self.weight_nme].values,
+                data['w_act'].values,
+                n_bins
+            )
+
+            label1 = model_name_map[name1]
+            label2 = model_name_map[name2]
+
+            ax.plot(plot_data.index,
+                    plot_data['act_v'], label='Actual', color='red')
+            ax.plot(plot_data.index,
+                    plot_data['exp_v1'], label=label1, color='blue')
+            ax.plot(plot_data.index,
+                    plot_data['exp_v2'], label=label2, color='black')
+
+            ax.set_title(f'Double Lift Chart on {data_name}', fontsize=8)
+            ax.set_xticks(plot_data.index)
+            ax.set_xticklabels(plot_data.index, rotation=90, fontsize=6)
+            ax.set_xlabel(f'{label1} / {label2}', fontsize=6)
+            ax.tick_params(axis='y', labelsize=6)
+            ax.legend(loc='upper left', fontsize=5, frameon=False)
+            ax.margins(0.1)
+
+            ax2 = ax.twinx()
+            ax2.bar(plot_data.index, plot_data['weight'],
+                    alpha=0.5, color='seagreen', label='Earned Exposure')
+            ax2.tick_params(axis='y', labelsize=6)
+            ax2.legend(loc='upper right', fontsize=5, frameon=False)
+
+        plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8, wspace=0.3)
+        save_path = self.output_manager.plot_path(
+            f'02_{self.model_nme}_dlift_{name1}_vs_{name2}.png')
+        plt.savefig(save_path, dpi=300)
+        plt.show()
+        plt.close(fig)
+
+    # ========= Save / load models =========
+    def save_model(self, model_name=None):
+        # model_name may be:
+        # - None:   save every available model
+        # - 'xgb':  save only Xgboost
+        # - 'resn': save only ResNet
+        # - 'ft':   save only FT-Transformer
+        if model_name in (None, 'xgb'):
+            trainer = self.trainers['xgb']
+            if trainer.model is not None:
+                trainer.save()
+            else:
+                print("[save_model] Warning: xgb_best does not exist; Xgboost model not saved.")
+
+        if model_name in (None, 'resn'):
+            trainer = self.trainers['resn']
+            if trainer.model is not None:
+                trainer.save()
+            else:
+                print("[save_model] Warning: resn_best does not exist; ResNet model not saved.")
+
+        if model_name in (None, 'ft'):
+            trainer = self.trainers['ft']
+            if trainer.model is not None:
+                trainer.save()
+            else:
+                print("[save_model] Warning: ft_best does not exist; FT-Transformer model not saved.")
+
+    def load_model(self, model_name=None):
+        # model_name may be:
+        # - None:   load every model that can be found
+        # - 'xgb':  load only Xgboost
+        # - 'resn': load only ResNet
+        # - 'ft':   load only FT-Transformer
+        if model_name in (None, 'xgb'):
+            trainer = self.trainers['xgb']
+            trainer.load()
+            self.xgb_best = trainer.model
+            self.xgb_load = trainer.model
+
+        if model_name in (None, 'resn'):
+            trainer = self.trainers['resn']
+            trainer.load()
+            self.resn_best = trainer.model
+            self.resn_load = trainer.model
+
+        if model_name in (None, 'ft'):
+            trainer = self.trainers['ft']
+            trainer.load()
+            self.ft_best = trainer.model
+            self.ft_load = trainer.model
+
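A short usage note on the dispatch above (hypothetical, continuing the earlier driver sketch):

    model.save_model()          # persist every trained model
    model.load_model('xgb')     # later session: restore just the XGBoost model
    print(model.xgb_load is model.xgb_best)  # True: both point at the reloaded model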
+    def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
+        # Convert the raw feature DataFrame (holding self.factor_nmes) into a
+        # purely numeric matrix: numeric columns as float64, categorical
+        # columns as integer codes (stored as float64).
+        # Column order follows self.factor_nmes.
+        matrices = []
+
+        for col in self.factor_nmes:
+            s = data[col]
+
+            if col in self.cate_list:
+                # Categorical column: encode against the full category set
+                # seen at training time
+                cats = pd.Categorical(
+                    s,
+                    categories=self.cat_categories_for_shap[col]
+                )
+                # cats.codes is an Index / ndarray; wrap it in np.asarray, then reshape
+                codes = np.asarray(cats.codes, dtype=np.float64).reshape(-1, 1)
+                matrices.append(codes)
+            else:
+                # Numeric column: Series -> numpy -> reshape
+                vals = pd.to_numeric(s, errors="coerce")
+                arr = vals.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
+                matrices.append(arr)
+
+        X_mat = np.concatenate(matrices, axis=1)  # (N, F)
+        return X_mat
+
+    def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
+        # Restore SHAP's numeric matrix (N, F) to a raw-feature DataFrame:
+        # numeric columns as float, categorical columns back to pandas
+        # category dtype, so the result feeds both XGBoost with
+        # enable_categorical=True and the FT-Transformer.
+        # Column order = self.factor_nmes.
+        data_dict = {}
+
+        for j, col in enumerate(self.factor_nmes):
+            col_vals = X_mat[:, j]
+
+            if col in self.cate_list:
+                cats = self.cat_categories_for_shap[col]
+
+                # SHAP perturbs the codes into fractions; round back to integers
+                codes = np.round(col_vals).astype(int)
+                # Clip to [-1, len(cats) - 1]
+                codes = np.clip(codes, -1, len(cats) - 1)
+
+                # pandas.Categorical.from_codes:
+                # - code -1 is treated as missing (NaN)
+                # - every other index maps to the matching category in cats
+                cat_series = pd.Categorical.from_codes(
+                    codes,
+                    categories=cats
+                )
+                # Stored as a Categorical, not object
+                data_dict[col] = cat_series
+            else:
+                # Numeric column: plain float
+                data_dict[col] = col_vals.astype(float)
+
+        df = pd.DataFrame(data_dict, columns=self.factor_nmes)
+
+        # Belt and braces: make sure every categorical column really is category dtype
+        for col in self.cate_list:
+            if col in df.columns:
+                df[col] = df[col].astype("category")
+        return df
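The encode/decode pair rests on pandas category codes, with -1 reserved for unseen or missing values. The round trip in miniature (invented category set):

    import numpy as np
    import pandas as pd

    cats = ['north', 'south', 'west']           # category set fixed at training time
    s = pd.Series(['south', 'east', 'north'])   # 'east' was never seen
    codes = pd.Categorical(s, categories=cats).codes
    # codes -> array([ 1, -1,  0], dtype=int8)

    back = pd.Categorical.from_codes(
        np.clip(codes, -1, len(cats) - 1), categories=cats)
    # back -> ['south', NaN, 'north']: the unseen level decodes to missing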
+
+    # ========= XGBoost SHAP =========
+    def compute_shap_xgb(self, n_background: int = 500,
+                         n_samples: int = 200,
+                         on_train: bool = True):
+        # Compute SHAP values for XGBoost with KernelExplainer (black-box).
+        #
+        # - For SHAP: the input is a purely numeric matrix:
+        #   * numeric features: float64
+        #   * categorical features: integer codes from _build_ft_shap_matrix (float64)
+        # - For the model: still the raw DataFrame + xgb_best.predict(...)
+        if not hasattr(self, "xgb_best"):
+            raise RuntimeError("Run bayesopt_xgb() first so self.xgb_best is trained")
+
+        # 1) Pick the data source: train or test set (raw feature space)
+        data = self.train_data if on_train else self.test_data
+        X_raw = data[self.factor_nmes]
+
+        # 2) Build the background matrix (same numeric encoding as for FT)
+        background_raw = X_raw.sample(
+            min(len(X_raw), n_background),
+            random_state=self.rand_seed
+        )
+        # KernelExplainer is extremely expensive: keep the background sample
+        # small, or debugging slows to a crawl
+        background_mat = self._build_ft_shap_matrix(
+            background_raw
+        ).astype(np.float64, copy=True)
+
+        # 3) Black-box predict function: numeric matrix -> DataFrame -> xgb_best.predict
+        def f_predict(x_mat: np.ndarray) -> np.ndarray:
+            # Decode the matrix back into the raw DataFrame (numeric + categorical)
+            df_input = self._decode_ft_shap_matrix_to_df(x_mat)
+            # Note: this is self.xgb_best.predict, the same call used for
+            # training / prediction
+            y_pred = self.xgb_best.predict(df_input)
+            return y_pred
+
+        explainer = shap.KernelExplainer(f_predict, background_mat)
+
+        # 4) Samples to explain: raw features + numeric encoding
+        X_explain_raw = X_raw.sample(
+            min(len(X_raw), n_samples),
+            random_state=self.rand_seed
+        )
+        X_explain_mat = self._build_ft_shap_matrix(
+            X_explain_raw
+        ).astype(np.float64, copy=True)
+
+        # 5) Compute SHAP values (nsamples='auto' keeps the cost bounded)
+        shap_values = explainer.shap_values(X_explain_mat, nsamples="auto")
+
+        # 6) Store the results:
+        #    - shap_values: encoded numeric space, one column per entry of factor_nmes
+        #    - X_explain_raw: raw DataFrame, so plots can show real category names
+        self.shap_xgb = {
+            "explainer": explainer,
+            "X_explain": X_explain_raw,
+            "shap_values": shap_values,
+            "base_value": explainer.expected_value,
+        }
+        return self.shap_xgb
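KernelExplainer treats the model as a pure function from a numeric matrix to a prediction vector, which is exactly why the encode/decode helpers exist. The shapes involved, on a synthetic function (only numpy and shap assumed):

    import numpy as np
    import shap

    rng = np.random.default_rng(0)
    background = rng.normal(size=(50, 3))   # small reference sample

    def f(X: np.ndarray) -> np.ndarray:     # any black-box f: (n, 3) -> (n,)
        return 2.0 * X[:, 0] + X[:, 1]

    explainer = shap.KernelExplainer(f, background)
    sv = explainer.shap_values(rng.normal(size=(5, 3)), nsamples='auto')
    # sv has shape (5, 3): one attribution per explained row per feature,
    # and sv.sum(axis=1) + explainer.expected_value ~ f(explained rows)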
+
+    # ========= ResNet SHAP =========
+    def _resn_predict_wrapper(self, X_np):
+        # Make sure everything runs on the CPU
+        model = self.resn_best.resnet.to("cpu")
+        with torch.no_grad():
+            # Deliberately no .to(self.device)
+            X_tensor = torch.tensor(X_np, dtype=torch.float32)
+            y_pred = model(X_tensor).cpu().numpy()
+        y_pred = np.clip(y_pred, 1e-6, None)
+        return y_pred.reshape(-1)
+
+    def compute_shap_resn(self, n_background: int = 500,
+                          n_samples: int = 200,
+                          on_train: bool = True):
+        # Compute SHAP values for the ResNet with KernelExplainer.
+        # Explanation space: the one-hot encoded & scaled features self.var_nmes.
+        if not hasattr(self, 'resn_best'):
+            raise RuntimeError("Run bayesopt_resnet() first so resn_best is trained")
+
+        self.resn_best.device = torch.device("cpu")  # force CPU
+        self.resn_best.resnet.to("cpu")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Pick the dataset (already one-hot encoded & scaled)
+        data = self.train_oht_scl_data if on_train else self.test_oht_scl_data
+        X = data[self.var_nmes]
+        if len(X) == 0:
+            raise ValueError(
+                "compute_shap_resn: the selected dataset is empty (len(X) == 0); cannot compute SHAP.")
+
+        # Background sample: float64 numpy
+        background_df = X.sample(
+            min(len(X), n_background),
+            random_state=self.rand_seed
+        )
+        background_np = background_df.to_numpy(dtype=np.float64, copy=True)
+
+        # Black-box predict function
+        def f_predict(x):
+            y = self._resn_predict_wrapper(x)
+            # Guarantee a 1-D array
+            y = np.asarray(y, dtype=np.float64).reshape(-1)
+            return y
+
+        explainer = shap.KernelExplainer(f_predict, background_np)
+
+        # Samples to explain
+        X_explain_df = X.sample(
+            min(len(X), n_samples),
+            random_state=self.rand_seed
+        )
+        X_explain_np = X_explain_df.to_numpy(dtype=np.float64, copy=True)
+
+        max_nsamples = 300
+        min_needed = X_explain_np.shape[1] + 2
+        nsample_eff = max(min_needed, min(max_nsamples,
+                                          X_explain_np.shape[0] * X_explain_np.shape[1]))
+        shap_values = explainer.shap_values(X_explain_np, nsamples=nsample_eff)
+        # Compute base_value by hand to avoid NotOneValueFound
+        bg_pred = f_predict(background_np)
+        if bg_pred.size == 0:
+            raise ValueError(
+                "compute_shap_resn: background predictions are empty; cannot compute base_value.")
+        base_value = float(bg_pred.mean())
+
+        self.shap_resn = {
+            "explainer": explainer,
+            "X_explain": X_explain_df,    # DataFrame with column names, for plotting
+            "shap_values": shap_values,   # numpy: (n_samples, n_features)
+            # "base_value": explainer.expected_value,
+            "base_value": base_value,
+        }
+        return self.shap_resn
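The nsamples arithmetic above encodes a general rule for KernelExplainer: never go below n_features + 2 evaluations (roughly the minimum the regression inside SHAP needs), and cap the budget so runtime stays bounded. Factored out as a hypothetical helper:

    def kernel_shap_nsamples(n_rows: int, n_features: int, cap: int = 300) -> int:
        """Budget for KernelExplainer.shap_values(..., nsamples=...)."""
        floor = n_features + 2              # minimum sensible evaluation count
        return max(floor, min(cap, n_rows * n_features))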
+
+    # ========= FT-Transformer SHAP =========
+    def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
+        # SHAP predict wrapper:
+        # numeric matrix -> decode to raw-feature DataFrame -> call ft_best.predict
+        df_input = self._decode_ft_shap_matrix_to_df(X_mat)
+        y_pred = self.ft_best.predict(df_input)
+        return np.asarray(y_pred, dtype=np.float64).reshape(-1)
+
+    def compute_shap_ft(self, n_background: int = 500,
+                        n_samples: int = 200,
+                        on_train: bool = True):
+        # Compute SHAP values for the FT-Transformer with KernelExplainer.
+        # Explanation space: the mixed numeric matrix of numerics + category
+        # codes (float64); results are still presented against the raw
+        # feature names/values (X_explain).
+        if not hasattr(self, "ft_best"):
+            raise RuntimeError("Run bayesopt_ft() first so ft_best is trained")
+
+        self.ft_best.device = torch.device("cpu")  # force CPU
+        self.ft_best.ft.to("cpu")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Pick the data source (raw feature space)
+        data = self.train_data if on_train else self.test_data
+        X_raw = data[self.factor_nmes]
+
+        # Background matrix
+        background_raw = X_raw.sample(
+            min(len(X_raw), n_background),
+            random_state=self.rand_seed
+        )
+        background_mat = self._build_ft_shap_matrix(
+            background_raw
+        ).astype(np.float64, copy=True)
+
+        # Black-box predict function (numeric matrix -> DataFrame -> FT model)
+        def f_predict(x):
+            return self._ft_shap_predict_wrapper(x)
+
+        explainer = shap.KernelExplainer(f_predict, background_mat)
+
+        # Samples to explain (raw feature space)
+        X_explain_raw = X_raw.sample(
+            min(len(X_raw), n_samples),
+            random_state=self.rand_seed
+        )
+        X_explain_mat = self._build_ft_shap_matrix(
+            X_explain_raw
+        ).astype(np.float64, copy=True)
+
+        max_nsamples = 300
+        min_needed = X_explain_mat.shape[1] + 2
+        nsample_eff = max(min_needed, min(max_nsamples,
+                                          X_explain_mat.shape[0] * X_explain_mat.shape[1]))
+        shap_values = explainer.shap_values(
+            X_explain_mat, nsamples=nsample_eff)
+        # base_value computed by hand, as for the ResNet
+        bg_pred = self._ft_shap_predict_wrapper(background_mat)
+        bg_pred = np.asarray(bg_pred, dtype=np.float64).reshape(-1)
+        base_value = float(bg_pred.mean())
+
+        self.shap_ft = {
+            "explainer": explainer,
+            "X_explain": X_explain_raw,   # raw-feature DataFrame, for plotting
+            "shap_values": shap_values,   # numpy: (n_samples, n_features)
+            # "base_value": explainer.expected_value,
+            "base_value": base_value,
+        }
+        return self.shap_ft
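Downstream, the stored dict plugs straight into shap's plotting helpers; a hypothetical continuation of the driver sketch from earlier:

    res = model.compute_shap_ft(n_background=200, n_samples=100)
    shap.summary_plot(
        res['shap_values'],           # (n_samples, n_features) attributions
        features=res['X_explain'],    # raw features -> readable category labels
        show=False,                   # leave the figure open for saving
    )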