ins_pricing-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
user_packages legacy/Try/BayesOptV2.py
@@ -0,0 +1,2065 @@
+ # Transferring data between the CPU and GPU carries significant overhead, but multiple CUDA streams can overlap transfers with compute, enabling work on larger datasets.
+
+ import pandas as pd  # 2.2.3
+ import numpy as np  # 1.26.2
+ from random import sample
+ import torch  # version: 1.10.1+cu111
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import optuna  # 4.3.0
+ import xgboost as xgb  # 1.7.0
+ import matplotlib.pyplot as plt
+ import os
+ import joblib
+ import copy
+ import shap
+ import math
+ import gc
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
+ from torch.cuda.amp import autocast, GradScaler
+ from torch.nn.utils import clip_grad_norm_
+ from sklearn.model_selection import ShuffleSplit, cross_val_score  # 1.2.2
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import make_scorer, mean_tweedie_deviance
+
+
+ def ensure_parent_dir(file_path: str) -> None:
+     # Create the target file's parent directory if it does not exist.
+     directory = os.path.dirname(file_path)
+     if directory:
+         os.makedirs(directory, exist_ok=True)
+
+
+ # Tweedie deviance loss for the PyTorch setting.
+ # Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances
+
+
+ def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
+     # Clamp predictions to be strictly positive for numerical stability.
+     pred_clamped = torch.clamp(pred, min=eps)
+     # Assemble the pieces of the Tweedie deviance.
+     if p == 1:
+         # Poisson case: d = 2 * (y * log(y / mu) - y + mu)
+         term1 = target * torch.log(target / pred_clamped + eps)
+         term2 = target - pred_clamped
+         term3 = 0
+     elif p == 0:
+         # Gaussian case
+         term1 = 0.5 * torch.pow(target - pred_clamped, 2)
+         term2 = 0
+         term3 = 0
+     elif p == 2:
+         # Gamma case
+         term1 = torch.log(pred_clamped / target + eps)
+         term2 = -target / pred_clamped + 1
+         term3 = 0
+     else:
+         term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
+         term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
+         term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
+     # Tweedie negative log-likelihood (constant terms dropped).
+     return torch.nan_to_num(2 * (term1 - term2 + term3), nan=eps, posinf=max_clip, neginf=-max_clip)
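A quick plausibility check (editorial sketch, not part of the package): for 1 < p < 2 the generic branch above should agree with scikit-learn's `mean_tweedie_deviance` on unweighted, strictly positive data.

```python
import torch
from sklearn.metrics import mean_tweedie_deviance

# Positive predictions and targets, so the eps clamp has no effect.
pred = torch.tensor([1.2, 0.8, 2.5])
target = torch.tensor([1.0, 1.5, 2.0])

torch_dev = tweedie_loss(pred, target, p=1.5).mean().item()
sk_dev = mean_tweedie_deviance(target.numpy(), pred.numpy(), power=1.5)
assert abs(torch_dev - sk_dev) < 1e-5
```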
+
+ # Helper to release CUDA memory.
+
+
+ def free_cuda():
+     print(">>> Moving all models to CPU...")
+     for obj in gc.get_objects():
+         try:
+             if hasattr(obj, "to") and callable(obj.to):
+                 # Skip objects that cannot be moved, such as torch.device.
+                 obj.to("cpu")
+         except Exception:
+             pass
+
+     print(">>> Deleting tensors, optimizers, dataloaders...")
+     gc.collect()
+
+     print(">>> Emptying CUDA cache...")
+     torch.cuda.empty_cache()
+     torch.cuda.synchronize()
+
+     print(">>> CUDA memory freed.")
+
+
+ # Weighted binning helper.
+
+
+ def split_data(data, col_nme, wgt_nme, n_bins=10):
+     data.sort_values(by=col_nme, ascending=True, inplace=True)
+     data['cum_weight'] = data[wgt_nme].cumsum()
+     w_sum = data[wgt_nme].sum()
+     data.loc[:, 'bins'] = np.floor(data['cum_weight'] * float(n_bins) / w_sum)
+     data.loc[(data['bins'] == n_bins), 'bins'] = n_bins - 1
+     return data.groupby(['bins'], observed=True).sum(numeric_only=True)
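`split_data` produces roughly equal-weight bins from the cumulative weights. A hedged toy example (invented column names; note that the function sorts and mutates its input in place):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({'score': rng.random(1000),
                    'exposure': rng.uniform(0.5, 1.5, 1000)})
binned = split_data(toy, col_nme='score', wgt_nme='exposure', n_bins=10)
print(binned['exposure'])  # ten bins with roughly equal total exposure
```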
+
+ # Lift chart plotting helper.
+
+
+ def plot_lift_list(pred_model, w_pred_list, w_act_list,
+                    weight_list, tgt_nme, n_bins=10,
+                    fig_nme='Lift Chart'):
+     lift_data = pd.DataFrame()
+     lift_data.loc[:, 'pred'] = pred_model
+     lift_data.loc[:, 'w_pred'] = w_pred_list
+     lift_data.loc[:, 'act'] = w_act_list
+     lift_data.loc[:, 'weight'] = weight_list
+     plot_data = split_data(lift_data, 'pred', 'weight', n_bins)
+     plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
+     plot_data['act_v'] = plot_data['act'] / plot_data['weight']
+     plot_data.reset_index(inplace=True)
+     fig = plt.figure(figsize=(7, 5))
+     ax = fig.add_subplot(111)
+     ax.plot(plot_data.index, plot_data['act_v'],
+             label='Actual', color='red')
+     ax.plot(plot_data.index, plot_data['exp_v'],
+             label='Predicted', color='blue')
+     ax.set_title(
+         'Lift Chart of %s' % tgt_nme, fontsize=8)
+     plt.xticks(plot_data.index,
+                plot_data.index,
+                rotation=90, fontsize=6)
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper left',
+                fontsize=5, frameon=False)
+     plt.margins(0.05)
+     ax2 = ax.twinx()
+     ax2.bar(plot_data.index, plot_data['weight'],
+             alpha=0.5, color='seagreen',
+             label='Earned Exposure')
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper right',
+                fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'05_{tgt_nme}_{fig_nme}.png')
+     ensure_parent_dir(save_path)
+     plt.savefig(save_path, dpi=300)
+     plt.close(fig)
+
+ # Double lift chart plotting helper.
+
+
+ def plot_dlift_list(pred_model_1, pred_model_2,
+                     model_nme_1, model_nme_2,
+                     tgt_nme,
+                     w_list, w_act_list, n_bins=10,
+                     fig_nme='Double Lift Chart'):
+     lift_data = pd.DataFrame()
+     lift_data.loc[:, 'pred1'] = pred_model_1
+     lift_data.loc[:, 'pred2'] = pred_model_2
+     lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+     lift_data.loc[:, 'act'] = w_act_list
+     lift_data.loc[:, 'weight'] = w_list
+     lift_data.loc[:, 'w_pred1'] = lift_data['pred1'] * lift_data['weight']
+     lift_data.loc[:, 'w_pred2'] = lift_data['pred2'] * lift_data['weight']
+     plot_data = split_data(lift_data, 'diff_ly', 'weight', n_bins)
+     plot_data['exp_v1'] = plot_data['w_pred1'] / plot_data['act']
+     plot_data['exp_v2'] = plot_data['w_pred2'] / plot_data['act']
+     plot_data['act_v'] = plot_data['act'] / plot_data['act']
+     plot_data.reset_index(inplace=True)
+     fig = plt.figure(figsize=(7, 5))
+     ax = fig.add_subplot(111)
+     ax.plot(plot_data.index, plot_data['act_v'],
+             label='Actual', color='red')
+     ax.plot(plot_data.index, plot_data['exp_v1'],
+             label=model_nme_1, color='blue')
+     ax.plot(plot_data.index, plot_data['exp_v2'],
+             label=model_nme_2, color='black')
+     ax.set_title(
+         'Double Lift Chart of %s' % tgt_nme, fontsize=8)
+     plt.xticks(plot_data.index,
+                plot_data.index,
+                rotation=90, fontsize=6)
+     plt.xlabel('%s / %s' % (model_nme_1, model_nme_2), fontsize=6)
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper left',
+                fontsize=5, frameon=False)
+     plt.margins(0.1)
+     plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
+     ax2 = ax.twinx()
+     ax2.bar(plot_data.index, plot_data['weight'],
+             alpha=0.5, color='seagreen',
+             label='Earned Exposure')
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper right',
+                fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'06_{tgt_nme}_{fig_nme}.png')
+     ensure_parent_dir(save_path)
+     plt.savefig(save_path, dpi=300)
+     plt.close(fig)
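A hypothetical smoke test for the two chart helpers, with synthetic numbers and invented model names; both functions write PNGs under `./plot` in the current working directory:

```python
import numpy as np

rng = np.random.default_rng(1)
n = 5000
w = rng.uniform(0.5, 1.5, n)                        # earned exposure
pred1 = rng.gamma(2.0, 0.5, n)                      # model A predictions
pred2 = pred1 * rng.normal(1.0, 0.1, n).clip(0.5)   # correlated model B
act = pred1 * rng.normal(1.0, 0.3, n).clip(0.0)     # noisy "actuals"

plot_lift_list(pred1, pred1 * w, act * w, w, tgt_nme='demo')
plot_dlift_list(pred1, pred2, 'ModelA', 'ModelB', 'demo', w, act * w)
```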
+
+
+ # ResNet model definition starts here.
+ # Residual block: two linear layers + ReLU + a residual connection.
+ # ResBlock subclasses nn.Module.
+ class ResBlock(nn.Module):
+     def __init__(self, dim: int, dropout: float = 0.1,
+                  use_layernorm: bool = False, residual_scale: float = 0.1
+                  ):
+         super().__init__()
+         self.use_layernorm = use_layernorm
+
+         if use_layernorm:
+             Norm = nn.LayerNorm  # normalises over the last dimension
+         else:
+             Norm = nn.BatchNorm1d  # keep the switch so BatchNorm can be tried as well
+
+         self.norm1 = Norm(dim)
+         self.fc1 = nn.Linear(dim, dim, bias=True)
+         self.act = nn.ReLU(inplace=True)
+         self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+         self.norm2 = Norm(dim)
+         self.fc2 = nn.Linear(dim, dim, bias=True)
+
+         # Residual scaling keeps the trunk from blowing up early in training.
+         self.res_scale = nn.Parameter(
+             torch.tensor(residual_scale, dtype=torch.float32)
+         )
+
+     def forward(self, x):
+         # Pre-activation layout.
+         out = self.norm1(x)
+         out = self.fc1(out)
+         out = self.act(out)
+         out = self.dropout(out)
+         out = self.norm2(out)
+         out = self.fc2(out)
+         # Scale the residual branch, then add.
+         return F.relu(x + self.res_scale * out)
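An illustrative shape check, assuming the class above is in scope: with LayerNorm enabled the block maps `(batch, dim)` to `(batch, dim)`, and the closing ReLU keeps outputs non-negative.

```python
block = ResBlock(dim=64, dropout=0.1, use_layernorm=True)
x = torch.randn(8, 64)
out = block(x)
print(out.shape, bool((out >= 0).all()))  # torch.Size([8, 64]) True
```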
+
+ # ResNetSequential subclasses nn.Module and defines the full network.
+
+
+ class ResNetSequential(nn.Module):
+     # Input tensor shape: (batch, input_dim).
+     # Architecture: fully connected + norm + ReLU, then a stack of residual
+     # blocks, finishing with a Softplus output.
+
+     def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSequential, self).__init__()
+
+         self.net = nn.Sequential()
+         self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
+
+         if use_layernorm:
+             self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
+         else:
+             self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
+
+         self.net.add_module('relu1', nn.ReLU(inplace=True))
+
+         # Stack of residual blocks.
+         for i in range(block_num):
+             self.net.add_module(
+                 f'ResBlk_{i+1}',
+                 ResBlock(
+                     hidden_dim,
+                     dropout=dropout,
+                     use_layernorm=use_layernorm,
+                     residual_scale=residual_scale)
+             )
+
+         self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
+         self.net.add_module('softplus', nn.Softplus())
+
+     def forward(self, x):
+         return self.net(x)
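A forward-pass check of the full stack; the Softplus head keeps outputs strictly positive, which the Tweedie loss requires:

```python
net = ResNetSequential(input_dim=10, hidden_dim=64, block_num=2)
y = net(torch.randn(4, 10))
print(y.shape, bool((y > 0).all()))  # torch.Size([4, 1]) True
```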
+
+ # Scikit-learn style wrapper for the ResNet model.
+
+
+ class ResNetSklearn(nn.Module):
+     def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
+                  block_num: int = 2, batch_num: int = 100, epochs: int = 100,
+                  tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSklearn, self).__init__()
+
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.block_num = block_num
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.model_nme = model_nme
+         self.learning_rate = learning_rate
+         self.patience = patience
+         self.use_layernorm = use_layernorm
+         self.dropout = dropout
+         self.residual_scale = residual_scale
+
+         # Device selection: cuda > mps > cpu.
+         if torch.cuda.is_available():
+             self.device = torch.device('cuda')
+         elif torch.backends.mps.is_available():
+             self.device = torch.device('mps')
+         else:
+             self.device = torch.device('cpu')
+
+         # Tweedie power by naming convention: 'f' (frequency) means Poisson,
+         # 's' (severity) means Gamma, otherwise use the supplied tweedie_power.
+         if 'f' in self.model_nme:
+             self.tw_power = 1
+         elif 's' in self.model_nme:
+             self.tw_power = 2
+         else:
+             self.tw_power = tweedie_power
+
+         # Build the network.
+         self.resnet = ResNetSequential(
+             self.input_dim,
+             self.hidden_dim,
+             self.block_num,
+             use_layernorm=self.use_layernorm,
+             dropout=self.dropout,
+             residual_scale=self.residual_scale
+         ).to(self.device)
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # === 1. Training set: keep it on the CPU; the DataLoader moves batches to the GPU. ===
+         # Note: converting a pandas DataFrame to a tensor copies the data, so
+         # later view mutations cannot corrupt it.
+         X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
+         y_tensor = torch.tensor(
+             y_train.values, dtype=torch.float32).view(-1, 1)
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values, dtype=torch.float32).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # === 2. Validation set: build on the CPU, move to the device in one go later. ===
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
+             y_val_tensor = torch.tensor(
+                 y_val.values, dtype=torch.float32).view(-1, 1)
+             if w_val is not None:
+                 w_val_tensor = torch.tensor(
+                     w_val.values, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+         else:
+             X_val_tensor = y_val_tensor = w_val_tensor = None
+
+         # === 3. Build the DataLoader. ===
+         dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
+         batch_size = max(
+             4096,
+             int((self.learning_rate / (1e-4)) ** 0.5 *
+                 (X_train.shape[0] / self.batch_num))
+         )
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,  # tabular data rarely needs more than 0-1 workers
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # === 4. Optimizer and AMP. ===
+         # Adam + AMP is recommended mainly to keep the loss stable while
+         # preserving GPU throughput.
+         optimizer = torch.optim.Adam(
+             self.resnet.parameters(), lr=self.learning_rate)
+         scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # === 5. Early-stopping state. ===
+         best_loss, patience_counter = float('inf'), 0
+         best_model_state = None
+
+         # If a validation set exists, move it to the device in one transfer.
+         if has_val:
+             X_val_dev = X_val_tensor.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # === 6. Training loop. ===
+         for epoch in range(1, self.epochs + 1):
+             self.resnet.train()
+             for X_batch, y_batch, w_batch in dataloader:
+                 optimizer.zero_grad()
+
+                 X_batch = X_batch.to(self.device, non_blocking=True)
+                 y_batch = y_batch.to(self.device, non_blocking=True)
+                 w_batch = w_batch.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.resnet(X_batch)
+                     y_pred = torch.clamp(y_pred, min=1e-6)
+
+                     losses = tweedie_loss(
+                         y_pred, y_batch, p=self.tw_power).view(-1)
+                     weighted_loss = (losses * w_batch.view(-1)
+                                      ).sum() / w_batch.sum()
+
+                 scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     scaler.unscale_(optimizer)
+                     clip_grad_norm_(self.resnet.parameters(), max_norm=1.0)
+
+                 scaler.step(optimizer)
+                 scaler.update()
+
+             # === 7. Validation loss and early stopping. ===
+             if has_val:
+                 self.resnet.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.resnet(X_val_dev)
+                     y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+
+                     val_loss_values = tweedie_loss(
+                         y_val_pred, y_val_dev, p=self.tw_power
+                     ).view(-1)
+                     val_weighted_loss = (
+                         val_loss_values * w_val_dev.view(-1)
+                     ).sum() / w_val_dev.sum()
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.resnet.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.resnet.load_state_dict(best_model_state)
+                     break
+         if has_val and best_model_state is not None:
+             self.resnet.load_state_dict(best_model_state)
+
+     # ---------------- Prediction ----------------
+
+     def predict(self, X_test):
+         self.resnet.eval()
+         with torch.no_grad():
+             X_tensor = torch.tensor(
+                 X_test.values, dtype=torch.float32).to(self.device)
+             y_pred = self.resnet(X_tensor).cpu().numpy()
+
+         y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.flatten()
+
+     # ---------------- Parameter setting ----------------
+
+     def set_params(self, params):
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
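A hypothetical end-to-end run of the wrapper on synthetic data (all names invented). Note that a `model_nme` containing 'f' or 's' silently switches the Tweedie power, so the demo name avoids both letters; and since the DataLoader uses `num_workers=1`, wrap this in an `if __name__ == '__main__':` guard on platforms that spawn worker processes.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(2000, 5)),
                 columns=[f'x{i}' for i in range(5)])
y = pd.Series(np.exp(X['x0']) * rng.gamma(2.0, 0.5, 2000))
w = pd.Series(np.ones(2000))

net = ResNetSklearn(model_nme='demo', input_dim=5, epochs=5)
net.fit(X, y, w)        # no validation set, so all epochs run
pred = net.predict(X)   # positive array of shape (2000,)
```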
+
+ # FT-Transformer model definition starts here.
+
+
+ class FeatureTokenizer(nn.Module):
+     # Maps numeric and categorical features to tokens of shape
+     # (batch, n_tokens, d_model). Conventions:
+     # - X_num holds the numeric features, shape (batch, num_numeric)
+     # - X_cat holds the categorical features, shape (batch, num_categorical),
+     #   each column an integer label code in [0, card - 1]
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int):
+         super().__init__()
+
+         self.num_numeric = num_numeric
+         self.has_numeric = num_numeric > 0
+
+         if self.has_numeric:
+             self.num_linear = nn.Linear(num_numeric, d_model)
+
+         self.embeddings = nn.ModuleList([
+             nn.Embedding(card, d_model) for card in cat_cardinalities
+         ])
+
+     def forward(self, X_num, X_cat):
+         tokens = []
+
+         if self.has_numeric:
+             # All numeric features are mapped to a single token.
+             num_token = self.num_linear(X_num)  # shape (batch, d_model)
+             tokens.append(num_token)
+
+         # Each categorical feature contributes one embedding token.
+         for i, emb in enumerate(self.embeddings):
+             tok = emb(X_cat[:, i])  # shape (batch, d_model)
+             tokens.append(tok)
+
+         # Stack into (batch, n_tokens, d_model).
+         x = torch.stack(tokens, dim=1)
+         return x
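Token layout, illustrated: all numeric features share a single token and each categorical feature adds one more, so the sketch below yields three tokens.

```python
tok = FeatureTokenizer(num_numeric=3, cat_cardinalities=[5, 7], d_model=16)
X_num = torch.randn(4, 3)
X_cat = torch.randint(0, 5, (4, 2))  # codes must stay below each cardinality
print(tok(X_num, X_cat).shape)  # torch.Size([4, 3, 16]): 1 numeric + 2 categorical tokens
```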
+
+ # Encoder layer with residual scaling.
+
+
+ class ScaledTransformerEncoderLayer(nn.Module):
+     def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048,
+                  dropout: float = 0.1, residual_scale_attn: float = 1.0,
+                  residual_scale_ffn: float = 1.0, norm_first: bool = True,
+                  ):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(
+             embed_dim=d_model,
+             num_heads=nhead,
+             dropout=dropout,
+             batch_first=True
+         )
+
+         # Feed-forward block.
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         # Normalisation and dropout.
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = nn.GELU()
+         self.norm_first = norm_first
+
+         # Residual scaling coefficients.
+         self.res_scale_attn = residual_scale_attn
+         self.res_scale_ffn = residual_scale_ffn
+
+     def forward(self, src, src_mask=None, src_key_padding_mask=None):
+         # Input tensor shape: (batch, seq_len, d_model).
+         x = src
+
+         if self.norm_first:
+             # Pre-norm: normalise before attention.
+             x = x + self._sa_block(self.norm1(x), src_mask,
+                                    src_key_padding_mask)
+             x = x + self._ff_block(self.norm2(x))
+         else:
+             # Post-norm (normally not enabled).
+             x = self.norm1(
+                 x + self._sa_block(x, src_mask, src_key_padding_mask))
+             x = self.norm2(x + self._ff_block(x))
+
+         return x
+
+     def _sa_block(self, x, attn_mask, key_padding_mask):
+         # Self-attention with residual scaling.
+         attn_out, _ = self.self_attn(
+             x, x, x,
+             attn_mask=attn_mask,
+             key_padding_mask=key_padding_mask,
+             need_weights=False
+         )
+         return self.res_scale_attn * self.dropout1(attn_out)
+
+     def _ff_block(self, x):
+         # Feed-forward with residual scaling.
+         x2 = self.linear2(self.dropout(self.activation(self.linear1(x))))
+         return self.res_scale_ffn * self.dropout2(x2)
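The layer is shape-preserving, so it can be checked in isolation (illustrative values):

```python
layer = ScaledTransformerEncoderLayer(d_model=32, nhead=4, dim_feedforward=128,
                                      residual_scale_attn=0.5,
                                      residual_scale_ffn=0.5)
x = torch.randn(2, 10, 32)   # (batch, seq_len, d_model)
print(layer(x).shape)        # torch.Size([2, 10, 32])
```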
+
+ # Core FT-Transformer model.
+
+
+ class FTTransformerCore(nn.Module):
+     # A minimal working FT-Transformer:
+     # - FeatureTokenizer: turns numeric and categorical features into tokens
+     # - TransformerEncoder: captures interactions between features
+     # - pooling + MLP + Softplus: keeps the output positive (fits Tweedie/Gamma)
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int = 64,
+                  n_heads: int = 8, n_layers: int = 4, dropout: float = 0.1,
+                  ):
+         super().__init__()
+
+         self.tokenizer = FeatureTokenizer(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=d_model
+         )
+         scale = 1.0 / math.sqrt(n_layers)  # a sensible default
+         encoder_layer = ScaledTransformerEncoderLayer(
+             d_model=d_model,
+             nhead=n_heads,
+             dim_feedforward=d_model * 4,
+             dropout=dropout,
+             residual_scale_attn=scale,
+             residual_scale_ffn=scale,
+             norm_first=True,
+         )
+         self.encoder = nn.TransformerEncoder(
+             encoder_layer,
+             num_layers=n_layers
+         )
+         self.n_layers = n_layers
+
+         self.head = nn.Sequential(
+             nn.LayerNorm(d_model),
+             nn.Linear(d_model, d_model),
+             nn.GELU(),
+             nn.Linear(d_model, 1),
+             nn.Softplus()  # keeps the output positive, suitable for Tweedie / Gamma
+         )
+
+     def forward(self, X_num, X_cat):
+
+         # X_num: (batch, num_numeric), float32
+         # X_cat: (batch, num_categorical), long
+
+         tokens = self.tokenizer(X_num, X_cat)  # (batch, n_tokens, d_model)
+         x = self.encoder(tokens)               # (batch, n_tokens, d_model)
+
+         # Mean-pool over the tokens.
+         x = x.mean(dim=1)                      # (batch, d_model)
+
+         out = self.head(x)                     # (batch, 1), positive via Softplus
+         return out
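A minimal forward pass through the core model, with assumed cardinalities:

```python
core = FTTransformerCore(num_numeric=2, cat_cardinalities=[4, 6],
                         d_model=32, n_heads=4, n_layers=2)
X_num = torch.randn(8, 2)
X_cat = torch.stack([torch.randint(0, 4, (8,)),
                     torch.randint(0, 6, (8,))], dim=1)
out = core(X_num, X_cat)
print(out.shape, bool((out > 0).all()))  # torch.Size([8, 1]) True
```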
+
+ # Dataset wrapper for tabular tensors.
+
+
+ class TabularDataset(Dataset):
+     def __init__(self, X_num, X_cat, y, w):
+
+         # X_num: torch.float32, shape (N, num_numeric)
+         # X_cat: torch.long, shape (N, num_categorical)
+         # y: torch.float32, shape (N, 1)
+         # w: torch.float32, shape (N, 1)
+
+         self.X_num = X_num
+         self.X_cat = X_cat
+         self.y = y
+         self.w = w
+
+     def __len__(self):
+         return self.y.shape[0]
+
+     def __getitem__(self, idx):
+         return (
+             self.X_num[idx],
+             self.X_cat[idx],
+             self.y[idx],
+             self.w[idx],
+         )
+
+ # Scikit-learn style wrapper for the FT-Transformer.
+
+
+ class FTTransformerSklearn(nn.Module):
+
+     # sklearn-style wrapper:
+     # - num_cols: list of numeric feature column names
+     # - cat_cols: list of categorical feature column names (label-encoded in
+     #   advance to values in [0, n_classes - 1])
+
+     def __init__(self, model_nme: str, num_cols, cat_cols, d_model: int = 64, n_heads: int = 8,
+                  n_layers: int = 4, dropout: float = 0.1, batch_num: int = 100, epochs: int = 100,
+                  tweedie_power: float = 1.5, learning_rate: float = 1e-3, patience: int = 10,
+                  ):
+         super().__init__()
+
+         self.model_nme = model_nme
+         self.num_cols = list(num_cols)
+         self.cat_cols = list(cat_cols)
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.dropout = dropout
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.patience = patience
+         # Tweedie power by naming convention, as in ResNetSklearn.
+         if 'f' in self.model_nme:
+             self.tw_power = 1.0
+         elif 's' in self.model_nme:
+             self.tw_power = 2.0
+         else:
+             self.tw_power = tweedie_power
+         if torch.cuda.is_available():
+             self.device = torch.device("cuda")
+         elif torch.backends.mps.is_available():
+             self.device = torch.device("mps")
+         else:
+             self.device = torch.device("cpu")
+         self.cat_cardinalities = None
+         self.cat_categories = {}
+         self.ft = None
+
+     def _build_model(self, X_train):
+         num_numeric = len(self.num_cols)
+         cat_cardinalities = []
+
+         for col in self.cat_cols:
+             cats = X_train[col].astype('category')
+             categories = cats.cat.categories
+             self.cat_categories[col] = categories  # remember the training categories
+
+             card = len(categories) + 1  # reserve one extra class for unknown/missing
+             cat_cardinalities.append(card)
+
+         self.cat_cardinalities = cat_cardinalities
+
+         self.ft = FTTransformerCore(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=self.d_model,
+             n_heads=self.n_heads,
+             n_layers=self.n_layers,
+             dropout=self.dropout,
+         ).to(self.device)
+
+     def _encode_cats(self, X):
+         # The input DataFrame must contain all categorical feature columns.
+         # Returns an int64 array of shape (N, num_categorical).
+
+         if not self.cat_cols:
+             return np.zeros((len(X), 0), dtype='int64')
+
+         X_cat_list = []
+         for col in self.cat_cols:
+             # Use the category universe recorded at training time.
+             categories = self.cat_categories[col]
+             # Build a Categorical with the fixed categories.
+             cats = pd.Categorical(X[col], categories=categories)
+             codes = cats.codes.astype('int64', copy=True)  # -1 marks unknown or missing
+             # Map unknown/missing values to the extra "unknown" index len(categories).
+             codes[codes < 0] = len(categories)
+             X_cat_list.append(codes)
+
+         X_cat_np = np.stack(X_cat_list, axis=1)  # shape (N, num_categorical)
+         return X_cat_np
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # Build the underlying model on the first fit.
+         if self.ft is None:
+             self._build_model(X_train)
+
+         # --- Build the training tensors (all on CPU; batches are moved later). ---
+         # Copy the data so it is decoupled from the source DataFrame; scaling
+         # or sampling elsewhere then cannot contaminate the original data.
+         X_num_train = X_train[self.num_cols].to_numpy(
+             dtype=np.float32, copy=True)
+         X_num_train = torch.tensor(
+             X_num_train,
+             dtype=torch.float32
+         )
+
+         if self.cat_cols:
+             X_cat_train_np = self._encode_cats(X_train)
+             X_cat_train = torch.tensor(X_cat_train_np, dtype=torch.long)
+         else:
+             X_cat_train = torch.zeros(
+                 (X_num_train.size(0), 0), dtype=torch.long)
+
+         y_tensor = torch.tensor(
+             y_train.values,
+             dtype=torch.float32
+         ).view(-1, 1)
+
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values,
+                 dtype=torch.float32
+             ).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # --- Validation tensors (moved to the device in one go). ---
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             # ---------- Numeric features ----------
+             X_num_val_np = X_val[self.num_cols].to_numpy(
+                 dtype=np.float32, copy=True)
+             X_num_val = torch.tensor(X_num_val_np, dtype=torch.float32)
+
+             # ---------- Categorical features ----------
+             if self.cat_cols:
+                 X_cat_val_np = self._encode_cats(X_val)
+                 X_cat_val = torch.tensor(X_cat_val_np, dtype=torch.long)
+             else:
+                 X_cat_val = torch.zeros(
+                     (X_num_val.shape[0], 0), dtype=torch.long)
+
+             # ---------- Target & weights ----------
+             y_val_np = y_val.values.astype(np.float32, copy=True)
+             y_val_tensor = torch.tensor(
+                 y_val_np, dtype=torch.float32).view(-1, 1)
+
+             if w_val is not None:
+                 w_val_np = w_val.values.astype(np.float32, copy=True)
+                 w_val_tensor = torch.tensor(
+                     w_val_np, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+
+         else:
+             X_num_val = X_cat_val = y_val_tensor = w_val_tensor = None
+
+         # --- Build the DataLoader. ---
+         dataset = TabularDataset(
+             X_num_train, X_cat_train, y_tensor, w_tensor
+         )
+
+         batch_size = max(
+             32,
+             int((self.learning_rate / 1e-4) ** 0.5 *
+                 (X_train.shape[0] / self.batch_num))
+         )
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # --- Optimizer and AMP. ---
+         # Same as the ResNet wrapper: Adam + AMP to avoid numerical instability.
+         optimizer = torch.optim.Adam(
+             self.ft.parameters(),
+             lr=self.learning_rate
+         )
+         scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # --- Early-stopping state. ---
+         best_loss = float('inf')
+         patience_counter = 0
+         best_model_state = None
+
+         # If a validation set exists, move it to the device as a whole.
+         if has_val:
+             X_num_val_dev = X_num_val.to(self.device, non_blocking=True)
+             X_cat_val_dev = X_cat_val.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # --- Training loop. ---
+         for epoch in range(1, self.epochs + 1):
+             self.ft.train()
+             for X_num_b, X_cat_b, y_b, w_b in dataloader:
+                 optimizer.zero_grad()
+
+                 X_num_b = X_num_b.to(self.device, non_blocking=True)
+                 X_cat_b = X_cat_b.to(self.device, non_blocking=True)
+                 y_b = y_b.to(self.device, non_blocking=True)
+                 w_b = w_b.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.ft(X_num_b, X_cat_b)
+                     y_pred = torch.clamp(y_pred, min=1e-6)
+
+                     losses = tweedie_loss(
+                         y_pred, y_b, p=self.tw_power
+                     ).view(-1)
+
+                     weighted_loss = (losses * w_b.view(-1)).sum() / w_b.sum()
+
+                 scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     scaler.unscale_(optimizer)
+                     clip_grad_norm_(self.ft.parameters(), max_norm=1.0)
+
+                 scaler.step(optimizer)
+                 scaler.update()
+
+             # --- Validation and early stopping. ---
+             if has_val:
+                 self.ft.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.ft(X_num_val_dev, X_cat_val_dev)
+                     y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+
+                     val_losses = tweedie_loss(
+                         y_val_pred, y_val_dev, p=self.tw_power
+                     ).view(-1)
+
+                     val_weighted_loss = (
+                         val_losses * w_val_dev.view(-1)
+                     ).sum() / w_val_dev.sum()
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.ft.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.ft.load_state_dict(best_model_state)
+                     break
+         if has_val and best_model_state is not None:
+             self.ft.load_state_dict(best_model_state)
+
+     def predict(self, X_test):
+         # X_test must contain all numeric and categorical columns.
+
+         self.ft.eval()
+         X_num = X_test[self.num_cols].to_numpy(dtype=np.float32, copy=True)
+         X_num = torch.tensor(
+             X_num,
+             dtype=torch.float32
+         )
+         if self.cat_cols:
+             X_cat_np = self._encode_cats(X_test)
+             X_cat = torch.tensor(X_cat_np, dtype=torch.long)
+         else:
+             X_cat = torch.zeros((X_num.size(0), 0), dtype=torch.long)
+
+         with torch.no_grad():
+             X_num = X_num.to(self.device, non_blocking=True)
+             X_cat = X_cat.to(self.device, non_blocking=True)
+             y_pred = self.ft(X_num, X_cat).cpu().numpy()
+
+         y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.ravel()
+
+     def set_params(self, params: dict):
+
+         # Kept consistent with the sklearn convention.
+         # Note: after changing structural parameters (e.g. d_model / n_heads),
+         # the model must be re-fit for them to take effect.
+
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         return self
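A hypothetical fit/predict round trip on a mixed-type frame (invented names); categories unseen at predict time fall into the reserved "unknown" embedding index. The same `num_workers` caveat as for the ResNet wrapper applies.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'age': rng.normal(size=1000),
                   'vehicle': rng.choice(['suv', 'sedan', 'truck'], 1000)})
y = pd.Series(rng.gamma(2.0, 0.5, 1000))
w = pd.Series(np.ones(1000))

ft = FTTransformerSklearn(model_nme='demo', num_cols=['age'],
                          cat_cols=['vehicle'], d_model=32, n_heads=4,
                          n_layers=2, epochs=3)
ft.fit(df, y, w)
pred = ft.predict(df)  # positive array of shape (1000,)
```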
+
+
+ # ===== Basic components and training wrappers ================================
+
+ @dataclass
+ class BayesOptConfig:
+     model_nme: str
+     resp_nme: str
+     weight_nme: str
+     factor_nmes: List[str]
+     cate_list: Optional[List[str]] = None
+     prop_test: float = 0.25
+     rand_seed: Optional[int] = None
+     epochs: int = 100
+     use_gpu: bool = True
+
+
+ class OutputManager:
+     # Centralises the output paths for results, plots and models.
+
+     def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
+         self.root = Path(root or os.getcwd())
+         self.model_name = model_name
+         self.plot_dir = self.root / 'plot'
+         self.result_dir = self.root / 'Results'
+         self.model_dir = self.root / 'model'
+
+     def _prepare(self, path: Path) -> str:
+         ensure_parent_dir(str(path))
+         return str(path)
+
+     def plot_path(self, filename: str) -> str:
+         return self._prepare(self.plot_dir / filename)
+
+     def result_path(self, filename: str) -> str:
+         return self._prepare(self.result_dir / filename)
+
+     def model_path(self, filename: str) -> str:
+         return self._prepare(self.model_dir / filename)
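Usage is simple but worth spelling out: each accessor returns a string path and creates the parent directory on demand.

```python
om = OutputManager(model_name='demo')
print(om.plot_path('lift.png'))      # <cwd>/plot/lift.png, ./plot now exists
print(om.result_path('params.csv'))  # <cwd>/Results/params.csv
print(om.model_path('model.pkl'))    # <cwd>/model/model.pkl
```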
+
+
+ class DatasetPreprocessor:
+     # Prepares the shared train/test data views used by the trainers.
+
+     def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
+                  config: BayesOptConfig) -> None:
+         self.config = config
+         self.train_data = train_df.copy(deep=True)
+         self.test_data = test_df.copy(deep=True)
+         self.num_features: List[str] = []
+         self.train_oht_scl_data: Optional[pd.DataFrame] = None
+         self.test_oht_scl_data: Optional[pd.DataFrame] = None
+         self.var_nmes: List[str] = []
+         self.cat_categories_for_shap: Dict[str, List[Any]] = {}
+
+     def run(self) -> "DatasetPreprocessor":
+         cfg = self.config
+         # Precompute the weighted actuals; plotting and validation rely on this field.
+         self.train_data.loc[:, 'w_act'] = self.train_data[cfg.resp_nme] * \
+             self.train_data[cfg.weight_nme]
+         self.test_data.loc[:, 'w_act'] = self.test_data[cfg.resp_nme] * \
+             self.test_data[cfg.weight_nme]
+         # Clip at a high quantile to absorb outliers; without it, extreme
+         # points would dominate the loss.
+         q99 = self.train_data[cfg.resp_nme].quantile(0.999)
+         self.train_data[cfg.resp_nme] = self.train_data[cfg.resp_nme].clip(
+             upper=q99)
+         cate_list = list(cfg.cate_list or [])
+         if cate_list:
+             for cate in cate_list:
+                 self.train_data[cate] = self.train_data[cate].astype(
+                     'category')
+                 self.test_data[cate] = self.test_data[cate].astype('category')
+                 cats = self.train_data[cate].cat.categories
+                 self.cat_categories_for_shap[cate] = list(cats)
+         self.num_features = [
+             nme for nme in cfg.factor_nmes if nme not in cate_list]
+         train_oht = self.train_data[cfg.factor_nmes +
+                                     [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         test_oht = self.test_data[cfg.factor_nmes +
+                                   [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         train_oht = pd.get_dummies(
+             train_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         test_oht = pd.get_dummies(
+             test_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         for num_chr in self.num_features:
+             # Standardise column by column so every feature is on the same
+             # scale; otherwise the neural networks struggle to converge.
+             scaler = StandardScaler()
+             train_oht[num_chr] = scaler.fit_transform(
+                 train_oht[num_chr].values.reshape(-1, 1))
+             test_oht[num_chr] = scaler.transform(
+                 test_oht[num_chr].values.reshape(-1, 1))
+         # Reindex fills missing dummy columns with zeros, keeping the test
+         # set's columns identical to the training set's.
+         test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)
+         self.train_oht_scl_data = train_oht
+         self.test_oht_scl_data = test_oht
+         self.var_nmes = list(
+             set(list(train_oht.columns)) - set([cfg.weight_nme, cfg.resp_nme])
+         )
+         return self
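A sketch of the preprocessor on toy frames (invented column names):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

def make(n):
    return pd.DataFrame({'age': rng.normal(40, 10, n),
                         'region': rng.choice(['north', 'south'], n),
                         'exposure': np.ones(n),
                         'loss_ratio': rng.gamma(2.0, 0.5, n)})

cfg = BayesOptConfig(model_nme='demo', resp_nme='loss_ratio',
                     weight_nme='exposure', factor_nmes=['age', 'region'],
                     cate_list=['region'])
prep = DatasetPreprocessor(make(500), make(200), cfg).run()
print(prep.train_oht_scl_data.columns.tolist())
# ['age', 'exposure', 'loss_ratio', 'region_south']: age scaled, region dummy-encoded
```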
+
+
+ class TrainerBase:
+     def __init__(self, context: "BayesOptModel", label: str) -> None:
+         self.ctx = context
+         self.label = label
+
+     @property
+     def config(self) -> BayesOptConfig:
+         return self.ctx.config
+
+     @property
+     def output(self) -> OutputManager:
+         return self.ctx.output_manager
+
+     def tune(self, max_evals: int) -> None:  # pragma: no cover - overridden by subclasses
+         raise NotImplementedError
+
+     def train(self) -> None:  # pragma: no cover - overridden by subclasses
+         raise NotImplementedError
+
+     def save(self) -> None:
+         pass
+
+     def load(self) -> None:
+         pass
+
+
+ class XGBTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'Xgboost')
+         self.model: Optional[xgb.XGBRegressor] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def _build_estimator(self) -> xgb.XGBRegressor:
+         params = dict(
+             objective=self.ctx.obj,
+             random_state=self.ctx.rand_seed,
+             subsample=0.9,
+             tree_method='gpu_hist' if self.ctx.use_gpu else 'hist',
+             enable_categorical=True,
+             predictor='gpu_predictor' if self.ctx.use_gpu else 'cpu_predictor'
+         )
+         if self.ctx.use_gpu:
+             params['gpu_id'] = 0
+         return xgb.XGBRegressor(**params)
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-5, 1e-1, log=True)
+         gamma = trial.suggest_float('gamma', 0, 10000)
+         max_depth = trial.suggest_int('max_depth', 3, 25)
+         n_estimators = trial.suggest_int('n_estimators', 10, 500, step=10)
+         min_child_weight = trial.suggest_int(
+             'min_child_weight', 100, 10000, step=100)
+         reg_alpha = trial.suggest_float('reg_alpha', 1e-10, 1, log=True)
+         reg_lambda = trial.suggest_float('reg_lambda', 1e-10, 1, log=True)
+         if self.ctx.obj == 'reg:tweedie':
+             tweedie_variance_power = trial.suggest_float(
+                 'tweedie_variance_power', 1, 2)
+         elif self.ctx.obj == 'count:poisson':
+             tweedie_variance_power = 1
+         elif self.ctx.obj == 'reg:gamma':
+             tweedie_variance_power = 2
+         else:
+             tweedie_variance_power = 1.5
+         clf = self._build_estimator()
+         params = {
+             'learning_rate': learning_rate,
+             'gamma': gamma,
+             'max_depth': max_depth,
+             'n_estimators': n_estimators,
+             'min_child_weight': min_child_weight,
+             'reg_alpha': reg_alpha,
+             'reg_lambda': reg_lambda
+         }
+         if self.ctx.obj == 'reg:tweedie':
+             params['tweedie_variance_power'] = tweedie_variance_power
+         clf.set_params(**params)
+         n_jobs = 1 if self.ctx.use_gpu else int(1 / self.ctx.prop_test)
+         acc = cross_val_score(
+             clf,
+             self.ctx.train_data[self.ctx.factor_nmes],
+             self.ctx.train_data[self.ctx.resp_nme].values,
+             fit_params=self.ctx.fit_params,
+             cv=self.ctx.cv,
+             scoring=make_scorer(
+                 mean_tweedie_deviance,
+                 power=tweedie_variance_power,
+                 greater_is_better=False),
+             error_score='raise',
+             n_jobs=n_jobs
+         ).mean()
+         return -acc
+
+     def tune(self, max_evals: int = 100) -> None:
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+         )
+         study.optimize(self.cross_val, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_xgb.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError('Run tune() first to obtain the best XGB parameters.')
+         self.model = self._build_estimator()
+         self.model.set_params(**self.best_params)
+         self.model.fit(self.ctx.train_data[self.ctx.factor_nmes],
+                        self.ctx.train_data[self.ctx.resp_nme].values,
+                        **self.ctx.fit_params)
+         self.ctx.model_label += [self.label]
+         self.ctx.train_data['pred_xgb'] = self.model.predict(
+             self.ctx.train_data[self.ctx.factor_nmes])
+         self.ctx.test_data['pred_xgb'] = self.model.predict(
+             self.ctx.test_data[self.ctx.factor_nmes])
+         self.ctx.train_data.loc[:, 'w_pred_xgb'] = self.ctx.train_data['pred_xgb'] * \
+             self.ctx.train_data[self.ctx.weight_nme]
+         self.ctx.test_data.loc[:, 'w_pred_xgb'] = self.ctx.test_data['pred_xgb'] * \
+             self.ctx.test_data[self.ctx.weight_nme]
+         self.ctx.xgb_best = self.model
+
+     def save(self) -> None:
+         if self.model is not None:
+             joblib.dump(self.model, self.output.model_path(
+                 f'01_{self.ctx.model_nme}_Xgboost.pkl'))
+
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_Xgboost.pkl')
+         if os.path.exists(path):
+             self.model = joblib.load(path)
+             self.ctx.xgb_best = self.model
+         else:
+             print(f"[load_model] Warning: Xgboost model file not found: {path}")
+
+
+ class ResNetTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'ResNet')
+         self.model: Optional[ResNetSklearn] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-6, 1e-2, log=True)
+         hidden_dim = trial.suggest_int('hidden_dim', 32, 256, step=32)
+         block_num = trial.suggest_int('block_num', 2, 10)
+         batch_num = trial.suggest_int(
+             'batch_num',
+             10 if self.ctx.obj == 'reg:gamma' else 100,
+             100 if self.ctx.obj == 'reg:gamma' else 1000,
+             step=10 if self.ctx.obj == 'reg:gamma' else 100
+         )
+         if self.ctx.obj == 'reg:tweedie':
+             tw_power = trial.suggest_float('tw_power', 1, 2.0)
+         elif self.ctx.obj == 'count:poisson':
+             tw_power = 1
+         elif self.ctx.obj == 'reg:gamma':
+             tw_power = 2
+         else:
+             tw_power = 1.5
+         loss = 0
+         for _, (train_idx, test_idx) in enumerate(
+                 self.ctx.cv.split(self.ctx.train_oht_scl_data[self.ctx.var_nmes])):
+             cv_net = ResNetSklearn(
+                 model_nme=self.ctx.model_nme,
+                 input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1],
+                 epochs=self.ctx.epochs,
+                 learning_rate=learning_rate,
+                 hidden_dim=hidden_dim,
+                 block_num=block_num,
+                 batch_num=batch_num,
+                 tweedie_power=tw_power
+             )
+             try:
+                 cv_net.fit(
+                     self.ctx.train_oht_scl_data[self.ctx.var_nmes].iloc[train_idx],
+                     self.ctx.train_oht_scl_data[self.ctx.resp_nme].iloc[train_idx],
+                     self.ctx.train_oht_scl_data[self.ctx.weight_nme].iloc[train_idx],
+                     self.ctx.train_oht_scl_data[self.ctx.var_nmes].iloc[test_idx],
+                     self.ctx.train_oht_scl_data[self.ctx.resp_nme].iloc[test_idx],
+                     self.ctx.train_oht_scl_data[self.ctx.weight_nme].iloc[test_idx]
+                 )
+                 y_pred_fold = cv_net.predict(
+                     self.ctx.train_oht_scl_data[self.ctx.var_nmes].iloc[test_idx]
+                 )
+                 loss += mean_tweedie_deviance(
+                     self.ctx.train_oht_scl_data[self.ctx.resp_nme].iloc[test_idx],
+                     y_pred_fold,
+                     sample_weight=self.ctx.train_oht_scl_data[self.ctx.weight_nme].iloc[test_idx],
+                     power=tw_power
+                 )
+             finally:
+                 # Release GPU resources after every fold.
+                 try:
+                     if hasattr(cv_net, "resnet"):
+                         cv_net.resnet.to("cpu")
+                 except Exception:
+                     pass
+                 del cv_net
+                 gc.collect()
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+         return loss / int(1 / self.ctx.prop_test)
+
+     def tune(self, max_evals: int = 100) -> None:
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed))
+         study.optimize(self.cross_val, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_resn.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError('Run tune() first to obtain the best ResNet parameters.')
+         self.model = ResNetSklearn(
+             model_nme=self.ctx.model_nme,
+             input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1]
+         )
+         self.model.set_params(self.best_params)
+         self.model.fit(self.ctx.train_oht_scl_data[self.ctx.var_nmes],
+                        self.ctx.train_oht_scl_data[self.ctx.resp_nme],
+                        self.ctx.train_oht_scl_data[self.ctx.weight_nme])
+         self.ctx.model_label += [self.label]
+         self.ctx.train_data['pred_resn'] = self.model.predict(
+             self.ctx.train_oht_scl_data[self.ctx.var_nmes])
+         self.ctx.test_data['pred_resn'] = self.model.predict(
+             self.ctx.test_oht_scl_data[self.ctx.var_nmes])
+         self.ctx.train_data.loc[:, 'w_pred_resn'] = self.ctx.train_data['pred_resn'] * \
+             self.ctx.train_data[self.ctx.weight_nme]
+         self.ctx.test_data.loc[:, 'w_pred_resn'] = self.ctx.test_data['pred_resn'] * \
+             self.ctx.test_data[self.ctx.weight_nme]
+         self.ctx.resn_best = self.model
+
+     def save(self) -> None:
+         if self.model is not None:
+             torch.save(
+                 self.model.resnet.state_dict(),
+                 self.output.model_path(f'01_{self.ctx.model_nme}_ResNet.pth')
+             )
+
+     def load(self) -> None:
+         path = self.output.model_path(f'01_{self.ctx.model_nme}_ResNet.pth')
+         if os.path.exists(path):
+             self.model = ResNetSklearn(
+                 model_nme=self.ctx.model_nme,
+                 input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1]
+             )
+             state_dict = torch.load(path, map_location=self.model.device)
+             self.model.resnet.load_state_dict(state_dict)
+             self.ctx.resn_best = self.model
+         else:
+             print(f"[load_model] Warning: ResNet model file not found: {path}")
+
+
1315
+ class FTTrainer(TrainerBase):
1316
+ def __init__(self, context: "BayesOptModel") -> None:
1317
+ super().__init__(context, 'FTTransformer')
1318
+ self.model: Optional[FTTransformerSklearn] = None
1319
+ self.best_params: Optional[Dict[str, Any]] = None
1320
+ self.best_trial = None
1321
+
1322
+ def cross_val(self, trial: optuna.trial.Trial) -> float:
1323
+ learning_rate = trial.suggest_float(
1324
+ 'learning_rate', 1e-6, 1e-4, log=True)
1325
+ d_model = trial.suggest_int('d_model', 32, 128, step=32)
1326
+ n_heads = trial.suggest_categorical('n_heads', [2, 4, 8])
1327
+ n_layers = trial.suggest_int('n_layers', 2, 6)
1328
+ dropout = trial.suggest_float('dropout', 0.0, 0.2)
1329
+ batch_num = trial.suggest_int(
1330
+ 'batch_num',
1331
+ 5 if self.ctx.obj == 'reg:gamma' else 10,
1332
+ 10 if self.ctx.obj == 'reg:gamma' else 100,
1333
+ step=1 if self.ctx.obj == 'reg:gamma' else 10
1334
+ )
1335
+ if self.ctx.obj == 'reg:tweedie':
1336
+ tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
1337
+ elif self.ctx.obj == 'count:poisson':
1338
+ tw_power = 1.0
1339
+ elif self.ctx.obj == 'reg:gamma':
1340
+ tw_power = 2.0
1341
+ else:
1342
+ tw_power = 1.5
1343
+ loss = 0.0
1344
+ for _, (train_idx, test_idx) in enumerate(
1345
+ self.ctx.cv.split(self.ctx.train_data[self.ctx.factor_nmes])):
1346
+ X_train_fold = self.ctx.train_data.iloc[train_idx][self.ctx.factor_nmes]
1347
+ y_train_fold = self.ctx.train_data.iloc[train_idx][self.ctx.resp_nme]
1348
+ w_train_fold = self.ctx.train_data.iloc[train_idx][self.ctx.weight_nme]
1349
+ X_val_fold = self.ctx.train_data.iloc[test_idx][self.ctx.factor_nmes]
1350
+ y_val_fold = self.ctx.train_data.iloc[test_idx][self.ctx.resp_nme]
1351
+ w_val_fold = self.ctx.train_data.iloc[test_idx][self.ctx.weight_nme]
1352
+ cv_ft = FTTransformerSklearn(
1353
+ model_nme=self.ctx.model_nme,
1354
+ num_cols=self.ctx.num_features,
1355
+ cat_cols=self.ctx.cate_list,
1356
+ d_model=d_model,
1357
+ n_heads=n_heads,
1358
+ n_layers=n_layers,
1359
+ dropout=dropout,
1360
+ batch_num=batch_num,
1361
+ epochs=self.ctx.epochs,
1362
+ tweedie_power=tw_power,
1363
+ learning_rate=learning_rate,
1364
+ patience=5
1365
+ )
1366
+ try:
1367
+ cv_ft.fit(X_train_fold, y_train_fold, w_train_fold,
1368
+ X_val_fold, y_val_fold, w_val_fold)
1369
+ y_pred_fold = cv_ft.predict(X_val_fold)
1370
+ loss += mean_tweedie_deviance(
1371
+ y_val_fold,
1372
+ y_pred_fold,
1373
+ sample_weight=w_val_fold,
1374
+ power=tw_power
1375
+ )
1376
+ finally:
1377
+ # 🧹 每个 fold 用完就立即释放 GPU 资源
1378
+ try:
1379
+ # 如果模型在 GPU 上,先挪回 CPU
1380
+ if hasattr(cv_ft, "ft"):
1381
+ cv_ft.ft.to("cpu")
1382
+ except Exception:
1383
+ pass
1384
+ del cv_ft
1385
+ gc.collect()
1386
+ if torch.cuda.is_available():
1387
+ torch.cuda.empty_cache()
1388
+
1389
+ return loss / int(1 / self.ctx.prop_test)
1390
+
1391
+ def tune(self, max_evals: int = 50) -> None:
1392
+ study = optuna.create_study(
1393
+ direction='minimize',
1394
+ sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
1395
+ )
1396
+ study.optimize(self.cross_val, n_trials=max_evals)
1397
+ self.best_params = study.best_params
1398
+ self.best_trial = study.best_trial
1399
+ params_path = self.output.result_path(
1400
+ f'{self.ctx.model_nme}_bestparams_ft.csv'
1401
+ )
1402
+ pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
1403
+
1404
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError(
+                 'Run tune() first to obtain the best FT-Transformer parameters.')
+         self.model = FTTransformerSklearn(
+             model_nme=self.ctx.model_nme,
+             num_cols=self.ctx.num_features,
+             cat_cols=self.ctx.cate_list
+         )
+         self.model.set_params(self.best_params)
+         self.model.fit(
+             self.ctx.train_data[self.ctx.factor_nmes],
+             self.ctx.train_data[self.ctx.resp_nme],
+             self.ctx.train_data[self.ctx.weight_nme]
+         )
+         self.ctx.model_label += [self.label]
+         self.ctx.train_data['pred_ft'] = self.model.predict(
+             self.ctx.train_data[self.ctx.factor_nmes]
+         )
+         self.ctx.test_data['pred_ft'] = self.model.predict(
+             self.ctx.test_data[self.ctx.factor_nmes]
+         )
+         self.ctx.train_data.loc[:, 'w_pred_ft'] = (
+             self.ctx.train_data['pred_ft'] *
+             self.ctx.train_data[self.ctx.weight_nme]
+         )
+         self.ctx.test_data.loc[:, 'w_pred_ft'] = (
+             self.ctx.test_data['pred_ft'] *
+             self.ctx.test_data[self.ctx.weight_nme]
+         )
+         self.ctx.ft_best = self.model
+
+     def save(self) -> None:
+         if self.model is not None:
+             torch.save(
+                 self.model,
+                 self.output.model_path(
+                     f'01_{self.ctx.model_nme}_FTTransformer.pth')
+             )
+
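+     # torch.save above pickles the whole FTTransformerSklearn wrapper, so
+     # load() only works where that class is importable under the same module
+     # path; map_location='cpu' keeps deserialisation safe on GPU-less hosts.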
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_FTTransformer.pth')
+         if os.path.exists(path):
+             ft_loaded = torch.load(path, map_location='cpu')
+             if torch.cuda.is_available():
+                 ft_loaded.device = torch.device('cuda')
+             elif torch.backends.mps.is_available():
+                 ft_loaded.device = torch.device('mps')
+             else:
+                 ft_loaded.device = torch.device('cpu')
+             ft_loaded.ft.to(ft_loaded.device)
+             self.model = ft_loaded
+             self.ctx.ft_best = self.model
+         else:
+             print(f"[load_model] Warning: FT-Transformer model file not found: {path}")
+
+
+ class BayesOptModel:
+     def __init__(self, train_data, test_data,
+                  model_nme, resp_nme, weight_nme, factor_nmes,
+                  cate_list=None, prop_test=0.25, rand_seed=None,
+                  epochs=100, use_gpu=True):
+         cfg = BayesOptConfig(
+             model_nme=model_nme,
+             resp_nme=resp_nme,
+             weight_nme=weight_nme,
+             factor_nmes=list(factor_nmes),
+             cate_list=list(cate_list) if cate_list else None,
+             prop_test=prop_test,
+             rand_seed=rand_seed,
+             epochs=epochs,
+             use_gpu=use_gpu
+         )
+         self.config = cfg
+         self.model_nme = cfg.model_nme
+         self.resp_nme = cfg.resp_nme
+         self.weight_nme = cfg.weight_nme
+         self.factor_nmes = cfg.factor_nmes
+         self.cate_list = list(cfg.cate_list or [])
+         self.prop_test = cfg.prop_test
+         self.epochs = cfg.epochs
+         self.rand_seed = (cfg.rand_seed if cfg.rand_seed is not None
+                           else np.random.randint(1, 10000))
+         self.use_gpu = bool(cfg.use_gpu and torch.cuda.is_available())
+         self.output_manager = OutputManager(os.getcwd(), self.model_nme)
+
+         preprocessor = DatasetPreprocessor(train_data, test_data, cfg).run()
+         self.train_data = preprocessor.train_data
+         self.test_data = preprocessor.test_data
+         self.train_oht_scl_data = preprocessor.train_oht_scl_data
+         self.test_oht_scl_data = preprocessor.test_oht_scl_data
+         self.var_nmes = preprocessor.var_nmes
+         self.num_features = preprocessor.num_features
+         self.cat_categories_for_shap = preprocessor.cat_categories_for_shap
+
+         self.cv = ShuffleSplit(n_splits=int(1/self.prop_test),
+                                test_size=self.prop_test,
+                                random_state=self.rand_seed)
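+         # ShuffleSplit draws independent random train/validation splits rather
+         # than disjoint K folds; with the default prop_test=0.25 this yields
+         # int(1/0.25) = 4 resampled splits of 25% validation each.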
+         if 'f' in self.model_nme:
+             self.obj = 'count:poisson'
+         elif 's' in self.model_nme:
+             self.obj = 'reg:gamma'
+         elif 'bc' in self.model_nme:
+             self.obj = 'reg:tweedie'
+         else:
+             self.obj = 'reg:tweedie'
+         self.fit_params = {
+             'sample_weight': self.train_data[self.weight_nme].values
+         }
+         self.model_label: List[str] = []
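+         # The objective above is inferred from substrings of model_nme
+         # (presumably 'f' = frequency/Poisson, 's' = severity/Gamma,
+         # 'bc' = burning cost/Tweedie). The checks are order-sensitive: a name
+         # containing both 'f' and 's' resolves to Poisson, and any name
+         # without 'f', 's' or 'bc' falls through to Tweedie anyway.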
+
+         # Register one trainer per model; they are accessed by label below,
+         # which keeps adding a new model type straightforward.
+         self.trainers: Dict[str, TrainerBase] = {
+             'xgb': XGBTrainer(self),
+             'resn': ResNetTrainer(self),
+             'ft': FTTrainer(self)
+         }
+         self.xgb_best = None
+         self.resn_best = None
+         self.ft_best = None
+         self.best_xgb_params = None
+         self.best_resn_params = None
+         self.best_ft_params = None
+         self.best_xgb_trial = None
+         self.best_resn_trial = None
+         self.best_ft_trial = None
+         self.xgb_load = None
+         self.resn_load = None
+         self.ft_load = None
+
+     # One-way plot for each factor
+     def plot_oneway(self, n_bins=10):
+         for c in self.factor_nmes:
+             fig = plt.figure(figsize=(7, 5))
+             if c in self.cate_list:
+                 group_col = c
+                 plot_source = self.train_data
+             else:
+                 group_col = f'{c}_bins'
+                 bins = pd.qcut(
+                     self.train_data[c],
+                     n_bins,
+                     duplicates='drop'  # duplicate quantile edges drop bins instead of raising
+                 )
+                 plot_source = self.train_data.assign(**{group_col: bins})
+             plot_data = plot_source.groupby(
+                 [group_col], observed=True).sum(numeric_only=True)
+             plot_data.reset_index(inplace=True)
+             plot_data['act_v'] = plot_data['w_act'] / \
+                 plot_data[self.weight_nme]
+             ax = fig.add_subplot(111)
+             ax.plot(plot_data.index, plot_data['act_v'],
+                     label='Actual', color='red')
+             ax.set_title(
+                 'Analysis of %s : Train Data' % group_col,
+                 fontsize=8)
+             plt.xticks(plot_data.index,
+                        list(plot_data[group_col].astype(str)),
+                        rotation=90)
+             if len(list(plot_data[group_col].astype(str))) > 50:
+                 plt.xticks(fontsize=3)
+             else:
+                 plt.xticks(fontsize=6)
+             plt.yticks(fontsize=6)
+             ax2 = ax.twinx()
+             ax2.bar(plot_data.index,
+                     plot_data[self.weight_nme],
+                     alpha=0.5, color='seagreen')
+             plt.yticks(fontsize=6)
+             plt.margins(0.05)
+             plt.subplots_adjust(wspace=0.3)
+             save_path = self.output_manager.plot_path(
+                 f'00_{self.model_nme}_{group_col}_oneway.png')
+             plt.savefig(save_path, dpi=300)
+             plt.close(fig)
+
+
+     # Bayesian optimisation for XGBoost
+     def bayesopt_xgb(self, max_evals=100):
+         trainer = self.trainers['xgb']
+         trainer.tune(max_evals)
+         trainer.train()
+         self.xgb_best = trainer.model
+         # Keep the best params and trial for debugging and reproducibility
+         self.best_xgb_params = trainer.best_params
+         self.best_xgb_trial = trainer.best_trial
+
+     # Bayesian optimisation for ResNet
+     def bayesopt_resnet(self, max_evals=100):
+         trainer = self.trainers['resn']
+         trainer.tune(max_evals)
+         trainer.train()
+         self.resn_best = trainer.model
+         # Store the best trial details for later tuning analysis
+         self.best_resn_params = trainer.best_params
+         self.best_resn_trial = trainer.best_trial
+
+     # Bayesian optimisation for FT-Transformer
+     def bayesopt_ft(self, max_evals=50):
+         trainer = self.trainers['ft']
+         trainer.tune(max_evals)
+         trainer.train()
+         self.ft_best = trainer.model
+         # FT-Transformer has many hyper-parameters, so keeping the winning
+         # configuration around is especially important
+         self.best_ft_params = trainer.best_params
+         self.best_ft_trial = trainer.best_trial
+
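+     # A minimal end-to-end sketch (argument names follow __init__ above;
+     # `df_train`, `df_test` and the column names are assumed user data):
+     #     bo = BayesOptModel(df_train, df_test, model_nme='bc_demo',
+     #                        resp_nme='loss', weight_nme='exposure',
+     #                        factor_nmes=['age', 'region'], cate_list=['region'])
+     #     bo.bayesopt_xgb(max_evals=100)
+     #     bo.plot_lift('Xgboost', 'pred_xgb')
+     #     bo.save_model('xgb')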
+     # Weighted binning helper
+     def _split_data(self, data, col_nme, wgt_nme, n_bins=10):
+         # Sort by score first, then cut on cumulative weight, so every bin
+         # carries a similar amount of exposure
+         data.sort_values(by=col_nme, ascending=True, inplace=True)
+         data['cum_weight'] = data[wgt_nme].cumsum()
+         w_sum = data[wgt_nme].sum()
+         data.loc[:, 'bins'] = np.floor(
+             data['cum_weight']*float(n_bins)/w_sum)
+         data.loc[(data['bins'] == n_bins), 'bins'] = n_bins-1
+         return data.groupby(['bins'], observed=True).sum(numeric_only=True)
+
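+     # Worked example (illustrative numbers only): with total weight 100 and
+     # n_bins=10, a row at cumulative weight 37.2 gets floor(37.2*10/100) = 3,
+     # i.e. the fourth bin; the final row (cum_weight == w_sum) would land in
+     # bin 10, which the clamp above folds back into bin 9.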
+     # Assemble the data behind the lift chart
+     def _plot_data_lift(self,
+                         pred_list, w_pred_list,
+                         w_act_list, weight_list, n_bins=10):
+         lift_data = pd.DataFrame()
+         lift_data.loc[:, 'pred'] = pred_list
+         lift_data.loc[:, 'w_pred'] = w_pred_list
+         lift_data.loc[:, 'act'] = w_act_list
+         lift_data.loc[:, 'weight'] = weight_list
+         plot_data = self._split_data(
+             lift_data, 'pred', 'weight', n_bins)
+         plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
+         plot_data['act_v'] = plot_data['act'] / plot_data['weight']
+         plot_data.reset_index(inplace=True)
+         return plot_data
+
+     # Plot the lift chart
+     def plot_lift(self, model_label, pred_nme, n_bins=10):
+         # One panel for the training set, one for the test set
+         figpos_list = [121, 122]
+         plot_dict = {
+             121: self.train_data,
+             122: self.test_data
+         }
+         name_list = {
+             121: 'Train Data',
+             122: 'Test Data'
+         }
+         if model_label == 'Xgboost':
+             pred_nme = 'pred_xgb'
+         elif model_label == 'ResNet':
+             pred_nme = 'pred_resn'
+         elif model_label == 'FTTransformer':
+             pred_nme = 'pred_ft'
+         # The mapping above keeps column lookups consistent; a new model type
+         # needs a matching branch here
+
+         fig = plt.figure(figsize=(11, 5))
+         for figpos in figpos_list:
+             plot_data = self._plot_data_lift(
+                 plot_dict[figpos][pred_nme].values,
+                 plot_dict[figpos]['w_'+pred_nme].values,
+                 plot_dict[figpos]['w_act'].values,
+                 plot_dict[figpos][self.weight_nme].values,
+                 n_bins)
+             ax = fig.add_subplot(figpos)
+             ax.plot(plot_data.index, plot_data['act_v'],
+                     label='Actual', color='red')
+             ax.plot(plot_data.index, plot_data['exp_v'],
+                     label='Predicted', color='blue')
+             ax.set_title(
+                 'Lift Chart on %s' % name_list[figpos], fontsize=8)
+             plt.xticks(plot_data.index,
+                        plot_data.index,
+                        rotation=90, fontsize=6)
+             plt.yticks(fontsize=6)
+             plt.legend(loc='upper left',
+                        fontsize=5, frameon=False)
+             plt.margins(0.05)
+             ax2 = ax.twinx()
+             ax2.bar(plot_data.index, plot_data['weight'],
+                     alpha=0.5, color='seagreen',
+                     label='Earned Exposure')
+             plt.yticks(fontsize=6)
+             plt.legend(loc='upper right',
+                        fontsize=5, frameon=False)
+         plt.subplots_adjust(wspace=0.3)
+         save_path = self.output_manager.plot_path(
+             f'01_{self.model_nme}_{model_label}_lift.png')
+         plt.savefig(save_path, dpi=300)
+         plt.show()
+         plt.close(fig)
+
+     # Assemble the data behind the double lift chart
+     def _plot_data_dlift(self,
+                          pred_list_model1, pred_list_model2,
+                          w_list, w_act_list, n_bins=10):
+         lift_data = pd.DataFrame()
+         lift_data.loc[:, 'pred1'] = pred_list_model1
+         lift_data.loc[:, 'pred2'] = pred_list_model2
+         lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+         lift_data.loc[:, 'act'] = w_act_list
+         lift_data.loc[:, 'weight'] = w_list
+         plot_data = self._split_data(lift_data, 'diff_ly', 'weight', n_bins)
+         plot_data['exp_v1'] = plot_data['pred1'] / plot_data['act']
+         plot_data['exp_v2'] = plot_data['pred2'] / plot_data['act']
+         plot_data['act_v'] = plot_data['act'] / plot_data['act']
+         plot_data.reset_index(inplace=True)
+         return plot_data
+
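+     # Rows are sorted and binned by the prediction ratio pred1/pred2, and each
+     # model's binned prediction is normalised by binned actuals, so the
+     # 'Actual' line is identically 1 and the gap between exp_v1/exp_v2 and 1
+     # shows which model tracks experience better where the two disagree most.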
+     # Plot the double lift chart
+     def plot_dlift(self, model_comp=['xgb', 'resn'], n_bins=10):
+         # Model keys:
+         #   xgb  -> XGBoost
+         #   resn -> ResNet
+         #   ft   -> FT-Transformer
+         figpos_list = [121, 122]
+         plot_dict = {
+             121: self.train_data,
+             122: self.test_data
+         }
+         name_list = {
+             121: 'Train Data',
+             122: 'Test Data'
+         }
+         # Derive display labels from model_comp (previously hardcoded to
+         # Xgboost/ResNet regardless of the models being compared)
+         label_map = {'xgb': 'Xgboost', 'resn': 'ResNet', 'ft': 'FTTransformer'}
+         tt1 = label_map.get(model_comp[0], model_comp[0])
+         tt2 = label_map.get(model_comp[1], model_comp[1])
+         fig = plt.figure(figsize=(11, 5))
+         for figpos in figpos_list:
+             plot_data = self._plot_data_dlift(
+                 plot_dict[figpos]['w_pred_'+model_comp[0]].values,
+                 plot_dict[figpos]['w_pred_'+model_comp[1]].values,
+                 plot_dict[figpos][self.weight_nme].values,
+                 plot_dict[figpos]['w_act'].values,
+                 n_bins)
+             ax = fig.add_subplot(figpos)
+             ax.plot(plot_data.index, plot_data['act_v'],
+                     label='Actual', color='red')
+             ax.plot(plot_data.index, plot_data['exp_v1'],
+                     label=tt1, color='blue')
+             ax.plot(plot_data.index, plot_data['exp_v2'],
+                     label=tt2, color='black')
+             ax.set_title(
+                 'Double Lift Chart on %s' % name_list[figpos], fontsize=8)
+             plt.xticks(plot_data.index,
+                        plot_data.index,
+                        rotation=90, fontsize=6)
+             plt.xlabel('%s / %s' % (tt1, tt2), fontsize=6)
+             plt.yticks(fontsize=6)
+             plt.legend(loc='upper left',
+                        fontsize=5, frameon=False)
+             plt.margins(0.1)
+             plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
+             ax2 = ax.twinx()
+             ax2.bar(plot_data.index, plot_data['weight'],
+                     alpha=0.5, color='seagreen',
+                     label='Earned Exposure')
+             plt.yticks(fontsize=6)
+             plt.legend(loc='upper right',
+                        fontsize=5, frameon=False)
+         plt.subplots_adjust(wspace=0.3)
+         save_path = self.output_manager.plot_path(
+             f'02_{self.model_nme}_dlift.png')
+         plt.savefig(save_path, dpi=300)
+         plt.show()
+         plt.close(fig)
+
+     # Persist trained models
+     def save_model(self, model_name=None):
+         # model_name may be:
+         #   - None:   save every available model
+         #   - 'xgb':  save only the Xgboost model
+         #   - 'resn': save only the ResNet model
+         #   - 'ft':   save only the FT-Transformer model
+         if model_name in (None, 'xgb'):
+             trainer = self.trainers['xgb']
+             if trainer.model is not None:
+                 trainer.save()
+             else:
+                 print("[save_model] Warning: xgb_best does not exist; Xgboost model not saved.")
+
+         if model_name in (None, 'resn'):
+             trainer = self.trainers['resn']
+             if trainer.model is not None:
+                 trainer.save()
+             else:
+                 print("[save_model] Warning: resn_best does not exist; ResNet model not saved.")
+
+         if model_name in (None, 'ft'):
+             trainer = self.trainers['ft']
+             if trainer.model is not None:
+                 trainer.save()
+             else:
+                 print("[save_model] Warning: ft_best does not exist; FT-Transformer model not saved.")
+
+     def load_model(self, model_name=None):
+         # model_name may be:
+         #   - None:   load every model that can be found
+         #   - 'xgb':  load only the Xgboost model
+         #   - 'resn': load only the ResNet model
+         #   - 'ft':   load only the FT-Transformer model
+         if model_name in (None, 'xgb'):
+             trainer = self.trainers['xgb']
+             trainer.load()
+             self.xgb_best = trainer.model
+             self.xgb_load = trainer.model
+
+         if model_name in (None, 'resn'):
+             trainer = self.trainers['resn']
+             trainer.load()
+             self.resn_best = trainer.model
+             self.resn_load = trainer.model
+
+         if model_name in (None, 'ft'):
+             trainer = self.trainers['ft']
+             trainer.load()
+             self.ft_best = trainer.model
+             self.ft_load = trainer.model
+
+     def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
+         # Turn the raw feature DataFrame (containing self.factor_nmes) into a
+         # purely numeric matrix: numeric columns as float64, categorical
+         # columns as integer codes (stored as float64). Column order follows
+         # self.factor_nmes.
+         matrices = []
+
+         for col in self.factor_nmes:
+             s = data[col]
+
+             if col in self.cate_list:
+                 # Categorical column: encode against the full category set
+                 # seen at training time
+                 cats = pd.Categorical(
+                     s,
+                     categories=self.cat_categories_for_shap[col]
+                 )
+                 # cats.codes is an Index/ndarray; wrap with np.asarray, then reshape
+                 codes = np.asarray(cats.codes, dtype=np.float64).reshape(-1, 1)
+                 matrices.append(codes)
+             else:
+                 # Numeric column: Series -> numpy -> reshape
+                 vals = pd.to_numeric(s, errors="coerce")
+                 arr = vals.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
+                 matrices.append(arr)
+
+         X_mat = np.concatenate(matrices, axis=1)  # (N, F)
+         return X_mat
+
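+     # Small illustration (hypothetical values): if the stored categories for a
+     # column are ['A', 'B'], then ['B', 'A', <NaN>] encodes to the single
+     # matrix column [1., 0., -1.]; pandas uses code -1 for missing values.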
+     def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
+         # Restore SHAP's numeric matrix (N, F) to the original feature
+         # DataFrame: numeric columns as float, categorical columns back to
+         # pandas category dtype, which keeps the input compatible with
+         # XGBoost (enable_categorical=True) and the FT-Transformer.
+         # Column order = self.factor_nmes.
+         data_dict = {}
+
+         for j, col in enumerate(self.factor_nmes):
+             col_vals = X_mat[:, j]
+
+             if col in self.cate_list:
+                 cats = self.cat_categories_for_shap[col]
+
+                 # SHAP perturbs codes into fractions; round back to integer codes
+                 codes = np.round(col_vals).astype(int)
+                 # Clamp to [-1, len(cats) - 1]
+                 codes = np.clip(codes, -1, len(cats) - 1)
+
+                 # pandas.Categorical.from_codes:
+                 #   - code -1 is treated as missing (NaN)
+                 #   - other codes map to the matching category in cats
+                 cat_series = pd.Categorical.from_codes(
+                     codes,
+                     categories=cats
+                 )
+                 # Store the Categorical itself rather than object dtype
+                 data_dict[col] = cat_series
+             else:
+                 # Numeric column: plain float
+                 data_dict[col] = col_vals.astype(float)
+
+         df = pd.DataFrame(data_dict, columns=self.factor_nmes)
+
+         # Belt and braces: make sure every categorical column really is category dtype
+         for col in self.cate_list:
+             if col in df.columns:
+                 df[col] = df[col].astype("category")
+         return df
+
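+     # Together the two helpers form a round trip: decode(build(X)) recovers X
+     # up to dtype (codes of -1 come back as NaN), which is what lets
+     # KernelExplainer perturb features in code space while the wrapped models
+     # still see well-typed DataFrames.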
+     # ========= XGBoost SHAP =========
+
+     def compute_shap_xgb(self, n_background: int = 500,
+                          n_samples: int = 200,
+                          on_train: bool = True):
+         # Compute SHAP values for XGBoost with KernelExplainer (black-box mode).
+         #
+         # - SHAP sees a purely numeric matrix:
+         #     * numeric features: float64
+         #     * categorical features: integer codes from _build_ft_shap_matrix
+         #       (stored as float64)
+         # - The model still receives the original DataFrame via xgb_best.predict(...)
+         if self.xgb_best is None:
+             raise RuntimeError("Run bayesopt_xgb() first so that self.xgb_best is trained")
+
+         # 1) Pick the data source: train or test set (raw feature space)
+         data = self.train_data if on_train else self.test_data
+         X_raw = data[self.factor_nmes]
+
+         # 2) Build the background matrix (same numeric encoding as FT)
+         background_raw = X_raw.sample(
+             min(len(X_raw), n_background),
+             random_state=self.rand_seed
+         )
+         # KernelExplainer is very expensive; keep the background sample small
+         # or debugging slows to a crawl
+         background_mat = self._build_ft_shap_matrix(
+             background_raw
+         ).astype(np.float64, copy=True)
+
+         # 3) Black-box prediction: numeric matrix -> DataFrame -> xgb_best.predict
+         def f_predict(x_mat: np.ndarray) -> np.ndarray:
+             # Decode the encoded matrix back into the raw DataFrame (numeric + categorical)
+             df_input = self._decode_ft_shap_matrix_to_df(x_mat)
+             # Uses self.xgb_best.predict, the same call as training/prediction
+             y_pred = self.xgb_best.predict(df_input)
+             return y_pred
+
+         explainer = shap.KernelExplainer(f_predict, background_mat)
+
+         # 4) Samples to explain: raw features + numeric encoding
+         X_explain_raw = X_raw.sample(
+             min(len(X_raw), n_samples),
+             random_state=self.rand_seed
+         )
+         X_explain_mat = self._build_ft_shap_matrix(
+             X_explain_raw
+         ).astype(np.float64, copy=True)
+
+         # 5) Compute SHAP values (nsamples='auto' keeps the cost under control)
+         shap_values = explainer.shap_values(X_explain_mat, nsamples="auto")
+
+         # 6) Stash the results:
+         #    - shap_values: in encoded space, one column per entry of factor_nmes
+         #    - X_explain_raw: raw DataFrame, so plots can show real category labels
+         self.shap_xgb = {
+             "explainer": explainer,
+             "X_explain": X_explain_raw,
+             "shap_values": shap_values,
+             "base_value": explainer.expected_value,
+         }
+         return self.shap_xgb
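+     # Sketch of consuming the result (shap.summary_plot is standard SHAP API;
+     # `bo` is an already-fitted BayesOptModel):
+     #     res = bo.compute_shap_xgb(n_background=200, n_samples=100)
+     #     shap.summary_plot(res["shap_values"], res["X_explain"])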
+
+     # ========= ResNet SHAP =========
+
+     def _resn_predict_wrapper(self, X_np: np.ndarray) -> np.ndarray:
+         # ResNet prediction wrapper for SHAP.
+         # X_np: numpy array, shape = (N, n_features); column order matches self.var_nmes
+         X_df = pd.DataFrame(X_np, columns=self.var_nmes)
+         return self.resn_best.predict(X_df)
+
+     def compute_shap_resn(self, n_background: int = 500,
+                           n_samples: int = 200,
+                           on_train: bool = True):
+         # Compute SHAP values for the ResNet with KernelExplainer.
+         # Explanation space: the one-hot encoded and scaled features self.var_nmes.
+         if self.resn_best is None:
+             raise RuntimeError("Run bayesopt_resnet() first so that resn_best is trained")
+
+         # Pick the dataset (already one-hot encoded and scaled)
+         data = self.train_oht_scl_data if on_train else self.test_oht_scl_data
+         X = data[self.var_nmes]
+
+         # Background sample: float64 numpy
+         background_df = X.sample(
+             min(len(X), n_background),
+             random_state=self.rand_seed
+         )
+         background_np = background_df.to_numpy(dtype=np.float64, copy=True)
+
+         # Black-box prediction function
+         def f_predict(x):
+             return self._resn_predict_wrapper(x)
+
+         explainer = shap.KernelExplainer(f_predict, background_np)
+
+         # Samples to explain
+         X_explain_df = X.sample(
+             min(len(X), n_samples),
+             random_state=self.rand_seed
+         )
+         X_explain_np = X_explain_df.to_numpy(dtype=np.float64, copy=True)
+
+         shap_values = explainer.shap_values(X_explain_np, nsamples="auto")
+
+         self.shap_resn = {
+             "explainer": explainer,
+             "X_explain": X_explain_df,   # DataFrame: for plotting (has column names)
+             "shap_values": shap_values,  # numpy: (n_samples, n_features)
+             "base_value": explainer.expected_value,
+         }
+         return self.shap_resn
+
+     # ========= FT-Transformer SHAP =========
+
+     def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
+         # Prediction wrapper for SHAP:
+         # numeric matrix -> restore the raw feature DataFrame -> ft_best.predict
+         df_input = self._decode_ft_shap_matrix_to_df(X_mat)
+         y_pred = self.ft_best.predict(df_input)
+         return y_pred
+
+     def compute_shap_ft(self, n_background: int = 500,
+                         n_samples: int = 200,
+                         on_train: bool = True):
+         # Compute SHAP values for the FT-Transformer with KernelExplainer.
+         # Explanation space: the mixed numeric matrix of values and category
+         # codes (float64), while plots still show the original feature
+         # names/values via X_explain.
+         if self.ft_best is None:
+             raise RuntimeError("Run bayesopt_ft() first so that ft_best is trained")
+
+         # Pick the data source (raw feature space)
+         data = self.train_data if on_train else self.test_data
+         X_raw = data[self.factor_nmes]
+
+         # Background matrix
+         background_raw = X_raw.sample(
+             min(len(X_raw), n_background),
+             random_state=self.rand_seed
+         )
+         background_mat = self._build_ft_shap_matrix(
+             background_raw
+         ).astype(np.float64, copy=True)
+
+         # Black-box prediction function (numeric matrix -> DataFrame -> FT model)
+         def f_predict(x):
+             return self._ft_shap_predict_wrapper(x)
+
+         explainer = shap.KernelExplainer(f_predict, background_mat)
+
+         # Samples to explain (raw feature space)
+         X_explain_raw = X_raw.sample(
+             min(len(X_raw), n_samples),
+             random_state=self.rand_seed
+         )
+         X_explain_mat = self._build_ft_shap_matrix(
+             X_explain_raw
+         ).astype(np.float64, copy=True)
+
+         shap_values = explainer.shap_values(X_explain_mat, nsamples="auto")
+
+         self.shap_ft = {
+             "explainer": explainer,
+             "X_explain": X_explain_raw,  # raw feature DataFrame, used for plots
+             "shap_values": shap_values,  # numpy: (n_samples, n_features)
+             "base_value": explainer.expected_value,
+         }
+         return self.shap_ft