ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
user_packages legacy/Try/BayesOptCPUGPU.py
@@ -0,0 +1,1877 @@
+ # Transferring data between CPU and GPU carries significant overhead, but multiple
+ # CUDA streams can overlap transfers with computation, making larger datasets workable.
+
+ import numpy as np  # 1.26.2
+ import pandas as pd  # 2.2.3
+ import torch  # 1.10.1+cu111
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import optuna  # 4.3.0
+ import xgboost as xgb  # 1.7.0
+ import matplotlib.pyplot as plt
+ import os
+ import joblib
+ import copy
+ import shap
+ import math
+ import gc
+
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
+ from torch.cuda.amp import autocast, GradScaler
+ from torch.nn.utils import clip_grad_norm_
+ from sklearn.model_selection import ShuffleSplit, cross_val_score  # 1.2.2
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import make_scorer, mean_tweedie_deviance
+
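The opening comment points at multi-stream CPU/GPU overlap, although no explicit stream code appears in this part of the file; a minimal illustrative sketch of the idea (not taken from the package):

    import torch

    if torch.cuda.is_available():
        copy_stream = torch.cuda.Stream()
        x_cpu = torch.randn(1_000_000).pin_memory()      # pinned memory enables async H2D copies
        with torch.cuda.stream(copy_stream):
            x_gpu = x_cpu.to('cuda', non_blocking=True)  # copy runs on the side stream
        # kernels issued on the default stream may overlap with the copy here
        torch.cuda.current_stream().wait_stream(copy_stream)  # synchronize before using x_gpu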
+ # Tweedie deviance loss for torch.
+ # Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances
+
+
+ def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
+     # Ensure predictions are positive for stability
+     pred_clamped = torch.clamp(pred, min=eps)
+     # Compute Tweedie deviance components
+     if p == 1:
+         # Poisson case
+         term1 = target * torch.log(target / pred_clamped + eps)
+         term2 = target - pred_clamped
+         term3 = 0
+     elif p == 0:
+         # Gaussian case
+         term1 = 0.5 * torch.pow(target - pred_clamped, 2)
+         term2 = 0
+         term3 = 0
+     elif p == 2:
+         # Gamma case
+         term1 = torch.log(pred_clamped / target + eps)
+         term2 = -target / pred_clamped + 1
+         term3 = 0
+     else:
+         term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
+         term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
+         term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
+     # Tweedie deviance, proportional to the negative log-likelihood up to a constant
+     return torch.nan_to_num(2 * (term1 - term2 + term3), nan=eps, posinf=max_clip, neginf=-max_clip)
+
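A quick numeric cross-check of the generic branch against scikit-learn (a sketch with synthetic values, run in the same session as the module above; for strictly positive targets the eps/clamp terms are negligible):

    import torch
    from sklearn.metrics import mean_tweedie_deviance

    y = torch.tensor([0.5, 1.0, 2.0, 4.0])
    mu = torch.tensor([0.8, 1.2, 1.5, 3.0])
    ours = tweedie_loss(mu, y, p=1.5).mean().item()
    ref = mean_tweedie_deviance(y.numpy(), mu.numpy(), power=1.5)
    assert abs(ours - ref) < 1e-4  # per-sample deviances average to sklearn's mean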
+ # Helper that frees CUDA memory.
+
+
+ def free_cuda():
+     print(">>> Moving all models to CPU...")
+     for obj in gc.get_objects():
+         try:
+             if hasattr(obj, "to") and callable(obj.to):
+                 # the hasattr check skips non-movable objects such as torch.device
+                 obj.to("cpu")
+         except Exception:
+             pass
+
+     print(">>> Deleting tensors, optimizers, dataloaders...")
+     gc.collect()
+
+     print(">>> Emptying CUDA cache...")
+     torch.cuda.empty_cache()
+     torch.cuda.synchronize()
+
+     print(">>> CUDA memory freed.")
+
+
+ # Weighted binning helper: equal-weight bins by cumulative weight.
+
+
+ def split_data(data, col_nme, wgt_nme, n_bins=10):
+     data.sort_values(by=col_nme, ascending=True, inplace=True)
+     data['cum_weight'] = data[wgt_nme].cumsum()
+     w_sum = data[wgt_nme].sum()
+     data.loc[:, 'bins'] = np.floor(data['cum_weight'] * float(n_bins) / w_sum)
+     data.loc[(data['bins'] == n_bins), 'bins'] = n_bins - 1
+     return data.groupby(['bins'], observed=True).sum(numeric_only=True)
+
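For example, binning random predictions into weighted deciles (a sketch; column names are arbitrary, and note that split_data sorts the frame in place):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'pred': np.random.rand(1000),
                       'w': np.random.rand(1000) + 0.1})
    binned = split_data(df, 'pred', 'w', n_bins=10)
    print(binned['w'])  # ten bins with roughly equal total weight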
+ # Lift chart plotting helper.
+
+
+ def plot_lift_list(pred_model, w_pred_list, w_act_list,
+                    weight_list, tgt_nme, n_bins=10,
+                    fig_nme='Lift Chart'):
+     lift_data = pd.DataFrame()
+     lift_data.loc[:, 'pred'] = pred_model
+     lift_data.loc[:, 'w_pred'] = w_pred_list
+     lift_data.loc[:, 'act'] = w_act_list
+     lift_data.loc[:, 'weight'] = weight_list
+     plot_data = split_data(lift_data, 'pred', 'weight', n_bins)
+     plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
+     plot_data['act_v'] = plot_data['act'] / plot_data['weight']
+     plot_data.reset_index(inplace=True)
+     fig = plt.figure(figsize=(7, 5))
+     ax = fig.add_subplot(111)
+     ax.plot(plot_data.index, plot_data['act_v'],
+             label='Actual', color='red')
+     ax.plot(plot_data.index, plot_data['exp_v'],
+             label='Predicted', color='blue')
+     ax.set_title(
+         'Lift Chart of %s' % tgt_nme, fontsize=8)
+     plt.xticks(plot_data.index,
+                plot_data.index,
+                rotation=90, fontsize=6)
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper left',
+                fontsize=5, frameon=False)
+     plt.margins(0.05)
+     ax2 = ax.twinx()
+     ax2.bar(plot_data.index, plot_data['weight'],
+             alpha=0.5, color='seagreen',
+             label='Earned Exposure')
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper right',
+                fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'05_{tgt_nme}_{fig_nme}.png')
+     plt.savefig(save_path, dpi=300)
+     plt.close(fig)
+
+ # Double lift chart plotting helper.
+
+
+ def plot_dlift_list(pred_model_1, pred_model_2,
+                     model_nme_1, model_nme_2,
+                     tgt_nme,
+                     w_list, w_act_list, n_bins=10,
+                     fig_nme='Double Lift Chart'):
+     lift_data = pd.DataFrame()
+     lift_data.loc[:, 'pred1'] = pred_model_1
+     lift_data.loc[:, 'pred2'] = pred_model_2
+     lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+     lift_data.loc[:, 'act'] = w_act_list
+     lift_data.loc[:, 'weight'] = w_list
+     lift_data.loc[:, 'w_pred1'] = lift_data['pred1'] * lift_data['weight']
+     lift_data.loc[:, 'w_pred2'] = lift_data['pred2'] * lift_data['weight']
+     plot_data = split_data(lift_data, 'diff_ly', 'weight', n_bins)
+     plot_data['exp_v1'] = plot_data['w_pred1'] / plot_data['act']
+     plot_data['exp_v2'] = plot_data['w_pred2'] / plot_data['act']
+     plot_data['act_v'] = plot_data['act'] / plot_data['act']
+     plot_data.reset_index(inplace=True)
+     fig = plt.figure(figsize=(7, 5))
+     ax = fig.add_subplot(111)
+     ax.plot(plot_data.index, plot_data['act_v'],
+             label='Actual', color='red')
+     ax.plot(plot_data.index, plot_data['exp_v1'],
+             label=model_nme_1, color='blue')
+     ax.plot(plot_data.index, plot_data['exp_v2'],
+             label=model_nme_2, color='black')
+     ax.set_title(
+         'Double Lift Chart of %s' % tgt_nme, fontsize=8)
+     plt.xticks(plot_data.index,
+                plot_data.index,
+                rotation=90, fontsize=6)
+     plt.xlabel('%s / %s' % (model_nme_1, model_nme_2), fontsize=6)
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper left',
+                fontsize=5, frameon=False)
+     plt.margins(0.1)
+     plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
+     ax2 = ax.twinx()
+     ax2.bar(plot_data.index, plot_data['weight'],
+             alpha=0.5, color='seagreen',
+             label='Earned Exposure')
+     plt.yticks(fontsize=6)
+     plt.legend(loc='upper right',
+                fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'06_{tgt_nme}_{fig_nme}.png')
+     plt.savefig(save_path, dpi=300)
+     plt.close(fig)
+
+
+ # ResNet model definition starts here.
+ # Residual block: two linear layers + ReLU + a residual connection.
+ # ResBlock subclasses nn.Module.
+ class ResBlock(nn.Module):
+     def __init__(self, dim: int, dropout: float = 0.1,
+                  use_layernorm: bool = False, residual_scale: float = 0.1
+                  ):
+         super().__init__()
+         self.use_layernorm = use_layernorm
+
+         if use_layernorm:
+             Norm = nn.LayerNorm  # normalizes over the last dimension
+         else:
+             def Norm(d): return nn.BatchNorm1d(d)  # keep a switch for trying BN as well
+
+         self.norm1 = Norm(dim)
+         self.fc1 = nn.Linear(dim, dim, bias=True)
+         self.act = nn.ReLU(inplace=True)
+         self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+         self.norm2 = Norm(dim)
+         self.fc2 = nn.Linear(dim, dim, bias=True)
+
+         # Residual scaling keeps the trunk from blowing up early in training
+         self.res_scale = nn.Parameter(
+             torch.tensor(residual_scale, dtype=torch.float32)
+         )
+
+     def forward(self, x):
+         # pre-activation
+         out = self.norm1(x)
+         out = self.fc1(out)
+         out = self.act(out)
+         out = self.dropout(out)
+         out = self.norm2(out)
+         out = self.fc2(out)
+         # scale the residual branch before adding
+         return F.relu(x + self.res_scale * out)
+
+ # ResNetSequential subclasses nn.Module and defines the full network.
+
+
+ class ResNetSequential(nn.Module):
+     # Input: (batch, input_dim)
+     # Structure:
+     #   fc1 -> LN/BN -> ReLU -> ResBlock * block_num -> fc_out -> Softplus
+
+     def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSequential, self).__init__()
+
+         self.net = nn.Sequential()
+         self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
+
+         if use_layernorm:
+             self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
+         else:
+             self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
+
+         self.net.add_module('relu1', nn.ReLU(inplace=True))
+
+         # stack of residual blocks
+         for i in range(block_num):
+             self.net.add_module(
+                 f'ResBlk_{i+1}',
+                 ResBlock(
+                     hidden_dim,
+                     dropout=dropout,
+                     use_layernorm=use_layernorm,
+                     residual_scale=residual_scale)
+             )
+
+         self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
+         self.net.add_module('softplus', nn.Softplus())
+
+     def forward(self, x):
+         return self.net(x)
+
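A quick shape sanity check for the two modules above (a sketch with arbitrary dimensions):

    import torch

    net = ResNetSequential(input_dim=20, hidden_dim=64, block_num=2)
    out = net(torch.randn(8, 20))
    assert out.shape == (8, 1) and (out > 0).all()  # Softplus keeps outputs positive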
+ # Scikit-learn style wrapper for the ResNet model.
+
+
+ class ResNetSklearn(nn.Module):
+     def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
+                  block_num: int = 2, batch_num: int = 100, epochs: int = 100,
+                  tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSklearn, self).__init__()
+
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.block_num = block_num
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.model_nme = model_nme
+         self.learning_rate = learning_rate
+         self.patience = patience
+         self.use_layernorm = use_layernorm
+         self.dropout = dropout
+         self.residual_scale = residual_scale
+
+         # Device selection: cuda > mps > cpu
+         if torch.cuda.is_available():
+             self.device = torch.device('cuda')
+         elif torch.backends.mps.is_available():
+             self.device = torch.device('mps')
+         else:
+             self.device = torch.device('cpu')
+
+         # Tweedie power, inferred from the model name
+         if 'f' in self.model_nme:
+             self.tw_power = 1
+         elif 's' in self.model_nme:
+             self.tw_power = 2
+         else:
+             self.tw_power = tweedie_power
+
+         # Build the network
+         self.resnet = ResNetSequential(
+             self.input_dim,
+             self.hidden_dim,
+             self.block_num,
+             use_layernorm=self.use_layernorm,
+             dropout=self.dropout,
+             residual_scale=self.residual_scale
+         ).to(self.device)
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # === 1. Training set: keep on CPU; the DataLoader moves batches to the GPU ===
+         X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
+         y_tensor = torch.tensor(
+             y_train.values, dtype=torch.float32).view(-1, 1)
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values, dtype=torch.float32).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # === 2. Validation set: build on CPU, move to the device in one go later ===
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
+             y_val_tensor = torch.tensor(
+                 y_val.values, dtype=torch.float32).view(-1, 1)
+             if w_val is not None:
+                 w_val_tensor = torch.tensor(
+                     w_val.values, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+         else:
+             X_val_tensor = y_val_tensor = w_val_tensor = None
+
+         # === 3. DataLoader ===
+         dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
+         batch_size = max(
+             4096,
+             int((self.learning_rate / (1e-4)) ** 0.5 *
+                 (X_train.shape[0] / self.batch_num))
+         )
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,  # tabular data: 0-1 workers is usually enough
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # === 4. Optimizer & AMP ===
+         optimizer = torch.optim.Adam(
+             self.resnet.parameters(), lr=self.learning_rate)
+         scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # === 5. Early stopping ===
+         best_loss, patience_counter = float('inf'), 0
+         best_model_state = None
+
+         # If a validation set exists, move it to the device once up front
+         if has_val:
+             X_val_dev = X_val_tensor.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # === 6. Training loop ===
+         for epoch in range(1, self.epochs + 1):
+             self.resnet.train()
+             for X_batch, y_batch, w_batch in dataloader:
+                 optimizer.zero_grad()
+
+                 X_batch = X_batch.to(self.device, non_blocking=True)
+                 y_batch = y_batch.to(self.device, non_blocking=True)
+                 w_batch = w_batch.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.resnet(X_batch)
+                     y_pred = torch.clamp(y_pred, min=1e-6)
+
+                     losses = tweedie_loss(
+                         y_pred, y_batch, p=self.tw_power).view(-1)
+                     weighted_loss = (losses * w_batch.view(-1)
+                                      ).sum() / w_batch.sum()
+
+                 scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     scaler.unscale_(optimizer)
+                     clip_grad_norm_(self.resnet.parameters(), max_norm=1.0)
+
+                 scaler.step(optimizer)
+                 scaler.update()
+
+             # === 7. Validation loss & early stopping ===
+             if has_val:
+                 self.resnet.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.resnet(X_val_dev)
+                     y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+
+                     val_loss_values = tweedie_loss(
+                         y_val_pred, y_val_dev, p=self.tw_power
+                     ).view(-1)
+                     val_weighted_loss = (
+                         val_loss_values * w_val_dev.view(-1)
+                     ).sum() / w_val_dev.sum()
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.resnet.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.resnet.load_state_dict(best_model_state)
+                     break
+
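The batch-size heuristic above grows the batch with the square root of the learning rate (with a floor of 4096); a worked example with assumed numbers:

    lr, n_rows, batch_num = 1e-3, 1_000_000, 100
    batch_size = max(4096, int((lr / 1e-4) ** 0.5 * (n_rows / batch_num)))
    print(batch_size)  # sqrt(10) * 10_000 -> 31622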
+     # ---------------- predict ----------------
+
+     def predict(self, X_test):
+         self.resnet.eval()
+         with torch.no_grad():
+             X_tensor = torch.tensor(
+                 X_test.values, dtype=torch.float32).to(self.device)
+             y_pred = self.resnet(X_tensor).cpu().numpy()
+
+         y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.flatten()
+
+     # ---------------- set_params ----------------
+
+     def set_params(self, params):
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         # Rebuild the network so structural parameters (hidden_dim, block_num, ...)
+         # actually take effect before the next fit
+         self.resnet = ResNetSequential(
+             self.input_dim,
+             self.hidden_dim,
+             self.block_num,
+             use_layernorm=self.use_layernorm,
+             dropout=self.dropout,
+             residual_scale=self.residual_scale
+         ).to(self.device)
+
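A minimal end-to-end sketch of the wrapper on synthetic data (illustrative only; note the model name drives the Tweedie power, 'f' -> 1, 's' -> 2, otherwise the tweedie_power argument):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    X_tr = pd.DataFrame(rng.normal(size=(256, 5)), columns=[f'x{i}' for i in range(5)])
    y_tr = pd.Series(rng.gamma(2.0, 1.0, size=256))
    w_tr = pd.Series(np.ones(256))

    model = ResNetSklearn(model_nme='bc', input_dim=X_tr.shape[1], epochs=5)
    model.fit(X_tr, y_tr, w_tr)   # optionally pass X_val/y_val/w_val for early stopping
    y_hat = model.predict(X_tr)   # positive predictions, shape (256,)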
+ # FT-Transformer model definition starts here.
+
+
+ class FeatureTokenizer(nn.Module):
+     # Maps numeric & categorical features to tokens of shape (batch, n_tokens, d_model).
+     # Assumes:
+     #   - X_num: (batch, num_numeric)
+     #   - X_cat: (batch, num_categorical), each column already label-encoded to [0, card-1]
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int):
+         super().__init__()
+
+         self.num_numeric = num_numeric
+         self.has_numeric = num_numeric > 0
+
+         if self.has_numeric:
+             self.num_linear = nn.Linear(num_numeric, d_model)
+
+         self.embeddings = nn.ModuleList([
+             nn.Embedding(card, d_model) for card in cat_cardinalities
+         ])
+
+     def forward(self, X_num, X_cat):
+         tokens = []
+
+         if self.has_numeric:
+             # all numeric features map to a single token
+             num_token = self.num_linear(X_num)  # (batch, d_model)
+             tokens.append(num_token)
+
+         # one embedding token per categorical feature
+         for i, emb in enumerate(self.embeddings):
+             tok = emb(X_cat[:, i])  # (batch, d_model)
+             tokens.append(tok)
+
+         # (batch, n_tokens, d_model)
+         x = torch.stack(tokens, dim=1)
+         return x
+
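A token-count sanity check (a sketch: three numeric features plus two categorical features with cardinalities 5 and 7):

    import torch

    tok = FeatureTokenizer(num_numeric=3, cat_cardinalities=[5, 7], d_model=16)
    X_num = torch.randn(4, 3)
    X_cat = torch.randint(0, 5, (4, 2))           # codes valid for both columns
    assert tok(X_num, X_cat).shape == (4, 3, 16)  # 1 numeric token + 2 category tokens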
+ # Transformer encoder layer with residual scaling.
+
+
+ class ScaledTransformerEncoderLayer(nn.Module):
+     def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048,
+                  dropout: float = 0.1, residual_scale_attn: float = 1.0,
+                  residual_scale_ffn: float = 1.0, norm_first: bool = True,
+                  ):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(
+             embed_dim=d_model,
+             num_heads=nhead,
+             dropout=dropout,
+             batch_first=True
+         )
+
+         # FFN
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         # Norm & Dropout
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = nn.GELU()
+         self.norm_first = norm_first
+
+         # residual scaling coefficients
+         self.res_scale_attn = residual_scale_attn
+         self.res_scale_ffn = residual_scale_ffn
+
+     def forward(self, src, src_mask=None, src_key_padding_mask=None):
+         """
+         src: (B, T, d_model)
+         """
+         x = src
+
+         if self.norm_first:
+             # pre-norm
+             x = x + self._sa_block(self.norm1(x), src_mask,
+                                    src_key_padding_mask)
+             x = x + self._ff_block(self.norm2(x))
+         else:
+             # post-norm (rarely used here)
+             x = self.norm1(
+                 x + self._sa_block(x, src_mask, src_key_padding_mask))
+             x = self.norm2(x + self._ff_block(x))
+
+         return x
+
+     def _sa_block(self, x, attn_mask, key_padding_mask):
+         # self-attention + residual scaling
+         attn_out, _ = self.self_attn(
+             x, x, x,
+             attn_mask=attn_mask,
+             key_padding_mask=key_padding_mask,
+             need_weights=False
+         )
+         return self.res_scale_attn * self.dropout1(attn_out)
+
+     def _ff_block(self, x):
+         # FFN + residual scaling
+         x2 = self.linear2(self.dropout(self.activation(self.linear1(x))))
+         return self.res_scale_ffn * self.dropout2(x2)
+
+ # FT-Transformer core model.
+
+
+ class FTTransformerCore(nn.Module):
+     # A minimal working FT-Transformer:
+     #   - FeatureTokenizer: numeric & categorical features -> tokens
+     #   - TransformerEncoder: captures feature interactions
+     #   - pooling + MLP + Softplus: positive outputs (suits Tweedie/Gamma)
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int = 64,
+                  n_heads: int = 8, n_layers: int = 4, dropout: float = 0.1,
+                  ):
+         super().__init__()
+
+         self.tokenizer = FeatureTokenizer(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=d_model
+         )
+         scale = 1.0 / math.sqrt(n_layers)  # a reasonable default
+         encoder_layer = ScaledTransformerEncoderLayer(
+             d_model=d_model,
+             nhead=n_heads,
+             dim_feedforward=d_model * 4,
+             dropout=dropout,
+             residual_scale_attn=scale,
+             residual_scale_ffn=scale,
+             norm_first=True,
+         )
+         self.encoder = nn.TransformerEncoder(
+             encoder_layer,
+             num_layers=n_layers
+         )
+         self.n_layers = n_layers
+
+         self.head = nn.Sequential(
+             nn.LayerNorm(d_model),
+             nn.Linear(d_model, d_model),
+             nn.GELU(),
+             nn.Linear(d_model, 1),
+             nn.Softplus()  # keeps outputs positive, suitable for Tweedie / Gamma
+         )
+
+     def forward(self, X_num, X_cat):
+
+         # X_num: (batch, num_numeric) float32
+         # X_cat: (batch, num_categorical) long
+
+         tokens = self.tokenizer(X_num, X_cat)  # (batch, tokens, d_model)
+         x = self.encoder(tokens)               # (batch, tokens, d_model)
+
+         # simple mean pooling over tokens
+         x = x.mean(dim=1)                      # (batch, d_model)
+
+         out = self.head(x)                     # (batch, 1), Softplus already applied
+         return out
+
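And a shape check for the assembled core (a sketch; n_heads must divide d_model, and it assumes the torch 1.10 pinned above, since newer torch versions pass an extra is_causal kwarg into custom encoder layers):

    import torch

    core = FTTransformerCore(num_numeric=3, cat_cardinalities=[5, 7],
                             d_model=32, n_heads=4, n_layers=2)
    y = core(torch.randn(4, 3), torch.randint(0, 5, (4, 2)))
    assert y.shape == (4, 1) and (y > 0).all()  # Softplus head keeps outputs positive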
615
+ # 定义TabularDataset类
616
+
617
+
618
+ class TabularDataset(Dataset):
619
+ def __init__(self, X_num, X_cat, y, w):
620
+
621
+ # X_num: torch.float32, (N, num_numeric)
622
+ # X_cat: torch.long, (N, num_categorical)
623
+ # y: torch.float32, (N, 1)
624
+ # w: torch.float32, (N, 1)
625
+
626
+ self.X_num = X_num
627
+ self.X_cat = X_cat
628
+ self.y = y
629
+ self.w = w
630
+
631
+ def __len__(self):
632
+ return self.y.shape[0]
633
+
634
+ def __getitem__(self, idx):
635
+ return (
636
+ self.X_num[idx],
637
+ self.X_cat[idx],
638
+ self.y[idx],
639
+ self.w[idx],
640
+ )
641
+
642
+ # 定义FTTransformer的Scikit-Learn接口类
643
+
644
+
645
+ class FTTransformerSklearn(nn.Module):
646
+
647
+ # sklearn 风格包装:
648
+ # - num_cols: 数值特征列名列表
649
+ # - cat_cols: 类别特征列名列表 (已做 label encoding,取值 [0, n_classes-1])
650
+
651
+ def __init__(self, model_nme: str, num_cols, cat_cols, d_model: int = 64, n_heads: int = 8,
652
+ n_layers: int = 4, dropout: float = 0.1, batch_num: int = 100, epochs: int = 100,
653
+ tweedie_power: float = 1.5, learning_rate: float = 1e-3, patience: int = 10,
654
+ ):
655
+ super().__init__()
656
+
657
+ self.model_nme = model_nme
658
+ self.num_cols = list(num_cols)
659
+ self.cat_cols = list(cat_cols)
660
+ self.d_model = d_model
661
+ self.n_heads = n_heads
662
+ self.n_layers = n_layers
663
+ self.dropout = dropout
664
+ self.batch_num = batch_num
665
+ self.epochs = epochs
666
+ self.learning_rate = learning_rate
667
+ self.patience = patience
668
+ if 'f' in self.model_nme:
669
+ self.tw_power = 1.0
670
+ elif 's' in self.model_nme:
671
+ self.tw_power = 2.0
672
+ else:
673
+ self.tw_power = tweedie_power
674
+ if torch.cuda.is_available():
675
+ self.device = torch.device("cuda")
676
+ elif torch.backends.mps.is_available():
677
+ self.device = torch.device("mps")
678
+ else:
679
+ self.device = torch.device("cpu")
680
+ self.cat_cardinalities = None
681
+ self.cat_categories = {}
682
+ self.ft = None
683
+
+     def _build_model(self, X_train):
+         num_numeric = len(self.num_cols)
+         cat_cardinalities = []
+
+         for col in self.cat_cols:
+             cats = X_train[col].astype('category')
+             categories = cats.cat.categories
+             self.cat_categories[col] = categories  # store the full training category set
+
+             card = len(categories) + 1  # reserve one extra bucket for unknown/missing
+             cat_cardinalities.append(card)
+
+         self.cat_cardinalities = cat_cardinalities
+
+         self.ft = FTTransformerCore(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=self.d_model,
+             n_heads=self.n_heads,
+             n_layers=self.n_layers,
+             dropout=self.dropout,
+         ).to(self.device)
+
+     def _encode_cats(self, X):
+         # X: DataFrame containing at least self.cat_cols
+         # Returns: np.ndarray, shape (N, num_categorical), dtype=int64
+
+         if not self.cat_cols:
+             return np.zeros((len(X), 0), dtype='int64')
+
+         X_cat_list = []
+         for col in self.cat_cols:
+             # use the categories recorded at training time
+             categories = self.cat_categories[col]
+             # build a Categorical over the fixed category set
+             cats = pd.Categorical(X[col], categories=categories)
+             codes = cats.codes.astype('int64', copy=True)  # -1 means unknown or missing
+             # map unknown / missing to the "unknown" bucket at index len(categories)
+             codes[codes < 0] = len(categories)
+             X_cat_list.append(codes)
+
+         X_cat_np = np.stack(X_cat_list, axis=1)  # (N, num_categorical)
+         return X_cat_np
+
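The unknown/missing bucket can be seen directly (a contrived sketch that fills cat_categories by hand, which _build_model normally does on the first fit):

    import pandas as pd

    enc = FTTransformerSklearn(model_nme='bc', num_cols=[], cat_cols=['city'])
    enc.cat_categories['city'] = pd.Index(['a', 'b'])
    codes = enc._encode_cats(pd.DataFrame({'city': ['a', 'z', None]}))
    print(codes.ravel())  # [0 2 2] -- unseen 'z' and missing both map to index len(categories)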
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # build the model structure on the first fit
+         if self.ft is None:
+             self._build_model(X_train)
+
+         # --- training tensors (all on CPU; batches move to the GPU later) ---
+         X_num_train = X_train[self.num_cols].to_numpy(
+             dtype=np.float32, copy=True)
+         X_num_train = torch.tensor(
+             X_num_train,
+             dtype=torch.float32
+         )
+
+         if self.cat_cols:
+             X_cat_train_np = self._encode_cats(X_train)
+             X_cat_train = torch.tensor(X_cat_train_np, dtype=torch.long)
+         else:
+             X_cat_train = torch.zeros(
+                 (X_num_train.size(0), 0), dtype=torch.long)
+
+         y_tensor = torch.tensor(
+             y_train.values,
+             dtype=torch.float32
+         ).view(-1, 1)
+
+         if w_train is not None:
+             w_tensor = torch.tensor(
+                 w_train.values,
+                 dtype=torch.float32
+             ).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+
+         # --- validation tensors (moved to the device in one go later) ---
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             # ---------- numeric features ----------
+             X_num_val_np = X_val[self.num_cols].to_numpy(
+                 dtype=np.float32, copy=True)
+             X_num_val = torch.tensor(X_num_val_np, dtype=torch.float32)
+
+             # ---------- categorical features ----------
+             if self.cat_cols:
+                 X_cat_val_np = self._encode_cats(X_val)
+                 X_cat_val = torch.tensor(X_cat_val_np, dtype=torch.long)
+             else:
+                 X_cat_val = torch.zeros(
+                     (X_num_val.shape[0], 0), dtype=torch.long)
+
+             # ---------- target & weights ----------
+             y_val_np = y_val.values.astype(np.float32, copy=True)
+             y_val_tensor = torch.tensor(
+                 y_val_np, dtype=torch.float32).view(-1, 1)
+
+             if w_val is not None:
+                 w_val_np = w_val.values.astype(np.float32, copy=True)
+                 w_val_tensor = torch.tensor(
+                     w_val_np, dtype=torch.float32).view(-1, 1)
+             else:
+                 w_val_tensor = torch.ones_like(y_val_tensor)
+
+         else:
+             X_num_val = X_cat_val = y_val_tensor = w_val_tensor = None
+
+         # --- DataLoader ---
+         dataset = TabularDataset(
+             X_num_train, X_cat_train, y_tensor, w_tensor
+         )
+
+         batch_size = max(
+             32,
+             int((self.learning_rate / 1e-4) ** 0.5 *
+                 (X_train.shape[0] / self.batch_num))
+         )
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,
+             pin_memory=(self.device.type == 'cuda')
+         )
+
+         # --- optimizer & AMP ---
+         optimizer = torch.optim.Adam(
+             self.ft.parameters(),
+             lr=self.learning_rate
+         )
+         scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         # --- early stopping ---
+         best_loss = float('inf')
+         patience_counter = 0
+         best_model_state = None
+
+         # move the whole validation set to the device (if present)
+         if has_val:
+             X_num_val_dev = X_num_val.to(self.device, non_blocking=True)
+             X_cat_val_dev = X_cat_val.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         # --- training loop ---
+         for epoch in range(1, self.epochs + 1):
+             self.ft.train()
+             for X_num_b, X_cat_b, y_b, w_b in dataloader:
+                 optimizer.zero_grad()
+
+                 X_num_b = X_num_b.to(self.device, non_blocking=True)
+                 X_cat_b = X_cat_b.to(self.device, non_blocking=True)
+                 y_b = y_b.to(self.device, non_blocking=True)
+                 w_b = w_b.to(self.device, non_blocking=True)
+
+                 with autocast(enabled=(self.device.type == 'cuda')):
+                     y_pred = self.ft(X_num_b, X_cat_b)
+                     y_pred = torch.clamp(y_pred, min=1e-6)
+
+                     losses = tweedie_loss(
+                         y_pred, y_b, p=self.tw_power
+                     ).view(-1)
+
+                     weighted_loss = (losses * w_b.view(-1)).sum() / w_b.sum()
+
+                 scaler.scale(weighted_loss).backward()
+
+                 if self.device.type == 'cuda':
+                     scaler.unscale_(optimizer)
+                     clip_grad_norm_(self.ft.parameters(), max_norm=1.0)
+
+                 scaler.step(optimizer)
+                 scaler.update()
+
+             # --- validation & early stopping ---
+             if has_val:
+                 self.ft.eval()
+                 with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
+                     y_val_pred = self.ft(X_num_val_dev, X_cat_val_dev)
+                     y_val_pred = torch.clamp(y_val_pred, min=1e-6)
+
+                     val_losses = tweedie_loss(
+                         y_val_pred, y_val_dev, p=self.tw_power
+                     ).view(-1)
+
+                     val_weighted_loss = (
+                         val_losses * w_val_dev.view(-1)
+                     ).sum() / w_val_dev.sum()
+
+                 if val_weighted_loss < best_loss:
+                     best_loss = val_weighted_loss
+                     patience_counter = 0
+                     best_model_state = copy.deepcopy(self.ft.state_dict())
+                 else:
+                     patience_counter += 1
+
+                 if patience_counter >= self.patience and best_model_state is not None:
+                     self.ft.load_state_dict(best_model_state)
+                     break
+
+     def predict(self, X_test):
+         # X_test: DataFrame containing num_cols + cat_cols
+
+         self.ft.eval()
+         X_num = X_test[self.num_cols].to_numpy(dtype=np.float32, copy=True)
+         X_num = torch.tensor(
+             X_num,
+             dtype=torch.float32
+         )
+         if self.cat_cols:
+             X_cat_np = self._encode_cats(X_test)
+             X_cat = torch.tensor(X_cat_np, dtype=torch.long)
+         else:
+             X_cat = torch.zeros((X_num.size(0), 0), dtype=torch.long)
+
+         with torch.no_grad():
+             X_num = X_num.to(self.device, non_blocking=True)
+             X_cat = X_cat.to(self.device, non_blocking=True)
+             y_pred = self.ft(X_num, X_cat).cpu().numpy()
+
+         y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.ravel()
+
+     def set_params(self, params: dict):
+
+         # Kept consistent with the sklearn style.
+         # Note: changes to structural parameters (e.g. d_model/n_heads) only
+         # take effect once the network is rebuilt on the next fit.
+
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         return self
+
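A minimal usage sketch of the wrapper on synthetic data (illustrative column names only):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    train_df = pd.DataFrame({'age': rng.uniform(18, 80, 512),
                             'region': rng.choice(['N', 'S', 'E', 'W'], 512),
                             'y': rng.gamma(2.0, 1.0, 512),
                             'w': np.ones(512)})

    ft = FTTransformerSklearn(model_nme='bc', num_cols=['age'],
                              cat_cols=['region'], epochs=3)
    ft.fit(train_df[['age', 'region']], train_df['y'], train_df['w'])
    pred = ft.predict(train_df[['age', 'region']])  # positive, shape (512,)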
+
+ # Bayesian-optimisation model class, covering the XGBoost and ResNet models
+
+ class BayesOptModel:
+     def __init__(self, train_data, test_data,
+                  model_nme, resp_nme, weight_nme, factor_nmes,
+                  cate_list=[], prop_test=0.25, rand_seed=None, epochs=100):
+         # Initialise the data.
+         # train_data / test_data: training and test sets, both DataFrames
+         # model_nme: model name
+         # resp_nme: response column name, weight_nme: weight column name
+         # factor_nmes: list of factor (feature) names
+         # cate_list: list of categorical variables
+         # prop_test: test proportion, rand_seed: random seed
+         self.train_data = train_data
+         self.test_data = test_data
+         self.resp_nme = resp_nme
+         self.weight_nme = weight_nme
+         self.train_data.loc[:, 'w_act'] = self.train_data[self.resp_nme] * \
+             self.train_data[self.weight_nme]
+         self.test_data.loc[:, 'w_act'] = self.test_data[self.resp_nme] * \
+             self.test_data[self.weight_nme]
+         q999 = self.train_data[self.resp_nme].quantile(0.999)
+         self.train_data[self.resp_nme] = self.train_data[self.resp_nme].clip(
+             upper=q999)  # clip extreme responses at the 99.9th percentile
+         self.factor_nmes = factor_nmes
+         self.cate_list = cate_list
+         self.rand_seed = rand_seed if rand_seed is not None else np.random.randint(
+             1, 10000)
+         if self.cate_list != []:
+             for cate in self.cate_list:
+                 self.train_data[cate] = self.train_data[cate].astype(
+                     'category')
+                 self.test_data[cate] = self.test_data[cate].astype('category')
+         self.prop_test = prop_test
+         self.cv = ShuffleSplit(n_splits=int(1/self.prop_test),
+                                test_size=self.prop_test,
+                                random_state=self.rand_seed)
+         self.model_nme = model_nme
+         if self.model_nme.find('f') != -1:
+             self.obj = 'count:poisson'
+         elif self.model_nme.find('s') != -1:
+             self.obj = 'reg:gamma'
+         elif self.model_nme.find('bc') != -1:
+             self.obj = 'reg:tweedie'
+         else:
+             self.obj = 'reg:tweedie'  # fallback for names without f/s/bc
+         self.fit_params = {
+             'sample_weight': self.train_data[self.weight_nme].values
+         }
+         self.num_features = [
+             nme for nme in self.factor_nmes if nme not in self.cate_list]
+         self.train_oht_scl_data = self.train_data[self.factor_nmes +
+                                                   [self.weight_nme]+[self.resp_nme]].copy()
+         self.test_oht_scl_data = self.test_data[self.factor_nmes +
+                                                 [self.weight_nme]+[self.resp_nme]].copy()
+         self.train_oht_scl_data = pd.get_dummies(
+             self.train_oht_scl_data,
+             columns=self.cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         self.test_oht_scl_data = pd.get_dummies(
+             self.test_oht_scl_data,
+             columns=self.cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         for num_chr in self.num_features:
+             scaler = StandardScaler()
+             self.train_oht_scl_data[num_chr] = scaler.fit_transform(
+                 self.train_oht_scl_data[num_chr].values.reshape(-1, 1))
+             self.test_oht_scl_data[num_chr] = scaler.transform(
+                 self.test_oht_scl_data[num_chr].values.reshape(-1, 1))
+         # align test-set columns with the training set
+         self.test_oht_scl_data = self.test_oht_scl_data.reindex(
+             columns=self.train_oht_scl_data.columns,
+             fill_value=0
+         )
+         self.var_nmes = list(
+             set(list(self.train_oht_scl_data.columns)) -
+             set([self.weight_nme, self.resp_nme])
+         )
+         self.epochs = epochs
+         self.model_label = []
+         self.cat_categories_for_shap = {}
+         for col in self.cate_list:
+             cats = self.train_data[col].astype('category')
+             self.cat_categories_for_shap[col] = list(cats.cat.categories)
+
+     # One-way analysis plotting
+     def plot_oneway(self, n_bins=10):
+         for c in self.factor_nmes:
+             fig = plt.figure(figsize=(7, 5))
+             if c in self.cate_list:
+                 strs = c
+             else:
+                 strs = c + '_bins'
+                 self.train_data.loc[:, strs] = pd.qcut(
+                     self.train_data[c],
+                     n_bins,
+                     duplicates='drop'
+                 )
+             plot_data = self.train_data.groupby(
+                 [strs], observed=True).sum(numeric_only=True)
+             plot_data.reset_index(inplace=True)
+             plot_data['act_v'] = plot_data['w_act'] / \
+                 plot_data[self.weight_nme]
+             ax = fig.add_subplot(111)
+             ax.plot(plot_data.index, plot_data['act_v'],
+                     label='Actual', color='red')
+             ax.set_title(
+                 'Analysis of %s : Train Data' % strs,
+                 fontsize=8)
+             plt.xticks(plot_data.index,
+                        list(plot_data[strs].astype(str)),
+                        rotation=90)
+             if len(list(plot_data[strs].astype(str))) > 50:
+                 plt.xticks(fontsize=3)
+             else:
+                 plt.xticks(fontsize=6)
+             plt.yticks(fontsize=6)
+             ax2 = ax.twinx()
+             ax2.bar(plot_data.index,
+                     plot_data[self.weight_nme],
+                     alpha=0.5, color='seagreen')
+             plt.yticks(fontsize=6)
+             plt.margins(0.05)
+             plt.subplots_adjust(wspace=0.3)
+             save_path = os.path.join(
+                 os.getcwd(), 'plot',
+                 f'00_{self.model_nme}_{strs}_oneway.png')
+             plt.savefig(save_path, dpi=300)
+             plt.close(fig)
+
+     # XGBoost cross-validation objective
+     def cross_val_xgb(self, trial):
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-5, 1e-1, log=True)
+         gamma = trial.suggest_float(
+             'gamma', 0, 10000)
+         max_depth = trial.suggest_int(
+             'max_depth', 3, 25)
+         n_estimators = trial.suggest_int(
+             'n_estimators', 10, 500, step=10)
+         min_child_weight = trial.suggest_int(
+             'min_child_weight', 100, 10000, step=100)
+         reg_alpha = trial.suggest_float(
+             'reg_alpha', 1e-10, 1, log=True)
+         reg_lambda = trial.suggest_float(
+             'reg_lambda', 1e-10, 1, log=True)
+         if self.obj == 'reg:tweedie':
+             tweedie_variance_power = trial.suggest_float(
+                 'tweedie_variance_power', 1, 2)
+         elif self.obj == 'count:poisson':
+             tweedie_variance_power = 1
+         elif self.obj == 'reg:gamma':
+             tweedie_variance_power = 2
+         clf = xgb.XGBRegressor(
+             objective=self.obj,
+             random_state=self.rand_seed,
+             subsample=0.9,
+             tree_method='gpu_hist' if torch.cuda.is_available() else 'hist',
+             gpu_id=0,
+             enable_categorical=True,
+             predictor='gpu_predictor' if torch.cuda.is_available() else 'cpu_predictor'
+         )
+         params = {
+             'learning_rate': learning_rate,
+             'gamma': gamma,
+             'max_depth': max_depth,
+             'n_estimators': n_estimators,
+             'min_child_weight': min_child_weight,
+             'reg_alpha': reg_alpha,
+             'reg_lambda': reg_lambda
+         }
+         if self.obj == 'reg:tweedie':
+             params['tweedie_variance_power'] = tweedie_variance_power
+         clf.set_params(**params)
+         acc = cross_val_score(
+             clf,
+             self.train_data[self.factor_nmes],
+             self.train_data[self.resp_nme].values,
+             fit_params=self.fit_params,
+             cv=self.cv,
+             scoring=make_scorer(
+                 mean_tweedie_deviance,
+                 power=tweedie_variance_power,
+                 greater_is_better=False),
+             error_score='raise',
+             n_jobs=int(1/self.prop_test)).mean()
+         return -acc
+
+     # XGBoost Bayesian-optimisation driver
+     def bayesopt_xgb(self, max_evals=100):
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.rand_seed))
+         study.optimize(self.cross_val_xgb, n_trials=max_evals)
+         self.best_xgb_params = study.best_params
+         pd.DataFrame(self.best_xgb_params, index=[0]).to_csv(
+             os.getcwd() + '/Results/' + self.model_nme + '_bestparams_xgb.csv')
+         self.best_xgb_trial = study.best_trial
+         self.xgb_best = xgb.XGBRegressor(
+             objective=self.obj,
+             random_state=self.rand_seed,
+             subsample=0.9,
+             tree_method='gpu_hist' if torch.cuda.is_available() else 'hist',
+             gpu_id=0,
+             enable_categorical=True,
+             predictor='gpu_predictor' if torch.cuda.is_available() else 'cpu_predictor'
+         )
+         self.xgb_best.set_params(**self.best_xgb_params)
+         self.xgb_best.fit(self.train_data[self.factor_nmes],
+                           self.train_data[self.resp_nme].values,
+                           **self.fit_params)
+         self.model_label += ['Xgboost']
+         self.train_data['pred_xgb'] = self.xgb_best.predict(
+             self.train_data[self.factor_nmes])
+         self.test_data['pred_xgb'] = self.xgb_best.predict(
+             self.test_data[self.factor_nmes])
+         self.train_data.loc[:, 'w_pred_xgb'] = self.train_data['pred_xgb'] * \
+             self.train_data[self.weight_nme]
+         self.test_data.loc[:, 'w_pred_xgb'] = self.test_data['pred_xgb'] * \
+             self.test_data[self.weight_nme]
+
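A minimal driver sketch tying the pieces together (hypothetical train_df/test_df frames and column names; the ./Results and ./plot directories must already exist for the CSV and PNG outputs):

    bo = BayesOptModel(train_df, test_df,
                       model_nme='bc_motor', resp_nme='burning_cost',
                       weight_nme='exposure',
                       factor_nmes=['age', 'region'], cate_list=['region'])
    bo.bayesopt_xgb(max_evals=20)  # tunes, refits, and adds pred_xgb / w_pred_xgb columns
    bo.plot_lift('Xgboost')        # lift chart with train and test panels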
+     # ResNet cross-validation objective
+     def cross_val_resn(self, trial):
+
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-6, 1e-2, log=True)  # kept low to avoid exploding gradients
+         hidden_dim = trial.suggest_int(
+             'hidden_dim', 32, 256, step=32)
+         block_num = trial.suggest_int(
+             'block_num', 2, 10)
+         batch_num = trial.suggest_int(
+             'batch_num',
+             10 if self.obj == 'reg:gamma' else 100,
+             100 if self.obj == 'reg:gamma' else 1000,
+             step=10 if self.obj == 'reg:gamma' else 100)
+         if self.obj == 'reg:tweedie':
+             tw_power = trial.suggest_float(
+                 'tw_power', 1, 2.0)
+         elif self.obj == 'count:poisson':
+             tw_power = 1
+         elif self.obj == 'reg:gamma':
+             tw_power = 2
+         loss = 0
+         for fold, (train_idx, test_idx) in enumerate(self.cv.split(self.train_oht_scl_data[self.var_nmes])):
+             # build the model
+             cv_net = ResNetSklearn(
+                 model_nme=self.model_nme,
+                 input_dim=self.train_oht_scl_data[self.var_nmes].shape[1],
+                 epochs=self.epochs,
+                 learning_rate=learning_rate,
+                 hidden_dim=hidden_dim,
+                 block_num=block_num,
+                 batch_num=batch_num,  # keeps the weight variance comparable
+                 tweedie_power=tw_power
+                 # use_layernorm, dropout and residual_scale could also be tuned here
+             )
+             # train
+             cv_net.fit(
+                 self.train_oht_scl_data[self.var_nmes].iloc[train_idx],
+                 self.train_oht_scl_data[self.resp_nme].iloc[train_idx],
+                 self.train_oht_scl_data[self.weight_nme].iloc[train_idx],
+                 self.train_oht_scl_data[self.var_nmes].iloc[test_idx],
+                 self.train_oht_scl_data[self.resp_nme].iloc[test_idx],
+                 self.train_oht_scl_data[self.weight_nme].iloc[test_idx]
+             )
+             # predict
+             y_pred_fold = cv_net.predict(
+                 self.train_oht_scl_data[self.var_nmes].iloc[test_idx]
+             )
+             # accumulate the fold loss
+             loss += mean_tweedie_deviance(
+                 self.train_oht_scl_data[self.resp_nme].iloc[test_idx],
+                 y_pred_fold,
+                 sample_weight=self.train_oht_scl_data[self.weight_nme].iloc[test_idx],
+                 power=tw_power
+             )
+         return loss / int(1/self.prop_test)
+
+     # ResNet Bayesian-optimisation driver
+     def bayesopt_resnet(self, max_evals=100):
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.rand_seed))
+         study.optimize(self.cross_val_resn, n_trials=max_evals)
+         self.best_resn_params = study.best_params
+         pd.DataFrame(self.best_resn_params, index=[0]).to_csv(
+             os.getcwd() + '/Results/' + self.model_nme + '_bestparams_resn.csv')
+         self.best_resn_trial = study.best_trial
+         self.resn_best = ResNetSklearn(
+             model_nme=self.model_nme,
+             input_dim=self.train_oht_scl_data[self.var_nmes].shape[1]
+         )
+         self.resn_best.set_params(self.best_resn_params)
+         self.resn_best.fit(self.train_oht_scl_data[self.var_nmes],
+                            self.train_oht_scl_data[self.resp_nme],
+                            self.train_oht_scl_data[self.weight_nme])
+         self.model_label += ['ResNet']
+         self.train_data['pred_resn'] = self.resn_best.predict(
+             self.train_oht_scl_data[self.var_nmes])
+         self.test_data['pred_resn'] = self.resn_best.predict(
+             self.test_oht_scl_data[self.var_nmes])
+         self.train_data.loc[:, 'w_pred_resn'] = self.train_data['pred_resn'] * \
+             self.train_data[self.weight_nme]
+         self.test_data.loc[:, 'w_pred_resn'] = self.test_data['pred_resn'] * \
+             self.test_data[self.weight_nme]
+
+     # FT-Transformer cross-validation objective
+     def cross_val_ft(self, trial):
+
+         # learning rate
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-6, 1e-4, log=True
+         )
+
+         # Transformer width and depth
+         d_model = trial.suggest_int(
+             'd_model', 32, 128, step=32
+         )
+         n_heads = trial.suggest_categorical(
+             'n_heads', [2, 4, 8]
+         )
+         n_layers = trial.suggest_int(
+             'n_layers', 2, 6
+         )
+
+         dropout = trial.suggest_float(
+             'dropout', 0.0, 0.2
+         )
+
+         batch_num = trial.suggest_int(
+             'batch_num',
+             5 if self.obj == 'reg:gamma' else 10,
+             10 if self.obj == 'reg:gamma' else 100,
+             step=1 if self.obj == 'reg:gamma' else 10
+         )
+
+         # Tweedie power
+         if self.obj == 'reg:tweedie':
+             tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
+         elif self.obj == 'count:poisson':
+             tw_power = 1.0
+         elif self.obj == 'reg:gamma':
+             tw_power = 2.0
+
+         loss = 0.0
+
+         # Note: FT works on the raw features (self.factor_nmes),
+         # not the one-hot encoded self.train_oht_scl_data
+         for fold, (train_idx, test_idx) in enumerate(
+                 self.cv.split(self.train_data[self.factor_nmes])):
+
+             X_train_fold = self.train_data.iloc[train_idx][self.factor_nmes]
+             y_train_fold = self.train_data.iloc[train_idx][self.resp_nme]
+             w_train_fold = self.train_data.iloc[train_idx][self.weight_nme]
+
+             X_val_fold = self.train_data.iloc[test_idx][self.factor_nmes]
+             y_val_fold = self.train_data.iloc[test_idx][self.resp_nme]
+             w_val_fold = self.train_data.iloc[test_idx][self.weight_nme]
+
+             # build the FT-Transformer
+             cv_ft = FTTransformerSklearn(
+                 model_nme=self.model_nme,
+                 num_cols=self.num_features,  # numeric feature names
+                 cat_cols=self.cate_list,     # categorical feature names (integer-coded or category dtype)
+                 d_model=d_model,
+                 n_heads=n_heads,
+                 n_layers=n_layers,
+                 dropout=dropout,
+                 batch_num=batch_num,
+                 epochs=self.epochs,
+                 tweedie_power=tw_power,
+                 learning_rate=learning_rate,
+                 patience=5  # adjust as needed
+             )
+
+             # train
+             cv_ft.fit(
+                 X_train_fold,
+                 y_train_fold,
+                 w_train_fold,
+                 X_val_fold,
+                 y_val_fold,
+                 w_val_fold
+             )
+
+             # predict
+             y_pred_fold = cv_ft.predict(X_val_fold)
+
+             # fold loss (same as ResNet: mean_tweedie_deviance)
+             loss += mean_tweedie_deviance(
+                 y_val_fold,
+                 y_pred_fold,
+                 sample_weight=w_val_fold,
+                 power=tw_power
+             )
+
+         return loss / int(1 / self.prop_test)
+
1326
+ # 定义 FT-Transformer 贝叶斯优化函数
1327
+ def bayesopt_ft(self, max_evals=50):
1328
+ study = optuna.create_study(
1329
+ direction='minimize',
1330
+ sampler=optuna.samplers.TPESampler(seed=self.rand_seed)
1331
+ )
1332
+ study.optimize(self.cross_val_ft, n_trials=max_evals)
1333
+
1334
+ self.best_ft_params = study.best_params
1335
+ pd.DataFrame(self.best_ft_params, index=[0]).to_csv(
1336
+ os.getcwd() + '/Results/' + self.model_nme + '_bestparams_ft.csv'
1337
+ )
1338
+ self.best_ft_trial = study.best_trial
1339
+
1340
+ # 用最优参数重新建一个 FT 模型,在全量训练集上拟合
1341
+ self.ft_best = FTTransformerSklearn(
1342
+ model_nme=self.model_nme,
1343
+ num_cols=self.num_features,
1344
+ cat_cols=self.cate_list
1345
+ )
1346
+ # 设置最优超参
1347
+ self.ft_best.set_params(self.best_ft_params)
1348
+
1349
+ # 全量训练
1350
+ self.ft_best.fit(
1351
+ self.train_data[self.factor_nmes],
1352
+ self.train_data[self.resp_nme],
1353
+ self.train_data[self.weight_nme]
1354
+ )
1355
+
1356
+ # 记录模型标签
1357
+ self.model_label += ['FTTransformer']
1358
+
1359
+ # 训练集预测
1360
+ self.train_data['pred_ft'] = self.ft_best.predict(
1361
+ self.train_data[self.factor_nmes]
1362
+ )
1363
+ # 测试集预测
1364
+ self.test_data['pred_ft'] = self.ft_best.predict(
1365
+ self.test_data[self.factor_nmes]
1366
+ )
1367
+
1368
+ # 加权预测(和 XGB / ResNet 风格一致)
1369
+ self.train_data.loc[:, 'w_pred_ft'] = (
1370
+ self.train_data['pred_ft'] * self.train_data[self.weight_nme]
1371
+ )
1372
+ self.test_data.loc[:, 'w_pred_ft'] = (
1373
+ self.test_data['pred_ft'] * self.test_data[self.weight_nme]
1374
+ )
1375
+
+     # binning helper (method version of split_data)
+
+     def _split_data(self, data, col_nme, wgt_nme, n_bins=10):
+         data.sort_values(by=col_nme, ascending=True, inplace=True)
+         data['cum_weight'] = data[wgt_nme].cumsum()
+         w_sum = data[wgt_nme].sum()
+         data.loc[:, 'bins'] = np.floor(
+             data['cum_weight']*float(n_bins)/w_sum)
+         data.loc[(data['bins'] == n_bins), 'bins'] = n_bins-1
+         return data.groupby(['bins'], observed=True).sum(numeric_only=True)
+
+     # builds the dataset behind a lift chart
+     def _plot_data_lift(self,
+                         pred_list, w_pred_list,
+                         w_act_list, weight_list, n_bins=10):
+         lift_data = pd.DataFrame()
+         lift_data.loc[:, 'pred'] = pred_list
+         lift_data.loc[:, 'w_pred'] = w_pred_list
+         lift_data.loc[:, 'act'] = w_act_list
+         lift_data.loc[:, 'weight'] = weight_list
+         plot_data = self._split_data(
+             lift_data, 'pred', 'weight', n_bins)
+         plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
+         plot_data['act_v'] = plot_data['act'] / plot_data['weight']
+         plot_data.reset_index(inplace=True)
+         return plot_data
+
1403
+ # 定义lift曲线绘制函数
1404
+ def plot_lift(self, model_label, pred_nme, n_bins=10):
1405
+ # 绘制建模集上结果
1406
+ figpos_list = [121, 122]
1407
+ plot_dict = {
1408
+ 121: self.train_data,
1409
+ 122: self.test_data
1410
+ }
1411
+ name_list = {
1412
+ 121: 'Train Data',
1413
+ 122: 'Test Data'
1414
+ }
1415
+ if model_label == 'Xgboost':
1416
+ pred_nme = 'pred_xgb'
1417
+ elif model_label == 'ResNet':
1418
+ pred_nme = 'pred_resn'
1419
+ elif model_label == 'FTTransformer':
1420
+ pred_nme = 'pred_ft'
1421
+
1422
+ fig = plt.figure(figsize=(11, 5))
1423
+ for figpos in figpos_list:
1424
+ plot_data = self._plot_data_lift(
1425
+ plot_dict[figpos][pred_nme].values,
1426
+ plot_dict[figpos]['w_'+pred_nme].values,
1427
+ plot_dict[figpos]['w_act'].values,
1428
+ plot_dict[figpos][self.weight_nme].values,
1429
+ n_bins)
1430
+ ax = fig.add_subplot(figpos)
1431
+ ax.plot(plot_data.index, plot_data['act_v'],
1432
+ label='Actual', color='red')
1433
+ ax.plot(plot_data.index, plot_data['exp_v'],
1434
+ label='Predicted', color='blue')
1435
+ ax.set_title(
1436
+ 'Lift Chart on %s' % name_list[figpos], fontsize=8)
1437
+ plt.xticks(plot_data.index,
1438
+ plot_data.index,
1439
+ rotation=90, fontsize=6)
1440
+ plt.yticks(fontsize=6)
1441
+ plt.legend(loc='upper left',
1442
+ fontsize=5, frameon=False)
1443
+ plt.margins(0.05)
1444
+ ax2 = ax.twinx()
1445
+ ax2.bar(plot_data.index, plot_data['weight'],
1446
+ alpha=0.5, color='seagreen',
1447
+ label='Earned Exposure')
1448
+ plt.yticks(fontsize=6)
1449
+ plt.legend(loc='upper right',
1450
+ fontsize=5, frameon=False)
1451
+ plt.subplots_adjust(wspace=0.3)
1452
+ save_path = os.path.join(
1453
+ os.getcwd(), 'plot', f'01_{self.model_nme}_{model_label}_lift.png')
1454
+ plt.savefig(save_path, dpi=300)
1455
+ plt.show()
1456
+ plt.close(fig)
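Both chart methods use the same matplotlib idiom: actual and predicted curves on the left axis, an exposure bar chart on a twinx() right axis, and one legend per axis. A minimal standalone version of that layout (the data is illustrative only):

import matplotlib.pyplot as plt
import numpy as np

x = np.arange(10)
fig, ax = plt.subplots(figsize=(5.5, 5))
ax.plot(x, 1 + 0.05 * np.sin(x), color="red", label="Actual")
ax.plot(x, 1 + 0.04 * np.cos(x), color="blue", label="Predicted")
ax.legend(loc="upper left", fontsize=5, frameon=False)

ax2 = ax.twinx()  # independent right-hand axis for the exposure bars
ax2.bar(x, np.full(10, 100.0), alpha=0.5, color="seagreen",
        label="Earned Exposure")
ax2.legend(loc="upper right", fontsize=5, frameon=False)
plt.show()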
+
+ # Build the binned data behind a double lift chart.
+ def _plot_data_dlift(self,
+                      pred_list_model1, pred_list_model2,
+                      w_list, w_act_list, n_bins=10):
+     lift_data = pd.DataFrame()
+     lift_data.loc[:, 'pred1'] = pred_list_model1
+     lift_data.loc[:, 'pred2'] = pred_list_model2
+     lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+     lift_data.loc[:, 'act'] = w_act_list
+     lift_data.loc[:, 'weight'] = w_list
+     plot_data = self._split_data(lift_data, 'diff_ly', 'weight', n_bins)
+     plot_data['exp_v1'] = plot_data['pred1'] / plot_data['act']
+     plot_data['exp_v2'] = plot_data['pred2'] / plot_data['act']
+     plot_data['act_v'] = plot_data['act'] / plot_data['act']
+     plot_data.reset_index(inplace=True)
+     return plot_data
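To read the double lift data: rows are binned by the ratio pred1/pred2, and within each bin both models' weighted predictions are divided by the weighted actuals, so 'act_v' is identically 1 and the model whose curve stays closer to 1 across bins is the better calibrated of the pair. A toy illustration of those last three columns:

import pandas as pd

# Two bins as produced by _split_data: per-bin sums of each model's
# weighted predictions and of the weighted actuals.
binned = pd.DataFrame({
    "pred1": [120.0, 95.0],
    "pred2": [100.0, 105.0],
    "act":   [110.0, 100.0],
})
binned["exp_v1"] = binned["pred1"] / binned["act"]  # model 1 vs actual
binned["exp_v2"] = binned["pred2"] / binned["act"]  # model 2 vs actual
binned["act_v"] = binned["act"] / binned["act"]     # constant 1.0 baseline
print(binned)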
+
+ # Plot the double lift chart comparing two models.
+ def plot_dlift(self, model_comp=('xgb', 'resn'), n_bins=10):
+     # Display names for the model keys.
+     label_map = {'xgb': 'Xgboost', 'resn': 'ResNet', 'ft': 'FTTransformer'}
+     tt1 = label_map.get(model_comp[0], model_comp[0])
+     tt2 = label_map.get(model_comp[1], model_comp[1])
+     figpos_list = [121, 122]
+     plot_dict = {
+         121: self.train_data,
+         122: self.test_data
+     }
+     name_list = {
+         121: 'Train Data',
+         122: 'Test Data'
+     }
+     fig = plt.figure(figsize=(11, 5))
+     for figpos in figpos_list:
+         plot_data = self._plot_data_dlift(
+             plot_dict[figpos]['w_pred_' + model_comp[0]].values,
+             plot_dict[figpos]['w_pred_' + model_comp[1]].values,
+             plot_dict[figpos][self.weight_nme].values,
+             plot_dict[figpos]['w_act'].values,
+             n_bins)
+         ax = fig.add_subplot(figpos)
+         ax.plot(plot_data.index, plot_data['act_v'],
+                 label='Actual', color='red')
+         ax.plot(plot_data.index, plot_data['exp_v1'],
+                 label=tt1, color='blue')
+         ax.plot(plot_data.index, plot_data['exp_v2'],
+                 label=tt2, color='black')
+         ax.set_title(
+             'Double Lift Chart on %s' % name_list[figpos], fontsize=8)
+         plt.xticks(plot_data.index,
+                    plot_data.index,
+                    rotation=90, fontsize=6)
+         plt.xlabel('%s / %s' % (tt1, tt2), fontsize=6)
+         plt.yticks(fontsize=6)
+         plt.legend(loc='upper left',
+                    fontsize=5, frameon=False)
+         plt.margins(0.1)
+         plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
+         ax2 = ax.twinx()
+         ax2.bar(plot_data.index, plot_data['weight'],
+                 alpha=0.5, color='seagreen',
+                 label='Earned Exposure')
+         plt.yticks(fontsize=6)
+         plt.legend(loc='upper right',
+                    fontsize=5, frameon=False)
+     plt.subplots_adjust(wspace=0.3)
+     # Make sure the plot directory exists before saving.
+     os.makedirs(os.path.join(os.getcwd(), 'plot'), exist_ok=True)
+     save_path = os.path.join(
+         os.getcwd(), 'plot', f'02_{self.model_nme}_dlift.png')
+     plt.savefig(save_path, dpi=300)
+     plt.show()
+     plt.close(fig)
+
+ # Save the models to disk.
+ def save_model(self, model_name=None):
+     # model_name may be:
+     #   - None:   save every model that is available
+     #   - 'xgb':  save only the Xgboost model
+     #   - 'resn': save only the ResNet model
+     #   - 'ft':   save only the FT-Transformer model
+     model_dir = os.path.join(os.getcwd(), 'model')
+     if not os.path.exists(model_dir):
+         os.makedirs(model_dir)
+
+     save_path_xgb = os.path.join(
+         model_dir, f'01_{self.model_nme}_Xgboost.pkl'
+     )
+     save_path_resn = os.path.join(
+         model_dir, f'01_{self.model_nme}_ResNet.pth'
+     )
+     save_path_ft = os.path.join(
+         model_dir, f'01_{self.model_nme}_FTTransformer.pth'
+     )
+
+     # Save XGBoost.
+     if model_name in (None, 'xgb'):
+         if hasattr(self, 'xgb_best'):
+             joblib.dump(self.xgb_best, save_path_xgb)
+         else:
+             print("[save_model] Warning: xgb_best not found; "
+                   "Xgboost model not saved.")
+
+     # Save ResNet (only the core network's state_dict).
+     if model_name in (None, 'resn'):
+         if hasattr(self, 'resn_best'):
+             torch.save(self.resn_best.resnet.state_dict(), save_path_resn)
+         else:
+             print("[save_model] Warning: resn_best not found; "
+                   "ResNet model not saved.")
+
+     # Save the FT-Transformer (the whole sklearn-style wrapper, so that
+     # structure, parameters and best hyper-parameters are all restorable).
+     if model_name in (None, 'ft'):
+         if hasattr(self, 'ft_best'):
+             torch.save(self.ft_best, save_path_ft)
+         else:
+             print("[save_model] Warning: ft_best not found; "
+                   "FT-Transformer model not saved.")
+
+ def load_model(self, model_name=None):
+     # model_name may be:
+     #   - None:   load every model file that can be found
+     #   - 'xgb':  load only the Xgboost model
+     #   - 'resn': load only the ResNet model
+     #   - 'ft':   load only the FT-Transformer model
+     model_dir = os.path.join(os.getcwd(), 'model')
+     save_path_xgb = os.path.join(
+         model_dir, f'01_{self.model_nme}_Xgboost.pkl'
+     )
+     save_path_resn = os.path.join(
+         model_dir, f'01_{self.model_nme}_ResNet.pth'
+     )
+     save_path_ft = os.path.join(
+         model_dir, f'01_{self.model_nme}_FTTransformer.pth'
+     )
+
+     # Load XGBoost.
+     if model_name in (None, 'xgb'):
+         if os.path.exists(save_path_xgb):
+             self.xgb_load = joblib.load(save_path_xgb)
+         else:
+             print(
+                 f"[load_model] Warning: Xgboost model file not found: {save_path_xgb}")
+
+     # Load ResNet (rebuild the wrapper, then load the state_dict).
+     if model_name in (None, 'resn'):
+         if os.path.exists(save_path_resn):
+             self.resn_load = ResNetSklearn(
+                 model_nme=self.model_nme,
+                 input_dim=self.train_oht_scl_data[self.var_nmes].shape[1]
+             )
+             state_dict = torch.load(
+                 save_path_resn, map_location=self.resn_load.device)
+             self.resn_load.resnet.load_state_dict(state_dict)
+         else:
+             print(
+                 f"[load_model] Warning: ResNet model file not found: {save_path_resn}")
+
+     # Load the FT-Transformer (deserialise the sklearn-style wrapper).
+     if model_name in (None, 'ft'):
+         if os.path.exists(save_path_ft):
+             # Load on CPU first, then migrate to the current environment.
+             ft_loaded = torch.load(save_path_ft, map_location='cpu')
+             # Pick the device for this environment and move the core model.
+             if torch.cuda.is_available():
+                 ft_loaded.device = torch.device('cuda')
+             elif torch.backends.mps.is_available():
+                 ft_loaded.device = torch.device('mps')
+             else:
+                 ft_loaded.device = torch.device('cpu')
+             ft_loaded.ft.to(ft_loaded.device)
+
+             self.ft_load = ft_loaded
+         else:
+             print(
+                 f"[load_model] Warning: FT-Transformer model file not found: {save_path_ft}")
+
+ def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
+     # Convert the raw feature DataFrame (containing self.factor_nmes) into
+     # a purely numeric matrix: numeric columns as float64, categorical
+     # columns as integer codes (stored as float64).
+     # Column order follows self.factor_nmes.
+     matrices = []
+
+     for col in self.factor_nmes:
+         s = data[col]
+
+         if col in self.cate_list:
+             # Categorical column: encode against the full category set
+             # fixed at training time.
+             cats = pd.Categorical(
+                 s,
+                 categories=self.cat_categories_for_shap[col]
+             )
+             # cats.codes is an Index / ndarray; wrap it with np.asarray
+             # before reshaping.
+             codes = np.asarray(cats.codes, dtype=np.float64).reshape(-1, 1)
+             matrices.append(codes)
+         else:
+             # Numeric column: Series -> numpy -> reshape.
+             vals = pd.to_numeric(s, errors="coerce")
+             arr = vals.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
+             matrices.append(arr)
+
+     X_mat = np.concatenate(matrices, axis=1)  # (N, F)
+     return X_mat
+
+ def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
+     # Restore a numeric SHAP matrix (N, F) to the raw feature DataFrame:
+     # numeric columns as float, categorical columns back to pandas
+     # category dtype, so the result is compatible both with XGBoost
+     # (enable_categorical=True) and with the FT-Transformer input.
+     # Column order = self.factor_nmes.
+     data_dict = {}
+
+     for j, col in enumerate(self.factor_nmes):
+         col_vals = X_mat[:, j]
+
+         if col in self.cate_list:
+             cats = self.cat_categories_for_shap[col]
+
+             # SHAP perturbs the codes into fractions; round back to ints.
+             codes = np.round(col_vals).astype(int)
+             # Clamp to [-1, len(cats) - 1].
+             codes = np.clip(codes, -1, len(cats) - 1)
+
+             # pandas.Categorical.from_codes:
+             #   - code -1 becomes missing (NaN)
+             #   - other indices map to the matching category in cats
+             cat_series = pd.Categorical.from_codes(
+                 codes,
+                 categories=cats
+             )
+             # Stored as a Categorical, not object.
+             data_dict[col] = cat_series
+         else:
+             # Numeric column: plain float.
+             data_dict[col] = col_vals.astype(float)
+
+     df = pd.DataFrame(data_dict, columns=self.factor_nmes)
+
+     # Belt and braces: make sure every categorical column really has
+     # category dtype.
+     for col in self.cate_list:
+         if col in df.columns:
+             df[col] = df[col].astype("category")
+     return df
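The encode/decode pair relies on pandas categorical codes being invertible: pd.Categorical(...).codes maps each value to its integer position (or -1 for values outside the training-time category set), and Categorical.from_codes maps rounded, clipped codes back, turning -1 into NaN. A self-contained round trip under those assumptions:

import numpy as np
import pandas as pd

cats = ["A", "B", "C"]               # category universe fixed at training time
s = pd.Series(["B", "A", "D", "C"])  # "D" never appeared in training

codes = pd.Categorical(s, categories=cats).codes.astype(np.float64)  # [1, 0, -1, 2]
noisy = codes + np.random.uniform(-0.3, 0.3, size=len(codes))  # SHAP-style jitter

back = np.clip(np.round(noisy).astype(int), -1, len(cats) - 1)
decoded = pd.Categorical.from_codes(back, categories=cats)  # -1 -> NaN
print(list(decoded))  # ['B', 'A', nan, 'C']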
+
+ # ========= XGBoost SHAP =========
+
+ def compute_shap_xgb(self, n_background: int = 500,
+                      n_samples: int = 200,
+                      on_train: bool = True):
+     # Compute SHAP values for XGBoost with KernelExplainer (black-box mode).
+     #
+     # - SHAP sees a purely numeric matrix:
+     #     * numeric features: float64
+     #     * categorical features: integer codes from _build_ft_shap_matrix
+     #       (stored as float64)
+     # - The model still sees the raw DataFrame via xgb_best.predict(...).
+     if not hasattr(self, "xgb_best"):
+         raise RuntimeError(
+             "Run bayesopt_xgb() first so that self.xgb_best is trained.")
+
+     # 1) Pick the data source: train or test set (raw feature space).
+     data = self.train_data if on_train else self.test_data
+     X_raw = data[self.factor_nmes]
+
+     # 2) Build the background matrix (same numeric encoding as for FT).
+     background_raw = X_raw.sample(
+         min(len(X_raw), n_background),
+         random_state=self.rand_seed
+     )
+     background_mat = self._build_ft_shap_matrix(
+         background_raw
+     ).astype(np.float64, copy=True)
+
+     # 3) Black-box predict function: numeric matrix -> DataFrame -> xgb_best.
+     def f_predict(x_mat: np.ndarray) -> np.ndarray:
+         # Restore the encoded matrix to the raw DataFrame (numeric + categorical).
+         df_input = self._decode_ft_shap_matrix_to_df(x_mat)
+         # Uses self.xgb_best.predict, exactly as in training / prediction.
+         y_pred = self.xgb_best.predict(df_input)
+         return y_pred
+
+     explainer = shap.KernelExplainer(f_predict, background_mat)
+
+     # 4) Samples to explain: raw features plus their numeric encoding.
+     X_explain_raw = X_raw.sample(
+         min(len(X_raw), n_samples),
+         random_state=self.rand_seed
+     )
+     X_explain_mat = self._build_ft_shap_matrix(
+         X_explain_raw
+     ).astype(np.float64, copy=True)
+
+     # 5) Compute the SHAP values (nsamples='auto' keeps the cost in check).
+     shap_values = explainer.shap_values(X_explain_mat, nsamples="auto")
+
+     # 6) Store the results:
+     #    - shap_values: in the encoded space, one column per factor_nmes entry
+     #    - X_explain_raw: raw DataFrame, so plots can show real category names
+     self.shap_xgb = {
+         "explainer": explainer,
+         "X_explain": X_explain_raw,
+         "shap_values": shap_values,
+         "base_value": explainer.expected_value,
+     }
+     return self.shap_xgb
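All three compute_shap_* methods are instances of the same KernelExplainer recipe: a black-box predict function over a numeric matrix, a background sample, and shap_values for a subset of rows. A self-contained version with a linear function standing in for the model:

import numpy as np
import shap

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
coef = np.array([1.0, -2.0, 0.5, 0.0])

def f_predict(x_mat):
    # Black-box prediction over a numeric matrix, as in f_predict above.
    return x_mat @ coef

background = shap.sample(X, 100, random_state=0)  # background distribution
explainer = shap.KernelExplainer(f_predict, background)
shap_values = explainer.shap_values(X[:20], nsamples="auto")
print(shap_values.shape)           # (20, 4)
print(explainer.expected_value)    # mean prediction over the background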
+
+ # ========= ResNet SHAP =========
+
+ def _resn_predict_wrapper(self, X_np: np.ndarray) -> np.ndarray:
+     # Prediction wrapper around the ResNet model for SHAP.
+     # X_np: numpy array, shape = (N, n_features), columns follow self.var_nmes.
+     X_df = pd.DataFrame(X_np, columns=self.var_nmes)
+     return self.resn_best.predict(X_df)
+
+ def compute_shap_resn(self, n_background: int = 500,
+                       n_samples: int = 200,
+                       on_train: bool = True):
+     # Compute SHAP values for the ResNet with KernelExplainer.
+     # Explanation space: the one-hot encoded & standardised features self.var_nmes.
+     if not hasattr(self, 'resn_best'):
+         raise RuntimeError(
+             "Run bayesopt_resnet() first so that resn_best is trained.")
+
+     # Pick the data set (already one-hot encoded & standardised).
+     data = self.train_oht_scl_data if on_train else self.test_oht_scl_data
+     X = data[self.var_nmes]
+
+     # Background sample: float64 numpy.
+     background_df = X.sample(
+         min(len(X), n_background),
+         random_state=self.rand_seed
+     )
+     background_np = background_df.to_numpy(dtype=np.float64, copy=True)
+
+     # Black-box predict function.
+     def f_predict(x):
+         return self._resn_predict_wrapper(x)
+
+     explainer = shap.KernelExplainer(f_predict, background_np)
+
+     # Samples to explain.
+     X_explain_df = X.sample(
+         min(len(X), n_samples),
+         random_state=self.rand_seed
+     )
+     X_explain_np = X_explain_df.to_numpy(dtype=np.float64, copy=True)
+
+     shap_values = explainer.shap_values(X_explain_np, nsamples="auto")
+
+     self.shap_resn = {
+         "explainer": explainer,
+         "X_explain": X_explain_df,   # DataFrame: for plotting (has column names)
+         "shap_values": shap_values,  # numpy: (n_samples, n_features)
+         "base_value": explainer.expected_value,
+     }
+     return self.shap_resn
+
+ # ========= FT-Transformer SHAP =========
+
+ def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
+     # Prediction wrapper for SHAP:
+     # numeric matrix -> restore the raw feature DataFrame -> ft_best.predict.
+     df_input = self._decode_ft_shap_matrix_to_df(X_mat)
+     y_pred = self.ft_best.predict(df_input)
+     return y_pred
+
+ def compute_shap_ft(self, n_background: int = 500,
+                     n_samples: int = 200,
+                     on_train: bool = True):
+     # Compute SHAP values for the FT-Transformer with KernelExplainer.
+     # Explanation space: the mixed numeric matrix of values and category
+     # codes (float64); plots still show the raw feature names / values
+     # via X_explain.
+     if not hasattr(self, "ft_best"):
+         raise RuntimeError(
+             "Run bayesopt_ft() first so that ft_best is trained.")
+
+     # Pick the data source (raw feature space).
+     data = self.train_data if on_train else self.test_data
+     X_raw = data[self.factor_nmes]
+
+     # Background matrix.
+     background_raw = X_raw.sample(
+         min(len(X_raw), n_background),
+         random_state=self.rand_seed
+     )
+     background_mat = self._build_ft_shap_matrix(
+         background_raw
+     ).astype(np.float64, copy=True)
+
+     # Black-box predict function (numeric matrix -> DataFrame -> FT model).
+     def f_predict(x):
+         return self._ft_shap_predict_wrapper(x)
+
+     explainer = shap.KernelExplainer(f_predict, background_mat)
+
+     # Samples to explain (raw feature space).
+     X_explain_raw = X_raw.sample(
+         min(len(X_raw), n_samples),
+         random_state=self.rand_seed
+     )
+     X_explain_mat = self._build_ft_shap_matrix(
+         X_explain_raw
+     ).astype(np.float64, copy=True)
+
+     shap_values = explainer.shap_values(X_explain_mat, nsamples="auto")
+
+     self.shap_ft = {
+         "explainer": explainer,
+         "X_explain": X_explain_raw,  # raw feature DataFrame, for plotting
+         "shap_values": shap_values,  # numpy: (n_samples, n_features)
+         "base_value": explainer.expected_value,
+     }
+     return self.shap_ft