ins_pricing-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,2731 @@
+ # Transferring data between CPU and GPU carries significant overhead, but multiple
+ # CUDA streams can overlap transfers with compute, making larger datasets workable.
+
+ import copy
+ import gc
+ import math
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+ import csv
+
+ import joblib
+ import matplotlib.pyplot as plt
+ import numpy as np  # 1.26.2
+ import optuna  # 4.3.0
+ import pandas as pd  # 2.2.3
+ import shap
+ import statsmodels.api as sm
+
+ import torch  # version: 1.10.1+cu111
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import xgboost as xgb  # 1.7.0
+
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
+ from torch.cuda.amp import autocast, GradScaler
+ from torch.nn.utils import clip_grad_norm_
+ from sklearn.model_selection import ShuffleSplit, cross_val_score  # 1.2.2
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import log_loss, make_scorer, mean_tweedie_deviance
+
+ # =============================================================================
+ # Constants & utilities
+ # =============================================================================
+ EPS = 1e-8
+
+
+ class IOUtils:
+     """File and path helper utilities."""
+
+     @staticmethod
+     def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
+         with open(file_path, mode='r', encoding='utf-8') as file:
+             reader = csv.DictReader(file)
+             return [
+                 dict(filter(lambda item: item[0] != '', row.items()))
+                 for row in reader
+             ]
+
+     @staticmethod
+     def ensure_parent_dir(file_path: str) -> None:
+         # Create the target file's parent directory if it does not already exist.
+         directory = os.path.dirname(file_path)
+         if directory:
+             os.makedirs(directory, exist_ok=True)
+
+
+ class TrainingUtils:
+     """Small training-time helpers."""
+
+     @staticmethod
+     def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
+         estimated = int((learning_rate / 1e-4) ** 0.5 *
+                         (data_size / max(batch_num, 1)))
+         return max(1, min(data_size, max(minimum, estimated)))
+
+     @staticmethod
+     def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
+         # Clamp predictions to be strictly positive for numerical stability.
+         pred_clamped = torch.clamp(pred, min=eps)
+         if p == 1:
+             term1 = target * torch.log(target / pred_clamped + eps)  # Poisson
+             term2 = target - pred_clamped  # so the branch equals 2*(y*log(y/mu) - y + mu)
+             term3 = 0
+         elif p == 0:
+             term1 = 0.5 * torch.pow(target - pred_clamped, 2)  # Gaussian
+             term2 = 0
+             term3 = 0
+         elif p == 2:
+             term1 = torch.log(pred_clamped / target + eps)  # Gamma
+             term2 = -target / pred_clamped + 1
+             term3 = 0
+         else:
+             term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
+             term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
+             term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
+         return torch.nan_to_num(  # Tweedie negative log-likelihood (constant terms dropped)
+             2 * (term1 - term2 + term3),
+             nan=eps,
+             posinf=max_clip,
+             neginf=-max_clip
+         )
+
+     @staticmethod
+     def free_cuda() -> None:
+         print(">>> Moving all models to CPU...")
+         for obj in gc.get_objects():
+             try:
+                 if hasattr(obj, "to") and callable(obj.to):
+                     obj.to("cpu")
+             except Exception:
+                 pass
+
+         print(">>> Deleting tensors, optimizers, dataloaders...")
+         gc.collect()
+
+         print(">>> Emptying CUDA cache...")
+         torch.cuda.empty_cache()
+         torch.cuda.synchronize()
+
+         print(">>> CUDA memory freed.")
+
+
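Editor's note: the batch-size heuristic above scales with the square root of the learning rate and linearly with data_size / batch_num, then clamps the result to [minimum, data_size]. A worked example (illustrative numbers, not from the package):

    # (0.01 / 1e-4) ** 0.5 = 10.0; 100_000 / 100 = 1000; estimated = 10_000
    # max(1, min(100_000, max(64, 10_000))) -> 10_000
    TrainingUtils.compute_batch_size(
        data_size=100_000, learning_rate=0.01, batch_num=100, minimum=64)  # 10000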
+ class PlotUtils:
+     """Plotting helpers used across models."""
+
+     @staticmethod
+     def split_data(data: pd.DataFrame, col_nme: str, wgt_nme: str, n_bins: int = 10) -> pd.DataFrame:
+         data_sorted = data.sort_values(by=col_nme, ascending=True).copy()
+         data_sorted['cum_weight'] = data_sorted[wgt_nme].cumsum()
+         w_sum = data_sorted[wgt_nme].sum()
+         if w_sum <= EPS:
+             data_sorted.loc[:, 'bins'] = 0
+         else:
+             data_sorted.loc[:, 'bins'] = np.floor(
+                 data_sorted['cum_weight'] * float(n_bins) / w_sum
+             )
+             data_sorted.loc[(data_sorted['bins'] == n_bins),
+                             'bins'] = n_bins - 1
+         return data_sorted.groupby(['bins'], observed=True).sum(numeric_only=True)
+
+     @staticmethod
+     def plot_lift_list(pred_model, w_pred_list, w_act_list,
+                        weight_list, tgt_nme, n_bins: int = 10,
+                        fig_nme: str = 'Lift Chart'):
+         lift_data = pd.DataFrame()
+         lift_data.loc[:, 'pred'] = pred_model
+         lift_data.loc[:, 'w_pred'] = w_pred_list
+         lift_data.loc[:, 'act'] = w_act_list
+         lift_data.loc[:, 'weight'] = weight_list
+         plot_data = PlotUtils.split_data(lift_data, 'pred', 'weight', n_bins)
+         plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
+         plot_data['act_v'] = plot_data['act'] / plot_data['weight']
+         plot_data.reset_index(inplace=True)
+         fig = plt.figure(figsize=(7, 5))
+         ax = fig.add_subplot(111)
+         ax.plot(plot_data.index, plot_data['act_v'],
+                 label='Actual', color='red')
+         ax.plot(plot_data.index, plot_data['exp_v'],
+                 label='Predicted', color='blue')
+         ax.set_title(
+             'Lift Chart of %s' % tgt_nme, fontsize=8)
+         plt.xticks(plot_data.index,
+                    plot_data.index,
+                    rotation=90, fontsize=6)
+         plt.yticks(fontsize=6)
+         plt.legend(loc='upper left',
+                    fontsize=5, frameon=False)
+         plt.margins(0.05)
+         ax2 = ax.twinx()
+         ax2.bar(plot_data.index, plot_data['weight'],
+                 alpha=0.5, color='seagreen',
+                 label='Earned Exposure')
+         plt.yticks(fontsize=6)
+         plt.legend(loc='upper right',
+                    fontsize=5, frameon=False)
+         plt.subplots_adjust(wspace=0.3)
+         save_path = os.path.join(
+             os.getcwd(), 'plot', f'05_{tgt_nme}_{fig_nme}.png')
+         IOUtils.ensure_parent_dir(save_path)
+         plt.savefig(save_path, dpi=300)
+         plt.close(fig)
+
+     @staticmethod
+     def plot_dlift_list(pred_model_1, pred_model_2,
+                         model_nme_1, model_nme_2,
+                         tgt_nme,
+                         w_list, w_act_list, n_bins: int = 10,
+                         fig_nme: str = 'Double Lift Chart'):
+         lift_data = pd.DataFrame()
+         lift_data.loc[:, 'pred1'] = pred_model_1
+         lift_data.loc[:, 'pred2'] = pred_model_2
+         lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
+         lift_data.loc[:, 'act'] = w_act_list
+         lift_data.loc[:, 'weight'] = w_list
+         lift_data.loc[:, 'w_pred1'] = lift_data['pred1'] * lift_data['weight']
+         lift_data.loc[:, 'w_pred2'] = lift_data['pred2'] * lift_data['weight']
+         plot_data = PlotUtils.split_data(
+             lift_data, 'diff_ly', 'weight', n_bins)
+         plot_data['exp_v1'] = plot_data['w_pred1'] / plot_data['act']
+         plot_data['exp_v2'] = plot_data['w_pred2'] / plot_data['act']
+         plot_data['act_v'] = plot_data['act'] / plot_data['act']
+         plot_data.reset_index(inplace=True)
+         fig = plt.figure(figsize=(7, 5))
+         ax = fig.add_subplot(111)
+         ax.plot(plot_data.index, plot_data['act_v'],
+                 label='Actual', color='red')
+         ax.plot(plot_data.index, plot_data['exp_v1'],
+                 label=model_nme_1, color='blue')
+         ax.plot(plot_data.index, plot_data['exp_v2'],
+                 label=model_nme_2, color='black')
+         ax.set_title(
+             'Double Lift Chart of %s' % tgt_nme, fontsize=8)
+         plt.xticks(plot_data.index,
+                    plot_data.index,
+                    rotation=90, fontsize=6)
+         plt.xlabel('%s / %s' % (model_nme_1, model_nme_2), fontsize=6)
+         plt.yticks(fontsize=6)
+         plt.legend(loc='upper left',
+                    fontsize=5, frameon=False)
+         plt.margins(0.1)
+         plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
+         ax2 = ax.twinx()
+         ax2.bar(plot_data.index, plot_data['weight'],
+                 alpha=0.5, color='seagreen',
+                 label='Earned Exposure')
+         plt.yticks(fontsize=6)
+         plt.legend(loc='upper right',
+                    fontsize=5, frameon=False)
+         plt.subplots_adjust(wspace=0.3)
+         save_path = os.path.join(
+             os.getcwd(), 'plot', f'06_{tgt_nme}_{fig_nme}.png')
+         IOUtils.ensure_parent_dir(save_path)
+         plt.savefig(save_path, dpi=300)
+         plt.close(fig)
+
+
+ # Backwards-compatible functional wrappers
+ def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
+     return IOUtils.csv_to_dict(file_path)
+
+
+ def ensure_parent_dir(file_path: str) -> None:
+     IOUtils.ensure_parent_dir(file_path)
+
+
+ def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
+     return TrainingUtils.compute_batch_size(data_size, learning_rate, batch_num, minimum)
+
+
+ # Tweedie deviance loss for the PyTorch training loop.
+ # Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances
+ def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
+     return TrainingUtils.tweedie_loss(pred, target, p=p, eps=eps, max_clip=max_clip)
+
+
+ # Release CUDA memory held by stray models and cached allocations.
+ def free_cuda():
+     TrainingUtils.free_cuda()
+
+
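Editor's note: for 1 < p < 2 the expression above is exactly the unit Tweedie deviance, so it can be sanity-checked against scikit-learn using the module's own imports (a sketch; the tolerance allows for float32 and the eps clamping):

    y_true = torch.tensor([1.0, 0.0, 3.0])
    y_pred = torch.tensor([0.8, 0.4, 2.5])
    ours = tweedie_loss(y_pred, y_true, p=1.5).mean().item()
    ref = mean_tweedie_deviance(y_true.numpy(), y_pred.numpy(), power=1.5)
    assert abs(ours - ref) < 1e-4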
+ class TorchTrainerMixin:
+     """Shared utilities for torch-based tabular trainers."""
+
+     def _device_type(self) -> str:
+         return getattr(self, "device", torch.device("cpu")).type
+
+     def _build_dataloader(self,
+                           dataset,
+                           N: int,
+                           base_bs_gpu: tuple,
+                           base_bs_cpu: tuple,
+                           min_bs: int = 64,
+                           target_effective_cuda: int = 8192,
+                           target_effective_cpu: int = 4096,
+                           large_threshold: int = 200_000,
+                           mid_threshold: int = 50_000):
+         batch_size = TrainingUtils.compute_batch_size(
+             data_size=len(dataset),
+             learning_rate=self.learning_rate,
+             batch_num=self.batch_num,
+             minimum=min_bs
+         )
+         gpu_large, gpu_mid, gpu_small = base_bs_gpu
+         cpu_mid, cpu_small = base_bs_cpu
+
+         if self._device_type() == 'cuda':
+             if N > large_threshold:
+                 base_bs = gpu_large
+             elif N > mid_threshold:
+                 base_bs = gpu_mid
+             else:
+                 base_bs = gpu_small
+         else:
+             base_bs = cpu_mid if N > mid_threshold else cpu_small
+
+         batch_size = min(batch_size, base_bs, N)
+         target_effective_bs = target_effective_cuda if self._device_type(
+         ) == 'cuda' else target_effective_cpu
+         accum_steps = max(1, target_effective_bs // batch_size)
+
+         dataloader = DataLoader(
+             dataset,
+             batch_size=batch_size,
+             shuffle=True,
+             num_workers=1,
+             pin_memory=(self._device_type() == 'cuda')
+         )
+         return dataloader, accum_steps
+
+     def _compute_weighted_loss(self, y_pred, y_true, weights, apply_softplus: bool = False):
+         task = getattr(self, "task_type", "regression")
+         if task == 'classification':
+             loss_fn = nn.BCEWithLogitsLoss(reduction='none')
+             losses = loss_fn(y_pred, y_true).view(-1)
+         else:
+             if apply_softplus:
+                 y_pred = F.softplus(y_pred)
+             y_pred = torch.clamp(y_pred, min=1e-6)
+             power = getattr(self, "tw_power", 1.5)
+             losses = tweedie_loss(y_pred, y_true, p=power).view(-1)
+         weighted_loss = (losses * weights.view(-1)).sum() / \
+             torch.clamp(weights.sum(), min=EPS)
+         return weighted_loss
+
+     def _early_stop_update(self, val_loss, best_loss, best_state, patience_counter, model):
+         if val_loss < best_loss:
+             return val_loss, copy.deepcopy(model.state_dict()), 0, False
+         patience_counter += 1
+         should_stop = best_state is not None and patience_counter >= getattr(
+             self, "patience", 0)
+         return best_loss, best_state, patience_counter, should_stop
+
+     def _train_model(self,
+                      model,
+                      dataloader,
+                      accum_steps,
+                      optimizer,
+                      scaler,
+                      forward_fn,
+                      val_forward_fn=None,
+                      apply_softplus: bool = False,
+                      clip_fn=None):
+         device_type = self._device_type()
+         best_loss = float('inf')
+         best_state = None
+         patience_counter = 0
+         stop_training = False
+
+         for _ in range(1, getattr(self, "epochs", 1) + 1):
+             model.train()
+             optimizer.zero_grad()
+
+             for step, batch in enumerate(dataloader):
+                 with autocast(enabled=(device_type == 'cuda')):
+                     y_pred, y_true, w = forward_fn(batch)
+                     weighted_loss = self._compute_weighted_loss(
+                         y_pred, y_true, w, apply_softplus=apply_softplus)
+                     loss_for_backward = weighted_loss / accum_steps
+
+                 scaler.scale(loss_for_backward).backward()
+
+                 if ((step + 1) % accum_steps == 0) or ((step + 1) == len(dataloader)):
+                     if clip_fn is not None:
+                         clip_fn()
+                     scaler.step(optimizer)
+                     scaler.update()
+                     optimizer.zero_grad()
+
+             if val_forward_fn is not None:
+                 model.eval()
+                 with torch.no_grad(), autocast(enabled=(device_type == 'cuda')):
+                     y_val_pred, y_val_true, w_val = val_forward_fn()
+                     val_weighted_loss = self._compute_weighted_loss(
+                         y_val_pred, y_val_true, w_val, apply_softplus=apply_softplus)
+
+                 best_loss, best_state, patience_counter, stop_training = self._early_stop_update(
+                     val_weighted_loss, best_loss, best_state, patience_counter, model)
+                 if stop_training:
+                     break
+
+         return best_state
+
+
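Editor's note: _build_dataloader pairs a physical batch size with an accumulation count so the optimizer always steps on a roughly constant effective batch. A worked example with the ResNet defaults (illustrative numbers):

    # On CUDA with N = 100_000 rows the mid tier applies, so base_bs = 2048.
    # If the learning-rate heuristic also yields >= 2048, batch_size = 2048 and
    # accum_steps = max(1, 8192 // 2048) = 4: gradients from four micro-batches
    # are summed before scaler.step(optimizer), an effective batch of 8192.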
+ # =============================================================================
+ # Plotting helpers
+ # =============================================================================
+
+ def split_data(data, col_nme, wgt_nme, n_bins=10):
+     return PlotUtils.split_data(data, col_nme, wgt_nme, n_bins)
+
+ # Lift-chart plotting wrapper
+
+
+ def plot_lift_list(pred_model, w_pred_list, w_act_list,
+                    weight_list, tgt_nme, n_bins=10,
+                    fig_nme='Lift Chart'):
+     return PlotUtils.plot_lift_list(pred_model, w_pred_list, w_act_list,
+                                     weight_list, tgt_nme, n_bins, fig_nme)
+
+ # Double-lift-chart plotting wrapper
+
+
+ def plot_dlift_list(pred_model_1, pred_model_2,
+                     model_nme_1, model_nme_2,
+                     tgt_nme,
+                     w_list, w_act_list, n_bins=10,
+                     fig_nme='Double Lift Chart'):
+     return PlotUtils.plot_dlift_list(pred_model_1, pred_model_2,
+                                      model_nme_1, model_nme_2,
+                                      tgt_nme, w_list, w_act_list,
+                                      n_bins, fig_nme)
+
+
+ # =============================================================================
+ # ResNet model & sklearn-style wrapper
+ # =============================================================================
+
+ # ResNet model definition.
+ # Residual block: two linear layers + ReLU + a residual connection.
+ # ResBlock subclasses nn.Module.
+ class ResBlock(nn.Module):
+     def __init__(self, dim: int, dropout: float = 0.1,
+                  use_layernorm: bool = False, residual_scale: float = 0.1
+                  ):
+         super().__init__()
+         self.use_layernorm = use_layernorm
+
+         if use_layernorm:
+             Norm = nn.LayerNorm  # normalizes over the last dimension
+         else:
+             def Norm(d): return nn.BatchNorm1d(d)  # keep a switch for trying BatchNorm
+
+         self.norm1 = Norm(dim)
+         self.fc1 = nn.Linear(dim, dim, bias=True)
+         self.act = nn.ReLU(inplace=True)
+         self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+         self.norm2 = Norm(dim)
+         self.fc2 = nn.Linear(dim, dim, bias=True)
+
+         # Residual scaling keeps the residual branch from destabilizing the trunk early on.
+         self.res_scale = nn.Parameter(
+             torch.tensor(residual_scale, dtype=torch.float32)
+         )
+
+     def forward(self, x):
+         # Pre-activation layout
+         out = self.norm1(x)
+         out = self.fc1(out)
+         out = self.act(out)
+         out = self.dropout(out)
+         out = self.norm2(out)
+         out = self.fc2(out)
+         # Scale the residual branch, then add
+         return F.relu(x + self.res_scale * out)
+
+ # ResNetSequential subclasses nn.Module and defines the full network.
+
+
+ class ResNetSequential(nn.Module):
+     # Input tensor shape: (batch, input_dim)
+     # Architecture: linear + norm + ReLU, a stack of residual blocks, then a Softplus output.
+
+     def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1):
+         super(ResNetSequential, self).__init__()
+
+         self.net = nn.Sequential()
+         self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
+
+         if use_layernorm:
+             self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
+         else:
+             self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
+
+         self.net.add_module('relu1', nn.ReLU(inplace=True))
+
+         # Stack of residual blocks
+         for i in range(block_num):
+             self.net.add_module(
+                 f'ResBlk_{i+1}',
+                 ResBlock(
+                     hidden_dim,
+                     dropout=dropout,
+                     use_layernorm=use_layernorm,
+                     residual_scale=residual_scale)
+             )
+
+         self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
+         self.net.add_module('softplus', nn.Softplus())
+
+     def forward(self, x):
+         return self.net(x)
+
+ # Scikit-learn-style interface for the ResNet model.
+
+
+ class ResNetSklearn(TorchTrainerMixin, nn.Module):
+     def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
+                  block_num: int = 2, batch_num: int = 100, epochs: int = 100,
+                  task_type: str = 'regression',
+                  tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
+                  use_layernorm: bool = True, dropout: float = 0.1,
+                  residual_scale: float = 0.1,
+                  use_data_parallel: bool = True):
+         super(ResNetSklearn, self).__init__()
+
+         self.input_dim = input_dim
+         self.hidden_dim = hidden_dim
+         self.block_num = block_num
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.task_type = task_type
+         self.model_nme = model_nme
+         self.learning_rate = learning_rate
+         self.patience = patience
+         self.use_layernorm = use_layernorm
+         self.dropout = dropout
+         self.residual_scale = residual_scale
+
+         # Device selection: cuda > mps > cpu
+         if torch.cuda.is_available():
+             self.device = torch.device('cuda')
+         elif torch.backends.mps.is_available():
+             self.device = torch.device('mps')
+         else:
+             self.device = torch.device('cpu')
+
+         # Tweedie power setting (unused for classification)
+         if self.task_type == 'classification':
+             self.tw_power = None
+         elif 'f' in self.model_nme:
+             self.tw_power = 1
+         elif 's' in self.model_nme:
+             self.tw_power = 2
+         else:
+             self.tw_power = tweedie_power
+
+         # Build the network (on CPU first)
+         core = ResNetSequential(
+             self.input_dim,
+             self.hidden_dim,
+             self.block_num,
+             use_layernorm=self.use_layernorm,
+             dropout=self.dropout,
+             residual_scale=self.residual_scale
+         )
+
+         # For classification, replace the final Softplus
+         if self.task_type == 'classification':
+             core.net.softplus = nn.Identity()
+
+         # ===== Multi-GPU support: DataParallel =====
+         if use_data_parallel and (self.device.type == 'cuda') and (torch.cuda.device_count() > 1):
+             core = nn.DataParallel(core, device_ids=list(
+                 range(torch.cuda.device_count())))
+             # DataParallel scatters inputs across GPUs, but the master device stays cuda:0
+             self.device = torch.device('cuda')
+
+         self.resnet = core.to(self.device)
+
+     # ================ Internal helpers ================
+     def _build_train_val_tensors(self, X_train, y_train, w_train, X_val, y_val, w_val):
+         X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
+         y_tensor = torch.tensor(
+             y_train.values, dtype=torch.float32).view(-1, 1)
+         w_tensor = torch.tensor(w_train.values, dtype=torch.float32).view(
+             -1, 1) if w_train is not None else torch.ones_like(y_tensor)
+
+         has_val = X_val is not None and y_val is not None
+         if has_val:
+             X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
+             y_val_tensor = torch.tensor(
+                 y_val.values, dtype=torch.float32).view(-1, 1)
+             w_val_tensor = torch.tensor(w_val.values, dtype=torch.float32).view(
+                 -1, 1) if w_val is not None else torch.ones_like(y_val_tensor)
+         else:
+             X_val_tensor = y_val_tensor = w_val_tensor = None
+         return X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val
+
+     def forward(self, x):
+         # Accept NumPy input (e.g. from SHAP)
+         if isinstance(x, np.ndarray):
+             x_tensor = torch.tensor(x, dtype=torch.float32)
+         else:
+             x_tensor = x
+
+         x_tensor = x_tensor.to(self.device)
+         y_pred = self.resnet(x_tensor)
+         return y_pred
+
+     # ---------------- Training ----------------
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val = \
+             self._build_train_val_tensors(
+                 X_train, y_train, w_train, X_val, y_val, w_val)
+
+         dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
+         dataloader, accum_steps = self._build_dataloader(
+             dataset,
+             N=X_tensor.shape[0],
+             base_bs_gpu=(4096, 2048, 1024),
+             base_bs_cpu=(1024, 512),
+             min_bs=64,
+             target_effective_cuda=8192,
+             target_effective_cpu=4096
+         )
+
+         # === 4. Optimizer & AMP ===
+         self.optimizer = torch.optim.Adam(
+             self.resnet.parameters(), lr=self.learning_rate)
+         self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         X_val_dev = y_val_dev = w_val_dev = None
+         if has_val:
+             X_val_dev = X_val_tensor.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         def forward_fn(batch):
+             X_batch, y_batch, w_batch = batch
+             X_batch = X_batch.to(self.device, non_blocking=True)
+             y_batch = y_batch.to(self.device, non_blocking=True)
+             w_batch = w_batch.to(self.device, non_blocking=True)
+             y_pred = self.resnet(X_batch)
+             return y_pred, y_batch, w_batch
+
+         def val_forward_fn():
+             return self.resnet(X_val_dev), y_val_dev, w_val_dev
+
+         clip_fn = None
+         if self.device.type == 'cuda':
+             def clip_fn(): return (self.scaler.unscale_(self.optimizer),
+                                    clip_grad_norm_(self.resnet.parameters(), max_norm=1.0))
+
+         best_state = self._train_model(
+             self.resnet,
+             dataloader,
+             accum_steps,
+             self.optimizer,
+             self.scaler,
+             forward_fn,
+             val_forward_fn if has_val else None,
+             apply_softplus=False,
+             clip_fn=clip_fn
+         )
+
+         if has_val and best_state is not None:
+             self.resnet.load_state_dict(best_state)
+
+     # ---------------- Prediction ----------------
+
+     def predict(self, X_test):
+         self.resnet.eval()
+         if isinstance(X_test, pd.DataFrame):
+             X_np = X_test.values.astype(np.float32)
+         else:
+             X_np = X_test
+
+         with torch.no_grad():
+             y_pred = self(X_np).cpu().numpy()
+
+         if self.task_type == 'classification':
+             y_pred = 1 / (1 + np.exp(-y_pred))  # Sigmoid
+         else:
+             y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.flatten()
+
+     # ---------------- Parameter setting ----------------
+
+     def set_params(self, params):
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         return self
+
+
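Editor's note: a minimal end-to-end sketch of the wrapper above (the DataFrame layout and sizes are hypothetical, and it relies only on the module's own imports):

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(1000, 8)),
                     columns=[f'x{i}' for i in range(8)])
    y = pd.Series(rng.gamma(shape=2.0, scale=0.5, size=1000))

    # 'f' in model_nme selects tw_power = 1 (Poisson-style deviance)
    model = ResNetSklearn(model_nme='freq_demo', input_dim=8, epochs=5, patience=2)
    model.fit(X.iloc[:800], y.iloc[:800], X_val=X.iloc[800:], y_val=y.iloc[800:])
    preds = model.predict(X.iloc[800:])  # positive, flattened 1-D array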
+ # =============================================================================
+ # FT-Transformer model & sklearn-style wrapper
+ # =============================================================================
+ # FT-Transformer model definition.
+
+
+ class FeatureTokenizer(nn.Module):
+     # Maps numeric and categorical features to tokens of shape (batch, n_tokens, d_model).
+     # Conventions:
+     # - X_num holds the numeric features, shape (batch, num_numeric)
+     # - X_cat holds the categorical features, shape (batch, num_categorical),
+     #   each column label-encoded as integers in [0, card-1]
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int):
+         super().__init__()
+
+         self.num_numeric = num_numeric
+         self.has_numeric = num_numeric > 0
+
+         if self.has_numeric:
+             self.num_linear = nn.Linear(num_numeric, d_model)
+
+         self.embeddings = nn.ModuleList([
+             nn.Embedding(card, d_model) for card in cat_cardinalities
+         ])
+
+     def forward(self, X_num, X_cat):
+         tokens = []
+
+         if self.has_numeric:
+             # All numeric features map to a single token
+             num_token = self.num_linear(X_num)  # shape (batch, d_model)
+             tokens.append(num_token)
+
+         # Each categorical feature contributes one embedding token
+         for i, emb in enumerate(self.embeddings):
+             tok = emb(X_cat[:, i])  # shape (batch, d_model)
+             tokens.append(tok)
+
+         # Stack into (batch, n_tokens, d_model)
+         x = torch.stack(tokens, dim=1)
+         return x
+
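Editor's note: the token count is therefore 1 (one shared numeric token) plus one per categorical column. A quick shape check (sketch):

    tok = FeatureTokenizer(num_numeric=3, cat_cardinalities=[5, 7], d_model=16)
    X_num = torch.randn(4, 3)
    X_cat = torch.randint(0, 5, (4, 2))  # codes must stay below each cardinality
    print(tok(X_num, X_cat).shape)  # torch.Size([4, 3, 16]): 1 numeric + 2 categorical tokens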
+ # Encoder layer with residual scaling.
+
+
+ class ScaledTransformerEncoderLayer(nn.Module):
+     def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048,
+                  dropout: float = 0.1, residual_scale_attn: float = 1.0,
+                  residual_scale_ffn: float = 1.0, norm_first: bool = True,
+                  ):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(
+             embed_dim=d_model,
+             num_heads=nhead,
+             dropout=dropout,
+             batch_first=True
+         )
+
+         # Feed-forward network
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         # Normalization and dropout
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = nn.GELU()
+         # self.activation = nn.ReLU()
+         self.norm_first = norm_first
+
+         # Residual scaling coefficients
+         self.res_scale_attn = residual_scale_attn
+         self.res_scale_ffn = residual_scale_ffn
+
+     def forward(self, src, src_mask=None, src_key_padding_mask=None):
+         # Input tensor shape: (batch, seq_len, d_model)
+         x = src
+
+         if self.norm_first:
+             # Pre-norm: normalize before attention
+             x = x + self._sa_block(self.norm1(x), src_mask,
+                                    src_key_padding_mask)
+             x = x + self._ff_block(self.norm2(x))
+         else:
+             # Post-norm (normally disabled)
+             x = self.norm1(
+                 x + self._sa_block(x, src_mask, src_key_padding_mask))
+             x = self.norm2(x + self._ff_block(x))
+
+         return x
+
+     def _sa_block(self, x, attn_mask, key_padding_mask):
+         # Self-attention with residual scaling
+         attn_out, _ = self.self_attn(
+             x, x, x,
+             attn_mask=attn_mask,
+             key_padding_mask=key_padding_mask,
+             need_weights=False
+         )
+         return self.res_scale_attn * self.dropout1(attn_out)
+
+     def _ff_block(self, x):
+         # Feed-forward block with residual scaling
+         x2 = self.linear2(self.dropout(self.activation(self.linear1(x))))
+         return self.res_scale_ffn * self.dropout2(x2)
+
+ # FT-Transformer core model.
+
+
+ class FTTransformerCore(nn.Module):
+     # Minimal working FT-Transformer:
+     # - FeatureTokenizer: turns numeric and categorical features into tokens
+     # - TransformerEncoder: captures feature interactions
+     # - pooling + MLP head: emits logits (see the note on Softplus below)
+
+     def __init__(self, num_numeric: int, cat_cardinalities, d_model: int = 64,
+                  n_heads: int = 8, n_layers: int = 4, dropout: float = 0.1,
+                  ):
+         super().__init__()
+
+         self.tokenizer = FeatureTokenizer(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=d_model
+         )
+         scale = 1.0 / math.sqrt(n_layers)  # a reasonable default
+         encoder_layer = ScaledTransformerEncoderLayer(
+             d_model=d_model,
+             nhead=n_heads,
+             dim_feedforward=d_model * 4,
+             dropout=dropout,
+             residual_scale_attn=scale,
+             residual_scale_ffn=scale,
+             norm_first=True,
+         )
+         self.encoder = nn.TransformerEncoder(
+             encoder_layer,
+             num_layers=n_layers
+         )
+         self.n_layers = n_layers
+
+         self.head = nn.Sequential(
+             nn.LayerNorm(d_model),
+             nn.Linear(d_model, d_model),
+             nn.GELU(),
+             # nn.ReLU(),
+             nn.Linear(d_model, 1),
+             # nn.Softplus()  # would force positive outputs for Tweedie / Gamma
+             # Softplus removed: the model outputs logits.
+             # Training and inference decide whether to apply sigmoid or softplus:
+             # - classification: raw logits fed to BCEWithLogitsLoss
+             # - regression: softplus applied at inference time
+         )
+
+     def forward(self, X_num, X_cat):
+
+         # X_num: (batch, num_numeric), float32
+         # X_cat: (batch, num_categorical), long
+
+         tokens = self.tokenizer(X_num, X_cat)  # shape (batch, n_tokens, d_model)
+         x = self.encoder(tokens)  # shape (batch, n_tokens, d_model)
+
+         # Mean-pool over tokens
+         x = x.mean(dim=1)  # shape (batch, d_model)
+
+         out = self.head(x)  # shape (batch, 1), raw logits
+         return out
+
+ # TabularDataset definition.
+
+
+ class TabularDataset(Dataset):
+     def __init__(self, X_num, X_cat, y, w):
+
+         # X_num: torch.float32, shape (N, num_numeric)
+         # X_cat: torch.long, shape (N, num_categorical)
+         # y: torch.float32, shape (N, 1)
+         # w: torch.float32, shape (N, 1)
+
+         self.X_num = X_num
+         self.X_cat = X_cat
+         self.y = y
+         self.w = w
+
+     def __len__(self):
+         return self.y.shape[0]
+
+     def __getitem__(self, idx):
+         return (
+             self.X_num[idx],
+             self.X_cat[idx],
+             self.y[idx],
+             self.w[idx],
+         )
+
+ # Scikit-learn-style interface for the FT-Transformer.
+
+
+ class FTTransformerSklearn(TorchTrainerMixin, nn.Module):
+
+     # sklearn-style wrapper:
+     # - num_cols: list of numeric feature column names
+     # - cat_cols: list of categorical feature column names (label-encoded
+     #   beforehand, values in [0, n_classes-1])
+
+     def __init__(self, model_nme: str, num_cols, cat_cols, d_model: int = 64, n_heads: int = 8,
+                  n_layers: int = 4, dropout: float = 0.1, batch_num: int = 100, epochs: int = 100,
+                  task_type: str = 'regression',
+                  tweedie_power: float = 1.5, learning_rate: float = 1e-3, patience: int = 10,
+                  use_data_parallel: bool = True,
+                  ):
+         super().__init__()
+
+         self.model_nme = model_nme
+         self.num_cols = list(num_cols)
+         self.cat_cols = list(cat_cols)
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.dropout = dropout
+         self.batch_num = batch_num
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.task_type = task_type
+         self.patience = patience
+         if self.task_type == 'classification':
+             self.tw_power = None  # Tweedie power unused for classification
+         elif 'f' in self.model_nme:
+             self.tw_power = 1.0
+         elif 's' in self.model_nme:
+             self.tw_power = 2.0
+         else:
+             self.tw_power = tweedie_power
+         if torch.cuda.is_available():
+             self.device = torch.device("cuda")
+         elif torch.backends.mps.is_available():
+             self.device = torch.device("mps")
+         else:
+             self.device = torch.device("cpu")
+         self.cat_cardinalities = None
+         self.cat_categories = {}
+         self.ft = None
+         self.use_data_parallel = torch.cuda.device_count() > 1 and use_data_parallel
+
+     def _build_model(self, X_train):
+         num_numeric = len(self.num_cols)
+         cat_cardinalities = []
+
+         for col in self.cat_cols:
+             cats = X_train[col].astype('category')
+             categories = cats.cat.categories
+             self.cat_categories[col] = categories  # remember the training-set category universe
+
+             card = len(categories) + 1  # reserve one extra slot for unknown/missing
+             cat_cardinalities.append(card)
+
+         self.cat_cardinalities = cat_cardinalities
+
+         core = FTTransformerCore(
+             num_numeric=num_numeric,
+             cat_cardinalities=cat_cardinalities,
+             d_model=self.d_model,
+             n_heads=self.n_heads,
+             n_layers=self.n_layers,
+             dropout=self.dropout,
+         )
+         if self.use_data_parallel:
+             core = nn.DataParallel(core, device_ids=list(
+                 range(torch.cuda.device_count())))
+             self.device = torch.device("cuda")
+         self.ft = core.to(self.device)
+
+     def _encode_cats(self, X):
+         # The input DataFrame must contain at least every categorical column.
+         # Returns an int64 array of shape (N, num_categorical).
+
+         if not self.cat_cols:
+             return np.zeros((len(X), 0), dtype='int64')
+
+         X_cat_list = []
+         for col in self.cat_cols:
+             # Use the category universe recorded at training time
+             categories = self.cat_categories[col]
+             # Build a Categorical with those fixed categories
+             cats = pd.Categorical(X[col], categories=categories)
+             codes = cats.codes.astype('int64', copy=True)  # -1 marks unknown or missing
+             # Map unknown/missing to the extra "unknown" index len(categories)
+             codes[codes < 0] = len(categories)
+             X_cat_list.append(codes)
+
+         X_cat_np = np.stack(X_cat_list, axis=1)  # shape (N, num_categorical)
+         return X_cat_np
+
+     def _build_train_tensors(self, X_train, y_train, w_train):
+         return self._tensorize_split(X_train, y_train, w_train)
+
+     def _build_val_tensors(self, X_val, y_val, w_val):
+         return self._tensorize_split(X_val, y_val, w_val, allow_none=True)
+
+     def _tensorize_split(self, X, y, w, allow_none: bool = False):
+         if X is None:
+             if allow_none:
+                 return None, None, None, None, False
+             raise ValueError("Input features X must not be None.")
+
+         X_num = torch.tensor(
+             X[self.num_cols].to_numpy(dtype=np.float32, copy=True),
+             dtype=torch.float32
+         )
+         if self.cat_cols:
+             X_cat = torch.tensor(self._encode_cats(X), dtype=torch.long)
+         else:
+             X_cat = torch.zeros((X_num.shape[0], 0), dtype=torch.long)
+
+         y_tensor = torch.tensor(
+             y.values, dtype=torch.float32).view(-1, 1) if y is not None else None
+         if y_tensor is None:
+             w_tensor = None
+         elif w is not None:
+             w_tensor = torch.tensor(
+                 w.values, dtype=torch.float32).view(-1, 1)
+         else:
+             w_tensor = torch.ones_like(y_tensor)
+         return X_num, X_cat, y_tensor, w_tensor, y is not None
+
+     def fit(self, X_train, y_train, w_train=None,
+             X_val=None, y_val=None, w_val=None):
+
+         # Build the underlying model on first fit
+         if self.ft is None:
+             self._build_model(X_train)
+
+         X_num_train, X_cat_train, y_tensor, w_tensor, _ = self._build_train_tensors(
+             X_train, y_train, w_train)
+         X_num_val, X_cat_val, y_val_tensor, w_val_tensor, has_val = self._build_val_tensors(
+             X_val, y_val, w_val)
+
+         # --- Build the DataLoader ---
+         dataset = TabularDataset(
+             X_num_train, X_cat_train, y_tensor, w_tensor
+         )
+
+         dataloader, accum_steps = self._build_dataloader(
+             dataset,
+             N=X_num_train.shape[0],
+             base_bs_gpu=(512, 256, 128),
+             base_bs_cpu=(256, 128),
+             min_bs=64,
+             target_effective_cuda=4096,
+             target_effective_cpu=2048
+         )
+
+         optimizer = torch.optim.Adam(
+             self.ft.parameters(), lr=self.learning_rate)
+         scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+         X_num_val_dev = X_cat_val_dev = y_val_dev = w_val_dev = None
+         if has_val:
+             X_num_val_dev = X_num_val.to(self.device, non_blocking=True)
+             X_cat_val_dev = X_cat_val.to(self.device, non_blocking=True)
+             y_val_dev = y_val_tensor.to(self.device, non_blocking=True)
+             w_val_dev = w_val_tensor.to(self.device, non_blocking=True)
+
+         def forward_fn(batch):
+             X_num_b, X_cat_b, y_b, w_b = batch
+             X_num_b = X_num_b.to(self.device, non_blocking=True)
+             X_cat_b = X_cat_b.to(self.device, non_blocking=True)
+             y_b = y_b.to(self.device, non_blocking=True)
+             w_b = w_b.to(self.device, non_blocking=True)
+             y_pred = self.ft(X_num_b, X_cat_b)
+             return y_pred, y_b, w_b
+
+         def val_forward_fn():
+             return self.ft(X_num_val_dev, X_cat_val_dev), y_val_dev, w_val_dev
+
+         clip_fn = None
+         if self.device.type == 'cuda':
+             def clip_fn(): return (scaler.unscale_(optimizer),
+                                    clip_grad_norm_(self.ft.parameters(), max_norm=1.0))
+
+         best_state = self._train_model(
+             self.ft,
+             dataloader,
+             accum_steps,
+             optimizer,
+             scaler,
+             forward_fn,
+             val_forward_fn if has_val else None,
+             apply_softplus=True,
+             clip_fn=clip_fn
+         )
+
+         if has_val and best_state is not None:
+             self.ft.load_state_dict(best_state)
+
+     def predict(self, X_test):
+         # X_test must contain every numeric and categorical column
+
+         self.ft.eval()
+         X_num, X_cat, _, _, _ = self._tensorize_split(
+             X_test, None, None, allow_none=True)
+
+         with torch.no_grad():
+             X_num = X_num.to(self.device, non_blocking=True)
+             X_cat = X_cat.to(self.device, non_blocking=True)
+             y_pred = self.ft(X_num, X_cat).cpu().numpy()
+
+         if self.task_type == 'classification':
+             # Convert logits to probabilities
+             y_pred = 1 / (1 + np.exp(-y_pred))
+         else:
+             y_pred = np.log(1 + np.exp(y_pred))  # softplus
+             y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.ravel()
+
+     def set_params(self, params: dict):
+
+         # Mirrors the sklearn convention.
+         # Note: structural parameters (e.g. d_model/n_heads) only take effect after a re-fit.
+
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Parameter {key} not found in model.")
+         return self
+
+
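Editor's note: a minimal fit/predict sketch for the FT-Transformer wrapper (column names and sizes are hypothetical; only the module's own imports are used):

    rng = np.random.default_rng(1)
    df = pd.DataFrame({
        'age': rng.normal(40, 10, 600),
        'veh_value': rng.gamma(2.0, 1.0, 600),
        'region': rng.choice(['N', 'S', 'E', 'W'], 600),
    })
    y = pd.Series(rng.gamma(2.0, 0.5, 600))

    # 's' in model_nme selects tw_power = 2.0 (Gamma deviance)
    ft = FTTransformerSklearn(model_nme='sev_demo', num_cols=['age', 'veh_value'],
                              cat_cols=['region'], d_model=32, n_heads=4,
                              n_layers=2, epochs=3)
    ft.fit(df.iloc[:500], y.iloc[:500], X_val=df.iloc[500:], y_val=y.iloc[500:])
    preds = ft.predict(df.iloc[500:])  # softplus keeps the outputs positive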
+ # ===== Building blocks and training wrappers ==================================
+
+ # =============================================================================
+ # Config, preprocessing, and trainer base
+ # =============================================================================
+ @dataclass
+ class BayesOptConfig:
+     model_nme: str
+     resp_nme: str
+     weight_nme: str
+     factor_nmes: List[str]
+     task_type: str = 'regression'
+     binary_resp_nme: Optional[str] = None
+     cate_list: Optional[List[str]] = None
+     prop_test: float = 0.25
+     rand_seed: Optional[int] = None
+     epochs: int = 100
+     use_gpu: bool = True
+     use_resn_data_parallel: bool = True
+     use_ft_data_parallel: bool = True
+
+
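Editor's note: a sketch of how such a config might be filled in for a frequency model (all column names here are hypothetical):

    cfg = BayesOptConfig(
        model_nme='motor_freq',
        resp_nme='claim_freq',
        weight_nme='exposure',
        factor_nmes=['age', 'veh_value', 'region'],
        cate_list=['region'],
        rand_seed=42,
    )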
+ class OutputManager:
+     # Centralizes the output paths for results, plots, and models.
+
+     def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
+         self.root = Path(root or os.getcwd())
+         self.model_name = model_name
+         self.plot_dir = self.root / 'plot'
+         self.result_dir = self.root / 'Results'
+         self.model_dir = self.root / 'model'
+
+     def _prepare(self, path: Path) -> str:
+         ensure_parent_dir(str(path))
+         return str(path)
+
+     def plot_path(self, filename: str) -> str:
+         return self._prepare(self.plot_dir / filename)
+
+     def result_path(self, filename: str) -> str:
+         return self._prepare(self.result_dir / filename)
+
+     def model_path(self, filename: str) -> str:
+         return self._prepare(self.model_dir / filename)
+
+
+ class DatasetPreprocessor:
+     # Prepares the shared train/test data views used by the trainers.
+
+     def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
+                  config: BayesOptConfig) -> None:
+         self.config = config
+         self.train_data = train_df.copy(deep=True)
+         self.test_data = test_df.copy(deep=True)
+         self.num_features: List[str] = []
+         self.train_oht_scl_data: Optional[pd.DataFrame] = None
+         self.test_oht_scl_data: Optional[pd.DataFrame] = None
+         self.var_nmes: List[str] = []
+         self.cat_categories_for_shap: Dict[str, List[Any]] = {}
+
+     def run(self) -> "DatasetPreprocessor":
+         cfg = self.config
+         # Precompute weighted actuals; later plotting and validation rely on this column
+         self.train_data.loc[:, 'w_act'] = self.train_data[cfg.resp_nme] * \
+             self.train_data[cfg.weight_nme]
+         self.test_data.loc[:, 'w_act'] = self.test_data[cfg.resp_nme] * \
+             self.test_data[cfg.weight_nme]
+         if cfg.binary_resp_nme:
+             self.train_data.loc[:, 'w_binary_act'] = self.train_data[cfg.binary_resp_nme] * \
+                 self.train_data[cfg.weight_nme]
+             self.test_data.loc[:, 'w_binary_act'] = self.test_data[cfg.binary_resp_nme] * \
+                 self.test_data[cfg.weight_nme]
+         # Clip at a high quantile to absorb outliers; without it extreme points dominate the loss
+         q99 = self.train_data[cfg.resp_nme].quantile(0.999)
+         self.train_data[cfg.resp_nme] = self.train_data[cfg.resp_nme].clip(
+             upper=q99)
+         cate_list = list(cfg.cate_list or [])
+         if cate_list:
+             for cate in cate_list:
+                 self.train_data[cate] = self.train_data[cate].astype(
+                     'category')
+                 self.test_data[cate] = self.test_data[cate].astype('category')
+                 cats = self.train_data[cate].cat.categories
+                 self.cat_categories_for_shap[cate] = list(cats)
+         self.num_features = [
+             nme for nme in cfg.factor_nmes if nme not in cate_list]
+         train_oht = self.train_data[cfg.factor_nmes +
+                                     [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         test_oht = self.test_data[cfg.factor_nmes +
+                                   [cfg.weight_nme] + [cfg.resp_nme]].copy()
+         train_oht = pd.get_dummies(
+             train_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         test_oht = pd.get_dummies(
+             test_oht,
+             columns=cate_list,
+             drop_first=True,
+             dtype=np.int8
+         )
+         for num_chr in self.num_features:
+             # Standardize column by column so every feature is on the same scale;
+             # otherwise the neural nets struggle to converge
+             scaler = StandardScaler()
+             train_oht[num_chr] = scaler.fit_transform(
+                 train_oht[num_chr].values.reshape(-1, 1))
+             test_oht[num_chr] = scaler.transform(
+                 test_oht[num_chr].values.reshape(-1, 1))
+         # Reindex fills missing dummy columns with zeros so test matches the train columns
+         test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)
+         self.train_oht_scl_data = train_oht
+         self.test_oht_scl_data = test_oht
+         self.var_nmes = list(
+             set(list(train_oht.columns)) - set([cfg.weight_nme, cfg.resp_nme])
+         )
+         return self
+
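Editor's note: typical usage chains the constructor and run(), then reads the prepared views (a sketch reusing the hypothetical cfg above; train_df and test_df are hypothetical DataFrames):

    prep = DatasetPreprocessor(train_df, test_df, cfg).run()
    X_train = prep.train_oht_scl_data[prep.var_nmes]  # one-hot encoded, scaled factors
    X_test = prep.test_oht_scl_data[prep.var_nmes]    # columns aligned to the training view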
+ # =============================================================================
+ # Trainers
+ # =============================================================================
+
+
+ class TrainerBase:
+     def __init__(self, context: "BayesOptModel", label: str) -> None:
+         self.ctx = context
+         self.label = label
+
+     @property
+     def config(self) -> BayesOptConfig:
+         return self.ctx.config
+
+     @property
+     def output(self) -> OutputManager:
+         return self.ctx.output_manager
+
+     def tune(self, max_evals: int) -> None:  # pragma: no cover - overridden by subclasses
+         raise NotImplementedError
+
+     def train(self) -> None:  # pragma: no cover - overridden by subclasses
+         raise NotImplementedError
+
+     def save(self) -> None:
+         pass
+
+     def load(self) -> None:
+         pass
+
+     # Shared prediction write-back: pick the feature source (or a custom design-matrix
+     # builder) and write the pred_xxx / w_pred_xxx columns.
+     def _predict_and_cache(self,
+                            model,
+                            pred_prefix: str,
+                            use_oht: bool = False,
+                            design_fn=None) -> None:
+         if design_fn:
+             X_train = design_fn(train=True)
+             X_test = design_fn(train=False)
+         elif use_oht:
+             X_train = self.ctx.train_oht_scl_data[self.ctx.var_nmes]
+             X_test = self.ctx.test_oht_scl_data[self.ctx.var_nmes]
+         else:
+             X_train = self.ctx.train_data[self.ctx.factor_nmes]
+             X_test = self.ctx.test_data[self.ctx.factor_nmes]
+
+         preds_train = model.predict(X_train)
+         preds_test = model.predict(X_test)
+
+         self.ctx.train_data[f'pred_{pred_prefix}'] = preds_train
+         self.ctx.test_data[f'pred_{pred_prefix}'] = preds_test
+         self.ctx.train_data[f'w_pred_{pred_prefix}'] = (
+             self.ctx.train_data[f'pred_{pred_prefix}'] *
+             self.ctx.train_data[self.ctx.weight_nme]
+         )
+         self.ctx.test_data[f'w_pred_{pred_prefix}'] = (
+             self.ctx.test_data[f'pred_{pred_prefix}'] *
+             self.ctx.test_data[self.ctx.weight_nme]
+         )
+
+     def _fit_predict_cache(self,
+                            model,
+                            X_train,
+                            y_train,
+                            sample_weight,
+                            pred_prefix: str,
+                            use_oht: bool = False,
+                            design_fn=None,
+                            fit_kwargs: Optional[Dict[str, Any]] = None,
+                            sample_weight_arg: Optional[str] = 'sample_weight') -> None:
+         fit_kwargs = fit_kwargs.copy() if fit_kwargs else {}
+         if sample_weight is not None and sample_weight_arg:
+             fit_kwargs.setdefault(sample_weight_arg, sample_weight)
+         model.fit(X_train, y_train, **fit_kwargs)
+         self.ctx.model_label += [self.label]
+         self._predict_and_cache(
+             model, pred_prefix, use_oht=use_oht, design_fn=design_fn)
+
+
1301
+ class XGBTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'Xgboost')
+         self.model: Optional[xgb.XGBRegressor] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def _build_estimator(self) -> xgb.XGBRegressor:
+         params = dict(
+             objective=self.ctx.obj,
+             random_state=self.ctx.rand_seed,
+             subsample=0.9,
+             tree_method='gpu_hist' if self.ctx.use_gpu else 'hist',
+             enable_categorical=True,
+             predictor='gpu_predictor' if self.ctx.use_gpu else 'cpu_predictor'
+         )
+         if self.ctx.use_gpu:
+             params['gpu_id'] = 0
+         return xgb.XGBRegressor(**params)
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-5, 1e-1, log=True)
+         gamma = trial.suggest_float('gamma', 0, 10000)
+         max_depth = trial.suggest_int('max_depth', 3, 25)
+         n_estimators = trial.suggest_int('n_estimators', 10, 500, step=10)
+         min_child_weight = trial.suggest_int(
+             'min_child_weight', 100, 10000, step=100)
+         reg_alpha = trial.suggest_float('reg_alpha', 1e-10, 1, log=True)
+         reg_lambda = trial.suggest_float('reg_lambda', 1e-10, 1, log=True)
+         if self.ctx.obj == 'reg:tweedie':
+             tweedie_variance_power = trial.suggest_float(
+                 'tweedie_variance_power', 1, 2)
+         elif self.ctx.obj == 'count:poisson':
+             tweedie_variance_power = 1
+         elif self.ctx.obj == 'reg:gamma':
+             tweedie_variance_power = 2
+         else:
+             tweedie_variance_power = 1.5
+         clf = self._build_estimator()
+         params = {
+             'learning_rate': learning_rate,
+             'gamma': gamma,
+             'max_depth': max_depth,
+             'n_estimators': n_estimators,
+             'min_child_weight': min_child_weight,
+             'reg_alpha': reg_alpha,
+             'reg_lambda': reg_lambda
+         }
+         if self.ctx.obj == 'reg:tweedie':
+             params['tweedie_variance_power'] = tweedie_variance_power
+         clf.set_params(**params)
+         n_jobs = 1 if self.ctx.use_gpu else int(1 / self.ctx.prop_test)
+         acc = cross_val_score(
+             clf,
+             self.ctx.train_data[self.ctx.factor_nmes],
+             self.ctx.train_data[self.ctx.resp_nme].values,
+             fit_params=self.ctx.fit_params,
+             cv=self.ctx.cv,
+             scoring=make_scorer(
+                 mean_tweedie_deviance,
+                 power=tweedie_variance_power,
+                 greater_is_better=False),
+             error_score='raise',
+             n_jobs=n_jobs
+         ).mean()
+         return -acc
+
+     def tune(self, max_evals: int = 100) -> None:
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+         )
+         study.optimize(self.cross_val, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_xgb.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError(
+                 'Run tune() first to obtain the best XGB parameters.')
+         self.model = self._build_estimator()
+         self.model.set_params(**self.best_params)
+         self._fit_predict_cache(
+             self.model,
+             self.ctx.train_data[self.ctx.factor_nmes],
+             self.ctx.train_data[self.ctx.resp_nme].values,
+             sample_weight=None,
+             pred_prefix='xgb',
+             fit_kwargs=self.ctx.fit_params,
+             sample_weight_arg=None  # already supplied via fit_kwargs
+         )
+         self.ctx.xgb_best = self.model
+
+     def save(self) -> None:
+         if self.model is not None:
+             joblib.dump(self.model, self.output.model_path(
+                 f'01_{self.ctx.model_nme}_Xgboost.pkl'))
+
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_Xgboost.pkl')
+         if os.path.exists(path):
+             self.model = joblib.load(path)
+             self.ctx.xgb_best = self.model
+         else:
+             print(f"[load_model] Warning: Xgboost model file not found: {path}")
+
+
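+ # Usage sketch for the trainer above (illustrative; assumes `bo` is an
+ # already-constructed BayesOptModel, defined later in this module). Every
+ # trainer follows the same tune -> train -> save lifecycle:
+ #
+ #     trainer = bo.trainers['xgb']
+ #     trainer.tune(max_evals=100)   # Optuna TPE search over the space above
+ #     trainer.train()               # refit with best_params, cache predictions
+ #     trainer.save()                # -> 01_<model_nme>_Xgboost.pkl via OutputManager
+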
+ class GLMTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         super().__init__(context, 'GLM')
+         self.model = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def _select_family(self, tweedie_power: Optional[float] = None):
+         if self.ctx.task_type == 'classification':
+             return sm.families.Binomial()
+         if self.ctx.obj == 'count:poisson':
+             return sm.families.Poisson()
+         if self.ctx.obj == 'reg:gamma':
+             return sm.families.Gamma()
+         power = tweedie_power if tweedie_power is not None else 1.5
+         return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
+
+     def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
+         # Add an intercept term for statsmodels
+         X = data[self.ctx.var_nmes]
+         return sm.add_constant(X, has_constant='add')
+
+     def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
+         if isinstance(family, sm.families.Poisson):
+             return 1.0
+         if isinstance(family, sm.families.Gamma):
+             return 2.0
+         if isinstance(family, sm.families.Tweedie):
+             return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
+         return 1.5
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         alpha = trial.suggest_float('alpha', 1e-6, 1e2, log=True)
+         l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
+         tweedie_power = None
+         if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie':
+             tweedie_power = trial.suggest_float('tweedie_power', 1.01, 1.99)
+
+         X_all = self._prepare_design(self.ctx.train_oht_scl_data)
+         y_all = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
+         w_all = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+
+         scores = []
+         for train_idx, val_idx in self.ctx.cv.split(X_all):
+             X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
+             y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]
+             w_train, w_val = w_all.iloc[train_idx], w_all.iloc[val_idx]
+
+             family = self._select_family(tweedie_power)
+             glm = sm.GLM(y_train, X_train, family=family,
+                          freq_weights=w_train)
+             result = glm.fit_regularized(
+                 alpha=alpha, L1_wt=l1_ratio, maxiter=200)
+
+             y_pred = result.predict(X_val)
+             if self.ctx.task_type == 'classification':
+                 y_pred = np.clip(y_pred, EPS, 1 - EPS)
+                 fold_score = log_loss(
+                     y_val, y_pred, sample_weight=w_val)
+             else:
+                 y_pred = np.maximum(y_pred, EPS)
+                 fold_score = mean_tweedie_deviance(
+                     y_val,
+                     y_pred,
+                     sample_weight=w_val,
+                     power=self._metric_power(family, tweedie_power)
+                 )
+             scores.append(fold_score)
+
+         return float(np.mean(scores))
+
+     def tune(self, max_evals: int = 50) -> None:
+         study = optuna.create_study(
+             direction='minimize', sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed))
+         study.optimize(self.cross_val, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_glm.csv')
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError(
+                 'Run tune() first to obtain the best GLM parameters.')
+         tweedie_power = self.best_params.get('tweedie_power')
+         family = self._select_family(tweedie_power)
+
+         X_train = self._prepare_design(self.ctx.train_oht_scl_data)
+         y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
+         w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+
+         glm = sm.GLM(y_train, X_train, family=family,
+                      freq_weights=w_train)
+         self.model = glm.fit_regularized(
+             alpha=self.best_params['alpha'],
+             L1_wt=self.best_params['l1_ratio'],
+             maxiter=300
+         )
+
+         self.ctx.glm_best = self.model
+         self.ctx.model_label += [self.label]
+         self._predict_and_cache(
+             self.model,
+             'glm',
+             design_fn=lambda train: self._prepare_design(
+                 self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
+             )
+         )
+
+     def save(self) -> None:
+         if self.model is not None:
+             joblib.dump(self.model, self.output.model_path(
+                 f'01_{self.ctx.model_nme}_GLM.pkl'))
+
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_GLM.pkl')
+         if os.path.exists(path):
+             self.model = joblib.load(path)
+             self.ctx.glm_best = self.model
+         else:
+             print(f"[load_model] Warning: GLM model file not found: {path}")
+
+
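+ # The GLM above is an elastic-net-penalised statsmodels fit, scored with the
+ # Tweedie deviance power implied by its family. A minimal standalone sketch of
+ # the same pattern (toy data; all names here are placeholders):
+ #
+ #     import numpy as np
+ #     import statsmodels.api as sm
+ #     rng = np.random.default_rng(0)
+ #     X = sm.add_constant(rng.normal(size=(100, 3)))   # intercept + 3 features
+ #     y = rng.gamma(2.0, 1.0, size=100)                # positive response
+ #     fam = sm.families.Tweedie(var_power=1.5,
+ #                               link=sm.families.links.log())
+ #     res = sm.GLM(y, X, family=fam).fit_regularized(alpha=0.01, L1_wt=0.5)
+ #     y_hat = res.predict(X)
+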
+ class ResNetTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         if context.task_type == 'classification':
+             super().__init__(context, 'ResNetClassifier')
+         else:
+             super().__init__(context, 'ResNet')
+         self.model: Optional[ResNetSklearn] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     # ========= Cross-validation (used by BayesOpt) =========
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         """
+         Cross-validate the ResNet.
+         To avoid running out of GPU memory:
+         - create a fresh ResNetSklearn for every fold
+         - when a fold finishes, move the model to CPU, delete it, gc, empty_cache
+         - optionally run the BayesOpt stage on a subsample of the training set
+         """
+
+         # 1. Hyperparameter space (largely the earlier settings)
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-6, 1e-2, log=True
+         )
+         # hidden_dim = trial.suggest_int('hidden_dim', 32, 256, step=32)  # should not be too large
+         hidden_dim = trial.suggest_int('hidden_dim', 8, 32, step=2)
+         block_num = trial.suggest_int('block_num', 2, 10)
+         # batch_num = trial.suggest_int(
+         #     'batch_num',
+         #     10 if self.ctx.obj == 'reg:gamma' else 100,
+         #     100 if self.ctx.obj == 'reg:gamma' else 1000,
+         #     step=10 if self.ctx.obj == 'reg:gamma' else 100
+         # )
+
+         if self.ctx.task_type == 'regression':
+             if self.ctx.obj == 'reg:tweedie':
+                 tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
+             elif self.ctx.obj == 'count:poisson':
+                 tw_power = 1.0
+             elif self.ctx.obj == 'reg:gamma':
+                 tw_power = 2.0
+             else:
+                 tw_power = 1.5
+         else:  # classification
+             tw_power = None  # Not used
+
+         fold_losses = []
+
+         # 2. (Optional) run the BayesOpt CV on a subsample only, easing GPU
+         #    memory and time pressure
+         data_for_cv = self.ctx.train_oht_scl_data
+         max_rows_for_resnet_bo = min(100000, int(
+             len(data_for_cv) / 5))  # shrink to suit your GPU, e.g. 50_000 on an A30
+         if len(data_for_cv) > max_rows_for_resnet_bo:
+             data_for_cv = data_for_cv.sample(
+                 max_rows_for_resnet_bo,
+                 random_state=self.ctx.rand_seed
+             )
+
+         X_all = data_for_cv[self.ctx.var_nmes]
+         y_all = data_for_cv[self.ctx.resp_nme]
+         w_all = data_for_cv[self.ctx.weight_nme]
+
+         # Use a local ShuffleSplit so indices stay consistent on the subsample
+         cv_local = ShuffleSplit(
+             n_splits=int(1 / self.ctx.prop_test),
+             test_size=self.ctx.prop_test,
+             random_state=self.ctx.rand_seed
+         )
+
+         for fold, (train_idx, val_idx) in enumerate(cv_local.split(X_all)):
+             X_train_fold = X_all.iloc[train_idx]
+             y_train_fold = y_all.iloc[train_idx]
+             w_train_fold = w_all.iloc[train_idx]
+
+             X_val_fold = X_all.iloc[val_idx]
+             y_val_fold = y_all.iloc[val_idx]
+             w_val_fold = w_all.iloc[val_idx]
+
+             # 3. Create a temporary ResNet for this fold
+             cv_net = ResNetSklearn(
+                 model_nme=self.ctx.model_nme,
+                 input_dim=X_all.shape[1],
+                 hidden_dim=hidden_dim,
+                 block_num=block_num,
+                 task_type=self.ctx.task_type,
+                 # batch_num=batch_num,
+                 epochs=self.ctx.epochs,
+                 tweedie_power=tw_power,
+                 learning_rate=learning_rate,
+                 patience=5,
+                 use_layernorm=True,
+                 dropout=0.1,
+                 residual_scale=0.1,
+                 use_data_parallel=self.ctx.config.use_resn_data_parallel
+             )
+
+             try:
+                 # 4. Fit (training still uses the custom tweedie_loss internally)
+                 cv_net.fit(
+                     X_train_fold,
+                     y_train_fold,
+                     w_train_fold,
+                     X_val_fold,
+                     y_val_fold,
+                     w_val_fold
+                 )
+
+                 # 5. Predict on the validation fold
+                 y_pred_fold = cv_net.predict(X_val_fold)
+
+                 # 6. Evaluate with Tweedie deviance (evaluation only; the
+                 #    training loss is untouched)
+                 if self.ctx.task_type == 'regression':
+                     loss = mean_tweedie_deviance(
+                         y_val_fold,
+                         y_pred_fold,
+                         sample_weight=w_val_fold,
+                         power=tw_power
+                     )
+                 else:  # classification
+                     from sklearn.metrics import log_loss
+                     loss = log_loss(
+                         y_val_fold,
+                         y_pred_fold,
+                         sample_weight=w_val_fold,
+                     )
+                 fold_losses.append(loss)
+             finally:
+                 # 7. Release GPU resources at the end of every fold
+                 try:
+                     if hasattr(cv_net, "resnet"):
+                         cv_net.resnet.to("cpu")
+                 except Exception:
+                     pass
+                 del cv_net
+                 gc.collect()
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+
+         return np.mean(fold_losses)
+
+     # ========= Optuna tuning =========
+     def tune(self, max_evals: int = 50) -> None:
+         """
+         Bayesian optimisation of the ResNet with Optuna.
+         After each trial completes, run one more global GPU memory cleanup.
+         """
+         def objective(trial: optuna.trial.Trial) -> float:
+             result = self.cross_val(trial)
+             # trial-level safety-net cleanup
+             gc.collect()
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             return result
+
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+         )
+         study.optimize(objective, n_trials=max_evals)
+
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_resn.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     # ========= Train the final ResNet with the best hyperparameters =========
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError(
+                 'Run tune() first to obtain the best ResNet parameters.')
+
+         self.model = ResNetSklearn(
+             model_nme=self.ctx.model_nme,
+             input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1],
+             task_type=self.ctx.task_type,
+             use_data_parallel=self.ctx.config.use_resn_data_parallel
+         )
+         self.model.set_params(self.best_params)
+
+         self._fit_predict_cache(
+             self.model,
+             self.ctx.train_oht_scl_data[self.ctx.var_nmes],
+             self.ctx.train_oht_scl_data[self.ctx.resp_nme],
+             sample_weight=self.ctx.train_oht_scl_data[self.ctx.weight_nme],
+             pred_prefix='resn',
+             use_oht=True,
+             sample_weight_arg='w_train'
+         )
+
+         # convenient handle for external callers
+         self.ctx.resn_best = self.model
+
+     # ========= Save / load =========
+     def save(self) -> None:
+         """
+         Save only the ResNet state_dict (lightweight; no optimizer state).
+         """
+         if self.model is not None:
+             path = self.output.model_path(
+                 f'01_{self.ctx.model_nme}_ResNet.pth'
+             )
+             torch.save(self.model.resnet.state_dict(), path)
+
+     def load(self) -> None:
+         """
+         Load the ResNet from file onto an appropriate device.
+         """
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_ResNet.pth'
+         )
+         if os.path.exists(path):
+             resn_loaded = ResNetSklearn(
+                 model_nme=self.ctx.model_nme,
+                 input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1],
+                 task_type=self.ctx.task_type,
+                 use_data_parallel=self.ctx.config.use_resn_data_parallel
+             )
+             state_dict = torch.load(path, map_location='cpu')
+             resn_loaded.resnet.load_state_dict(state_dict)
+
+             # Choose the device based on the current environment
+             if torch.cuda.is_available():
+                 resn_loaded.device = torch.device('cuda')
+             elif torch.backends.mps.is_available():
+                 resn_loaded.device = torch.device('mps')
+             else:
+                 resn_loaded.device = torch.device('cpu')
+
+             resn_loaded.resnet.to(resn_loaded.device)
+             self.model = resn_loaded
+             self.ctx.resn_best = self.model
+         else:
+             print(f"[ResNetTrainer.load] Model file not found: {path}")
+
+
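+ # The per-fold cleanup above is the generic idiom for keeping CUDA memory flat
+ # across Optuna trials. A minimal sketch of the same pattern (`build_model` and
+ # `run_fold` are hypothetical stand-ins):
+ #
+ #     import gc
+ #     import torch
+ #
+ #     for fold in folds:
+ #         model = build_model()
+ #         try:
+ #             run_fold(model, fold)
+ #         finally:
+ #             model.to("cpu")        # detach CUDA tensors before deleting
+ #             del model
+ #             gc.collect()           # drop Python references first
+ #             if torch.cuda.is_available():
+ #                 torch.cuda.empty_cache()   # hand cached blocks back to the driver
+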
+ class FTTrainer(TrainerBase):
+     def __init__(self, context: "BayesOptModel") -> None:
+         if context.task_type == 'classification':
+             super().__init__(context, 'FTTransformerClassifier')
+         else:
+             super().__init__(context, 'FTTransformer')
+         self.model: Optional[FTTransformerSklearn] = None
+         self.best_params: Optional[Dict[str, Any]] = None
+         self.best_trial = None
+
+     def cross_val(self, trial: optuna.trial.Trial) -> float:
+         """
+         Cross-validate the FT-Transformer.
+         This is where GPU memory is most likely to blow up, so we add:
+         - a fairly conservative hyperparameter search space
+         - a forced GPU memory release at the end of every fold
+         """
+         # Keep the search space somewhat small to avoid very large models
+         learning_rate = trial.suggest_float(
+             'learning_rate', 1e-5, 5e-4, log=True
+         )
+         d_model = trial.suggest_int('d_model', 32, 256, step=32)
+         # n_heads = trial.suggest_categorical('n_heads', [2, 4])  # widened below to avoid underfitting
+         n_heads = trial.suggest_categorical('n_heads', [2, 4, 8])
+         # n_layers = trial.suggest_int('n_layers', 2, 4)  # widened below to avoid underfitting
+         n_layers = trial.suggest_int('n_layers', 2, 8)
+         dropout = trial.suggest_float('dropout', 0.0, 0.2)
+         # batch_num = trial.suggest_int(
+         #     'batch_num',
+         #     5 if self.ctx.obj == 'reg:gamma' else 10,
+         #     10 if self.ctx.obj == 'reg:gamma' else 50,
+         #     step=1 if self.ctx.obj == 'reg:gamma' else 10
+         # )
+
+         if self.ctx.task_type == 'regression':
+             if self.ctx.obj == 'reg:tweedie':
+                 tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
+             elif self.ctx.obj == 'count:poisson':
+                 tw_power = 1.0
+             elif self.ctx.obj == 'reg:gamma':
+                 tw_power = 2.0
+             else:
+                 tw_power = 1.5
+         else:  # classification
+             tw_power = None  # Not used
+
+         fold_losses = []
+
+         # Optional: run BO on a subsample only, so large datasets do not
+         # crush GPU memory outright
+         data_for_cv = self.ctx.train_data
+         max_rows_for_ft_bo = min(1000000, int(
+             len(data_for_cv) / 2))  # scale up or down to fit your GPU memory
+         if len(data_for_cv) > max_rows_for_ft_bo:
+             data_for_cv = data_for_cv.sample(
+                 max_rows_for_ft_bo,
+                 random_state=self.ctx.rand_seed
+             )
+
+         for _, (train_idx, test_idx) in enumerate(
+             self.ctx.cv.split(data_for_cv[self.ctx.factor_nmes])
+         ):
+             X_train_fold = data_for_cv.iloc[train_idx][self.ctx.factor_nmes]
+             y_train_fold = data_for_cv.iloc[train_idx][self.ctx.resp_nme]
+             w_train_fold = data_for_cv.iloc[train_idx][self.ctx.weight_nme]
+             X_val_fold = data_for_cv.iloc[test_idx][self.ctx.factor_nmes]
+             y_val_fold = data_for_cv.iloc[test_idx][self.ctx.resp_nme]
+             w_val_fold = data_for_cv.iloc[test_idx][self.ctx.weight_nme]
+
+             cv_ft = FTTransformerSklearn(
+                 model_nme=self.ctx.model_nme,
+                 num_cols=self.ctx.num_features,
+                 cat_cols=self.ctx.cate_list,
+                 d_model=d_model,
+                 n_heads=n_heads,
+                 n_layers=n_layers,
+                 dropout=dropout,
+                 task_type=self.ctx.task_type,
+                 # batch_num=batch_num,
+                 epochs=self.ctx.epochs,
+                 tweedie_power=tw_power,
+                 learning_rate=learning_rate,
+                 patience=5,
+                 use_data_parallel=self.ctx.config.use_ft_data_parallel
+             )
+
+             try:
+                 cv_ft.fit(
+                     X_train_fold, y_train_fold, w_train_fold,
+                     X_val_fold, y_val_fold, w_val_fold
+                 )
+                 y_pred_fold = cv_ft.predict(X_val_fold)
+                 if self.ctx.task_type == 'regression':
+                     loss = mean_tweedie_deviance(
+                         y_val_fold,
+                         y_pred_fold,
+                         sample_weight=w_val_fold,
+                         power=tw_power
+                     )
+                 else:  # classification
+                     from sklearn.metrics import log_loss
+                     loss = log_loss(
+                         y_val_fold,
+                         y_pred_fold,
+                         sample_weight=w_val_fold,
+                     )
+                 fold_losses.append(loss)
+             finally:
+                 # Release GPU resources as soon as each fold is done
+                 try:
+                     # If the model sits on the GPU, move it back to CPU first
+                     if hasattr(cv_ft, "ft"):
+                         cv_ft.ft.to("cpu")
+                 except Exception:
+                     pass
+                 del cv_ft
+                 gc.collect()
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+
+         return np.mean(fold_losses)
+
+     def tune(self, max_evals: int = 50) -> None:
+         """
+         Hyperparameter search with Optuna.
+         Clean up GPU memory again after every trial so memory fragments do
+         not accumulate between trials.
+         """
+         def objective(trial: optuna.trial.Trial) -> float:
+             result = self.cross_val(trial)
+             # trial-level safety-net cleanup
+             gc.collect()
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             return result
+
+         study = optuna.create_study(
+             direction='minimize',
+             sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
+         )
+         study.optimize(objective, n_trials=max_evals)
+         self.best_params = study.best_params
+         self.best_trial = study.best_trial
+         params_path = self.output.result_path(
+             f'{self.ctx.model_nme}_bestparams_ft.csv'
+         )
+         pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)
+
+     def train(self) -> None:
+         if not self.best_params:
+             raise RuntimeError(
+                 'Run tune() first to obtain the best FT-Transformer parameters.')
+         self.model = FTTransformerSklearn(
+             model_nme=self.ctx.model_nme,
+             num_cols=self.ctx.num_features,
+             cat_cols=self.ctx.cate_list,
+             task_type=self.ctx.task_type,
+             use_data_parallel=self.ctx.config.use_ft_data_parallel
+         )
+         self.model.set_params(self.best_params)
+         self._fit_predict_cache(
+             self.model,
+             self.ctx.train_data[self.ctx.factor_nmes],
+             self.ctx.train_data[self.ctx.resp_nme],
+             sample_weight=self.ctx.train_data[self.ctx.weight_nme],
+             pred_prefix='ft',
+             sample_weight_arg='w_train'
+         )
+         self.ctx.ft_best = self.model
+
+     def save(self) -> None:
+         if self.model is not None:
+             torch.save(
+                 self.model,
+                 self.output.model_path(
+                     f'01_{self.ctx.model_nme}_FTTransformer.pth')
+             )
+
+     def load(self) -> None:
+         path = self.output.model_path(
+             f'01_{self.ctx.model_nme}_FTTransformer.pth')
+         if os.path.exists(path):
+             ft_loaded = torch.load(path, map_location='cpu')
+             if torch.cuda.is_available():
+                 ft_loaded.device = torch.device('cuda')
+             elif torch.backends.mps.is_available():
+                 ft_loaded.device = torch.device('mps')
+             else:
+                 ft_loaded.device = torch.device('cpu')
+             ft_loaded.ft.to(ft_loaded.device)
+             self.model = ft_loaded
+             self.ctx.ft_best = self.model
+         else:
+             print(f"[load_model] Warning: FT-Transformer model file not found: {path}")
+
+
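+ # Unlike ResNetTrainer.save(), which stores only a state_dict, FTTrainer.save()
+ # pickles the whole sklearn-style wrapper with torch.save, so load() gets back
+ # a ready-to-use estimator and only re-homes it onto a device. A minimal sketch
+ # of that load path (file name and `X` are placeholders):
+ #
+ #     import torch
+ #     ft = torch.load('01_demo_FTTransformer.pth', map_location='cpu')
+ #     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ #     ft.device = device
+ #     ft.ft.to(device)        # `.ft` is the underlying nn.Module
+ #     y_hat = ft.predict(X)   # X: DataFrame with the original factor columns
+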
+ # =============================================================================
+ # BayesOpt orchestration & SHAP utilities
+ # =============================================================================
+ class BayesOptModel:
+     def __init__(self, train_data, test_data,
+                  model_nme, resp_nme, weight_nme, factor_nmes, task_type='regression',
+                  binary_resp_nme=None,
+                  cate_list=None, prop_test=0.25, rand_seed=None,
+                  epochs=100, use_gpu=True,
+                  use_resn_data_parallel: bool = False, use_ft_data_parallel: bool = False):
+         cfg = BayesOptConfig(
+             model_nme=model_nme,
+             task_type=task_type,
+             resp_nme=resp_nme,
+             weight_nme=weight_nme,
+             factor_nmes=list(factor_nmes),
+             binary_resp_nme=binary_resp_nme,
+             cate_list=list(cate_list) if cate_list else None,
+             prop_test=prop_test,
+             rand_seed=rand_seed,
+             epochs=epochs,
+             use_gpu=use_gpu,
+             use_resn_data_parallel=use_resn_data_parallel,
+             use_ft_data_parallel=use_ft_data_parallel
+         )
+         self.config = cfg
+         self.model_nme = cfg.model_nme
+         self.task_type = cfg.task_type
+         self.resp_nme = cfg.resp_nme
+         self.weight_nme = cfg.weight_nme
+         self.factor_nmes = cfg.factor_nmes
+         self.binary_resp_nme = cfg.binary_resp_nme
+         self.cate_list = list(cfg.cate_list or [])
+         self.prop_test = cfg.prop_test
+         self.epochs = cfg.epochs
+         self.rand_seed = cfg.rand_seed if cfg.rand_seed is not None else np.random.randint(
+             1, 10000)
+         self.use_gpu = bool(cfg.use_gpu and torch.cuda.is_available())
+         self.output_manager = OutputManager(os.getcwd(), self.model_nme)
+
+         preprocessor = DatasetPreprocessor(train_data, test_data, cfg).run()
+         self.train_data = preprocessor.train_data
+         self.test_data = preprocessor.test_data
+         self.train_oht_scl_data = preprocessor.train_oht_scl_data
+         self.test_oht_scl_data = preprocessor.test_oht_scl_data
+         self.var_nmes = preprocessor.var_nmes
+         self.num_features = preprocessor.num_features
+         self.cat_categories_for_shap = preprocessor.cat_categories_for_shap
+
+         self.cv = ShuffleSplit(n_splits=int(1 / self.prop_test),
+                                test_size=self.prop_test,
+                                random_state=self.rand_seed)
+         if self.task_type == 'classification':
+             self.obj = 'binary:logistic'
+         else:  # regression
+             if 'f' in self.model_nme:
+                 self.obj = 'count:poisson'
+             elif 's' in self.model_nme:
+                 self.obj = 'reg:gamma'
+             elif 'bc' in self.model_nme:
+                 self.obj = 'reg:tweedie'
+             else:
+                 self.obj = 'reg:tweedie'
+         self.fit_params = {
+             'sample_weight': self.train_data[self.weight_nme].values
+         }
+         self.model_label: List[str] = []
+
+         # Register one trainer per model; they are accessed by label, which
+         # makes adding new model types straightforward
+         self.trainers: Dict[str, TrainerBase] = {
+             'glm': GLMTrainer(self),
+             'xgb': XGBTrainer(self),
+             'resn': ResNetTrainer(self),
+             'ft': FTTrainer(self)
+         }
+         self.xgb_best = None
+         self.resn_best = None
+         self.glm_best = None
+         self.ft_best = None
+         self.best_xgb_params = None
+         self.best_resn_params = None
+         self.best_ft_params = None
+         self.best_xgb_trial = None
+         self.best_resn_trial = None
+         self.best_ft_trial = None
+         self.best_glm_params = None
+         self.best_glm_trial = None
+         self.xgb_load = None
+         self.resn_load = None
+         self.ft_load = None
+
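+     # Construction sketch (illustrative; the column names are placeholders
+     # for your own data):
+     #
+     #     bo = BayesOptModel(
+     #         train_df, test_df,
+     #         model_nme='demo_f',      # 'f' in the name selects count:poisson
+     #         resp_nme='clm_cnt',
+     #         weight_nme='exposure',
+     #         factor_nmes=['age', 'region', 'veh_power'],
+     #         cate_list=['region'],
+     #         task_type='regression',
+     #     )
+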
+     # One-way (single-factor) plots
+     def plot_oneway(self, n_bins=10):
+         for c in self.factor_nmes:
+             fig = plt.figure(figsize=(7, 5))
+             if c in self.cate_list:
+                 group_col = c
+                 plot_source = self.train_data
+             else:
+                 group_col = f'{c}_bins'
+                 bins = pd.qcut(
+                     self.train_data[c],
+                     n_bins,
+                     duplicates='drop'  # note: duplicate quantiles drop bins rather than abort
+                 )
+                 plot_source = self.train_data.assign(**{group_col: bins})
+             plot_data = plot_source.groupby(
+                 [group_col], observed=True).sum(numeric_only=True)
+             plot_data.reset_index(inplace=True)
+             plot_data['act_v'] = plot_data['w_act'] / \
+                 plot_data[self.weight_nme]
+             ax = fig.add_subplot(111)
+             ax.plot(plot_data.index, plot_data['act_v'],
+                     label='Actual', color='red')
+             ax.set_title(
+                 'Analysis of %s : Train Data' % group_col,
+                 fontsize=8)
+             plt.xticks(plot_data.index,
+                        list(plot_data[group_col].astype(str)),
+                        rotation=90)
+             if len(list(plot_data[group_col].astype(str))) > 50:
+                 plt.xticks(fontsize=3)
+             else:
+                 plt.xticks(fontsize=6)
+             plt.yticks(fontsize=6)
+             ax2 = ax.twinx()
+             ax2.bar(plot_data.index,
+                     plot_data[self.weight_nme],
+                     alpha=0.5, color='seagreen')
+             plt.yticks(fontsize=6)
+             plt.margins(0.05)
+             plt.subplots_adjust(wspace=0.3)
+             save_path = self.output_manager.plot_path(
+                 f'00_{self.model_nme}_{group_col}_oneway.png')
+             plt.savefig(save_path, dpi=300)
+             plt.close(fig)
+
+     # Bayesian optimisation wrapper for the GLM
+     def bayesopt_glm(self, max_evals=50):
+         trainer = self.trainers['glm']
+         trainer.tune(max_evals)
+         trainer.train()
+         self.glm_best = trainer.model
+         self.best_glm_params = trainer.best_params
+         self.best_glm_trial = trainer.best_trial
+
+     # Bayesian optimisation wrapper for XGBoost
+     def bayesopt_xgb(self, max_evals=100):
+         trainer = self.trainers['xgb']
+         trainer.tune(max_evals)
+         trainer.train()
+         self.xgb_best = trainer.model
+         # Keep the best params and trial for debugging and reproducibility
+         self.best_xgb_params = trainer.best_params
+         self.best_xgb_trial = trainer.best_trial
+
+     # Bayesian optimisation wrapper for the ResNet
+     def bayesopt_resnet(self, max_evals=100):
+         trainer = self.trainers['resn']
+         trainer.tune(max_evals)
+         trainer.train()
+         self.resn_best = trainer.model
+         # Keep the best-trial details for later tuning analysis
+         self.best_resn_params = trainer.best_params
+         self.best_resn_trial = trainer.best_trial
+
+     # Bayesian optimisation wrapper for the FT-Transformer
+     def bayesopt_ft(self, max_evals=50):
+         trainer = self.trainers['ft']
+         trainer.tune(max_evals)
+         trainer.train()
+         self.ft_best = trainer.model
+         # The FT-Transformer has many knobs; retaining its config matters most
+         self.best_ft_params = trainer.best_params
+         self.best_ft_trial = trainer.best_trial
+
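+     # A typical end-to-end run chains these wrappers (illustrative; the
+     # budgets are examples, not recommendations):
+     #
+     #     bo.plot_oneway()
+     #     bo.bayesopt_xgb(max_evals=100)
+     #     bo.bayesopt_resnet(max_evals=50)
+     #     bo.plot_lift('Xgboost', 'pred_xgb')
+     #     bo.plot_dlift(['xgb', 'resn'])
+     #     bo.save_model()      # persist every trained model
+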
+     # Lift charts
+     def plot_lift(self, model_label, pred_nme, n_bins=10):
+         model_map = {
+             'Xgboost': 'pred_xgb',
+             'ResNet': 'pred_resn',
+             'ResNetClassifier': 'pred_resn',
+             'FTTransformer': 'pred_ft',
+             'FTTransformerClassifier': 'pred_ft',
+             'GLM': 'pred_glm'
+         }
+         for k, v in model_map.items():
+             if model_label.startswith(k):
+                 pred_nme = v
+                 break
+
+         def _lift_data(data: pd.DataFrame) -> pd.DataFrame:
+             lift_df = pd.DataFrame({
+                 'pred': data[pred_nme].values,
+                 'w_pred': data[f'w_{pred_nme}'].values,
+                 'act': data['w_act'].values,
+                 'weight': data[self.weight_nme].values
+             })
+             plot_data = PlotUtils.split_data(lift_df, 'pred', 'weight', n_bins)
+             denom = np.maximum(plot_data['weight'], EPS)
+             plot_data['exp_v'] = plot_data['w_pred'] / denom
+             plot_data['act_v'] = plot_data['act'] / denom
+             return plot_data.reset_index()
+
+         fig = plt.figure(figsize=(11, 5))
+         for pos, (title, data) in zip([121, 122],
+                                       [('Lift Chart on Train Data', self.train_data),
+                                        ('Lift Chart on Test Data', self.test_data)]):
+             plot_data = _lift_data(data)
+             ax = fig.add_subplot(pos)
+             ax.plot(plot_data.index,
+                     plot_data['act_v'], label='Actual', color='red')
+             ax.plot(plot_data.index,
+                     plot_data['exp_v'], label='Predicted', color='blue')
+             ax.set_title(title, fontsize=8)
+             ax.set_xticks(plot_data.index)
+             ax.set_xticklabels(plot_data.index, rotation=90, fontsize=6)
+             ax.tick_params(axis='y', labelsize=6)
+             ax.legend(loc='upper left', fontsize=5, frameon=False)
+             ax.margins(0.05)
+             ax2 = ax.twinx()
+             ax2.bar(plot_data.index, plot_data['weight'], alpha=0.5, color='seagreen',
+                     label='Earned Exposure')
+             ax2.tick_params(axis='y', labelsize=6)
+             ax2.legend(loc='upper right', fontsize=5, frameon=False)
+
+         plt.subplots_adjust(wspace=0.3)
+         save_path = self.output_manager.plot_path(
+             f'01_{self.model_nme}_{model_label}_lift.png')
+         plt.savefig(save_path, dpi=300)
+         plt.show()
+         plt.close(fig)
+
2202
+ # 绘制双提纯曲线
2203
+ def plot_dlift(self, model_comp: List[str] = ['xgb', 'resn'], n_bins: int = 10) -> None:
2204
+ """
2205
+ 绘制双提纯曲线,对比两个模型的预测效果。
2206
+
2207
+ Args:
2208
+ model_comp: 包含两个模型简称的列表,例如 ['xgb', 'resn']。
2209
+ 支持 'xgb', 'resn', 'ft'。
2210
+ n_bins: 分箱数量。
2211
+ """
2212
+ if len(model_comp) != 2:
2213
+ raise ValueError("`model_comp` 必须包含两个模型进行对比。")
2214
+
2215
+ model_name_map = {
2216
+ 'xgb': 'Xgboost',
2217
+ 'resn': 'ResNet',
2218
+ 'ft': 'FTTransformer',
2219
+ 'glm': 'GLM'
2220
+ }
2221
+
2222
+ name1, name2 = model_comp
2223
+ if name1 not in model_name_map or name2 not in model_name_map:
2224
+ raise ValueError(f"不支持的模型简称。请从 {list(model_name_map.keys())} 中选择。")
2225
+
2226
+ fig, axes = plt.subplots(1, 2, figsize=(11, 5))
2227
+ datasets = {
2228
+ 'Train Data': self.train_data,
2229
+ 'Test Data': self.test_data
2230
+ }
2231
+
2232
+ for ax, (data_name, data) in zip(axes, datasets.items()):
2233
+ pred1_col = f'w_pred_{name1}'
2234
+ pred2_col = f'w_pred_{name2}'
2235
+
2236
+ if pred1_col not in data.columns or pred2_col not in data.columns:
2237
+ print(
2238
+ f"警告: 在 {data_name} 中找不到预测列 {pred1_col} 或 {pred2_col}。跳过绘图。")
2239
+ continue
2240
+
2241
+ lift_data = pd.DataFrame({
2242
+ 'pred1': data[pred1_col].values,
2243
+ 'pred2': data[pred2_col].values,
2244
+ 'diff_ly': data[pred1_col].values / np.maximum(data[pred2_col].values, EPS),
2245
+ 'act': data['w_act'].values,
2246
+ 'weight': data[self.weight_nme].values
2247
+ })
2248
+ plot_data = PlotUtils.split_data(
2249
+ lift_data, 'diff_ly', 'weight', n_bins)
2250
+ denom = np.maximum(plot_data['act'], EPS)
2251
+ plot_data['exp_v1'] = plot_data['pred1'] / denom
2252
+ plot_data['exp_v2'] = plot_data['pred2'] / denom
2253
+ plot_data['act_v'] = plot_data['act'] / denom
2254
+ plot_data.reset_index(inplace=True)
2255
+
2256
+ label1 = model_name_map[name1]
2257
+ label2 = model_name_map[name2]
2258
+
2259
+ ax.plot(plot_data.index,
2260
+ plot_data['act_v'], label='Actual', color='red')
2261
+ ax.plot(plot_data.index,
2262
+ plot_data['exp_v1'], label=label1, color='blue')
2263
+ ax.plot(plot_data.index,
2264
+ plot_data['exp_v2'], label=label2, color='black')
2265
+
2266
+ ax.set_title(f'Double Lift Chart on {data_name}', fontsize=8)
2267
+ ax.set_xticks(plot_data.index)
2268
+ ax.set_xticklabels(plot_data.index, rotation=90, fontsize=6)
2269
+ ax.set_xlabel(f'{label1} / {label2}', fontsize=6)
2270
+ ax.tick_params(axis='y', labelsize=6)
2271
+ ax.legend(loc='upper left', fontsize=5, frameon=False)
2272
+ ax.margins(0.1)
2273
+
2274
+ ax2 = ax.twinx()
2275
+ ax2.bar(plot_data.index, plot_data['weight'],
2276
+ alpha=0.5, color='seagreen', label='Earned Exposure')
2277
+ ax2.tick_params(axis='y', labelsize=6)
2278
+ ax2.legend(loc='upper right', fontsize=5, frameon=False)
2279
+
2280
+ plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8, wspace=0.3)
2281
+ save_path = self.output_manager.plot_path(
2282
+ f'02_{self.model_nme}_dlift_{name1}_vs_{name2}.png')
2283
+ plt.savefig(save_path, dpi=300)
2284
+ plt.show()
2285
+ plt.close(fig)
2286
+
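+     # Reading the double lift chart: policies are binned by the ratio
+     # pred_1 / pred_2 ('diff_ly'), so the leftmost bins are where the second
+     # model predicts relatively more and the rightmost where the first does.
+     # Within each bin both predictions are rescaled by actuals (the red line
+     # sits near 1), so whichever curve tracks the red line more closely
+     # segments the portfolio better. For example:
+     #
+     #     bo.plot_dlift(['xgb', 'glm'])   # tree model vs. GLM benchmark
+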
2287
+ # 绘制成交率提升曲线
2288
+ def plot_conversion_lift(self, model_pred_col: str, n_bins: int = 20):
2289
+ if not self.binary_resp_nme:
2290
+ print("错误: 未在 BayesOptModel 初始化时提供 `binary_resp_nme`。无法绘制成交率曲线。")
2291
+ return
2292
+
2293
+ fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
2294
+ datasets = {
2295
+ 'Train Data': self.train_data,
2296
+ 'Test Data': self.test_data
2297
+ }
2298
+
2299
+ for ax, (data_name, data) in zip(axes, datasets.items()):
2300
+ if model_pred_col not in data.columns:
2301
+ print(f"警告: 在 {data_name} 中找不到预测列 '{model_pred_col}'。跳过绘图。")
2302
+ continue
2303
+
2304
+ # 按模型预测分排序,并计算分箱
2305
+ plot_data = data.sort_values(by=model_pred_col).copy()
2306
+ plot_data['cum_weight'] = plot_data[self.weight_nme].cumsum()
2307
+ total_weight = plot_data[self.weight_nme].sum()
2308
+
2309
+ if total_weight > EPS:
2310
+ plot_data['bin'] = pd.cut(
2311
+ plot_data['cum_weight'],
2312
+ bins=n_bins,
2313
+ labels=False,
2314
+ right=False
2315
+ )
2316
+ else:
2317
+ plot_data['bin'] = 0
2318
+
2319
+ # 按分箱聚合
2320
+ lift_agg = plot_data.groupby('bin').agg(
2321
+ total_weight=(self.weight_nme, 'sum'),
2322
+ actual_conversions=(self.binary_resp_nme, 'sum'),
2323
+ weighted_conversions=('w_binary_act', 'sum'),
2324
+ avg_pred=(model_pred_col, 'mean')
2325
+ ).reset_index()
2326
+
2327
+ # 计算成交率
2328
+ lift_agg['conversion_rate'] = lift_agg['weighted_conversions'] / \
2329
+ lift_agg['total_weight']
2330
+
2331
+ # 计算整体平均成交率
2332
+ overall_conversion_rate = data['w_binary_act'].sum(
2333
+ ) / data[self.weight_nme].sum()
2334
+ ax.axhline(y=overall_conversion_rate, color='gray', linestyle='--',
2335
+ label=f'Overall Avg Rate ({overall_conversion_rate:.2%})')
2336
+
2337
+ ax.plot(lift_agg['bin'], lift_agg['conversion_rate'],
2338
+ marker='o', linestyle='-', label='Actual Conversion Rate')
2339
+ ax.set_title(f'Conversion Rate Lift Chart on {data_name}')
2340
+ ax.set_xlabel(f'Model Score Decile (based on {model_pred_col})')
2341
+ ax.set_ylabel('Conversion Rate')
2342
+ ax.grid(True, linestyle='--', alpha=0.6)
2343
+ ax.legend()
2344
+
2345
+ plt.tight_layout()
2346
+ plt.show()
2347
+
2348
+ # 保存模型
2349
+
2350
+ def save_model(self, model_name=None):
2351
+
2352
+ # model_name 可以是:
2353
+ # - None: 保存全部可用模型
2354
+ # - 'xgb': 只保存 Xgboost
2355
+ # - 'resn': 只保存 ResNet
2356
+ # - 'ft': 只保存 FT-Transformer
2357
+ # - 'glm': 只保存 GLM
2358
+ if model_name in (None, 'xgb'):
2359
+ trainer = self.trainers['xgb']
2360
+ if trainer.model is not None:
2361
+ trainer.save()
2362
+ else:
2363
+ print("[save_model] Warning: xgb_best 不存在,未保存 Xgboost 模型。")
2364
+
2365
+ if model_name in (None, 'resn'):
2366
+ trainer = self.trainers['resn']
2367
+ if trainer.model is not None:
2368
+ trainer.save()
2369
+ else:
2370
+ print("[save_model] Warning: resn_best 不存在,未保存 ResNet 模型。")
2371
+
2372
+ if model_name in (None, 'ft'):
2373
+ trainer = self.trainers['ft']
2374
+ if trainer.model is not None:
2375
+ trainer.save()
2376
+ else:
2377
+ print("[save_model] Warning: ft_best 不存在,未保存 FT-Transformer 模型。")
2378
+
2379
+ if model_name in (None, 'glm'):
2380
+ trainer = self.trainers['glm']
2381
+ if trainer.model is not None:
2382
+ trainer.save()
2383
+ else:
2384
+ print("[save_model] Warning: glm_best 不存在,未保存 GLM 模型。")
2385
+
2386
+ def load_model(self, model_name=None):
2387
+ # model_name 可以是:
2388
+ # - None: 加载全部能找到的模型
2389
+ # - 'xgb': 只加载 Xgboost
2390
+ # - 'resn': 只加载 ResNet
2391
+ # - 'ft': 只加载 FT-Transformer
2392
+ # - 'glm': 只加载 GLM
2393
+
2394
+ if model_name in (None, 'xgb'):
2395
+ trainer = self.trainers['xgb']
2396
+ trainer.load()
2397
+ self.xgb_best = trainer.model
2398
+ self.xgb_load = trainer.model
2399
+
2400
+ if model_name in (None, 'resn'):
2401
+ trainer = self.trainers['resn']
2402
+ trainer.load()
2403
+ self.resn_best = trainer.model
2404
+ self.resn_load = trainer.model
2405
+
2406
+ if model_name in (None, 'ft'):
2407
+ trainer = self.trainers['ft']
2408
+ trainer.load()
2409
+ self.ft_best = trainer.model
2410
+ self.ft_load = trainer.model
2411
+
2412
+ if model_name in (None, 'glm'):
2413
+ trainer = self.trainers['glm']
2414
+ trainer.load()
2415
+ self.glm_best = trainer.model
2416
+
+     def _sample_rows(self, data: pd.DataFrame, n: int) -> pd.DataFrame:
+         if len(data) == 0:
+             return data
+         return data.sample(min(len(data), n), random_state=self.rand_seed)
+
+     @staticmethod
+     def _shap_nsamples(arr: np.ndarray, max_nsamples: int = 300) -> int:
+         min_needed = arr.shape[1] + 2
+         return max(min_needed, min(max_nsamples, arr.shape[0] * arr.shape[1]))
+
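+     # Worked example of the nsamples heuristic above: KernelExplainer needs at
+     # least n_features + 2 evaluations per explained row, and the budget is
+     # capped at 300 (or rows * cols, when that product is smaller). For a
+     # 200-row, 12-feature explain set:
+     #     min_needed = 12 + 2 = 14
+     #     min(300, 200 * 12) = 300
+     #     max(14, 300) = 300 model evaluations per explained row
+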
+     def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
+         # Convert the raw-feature DataFrame (containing self.factor_nmes) into
+         # a purely numeric matrix: numeric columns as float64, categorical
+         # columns as integer codes (stored as float64).
+         # Column order follows self.factor_nmes.
+         matrices = []
+
+         for col in self.factor_nmes:
+             s = data[col]
+
+             if col in self.cate_list:
+                 # Categorical column: encode against the full training-time category set
+                 cats = pd.Categorical(
+                     s,
+                     categories=self.cat_categories_for_shap[col]
+                 )
+                 # cats.codes is an Index / ndarray; wrap it in np.asarray, then reshape
+                 codes = np.asarray(cats.codes, dtype=np.float64).reshape(-1, 1)
+                 matrices.append(codes)
+             else:
+                 # Numeric column: Series -> numpy -> reshape
+                 vals = pd.to_numeric(s, errors="coerce")
+                 arr = vals.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
+                 matrices.append(arr)
+
+         X_mat = np.concatenate(matrices, axis=1)  # (N, F)
+         return X_mat
+
+     def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
+         # Restore SHAP's numeric matrix (N, F) to a raw-feature DataFrame:
+         # numeric columns as float, categorical columns back to pandas
+         # category dtype, so the result is compatible both with
+         # enable_categorical=True XGBoost and with the FT-Transformer input.
+         # Column order = self.factor_nmes.
+         data_dict = {}
+
+         for j, col in enumerate(self.factor_nmes):
+             col_vals = X_mat[:, j]
+
+             if col in self.cate_list:
+                 cats = self.cat_categories_for_shap[col]
+
+                 # SHAP perturbs codes into fractions; round back to integer codes
+                 codes = np.round(col_vals).astype(int)
+                 # Clip to [-1, len(cats) - 1]
+                 codes = np.clip(codes, -1, len(cats) - 1)
+
+                 # pandas.Categorical.from_codes:
+                 # - code -1 is treated as missing (NaN)
+                 # - other indices map to the matching category in cats
+                 cat_series = pd.Categorical.from_codes(
+                     codes,
+                     categories=cats
+                 )
+                 # Stored as a Categorical, not object
+                 data_dict[col] = cat_series
+             else:
+                 # Numeric column: plain float
+                 data_dict[col] = col_vals.astype(float)
+
+         df = pd.DataFrame(data_dict, columns=self.factor_nmes)
+
+         # Belt and braces: make sure every categorical column really has category dtype
+         for col in self.cate_list:
+             if col in df.columns:
+                 df[col] = df[col].astype("category")
+         return df
+
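+     # Round-trip sketch for the encode/decode pair above (illustrative; assumes
+     # `bo` is a fitted BayesOptModel whose cate_list columns were seen in training):
+     #
+     #     X_raw = bo.train_data[bo.factor_nmes].head()
+     #     X_mat = bo._build_ft_shap_matrix(X_raw)        # float64; codes for categoricals
+     #     X_back = bo._decode_ft_shap_matrix_to_df(X_mat)
+     #     # X_back matches X_raw up to category dtype and float casting;
+     #     # levels unseen in training get code -1 and decode to NaN.
+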
2497
+ def _build_glm_design(self, data: pd.DataFrame) -> pd.DataFrame:
2498
+ # 与 GLM 训练阶段一致:在 one-hot + 标准化特征上添加截距
2499
+ X = data[self.var_nmes]
2500
+ return sm.add_constant(X, has_constant='add')
2501
+
2502
+ # ========= XGBoost SHAP =========
2503
+
2504
+ def compute_shap_xgb(self, n_background: int = 500,
2505
+ n_samples: int = 200,
2506
+ on_train: bool = True):
2507
+ # 使用 KernelExplainer 计算 XGBoost 的 SHAP 值(黑盒方式)。
2508
+ #
2509
+ # - 对 SHAP:输入是一份纯数值矩阵:
2510
+ # * 数值特征:float64
2511
+ # * 类别特征:用 _build_ft_shap_matrix 编码后的整数 code(float64)
2512
+ # - 对模型:仍然用原始 DataFrame + xgb_best.predict(...)
2513
+
2514
+ if not hasattr(self, "xgb_best"):
2515
+ raise RuntimeError("请先运行 bayesopt_xgb() 训练好 self.xgb_best")
2516
+
2517
+ # 1) 选择数据源:训练集 or 测试集(原始特征空间)
2518
+ data = self.train_data if on_train else self.test_data
2519
+ X_raw = data[self.factor_nmes]
2520
+
2521
+ # 2) 构造背景矩阵(用和 FT 一样的数值编码)
2522
+ background_raw = self._sample_rows(X_raw, n_background)
2523
+ # KernelExplainer 计算量极大,务必控制背景样本规模,否则会拖慢调试
2524
+ background_mat = self._build_ft_shap_matrix(
2525
+ background_raw
2526
+ ).astype(np.float64, copy=True)
2527
+
2528
+ # 3) 定义黑盒预测函数:数值矩阵 -> DataFrame -> xgb_best.predict
2529
+ def f_predict(x_mat: np.ndarray) -> np.ndarray:
2530
+ # 把编码矩阵还原成原始 DataFrame(数值+类别)
2531
+ df_input = self._decode_ft_shap_matrix_to_df(x_mat)
2532
+ # 注意:这里用的是 self.xgb_best.predict,和你训练/预测时一致
2533
+ y_pred = self.xgb_best.predict(df_input)
2534
+ return y_pred
2535
+
2536
+ explainer = shap.KernelExplainer(f_predict, background_mat)
2537
+
2538
+ # 4) 要解释的样本:原始特征 + 数值编码
2539
+ X_explain_raw = self._sample_rows(X_raw, n_samples)
2540
+ X_explain_mat = self._build_ft_shap_matrix(
2541
+ X_explain_raw
2542
+ ).astype(np.float64, copy=True)
2543
+
2544
+ # 5) 计算 SHAP 值(注意用 nsamples='auto' 控制复杂度)
2545
+ shap_values = explainer.shap_values(X_explain_mat, nsamples="auto")
2546
+
2547
+ # 6) 保存结果:
2548
+ # - shap_values:数值编码空间,对应 factor_nmes 的每一列
2549
+ # - X_explain_raw:原始 DataFrame,方便画图时显示真实类别名
2550
+ self.shap_xgb = {
2551
+ "explainer": explainer,
2552
+ "X_explain": X_explain_raw,
2553
+ "shap_values": shap_values,
2554
+ "base_value": explainer.expected_value,
2555
+ }
2556
+ return self.shap_xgb
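+     # Usage sketch (illustrative; KernelExplainer cost grows with both sample
+     # counts, so start small):
+     #
+     #     res = bo.compute_shap_xgb(n_background=100, n_samples=50)
+     #     shap.summary_plot(res["shap_values"], res["X_explain"],
+     #                       feature_names=bo.factor_nmes)
+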
+     # ========= ResNet SHAP =========
+     def _resn_predict_wrapper(self, X_np):
+         # Make sure inference runs on CPU
+         model = self.resn_best.resnet.to("cpu")
+         with torch.no_grad():
+             # deliberately no .to(self.device)
+             X_tensor = torch.tensor(X_np, dtype=torch.float32)
+             y_pred = model(X_tensor).cpu().numpy()
+             y_pred = np.clip(y_pred, 1e-6, None)
+         return y_pred.reshape(-1)
+
+     def compute_shap_resn(self, n_background: int = 500,
+                           n_samples: int = 200,
+                           on_train: bool = True):
+         # Compute SHAP values for the ResNet with KernelExplainer.
+         # Explanation space: the one-hot & standardised features self.var_nmes.
+         if not hasattr(self, 'resn_best') or self.resn_best is None:
+             raise RuntimeError(
+                 "Run bayesopt_resnet() first so that resn_best is trained")
+
+         self.resn_best.device = torch.device("cpu")  # force CPU
+         self.resn_best.resnet.to("cpu")
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         # Pick the dataset (already one-hot & standardised)
+         data = self.train_oht_scl_data if on_train else self.test_oht_scl_data
+         X = data[self.var_nmes]
+         if len(X) == 0:
+             raise ValueError(
+                 "compute_shap_resn: the selected dataset is empty (len(X)==0); cannot compute SHAP.")
+
+         # Background sample: float64 numpy
+         background_df = self._sample_rows(X, n_background)
+         background_np = background_df.to_numpy(dtype=np.float64, copy=True)
+
+         # Black-box prediction function
+         def f_predict(x):
+             y = self._resn_predict_wrapper(x)
+             # guarantee a 1-D array
+             y = np.asarray(y, dtype=np.float64).reshape(-1)
+             return y
+
+         explainer = shap.KernelExplainer(f_predict, background_np)
+
+         # Samples to explain
+         X_explain_df = self._sample_rows(X, n_samples)
+         X_explain_np = X_explain_df.to_numpy(dtype=np.float64, copy=True)
+
+         nsample_eff = self._shap_nsamples(X_explain_np)
+         shap_values = explainer.shap_values(X_explain_np, nsamples=nsample_eff)
+         # Compute base_value by hand to avoid shap's NotOneValueFound error
+         bg_pred = f_predict(background_np)
+         if bg_pred.size == 0:
+             raise ValueError(
+                 "compute_shap_resn: background predictions are empty; cannot compute base_value.")
+         base_value = float(bg_pred.mean())
+
+         self.shap_resn = {
+             "explainer": explainer,
+             "X_explain": X_explain_df,   # DataFrame: for plotting (has column names)
+             "shap_values": shap_values,  # numpy: (n_samples, n_features)
+             # "base_value": explainer.expected_value,
+             "base_value": base_value,
+         }
+         return self.shap_resn
+
2625
+ # ========= FT-Transformer SHAP =========
2626
+
2627
+ def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
2628
+
2629
+ # SHAP 的预测包装:
2630
+ # 数值矩阵 -> 还原为原始特征 DataFrame -> 调用 ft_best.predict
2631
+
2632
+ df_input = self._decode_ft_shap_matrix_to_df(X_mat)
2633
+ y_pred = self.ft_best.predict(df_input)
2634
+ return np.asarray(y_pred, dtype=np.float64).reshape(-1)
2635
+
2636
+ def compute_shap_ft(self, n_background: int = 500,
2637
+ n_samples: int = 200,
2638
+ on_train: bool = True):
2639
+
2640
+ # 使用 KernelExplainer 计算 FT-Transformer 的 SHAP 值。
2641
+ # 解释空间:数值+类别code 的混合数值矩阵(float64),
2642
+ # 但对外展示时仍使用原始特征名/取值(X_explain)。
2643
+
2644
+ if not hasattr(self, "ft_best"):
2645
+ raise RuntimeError("请先运行 bayesopt_ft() 训练好 ft_best")
2646
+
2647
+ self.ft_best.device = torch.device("cpu") # 强制走 CPU
2648
+ self.ft_best.ft.to("cpu")
2649
+ if torch.cuda.is_available():
2650
+ torch.cuda.empty_cache()
2651
+
2652
+ # 选择数据源(原始特征空间)
2653
+ data = self.train_data if on_train else self.test_data
2654
+ X_raw = data[self.factor_nmes]
2655
+
2656
+ # 背景矩阵
2657
+ background_raw = self._sample_rows(X_raw, n_background)
2658
+ background_mat = self._build_ft_shap_matrix(
2659
+ background_raw
2660
+ ).astype(np.float64, copy=True)
2661
+
2662
+ # 黑盒预测函数(数值矩阵 → DataFrame → FT 模型)
2663
+ def f_predict(x):
2664
+ return self._ft_shap_predict_wrapper(x)
2665
+
2666
+ explainer = shap.KernelExplainer(f_predict, background_mat)
2667
+
2668
+ # 要解释的样本(原始特征空间)
2669
+ X_explain_raw = self._sample_rows(X_raw, n_samples)
2670
+ X_explain_mat = self._build_ft_shap_matrix(
2671
+ X_explain_raw
2672
+ ).astype(np.float64, copy=True)
2673
+
2674
+ nsample_eff = self._shap_nsamples(X_explain_mat)
2675
+ shap_values = explainer.shap_values(
2676
+ X_explain_mat, nsamples=nsample_eff)
2677
+ bg_pred = self._ft_shap_predict_wrapper(background_mat)
2678
+ bg_pred = np.asarray(bg_pred, dtype=np.float64).reshape(-1)
2679
+ base_value = float(bg_pred.mean())
2680
+
2681
+ self.shap_ft = {
2682
+ "explainer": explainer,
2683
+ "X_explain": X_explain_raw, # 原始特征 DataFrame,用来画图
2684
+ "shap_values": shap_values, # numpy: (n_samples, n_features)
2685
+ # "base_value": explainer.expected_value,
2686
+ "base_value": base_value,
2687
+ }
2688
+ return self.shap_ft
2689
+
2690
+ # ========= GLM SHAP =========
2691
+ def compute_shap_glm(self, n_background: int = 500,
2692
+ n_samples: int = 200,
2693
+ on_train: bool = True):
2694
+ """
2695
+ 使用 KernelExplainer 计算 GLM 的 SHAP 值。
2696
+ 解释空间:one-hot + 标准化 + 截距项(与 GLM 训练一致)。
2697
+ """
2698
+ if not hasattr(self, "glm_best") or self.glm_best is None:
2699
+ raise RuntimeError("请先运行 bayesopt_glm() 训练好 glm_best")
2700
+
2701
+ data = self.train_oht_scl_data if on_train else self.test_oht_scl_data
2702
+ if len(data) == 0:
2703
+ raise ValueError("compute_shap_glm: 选择的数据集为空,无法计算 SHAP。")
2704
+
2705
+ design_all = self._build_glm_design(data)
2706
+ background_df = self._sample_rows(design_all, n_background)
2707
+ background_np = background_df.to_numpy(dtype=np.float64, copy=True)
2708
+ design_cols = list(design_all.columns)
2709
+
2710
+ def f_predict(x_np: np.ndarray) -> np.ndarray:
2711
+ x_df = pd.DataFrame(x_np, columns=design_cols)
2712
+ y_pred = self.glm_best.predict(x_df)
2713
+ return np.asarray(y_pred, dtype=np.float64).reshape(-1)
2714
+
2715
+ explainer = shap.KernelExplainer(f_predict, background_np)
2716
+ explain_df = self._sample_rows(design_all, n_samples)
2717
+ explain_np = explain_df.to_numpy(dtype=np.float64, copy=True)
2718
+
2719
+ nsample_eff = self._shap_nsamples(explain_np)
2720
+ shap_values = explainer.shap_values(explain_np, nsamples=nsample_eff)
2721
+ bg_pred = f_predict(background_np)
2722
+ base_value = float(np.asarray(bg_pred, dtype=np.float64).mean())
2723
+
2724
+ self.shap_glm = {
2725
+ "explainer": explainer,
2726
+ "X_explain": explain_df, # 包含 const + 特征,用于画图
2727
+ "shap_values": shap_values,
2728
+ "base_value": base_value,
2729
+ "design_columns": design_cols
2730
+ }
2731
+ return self.shap_glm
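+
+     # The four compute_shap_* methods share one contract: each returns a dict
+     # with "explainer", "X_explain", "shap_values" and "base_value" (the GLM
+     # variant adds "design_columns"). A quick comparison pass might look like
+     # this (illustrative):
+     #
+     #     for fn in (bo.compute_shap_glm, bo.compute_shap_resn, bo.compute_shap_ft):
+     #         res = fn(n_background=100, n_samples=50)
+     #         print(type(res["explainer"]).__name__, res["base_value"])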