ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,3280 @@
1
+ from sklearn.metrics import log_loss, make_scorer, mean_tweedie_deviance
2
+ from sklearn.preprocessing import StandardScaler
3
+ from sklearn.neighbors import NearestNeighbors
4
+ from sklearn.model_selection import ShuffleSplit, cross_val_score # 1.2.2
5
+ import torch.distributed as dist
6
+ from torch.nn.parallel import DistributedDataParallel as DDP
7
+ from torch.nn.utils import clip_grad_norm_
8
+ from torch.cuda.amp import autocast, GradScaler
9
+ from torch.utils.data import Dataset, DataLoader, TensorDataset, DistributedSampler
10
+ import xgboost as xgb # 1.7.0
11
+ import torch.nn.functional as F
12
+ import torch.nn as nn
13
+ import torch # 版本: 1.10.1+cu111
14
+ import statsmodels.api as sm
15
+ import shap
16
+ import pandas as pd # 2.2.3
17
+ import optuna # 4.3.0
18
+ import numpy as np # 1.26.2
19
+ try:
20
+ from torch_geometric.nn import knn_graph
21
+ from torch_geometric.utils import add_self_loops, to_undirected
22
+ _PYG_AVAILABLE = True
23
+ except Exception:
24
+ knn_graph = None # type: ignore
25
+ add_self_loops = None # type: ignore
26
+ to_undirected = None # type: ignore
27
+ _PYG_AVAILABLE = False
28
+ import matplotlib.pyplot as plt
29
+ import joblib
30
+ import csv
31
+ from typing import Any, Dict, List, Optional
32
+ from pathlib import Path
33
+ from dataclasses import dataclass
34
+ import os
35
+ import math
36
+ import gc
37
+ import copy
38
+
39
+
40
+ # Constants and utility helpers
41
+ # =============================================================================
42
+ torch.backends.cudnn.benchmark = True
43
+ EPS = 1e-8
44
+
45
+
46
class IOUtils:
    """Small collection of file and path helpers."""

    @staticmethod
    def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
        """Read a UTF-8 CSV file into a list of row dictionaries.

        Columns whose header is the empty string (for example an unnamed
        index column) are dropped from every row.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            One dictionary per data row, keyed by column header.
        """
        # newline='' is what the csv module asks for: without it, line breaks
        # embedded in quoted fields are rewritten by universal-newline mode.
        with open(file_path, mode='r', encoding='utf-8', newline='') as file:
            reader = csv.DictReader(file)
            return [
                {key: value for key, value in row.items() if key != ''}
                for row in reader
            ]

    @staticmethod
    def ensure_parent_dir(file_path: str) -> None:
        """Create the directory that will contain ``file_path`` if needed.

        Args:
            file_path: Path of a file about to be written. A bare file name
                (no directory component) is a no-op.
        """
        directory = os.path.dirname(file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
64
+
65
+
66
class TrainingUtils:
    """Small helper functions used during model training."""

    @staticmethod
    def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
        """Derive a batch size from the data size and learning rate.

        The estimate is ``sqrt(learning_rate / 1e-4) * data_size / batch_num``
        (``batch_num`` floored at 1), raised to at least ``minimum``, capped
        at ``data_size`` and never below 1.

        Returns:
            The batch size as a positive integer.
        """
        estimated = int((learning_rate / 1e-4) ** 0.5 *
                        (data_size / max(batch_num, 1)))
        return max(1, min(data_size, max(minimum, estimated)))

    @staticmethod
    def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
        """Element-wise Tweedie deviance between ``pred`` and ``target``.

        Args:
            pred: Predicted values; clamped to at least ``eps`` before use.
            target: Observed values.
            p: Tweedie power. 0 is Gaussian, 1 Poisson, 2 Gamma; any other
                value uses the general Tweedie formula.
            eps: Lower clamp for predictions, added inside logarithms, and
                used as the replacement value for NaN results.
            max_clip: Replacement magnitude for +/- infinity in the result.

        Returns:
            A tensor of per-element deviances (no reduction applied).
        """
        # Clamp predictions to be strictly positive for numerical stability.
        pred_clamped = torch.clamp(pred, min=eps)
        if p == 1:
            # Poisson deviance: 2 * (t * log(t / mu) - t + mu).
            # term2 is subtracted below, so it must be (t - mu); with the
            # opposite sign the loss decreases without bound as mu grows.
            term1 = target * torch.log(target / pred_clamped + eps)
            term2 = target - pred_clamped
            term3 = 0
        elif p == 0:
            # Gaussian: squared error after the factor of 2 below.
            term1 = 0.5 * torch.pow(target - pred_clamped, 2)
            term2 = 0
            term3 = 0
        elif p == 2:
            # Gamma deviance: 2 * (log(mu / t) + t / mu - 1).
            term1 = torch.log(pred_clamped / target + eps)
            term2 = -target / pred_clamped + 1
            term3 = 0
        else:
            term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
            term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
            term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
        return torch.nan_to_num(
            2 * (term1 - term2 + term3),
            nan=eps,
            posinf=max_clip,
            neginf=-max_clip
        )

    @staticmethod
    def free_cuda() -> None:
        """Best-effort release of GPU memory held by live Python objects.

        Calls ``.to("cpu")`` on every garbage-collector-tracked object that
        exposes a callable ``to``, runs a collection, then empties the CUDA
        cache. Failures on individual objects are ignored on purpose.
        """
        print(">>> Moving all models to CPU...")
        for obj in gc.get_objects():
            try:
                if hasattr(obj, "to") and callable(obj.to):
                    obj.to("cpu")
            except Exception:
                # Arbitrary objects may expose an unrelated `to`; skip them.
                pass

        print(">>> Deleting tensors, optimizers, dataloaders...")
        gc.collect()

        print(">>> Emptying CUDA cache...")
        # synchronize() initialises CUDA and raises on machines without it.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

        print(">>> CUDA memory freed.")
120
+
121
+
122
class DistributedUtils:
    """Helpers for initialising and querying ``torch.distributed`` state."""

    # Cached (is_ddp_enabled, local_rank, rank, world_size) tuple returned by
    # setup_ddp while a process group is alive.
    _cached_state: Optional[tuple] = None

    @staticmethod
    def setup_ddp():
        """Initialize the DDP process group from environment variables.

        If a process group already exists its (cached) state is returned.
        Otherwise ``RANK`` and ``WORLD_SIZE`` must be present in the
        environment; ``LOCAL_RANK`` defaults to 0 when absent.

        Returns:
            Tuple ``(is_ddp_enabled, local_rank, rank, world_size)``;
            ``(False, 0, 0, 1)`` when the required variables are missing.
        """
        if dist.is_initialized():
            if DistributedUtils._cached_state is None:
                rank = dist.get_rank()
                world_size = dist.get_world_size()
                local_rank = int(os.environ.get("LOCAL_RANK", 0))
                DistributedUtils._cached_state = (
                    True,
                    local_rank,
                    rank,
                    world_size,
                )
            return DistributedUtils._cached_state

        if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
            rank = int(os.environ["RANK"])
            world_size = int(os.environ["WORLD_SIZE"])
            # Same default as the already-initialised branch above, so a
            # launcher that omits LOCAL_RANK does not raise KeyError.
            local_rank = int(os.environ.get("LOCAL_RANK", 0))

            cuda_ok = torch.cuda.is_available()
            if cuda_ok:
                torch.cuda.set_device(local_rank)

            # NCCL needs CUDA devices; use gloo when none are available.
            backend = "nccl" if cuda_ok else "gloo"
            dist.init_process_group(backend=backend, init_method="env://")
            print(
                f">>> DDP Initialized: Rank {rank}/{world_size}, Local Rank {local_rank}")
            DistributedUtils._cached_state = (
                True,
                local_rank,
                rank,
                world_size,
            )
            return DistributedUtils._cached_state
        else:
            print(
                f">>> DDP Setup Failed: RANK or WORLD_SIZE not found in env. Keys found: {list(os.environ.keys())}")
            return False, 0, 0, 1

    @staticmethod
    def cleanup_ddp():
        """Destroy the DDP process group (if any) and drop the cached state."""
        if dist.is_initialized():
            dist.destroy_process_group()
        # Always clear the cache so a stale tuple can never be handed out.
        DistributedUtils._cached_state = None

    @staticmethod
    def is_main_process():
        """Return True when not distributed or when this process is rank 0."""
        return not dist.is_initialized() or dist.get_rank() == 0

    @staticmethod
    def world_size() -> int:
        """Return the distributed world size, or 1 when not initialised."""
        return dist.get_world_size() if dist.is_initialized() else 1
178
+
179
+
180
class PlotUtils:
    """Plotting helpers shared by several models."""

    @staticmethod
    def split_data(data: pd.DataFrame, col_nme: str, wgt_nme: str, n_bins: int = 10) -> pd.DataFrame:
        """Bucket rows into weight-balanced bins ordered by ``col_nme``.

        Rows are sorted ascending by ``col_nme``; the bin id is the floor of
        ``cumulative weight * n_bins / total weight`` (the top edge is folded
        into the last bin). When the total weight is at most ``EPS`` every row
        goes to bin 0.

        Returns:
            Per-bin sums of all numeric columns, indexed by ``bins``.
        """
        ordered = data.sort_values(by=col_nme, ascending=True).copy()
        ordered['cum_weight'] = ordered[wgt_nme].cumsum()
        total_weight = ordered[wgt_nme].sum()
        if total_weight <= EPS:
            ordered.loc[:, 'bins'] = 0
        else:
            scaled = ordered['cum_weight'] * float(n_bins) / total_weight
            ordered.loc[:, 'bins'] = np.floor(scaled)
            on_top_edge = (ordered['bins'] == n_bins)
            ordered.loc[on_top_edge, 'bins'] = n_bins - 1
        grouped = ordered.groupby(['bins'], observed=True)
        return grouped.sum(numeric_only=True)

    @staticmethod
    def plot_lift_ax(ax, plot_data, title, pred_label='Predicted', act_label='Actual', weight_label='Earned Exposure'):
        """Draw a lift chart on ``ax``: ``act_v``/``exp_v`` lines plus ``weight`` bars on a twin axis."""
        positions = plot_data.index
        ax.plot(positions, plot_data['act_v'], label=act_label, color='red')
        ax.plot(positions, plot_data['exp_v'], label=pred_label, color='blue')
        ax.set_title(title, fontsize=8)
        ax.set_xticks(positions)
        ax.set_xticklabels(positions, rotation=90, fontsize=6)
        ax.tick_params(axis='y', labelsize=6)
        ax.legend(loc='upper left', fontsize=5, frameon=False)
        ax.margins(0.05)

        # Secondary y-axis carries the weight bars.
        weight_ax = ax.twinx()
        weight_ax.bar(positions, plot_data['weight'],
                      alpha=0.5, color='seagreen', label=weight_label)
        weight_ax.tick_params(axis='y', labelsize=6)
        weight_ax.legend(loc='upper right', fontsize=5, frameon=False)

    @staticmethod
    def plot_dlift_ax(ax, plot_data, title, label1, label2, act_label='Actual', weight_label='Earned Exposure'):
        """Draw a double-lift chart on ``ax``: ``act_v``, ``exp_v1`` and ``exp_v2`` lines plus ``weight`` bars."""
        positions = plot_data.index
        series = (
            ('act_v', act_label, 'red'),
            ('exp_v1', label1, 'blue'),
            ('exp_v2', label2, 'black'),
        )
        for column, legend_label, colour in series:
            ax.plot(positions, plot_data[column], label=legend_label, color=colour)
        ax.set_title(title, fontsize=8)
        ax.set_xticks(positions)
        ax.set_xticklabels(positions, rotation=90, fontsize=6)
        ax.set_xlabel(f'{label1} / {label2}', fontsize=6)
        ax.tick_params(axis='y', labelsize=6)
        ax.legend(loc='upper left', fontsize=5, frameon=False)
        ax.margins(0.1)

        # Secondary y-axis carries the weight bars.
        weight_ax = ax.twinx()
        weight_ax.bar(positions, plot_data['weight'],
                      alpha=0.5, color='seagreen', label=weight_label)
        weight_ax.tick_params(axis='y', labelsize=6)
        weight_ax.legend(loc='upper right', fontsize=5, frameon=False)

    @staticmethod
    def plot_lift_list(pred_model, w_pred_list, w_act_list,
                       weight_list, tgt_nme, n_bins: int = 10,
                       fig_nme: str = 'Lift Chart'):
        """Build a lift chart and save it under ``<cwd>/plot/``.

        Rows are binned by ``pred_model``; each bin plots
        ``sum(w_pred) / sum(weight)`` against ``sum(act) / sum(weight)``.
        The figure is written to ``05_{tgt_nme}_{fig_nme}.png`` and closed.
        """
        lift_frame = pd.DataFrame()
        columns = (
            ('pred', pred_model),
            ('w_pred', w_pred_list),
            ('act', w_act_list),
            ('weight', weight_list),
        )
        for column, values in columns:
            lift_frame.loc[:, column] = values

        binned = PlotUtils.split_data(lift_frame, 'pred', 'weight', n_bins)
        binned['exp_v'] = binned['w_pred'] / binned['weight']
        binned['act_v'] = binned['act'] / binned['weight']
        binned.reset_index(inplace=True)

        fig = plt.figure(figsize=(7, 5))
        ax = fig.add_subplot(111)
        PlotUtils.plot_lift_ax(ax, binned, f'Lift Chart of {tgt_nme}')
        plt.subplots_adjust(wspace=0.3)

        out_path = os.path.join(
            os.getcwd(), 'plot', f'05_{tgt_nme}_{fig_nme}.png')
        IOUtils.ensure_parent_dir(out_path)
        plt.savefig(out_path, dpi=300)
        plt.close(fig)

    @staticmethod
    def plot_dlift_list(pred_model_1, pred_model_2,
                        model_nme_1, model_nme_2,
                        tgt_nme,
                        w_list, w_act_list, n_bins: int = 10,
                        fig_nme: str = 'Double Lift Chart'):
        """Build a double-lift chart for two models and save it under ``<cwd>/plot/``.

        Rows are binned by the ratio ``pred_model_1 / pred_model_2``. Within
        each bin both weighted predictions are divided by the summed ``act``
        column, so the actual line is ``act / act``. The figure is written to
        ``06_{tgt_nme}_{fig_nme}.png`` and closed.
        """
        lift_frame = pd.DataFrame()
        lift_frame.loc[:, 'pred1'] = pred_model_1
        lift_frame.loc[:, 'pred2'] = pred_model_2
        ratio = lift_frame['pred1'] / lift_frame['pred2']
        lift_frame.loc[:, 'diff_ly'] = ratio
        lift_frame.loc[:, 'act'] = w_act_list
        lift_frame.loc[:, 'weight'] = w_list
        for suffix in ('1', '2'):
            weighted = lift_frame[f'pred{suffix}'] * lift_frame['weight']
            lift_frame.loc[:, f'w_pred{suffix}'] = weighted

        binned = PlotUtils.split_data(
            lift_frame, 'diff_ly', 'weight', n_bins)
        binned['exp_v1'] = binned['w_pred1'] / binned['act']
        binned['exp_v2'] = binned['w_pred2'] / binned['act']
        binned['act_v'] = binned['act']/binned['act']
        binned.reset_index(inplace=True)

        fig = plt.figure(figsize=(7, 5))
        ax = fig.add_subplot(111)
        PlotUtils.plot_dlift_ax(
            ax, binned, f'Double Lift Chart of {tgt_nme}', model_nme_1, model_nme_2)
        plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)

        out_path = os.path.join(
            os.getcwd(), 'plot', f'06_{tgt_nme}_{fig_nme}.png')
        IOUtils.ensure_parent_dir(out_path)
        plt.savefig(out_path, dpi=300)
        plt.close(fig)
296
+
297
+
298
+ # Backward-compatible function-style wrappers
299
def csv_to_dict(file_path: str) -> List[Dict[str, Any]]:
    """Function-style alias that delegates to ``IOUtils.csv_to_dict``."""
    rows = IOUtils.csv_to_dict(file_path)
    return rows
301
+
302
+
303
def ensure_parent_dir(file_path: str) -> None:
    """Function-style alias that delegates to ``IOUtils.ensure_parent_dir``."""
    IOUtils.ensure_parent_dir(file_path=file_path)
305
+
306
+
307
def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
    """Function-style alias that delegates to ``TrainingUtils.compute_batch_size``."""
    return TrainingUtils.compute_batch_size(
        data_size=data_size,
        learning_rate=learning_rate,
        batch_num=batch_num,
        minimum=minimum,
    )
309
+
310
+
311
+ # Tweedie deviance loss for use with PyTorch tensors
312
+ # Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances
313
def tweedie_loss(pred, target, p=1.5, eps=1e-6, max_clip=1e6):
    """Function-style alias that delegates to ``TrainingUtils.tweedie_loss``."""
    loss = TrainingUtils.tweedie_loss(pred, target, p, eps, max_clip)
    return loss
315
+
316
+
317
+ # Helper that releases CUDA memory
318
def free_cuda():
    """Function-style alias that delegates to ``TrainingUtils.free_cuda``."""
    return TrainingUtils.free_cuda()
320
+
321
+
322
class TorchTrainerMixin:
    """Shared training utilities for Torch-based tabular trainers.

    The host class must provide ``learning_rate`` and ``batch_num`` (read by
    ``_build_dataloader``). ``device``, ``task_type``, ``tw_power``,
    ``patience``, ``epochs`` and ``dataloader_sampler`` are read with
    fallbacks when absent.
    """

    def _device_type(self) -> str:
        """Return the type string of ``self.device`` (``"cpu"`` if unset)."""
        return getattr(self, "device", torch.device("cpu")).type

    def _build_dataloader(self,
                          dataset,
                          N: int,
                          base_bs_gpu: tuple,
                          base_bs_cpu: tuple,
                          min_bs: int = 64,
                          target_effective_cuda: int = 8192,
                          target_effective_cpu: int = 4096,
                          large_threshold: int = 200_000,
                          mid_threshold: int = 50_000):
        """Create a shuffling DataLoader and a gradient-accumulation step count.

        Args:
            dataset: Dataset handed to ``DataLoader``.
            N: Sample count used to pick the size tier and cap the batch size.
            base_bs_gpu: ``(large, mid, small)`` batch-size caps on CUDA; each
                is multiplied by the number of visible CUDA devices.
            base_bs_cpu: ``(mid, small)`` batch-size caps off CUDA.
            min_bs: Minimum batch size (multiplied by the device count when
                more than one CUDA device is visible).
            target_effective_cuda: Target effective batch size on CUDA.
            target_effective_cpu: Target effective batch size otherwise.
            large_threshold: ``N`` above this selects the "large" tier.
            mid_threshold: ``N`` above this selects the "mid" tier.

        Returns:
            ``(dataloader, accum_steps)``.
        """
        gpu_large, gpu_mid, gpu_small = base_bs_gpu
        cpu_mid, cpu_small = base_bs_cpu
        on_cuda = self._device_type() == 'cuda'

        if on_cuda:
            device_count = torch.cuda.device_count()
            # With several GPUs, raise the minimum batch so that every
            # device still receives a reasonable share of each batch.
            if device_count > 1:
                min_bs = min_bs * device_count
                print(
                    f">>> Multi-GPU detected: {device_count} devices. Adjusted min_bs to {min_bs}.")

            if N > large_threshold:
                base_bs = gpu_large * device_count
            elif N > mid_threshold:
                base_bs = gpu_mid * device_count
            else:
                base_bs = gpu_small * device_count
        else:
            base_bs = cpu_mid if N > mid_threshold else cpu_small

        # Computed once, after min_bs has received its multi-GPU adjustment.
        batch_size = TrainingUtils.compute_batch_size(
            data_size=len(dataset),
            learning_rate=self.learning_rate,
            batch_num=self.batch_num,
            minimum=min_bs
        )
        # Floor at 1 so the integer division below cannot divide by zero.
        batch_size = max(1, min(batch_size, base_bs, N))

        target_effective_bs = target_effective_cuda if on_cuda else target_effective_cpu
        accum_steps = max(1, target_effective_bs // batch_size)

        # Windows (nt) gets no worker processes; elsewhere use up to 8.
        # Decided before logging so the message reports the value used.
        if os.name == 'nt':
            workers = 0
        else:
            workers = min(8, os.cpu_count() or 1)

        print(
            f">>> DataLoader config: Batch Size={batch_size}, Accum Steps={accum_steps}, Workers={workers}")

        sampler = None
        if dist.is_initialized():
            sampler = DistributedSampler(dataset, shuffle=True)
            shuffle = False  # Sampler handles shuffling
        else:
            shuffle = True

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            sampler=sampler,
            num_workers=workers,
            pin_memory=on_cuda,
            persistent_workers=workers > 0,
        )
        return dataloader, accum_steps

    def _compute_weighted_loss(self, y_pred, y_true, weights, apply_softplus: bool = False):
        """Return the weight-averaged loss for one batch.

        Classification uses ``BCEWithLogitsLoss``; any other task uses the
        Tweedie deviance with power ``self.tw_power`` (default 1.5) on
        predictions clamped to at least 1e-6.

        Args:
            y_pred: Model outputs.
            y_true: Targets.
            weights: Per-sample weights.
            apply_softplus: Apply softplus to ``y_pred`` first (non-classification only).
        """
        task = getattr(self, "task_type", "regression")
        if task == 'classification':
            loss_fn = nn.BCEWithLogitsLoss(reduction='none')
            losses = loss_fn(y_pred, y_true).view(-1)
        else:
            if apply_softplus:
                y_pred = F.softplus(y_pred)
            y_pred = torch.clamp(y_pred, min=1e-6)
            power = getattr(self, "tw_power", 1.5)
            losses = tweedie_loss(y_pred, y_true, p=power).view(-1)
        weighted_loss = (losses * weights.view(-1)).sum() / \
            torch.clamp(weights.sum(), min=EPS)
        return weighted_loss

    def _early_stop_update(self, val_loss, best_loss, best_state, patience_counter, model,
                           ignore_keys: Optional[List[str]] = None):
        """Update early-stopping bookkeeping after one validation pass.

        Args:
            val_loss: Validation loss of the current epoch.
            best_loss: Best validation loss seen so far.
            best_state: State-dict snapshot belonging to ``best_loss``.
            patience_counter: Number of epochs without improvement so far.
            model: Model whose ``state_dict`` is snapshotted on improvement.
            ignore_keys: State-dict key prefixes to leave out of the snapshot.

        Returns:
            ``(best_loss, best_state, patience_counter, should_stop)``.
        """
        if val_loss < best_loss:
            ignore_keys = ignore_keys or []
            # Clone tensors so later optimizer steps cannot alter the snapshot.
            state_dict = {
                k: (v.clone() if isinstance(v, torch.Tensor) else copy.deepcopy(v))
                for k, v in model.state_dict().items()
                if not any(k.startswith(ignore_key) for ignore_key in ignore_keys)
            }
            return val_loss, state_dict, 0, False
        patience_counter += 1
        should_stop = best_state is not None and patience_counter >= getattr(
            self, "patience", 0)
        return best_loss, best_state, patience_counter, should_stop

    def _train_model(self,
                     model,
                     dataloader,
                     accum_steps,
                     optimizer,
                     scaler,
                     forward_fn,
                     val_forward_fn=None,
                     apply_softplus: bool = False,
                     clip_fn=None,
                     trial: "Optional[optuna.trial.Trial]" = None):
        """Run the epoch loop with gradient accumulation, AMP and early stopping.

        Args:
            model: Module being trained.
            dataloader: Iterable of training batches.
            accum_steps: Number of batches accumulated per optimizer step.
            optimizer: Optimizer stepped through ``scaler``.
            scaler: ``GradScaler`` used for backward and step.
            forward_fn: ``batch -> (y_pred, y_true, weights)``.
            val_forward_fn: Optional callable returning either
                ``(y_pred, y_true, weights)`` or an already-computed loss.
            apply_softplus: Forwarded to ``_compute_weighted_loss``.
            clip_fn: Optional zero-argument callable invoked before each
                optimizer step.
            trial: Optional Optuna trial used for reporting and pruning.

        Returns:
            State-dict snapshot of the best validation epoch, or ``None``
            when no validation function was supplied.

        Raises:
            optuna.TrialPruned: When the trial reports that it should prune.
        """
        device_type = self._device_type()
        use_amp = device_type == 'cuda'
        best_loss = float('inf')
        best_state = None
        patience_counter = 0
        stop_training = False

        # Prefer an explicitly attached sampler; otherwise use the loader's
        # own one, so a DistributedSampler reshuffles between epochs.
        epoch_sampler = getattr(self, 'dataloader_sampler', None)
        if epoch_sampler is None:
            epoch_sampler = getattr(dataloader, 'sampler', None)
        set_epoch = getattr(epoch_sampler, 'set_epoch', None)

        for epoch in range(1, getattr(self, "epochs", 1) + 1):
            if callable(set_epoch):
                set_epoch(epoch)

            model.train()
            optimizer.zero_grad()

            for step, batch in enumerate(dataloader):
                with autocast(enabled=use_amp):
                    y_pred, y_true, w = forward_fn(batch)
                    weighted_loss = self._compute_weighted_loss(
                        y_pred, y_true, w, apply_softplus=apply_softplus)
                    loss_for_backward = weighted_loss / accum_steps

                scaler.scale(loss_for_backward).backward()

                # Step on every accum_steps-th batch and on the last batch.
                if ((step + 1) % accum_steps == 0) or ((step + 1) == len(dataloader)):
                    # NOTE(review): gradients are still scaled here unless
                    # clip_fn calls scaler.unscale_ itself; confirm in callers.
                    if clip_fn is not None:
                        clip_fn()
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()

            if val_forward_fn is not None:
                model.eval()
                with torch.no_grad(), autocast(enabled=use_amp):
                    val_result = val_forward_fn()
                    if isinstance(val_result, tuple) and len(val_result) == 3:
                        y_val_pred, y_val_true, w_val = val_result
                        val_weighted_loss = self._compute_weighted_loss(
                            y_val_pred, y_val_true, w_val, apply_softplus=apply_softplus)
                    else:
                        val_weighted_loss = val_result

                # Use a plain float for the comparison and for Optuna rather
                # than keeping a (possibly on-device) tensor across epochs.
                val_loss_value = float(val_weighted_loss)

                best_loss, best_state, patience_counter, stop_training = self._early_stop_update(
                    val_loss_value, best_loss, best_state, patience_counter, model)

                # Optuna pruning: report this epoch and abort if asked to.
                if trial is not None:
                    trial.report(val_loss_value, epoch)
                    if trial.should_prune():
                        raise optuna.TrialPruned()

            if stop_training:
                break

        return best_state
498
+
499
+
500
+ # =============================================================================
501
+ # Plotting helpers
502
+ # =============================================================================
503
+
504
def split_data(data, col_nme, wgt_nme, n_bins=10):
    """Function-style alias that delegates to ``PlotUtils.split_data``."""
    binned = PlotUtils.split_data(data, col_nme, wgt_nme, n_bins)
    return binned
506
+
507
+ # Lift-chart plotting function
508
+
509
+
510
def plot_lift_list(pred_model, w_pred_list, w_act_list,
                   weight_list, tgt_nme, n_bins=10,
                   fig_nme='Lift Chart'):
    """Function-style alias that delegates to ``PlotUtils.plot_lift_list``."""
    return PlotUtils.plot_lift_list(
        pred_model,
        w_pred_list,
        w_act_list,
        weight_list,
        tgt_nme,
        n_bins,
        fig_nme,
    )
515
+
516
+ # Double-lift-chart plotting function
517
+
518
+
519
def plot_dlift_list(pred_model_1, pred_model_2,
                    model_nme_1, model_nme_2,
                    tgt_nme,
                    w_list, w_act_list, n_bins=10,
                    fig_nme='Double Lift Chart'):
    """Function-style alias that delegates to ``PlotUtils.plot_dlift_list``."""
    return PlotUtils.plot_dlift_list(
        pred_model_1,
        pred_model_2,
        model_nme_1,
        model_nme_2,
        tgt_nme,
        w_list,
        w_act_list,
        n_bins,
        fig_nme,
    )
528
+
529
+
530
+ # =============================================================================
531
+ # ResNet model and sklearn-style wrapper
532
+ # =============================================================================
533
+
534
+ # ResNet model architecture
535
+ # Residual block: two linear layers + ReLU + residual connection
536
+ # ResBlock subclasses nn.Module
537
class ResBlock(nn.Module):
    """Residual block: norm -> linear -> ReLU -> dropout -> linear, added to the input.

    Args:
        dim: Width of the block; input and output share this size.
        dropout: Dropout probability; values not above 0 disable dropout.
        use_layernorm: Use ``LayerNorm`` when True, ``BatchNorm1d`` otherwise.
        residual_scale: Initial value of the learnable scale applied to the
            branch output before it is added to the input.
    """

    def __init__(self, dim: int, dropout: float = 0.1,
                 use_layernorm: bool = False, residual_scale: float = 0.1
                 ):
        super().__init__()
        self.use_layernorm = use_layernorm

        # LayerNorm normalises over the last dimension; BatchNorm1d is kept
        # as a switchable alternative.
        norm_cls = nn.LayerNorm if use_layernorm else nn.BatchNorm1d

        self.norm1 = norm_cls(dim)
        self.fc1 = nn.Linear(dim, dim, bias=True)
        self.act = nn.ReLU(inplace=True)
        if dropout > 0.0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()
        self.fc2 = nn.Linear(dim, dim, bias=True)

        # Learnable residual scale, so the branch starts as a small
        # perturbation of the skip path.
        initial_scale = torch.tensor(residual_scale, dtype=torch.float32)
        self.res_scale = nn.Parameter(initial_scale)

    def forward(self, x):
        """Return ``x + res_scale * branch(x)``."""
        # Pre-activation layout: normalise before the first linear layer.
        branch = self.fc1(self.norm1(x))
        branch = self.dropout(self.act(branch))
        branch = self.fc2(branch)
        # Scale the branch, then add the skip connection.
        return x + self.res_scale * branch
571
+
572
+ # ResNetSequential subclasses nn.Module and defines the full network
573
+
574
+
575
class ResNetSequential(nn.Module):
    """Fully connected residual network for tabular input.

    Layout: ``Linear(input_dim, hidden_dim)`` -> ``block_num`` ``ResBlock``s
    -> ``Linear(hidden_dim, 1)`` -> output head. The head is ``Softplus``
    unless ``task_type`` is ``'classification'``, in which case it is an
    ``Identity`` (still registered under the name ``softplus``).
    """

    def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
                 use_layernorm: bool = True, dropout: float = 0.1,
                 residual_scale: float = 0.1, task_type: str = 'regression'):
        super().__init__()

        layers = nn.Sequential()
        layers.add_module('fc1', nn.Linear(input_dim, hidden_dim))

        # Stack of residual blocks, named ResBlk_1 .. ResBlk_{block_num}.
        for block_idx in range(1, block_num + 1):
            block = ResBlock(
                hidden_dim,
                dropout=dropout,
                use_layernorm=use_layernorm,
                residual_scale=residual_scale)
            layers.add_module(f'ResBlk_{block_idx}', block)

        layers.add_module('fc_out', nn.Linear(hidden_dim, 1))

        is_classifier = task_type == 'classification'
        head = nn.Identity() if is_classifier else nn.Softplus()
        layers.add_module('softplus', head)

        self.net = layers

    def forward(self, x):
        """Run the network; logs the input device on the first training-mode call."""
        first_training_call = self.training and not hasattr(
            self, '_printed_device')
        if first_training_call:
            print(f">>> ResNetSequential executing on device: {x.device}")
            self._printed_device = True
        return self.net(x)
617
+
618
+ # Scikit-learn-style interface class for the ResNet model
619
+
620
+
621
+ class ResNetSklearn(TorchTrainerMixin, nn.Module):
622
def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
             block_num: int = 2, batch_num: int = 100, epochs: int = 100,
             task_type: str = 'regression',
             tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
             use_layernorm: bool = True, dropout: float = 0.1,
             residual_scale: float = 0.1,
             use_data_parallel: bool = True,
             use_ddp: bool = False):
    """Store the hyper-parameters and build the (optionally multi-GPU) network.

    The Tweedie power is derived from ``model_nme``: a name containing
    ``'f'`` selects 1, otherwise a name containing ``'s'`` selects 2,
    otherwise ``tweedie_power`` is used. It is ``None`` for classification.
    """
    super(ResNetSklearn, self).__init__()

    self.use_ddp = use_ddp
    # Single-process defaults unless DDP was requested, in which case the
    # values reported by setup_ddp() are used.
    ddp_state = DistributedUtils.setup_ddp() if self.use_ddp else (False, 0, 0, 1)
    self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = ddp_state

    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.block_num = block_num
    self.batch_num = batch_num
    self.epochs = epochs
    self.task_type = task_type
    self.model_nme = model_nme
    self.learning_rate = learning_rate
    self.patience = patience
    self.use_layernorm = use_layernorm
    self.dropout = dropout
    self.residual_scale = residual_scale

    # Device preference: DDP-local GPU > cuda > mps > cpu.
    if self.is_ddp_enabled:
        device_name = f'cuda:{self.local_rank}'
    elif torch.cuda.is_available():
        device_name = 'cuda'
    elif torch.backends.mps.is_available():
        device_name = 'mps'
    else:
        device_name = 'cpu'
    self.device = torch.device(device_name)

    # Tweedie power (unused for classification).
    if self.task_type == 'classification':
        chosen_power = None
    elif 'f' in self.model_nme:
        chosen_power = 1
    elif 's' in self.model_nme:
        chosen_power = 2
    else:
        chosen_power = tweedie_power
    self.tw_power = chosen_power

    # The bare network is created first and moved/wrapped afterwards.
    core = ResNetSequential(
        self.input_dim,
        self.hidden_dim,
        self.block_num,
        use_layernorm=self.use_layernorm,
        dropout=self.dropout,
        residual_scale=self.residual_scale,
        task_type=self.task_type
    )

    # Multi-GPU support: DistributedDataParallel takes precedence over DataParallel.
    multi_gpu_dp = (
        use_data_parallel
        and (self.device.type == 'cuda')
        and (torch.cuda.device_count() > 1)
    ) if not self.is_ddp_enabled else False
    if self.is_ddp_enabled:
        core = DDP(
            core.to(self.device),
            device_ids=[self.local_rank],
            output_device=self.local_rank)
    elif multi_gpu_dp:
        core = nn.DataParallel(
            core, device_ids=list(range(torch.cuda.device_count())))
        # DataParallel scatters inputs across GPUs, but the primary device
        # is still the default CUDA device.
        self.device = torch.device('cuda')

    self.resnet = core.to(self.device)
695
+
696
+ # ================ 内部工具 ================
697
+ def _build_train_val_tensors(self, X_train, y_train, w_train, X_val, y_val, w_val):
698
+ X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
699
+ y_tensor = torch.tensor(
700
+ y_train.values, dtype=torch.float32).view(-1, 1)
701
+ w_tensor = torch.tensor(w_train.values, dtype=torch.float32).view(
702
+ -1, 1) if w_train is not None else torch.ones_like(y_tensor)
703
+
704
+ has_val = X_val is not None and y_val is not None
705
+ if has_val:
706
+ X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
707
+ y_val_tensor = torch.tensor(
708
+ y_val.values, dtype=torch.float32).view(-1, 1)
709
+ w_val_tensor = torch.tensor(w_val.values, dtype=torch.float32).view(
710
+ -1, 1) if w_val is not None else torch.ones_like(y_val_tensor)
711
+ else:
712
+ X_val_tensor = y_val_tensor = w_val_tensor = None
713
+ return X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val
714
+
715
def forward(self, x):
    """Forward ``x`` through the wrapped network on ``self.device``.

    NumPy arrays (e.g. the inputs SHAP passes in) are converted to float32
    tensors first; anything else is assumed to already be a tensor.
    """
    inputs = torch.tensor(x, dtype=torch.float32) if isinstance(x, np.ndarray) else x
    return self.resnet(inputs.to(self.device))
725
+
726
+ # ---------------- 训练 ----------------
727
+
728
def fit(self, X_train, y_train, w_train=None,
        X_val=None, y_val=None, w_val=None, trial=None):
    """Train the wrapped ResNet, optionally tracking a validation loss.

    Args:
        X_train, y_train, w_train: Training features, targets and optional
            sample weights; converted by ``_build_train_val_tensors``.
        X_val, y_val, w_val: Optional validation split. Validation is only
            performed when both ``X_val`` and ``y_val`` are given.
        trial: Passed through unchanged to ``_train_model``
            (presumably an Optuna trial -- confirm in ``_train_model``).

    The state returned by ``_train_model`` is loaded back into the network
    when a validation split was used and a state was returned.
    """
    X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val = \
        self._build_train_val_tensors(
            X_train, y_train, w_train, X_val, y_val, w_val)

    dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
    dataloader, accum_steps = self._build_dataloader(
        dataset,
        N=X_tensor.shape[0],
        base_bs_gpu=(16384, 8192, 4096),
        base_bs_cpu=(1024, 512),
        min_bs=64,
        target_effective_cuda=8192,
        target_effective_cpu=4096
    )

    # Remember the sampler so its epoch can be set before each epoch under
    # DDP, which keeps the shuffling different from epoch to epoch.
    if self.is_ddp_enabled and hasattr(dataloader.sampler, 'set_epoch'):
        self.dataloader_sampler = dataloader.sampler
    else:
        self.dataloader_sampler = None

    # Optimizer and AMP gradient scaler (scaling is only enabled on CUDA).
    self.optimizer = torch.optim.Adam(
        self.resnet.parameters(), lr=self.learning_rate)
    self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))

    val_dataloader = None
    if has_val:
        val_dataset = TensorDataset(
            X_val_tensor, y_val_tensor, w_val_tensor)
        # No backward pass during validation, so a larger batch is affordable.
        val_bs = accum_steps * dataloader.batch_size

        # Worker processes are disabled on Windows and capped at 4 elsewhere.
        if os.name == 'nt':
            val_workers = 0
        else:
            val_workers = min(4, os.cpu_count() or 1)

        # Validation uses a plain, non-distributed loader.
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=val_bs,
            shuffle=False,
            num_workers=val_workers,
            pin_memory=(self.device.type == 'cuda'),
            persistent_workers=val_workers > 0,
        )

    is_data_parallel = isinstance(self.resnet, nn.DataParallel)

    def forward_fn(batch):
        X_batch, y_batch, w_batch = batch

        # Under DataParallel the features are left where they are so the
        # wrapper can scatter them itself.
        if not is_data_parallel:
            X_batch = X_batch.to(self.device, non_blocking=True)
        # Targets and weights always live on the primary device, where the
        # loss is computed.
        y_batch = y_batch.to(self.device, non_blocking=True)
        w_batch = w_batch.to(self.device, non_blocking=True)

        y_pred = self.resnet(X_batch)
        return y_pred, y_batch, w_batch

    def val_forward_fn():
        # These do not change between batches, so resolve them once.
        task = getattr(self, "task_type", "regression")
        power = getattr(self, "tw_power", 1.5)
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')

        total_loss = 0.0
        total_weight = 0.0
        # Only a Python float is returned, so no autograd graph is needed.
        with torch.no_grad():
            for X_b, y_b, w_b in val_dataloader:
                if not is_data_parallel:
                    X_b = X_b.to(self.device, non_blocking=True)
                y_b = y_b.to(self.device, non_blocking=True)
                w_b = w_b.to(self.device, non_blocking=True)

                y_pred = self.resnet(X_b)

                # Per-sample losses are weighted manually so that the batch
                # sums can be accumulated exactly.
                if task == 'classification':
                    losses = bce_loss(y_pred, y_b).view(-1)
                else:
                    # No extra softplus here: the regression network already
                    # ends in Softplus, only a numerical floor is applied.
                    y_pred_clamped = torch.clamp(y_pred, min=1e-6)
                    losses = tweedie_loss(
                        y_pred_clamped, y_b, p=power).view(-1)

                batch_weight_sum = torch.clamp(w_b.sum(), min=EPS)
                batch_weighted_loss_sum = (losses * w_b.view(-1)).sum()

                total_loss += batch_weighted_loss_sum.item()
                total_weight += batch_weight_sum.item()

        return total_loss / max(total_weight, EPS)

    clip_fn = None
    if self.device.type == 'cuda':
        def clip_fn():
            # Gradients must be unscaled before their norm is clipped.
            return (self.scaler.unscale_(self.optimizer),
                    clip_grad_norm_(self.resnet.parameters(), max_norm=1.0))

    best_state = self._train_model(
        self.resnet,
        dataloader,
        accum_steps,
        self.optimizer,
        self.scaler,
        forward_fn,
        val_forward_fn if has_val else None,
        apply_softplus=False,
        clip_fn=clip_fn,
        trial=trial
    )

    if has_val and best_state is not None:
        self.resnet.load_state_dict(best_state)
854
+
855
+ # ---------------- 预测 ----------------
856
+
857
def predict(self, X_test):
    """Return flat NumPy predictions for ``X_test``.

    DataFrames are converted to float32 arrays; other inputs are passed to
    ``forward`` unchanged. Classification outputs are logits mapped through
    a sigmoid; regression outputs are floored at 1e-6.
    """
    self.resnet.eval()
    features = X_test.values.astype(np.float32) if isinstance(X_test, pd.DataFrame) else X_test

    with torch.no_grad():
        raw = self(features).cpu().numpy()

    if self.task_type == 'classification':
        # Sigmoid turns logits into probabilities.
        scores = 1 / (1 + np.exp(-raw))
    else:
        scores = np.clip(raw, 1e-6, None)
    return scores.flatten()
872
+
873
+ # ---------------- 设置参数 ----------------
874
+
875
def set_params(self, params):
    """Overwrite existing attributes from ``params`` and return ``self``.

    All keys are checked before anything is assigned, so an unknown key can
    no longer leave the object half-updated.

    Raises:
        ValueError: If a key is not an existing attribute. The message names
            the first such key in iteration order.
    """
    for key in params:
        if not hasattr(self, key):
            raise ValueError(f"Parameter {key} not found in model.")
    for key, value in params.items():
        setattr(self, key, value)
    return self
882
+
883
+
884
+ # =============================================================================
885
+ # FT-Transformer 模型与 sklearn 风格封装
886
+ # =============================================================================
887
+ # 开始定义FT Transformer模型结构
888
+
889
+
890
class FeatureTokenizer(nn.Module):
    """Map numeric and categorical features to a sequence of tokens.

    Conventions:
      - ``X_num``: numeric features, shape ``(batch, num_numeric)``.
      - ``X_cat``: categorical features, shape ``(batch, num_categorical)``;
        each column holds integer codes in ``[0, card - 1]``.

    The output has shape ``(batch, token_num, d_model)``.
    """

    def __init__(self, num_numeric: int, cat_cardinalities, d_model: int):
        super().__init__()

        self.num_numeric = num_numeric
        self.has_numeric = num_numeric > 0

        # The projection only exists when there are numeric features.
        if self.has_numeric:
            self.num_linear = nn.Linear(num_numeric, d_model)

        embedding_tables = [
            nn.Embedding(card, d_model) for card in cat_cardinalities
        ]
        self.embeddings = nn.ModuleList(embedding_tables)

    def forward(self, X_num, X_cat):
        # All numeric features together form a single token of shape
        # (batch, d_model).
        numeric_tokens = [self.num_linear(X_num)] if self.has_numeric else []

        # Each categorical column contributes one embedding token.
        categorical_tokens = [
            table(X_cat[:, col]) for col, table in enumerate(self.embeddings)
        ]

        # Stack into (batch, token_num, d_model).
        return torch.stack(numeric_tokens + categorical_tokens, dim=1)
927
+
928
+ # 定义具有残差缩放的Encoder层
929
+
930
+
931
class ScaledTransformerEncoderLayer(nn.Module):
    """Transformer encoder layer whose residual branches are scaled.

    The attention and feed-forward outputs are multiplied by
    ``residual_scale_attn`` / ``residual_scale_ffn`` before being added back
    to the residual stream. Inputs are batch-first:
    ``(batch, sequence length, d_model)``.
    """

    def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048,
                 dropout: float = 0.1, residual_scale_attn: float = 1.0,
                 residual_scale_ffn: float = 1.0, norm_first: bool = True,
                 ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=nhead,
            dropout=dropout,
            batch_first=True
        )

        # Feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # Normalisation and dropout applied around the two sub-blocks.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = nn.GELU()
        self.norm_first = norm_first

        # Residual scaling factors.
        self.res_scale_attn = residual_scale_attn
        self.res_scale_ffn = residual_scale_ffn

    def forward(self, src, src_mask=None, src_key_padding_mask=None,
                is_causal: bool = False):
        """Apply self-attention and the feed-forward block to ``src``.

        Args:
            src: Input of shape ``(batch, sequence length, d_model)``.
            src_mask: Optional attention mask forwarded to the attention module.
            src_key_padding_mask: Optional key padding mask forwarded likewise.
            is_causal: Accepted because ``nn.TransformerEncoder`` passes this
                keyword to every layer on PyTorch >= 2.0. It is ignored here;
                any masking is expressed through ``src_mask``.
        """
        x = src

        if self.norm_first:
            # Pre-norm: normalise before each sub-block.
            x = x + self._sa_block(self.norm1(x), src_mask,
                                   src_key_padding_mask)
            x = x + self._ff_block(self.norm2(x))
        else:
            # Post-norm variant.
            x = self.norm1(
                x + self._sa_block(x, src_mask, src_key_padding_mask))
            x = self.norm2(x + self._ff_block(x))

        return x

    def _sa_block(self, x, attn_mask, key_padding_mask):
        # Self-attention followed by dropout and residual scaling.
        attn_out, _ = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False
        )
        return self.res_scale_attn * self.dropout1(attn_out)

    def _ff_block(self, x):
        # Feed-forward network followed by dropout and residual scaling.
        x2 = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.res_scale_ffn * self.dropout2(x2)
994
+
995
+ # 定义FT-Transformer核心模型
996
+
997
+
998
class FTTransformerCore(nn.Module):
    """Minimal FT-Transformer made of three parts.

    1) ``FeatureTokenizer``: turns numeric/categorical features into tokens.
    2) ``nn.TransformerEncoder``: models interactions between the tokens.
    3) Mean pooling + linear head: a Softplus output for regression (keeps
       predictions positive for Tweedie/Gamma-style losses), raw logits for
       classification.
    """

    def __init__(self, num_numeric: int, cat_cardinalities, d_model: int = 64,
                 n_heads: int = 8, n_layers: int = 4, dropout: float = 0.1,
                 task_type: str = 'regression'
                 ):
        super().__init__()

        self.tokenizer = FeatureTokenizer(
            num_numeric=num_numeric,
            cat_cardinalities=cat_cardinalities,
            d_model=d_model
        )

        # Default residual scale: 1 / sqrt(number of layers).
        residual_scale = 1.0 / math.sqrt(n_layers)
        layer_template = ScaledTransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            residual_scale_attn=residual_scale,
            residual_scale_ffn=residual_scale,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            layer_template,
            num_layers=n_layers
        )
        self.n_layers = n_layers

        projection = nn.Linear(d_model, 1)
        if task_type == 'classification':
            # Classification emits logits, matching BCEWithLogitsLoss.
            output_activation = nn.Identity()
        else:
            # Regression must stay positive for Tweedie/Gamma objectives.
            output_activation = nn.Softplus()
        self.head = nn.Sequential(projection, output_activation)

    def forward(self, X_num, X_cat):
        # Inputs:
        #   X_num -> float32 tensor of shape (batch, number of numeric features)
        #   X_cat -> long tensor of shape (batch, number of categorical features)
        if self.training and not hasattr(self, '_printed_device'):
            print(f">>> FTTransformerCore executing on device: {X_num.device}")
            self._printed_device = True

        # (batch, token_num, d_model) before and after the encoder.
        encoded = self.encoder(self.tokenizer(X_num, X_cat))

        # Average over tokens -> (batch, d_model), then the head -> (batch, 1).
        pooled = encoded.mean(dim=1)
        return self.head(pooled)
1065
+
1066
+ # 定义TabularDataset类
1067
+
1068
+
1069
class TabularDataset(Dataset):
    """Dataset yielding ``(X_num, X_cat, y, w)`` rows from four aligned tensors."""

    def __init__(self, X_num, X_cat, y, w):
        # Expected inputs:
        #   X_num: torch.float32, shape (N, number of numeric features)
        #   X_cat: torch.long,    shape (N, number of categorical features)
        #   y:     torch.float32, shape (N, 1)
        #   w:     torch.float32, shape (N, 1)
        self.X_num, self.X_cat = X_num, X_cat
        self.y, self.w = y, w

    def __len__(self):
        # The target tensor defines the number of samples.
        return self.y.shape[0]

    def __getitem__(self, idx):
        columns = (self.X_num, self.X_cat, self.y, self.w)
        return tuple(column[idx] for column in columns)
1093
+
1094
+ # 定义FTTransformer的Scikit-Learn接口类
1095
+
1096
+
1097
+ class FTTransformerSklearn(TorchTrainerMixin, nn.Module):
1098
+
1099
+ # sklearn 风格包装:
1100
+ # - num_cols:数值特征列名列表
1101
+ # - cat_cols:类别特征列名列表(需事先做标签编码,取值 ∈ [0, n_classes-1])
1102
+
1103
def __init__(self, model_nme: str, num_cols, cat_cols, d_model: int = 64, n_heads: int = 8,
             n_layers: int = 4, dropout: float = 0.1, batch_num: int = 100, epochs: int = 100,
             task_type: str = 'regression',
             tweedie_power: float = 1.5, learning_rate: float = 1e-3, patience: int = 10,
             use_data_parallel: bool = True,
             use_ddp: bool = False
             ):
    """Store the hyper-parameters; the network itself is built lazily.

    ``self.ft`` stays ``None`` until ``_build_model`` is called. The Tweedie
    power follows ``model_nme``: ``'f'`` in the name selects 1.0, otherwise
    ``'s'`` selects 2.0, otherwise ``tweedie_power``; it is ``None`` for
    classification.
    """
    super().__init__()

    self.use_ddp = use_ddp
    # Single-process defaults unless DDP was requested.
    ddp_state = DistributedUtils.setup_ddp() if self.use_ddp else (False, 0, 0, 1)
    self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = ddp_state

    self.model_nme = model_nme
    self.num_cols = list(num_cols)
    self.cat_cols = list(cat_cols)
    self.d_model = d_model
    self.n_heads = n_heads
    self.n_layers = n_layers
    self.dropout = dropout
    self.batch_num = batch_num
    self.epochs = epochs
    self.learning_rate = learning_rate
    self.task_type = task_type
    self.patience = patience

    if self.task_type == 'classification':
        chosen_power = None  # Tweedie power is unused for classification.
    elif 'f' in self.model_nme:
        chosen_power = 1.0
    elif 's' in self.model_nme:
        chosen_power = 2.0
    else:
        chosen_power = tweedie_power
    self.tw_power = chosen_power

    # Device preference: DDP-local GPU > cuda > mps > cpu.
    if self.is_ddp_enabled:
        device_name = f"cuda:{self.local_rank}"
    elif torch.cuda.is_available():
        device_name = "cuda"
    elif torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    self.device = torch.device(device_name)

    self.cat_cardinalities = None
    self.cat_categories = {}
    self.ft = None
    self.use_data_parallel = torch.cuda.device_count() > 1 and use_data_parallel
1151
+
1152
def _build_model(self, X_train):
    """Record the training categories and build the FT-Transformer core.

    For every categorical column the categories seen in ``X_train`` are
    stored in ``self.cat_categories``; the embedding cardinality is one
    larger to reserve an index for unknown/missing values.
    """
    cat_cardinalities = []
    for col in self.cat_cols:
        categories = X_train[col].astype('category').cat.categories
        self.cat_categories[col] = categories  # full category set of the training data
        cat_cardinalities.append(len(categories) + 1)  # +1 slot for unknown/missing
    self.cat_cardinalities = cat_cardinalities

    core = FTTransformerCore(
        num_numeric=len(self.num_cols),
        cat_cardinalities=cat_cardinalities,
        d_model=self.d_model,
        n_heads=self.n_heads,
        n_layers=self.n_layers,
        dropout=self.dropout,
        task_type=self.task_type
    )

    if self.is_ddp_enabled:
        core = DDP(
            core.to(self.device),
            device_ids=[self.local_rank],
            output_device=self.local_rank)
    elif self.use_data_parallel:
        core = nn.DataParallel(
            core, device_ids=list(range(torch.cuda.device_count())))
        self.device = torch.device("cuda")
    self.ft = core.to(self.device)
1184
+
1185
+ def _encode_cats(self, X):
1186
+ # 输入 DataFrame 至少需要包含所有类别特征列
1187
+ # 返回形状 (N, 类别特征数) 的 int64 数组
1188
+
1189
+ if not self.cat_cols:
1190
+ return np.zeros((len(X), 0), dtype='int64')
1191
+
1192
+ X_cat_list = []
1193
+ for col in self.cat_cols:
1194
+ # 使用训练阶段记录的类别全集
1195
+ categories = self.cat_categories[col]
1196
+ # 按固定类别构造 Categorical
1197
+ cats = pd.Categorical(X[col], categories=categories)
1198
+ codes = cats.codes.astype('int64', copy=True) # -1 表示未知或缺失
1199
+ # 未知或缺失映射到额外的“未知”索引 len(categories)
1200
+ codes[codes < 0] = len(categories)
1201
+ X_cat_list.append(codes)
1202
+
1203
+ X_cat_np = np.stack(X_cat_list, axis=1) # 形状 (N, 类别特征数)
1204
+ return X_cat_np
1205
+
1206
+ def _build_train_tensors(self, X_train, y_train, w_train):
1207
+ return self._tensorize_split(X_train, y_train, w_train)
1208
+
1209
+ def _build_val_tensors(self, X_val, y_val, w_val):
1210
+ return self._tensorize_split(X_val, y_val, w_val, allow_none=True)
1211
+
1212
+ def _tensorize_split(self, X, y, w, allow_none: bool = False):
1213
+ if X is None:
1214
+ if allow_none:
1215
+ return None, None, None, None, False
1216
+ raise ValueError("输入特征 X 不能为空。")
1217
+
1218
+ X_num = torch.tensor(
1219
+ X[self.num_cols].to_numpy(dtype=np.float32, copy=True),
1220
+ dtype=torch.float32
1221
+ )
1222
+ if self.cat_cols:
1223
+ X_cat = torch.tensor(self._encode_cats(X), dtype=torch.long)
1224
+ else:
1225
+ X_cat = torch.zeros((X_num.shape[0], 0), dtype=torch.long)
1226
+
1227
+ y_tensor = torch.tensor(
1228
+ y.values, dtype=torch.float32).view(-1, 1) if y is not None else None
1229
+ if y_tensor is None:
1230
+ w_tensor = None
1231
+ elif w is not None:
1232
+ w_tensor = torch.tensor(
1233
+ w.values, dtype=torch.float32).view(-1, 1)
1234
+ else:
1235
+ w_tensor = torch.ones_like(y_tensor)
1236
+ return X_num, X_cat, y_tensor, w_tensor, y is not None
1237
+
1238
def fit(self, X_train, y_train, w_train=None,
        X_val=None, y_val=None, w_val=None, trial=None):
    """Train the FT-Transformer, optionally tracking a validation loss.

    Args:
        X_train, y_train, w_train: Training features, targets and optional
            sample weights. ``X_train`` must hold all numeric and categorical
            columns.
        X_val, y_val, w_val: Optional validation split; validation runs only
            when ``_build_val_tensors`` reports a target.
        trial: Passed through unchanged to ``_train_model``
            (presumably an Optuna trial -- confirm in ``_train_model``).

    The state returned by ``_train_model`` is loaded back into the network
    when a validation split was used and a state was returned.
    """
    # The network is built on the first fit, from the training categories.
    if self.ft is None:
        self._build_model(X_train)

    X_num_train, X_cat_train, y_tensor, w_tensor, _ = self._build_train_tensors(
        X_train, y_train, w_train)
    X_num_val, X_cat_val, y_val_tensor, w_val_tensor, has_val = self._build_val_tensors(
        X_val, y_val, w_val)

    dataset = TabularDataset(
        X_num_train, X_cat_train, y_tensor, w_tensor
    )

    dataloader, accum_steps = self._build_dataloader(
        dataset,
        N=X_num_train.shape[0],
        base_bs_gpu=(16384, 8192, 4096),
        base_bs_cpu=(256, 128),
        min_bs=64,
        target_effective_cuda=4096,
        target_effective_cpu=2048
    )

    # Remember the sampler so its epoch can be set before each epoch under DDP.
    if self.is_ddp_enabled and hasattr(dataloader.sampler, 'set_epoch'):
        self.dataloader_sampler = dataloader.sampler
    else:
        self.dataloader_sampler = None

    optimizer = torch.optim.Adam(
        self.ft.parameters(), lr=self.learning_rate)
    scaler = GradScaler(enabled=(self.device.type == 'cuda'))

    val_dataloader = None
    if has_val:
        val_dataset = TabularDataset(
            X_num_val, X_cat_val, y_val_tensor, w_val_tensor
        )
        # No backward pass during validation, so a larger batch is affordable.
        val_bs = accum_steps * dataloader.batch_size

        # Worker processes are disabled on Windows and capped at 4 elsewhere.
        if os.name == 'nt':
            val_workers = 0
        else:
            val_workers = min(4, os.cpu_count() or 1)

        val_dataloader = DataLoader(
            val_dataset,
            batch_size=val_bs,
            shuffle=False,
            num_workers=val_workers,
            pin_memory=(self.device.type == 'cuda'),
            persistent_workers=val_workers > 0,
        )

    is_data_parallel = isinstance(self.ft, nn.DataParallel)

    def forward_fn(batch):
        X_num_b, X_cat_b, y_b, w_b = batch

        # Under DataParallel the features are left where they are so the
        # wrapper can scatter them itself.
        if not is_data_parallel:
            X_num_b = X_num_b.to(self.device, non_blocking=True)
            X_cat_b = X_cat_b.to(self.device, non_blocking=True)
        y_b = y_b.to(self.device, non_blocking=True)
        w_b = w_b.to(self.device, non_blocking=True)

        y_pred = self.ft(X_num_b, X_cat_b)
        return y_pred, y_b, w_b

    def val_forward_fn():
        # These do not change between batches, so resolve them once.
        task = getattr(self, "task_type", "regression")
        power = getattr(self, "tw_power", 1.5)
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')

        total_loss = 0.0
        total_weight = 0.0
        # Only a Python float is returned, so no autograd graph is needed.
        with torch.no_grad():
            for X_num_b, X_cat_b, y_b, w_b in val_dataloader:
                if not is_data_parallel:
                    X_num_b = X_num_b.to(self.device, non_blocking=True)
                    X_cat_b = X_cat_b.to(self.device, non_blocking=True)
                y_b = y_b.to(self.device, non_blocking=True)
                w_b = w_b.to(self.device, non_blocking=True)

                y_pred = self.ft(X_num_b, X_cat_b)

                # Weighted validation loss, accumulated batch by batch.
                if task == 'classification':
                    losses = bce_loss(y_pred, y_b).view(-1)
                else:
                    # The regression head already ends in Softplus, so only
                    # a numerical floor is applied here.
                    y_pred_clamped = torch.clamp(y_pred, min=1e-6)
                    losses = tweedie_loss(
                        y_pred_clamped, y_b, p=power).view(-1)

                batch_weight_sum = torch.clamp(w_b.sum(), min=EPS)
                batch_weighted_loss_sum = (losses * w_b.view(-1)).sum()

                total_loss += batch_weighted_loss_sum.item()
                total_weight += batch_weight_sum.item()

        return total_loss / max(total_weight, EPS)

    clip_fn = None
    if self.device.type == 'cuda':
        def clip_fn():
            # Gradients must be unscaled before their norm is clipped.
            return (scaler.unscale_(optimizer),
                    clip_grad_norm_(self.ft.parameters(), max_norm=1.0))

    best_state = self._train_model(
        self.ft,
        dataloader,
        accum_steps,
        optimizer,
        scaler,
        forward_fn,
        val_forward_fn if has_val else None,
        apply_softplus=False,
        clip_fn=clip_fn,
        trial=trial
    )

    if has_val and best_state is not None:
        self.ft.load_state_dict(best_state)
1363
+
1364
def predict(self, X_test):
    """Return flat NumPy predictions for ``X_test``.

    ``X_test`` must contain every numeric and categorical column.
    Classification outputs are logits mapped through a sigmoid; regression
    outputs (already passed through Softplus by the model) are floored at 1e-6.
    """
    self.ft.eval()
    X_num, X_cat, _, _, _ = self._tensorize_split(
        X_test, None, None, allow_none=True)

    with torch.no_grad():
        numeric_inputs = X_num.to(self.device, non_blocking=True)
        categorical_inputs = X_cat.to(self.device, non_blocking=True)
        raw = self.ft(numeric_inputs, categorical_inputs).cpu().numpy()

    if self.task_type == 'classification':
        # Convert logits to probabilities.
        scores = 1 / (1 + np.exp(-raw))
    else:
        scores = np.clip(raw, 1e-6, None)
    return scores.ravel()
1383
+
1384
def set_params(self, params: dict):
    """Overwrite existing attributes from ``params`` and return ``self``.

    Mirrors the sklearn-style setter. Structural parameters such as
    ``d_model`` or ``n_heads`` only take effect once the model is built
    again. All keys are checked before anything is assigned, so an unknown
    key can no longer leave the object half-updated.

    Raises:
        ValueError: If a key is not an existing attribute. The message names
            the first such key in iteration order.
    """
    for key in params:
        if not hasattr(self, key):
            raise ValueError(f"Parameter {key} not found in model.")
    for key, value in params.items():
        setattr(self, key, value)
    return self
1395
+
1396
+
1397
+ # =============================================================================
1398
+ # 图神经网络 (GNN) 简化实现
1399
+ # =============================================================================
1400
+
1401
+
1402
class SimpleGraphLayer(nn.Module):
    """One message-passing layer: ``dropout(relu(linear(adj @ x)))``."""

    def __init__(self, in_dim: int, out_dim: int, dropout: float = 0.1):
        super().__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        self.activation = nn.ReLU(inplace=True)
        # Dropout is replaced by a no-op when the rate is not positive.
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        # Message passing with the sparse adjacency: A_hat * X * W.
        aggregated = torch.sparse.mm(adj, x)
        return self.dropout(self.activation(self.linear(aggregated)))
1415
+
1416
+
1417
class SimpleGNN(nn.Module):
    """Stack of ``SimpleGraphLayer`` blocks followed by a linear output head.

    Regression ends in Softplus; classification returns raw logits. The
    adjacency is either passed to ``forward`` or read from ``adj_buffer``.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 64, num_layers: int = 2,
                 dropout: float = 0.1, task_type: str = 'regression'):
        super().__init__()
        # At least one layer; only the first one sees ``input_dim``.
        depth = max(1, num_layers)
        in_dims = [input_dim] + [hidden_dim] * (depth - 1)
        self.layers = nn.ModuleList([
            SimpleGraphLayer(dim_in, hidden_dim, dropout=dropout)
            for dim_in in in_dims
        ])
        self.output = nn.Linear(hidden_dim, 1)
        self.output_act = (
            nn.Identity() if task_type == 'classification' else nn.Softplus()
        )
        self.task_type = task_type
        # The adjacency is kept as a buffer so DataParallel can replicate it.
        self.register_buffer("adj_buffer", torch.empty(0))

    def forward(self, x: torch.Tensor, adj: Optional[torch.Tensor] = None) -> torch.Tensor:
        adj_used = getattr(self, "adj_buffer", None) if adj is None else adj
        if adj_used is None or adj_used.numel() == 0:
            raise RuntimeError("Adjacency is not set for GNN forward.")

        hidden = x
        for layer in self.layers:
            hidden = layer(hidden, adj_used)
        # One more propagation step over the graph before the output head.
        hidden = torch.sparse.mm(adj_used, hidden)
        return self.output_act(self.output(hidden))
1448
+
1449
+
1450
+ class GraphNeuralNetSklearn(TorchTrainerMixin, nn.Module):
1451
def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
             num_layers: int = 2, k_neighbors: int = 10, dropout: float = 0.1,
             learning_rate: float = 1e-3, epochs: int = 100, patience: int = 10,
             task_type: str = 'regression', tweedie_power: float = 1.5,
             use_data_parallel: bool = False, use_ddp: bool = False) -> None:
    """Store the hyper-parameters, pick a device and build the GNN.

    ``k_neighbors`` is clamped to at least 1. The Tweedie power follows
    ``model_nme``: ``'f'`` in the name selects 1.0, otherwise ``'s'`` selects
    2.0, otherwise ``tweedie_power``; it is ``None`` for classification.
    """
    super().__init__()
    self.model_nme = model_nme
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.k_neighbors = max(1, k_neighbors)
    self.dropout = dropout
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.patience = patience
    self.task_type = task_type

    if self.task_type == 'classification':
        chosen_power = None
    elif 'f' in self.model_nme:
        chosen_power = 1.0
    elif 's' in self.model_nme:
        chosen_power = 2.0
    else:
        chosen_power = tweedie_power
    self.tw_power = chosen_power

    self.ddp_enabled = False
    self.local_rank = 0
    self.data_parallel_enabled = False

    # DDP is only attempted on CUDA; if its setup does not succeed the
    # model falls back to the default CUDA device.
    cuda_ok = torch.cuda.is_available()
    if use_ddp and cuda_ok:
        ddp_ok, local_rank, _, _ = DistributedUtils.setup_ddp()
        if ddp_ok:
            self.ddp_enabled = True
            self.local_rank = local_rank
            self.device = torch.device(f'cuda:{local_rank}')
        else:
            self.device = torch.device('cuda')
    elif cuda_ok:
        self.device = torch.device('cuda')
    elif torch.backends.mps.is_available():
        self.device = torch.device('mps')
    else:
        self.device = torch.device('cpu')
    self.use_pyg_knn = self.device.type == 'cuda' and _PYG_AVAILABLE

    self.gnn = SimpleGNN(
        input_dim=self.input_dim,
        hidden_dim=self.hidden_dim,
        num_layers=self.num_layers,
        dropout=self.dropout,
        task_type=self.task_type
    ).to(self.device)

    # DataParallel is only used without DDP, on CUDA, with more than one GPU.
    wants_data_parallel = (
        (not self.ddp_enabled)
        and use_data_parallel
        and (self.device.type == 'cuda')
        and (torch.cuda.device_count() > 1)
    )
    if wants_data_parallel:
        self.data_parallel_enabled = True
        self.gnn = nn.DataParallel(
            self.gnn, device_ids=list(range(torch.cuda.device_count())))
        self.device = torch.device('cuda')

    if self.ddp_enabled:
        self.gnn = DDP(
            self.gnn,
            device_ids=[self.local_rank],
            output_device=self.local_rank,
            find_unused_parameters=False
        )
1520
+
1521
+ def _unwrap_gnn(self) -> nn.Module:
1522
+ return self.gnn.module if isinstance(self.gnn, DDP) else self.gnn
1523
+
1524
+ def _set_adj_buffer(self, adj: torch.Tensor) -> None:
1525
+ base = self._unwrap_gnn()
1526
+ if hasattr(base, "adj_buffer"):
1527
+ base.adj_buffer = adj
1528
+ else:
1529
+ base.register_buffer("adj_buffer", adj)
1530
+
1531
def _build_edge_index_cpu(self, X_np: np.ndarray) -> torch.Tensor:
    """Build a symmetric k-NN edge list with self-loops using scikit-learn.

    Args:
        X_np: Feature matrix with one row per node.

    Returns:
        A ``(2, num_edges)`` long tensor on ``self.device``. For every node
        ``i`` and each neighbour ``j`` (first returned column skipped) it
        holds ``(i, j)`` followed by ``(j, i)``; the self-loops ``(i, i)``
        come last. Duplicate edges are not removed here.
    """
    n_samples = X_np.shape[0]
    k = min(self.k_neighbors, max(1, n_samples - 1))
    n_neighbors = min(k + 1, n_samples)
    # One extra neighbour is requested because each query point is expected
    # to come back as its own nearest neighbour (column 0, dropped below).
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm="auto")
    nbrs.fit(X_np)
    _, indices = nbrs.kneighbors(X_np)

    # Vectorised construction replaces a Python loop over every edge while
    # keeping the same edge order.
    neighbours = np.asarray(indices)[:, 1:]
    sources = np.repeat(np.arange(n_samples), neighbours.shape[1])
    targets = neighbours.reshape(-1)
    # Interleave forward and reverse edges: (i, j), (j, i), ...
    rows = np.stack([sources, targets], axis=1).reshape(-1)
    cols = np.stack([targets, sources], axis=1).reshape(-1)

    # Self-loops guarantee that no node ends up with degree zero.
    self_loops = np.arange(n_samples)
    edge_array = np.stack([
        np.concatenate([rows, self_loops]),
        np.concatenate([cols, self_loops]),
    ])

    return torch.as_tensor(edge_array, dtype=torch.long, device=self.device)
1556
+
1557
def _build_edge_index_gpu(self, X_tensor: torch.Tensor) -> torch.Tensor:
    """Build a symmetric k-NN edge list with self-loops using PyG helpers.

    Raises:
        RuntimeError: If PyG k-NN support is disabled or its helpers are
            unavailable.
    """
    # Defensive check: callers are expected to test ``use_pyg_knn`` first.
    helpers_missing = (
        not self.use_pyg_knn
        or knn_graph is None
        or add_self_loops is None
        or to_undirected is None
    )
    if helpers_missing:
        raise RuntimeError(
            "GPU graph builder requested but PyG is unavailable.")

    n_samples = X_tensor.size(0)
    k = min(self.k_neighbors, max(1, n_samples - 1))

    # knn_graph works on the tensor directly, avoiding the CPU graph builder.
    directed_edges = knn_graph(
        X_tensor,
        k=k,
        loop=False
    )
    undirected_edges = to_undirected(directed_edges, num_nodes=n_samples)
    edge_index, _ = add_self_loops(undirected_edges, num_nodes=n_samples)
    return edge_index
1575
+
1576
+ def _normalized_adj(self, edge_index: torch.Tensor, num_nodes: int) -> torch.Tensor:
1577
+ values = torch.ones(edge_index.shape[1], device=self.device)
1578
+ adj = torch.sparse_coo_tensor(
1579
+ edge_index.to(self.device), values, (num_nodes, num_nodes))
1580
+ adj = adj.coalesce()
1581
+
1582
+ deg = torch.sparse.sum(adj, dim=1).to_dense()
1583
+ deg_inv_sqrt = torch.pow(deg + 1e-8, -0.5)
1584
+ row, col = adj.indices()
1585
+ norm_values = deg_inv_sqrt[row] * adj.values() * deg_inv_sqrt[col]
1586
+ adj_norm = torch.sparse_coo_tensor(
1587
+ adj.indices(), norm_values, size=adj.shape)
1588
+ return adj_norm
1589
+
1590
+ def _tensorize_split(self, X, y, w, allow_none: bool = False):
1591
+ if X is None and allow_none:
1592
+ return None, None, None
1593
+ X_np = X.values.astype(np.float32)
1594
+ X_tensor = torch.tensor(X_np, dtype=torch.float32, device=self.device)
1595
+ if y is None:
1596
+ y_tensor = None
1597
+ else:
1598
+ y_tensor = torch.tensor(
1599
+ y.values, dtype=torch.float32, device=self.device).view(-1, 1)
1600
+ if w is None:
1601
+ w_tensor = torch.ones(
1602
+ (len(X), 1), dtype=torch.float32, device=self.device)
1603
+ else:
1604
+ w_tensor = torch.tensor(
1605
+ w.values, dtype=torch.float32, device=self.device).view(-1, 1)
1606
+ return X_tensor, y_tensor, w_tensor
1607
+
1608
+ def _build_graph_from_df(self, X_df: pd.DataFrame, X_tensor: Optional[torch.Tensor] = None) -> torch.Tensor:
1609
+ if X_tensor is None:
1610
+ X_tensor = torch.tensor(
1611
+ X_df.values.astype(np.float32),
1612
+ dtype=torch.float32,
1613
+ device=self.device
1614
+ )
1615
+ if self.use_pyg_knn:
1616
+ edge_index = self._build_edge_index_gpu(X_tensor)
1617
+ else:
1618
+ edge_index = self._build_edge_index_cpu(
1619
+ X_df.values.astype(np.float32))
1620
+ return self._normalized_adj(edge_index, X_df.shape[0])
1621
+
1622
def fit(self, X_train, y_train, w_train=None,
        X_val=None, y_val=None, w_val=None,
        trial: Optional[optuna.trial.Trial] = None):
    """Train the GNN full-batch, optionally validating after every epoch.

    One graph is built from the training features and, when both ``X_val``
    and ``y_val`` are given, a separate one from the validation features.
    Every epoch performs a single forward/backward pass over the whole
    training split. With validation data, the validation loss is fed to
    ``_early_stop_update`` and (if ``trial`` is given) reported to Optuna;
    the best state tracked by the early-stopping helper is loaded back into
    the model at the end.

    Args:
        X_train, y_train, w_train: training features, target and optional
            sample weights (passed through ``_tensorize_split``).
        X_val, y_val, w_val: optional validation split.
        trial: optional Optuna trial used for reporting and pruning.

    Raises:
        optuna.TrialPruned: when ``trial.should_prune()`` returns True.
    """

    X_train_tensor, y_train_tensor, w_train_tensor = self._tensorize_split(
        X_train, y_train, w_train, allow_none=False)
    has_val = X_val is not None and y_val is not None
    if has_val:
        X_val_tensor, y_val_tensor, w_val_tensor = self._tensorize_split(
            X_val, y_val, w_val, allow_none=False)
    else:
        X_val_tensor = y_val_tensor = w_val_tensor = None

    adj_train = self._build_graph_from_df(X_train, X_train_tensor)
    adj_val = self._build_graph_from_df(
        X_val, X_val_tensor) if has_val else None
    # DataParallel needs the adjacency matrix cached on the model so that it
    # is not scattered across replicas.
    self._set_adj_buffer(adj_train)

    base_gnn = self._unwrap_gnn()
    optimizer = torch.optim.Adam(
        base_gnn.parameters(), lr=self.learning_rate)
    # Mixed-precision scaling is only enabled on CUDA devices.
    scaler = GradScaler(enabled=(self.device.type == 'cuda'))

    best_loss = float('inf')
    best_state = None
    patience_counter = 0

    for epoch in range(1, self.epochs + 1):
        self.gnn.train()
        optimizer.zero_grad()
        with autocast(enabled=(self.device.type == 'cuda')):
            # In data-parallel mode the adjacency comes from the buffer set
            # above, so only the features are passed.
            if self.data_parallel_enabled:
                y_pred = self.gnn(X_train_tensor)
            else:
                y_pred = self.gnn(X_train_tensor, adj_train)
            loss = self._compute_weighted_loss(
                y_pred, y_train_tensor, w_train_tensor, apply_softplus=False)
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to the real
        # gradient magnitudes.
        scaler.unscale_(optimizer)
        clip_grad_norm_(self.gnn.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        if has_val:
            self.gnn.eval()
            if self.data_parallel_enabled and adj_val is not None:
                self._set_adj_buffer(adj_val)
            with torch.no_grad(), autocast(enabled=(self.device.type == 'cuda')):
                if self.data_parallel_enabled:
                    y_val_pred = self.gnn(X_val_tensor)
                else:
                    y_val_pred = self.gnn(X_val_tensor, adj_val)
                val_loss = self._compute_weighted_loss(
                    y_val_pred, y_val_tensor, w_val_tensor, apply_softplus=False)
            if self.data_parallel_enabled:
                # Restore the training adjacency matrix.
                self._set_adj_buffer(adj_train)

            # "adj_buffer" is excluded from the tracked state; presumably it
            # is data-dependent rather than a learned weight.
            best_loss, best_state, patience_counter, stop_training = self._early_stop_update(
                val_loss, best_loss, best_state, patience_counter, base_gnn,
                ignore_keys=["adj_buffer"])

            if trial is not None:
                trial.report(val_loss, epoch)
                if trial.should_prune():
                    raise optuna.TrialPruned()
            if stop_training:
                break

    # strict=False because the stored state omits the ignored buffer key.
    if best_state is not None:
        base_gnn.load_state_dict(best_state, strict=False)
1694
+
1695
def predict(self, X: pd.DataFrame) -> np.ndarray:
    """Score ``X`` and return a flat NumPy array of predictions.

    A fresh graph is built from ``X`` itself. Classification outputs are
    passed through a sigmoid; all other task types are clipped from below
    at 1e-6.
    """
    self.gnn.eval()
    features, _, _ = self._tensorize_split(X, None, None, allow_none=False)
    adjacency = self._build_graph_from_df(X, features)

    # In data-parallel mode the adjacency is supplied through the model's
    # buffer instead of as a forward argument.
    if self.data_parallel_enabled:
        self._set_adj_buffer(adjacency)

    with torch.no_grad():
        if self.data_parallel_enabled:
            raw = self.gnn(features)
        else:
            raw = self.gnn(features, adjacency)
        scores = raw.cpu().numpy()

    if self.task_type == 'classification':
        scores = 1 / (1 + np.exp(-scores))
    else:
        scores = np.clip(scores, 1e-6, None)
    return scores.ravel()
1712
+
1713
def set_params(self, params: Dict[str, Any]):
    """Assign hyper-parameters and rebuild the GNN backbone.

    All keys are validated before any attribute is written, so a bad key
    can no longer leave the object half-updated.

    Args:
        params: mapping of existing attribute names to new values.

    Returns:
        ``self``, to allow chaining.

    Raises:
        ValueError: if a key is not an existing attribute; nothing has been
            modified when this is raised.
    """
    for key in params:
        if not hasattr(self, key):
            raise ValueError(f"Parameter {key} not found in GNN model.")

    for key, value in params.items():
        setattr(self, key, value)

    # Structural parameters may have changed, so the network is rebuilt
    # from scratch (any previously trained weights are discarded).
    self.gnn = SimpleGNN(
        input_dim=self.input_dim,
        hidden_dim=self.hidden_dim,
        num_layers=self.num_layers,
        dropout=self.dropout,
        task_type=self.task_type
    ).to(self.device)
    return self
1728
+
1729
+
1730
+ # ===== 基础组件与训练封装 =====================================================
1731
+
1732
+ # =============================================================================
1733
+ # 配置、预处理与训练器基类
1734
+ # =============================================================================
1735
@dataclass
class BayesOptConfig:
    """Settings shared by the preprocessing step and the model trainers."""

    # Model name; NOTE(review): presumably the same value trainers read as
    # ctx.model_nme for file names - confirm against BayesOptModel.
    model_nme: str
    # Name of the response (target) column.
    resp_nme: str
    # Name of the sample-weight column.
    weight_nme: str
    # Names of the feature columns.
    factor_nmes: List[str]
    # Task type; code in this file compares it against 'classification'
    # and 'regression'.
    task_type: str = 'regression'
    # Optional binary response column; when set, the preprocessor also
    # derives a weighted 'w_binary_act' column.
    binary_resp_nme: Optional[str] = None
    # Subset of factor_nmes treated as categorical (cast to 'category' and
    # one-hot encoded by the preprocessor).
    cate_list: Optional[List[str]] = None
    # Hold-out proportion; NOTE(review): trainers read ctx.prop_test as the
    # ShuffleSplit test_size - confirm it mirrors this field.
    prop_test: float = 0.25
    # Random seed; None leaves seeding unspecified.
    rand_seed: Optional[int] = None
    # Number of training epochs for the neural trainers.
    epochs: int = 100
    # Whether GPU execution is requested.
    use_gpu: bool = True
    # Forwarded to ResNetSklearn / FTTransformerSklearn as use_data_parallel.
    use_resn_data_parallel: bool = False
    use_ft_data_parallel: bool = False
    # Forwarded to ResNetSklearn / FTTransformerSklearn as use_ddp.
    use_resn_ddp: bool = False
    use_ft_ddp: bool = False
    # GNN counterparts of the flags above (consumer not shown in this part
    # of the file - TODO confirm).
    use_gnn_data_parallel: bool = False
    use_gnn_ddp: bool = False
    # Optuna storage / study-name prefix; not referenced by the tuning code
    # visible here - TODO confirm where they are consumed.
    optuna_storage: Optional[str] = None
    optuna_study_prefix: Optional[str] = None
1756
+
1757
+
1758
class OutputManager:
    """Resolve output paths for results, plots and models under one root."""

    def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
        # An empty or missing root falls back to the current working directory.
        base = Path(root) if root else Path(os.getcwd())
        self.root = base
        self.model_name = model_name
        self.plot_dir = base / 'plot'
        self.result_dir = base / 'Results'
        self.model_dir = base / 'model'

    def _prepare(self, path: Path) -> str:
        """Run ``ensure_parent_dir`` on the path and return it as a string."""
        as_text = str(path)
        ensure_parent_dir(as_text)
        return as_text

    def plot_path(self, filename: str) -> str:
        """Path of ``filename`` inside the plot directory."""
        return self._prepare(self.plot_dir / filename)

    def result_path(self, filename: str) -> str:
        """Path of ``filename`` inside the results directory."""
        return self._prepare(self.result_dir / filename)

    def model_path(self, filename: str) -> str:
        """Path of ``filename`` inside the model directory."""
        return self._prepare(self.model_dir / filename)
1780
+
1781
+
1782
class DatasetPreprocessor:
    """Prepare the train/test views shared by all trainers.

    ``run()`` adds weighted-actual columns, clips the training response,
    casts categorical factors, and produces one-hot encoded, standardised
    frames (``train_oht_scl_data`` / ``test_oht_scl_data``).
    """

    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
                 config: "BayesOptConfig") -> None:
        self.config = config
        # Deep copies so the caller's frames are never modified.
        self.train_data = train_df.copy(deep=True)
        self.test_data = test_df.copy(deep=True)
        self.num_features: List[str] = []
        self.train_oht_scl_data: Optional[pd.DataFrame] = None
        self.test_oht_scl_data: Optional[pd.DataFrame] = None
        self.var_nmes: List[str] = []
        self.cat_categories_for_shap: Dict[str, List[Any]] = {}

    def run(self) -> "DatasetPreprocessor":
        """Execute the preprocessing pipeline and return ``self``."""
        cfg = self.config
        resp_col = cfg.resp_nme
        weight_col = cfg.weight_nme

        # Weighted actuals are computed first (i.e. from the unclipped
        # response); later plotting and validation rely on these columns.
        self.train_data.loc[:, 'w_act'] = (
            self.train_data[resp_col] * self.train_data[weight_col])
        self.test_data.loc[:, 'w_act'] = (
            self.test_data[resp_col] * self.test_data[weight_col])
        if cfg.binary_resp_nme:
            self.train_data.loc[:, 'w_binary_act'] = (
                self.train_data[cfg.binary_resp_nme] * self.train_data[weight_col])
            self.test_data.loc[:, 'w_binary_act'] = (
                self.test_data[cfg.binary_resp_nme] * self.test_data[weight_col])

        # Clip the training response at its 99.9th percentile so extreme
        # observations do not dominate the loss.
        upper_cap = self.train_data[resp_col].quantile(0.999)
        self.train_data[resp_col] = self.train_data[resp_col].clip(
            upper=upper_cap)

        cate_list = list(cfg.cate_list or [])
        for cate in cate_list:
            self.train_data[cate] = self.train_data[cate].astype('category')
            self.test_data[cate] = self.test_data[cate].astype('category')
            self.cat_categories_for_shap[cate] = list(
                self.train_data[cate].cat.categories)

        self.num_features = [
            nme for nme in cfg.factor_nmes if nme not in cate_list]

        model_cols = cfg.factor_nmes + [weight_col] + [resp_col]
        train_oht = self.train_data[model_cols].copy()
        test_oht = self.test_data[model_cols].copy()

        # Encode the test categoricals with the *training* category levels.
        # Otherwise drop_first may drop a different level on the test side,
        # and the reindex below would zero-fill it, silently encoding such
        # rows as the training baseline level. Levels unseen in training
        # become NaN and therefore all-zero dummies.
        for cate in cate_list:
            test_oht[cate] = test_oht[cate].astype(
                self.train_data[cate].dtype)

        train_oht = pd.get_dummies(
            train_oht,
            columns=cate_list,
            drop_first=True,
            dtype=np.int8
        )
        test_oht = pd.get_dummies(
            test_oht,
            columns=cate_list,
            drop_first=True,
            dtype=np.int8
        )

        for num_chr in self.num_features:
            # Standardise each numeric column (fit on train only) so that
            # features share a scale; neural nets converge poorly otherwise.
            scaler = StandardScaler()
            train_oht[num_chr] = scaler.fit_transform(
                train_oht[num_chr].values.reshape(-1, 1))
            test_oht[num_chr] = scaler.transform(
                test_oht[num_chr].values.reshape(-1, 1))

        # Safety net: align test columns with train, zero-filling any gaps.
        test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)
        self.train_oht_scl_data = train_oht
        self.test_oht_scl_data = test_oht

        # Keep the frame's column order. Building this through a set made
        # the feature order depend on string hashing, i.e. vary between
        # processes, which breaks reloading positional models.
        excluded = {weight_col, resp_col}
        self.var_nmes = [
            col for col in train_oht.columns if col not in excluded]
        return self
1853
+
1854
+ # =============================================================================
1855
+ # 训练器体系
1856
+ # =============================================================================
1857
+
1858
+
1859
class TrainerBase:
    """Shared tuning, persistence and prediction-caching logic for trainers.

    Subclasses supply ``cross_val(trial)`` (the default Optuna objective)
    and ``train()``.
    """

    def __init__(self, context: "BayesOptModel", label: str, model_name_prefix: str) -> None:
        self.ctx = context
        self.label = label
        self.model_name_prefix = model_name_prefix
        self.model = None
        self.best_params: Optional[Dict[str, Any]] = None
        self.best_trial = None

    @property
    def config(self) -> "BayesOptConfig":
        """Configuration object held by the context."""
        return self.ctx.config

    @property
    def output(self) -> "OutputManager":
        """Output path manager held by the context."""
        return self.ctx.output_manager

    def _get_model_filename(self) -> str:
        """File name of the persisted model (.pkl for Xgboost/GLM, else .pth)."""
        ext = 'pkl' if self.label in ['Xgboost', 'GLM'] else 'pth'
        return f'01_{self.ctx.model_nme}_{self.model_name_prefix}.{ext}'

    def tune(self, max_evals: int, objective_fn=None) -> None:
        """Run an Optuna study and store the best parameters.

        Args:
            max_evals: number of trials passed to ``study.optimize``.
            objective_fn: objective taking a trial; defaults to
                ``self.cross_val``.

        Side effects: sets ``best_params`` / ``best_trial`` and writes the
        best parameters to a CSV in the results directory.
        """
        if objective_fn is None:
            # Subclasses that do not pass an objective are tuned on cross_val.
            objective_fn = self.cross_val

        total_trials = max(1, int(max_evals))
        progress_counter = {"count": 0}

        def objective_wrapper(trial: optuna.trial.Trial) -> float:
            should_log = DistributedUtils.is_main_process()
            if should_log:
                current_idx = progress_counter["count"] + 1
                print(
                    f"[Optuna][{self.label}] Trial {current_idx}/{total_trials} started "
                    f"(trial_id={trial.number})."
                )
            try:
                result = objective_fn(trial)
            except RuntimeError as exc:
                # An out-of-memory failure prunes the trial instead of
                # aborting the whole study.
                if "out of memory" in str(exc).lower():
                    print(
                        f"[Optuna][{self.label}] OOM detected. Pruning trial and clearing CUDA cache."
                    )
                    self._clean_gpu()
                    raise optuna.TrialPruned() from exc
                raise
            finally:
                self._clean_gpu()
                if should_log:
                    progress_counter["count"] = progress_counter["count"] + 1
                    # NOTE(review): falls back to "OK" when the trial object
                    # exposes no ``state`` - confirm this is the intended
                    # status on failed/pruned trials.
                    trial_state = getattr(trial, "state", None)
                    state_repr = getattr(trial_state, "name", "OK")
                    print(
                        f"[Optuna][{self.label}] Trial {progress_counter['count']}/{total_trials} finished "
                        f"(status={state_repr})."
                    )
            return result

        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=self.ctx.rand_seed)
        )
        study.optimize(objective_wrapper, n_trials=max_evals)
        self.best_params = study.best_params
        self.best_trial = study.best_trial

        # Persist the best parameters as CSV so runs can be reproduced.
        params_path = self.output.result_path(
            f'{self.ctx.model_nme}_bestparams_{self.label.lower()}.csv'
        )
        pd.DataFrame(self.best_params, index=[0]).to_csv(params_path)

    def train(self) -> None:
        """Fit the final model; must be implemented by subclasses."""
        raise NotImplementedError

    def save(self) -> None:
        """Persist ``self.model`` to the model directory (no-op if unset)."""
        if self.model is None:
            print(f"[save] Warning: No model to save for {self.label}")
            return

        path = self.output.model_path(self._get_model_filename())
        if self.label in ['Xgboost', 'GLM']:
            joblib.dump(self.model, path)
        else:
            # Historical behaviour: wrappers exposing ``resnet`` store only
            # the state_dict, everything else is pickled as a whole object.
            if hasattr(self.model, 'resnet'):
                torch.save(self.model.resnet.state_dict(), path)
            else:
                torch.save(self.model, path)

    @staticmethod
    def _load_pickled_torch_object(path: str):
        """Load a fully pickled object written by ``torch.save``.

        ``weights_only=False`` is required because the file holds an
        arbitrary Python object rather than a plain state_dict; recent
        PyTorch releases default to ``weights_only=True`` and would refuse
        it. SECURITY: this unpickles arbitrary code - only load model files
        from trusted locations.
        """
        try:
            return torch.load(path, map_location='cpu', weights_only=False)
        except TypeError:
            # Older torch versions do not know the weights_only keyword.
            return torch.load(path, map_location='cpu')

    def load(self) -> None:
        """Load the persisted model into ``self.model`` if the file exists."""
        path = self.output.model_path(self._get_model_filename())
        if not os.path.exists(path):
            print(f"[load] Warning: Model file not found: {path}")
            return

        if self.label in ['Xgboost', 'GLM']:
            self.model = joblib.load(path)
        else:
            if self.label == 'ResNet' or self.label == 'ResNetClassifier':
                # ResNet must rebuild its skeleton from context-dependent
                # structure parameters, so loading is left to the subclass.
                pass
            else:
                # Whole-object checkpoints are loaded on CPU first and then
                # moved to the target device.
                loaded = self._load_pickled_torch_object(path)
                self._move_to_device(loaded)
                self.model = loaded

    def _move_to_device(self, model_obj):
        """Move a loaded wrapper and its known sub-modules to CUDA if available."""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if hasattr(model_obj, 'device'):
            model_obj.device = device
        if hasattr(model_obj, 'to'):
            model_obj.to(device)
        # Wrapped networks (ft / resnet / gnn) are moved as well.
        if hasattr(model_obj, 'ft'):
            model_obj.ft.to(device)
        if hasattr(model_obj, 'resnet'):
            model_obj.resnet.to(device)
        if hasattr(model_obj, 'gnn'):
            model_obj.gnn.to(device)

    def _clean_gpu(self):
        """Run garbage collection and, when CUDA is available, free cached memory."""
        gc.collect()
        if torch.cuda.is_available():
            device = None
            try:
                device = getattr(self, "device", None)
            except Exception:
                device = None
            if isinstance(device, torch.device):
                try:
                    torch.cuda.set_device(device)
                except Exception:
                    # Best effort: selecting the device is optional here.
                    pass
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            torch.cuda.synchronize()

    def _predict_and_cache(self,
                           model,
                           pred_prefix: str,
                           use_oht: bool = False,
                           design_fn=None) -> None:
        """Predict on train and test data and store results on the context.

        Writes ``pred_<prefix>`` and ``w_pred_<prefix>`` (prediction times
        weight) columns into ``ctx.train_data`` and ``ctx.test_data``.

        Args:
            model: object exposing ``predict``.
            pred_prefix: suffix used in the generated column names.
            use_oht: use the one-hot/scaled frames instead of the raw ones.
            design_fn: optional callable ``design_fn(train=bool)`` returning
                the design matrix; takes precedence over ``use_oht``.
        """
        if design_fn:
            X_train = design_fn(train=True)
            X_test = design_fn(train=False)
        elif use_oht:
            X_train = self.ctx.train_oht_scl_data[self.ctx.var_nmes]
            X_test = self.ctx.test_oht_scl_data[self.ctx.var_nmes]
        else:
            X_train = self.ctx.train_data[self.ctx.factor_nmes]
            X_test = self.ctx.test_data[self.ctx.factor_nmes]

        preds_train = model.predict(X_train)
        preds_test = model.predict(X_test)

        self.ctx.train_data[f'pred_{pred_prefix}'] = preds_train
        self.ctx.test_data[f'pred_{pred_prefix}'] = preds_test
        self.ctx.train_data[f'w_pred_{pred_prefix}'] = (
            self.ctx.train_data[f'pred_{pred_prefix}'] *
            self.ctx.train_data[self.ctx.weight_nme]
        )
        self.ctx.test_data[f'w_pred_{pred_prefix}'] = (
            self.ctx.test_data[f'pred_{pred_prefix}'] *
            self.ctx.test_data[self.ctx.weight_nme]
        )

    def _fit_predict_cache(self,
                           model,
                           X_train,
                           y_train,
                           sample_weight,
                           pred_prefix: str,
                           use_oht: bool = False,
                           design_fn=None,
                           fit_kwargs: Optional[Dict[str, Any]] = None,
                           sample_weight_arg: Optional[str] = 'sample_weight') -> None:
        """Fit ``model``, register the trainer label, then cache predictions.

        ``sample_weight`` is injected into the fit call under the keyword
        ``sample_weight_arg`` unless that keyword is already present in
        ``fit_kwargs`` or either value is empty.
        """
        # Copy so the caller's dict is not mutated by setdefault.
        fit_kwargs = fit_kwargs.copy() if fit_kwargs else {}
        if sample_weight is not None and sample_weight_arg:
            fit_kwargs.setdefault(sample_weight_arg, sample_weight)
        model.fit(X_train, y_train, **fit_kwargs)
        self.ctx.model_label.append(self.label)
        self._predict_and_cache(
            model, pred_prefix, use_oht=use_oht, design_fn=design_fn)
2049
+
2050
+
2051
class XGBTrainer(TrainerBase):
    """Trainer that tunes and fits an ``xgb.XGBRegressor``."""

    def __init__(self, context: "BayesOptModel") -> None:
        super().__init__(context, 'Xgboost', 'Xgboost')
        self.model: Optional[xgb.XGBRegressor] = None

    def _build_estimator(self) -> xgb.XGBRegressor:
        """Create an untuned regressor configured for GPU or CPU."""
        # NOTE(review): 'gpu_hist' / 'predictor' / 'gpu_id' look like the
        # pre-2.0 XGBoost GPU options - confirm the supported version.
        params = {
            'objective': self.ctx.obj,
            'random_state': self.ctx.rand_seed,
            'subsample': 0.9,
            'tree_method': 'gpu_hist' if self.ctx.use_gpu else 'hist',
            'enable_categorical': True,
            'predictor': 'gpu_predictor' if self.ctx.use_gpu else 'cpu_predictor',
        }
        if self.ctx.use_gpu:
            params['gpu_id'] = 0
            print(">>> XGBoost using GPU ID: 0 (Single GPU Mode)")
        return xgb.XGBRegressor(**params)

    def cross_val(self, trial: optuna.trial.Trial) -> float:
        """Optuna objective: mean cross-validated Tweedie deviance."""
        # Dict literals evaluate in order, so the suggest calls run in the
        # same sequence as the keys are listed.
        search = {
            'learning_rate': trial.suggest_float(
                'learning_rate', 1e-5, 1e-1, log=True),
            'gamma': trial.suggest_float('gamma', 0, 10000),
            'max_depth': trial.suggest_int('max_depth', 3, 25),
            'n_estimators': trial.suggest_int(
                'n_estimators', 10, 500, step=10),
            'min_child_weight': trial.suggest_int(
                'min_child_weight', 100, 10000, step=100),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-10, 1, log=True),
            'reg_lambda': trial.suggest_float(
                'reg_lambda', 1e-10, 1, log=True),
        }

        # Deviance power used for scoring: tuned for Tweedie, fixed for
        # Poisson (1) and Gamma (2), 1.5 for any other objective.
        is_tweedie = self.ctx.obj == 'reg:tweedie'
        if is_tweedie:
            metric_power = trial.suggest_float('tweedie_variance_power', 1, 2)
        else:
            metric_power = {'count:poisson': 1, 'reg:gamma': 2}.get(
                self.ctx.obj, 1.5)

        estimator = self._build_estimator()
        if is_tweedie:
            search['tweedie_variance_power'] = metric_power
        estimator.set_params(**search)

        # One worker on GPU; otherwise one per fold implied by prop_test.
        workers = 1 if self.ctx.use_gpu else int(1 / self.ctx.prop_test)
        deviance_scorer = make_scorer(
            mean_tweedie_deviance,
            power=metric_power,
            greater_is_better=False)
        fold_scores = cross_val_score(
            estimator,
            self.ctx.train_data[self.ctx.factor_nmes],
            self.ctx.train_data[self.ctx.resp_nme].values,
            fit_params=self.ctx.fit_params,
            cv=self.ctx.cv,
            scoring=deviance_scorer,
            error_score='raise',
            n_jobs=workers
        )
        # The scorer negates the deviance; flip the sign back for minimising.
        return -fold_scores.mean()

    def train(self) -> None:
        """Fit the final model with the tuned parameters and cache predictions."""
        if not self.best_params:
            raise RuntimeError('请先运行 tune() 以获得 XGB 最优参数。')
        estimator = self._build_estimator()
        self.model = estimator
        estimator.set_params(**self.best_params)
        frame = self.ctx.train_data
        self._fit_predict_cache(
            estimator,
            frame[self.ctx.factor_nmes],
            frame[self.ctx.resp_nme].values,
            sample_weight=None,
            pred_prefix='xgb',
            fit_kwargs=self.ctx.fit_params,
            sample_weight_arg=None  # weights already travel in fit_kwargs
        )
        self.ctx.xgb_best = self.model
2133
+
2134
+
2135
class GLMTrainer(TrainerBase):
    """Trainer that tunes and fits a regularised statsmodels GLM."""

    def __init__(self, context: "BayesOptModel") -> None:
        super().__init__(context, 'GLM', 'GLM')
        self.model = None

    def _select_family(self, tweedie_power: Optional[float] = None):
        """Pick the statsmodels family from the task type and objective.

        Classification -> Binomial, 'count:poisson' -> Poisson,
        'reg:gamma' -> Gamma, anything else -> Tweedie with a log link and
        ``tweedie_power`` (1.5 when not given).
        """
        if self.ctx.task_type == 'classification':
            return sm.families.Binomial()
        if self.ctx.obj == 'count:poisson':
            return sm.families.Poisson()
        if self.ctx.obj == 'reg:gamma':
            return sm.families.Gamma()
        power = tweedie_power if tweedie_power is not None else 1.5
        # Use the Log link class; the lowercase ``links.log`` alias is
        # deprecated in statsmodels and slated for removal.
        return sm.families.Tweedie(var_power=power, link=sm.families.links.Log())

    def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return the feature columns of ``data`` with an intercept column added."""
        X = data[self.ctx.var_nmes]
        return sm.add_constant(X, has_constant='add')

    def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
        """Deviance power matching ``family`` for evaluation."""
        if isinstance(family, sm.families.Poisson):
            return 1.0
        if isinstance(family, sm.families.Gamma):
            return 2.0
        if isinstance(family, sm.families.Tweedie):
            return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
        return 1.5

    def cross_val(self, trial: optuna.trial.Trial) -> float:
        """Optuna objective: mean validation loss over the context's CV folds.

        Uses weighted log-loss for classification and weighted Tweedie
        deviance otherwise.
        """
        alpha = trial.suggest_float('alpha', 1e-6, 1e2, log=True)
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
        tweedie_power = None
        if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie':
            tweedie_power = trial.suggest_float('tweedie_power', 1.0, 2.0)

        X_all = self._prepare_design(self.ctx.train_oht_scl_data)
        y_all = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
        w_all = self.ctx.train_oht_scl_data[self.ctx.weight_nme]

        scores = []
        for train_idx, val_idx in self.ctx.cv.split(X_all):
            X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
            y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]
            w_train, w_val = w_all.iloc[train_idx], w_all.iloc[val_idx]

            family = self._select_family(tweedie_power)
            glm = sm.GLM(y_train, X_train, family=family,
                         freq_weights=w_train)
            result = glm.fit_regularized(
                alpha=alpha, L1_wt=l1_ratio, maxiter=200)

            y_pred = result.predict(X_val)
            if self.ctx.task_type == 'classification':
                # Keep probabilities strictly inside (0, 1) for log-loss.
                y_pred = np.clip(y_pred, EPS, 1 - EPS)
                fold_score = log_loss(
                    y_val, y_pred, sample_weight=w_val)
            else:
                # Deviance needs strictly positive predictions.
                y_pred = np.maximum(y_pred, EPS)
                fold_score = mean_tweedie_deviance(
                    y_val,
                    y_pred,
                    sample_weight=w_val,
                    power=self._metric_power(family, tweedie_power)
                )
            scores.append(fold_score)

        return float(np.mean(scores))

    def train(self) -> None:
        """Fit the final regularised GLM with the tuned parameters.

        Raises:
            RuntimeError: if ``tune()`` has not produced ``best_params``.
        """
        if not self.best_params:
            raise RuntimeError('请先运行 tune() 以获得 GLM 最优参数。')
        tweedie_power = self.best_params.get('tweedie_power')
        family = self._select_family(tweedie_power)

        X_train = self._prepare_design(self.ctx.train_oht_scl_data)
        y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
        w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]

        glm = sm.GLM(y_train, X_train, family=family,
                     freq_weights=w_train)
        self.model = glm.fit_regularized(
            alpha=self.best_params['alpha'],
            L1_wt=self.best_params['l1_ratio'],
            maxiter=300
        )

        self.ctx.glm_best = self.model
        self.ctx.model_label += [self.label]
        # Predictions need the same intercept-augmented design matrix.
        self._predict_and_cache(
            self.model,
            'glm',
            design_fn=lambda train: self._prepare_design(
                self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
            )
        )
2231
+
2232
+
2233
class ResNetTrainer(TrainerBase):
    """Trainer that tunes, fits, and reloads a ``ResNetSklearn`` model."""

    def __init__(self, context: "BayesOptModel") -> None:
        label = 'ResNetClassifier' if context.task_type == 'classification' else 'ResNet'
        super().__init__(context, label, 'ResNet')
        self.model: Optional[ResNetSklearn] = None

    # ========= Hold-out objective used by Optuna =========
    def cross_val(self, trial: optuna.trial.Trial) -> float:
        """Optuna objective evaluated on a single hold-out split.

        Memory is kept in check by training on a subsample, building a
        fresh ``ResNetSklearn`` per call, and moving it to CPU and deleting
        it (followed by ``_clean_gpu``) once the call finishes.
        """
        # 1. Search space.
        learning_rate = trial.suggest_float(
            'learning_rate', 1e-6, 1e-2, log=True
        )
        hidden_dim = trial.suggest_int('hidden_dim', 8, 32, step=2)
        block_num = trial.suggest_int('block_num', 2, 10)

        # Tweedie power: tuned for 'reg:tweedie', fixed for Poisson/Gamma,
        # 1.5 otherwise; unused (None) outside regression.
        tw_power = None
        if self.ctx.task_type == 'regression':
            if self.ctx.obj == 'reg:tweedie':
                tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
            else:
                tw_power = {'count:poisson': 1.0, 'reg:gamma': 2.0}.get(
                    self.ctx.obj, 1.5)

        # 2. Subsample the one-hot/scaled training data for tuning.
        sample = self.ctx.train_oht_scl_data
        row_cap = min(100000, int(len(sample) / 5))
        if len(sample) > row_cap:
            sample = sample.sample(
                row_cap,
                random_state=self.ctx.rand_seed
            )

        X_all = sample[self.ctx.var_nmes]
        y_all = sample[self.ctx.resp_nme]
        w_all = sample[self.ctx.weight_nme]

        # A local ShuffleSplit keeps positional indices valid for the
        # subsample; only its first split is used (hold-out, not K-fold).
        splitter = ShuffleSplit(
            n_splits=int(1 / self.ctx.prop_test),
            test_size=self.ctx.prop_test,
            random_state=self.ctx.rand_seed
        )
        fit_rows, holdout_rows = next(splitter.split(X_all))

        X_fit, y_fit, w_fit = (
            X_all.iloc[fit_rows], y_all.iloc[fit_rows], w_all.iloc[fit_rows])
        X_hold, y_hold, w_hold = (
            X_all.iloc[holdout_rows], y_all.iloc[holdout_rows],
            w_all.iloc[holdout_rows])

        # 3. Build the candidate network.
        cv_net = ResNetSklearn(
            model_nme=self.ctx.model_nme,
            input_dim=X_all.shape[1],
            hidden_dim=hidden_dim,
            block_num=block_num,
            task_type=self.ctx.task_type,
            epochs=self.ctx.epochs,
            tweedie_power=tw_power,
            learning_rate=learning_rate,
            patience=5,
            use_layernorm=True,
            dropout=0.1,
            residual_scale=0.1,
            use_data_parallel=self.ctx.config.use_resn_data_parallel,
            use_ddp=self.ctx.config.use_resn_ddp
        )

        losses = []
        try:
            # 4. Train with the hold-out split as validation data.
            cv_net.fit(
                X_fit, y_fit, w_fit,
                X_hold, y_hold, w_hold,
                trial=trial
            )

            # 5./6. Predict on the hold-out rows and score them.
            holdout_pred = cv_net.predict(X_hold)
            if self.ctx.task_type == 'regression':
                loss = mean_tweedie_deviance(
                    y_hold,
                    holdout_pred,
                    sample_weight=w_hold,
                    power=tw_power
                )
            else:  # classification
                from sklearn.metrics import log_loss
                loss = log_loss(
                    y_hold,
                    holdout_pred,
                    sample_weight=w_hold,
                )
            losses.append(loss)
        finally:
            # 7. Best-effort release of GPU resources.
            try:
                if hasattr(cv_net, "resnet"):
                    cv_net.resnet.to("cpu")
            except Exception:
                pass
            del cv_net
            self._clean_gpu()

        return np.mean(losses)

    # ========= Final training with the tuned parameters =========
    def train(self) -> None:
        """Fit the final ResNet with ``best_params`` and cache predictions."""
        if not self.best_params:
            raise RuntimeError('请先运行 tune() 以获得 ResNet 最优参数。')

        frame = self.ctx.train_oht_scl_data
        features = frame[self.ctx.var_nmes]
        self.model = ResNetSklearn(
            model_nme=self.ctx.model_nme,
            input_dim=features.shape[1],
            task_type=self.ctx.task_type,
            use_data_parallel=self.ctx.config.use_resn_data_parallel,
            use_ddp=self.ctx.config.use_resn_ddp
        )
        self.model.set_params(self.best_params)

        self._fit_predict_cache(
            self.model,
            features,
            frame[self.ctx.resp_nme],
            sample_weight=frame[self.ctx.weight_nme],
            pred_prefix='resn',
            use_oht=True,
            sample_weight_arg='w_train'
        )

        # Expose the fitted model on the context for external callers.
        self.ctx.resn_best = self.model

    # ========= Loading =========
    # Saving is handled by TrainerBase (state_dict for objects exposing
    # ``resnet``); loading needs the skeleton rebuilt first.
    def load(self) -> None:
        """Rebuild a ResNet wrapper and load the stored state_dict into it."""
        path = self.output.model_path(self._get_model_filename())
        if not os.path.exists(path):
            print(f"[ResNetTrainer.load] 未找到模型文件:{path}")
            return

        # NOTE(review): the wrapper is built with default structure
        # parameters - confirm they match the ones used when saving.
        restored = ResNetSklearn(
            model_nme=self.ctx.model_nme,
            input_dim=self.ctx.train_oht_scl_data[self.ctx.var_nmes].shape[1],
            task_type=self.ctx.task_type,
            use_data_parallel=self.ctx.config.use_resn_data_parallel,
            use_ddp=self.ctx.config.use_resn_ddp
        )
        weights = torch.load(path, map_location='cpu')
        restored.resnet.load_state_dict(weights)

        self._move_to_device(restored)
        self.model = restored
        self.ctx.resn_best = self.model
2414
+
2415
+
2416
+ class FTTrainer(TrainerBase):
2417
+ def __init__(self, context: "BayesOptModel") -> None:
2418
+ if context.task_type == 'classification':
2419
+ super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
2420
+ else:
2421
+ super().__init__(context, 'FTTransformer', 'FTTransformer')
2422
+ self.model: Optional[FTTransformerSklearn] = None
2423
+
2424
+ def cross_val(self, trial: optuna.trial.Trial) -> float:
2425
+ # 针对 FT-Transformer 的交叉验证,重点同样在显存控制:
2426
+ # - 收缩超参搜索空间,防止不必要的超大模型;
2427
+ # - 每个 fold 结束后立即释放 GPU 显存,确保下一个 trial 顺利进行。
2428
+ # 超参空间适当缩小一点,避免特别大的模型
2429
+ learning_rate = trial.suggest_float(
2430
+ 'learning_rate', 1e-5, 5e-4, log=True
2431
+ )
2432
+ d_model = trial.suggest_int('d_model', 32, 256, step=32)
2433
+ # n_heads = trial.suggest_categorical('n_heads', [2, 4]) 避免欠拟合
2434
+ n_heads = trial.suggest_categorical('n_heads', [2, 4, 8])
2435
+ # n_layers = trial.suggest_int('n_layers', 2, 4) 避免欠拟合
2436
+ n_layers = trial.suggest_int('n_layers', 2, 8)
2437
+ dropout = trial.suggest_float('dropout', 0.0, 0.2)
2438
+ approx_units = d_model * n_layers * max(1, len(self.ctx.factor_nmes))
2439
+ if approx_units > 1_200_000:
2440
+ print(
2441
+ f"[FTTrainer] Trial pruned early: d_model={d_model}, n_layers={n_layers} -> approx_units={approx_units}")
2442
+ raise optuna.TrialPruned(
2443
+ "config exceeds safe memory budget; prune before training")
2444
+
2445
+ if self.ctx.task_type == 'regression':
2446
+ if self.ctx.obj == 'reg:tweedie':
2447
+ tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
2448
+ elif self.ctx.obj == 'count:poisson':
2449
+ tw_power = 1.0
2450
+ elif self.ctx.obj == 'reg:gamma':
2451
+ tw_power = 2.0
2452
+ else:
2453
+ tw_power = 1.5
2454
+ else: # classification
2455
+ tw_power = None # Not used
2456
+
2457
+ fold_losses = []
2458
+
2459
+ # 可选:只在子样本上做 BO,避免大数据直接压垮显存
2460
+ data_for_cv = self.ctx.train_data
2461
+ max_rows_for_ft_bo = min(1000000, int(
2462
+ len(data_for_cv)/2)) # 你可以根据显存情况调小或调大
2463
+ if len(data_for_cv) > max_rows_for_ft_bo:
2464
+ data_for_cv = data_for_cv.sample(
2465
+ max_rows_for_ft_bo,
2466
+ random_state=self.ctx.rand_seed
2467
+ )
2468
+
2469
+ # 用局部 ShuffleSplit,避免子样本时索引不一致
2470
+ cv_local = ShuffleSplit(
2471
+ n_splits=int(1 / self.ctx.prop_test),
2472
+ test_size=self.ctx.prop_test,
2473
+ random_state=self.ctx.rand_seed
2474
+ )
2475
+
2476
+ # 使用 Hold-out 验证代替 K-Fold CV 以提高速度
2477
+ # 只取一次划分
2478
+ train_idx, val_idx = next(cv_local.split(
2479
+ data_for_cv[self.ctx.factor_nmes]))
2480
+
2481
+ X_train_fold = data_for_cv.iloc[train_idx][self.ctx.factor_nmes]
2482
+ y_train_fold = data_for_cv.iloc[train_idx][self.ctx.resp_nme]
2483
+ w_train_fold = data_for_cv.iloc[train_idx][self.ctx.weight_nme]
2484
+ X_val_fold = data_for_cv.iloc[val_idx][self.ctx.factor_nmes]
2485
+ y_val_fold = data_for_cv.iloc[val_idx][self.ctx.resp_nme]
2486
+ w_val_fold = data_for_cv.iloc[val_idx][self.ctx.weight_nme]
2487
+
2488
+ cv_ft = FTTransformerSklearn(
2489
+ model_nme=self.ctx.model_nme,
2490
+ num_cols=self.ctx.num_features,
2491
+ cat_cols=self.ctx.cate_list,
2492
+ d_model=d_model,
2493
+ n_heads=n_heads,
2494
+ n_layers=n_layers,
2495
+ dropout=dropout,
2496
+ task_type=self.ctx.task_type,
2497
+ # batch_num=batch_num,
2498
+ epochs=self.ctx.epochs,
2499
+ tweedie_power=tw_power,
2500
+ learning_rate=learning_rate,
2501
+ patience=5,
2502
+ use_data_parallel=self.ctx.config.use_ft_data_parallel,
2503
+ use_ddp=self.ctx.config.use_ft_ddp
2504
+ )
2505
+
2506
+ try:
2507
+ cv_ft.fit(
2508
+ X_train_fold, y_train_fold, w_train_fold,
2509
+ X_val_fold, y_val_fold, w_val_fold,
2510
+ trial=trial
2511
+ )
2512
+ y_pred_fold = cv_ft.predict(X_val_fold)
2513
+ if self.ctx.task_type == 'regression':
2514
+ loss = mean_tweedie_deviance(
2515
+ y_val_fold,
2516
+ y_pred_fold,
2517
+ sample_weight=w_val_fold,
2518
+ power=tw_power
2519
+ )
2520
+ else: # classification
2521
+ from sklearn.metrics import log_loss
2522
+ loss = log_loss(
2523
+ y_val_fold,
2524
+ y_pred_fold,
2525
+ sample_weight=w_val_fold,
2526
+ )
2527
+ fold_losses.append(loss)
2528
+ finally:
2529
+ # 结束后立即释放 GPU 资源
2530
+ try:
2531
+ # 如果模型在 GPU 上,先挪回 CPU
2532
+ if hasattr(cv_ft, "ft"):
2533
+ cv_ft.ft.to("cpu")
2534
+ except Exception:
2535
+ pass
2536
+ del cv_ft
2537
+ self._clean_gpu()
2538
+
2539
+ return np.mean(fold_losses)
2540
+
2541
def train(self) -> None:
    """Fit the final FT-Transformer on the full training data.

    The model is built from the context settings, configured with the
    parameters stored in ``self.best_params`` and fitted through
    ``self._fit_predict_cache``. The fitted model is also stored on the
    context as ``ft_best``.

    Raises:
        RuntimeError: If ``self.best_params`` is empty.
    """
    if not self.best_params:
        raise RuntimeError('请先运行 tune() 以获得 FT-Transformer 最优参数。')

    ctx = self.ctx
    parallel_cfg = ctx.config
    self.model = FTTransformerSklearn(
        model_nme=ctx.model_nme,
        num_cols=ctx.num_features,
        cat_cols=ctx.cate_list,
        task_type=ctx.task_type,
        use_data_parallel=parallel_cfg.use_ft_data_parallel,
        use_ddp=parallel_cfg.use_ft_ddp,
    )
    self.model.set_params(self.best_params)

    train_frame = ctx.train_data
    self._fit_predict_cache(
        self.model,
        train_frame[ctx.factor_nmes],
        train_frame[ctx.resp_nme],
        sample_weight=train_frame[ctx.weight_nme],
        pred_prefix='ft',
        sample_weight_arg='w_train',
    )
    ctx.ft_best = self.model
2562
+
2563
+
2564
class GNNTrainer(TrainerBase):
    """Trainer that tunes and fits ``GraphNeuralNetSklearn`` for a context."""

    def __init__(self, context: "BayesOptModel") -> None:
        """Register the trainer under the label matching the task type."""
        label = 'GNNClassifier' if context.task_type == 'classification' else 'GNN'
        super().__init__(context, label, 'GNN')
        self.model: Optional[GraphNeuralNetSklearn] = None

    def cross_val(self, trial: optuna.trial.Trial) -> float:
        """Score one Optuna trial on a single hold-out split.

        Hyper-parameters are drawn from ``trial``, a model is fitted on a
        (possibly down-sampled) copy of the one-hot/scaled training data and
        the validation loss is returned: Tweedie deviance for regression,
        log-loss otherwise.

        Args:
            trial: Optuna trial used to draw hyper-parameters; it is also
                forwarded to ``fit``.

        Returns:
            The validation loss of the single hold-out split.
        """
        ctx = self.ctx

        # Search space (the order of the suggest calls is kept unchanged).
        learning_rate = trial.suggest_float(
            'learning_rate', 1e-5, 5e-3, log=True)
        hidden_dim = trial.suggest_int('hidden_dim', 16, 128, step=16)
        num_layers = trial.suggest_int('num_layers', 1, 4)
        k_neighbors = trial.suggest_int('k_neighbors', 5, 20)
        dropout = trial.suggest_float('dropout', 0.0, 0.3)

        # Tweedie power: tuned for 'reg:tweedie', fixed for the other
        # regression objectives, unused (None) for classification.
        tw_power = None
        if ctx.task_type == 'regression':
            if ctx.obj == 'reg:tweedie':
                tw_power = trial.suggest_float('tw_power', 1.0, 2.0)
            else:
                tw_power = {'count:poisson': 1.0,
                            'reg:gamma': 2.0}.get(ctx.obj, 1.5)
        effective_power = 1.5 if tw_power is None else tw_power

        # Cap the number of rows used during tuning.
        frame = ctx.train_oht_scl_data
        row_cap = min(50_000, max(1, int(len(frame) / 5)))
        if len(frame) > row_cap:
            frame = frame.sample(row_cap, random_state=ctx.rand_seed)

        # Only the first split of the local splitter is used (hold-out).
        splitter = ShuffleSplit(
            n_splits=int(1 / ctx.prop_test),
            test_size=ctx.prop_test,
            random_state=ctx.rand_seed,
        )
        train_idx, val_idx = next(splitter.split(frame))
        train_part = frame.iloc[train_idx]
        val_part = frame.iloc[val_idx]

        X_train = train_part[ctx.var_nmes]
        y_train = train_part[ctx.resp_nme]
        w_train = train_part[ctx.weight_nme]
        X_val = val_part[ctx.var_nmes]
        y_val = val_part[ctx.resp_nme]
        w_val = val_part[ctx.weight_nme]

        cv_gnn = GraphNeuralNetSklearn(
            model_nme=ctx.model_nme,
            input_dim=X_train.shape[1],
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            k_neighbors=k_neighbors,
            dropout=dropout,
            learning_rate=learning_rate,
            epochs=ctx.epochs,
            patience=5,
            task_type=ctx.task_type,
            tweedie_power=effective_power,
            # DDP stays off during tuning to avoid multi-process
            # conflicts with Optuna.
            use_ddp=False,
        )

        try:
            cv_gnn.fit(
                X_train, y_train, w_train,
                X_val, y_val, w_val,
                trial=trial,
            )
            y_pred = cv_gnn.predict(X_val)
            if ctx.task_type == 'regression':
                loss = mean_tweedie_deviance(
                    y_val,
                    y_pred,
                    sample_weight=w_val,
                    power=effective_power,
                )
            else:
                loss = log_loss(y_val, y_pred, sample_weight=w_val)
        finally:
            # Best-effort release of device memory, whatever happened above.
            try:
                network = getattr(cv_gnn, "gnn", None)
                if network is not None:
                    network.to("cpu")
            except Exception:
                pass
            del cv_gnn
            self._clean_gpu()

        return np.mean([loss])

    def train(self) -> None:
        """Fit the final GNN on the full one-hot/scaled training data.

        Raises:
            RuntimeError: If ``self.best_params`` is empty.
        """
        if not self.best_params:
            raise RuntimeError('请先运行 tune() 以获得 GNN 最优参数。')

        ctx = self.ctx
        train_frame = ctx.train_oht_scl_data
        self.model = GraphNeuralNetSklearn(
            model_nme=ctx.model_nme,
            input_dim=train_frame[ctx.var_nmes].shape[1],
            task_type=ctx.task_type,
            use_data_parallel=ctx.config.use_gnn_data_parallel,
            use_ddp=ctx.config.use_gnn_ddp,
        )
        self.model.set_params(self.best_params)

        self._fit_predict_cache(
            self.model,
            train_frame[ctx.var_nmes],
            train_frame[ctx.resp_nme],
            sample_weight=train_frame[ctx.weight_nme],
            pred_prefix='gnn',
            use_oht=True,
            sample_weight_arg='w_train',
        )
        ctx.gnn_best = self.model
2686
+
2687
+
2688
+ # =============================================================================
2689
+ # BayesOpt orchestration & SHAP utilities
2690
+ # =============================================================================
2691
+ class BayesOptModel:
2692
def __init__(self, train_data, test_data,
             model_nme, resp_nme, weight_nme, factor_nmes, task_type='regression',
             binary_resp_nme=None,
             cate_list=None, prop_test=0.25, rand_seed=None,
             epochs=100, use_gpu=True,
             use_resn_data_parallel: bool = False, use_ft_data_parallel: bool = False,
             use_gnn_data_parallel: bool = False,
             use_resn_ddp: bool = False, use_ft_ddp: bool = False,
             use_gnn_ddp: bool = False):
    """Build the shared modelling context and one trainer per model family.

    The arguments are packed into a ``BayesOptConfig``, the data sets are run
    through ``DatasetPreprocessor`` and the resulting frames, feature lists,
    CV splitter, objective name and trainers are stored on the instance.
    """
    cfg = BayesOptConfig(
        model_nme=model_nme,
        task_type=task_type,
        resp_nme=resp_nme,
        weight_nme=weight_nme,
        factor_nmes=list(factor_nmes),
        binary_resp_nme=binary_resp_nme,
        cate_list=list(cate_list) if cate_list else None,
        prop_test=prop_test,
        rand_seed=rand_seed,
        epochs=epochs,
        use_gpu=use_gpu,
        use_resn_data_parallel=use_resn_data_parallel,
        use_ft_data_parallel=use_ft_data_parallel,
        use_resn_ddp=use_resn_ddp,
        use_gnn_data_parallel=use_gnn_data_parallel,
        use_ft_ddp=use_ft_ddp,
        use_gnn_ddp=use_gnn_ddp,
    )
    self.config = cfg

    # Mirror the frequently used configuration fields on the instance.
    self.model_nme = cfg.model_nme
    self.task_type = cfg.task_type
    self.resp_nme = cfg.resp_nme
    self.weight_nme = cfg.weight_nme
    self.factor_nmes = cfg.factor_nmes
    self.binary_resp_nme = cfg.binary_resp_nme
    self.cate_list = list(cfg.cate_list or [])
    self.prop_test = cfg.prop_test
    self.epochs = cfg.epochs
    if cfg.rand_seed is not None:
        self.rand_seed = cfg.rand_seed
    else:
        # No seed supplied: draw one so later sampling stays repeatable
        # within this instance.
        self.rand_seed = np.random.randint(1, 10000)
    self.use_gpu = bool(cfg.use_gpu and torch.cuda.is_available())
    self.output_manager = OutputManager(os.getcwd(), self.model_nme)

    prepared = DatasetPreprocessor(train_data, test_data, cfg).run()
    self.train_data = prepared.train_data
    self.test_data = prepared.test_data
    self.train_oht_scl_data = prepared.train_oht_scl_data
    self.test_oht_scl_data = prepared.test_oht_scl_data
    self.var_nmes = prepared.var_nmes
    self.num_features = prepared.num_features
    self.cat_categories_for_shap = prepared.cat_categories_for_shap

    self.cv = ShuffleSplit(
        n_splits=int(1 / self.prop_test),
        test_size=self.prop_test,
        random_state=self.rand_seed,
    )

    # The objective is picked from the task type and, for regression, from
    # substrings of the model name; anything else falls back to Tweedie.
    if self.task_type == 'classification':
        self.obj = 'binary:logistic'
    elif 'f' in self.model_nme:
        self.obj = 'count:poisson'
    elif 's' in self.model_nme:
        self.obj = 'reg:gamma'
    else:
        self.obj = 'reg:tweedie'

    self.fit_params = {
        'sample_weight': self.train_data[self.weight_nme].values
    }
    self.model_label: List[str] = []
    self.optuna_storage = cfg.optuna_storage
    self.optuna_study_prefix = cfg.optuna_study_prefix or "bayesopt"

    # Trainers are registered by short key so callers can address every
    # model family uniformly.
    self.trainers: Dict[str, TrainerBase] = {
        'glm': GLMTrainer(self),
        'xgb': XGBTrainer(self),
        'resn': ResNetTrainer(self),
        'gnn': GNNTrainer(self),
        'ft': FTTrainer(self),
    }

    # Result slots, filled once a model has been tuned, trained or loaded.
    for key in ('xgb', 'resn', 'glm', 'ft', 'gnn'):
        setattr(self, f'{key}_best', None)
        setattr(self, f'best_{key}_params', None)
        setattr(self, f'best_{key}_trial', None)
    for key in ('xgb', 'resn', 'ft', 'gnn'):
        setattr(self, f'{key}_load', None)
2792
+
2793
+ # 定义单因素画图函数
2794
def plot_oneway(self, n_bins=10):
    """Save a one-way chart of the training data for every factor.

    For each column in ``self.factor_nmes`` the training data is grouped (by
    the column itself when it is listed in ``self.cate_list``, otherwise by
    quantile bins of the column). The grouped ``w_act`` total divided by the
    grouped weight is drawn as a line and the grouped weight as bars on a
    twin axis. One PNG per factor is written to the path returned by
    ``self.output_manager.plot_path``.

    Args:
        n_bins: Number of quantile bins requested for non-categorical
            factors; duplicate bin edges are dropped, so fewer bins may
            be produced.
    """
    for c in self.factor_nmes:
        fig = plt.figure(figsize=(7, 5))
        # The figure is created first, so make sure it is released even if
        # grouping, drawing or saving fails for this factor.
        try:
            if c in self.cate_list:
                group_col = c
                plot_source = self.train_data
            else:
                group_col = f'{c}_bins'
                bins = pd.qcut(
                    self.train_data[c],
                    n_bins,
                    duplicates='drop'  # repeated quantile edges would otherwise raise
                )
                plot_source = self.train_data.assign(**{group_col: bins})
            plot_data = plot_source.groupby(
                [group_col], observed=True).sum(numeric_only=True)
            plot_data.reset_index(inplace=True)
            plot_data['act_v'] = plot_data['w_act'] / \
                plot_data[self.weight_nme]

            ax = fig.add_subplot(111)
            ax.plot(plot_data.index, plot_data['act_v'],
                    label='Actual', color='red')
            ax.set_title(
                'Analysis of %s : Train Data' % group_col,
                fontsize=8)
            tick_labels = list(plot_data[group_col].astype(str))
            plt.xticks(plot_data.index, tick_labels, rotation=90)
            # Shrink the tick font when there are many groups.
            plt.xticks(fontsize=3 if len(tick_labels) > 50 else 6)
            plt.yticks(fontsize=6)

            ax2 = ax.twinx()
            ax2.bar(plot_data.index,
                    plot_data[self.weight_nme],
                    alpha=0.5, color='seagreen')
            plt.yticks(fontsize=6)
            plt.margins(0.05)
            plt.subplots_adjust(wspace=0.3)
            save_path = self.output_manager.plot_path(
                f'00_{self.model_nme}_{group_col}_oneway.png')
            plt.savefig(save_path, dpi=300)
        finally:
            plt.close(fig)
2839
+
2840
+ # 定义通用优化函数
2841
def optimize_model(self, model_key: str, max_evals: int = 100):
    """Tune and then train the trainer registered under ``model_key``.

    Unknown keys only print a warning. After training, the trainer's model,
    best parameters and best trial are copied onto this object as
    ``<key>_best``, ``best_<key>_params`` and ``best_<key>_trial``.

    Args:
        model_key: Key into ``self.trainers``.
        max_evals: Value passed to the trainer's ``tune`` method.
    """
    if model_key not in self.trainers:
        print(f"Warning: Unknown model key: {model_key}")
        return

    chosen = self.trainers[model_key]
    chosen.tune(max_evals)
    chosen.train()

    # Keep the legacy per-model attributes in sync with the trainer.
    legacy_fields = (
        (f"{model_key}_best", "model"),
        (f"best_{model_key}_params", "best_params"),
        (f"best_{model_key}_trial", "best_trial"),
    )
    for target_name, source_name in legacy_fields:
        setattr(self, target_name, getattr(chosen, source_name))
2854
+
2855
+ # 定义GLM贝叶斯优化函数
2856
def bayesopt_glm(self, max_evals=50):
    """Tune and train the GLM via ``optimize_model`` (key ``'glm'``)."""
    self.optimize_model(model_key='glm', max_evals=max_evals)
2858
+
2859
+ # 定义Xgboost贝叶斯优化函数
2860
def bayesopt_xgb(self, max_evals=100):
    """Tune and train XGBoost via ``optimize_model`` (key ``'xgb'``)."""
    self.optimize_model(model_key='xgb', max_evals=max_evals)
2862
+
2863
+ # 定义ResNet贝叶斯优化函数
2864
def bayesopt_resnet(self, max_evals=100):
    """Tune and train the ResNet via ``optimize_model`` (key ``'resn'``)."""
    self.optimize_model(model_key='resn', max_evals=max_evals)
2866
+
2867
+ # 定义 GNN 贝叶斯优化函数
2868
def bayesopt_gnn(self, max_evals=50):
    """Tune and train the GNN via ``optimize_model`` (key ``'gnn'``)."""
    self.optimize_model(model_key='gnn', max_evals=max_evals)
2870
+
2871
+ # 定义 FT-Transformer 贝叶斯优化函数
2872
def bayesopt_ft(self, max_evals=50):
    """Tune and train the FT-Transformer via ``optimize_model`` (key ``'ft'``)."""
    self.optimize_model(model_key='ft', max_evals=max_evals)
2874
+
2875
+ # 绘制提纯曲线
2876
def plot_lift(self, model_label, pred_nme, n_bins=10):
    """Draw and save lift charts for the train and test data.

    Args:
        model_label: Label of the model. When it starts with one of the
            known model names, the matching prediction column replaces
            ``pred_nme``. It is also used in the output file name.
        pred_nme: Prediction column to use when ``model_label`` matches no
            known model name. The column ``w_<pred_nme>`` must exist too.
        n_bins: Number of bins passed to ``PlotUtils.split_data``.
    """
    model_map = {
        'Xgboost': 'pred_xgb',
        'ResNet': 'pred_resn',
        'ResNetClassifier': 'pred_resn',
        'FTTransformer': 'pred_ft',
        'FTTransformerClassifier': 'pred_ft',
        'GLM': 'pred_glm',
        'GNN': 'pred_gnn',
        'GNNClassifier': 'pred_gnn'
    }
    # First prefix match wins; otherwise keep the caller's column name.
    pred_nme = next(
        (col for prefix, col in model_map.items()
         if model_label.startswith(prefix)),
        pred_nme,
    )

    panels = (
        (121, 'Lift Chart on Train Data', self.train_data),
        (122, 'Lift Chart on Test Data', self.test_data),
    )

    fig = plt.figure(figsize=(11, 5))
    # A missing prediction column raises below; close the figure regardless
    # so a failed call does not leave it open.
    try:
        for pos, title, data in panels:
            lift_df = pd.DataFrame({
                'pred': data[pred_nme].values,
                'w_pred': data[f'w_{pred_nme}'].values,
                'act': data['w_act'].values,
                'weight': data[self.weight_nme].values
            })
            plot_data = PlotUtils.split_data(lift_df, 'pred', 'weight', n_bins)
            # Guard the per-bin division against zero weight.
            denom = np.maximum(plot_data['weight'], EPS)
            plot_data['exp_v'] = plot_data['w_pred'] / denom
            plot_data['act_v'] = plot_data['act'] / denom
            plot_data = plot_data.reset_index()

            ax = fig.add_subplot(pos)
            PlotUtils.plot_lift_ax(ax, plot_data, title)

        plt.subplots_adjust(wspace=0.3)
        save_path = self.output_manager.plot_path(
            f'01_{self.model_nme}_{model_label}_lift.png')
        plt.savefig(save_path, dpi=300)
        plt.show()
    finally:
        plt.close(fig)
2917
+
2918
+ # 绘制双提纯曲线
2919
def plot_dlift(self, model_comp: Optional[List[str]] = None, n_bins: int = 10) -> None:
    """Draw and save double-lift charts comparing two models.

    Rows are binned (via ``PlotUtils.split_data``) by the ratio of the first
    model's weighted prediction to the second model's, and the per-bin
    totals of both predictions are shown relative to the per-bin actual
    total, for the train and the test data.

    Args:
        model_comp: Exactly two model keys out of 'xgb', 'resn', 'ft',
            'glm' and 'gnn'. Defaults to ``['xgb', 'resn']``.
        n_bins: Number of bins passed to ``PlotUtils.split_data``.

    Raises:
        ValueError: If ``model_comp`` does not hold exactly two entries or
            contains an unsupported key.
    """
    # A None sentinel replaces the former list default, which was a single
    # object shared across calls.
    if model_comp is None:
        model_comp = ['xgb', 'resn']
    if len(model_comp) != 2:
        raise ValueError("`model_comp` 必须包含两个模型进行对比。")

    model_name_map = {
        'xgb': 'Xgboost',
        'resn': 'ResNet',
        'ft': 'FTTransformer',
        'glm': 'GLM',
        'gnn': 'GNN'
    }

    name1, name2 = model_comp
    if name1 not in model_name_map or name2 not in model_name_map:
        raise ValueError(f"不支持的模型简称。请从 {list(model_name_map.keys())} 中选择。")

    pred1_col = f'w_pred_{name1}'
    pred2_col = f'w_pred_{name2}'
    label1 = model_name_map[name1]
    label2 = model_name_map[name2]

    fig, axes = plt.subplots(1, 2, figsize=(11, 5))
    # Close the figure even if binning, drawing or saving fails.
    try:
        datasets = {
            'Train Data': self.train_data,
            'Test Data': self.test_data
        }

        for ax, (data_name, data) in zip(axes, datasets.items()):
            if pred1_col not in data.columns or pred2_col not in data.columns:
                print(
                    f"警告: 在 {data_name} 中找不到预测列 {pred1_col} 或 {pred2_col}。跳过绘图。")
                continue

            lift_data = pd.DataFrame({
                'pred1': data[pred1_col].values,
                'pred2': data[pred2_col].values,
                # Sort key: ratio of model 1 to model 2, guarded against zero.
                'diff_ly': data[pred1_col].values / np.maximum(data[pred2_col].values, EPS),
                'act': data['w_act'].values,
                'weight': data[self.weight_nme].values
            })
            plot_data = PlotUtils.split_data(
                lift_data, 'diff_ly', 'weight', n_bins)
            # All three series are expressed relative to the actual total.
            denom = np.maximum(plot_data['act'], EPS)
            plot_data['exp_v1'] = plot_data['pred1'] / denom
            plot_data['exp_v2'] = plot_data['pred2'] / denom
            plot_data['act_v'] = plot_data['act'] / denom
            plot_data.reset_index(inplace=True)

            PlotUtils.plot_dlift_ax(
                ax, plot_data, f'Double Lift Chart on {data_name}', label1, label2)

        plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8, wspace=0.3)
        save_path = self.output_manager.plot_path(
            f'02_{self.model_nme}_dlift_{name1}_vs_{name2}.png')
        plt.savefig(save_path, dpi=300)
        plt.show()
    finally:
        plt.close(fig)
2981
+
2982
+ # 绘制成交率提升曲线
2983
def plot_conversion_lift(self, model_pred_col: str, n_bins: int = 20):
    """Show conversion-rate lift charts for the train and test data.

    Rows are sorted by ``model_pred_col``, split into ``n_bins`` equal-width
    bins of cumulative weight, and the weighted conversion rate per bin is
    plotted against the overall rate. Nothing is drawn (a message is
    printed) when ``self.binary_resp_nme`` is not set.

    Args:
        model_pred_col: Name of the prediction column used for sorting.
        n_bins: Number of bins over the cumulative weight.
    """
    if not self.binary_resp_nme:
        print("错误: 未在 BayesOptModel 初始化时提供 `binary_resp_nme`。无法绘制成交率曲线。")
        return

    fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
    # The figure is closed once shown (or on failure), matching the other
    # plotting methods of this class.
    try:
        datasets = {
            'Train Data': self.train_data,
            'Test Data': self.test_data
        }

        for ax, (data_name, data) in zip(axes, datasets.items()):
            if model_pred_col not in data.columns:
                print(f"警告: 在 {data_name} 中找不到预测列 '{model_pred_col}'。跳过绘图。")
                continue

            # Sort by model score and bin on cumulative weight.
            plot_data = data.sort_values(by=model_pred_col).copy()
            plot_data['cum_weight'] = plot_data[self.weight_nme].cumsum()
            total_weight = plot_data[self.weight_nme].sum()

            if total_weight > EPS:
                plot_data['bin'] = pd.cut(
                    plot_data['cum_weight'],
                    bins=n_bins,
                    labels=False,
                    right=False
                )
            else:
                plot_data['bin'] = 0

            # Aggregate per bin.
            lift_agg = plot_data.groupby('bin').agg(
                total_weight=(self.weight_nme, 'sum'),
                actual_conversions=(self.binary_resp_nme, 'sum'),
                weighted_conversions=('w_binary_act', 'sum'),
                avg_pred=(model_pred_col, 'mean')
            ).reset_index()

            # Conversion rate per bin; the denominator is floored at EPS so
            # a zero-weight bin yields 0 instead of NaN/inf.
            lift_agg['conversion_rate'] = lift_agg['weighted_conversions'] / \
                np.maximum(lift_agg['total_weight'], EPS)

            # Overall average conversion rate, with the same guard.
            overall_conversion_rate = data['w_binary_act'].sum(
            ) / max(data[self.weight_nme].sum(), EPS)
            ax.axhline(y=overall_conversion_rate, color='gray', linestyle='--',
                       label=f'Overall Avg Rate ({overall_conversion_rate:.2%})')

            ax.plot(lift_agg['bin'], lift_agg['conversion_rate'],
                    marker='o', linestyle='-', label='Actual Conversion Rate')
            ax.set_title(f'Conversion Rate Lift Chart on {data_name}')
            ax.set_xlabel(f'Model Score Decile (based on {model_pred_col})')
            ax.set_ylabel('Conversion Rate')
            ax.grid(True, linestyle='--', alpha=0.6)
            ax.legend()

        plt.tight_layout()
        plt.show()
    finally:
        plt.close(fig)
3042
+
3043
+ # 保存模型
3044
def save_model(self, model_name=None):
    """Save one trainer, or every registered trainer when no name is given.

    Args:
        model_name: Key into ``self.trainers``. When falsy, all trainers
            are saved. An unknown key only prints a warning.
    """
    if not model_name:
        for trainer in self.trainers.values():
            trainer.save()
        return

    if model_name in self.trainers:
        self.trainers[model_name].save()
    else:
        # Only an explicitly requested, unknown key is worth a warning.
        print(f"[save_model] Warning: Unknown model key {model_name}")
3052
+
3053
def load_model(self, model_name=None):
    """Load one trainer, or every registered trainer when no name is given.

    After a trainer's ``load`` call, a non-None ``trainer.model`` is
    published as ``<key>_best``. For the keys that have a ``<key>_load``
    slot ('xgb', 'resn', 'ft' and 'gnn') it is also stored there.

    Args:
        model_name: Key into ``self.trainers``. When falsy, all trainers
            are loaded. An unknown key only prints a warning.
    """
    # 'gnn' is included because this class initialises ``gnn_load`` next to
    # the other ``*_load`` slots; there is no ``glm_load`` slot.
    keys_with_load_slot = ('xgb', 'resn', 'ft', 'gnn')

    keys = [model_name] if model_name else self.trainers.keys()
    for key in keys:
        if key not in self.trainers:
            if model_name:
                print(f"[load_model] Warning: Unknown model key {key}")
            continue

        trainer = self.trainers[key]
        trainer.load()
        if trainer.model is None:
            continue
        setattr(self, f"{key}_best", trainer.model)
        if key in keys_with_load_slot:
            setattr(self, f"{key}_load", trainer.model)
3069
+
3070
def _sample_rows(self, data: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return at most ``n`` rows sampled from ``data`` with ``self.rand_seed``.

    A frame with no rows is returned unchanged.
    """
    row_count = len(data)
    if not row_count:
        return data
    sample_size = n if n < row_count else row_count
    return data.sample(sample_size, random_state=self.rand_seed)
3074
+
3075
@staticmethod
def _shap_nsamples(arr: np.ndarray, max_nsamples: int = 300) -> int:
    """Pick the ``nsamples`` value handed to the SHAP explainer.

    The value is the number of cells in ``arr`` capped at ``max_nsamples``,
    but never less than the number of columns plus two.
    """
    n_rows, n_cols = arr.shape[0], arr.shape[1]
    capped = min(max_nsamples, n_rows * n_cols)
    floor = n_cols + 2
    return max(floor, capped)
3079
+
3080
def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
    """Encode the factor columns of ``data`` as one float64 matrix.

    Columns listed in ``self.cate_list`` become category codes taken from
    ``self.cat_categories_for_shap`` (values outside the category list get
    code -1); all other columns are converted with ``pd.to_numeric``, with
    unparsable values turned into NaN. Columns follow ``self.factor_nmes``.
    """
    def _encode(name):
        series = data[name]
        if name in self.cate_list:
            levels = self.cat_categories_for_shap[name]
            coded = pd.Categorical(series, categories=levels).codes
            return np.asarray(coded, dtype=np.float64).reshape(-1, 1)
        numeric = pd.to_numeric(series, errors="coerce")
        return numeric.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)

    # One (N, 1) block per factor, joined into an (N, F) matrix.
    return np.concatenate([_encode(name) for name in self.factor_nmes], axis=1)
3097
+
3098
def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
    """Turn a numeric matrix back into a frame with the factor columns.

    Column ``j`` of ``X_mat`` is mapped to ``self.factor_nmes[j]``. Values of
    categorical columns are rounded, clipped to the valid code range
    (``-1`` meaning missing) and decoded with
    ``self.cat_categories_for_shap``; other columns are cast to float.
    """
    categorical = self.cate_list
    decoded = {}
    for position, name in enumerate(self.factor_nmes):
        column = X_mat[:, position]
        if name not in categorical:
            decoded[name] = column.astype(float)
            continue
        levels = self.cat_categories_for_shap[name]
        # Values may be non-integer here, so snap them to a valid code first.
        codes = np.clip(np.round(column).astype(int), -1, len(levels) - 1)
        decoded[name] = pd.Categorical.from_codes(codes, categories=levels)

    frame = pd.DataFrame(decoded, columns=self.factor_nmes)
    for name in categorical:
        if name in frame.columns:
            frame[name] = frame[name].astype("category")
    return frame
3119
+
3120
def _build_glm_design(self, data: pd.DataFrame) -> pd.DataFrame:
    """Return the ``self.var_nmes`` columns of ``data`` with a constant column added."""
    features = data[self.var_nmes]
    design = sm.add_constant(features, has_constant='add')
    return design
3123
+
3124
def _compute_shap_core(self,
                       model_key: str,
                       n_background: int,
                       n_samples: int,
                       on_train: bool,
                       X_df: pd.DataFrame,
                       prep_fn,
                       predict_fn,
                       cleanup_fn=None):
    """Run ``shap.KernelExplainer`` for one model and collect the results.

    Args:
        model_key: Key into ``self.trainers``; its trainer must hold a model.
        n_background: Maximum number of background rows sampled from ``X_df``.
        n_samples: Maximum number of rows sampled from ``X_df`` to explain.
        on_train: Accepted for signature compatibility; not read here.
        X_df: Frame the background and explained rows are drawn from.
        prep_fn: Callable turning a sampled frame into the explainer's matrix.
        predict_fn: Prediction callable given to the explainer.
        cleanup_fn: Optional callable invoked before any sampling.

    Returns:
        Dict with keys ``explainer``, ``X_explain``, ``shap_values`` and
        ``base_value`` (mean prediction over the background matrix).

    Raises:
        RuntimeError: If no trained model is registered under ``model_key``.
    """
    is_trained = (
        model_key in self.trainers
        and self.trainers[model_key].model is not None
    )
    if not is_trained:
        raise RuntimeError(f"Model {model_key} not trained.")

    if cleanup_fn:
        cleanup_fn()

    background_df = self._sample_rows(X_df, n_background)
    background = prep_fn(background_df)
    explainer = shap.KernelExplainer(predict_fn, background)

    explain_df = self._sample_rows(X_df, n_samples)
    explain_mat = prep_fn(explain_df)
    shap_values = explainer.shap_values(
        explain_mat, nsamples=self._shap_nsamples(explain_mat))

    background_pred = np.asarray(predict_fn(background))
    result = {
        "explainer": explainer,
        "X_explain": explain_df,
        "shap_values": shap_values,
        "base_value": float(background_pred.mean()),
    }
    return result
3153
+
3154
+ # ========= GLM SHAP =========
3155
def compute_shap_glm(self, n_background: int = 500,
                     n_samples: int = 200,
                     on_train: bool = True):
    """Compute SHAP values for the GLM on its design matrix.

    The result of ``_compute_shap_core`` is stored as ``self.shap_glm`` and
    returned.

    Args:
        n_background: Maximum number of background rows.
        n_samples: Maximum number of rows to explain.
        on_train: Use the training data when true, otherwise the test data.
    """
    source = self.train_oht_scl_data if on_train else self.test_oht_scl_data
    design = self._build_glm_design(source)
    column_names = list(design.columns)

    def _predict(matrix):
        # The explainer passes plain arrays; restore the design column names.
        frame = pd.DataFrame(matrix, columns=column_names)
        return np.asarray(self.glm_best.predict(frame), dtype=np.float64).reshape(-1)

    self.shap_glm = self._compute_shap_core(
        'glm', n_background, n_samples, on_train,
        X_df=design,
        prep_fn=lambda frame: frame.to_numpy(dtype=np.float64),
        predict_fn=_predict,
    )
    return self.shap_glm
3174
+
3175
+ # ========= XGBoost SHAP =========
3176
def compute_shap_xgb(self, n_background: int = 500,
                     n_samples: int = 200,
                     on_train: bool = True):
    """Compute SHAP values for the XGBoost model on the raw factor columns.

    The result of ``_compute_shap_core`` is stored as ``self.shap_xgb`` and
    returned.

    Args:
        n_background: Maximum number of background rows.
        n_samples: Maximum number of rows to explain.
        on_train: Use the training data when true, otherwise the test data.
    """
    source = self.train_data if on_train else self.test_data
    raw_factors = source[self.factor_nmes]

    def _encode(frame):
        return self._build_ft_shap_matrix(frame).astype(np.float64)

    def _predict(matrix):
        # Decode the numeric matrix back into factor columns before predicting.
        return self.xgb_best.predict(self._decode_ft_shap_matrix_to_df(matrix))

    self.shap_xgb = self._compute_shap_core(
        'xgb', n_background, n_samples, on_train,
        X_df=raw_factors,
        prep_fn=_encode,
        predict_fn=_predict,
    )
    return self.shap_xgb
3194
+
3195
+ # ========= ResNet SHAP =========
3196
def _resn_predict_wrapper(self, X_np):
    """Predict with the ResNet on CPU and return a flat, floored array.

    The network stored at ``self.resn_best.resnet`` is moved to the CPU,
    evaluated without gradient tracking on ``X_np`` (converted to float32)
    and the output is clipped from below at 1e-6.
    """
    network = self.resn_best.resnet.to("cpu")
    with torch.no_grad():
        inputs = torch.tensor(X_np, dtype=torch.float32)
        outputs = network(inputs).cpu().numpy()
    floored = np.clip(outputs, 1e-6, None)
    return floored.reshape(-1)
3203
+
3204
def compute_shap_resn(self, n_background: int = 500,
                      n_samples: int = 200,
                      on_train: bool = True):
    """Compute SHAP values for the ResNet on the one-hot/scaled features.

    The result of ``_compute_shap_core`` is stored as ``self.shap_resn`` and
    returned.

    Args:
        n_background: Maximum number of background rows.
        n_samples: Maximum number of rows to explain.
        on_train: Use the training data when true, otherwise the test data.
    """
    source = self.train_oht_scl_data if on_train else self.test_oht_scl_data
    features = source[self.var_nmes]

    def _move_to_cpu():
        # Predictions for SHAP run on CPU; free cached GPU memory if any.
        self.resn_best.device = torch.device("cpu")
        self.resn_best.resnet.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _predict(matrix):
        return self._resn_predict_wrapper(matrix)

    self.shap_resn = self._compute_shap_core(
        'resn', n_background, n_samples, on_train,
        X_df=features,
        prep_fn=lambda frame: frame.to_numpy(dtype=np.float64),
        predict_fn=_predict,
        cleanup_fn=_move_to_cpu,
    )
    return self.shap_resn
3224
+
3225
+ # ========= GNN SHAP =========
3226
def _gnn_predict_wrapper(self, X_np: np.ndarray) -> np.ndarray:
    """Predict with the GNN on an array laid out like ``self.var_nmes``.

    The array is wrapped in a frame with the ``self.var_nmes`` column names
    and the predictions are returned as a flat float64 array.
    """
    named_inputs = pd.DataFrame(X_np, columns=self.var_nmes)
    predictions = self.gnn_best.predict(named_inputs)
    flat = np.asarray(predictions, dtype=np.float64).reshape(-1)
    return flat
3230
+
3231
def compute_shap_gnn(self, n_background: int = 300,
                     n_samples: int = 150,
                     on_train: bool = True):
    """Compute SHAP values for the GNN on the one-hot/scaled features.

    The result of ``_compute_shap_core`` is stored as ``self.shap_gnn`` and
    returned.

    Args:
        n_background: Maximum number of background rows.
        n_samples: Maximum number of rows to explain.
        on_train: Use the training data when true, otherwise the test data.

    Raises:
        RuntimeError: If the selected one-hot/scaled data set is ``None``.
    """
    source = self.train_oht_scl_data if on_train else self.test_oht_scl_data
    if source is None:
        raise RuntimeError("One-hot 标准化数据未准备好,无法计算 GNN SHAP。")
    features = source[self.var_nmes]

    def _move_to_cpu():
        # Predictions for SHAP run on CPU; free cached GPU memory if any.
        self.gnn_best.device = torch.device("cpu")
        self.gnn_best.gnn.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _predict(matrix):
        return self._gnn_predict_wrapper(matrix)

    self.shap_gnn = self._compute_shap_core(
        'gnn', n_background, n_samples, on_train,
        X_df=features,
        prep_fn=lambda frame: frame.to_numpy(dtype=np.float64),
        predict_fn=_predict,
        cleanup_fn=_move_to_cpu,
    )
    return self.shap_gnn
3253
+
3254
+ # ========= FT-Transformer SHAP =========
3255
def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
    """Predict with the FT-Transformer on an encoded factor matrix.

    The matrix is decoded with ``_decode_ft_shap_matrix_to_df`` and the
    predictions are returned as a flat float64 array.
    """
    decoded = self._decode_ft_shap_matrix_to_df(X_mat)
    predictions = self.ft_best.predict(decoded)
    flat = np.asarray(predictions, dtype=np.float64).reshape(-1)
    return flat
3259
+
3260
def compute_shap_ft(self, n_background: int = 500,
                    n_samples: int = 200,
                    on_train: bool = True):
    """Compute SHAP values for the FT-Transformer on the raw factor columns.

    The result of ``_compute_shap_core`` is stored as ``self.shap_ft`` and
    returned.

    Args:
        n_background: Maximum number of background rows.
        n_samples: Maximum number of rows to explain.
        on_train: Use the training data when true, otherwise the test data.
    """
    source = self.train_data if on_train else self.test_data
    raw_factors = source[self.factor_nmes]

    def _move_to_cpu():
        # Predictions for SHAP run on CPU; free cached GPU memory if any.
        self.ft_best.device = torch.device("cpu")
        self.ft_best.ft.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _encode(frame):
        return self._build_ft_shap_matrix(frame).astype(np.float64)

    self.shap_ft = self._compute_shap_core(
        'ft', n_background, n_samples, on_train,
        X_df=raw_factors,
        prep_fn=_encode,
        predict_fn=self._ft_shap_predict_wrapper,
        cleanup_fn=_move_to_cpu,
    )
    return self.shap_ft