ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,909 @@
1
+ from sklearn.metrics import make_scorer, mean_tweedie_deviance
2
+ from sklearn.preprocessing import StandardScaler
3
+ from sklearn.model_selection import KFold, ShuffleSplit, cross_val_score # 1.2.2
4
+ from torch.nn.utils import clip_grad_norm_
5
+ from torch.cuda.amp import autocast, GradScaler
6
+ from torch.utils.data import DataLoader, TensorDataset
7
+ import copy
8
+ import numpy as np # 1.26.2
9
+ import pandas as pd # 2.2.3
10
+ import torch # 1.10.1+cu111
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import optuna # 4.3.0
14
+ import xgboost as xgb # 1.7.0
15
+ import matplotlib.pyplot as plt
16
+ import os
17
+ import joblib
18
+ import platform
19
+
20
+
21
+ from pathlib import Path
22
+ from typing import Optional, Union
23
+
24
+ PathLike = Union[str, os.PathLike, Path]
25
+ DEFAULT_OUTPUT_ROOT = Path.cwd().resolve()
26
+
27
+
28
def resolve_output_path(folder: str, filename: str,
                        base_dir: Optional[PathLike] = None,
                        ensure_dir: bool = True) -> Path:
    """Build ``<base>/<folder>/<filename>``, optionally creating the folder.

    ``base_dir`` defaults to ``DEFAULT_OUTPUT_ROOT`` (the CWD at import time).
    """
    root = DEFAULT_OUTPUT_ROOT if base_dir is None else Path(base_dir)
    out_dir = root / folder
    if ensure_dir:
        # Create the whole chain; a pre-existing directory is fine.
        out_dir.mkdir(parents=True, exist_ok=True)
    return out_dir / filename
37
+
38
+
39
def detect_xgb_gpu_support() -> bool:
    """Return True when the installed XGBoost build advertises CUDA support.

    Probes the private ``xgb.core._has_cuda_support`` hook; any probe failure
    is treated as "no GPU support".
    """
    probe = getattr(xgb.core, "_has_cuda_support", None)
    if not callable(probe):
        return False
    try:
        return bool(probe())
    except Exception:
        # Best-effort detection: a failing probe means we fall back to CPU.
        return False
48
+
49
+
50
def select_device(preferred: Optional[str] = None) -> torch.device:
    """Pick a usable ``torch.device``, honouring an explicit request.

    With no preference, auto-detect in priority order CUDA > MPS > CPU.
    An explicitly requested accelerator that is unavailable raises
    ``ValueError`` rather than silently falling back.
    """
    if preferred is None:
        # Auto-detection path.
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")
    requested = torch.device(preferred)
    if requested.type == "cuda" and not torch.cuda.is_available():
        raise ValueError("CUDA device requested but not available.")
    if requested.type == "mps" and not torch.backends.mps.is_available():
        raise ValueError("MPS device requested but not available.")
    return requested
64
+
65
+
66
# Tweedie deviance loss for torch tensors.
# Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances


def tweedie_loss(pred, target, p=1.5):
    """Element-wise Tweedie deviance between ``pred`` and ``target``.

    Special cases: ``p == 0`` Gaussian, ``p == 1`` Poisson, ``p == 2`` Gamma;
    any other ``p`` uses the general compound-Poisson form. Predictions are
    clamped to a small positive epsilon for numerical stability.

    Returns a tensor of per-element deviances, ``2 * (term1 - term2 + term3)``.
    """
    eps = 1e-6
    pred_clamped = torch.clamp(pred, min=eps)
    if p == 1:
        # Poisson deviance: 2 * (t*log(t/p) + p - t).
        term1 = target * torch.log(target / pred_clamped + eps)
        # BUG FIX: was `-target + pred_clamped`, which (after the `- term2`
        # below) yields 2*(t*log(t/p) + t - p). That flips the sign of the
        # linear part, making the loss decrease without bound as predictions
        # grow — training diverges. Correct sign below.
        term2 = target - pred_clamped
        term3 = 0
    elif p == 0:
        # Gaussian case: squared error (t - p)^2 after the factor of 2.
        term1 = 0.5 * torch.pow(target - pred_clamped, 2)
        term2 = 0
        term3 = 0
    elif p == 2:
        # Gamma deviance: 2 * (log(p/t) + t/p - 1).
        term1 = torch.log(pred_clamped / target + eps)
        term2 = -target / pred_clamped + 1
        term3 = 0
    else:
        # General Tweedie deviance for p not in {0, 1, 2}.
        term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
        term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
        term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
    # Tweedie negative log-likelihood (up to a constant).
    return 2 * (term1 - term2 + term3)
96
+
97
# Weighted quantile-binning helper used by the lift-chart plots.


def split_data(data, col_nme, wgt_nme, n_bins=10):
    """Cut rows into ``n_bins`` weight-balanced bins ordered by ``col_nme``.

    Sorts ``data`` in place, assigns a ``bins`` label from the cumulative
    weight, and returns the per-bin numeric column sums. Note: mutates the
    caller's DataFrame (adds ``cum_weight`` and ``bins`` columns).
    """
    data.sort_values(by=col_nme, ascending=True, inplace=True)
    data['cum_weight'] = data[wgt_nme].cumsum()
    total_weight = data[wgt_nme].sum()
    bin_ids = np.floor(data['cum_weight'] * float(n_bins) / total_weight)
    data.loc[:, 'bins'] = bin_ids
    # The last row lands exactly on n_bins; fold it into the final bin.
    data.loc[data['bins'] == n_bins, 'bins'] = n_bins - 1
    return data.groupby(['bins'], observed=True).sum(numeric_only=True)
107
+
108
# Lift-chart plotting helper.


def plot_lift_list(pred_model, w_pred_list, w_act_list,
                   weight_list, tgt_nme, n_bins=10,
                   fig_nme='Lift Chart', output_dir: Optional[PathLike] = None):
    """Plot actual vs. predicted response across prediction-sorted bins.

    Rows are binned by `split_data` on the raw prediction with `weight_list`
    as the exposure weight; per-bin weighted averages of predicted and actual
    response are drawn as lines, with exposure as bars on a twin axis. The
    figure is saved as ``plot/05_<tgt_nme>_<fig_nme>.png`` under
    ``output_dir`` (or the default output root). Returns None.

    NOTE(review): `w_pred_list` / `w_act_list` appear to be weight-multiplied
    predictions/actuals (they are divided by the bin weight below) — confirm
    against callers.
    """
    lift_data = pd.DataFrame()
    lift_data.loc[:, 'pred'] = pred_model
    lift_data.loc[:, 'w_pred'] = w_pred_list
    lift_data.loc[:, 'act'] = w_act_list
    lift_data.loc[:, 'weight'] = weight_list
    # Weight-balanced bins ordered by the model prediction.
    plot_data = split_data(lift_data, 'pred', 'weight', n_bins)
    plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
    plot_data['act_v'] = plot_data['act'] / plot_data['weight']
    plot_data.reset_index(inplace=True)
    fig = plt.figure(figsize=(7, 5))
    ax = fig.add_subplot(111)
    ax.plot(plot_data.index, plot_data['act_v'],
            label='Actual', color='red')
    ax.plot(plot_data.index, plot_data['exp_v'],
            label='Predicted', color='blue')
    ax.set_title(
        'Lift Chart of %s' % tgt_nme, fontsize=8)
    plt.xticks(plot_data.index,
               plot_data.index,
               rotation=90, fontsize=6)
    plt.yticks(fontsize=6)
    plt.legend(loc='upper left',
               fontsize=5, frameon=False)
    plt.margins(0.05)
    # Secondary axis: exposure (weight) per bin as bars.
    ax2 = ax.twinx()
    ax2.bar(plot_data.index, plot_data['weight'],
            alpha=0.5, color='seagreen',
            label='Earned Exposure')
    plt.yticks(fontsize=6)
    plt.legend(loc='upper right',
               fontsize=5, frameon=False)
    plt.subplots_adjust(wspace=0.3)
    save_path = resolve_output_path(
        'plot', f'05_{tgt_nme}_{fig_nme}.png', base_dir=output_dir)
    plt.savefig(save_path, dpi=300)
    plt.close(fig)
150
+
151
# Double-lift-chart plotting helper (compares two competing models).


def plot_dlift_list(pred_model_1, pred_model_2,
                    model_nme_1, model_nme_2,
                    tgt_nme,
                    w_list, w_act_list, n_bins=10,
                    fig_nme='Double Lift Chart',
                    output_dir: Optional[PathLike] = None):
    """Plot a double lift chart comparing two models against actuals.

    Rows are binned by the prediction ratio ``pred1 / pred2``; within each
    bin, each model's weighted prediction is divided by the actual, so the
    'Actual' line is identically 1 and each model line shows its relative
    bias per bin. Exposure is drawn as bars on a twin axis. Saves
    ``plot/06_<tgt_nme>_<fig_nme>.png`` under ``output_dir``. Returns None.
    """
    lift_data = pd.DataFrame()
    lift_data.loc[:, 'pred1'] = pred_model_1
    lift_data.loc[:, 'pred2'] = pred_model_2
    # Sort key: ratio of the two model predictions.
    lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
    lift_data.loc[:, 'act'] = w_act_list
    lift_data.loc[:, 'weight'] = w_list
    lift_data.loc[:, 'w_pred1'] = lift_data['pred1'] * lift_data['weight']
    lift_data.loc[:, 'w_pred2'] = lift_data['pred2'] * lift_data['weight']
    plot_data = split_data(lift_data, 'diff_ly', 'weight', n_bins)
    # Per-bin predicted-to-actual ratios; actual normalises to 1.
    plot_data['exp_v1'] = plot_data['w_pred1'] / plot_data['act']
    plot_data['exp_v2'] = plot_data['w_pred2'] / plot_data['act']
    plot_data['act_v'] = plot_data['act']/plot_data['act']
    plot_data.reset_index(inplace=True)
    fig = plt.figure(figsize=(7, 5))
    ax = fig.add_subplot(111)
    ax.plot(plot_data.index, plot_data['act_v'],
            label='Actual', color='red')
    ax.plot(plot_data.index, plot_data['exp_v1'],
            label=model_nme_1, color='blue')
    ax.plot(plot_data.index, plot_data['exp_v2'],
            label=model_nme_2, color='black')
    ax.set_title(
        'Double Lift Chart of %s' % tgt_nme, fontsize=8)
    plt.xticks(plot_data.index,
               plot_data.index,
               rotation=90, fontsize=6)
    plt.xlabel('%s / %s' % (model_nme_1, model_nme_2), fontsize=6)
    plt.yticks(fontsize=6)
    plt.legend(loc='upper left',
               fontsize=5, frameon=False)
    plt.margins(0.1)
    plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
    # Secondary axis: exposure (weight) per bin as bars.
    ax2 = ax.twinx()
    ax2.bar(plot_data.index, plot_data['weight'],
            alpha=0.5, color='seagreen',
            label='Earned Exposure')
    plt.yticks(fontsize=6)
    plt.legend(loc='upper right',
               fontsize=5, frameon=False)
    plt.subplots_adjust(wspace=0.3)
    save_path = resolve_output_path(
        'plot', f'06_{tgt_nme}_{fig_nme}.png', base_dir=output_dir)
    plt.savefig(save_path, dpi=300)
    plt.close(fig)
204
+
205
# Residual block: two Linear+BatchNorm layers with an identity shortcut.


class ResBlock(nn.Module):
    """Fully-connected residual block: ReLU(x + BN(Linear(ReLU(BN(Linear(x)))))).

    Input and output width are both ``dim`` so the skip connection is a
    plain addition.
    """

    def __init__(self, dim):
        super(ResBlock, self).__init__()
        self.block = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim)
        )

    def forward(self, x):
        # Transform, add the identity shortcut, then apply the final ReLU.
        transformed = self.block(x)
        return F.relu(transformed + x)
223
+
224
# Full network: input projection -> stacked residual blocks -> positive head.


class ResNetSequential(nn.Module):
    """MLP-style ResNet ending in Softplus so outputs are strictly positive.

    Layout: Linear(input_dim->hidden_dim) + BN + ReLU, then ``block_num``
    ResBlocks, then Linear(hidden_dim->1) + Softplus. Module names are kept
    identical to the original ('fc1', 'bn1', 'ReLU1', 'ResBlk_i', 'fc2',
    'softplus') so saved state_dicts remain loadable.
    """

    def __init__(self, input_dim, hidden_dim=64, block_num=2):
        super(ResNetSequential, self).__init__()
        self.net = nn.Sequential()
        self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
        self.net.add_module('bn1', nn.BatchNorm1d(hidden_dim))
        self.net.add_module('ReLU1', nn.ReLU())
        for idx in range(1, block_num + 1):
            self.net.add_module('ResBlk_' + str(idx), ResBlock(hidden_dim))
        self.net.add_module('fc2', nn.Linear(hidden_dim, 1))
        self.net.add_module('softplus', nn.Softplus())

    def forward(self, x):
        """Run the whole stack; returns shape (batch, 1)."""
        return self.net(x)
242
+
243
# Scikit-learn-style wrapper around ResNetSequential: fit / predict /
# set_params, with Tweedie loss, optional AMP on CUDA, and early stopping.
# (Original comment mislabelled this as the Bayesian-optimisation class.)


class ResNetScikitLearn(nn.Module):
    """Estimator-like ResNet regressor trained with weighted Tweedie deviance.

    The Tweedie power is inferred from ``model_nme``: a name containing 'f'
    (frequency) uses power 1, one containing 's' (severity) uses power 2,
    otherwise ``tweedie_power`` is used. Mixed precision and pinned memory
    are enabled only on CUDA; multi-GPU training uses DataParallel when no
    explicit device was requested.
    """

    def __init__(self, model_nme, input_dim, hidden_dim=64,
                 block_num=2, batch_num=100, epochs=100,
                 tweedie_power=1.5, learning_rate=0.01, patience=10,
                 device: Optional[str] = None,
                 num_workers: Optional[int] = None):
        super(ResNetScikitLearn, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.block_num = block_num
        self.preferred_device = device
        self.device = select_device(device)
        self.batch_num = batch_num
        self.epochs = epochs
        self.model_nme = model_nme
        # Infer the Tweedie power from the model-name convention:
        # 'f' -> Poisson (1), 's' -> Gamma (2), else the explicit power.
        if self.model_nme.find('f') != -1:
            self.tw_power = 1
        elif self.model_nme.find('s') != -1:
            self.tw_power = 2
        else:
            self.tw_power = tweedie_power
        self.learning_rate = learning_rate
        self.patience = patience  # Early stopping patience (epochs without improvement)
        # Windows multiprocessing DataLoader workers are costly to spawn; use 0 there.
        default_workers = 0 if platform.system().lower().startswith(
            'win') else max(1, (os.cpu_count() or 2) - 1)
        self.num_workers = max(
            0, num_workers) if num_workers is not None else default_workers
        # Pinned host memory and mixed precision only pay off on CUDA.
        self.pin_memory = self.device.type == 'cuda'
        self._use_mixed_precision = self.device.type == 'cuda'
        self.resnet = ResNetSequential(
            self.input_dim,
            self.hidden_dim,
            self.block_num
        ).to(self.device)
        # Spread across all visible GPUs unless the caller pinned a device.
        if self.device.type == 'cuda' and torch.cuda.device_count() > 1 and self.preferred_device is None:
            self.resnet = nn.DataParallel(
                self.resnet,
                device_ids=list(range(torch.cuda.device_count()))
            )

    def fit(self, X_train, y_train, w_train=None, X_val=None, y_val=None, w_val=None):
        """Train on DataFrames/Series; optional sample weights and validation set.

        With a validation set, tracks the weighted validation deviance per
        epoch and restores the best state (early stopping after
        ``self.patience`` epochs without improvement). Returns None.
        """
        # Convert training data to tensors on the target device.
        X_tensor = torch.tensor(
            X_train.values, dtype=torch.float32).to(self.device)
        y_tensor = torch.tensor(
            y_train.values, dtype=torch.float32).view(-1, 1).to(self.device)
        w_tensor = torch.tensor(
            w_train.values, dtype=torch.float32).view(-1, 1).to(self.device) if w_train is not None else torch.ones_like(y_tensor)

        # Validation tensors (only built when a validation set is supplied).
        if X_val is not None:
            X_val_tensor = torch.tensor(
                X_val.values, dtype=torch.float32).to(self.device)
            y_val_tensor = torch.tensor(
                y_val.values, dtype=torch.float32).view(-1, 1).to(self.device)
            w_val_tensor = torch.tensor(
                w_val.values, dtype=torch.float32).view(-1, 1).to(self.device) if w_val is not None else torch.ones_like(y_val_tensor)

        # Dataset and loader. Batch size scales with sqrt(lr/1e-4) and with
        # rows/batch_num — a heuristic to keep gradient-noise roughly constant.
        dataset = TensorDataset(
            X_tensor, y_tensor, w_tensor
        )
        dataloader_kwargs = {
            'batch_size': max(1, int((self.learning_rate/(1e-4))**0.5 *
                                     (X_train.shape[0]/self.batch_num))),
            'shuffle': True,
            'num_workers': self.num_workers,
            'pin_memory': self.pin_memory
        }
        if self.num_workers > 0:
            # Keep worker processes alive between epochs.
            dataloader_kwargs['persistent_workers'] = True
        dataloader = DataLoader(dataset, **dataloader_kwargs)
        # Loss scaler + optimizer; GradScaler is a no-op when AMP is disabled.
        optimizer = torch.optim.Adam(
            self.resnet.parameters(), lr=self.learning_rate)
        scaler = GradScaler(enabled=self._use_mixed_precision)

        # Early-stopping state.
        best_loss, patience_counter = float('inf'), 0
        best_model_state = copy.deepcopy(self.resnet.state_dict())

        # Training loop.
        for epoch in range(1, self.epochs + 1):
            self.resnet.train()
            for X_batch, y_batch, w_batch in dataloader:
                optimizer.zero_grad()
                # Mixed precision is enabled only when running on CUDA.
                with autocast(enabled=self._use_mixed_precision):
                    X_batch, y_batch, w_batch = X_batch.to(self.device), y_batch.to(
                        self.device), w_batch.to(self.device)
                    y_pred = self.resnet(X_batch)
                    y_pred = torch.clamp(y_pred, min=1e-6)
                    losses = tweedie_loss(
                        y_pred, y_batch, p=self.tw_power).view(-1)
                    # Exposure-weighted mean deviance.
                    weighted_loss = (losses * w_batch.view(-1)
                                     ).sum() / w_batch.sum()
                scaler.scale(weighted_loss).backward()
                # Gradient clipping: gradients must be unscaled before
                # clipping when AMP scaling is active.
                if self._use_mixed_precision:
                    scaler.unscale_(optimizer)
                clip_grad_norm_(
                    self.resnet.parameters(),
                    max_norm=1.0
                )
                scaler.step(optimizer)
                scaler.update()

            # Validation loss for early stopping.
            if X_val is not None and y_val is not None:
                self.resnet.eval()
                with torch.no_grad(), autocast(enabled=self._use_mixed_precision):
                    y_val_pred = self.resnet(X_val_tensor)
                    val_loss_values = tweedie_loss(
                        y_val_pred, y_val_tensor, p=self.tw_power).view(-1)
                    val_weighted_loss = (
                        val_loss_values * w_val_tensor.view(-1)).sum() / w_val_tensor.sum()

                # Early-stopping bookkeeping.
                if val_weighted_loss < best_loss:
                    best_loss, patience_counter = val_weighted_loss, 0
                    # Snapshot the current best weights.
                    best_model_state = copy.deepcopy(self.resnet.state_dict())
                else:
                    patience_counter += 1
                    if patience_counter >= self.patience:
                        break
        # Restore the best weights seen during training.
        self.resnet.load_state_dict(best_model_state)

    def predict(self, X_test):
        """Return positive predictions as a flat numpy array."""
        self.resnet.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(
                X_test.values, dtype=torch.float32).to(self.device)
            y_pred = self.resnet(X_tensor).cpu().numpy()

        # Clamp to the same floor used in training for numerical stability.
        y_pred = np.clip(y_pred, 1e-6, None)
        return y_pred.flatten()

    def set_params(self, params):
        """Set hyperparameters by attribute name; raise on unknown keys.

        NOTE(review): this only rebinds attributes — it does not rebuild
        ``self.resnet``, so changing e.g. ``hidden_dim`` after construction
        has no effect until the network is recreated.
        """
        for key, value in params.items():
            if hasattr(self, key):
                setattr(self, key, value)
            else:
                raise ValueError(f"Parameter {key} not found in model.")
393
+
394
+ # 定义贝叶斯优化模型类,包含XGBoost和ResNet模型
395
+
396
+
397
+ class BayesOptModel:
398
+ def __init__(self, train_data, test_data,
399
+ model_nme, resp_nme, weight_nme, factor_nmes,
400
+ cate_list=[], prop_test=0.25, rand_seed=None, epochs=100,
401
+ output_dir: Optional[PathLike] = None,
402
+ device: Optional[str] = None,
403
+ resnet_workers: Optional[int] = None):
404
+ # 初始化数据
405
+ # train_data: 训练数据, test_data: 测试数据 格式需为DataFrame
406
+ # model_nme: 模型名称
407
+ # resp_nme: 因变量名称, weight_nme: 权重名称
408
+ # factor_nmes: 因子名称列表, space_params: 参数空间
409
+ # cate_list: 类别变量列表
410
+ # prop_test: 测试集比例, rand_seed
411
+ self.train_data = train_data
412
+ self.test_data = test_data
413
+ self.resp_nme = resp_nme
414
+ self.weight_nme = weight_nme
415
+ self.train_data.loc[:, 'w_act'] = self.train_data[self.resp_nme] * \
416
+ self.train_data[self.weight_nme]
417
+ self.test_data.loc[:, 'w_act'] = self.test_data[self.resp_nme] * \
418
+ self.test_data[self.weight_nme]
419
+ self.factor_nmes = factor_nmes
420
+ self.cate_list = cate_list
421
+ self.rand_seed = rand_seed if rand_seed is not None else np.random.randint(
422
+ 1, 10000)
423
+ if self.cate_list != []:
424
+ for cate in self.cate_list:
425
+ self.train_data[cate] = self.train_data[cate].astype(
426
+ 'category')
427
+ self.test_data[cate] = self.test_data[cate].astype('category')
428
+ self.prop_test = prop_test
429
+ self.cv = ShuffleSplit(n_splits=int(1/self.prop_test),
430
+ test_size=self.prop_test,
431
+ random_state=self.rand_seed)
432
+ self.model_nme = model_nme
433
+ if self.model_nme.find('f') != -1:
434
+ self.obj = 'count:poisson'
435
+ elif self.model_nme.find('s') != -1:
436
+ self.obj = 'reg:gamma'
437
+ elif self.model_nme.find('bc') != -1:
438
+ self.obj = 'reg:tweedie'
439
+ self.fit_params = {
440
+ 'sample_weight': self.train_data[self.weight_nme].values
441
+ }
442
+ self.num_features = [
443
+ nme for nme in self.factor_nmes if nme not in self.cate_list]
444
+ self.train_oht_scl_data = self.train_data[self.factor_nmes +
445
+ [self.weight_nme]+[self.resp_nme]].copy()
446
+ self.test_oht_scl_data = self.test_data[self.factor_nmes +
447
+ [self.weight_nme]+[self.resp_nme]].copy()
448
+ self.train_oht_scl_data = pd.get_dummies(
449
+ self.train_oht_scl_data,
450
+ columns=self.cate_list,
451
+ drop_first=True,
452
+ dtype=np.int8
453
+ )
454
+ self.test_oht_scl_data = pd.get_dummies(
455
+ self.test_oht_scl_data,
456
+ columns=self.cate_list,
457
+ drop_first=True,
458
+ dtype=np.int8
459
+ )
460
+ for num_chr in self.num_features:
461
+ scaler = StandardScaler()
462
+ self.train_oht_scl_data[num_chr] = scaler.fit_transform(
463
+ self.train_oht_scl_data[num_chr].values.reshape(-1, 1))
464
+ self.test_oht_scl_data[num_chr] = scaler.transform(
465
+ self.test_oht_scl_data[num_chr].values.reshape(-1, 1))
466
+ # 对测试集进行列对齐
467
+ self.test_oht_scl_data = self.test_oht_scl_data.reindex(
468
+ columns=self.train_oht_scl_data.columns,
469
+ fill_value=0
470
+ )
471
+ self.var_nmes = list(
472
+ set(list(self.train_oht_scl_data.columns)) -
473
+ set([self.weight_nme, self.resp_nme])
474
+ )
475
+ self.epochs = epochs
476
+ self.model_label = []
477
+ self.output_dir = Path(output_dir).expanduser().resolve(
478
+ ) if output_dir is not None else DEFAULT_OUTPUT_ROOT
479
+ self.device = device
480
+ self.resnet_workers = resnet_workers
481
+ self.xgb_use_gpu = torch.cuda.is_available() and detect_xgb_gpu_support()
482
+
483
    # One-way (single-factor) analysis plots on the training data.
    def plot_oneway(self, n_bins=10):
        """Save a one-way actual-vs-exposure plot per factor.

        Categorical factors are plotted as-is; numeric factors are first
        quantile-binned into ``n_bins`` (adding a '<col>_bins' column to
        ``self.train_data`` in place). Each figure is written to
        ``plot/00_<model_nme>_<factor>_oneway.png``. Returns None.
        """
        for c in self.factor_nmes:
            fig = plt.figure(figsize=(7, 5))
            if c in self.cate_list:
                strs = c
            else:
                # Quantile-bin numeric factors; duplicate edges collapsed.
                strs = c+'_bins'
                self.train_data.loc[:, strs] = pd.qcut(
                    self.train_data[c],
                    n_bins,
                    duplicates='drop'
                )
            plot_data = self.train_data.groupby(
                [strs], observed=True).sum(numeric_only=True)
            plot_data.reset_index(inplace=True)
            # Weighted average actual per level.
            plot_data['act_v'] = plot_data['w_act'] / \
                plot_data[self.weight_nme]
            plot_data.head()
            ax = fig.add_subplot(111)
            ax.plot(plot_data.index, plot_data['act_v'],
                    label='Actual', color='red')
            ax.set_title(
                'Analysis of %s : Train Data' % strs,
                fontsize=8)
            plt.xticks(plot_data.index,
                       list(plot_data[strs].astype(str)),
                       rotation=90)
            # Shrink labels when there are many levels.
            if len(list(plot_data[strs].astype(str))) > 50:
                plt.xticks(fontsize=3)
            else:
                plt.xticks(fontsize=6)
            plt.yticks(fontsize=6)
            # Exposure bars on a secondary axis.
            ax2 = ax.twinx()
            ax2.bar(plot_data.index,
                    plot_data[self.weight_nme],
                    alpha=0.5, color='seagreen')
            plt.yticks(fontsize=6)
            plt.margins(0.05)
            plt.subplots_adjust(wspace=0.3)
            save_path = resolve_output_path(
                'plot',
                f'00_{self.model_nme}_{strs}_oneway.png',
                base_dir=self.output_dir
            )
            plt.savefig(save_path, dpi=300)
            plt.close(fig)
530
+
531
    # XGBoost cross-validation objective for Optuna.
    def cross_val_xgb(self, trial):
        """Optuna objective: mean CV Tweedie deviance for a sampled XGB config.

        Samples the hyperparameters below from ``trial``, then scores an
        ``XGBRegressor`` with ``cross_val_score`` using the instance's
        ShuffleSplit CV and sample weights. Returns the (positive) mean
        deviance, to be minimised.
        """
        learning_rate = trial.suggest_float(
            'learning_rate', 1e-5, 1e-1, log=True)
        gamma = trial.suggest_float(
            'gamma', 0, 10000)
        max_depth = trial.suggest_int(
            'max_depth', 3, 25)
        n_estimators = trial.suggest_int(
            'n_estimators', 10, 500, step=10)
        min_child_weight = trial.suggest_int(
            'min_child_weight', 100, 10000, step=100)
        reg_alpha = trial.suggest_float(
            'reg_alpha', 1e-10, 1, log=True)
        reg_lambda = trial.suggest_float(
            'reg_lambda', 1e-10, 1, log=True)
        # Variance power: tuned only for the Tweedie objective; fixed at the
        # Poisson/Gamma values otherwise (it also drives the scorer below).
        if self.obj == 'reg:tweedie':
            tweedie_variance_power = trial.suggest_float(
                'tweedie_variance_power', 1, 2)
        elif self.obj == 'count:poisson':
            tweedie_variance_power = 1
        elif self.obj == 'reg:gamma':
            tweedie_variance_power = 2
        tree_method = 'gpu_hist' if self.xgb_use_gpu else 'hist'
        predictor = 'gpu_predictor' if self.xgb_use_gpu else 'auto'
        xgb_kwargs = {
            'objective': self.obj,
            'random_state': self.rand_seed,
            'subsample': 0.9,
            'tree_method': tree_method,
            'enable_categorical': True,
            'predictor': predictor
        }
        if self.xgb_use_gpu:
            xgb_kwargs['gpu_id'] = 0
        clf = xgb.XGBRegressor(**xgb_kwargs)
        params = {
            'learning_rate': learning_rate,
            'gamma': gamma,
            'max_depth': max_depth,
            'n_estimators': n_estimators,
            'min_child_weight': min_child_weight,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda
        }
        if self.obj == 'reg:tweedie':
            params['tweedie_variance_power'] = tweedie_variance_power
        clf.set_params(**params)
        # NOTE: `fit_params=` matches the pinned sklearn 1.2.2 API (renamed
        # to `params=` in sklearn >= 1.4). greater_is_better=False makes the
        # scorer return negated deviances, hence the sign flip on return.
        acc = cross_val_score(
            clf,
            self.train_data[self.factor_nmes],
            self.train_data[self.resp_nme].values,
            fit_params=self.fit_params,
            cv=self.cv,
            scoring=make_scorer(
                mean_tweedie_deviance,
                power=tweedie_variance_power,
                greater_is_better=False),
            error_score='raise',
            n_jobs=int(1/self.prop_test)).mean()
        return -acc
592
+
593
    # Run the XGBoost Bayesian-optimisation study and fit the best model.
    def bayesopt_xgb(self, max_evals=100):
        """Tune XGBoost with Optuna TPE, refit the best config, store predictions.

        Side effects: writes ``Results/<model_nme>_bestparams_xgb.csv``; sets
        ``self.best_xgb_params``, ``self.best_xgb_trial`` and ``self.xgb_best``;
        appends 'Xgboost' to ``self.model_label``; and adds 'pred_xgb' /
        'w_pred_xgb' columns to both train and test DataFrames. Returns None.
        """
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=self.rand_seed))
        study.optimize(self.cross_val_xgb, n_trials=max_evals)
        self.best_xgb_params = study.best_params
        # Persist the winning hyperparameters for audit/reuse.
        xgb_param_path = resolve_output_path(
            'Results',
            f'{self.model_nme}_bestparams_xgb.csv',
            base_dir=self.output_dir
        )
        pd.DataFrame(self.best_xgb_params, index=[0]).to_csv(xgb_param_path)
        self.best_xgb_trial = study.best_trial
        # Rebuild the estimator with the same device settings used during CV.
        tree_method = 'gpu_hist' if self.xgb_use_gpu else 'hist'
        predictor = 'gpu_predictor' if self.xgb_use_gpu else 'auto'
        xgb_best_kwargs = {
            'objective': self.obj,
            'random_state': self.rand_seed,
            'subsample': 0.9,
            'tree_method': tree_method,
            'enable_categorical': True,
            'predictor': predictor
        }
        if self.xgb_use_gpu:
            xgb_best_kwargs['gpu_id'] = 0
        self.xgb_best = xgb.XGBRegressor(**xgb_best_kwargs)
        self.xgb_best.set_params(**self.best_xgb_params)
        # Refit on the full training set with exposure weights.
        self.xgb_best.fit(self.train_data[self.factor_nmes],
                          self.train_data[self.resp_nme].values,
                          **self.fit_params)
        self.model_label += ['Xgboost']
        self.train_data['pred_xgb'] = self.xgb_best.predict(
            self.train_data[self.factor_nmes])
        self.test_data['pred_xgb'] = self.xgb_best.predict(
            self.test_data[self.factor_nmes])
        # Weighted predictions used by the lift-chart helpers.
        self.train_data.loc[:, 'w_pred_xgb'] = self.train_data['pred_xgb'] * \
            self.train_data[self.weight_nme]
        self.test_data.loc[:, 'w_pred_xgb'] = self.test_data['pred_xgb'] * \
            self.test_data[self.weight_nme]
633
+
634
+ # ResNet交叉验证函数
635
+ def cross_val_resn(self, trial):
636
+
637
+ learning_rate = trial.suggest_float(
638
+ 'learning_rate', 1e-6, 1e-2, log=True)
639
+ hidden_dim = trial.suggest_int(
640
+ 'hidden_dim', 32, 256, step=16)
641
+ block_num = trial.suggest_int(
642
+ 'block_num', 3, 10)
643
+ batch_num = trial.suggest_int(
644
+ 'batch_num',
645
+ 10 if self.obj == 'reg:gamma' else 100,
646
+ 100 if self.obj == 'reg:gamma' else 1000,
647
+ step=10)
648
+ if self.obj == 'reg:tweedie':
649
+ tw_power = trial.suggest_float(
650
+ 'tw_power', 1, 2.0)
651
+ elif self.obj == 'count:poisson':
652
+ tw_power = 1
653
+ elif self.obj == 'reg:gamma':
654
+ tw_power = 2
655
+ '''fold_num = int(1/self.prop_test)
656
+ kf = KFold(n_splits=fold_num, shuffle=True,
657
+ random_state=self.rand_seed)'''
658
+ loss = 0
659
+ for fold, (train_idx, test_idx) in enumerate(self.cv.split(self.train_oht_scl_data[self.var_nmes])):
660
+ # 创建模型
661
+ cv_net = ResNetScikitLearn(
662
+ model_nme=self.model_nme,
663
+ input_dim=self.train_oht_scl_data[self.var_nmes].shape[1],
664
+ epochs=self.epochs,
665
+ learning_rate=learning_rate,
666
+ hidden_dim=hidden_dim,
667
+ block_num=block_num,
668
+ # 保证权重方差不变
669
+ batch_num=batch_num,
670
+ tweedie_power=tw_power if self.obj == 'reg:tweedie' and tw_power != 1 else tw_power+1e-6,
671
+ device=self.device,
672
+ num_workers=self.resnet_workers
673
+ )
674
+ # 训练模型
675
+ cv_net.fit(
676
+ self.train_oht_scl_data[self.var_nmes].iloc[train_idx],
677
+ self.train_oht_scl_data[self.resp_nme].iloc[train_idx],
678
+ self.train_oht_scl_data[self.weight_nme].iloc[train_idx],
679
+ self.train_oht_scl_data[self.var_nmes].iloc[test_idx],
680
+ self.train_oht_scl_data[self.resp_nme].iloc[test_idx],
681
+ self.train_oht_scl_data[self.weight_nme].iloc[test_idx]
682
+ )
683
+ # 预测
684
+ y_pred_fold = cv_net.predict(
685
+ self.train_oht_scl_data[self.var_nmes].iloc[test_idx]
686
+ )
687
+ # 计算损失
688
+ loss += mean_tweedie_deviance(
689
+ self.train_oht_scl_data[self.resp_nme].iloc[test_idx],
690
+ y_pred_fold,
691
+ sample_weight=self.train_oht_scl_data[self.weight_nme].iloc[test_idx],
692
+ power=tw_power
693
+ )
694
+ return loss / int(1/self.prop_test)
695
+
696
+ # 定义ResNet贝叶斯优化函数
697
+ def bayesopt_resnet(self, max_evals=100):
698
+ study = optuna.create_study(
699
+ direction='minimize',
700
+ sampler=optuna.samplers.TPESampler(seed=self.rand_seed))
701
+ study.optimize(self.cross_val_resn, n_trials=max_evals)
702
+ self.best_resn_params = study.best_params
703
+ resn_param_path = resolve_output_path(
704
+ 'Results',
705
+ f'{self.model_nme}_bestparams_resn.csv',
706
+ base_dir=self.output_dir
707
+ )
708
+ pd.DataFrame(self.best_resn_params, index=[0]).to_csv(resn_param_path)
709
+ self.best_resn_trial = study.best_trial
710
+ self.resn_best = ResNetScikitLearn(
711
+ model_nme=self.model_nme,
712
+ input_dim=self.train_oht_scl_data[self.var_nmes].shape[1],
713
+ device=self.device,
714
+ num_workers=self.resnet_workers
715
+ )
716
+ self.resn_best.set_params(self.best_resn_params)
717
+ self.resn_best.fit(self.train_oht_scl_data[self.var_nmes],
718
+ self.train_oht_scl_data[self.resp_nme],
719
+ self.train_oht_scl_data[self.weight_nme])
720
+ self.model_label += ['ResNet']
721
+ self.train_data['pred_resn'] = self.resn_best.predict(
722
+ self.train_oht_scl_data[self.var_nmes])
723
+ self.test_data['pred_resn'] = self.resn_best.predict(
724
+ self.test_oht_scl_data[self.var_nmes])
725
+ self.train_data.loc[:, 'w_pred_resn'] = self.train_data['pred_resn'] * \
726
+ self.train_data[self.weight_nme]
727
+ self.test_data.loc[:, 'w_pred_resn'] = self.test_data['pred_resn'] * \
728
+ self.test_data[self.weight_nme]
729
+
730
+ # 定义分箱函数
731
+ def _split_data(self, data, col_nme, wgt_nme, n_bins=10):
732
+ data.sort_values(by=col_nme, ascending=True, inplace=True)
733
+ data['cum_weight'] = data[wgt_nme].cumsum()
734
+ w_sum = data[wgt_nme].sum()
735
+ data.loc[:, 'bins'] = np.floor(
736
+ data['cum_weight']*float(n_bins)/w_sum)
737
+ data.loc[(data['bins'] == n_bins), 'bins'] = n_bins-1
738
+ return data.groupby(['bins'], observed=True).sum(numeric_only=True)
739
+
740
+ # 定义Lift Chart绘制数据集函数
741
+ def _plot_data_lift(self,
742
+ pred_list, w_pred_list,
743
+ w_act_list, weight_list, n_bins=10):
744
+ lift_data = pd.DataFrame()
745
+ lift_data.loc[:, 'pred'] = pred_list
746
+ lift_data.loc[:, 'w_pred'] = w_pred_list
747
+ lift_data.loc[:, 'act'] = w_act_list
748
+ lift_data.loc[:, 'weight'] = weight_list
749
+ plot_data = self._split_data(
750
+ lift_data, 'pred', 'weight', n_bins)
751
+ plot_data['exp_v'] = plot_data['w_pred'] / plot_data['weight']
752
+ plot_data['act_v'] = plot_data['act'] / plot_data['weight']
753
+ plot_data.reset_index(inplace=True)
754
+ return plot_data
755
+
756
    # Draw lift charts (actual vs. predicted by prediction decile).
    def plot_lift(self, model_label, pred_nme, n_bins=10):
        """Plot actual-vs-predicted lift charts on train and test data.

        model_label: 'Xgboost' or 'ResNet' forces the matching prediction
        column; any other value leaves ``pred_nme`` as given.
        pred_nme: prediction column name; expects sibling columns
        'w_' + pred_nme (weighted prediction) and 'w_act' (weighted actual)
        to exist in both data frames.
        n_bins: number of equal-exposure buckets.

        Saves the figure under the 'plot' output folder and displays it.
        """
        # Left subplot (121) = train split, right subplot (122) = test split.
        figpos_list = [121, 122]
        plot_dict = {
            121: self.train_data,
            122: self.test_data
        }
        name_list = {
            121: 'Train Data',
            122: 'Test Data'
        }
        # Known model labels override the caller-supplied column name.
        if model_label == 'Xgboost':
            pred_nme = 'pred_xgb'
        elif model_label == 'ResNet':
            pred_nme = 'pred_resn'

        fig = plt.figure(figsize=(11, 5))
        for figpos in figpos_list:
            plot_data = self._plot_data_lift(
                plot_dict[figpos][pred_nme].values,
                plot_dict[figpos]['w_'+pred_nme].values,
                plot_dict[figpos]['w_act'].values,
                plot_dict[figpos][self.weight_nme].values,
                n_bins)
            ax = fig.add_subplot(figpos)
            ax.plot(plot_data.index, plot_data['act_v'],
                    label='Actual', color='red')
            ax.plot(plot_data.index, plot_data['exp_v'],
                    label='Predicted', color='blue')
            ax.set_title(
                'Lift Chart on %s' % name_list[figpos], fontsize=8)
            plt.xticks(plot_data.index,
                       plot_data.index,
                       rotation=90, fontsize=6)
            plt.yticks(fontsize=6)
            plt.legend(loc='upper left',
                       fontsize=5, frameon=False)
            plt.margins(0.05)
            # Secondary axis: exposure volume per bucket.
            ax2 = ax.twinx()
            ax2.bar(plot_data.index, plot_data['weight'],
                    alpha=0.5, color='seagreen',
                    label='Earned Exposure')
            plt.yticks(fontsize=6)
            plt.legend(loc='upper right',
                       fontsize=5, frameon=False)
        plt.subplots_adjust(wspace=0.3)
        save_path = resolve_output_path(
            'plot', f'01_{self.model_nme}_{model_label}_lift.png', base_dir=self.output_dir)
        plt.savefig(save_path, dpi=300)
        plt.show()
        plt.close(fig)
808
+
809
+ # 定义Double Lift Chart绘制数据集函数
810
+ def _plot_data_dlift(self,
811
+ pred_list_model1, pred_list_model2,
812
+ w_list, w_act_list, n_bins=10):
813
+ lift_data = pd.DataFrame()
814
+ lift_data.loc[:, 'pred1'] = pred_list_model1
815
+ lift_data.loc[:, 'pred2'] = pred_list_model2
816
+ lift_data.loc[:, 'diff_ly'] = lift_data['pred1'] / lift_data['pred2']
817
+ lift_data.loc[:, 'act'] = w_act_list
818
+ lift_data.loc[:, 'weight'] = w_list
819
+ plot_data = self._split_data(lift_data, 'diff_ly', 'weight', n_bins)
820
+ plot_data['exp_v1'] = plot_data['pred1'] / plot_data['act']
821
+ plot_data['exp_v2'] = plot_data['pred2'] / plot_data['act']
822
+ plot_data['act_v'] = plot_data['act'] / plot_data['act']
823
+ plot_data.reset_index(inplace=True)
824
+ return plot_data
825
+
826
    # Draw double lift charts comparing XGBoost and ResNet.
    def plot_dlift(self, n_bins=10):
        """Plot double lift charts (XGBoost vs. ResNet vs. actual) on train
        and test data, bucketing by the ratio of the two predictions.

        n_bins: number of equal-exposure buckets.

        Expects 'w_pred_xgb', 'w_pred_resn' and 'w_act' columns in both
        data frames (i.e. run after both bayesopt fits). Saves the figure
        under the 'plot' output folder and displays it.
        """
        # Left subplot (121) = train split, right subplot (122) = test split.
        figpos_list = [121, 122]
        plot_dict = {
            121: self.train_data,
            122: self.test_data
        }
        name_list = {
            121: 'Train Data',
            122: 'Test Data'
        }
        fig = plt.figure(figsize=(11, 5))
        for figpos in figpos_list:
            plot_data = self._plot_data_dlift(
                plot_dict[figpos]['w_pred_xgb'].values,
                plot_dict[figpos]['w_pred_resn'].values,
                plot_dict[figpos][self.weight_nme].values,
                plot_dict[figpos]['w_act'].values,
                n_bins)
            ax = fig.add_subplot(figpos)
            tt1 = 'Xgboost'
            tt2 = 'ResNet'
            ax.plot(plot_data.index, plot_data['act_v'],
                    label='Actual', color='red')
            ax.plot(plot_data.index, plot_data['exp_v1'],
                    label=tt1, color='blue')
            ax.plot(plot_data.index, plot_data['exp_v2'],
                    label=tt2, color='black')
            ax.set_title(
                'Double Lift Chart on %s' % name_list[figpos], fontsize=8)
            plt.xticks(plot_data.index,
                       plot_data.index,
                       rotation=90, fontsize=6)
            plt.xlabel('%s / %s' % (tt1, tt2), fontsize=6)
            plt.yticks(fontsize=6)
            plt.legend(loc='upper left',
                       fontsize=5, frameon=False)
            plt.margins(0.1)
            plt.subplots_adjust(bottom=0.25, top=0.95, right=0.8)
            # Secondary axis: exposure volume per bucket.
            ax2 = ax.twinx()
            ax2.bar(plot_data.index, plot_data['weight'],
                    alpha=0.5, color='seagreen',
                    label='Earned Exposure')
            plt.yticks(fontsize=6)
            plt.legend(loc='upper right',
                       fontsize=5, frameon=False)
        plt.subplots_adjust(wspace=0.3)
        save_path = resolve_output_path(
            'plot', f'02_{self.model_nme}_dlift.png', base_dir=self.output_dir)
        plt.savefig(save_path, dpi=300)
        plt.show()
        plt.close(fig)
879
+
880
+ # 保存模型
881
+ def save_model(self, model_name=None):
882
+ # model_name 可以是 'xgb', 'resn' 或 None
883
+ save_path_xgb = resolve_output_path(
884
+ 'model', f'01_{self.model_nme}_Xgboost.pkl', base_dir=self.output_dir)
885
+ save_path_resn = resolve_output_path(
886
+ 'model', f'01_{self.model_nme}_ResNet.pth', base_dir=self.output_dir)
887
+ # self.xgb_best.save_model(save_path_xgb)
888
+ if model_name != 'resn':
889
+ joblib.dump(self.xgb_best, save_path_xgb)
890
+ if model_name != 'xgb':
891
+ torch.save(self.resn_best.resnet.state_dict(), save_path_resn)
892
+
893
    def load_model(self, model_name=None):
        """Load previously saved models from the 'model' output folder.

        model_name: 'xgb' loads only XGBoost, 'resn' loads only ResNet,
        None (default) loads both. Results land in ``self.xgb_load`` and
        ``self.resn_load`` (the fitted ``*_best`` attributes are untouched).
        """
        # ensure_dir=False: we are reading, not creating output folders.
        save_path_xgb = resolve_output_path(
            'model', f'01_{self.model_nme}_Xgboost.pkl', base_dir=self.output_dir, ensure_dir=False)
        save_path_resn = resolve_output_path(
            'model', f'01_{self.model_nme}_ResNet.pth', base_dir=self.output_dir, ensure_dir=False)
        if model_name != 'resn':
            self.xgb_load = joblib.load(save_path_xgb)
        if model_name != 'xgb':
            # Rebuild the wrapper with the current feature width, then
            # restore the trained weights into its network.
            self.resn_load = ResNetScikitLearn(
                model_nme=self.model_nme,
                input_dim=self.train_oht_scl_data[self.var_nmes].shape[1],
                device=self.device,
                num_workers=self.resnet_workers
            )
            self.resn_load.resnet.load_state_dict(
                torch.load(save_path_resn, map_location=self.resn_load.device))