ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,249 @@
1
+ from random import sample
2
+ import pandas as pd
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import os
8
+ import optuna
9
+
10
+ # from hyperopt import plotting, fmin, hp, tpe, Trials, STATUS_OK # 0.2.7
11
+ from torch.utils.data import DataLoader, TensorDataset
12
+ from sklearn.model_selection import KFold, train_test_split
13
+ from torch.utils.data import DataLoader, TensorDataset
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.metrics import mean_tweedie_deviance
16
+
17
def tweedie_loss(pred, target, p=1.5):
    """Per-element Tweedie deviance between predictions and targets.

    Parameters
    ----------
    pred : torch.Tensor
        Model predictions; clamped to a small positive value for stability.
    target : torch.Tensor
        Observed responses (same shape as ``pred``).
    p : float
        Tweedie power: 0 = Gaussian, 1 = Poisson, 2 = Gamma, otherwise the
        general compound-Poisson form.

    Returns
    -------
    torch.Tensor
        Element-wise deviance ``2 * (term1 - term2 + term3)``.
    """
    # Ensure predictions are positive for stability
    eps = 1e-6
    pred_clamped = torch.clamp(pred, min=eps)
    if p == 1:
        # Poisson deviance: 2 * (y*log(y/mu) - y + mu).
        term1 = target * torch.log(target / pred_clamped + eps)
        # Fix: the original used (-target + pred), which flips the sign of
        # the linear part and yields 2*(y*log(y/mu) + y - mu).
        term2 = target - pred_clamped
        term3 = 0
    elif p == 0:
        # Gaussian case: reduces to squared error (y - mu)^2.
        term1 = 0.5 * torch.pow(target - pred_clamped, 2)
        term2 = 0
        term3 = 0
    elif p == 2:
        # Gamma deviance: 2 * (log(mu/y) + y/mu - 1).
        term1 = torch.log(pred_clamped / target + eps)
        term2 = -target / pred_clamped + 1
        term3 = 0
    else:
        # General Tweedie deviance for other powers.
        term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
        term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
        term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
    # Tweedie negative log-likelihood (up to a constant)
    return 2 * (term1 - term2 + term3)
43
+
44
class ResBlock(nn.Module):
    """Residual block: two linear layers with a skip connection.

    The block computes ``relu(W2(relu(W1(x))) + x)``; input and output share
    the same feature dimension ``dim``.
    """

    def __init__(self, dim):
        super().__init__()
        inner = [nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim)]
        self.block = nn.Sequential(*inner)

    def forward(self, x):
        # Add the skip connection before the final activation.
        residual = self.block(x)
        return F.relu(residual + x)
57
+
58
class ResNetSequential(nn.Module):
    """Feed-forward ResNet assembled with ``nn.Sequential``.

    Layout: Linear(input_dim, hidden_dim) -> ReLU -> ``block_num`` residual
    blocks -> Linear(hidden_dim, 1) -> Softplus (keeps outputs positive).
    """

    def __init__(self, input_dim, hidden_dim=64, block_num=2):
        super().__init__()
        net = nn.Sequential()
        net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
        net.add_module('ReLU1', nn.ReLU())
        for idx in range(1, block_num + 1):
            net.add_module('ResBlk_' + str(idx), ResBlock(hidden_dim))
        net.add_module('fc2', nn.Linear(hidden_dim, 1))
        net.add_module('softplus', nn.Softplus())
        self.net = net

    def forward(self, x):
        return self.net(x)
72
+
73
class ResNetScikitLearn:
    """Scikit-learn-style wrapper around ResNetSequential trained with a
    weighted Tweedie deviance loss.

    ``model_nme`` encodes the distribution: a name containing 'f' selects
    Poisson (power 1), one containing 's' selects Gamma (power 2), otherwise
    ``tweedie_power`` is used directly.
    """

    def __init__(self, model_nme, input_dim, hidden_dim=64,
                 block_num=2, batch_size=32, epochs=100,
                 tweedie_power=1.5, learning_rate=0.01):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.block_num = block_num
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.batch_size = batch_size
        self.epochs = epochs
        self.model_nme = model_nme
        if self.model_nme.find('f') != -1:
            self.tw_power = 1   # frequency model -> Poisson
        elif self.model_nme.find('s') != -1:
            self.tw_power = 2   # severity model -> Gamma
        else:
            self.tw_power = tweedie_power
        self.learning_rate = learning_rate
        self._build_network()

    def _build_network(self):
        """(Re)create the underlying network from the current structural params."""
        self.resnet = ResNetSequential(
            self.input_dim,
            int(self.hidden_dim),
            int(self.block_num),
        ).to(self.device)

    def fit(self, X_train, y_train, w_train=None):
        """Train on DataFrame/Series inputs; ``w_train=None`` means unit weights."""
        X_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(self.device)
        y_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(self.device)
        if w_train is not None:
            w_tensor = torch.tensor(w_train.values, dtype=torch.float32).view(-1, 1).to(self.device)
        else:
            # Fix: the original built a 2-tensor dataset when w_train was None,
            # but the training loop always unpacks three tensors and crashed.
            w_tensor = torch.ones_like(y_tensor)
        dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
        # int() guards against float batch sizes coming from hyperparameter search.
        dataloader = DataLoader(dataset, batch_size=int(self.batch_size), shuffle=True)

        optimizer = torch.optim.Adam(self.resnet.parameters(), lr=self.learning_rate)

        for epoch in range(1, self.epochs + 1):
            self.resnet.train()
            total_loss = 0.0
            total_weight = 0.0
            for X_batch, y_batch, w_batch in dataloader:
                X_batch = X_batch.to(self.device)
                y_batch = y_batch.to(self.device)
                w_batch = w_batch.to(self.device)
                optimizer.zero_grad()
                y_pred = self.resnet(X_batch)
                loss_values = tweedie_loss(y_pred, y_batch, p=self.tw_power).view(-1)
                # Weighted mean deviance over the batch.
                weighted_loss = (loss_values * w_batch.view(-1)).sum() / w_batch.sum()
                weighted_loss.backward()
                optimizer.step()
                total_loss += weighted_loss.item() * w_batch.sum().item()
                total_weight += w_batch.sum().item()
            # Epoch-level weighted loss, kept for optional logging.
            avg_loss = total_loss / total_weight
            # print(f"Epoch {epoch}/{self.epochs}, Training Loss: {avg_loss:.4f}")

    def predict(self, X_test):
        """Return flat numpy predictions for a DataFrame of features."""
        self.resnet.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(self.device)
            y_pred = self.resnet(X_tensor).cpu().numpy()
        return y_pred.flatten()

    def set_params(self, params):
        """Set hyperparameters by attribute name.

        Fix: the original only assigned attributes, so structural changes
        (hidden_dim/block_num/input_dim) never reached the already-built
        network; we now rebuild it when any of those change.

        Raises
        ------
        ValueError
            If a key does not match an existing attribute.
        """
        structural = {'input_dim', 'hidden_dim', 'block_num'}
        rebuild = False
        for key, value in params.items():
            if not hasattr(self, key):
                raise ValueError(f"Parameter {key} not found in model.")
            setattr(self, key, value)
            rebuild = rebuild or key in structural
        if rebuild:
            self._build_network()
145
+
146
class ResNetBayesOpt:
    """Optuna (TPE) hyperparameter search for ResNetScikitLearn.

    Parameters
    ----------
    basic_data : pd.DataFrame
        Source data holding response, weight and factor columns.
    model_nme : str
        Model name; encodes the objective ('f' -> Poisson, 's' -> Gamma,
        'bc' -> Tweedie).
    resp_nme, weight_nme : str
        Response and weight column names.
    factor_nmes : list[str]
        Feature columns; ``cate_list`` marks the categorical subset
        (one-hot encoded with drop_first).
    int_p_list : list[str]
        Hyperparameters cast to int before use.
    prop_test : float
        Test share; also sets the CV fold count as ``int(1/prop_test)``.
    """

    def __init__(self, basic_data, model_nme,
                 resp_nme, weight_nme, factor_nmes,
                 int_p_list=None, cate_list=None,
                 prop_test=0.25, rand_seed=None, epochs=100):
        # Fix: avoid mutable default arguments (shared across instances).
        if int_p_list is None:
            int_p_list = ['hidden_dim', 'block_num', 'batch_size']
        if cate_list is None:
            cate_list = []
        self.basic_data = basic_data
        self.resp_nme = resp_nme
        self.weight_nme = weight_nme
        self.factor_nmes = factor_nmes
        self.cate_list = cate_list
        self.num_features = [nme for nme in self.factor_nmes if nme not in self.cate_list]
        # Weighted actuals, kept on the source frame for downstream reporting.
        self.basic_data.loc[:, 'w_act'] = self.basic_data[self.resp_nme] * \
            self.basic_data[self.weight_nme]
        self.proc_data = self.basic_data[self.factor_nmes +
                                         [self.weight_nme] + [self.resp_nme]].copy()
        self.proc_data = pd.get_dummies(self.proc_data, columns=self.cate_list,
                                        drop_first=True, dtype=np.int8)
        train_data, test_data = train_test_split(
            self.proc_data, test_size=prop_test, random_state=rand_seed)
        # Standardise numeric features; each scaler is fit on train only.
        for num_chr in self.num_features:
            scaler = StandardScaler()
            train_data[num_chr] = scaler.fit_transform(
                train_data[num_chr].values.reshape(-1, 1))
            test_data[num_chr] = scaler.transform(
                test_data[num_chr].values.reshape(-1, 1))
        self.X_train = train_data.drop([self.weight_nme, self.resp_nme], axis=1).copy()
        self.y_train = train_data[resp_nme].copy()
        self.w_train = train_data[weight_nme].copy()
        self.X_test = test_data.drop([self.weight_nme, self.resp_nme], axis=1).copy()
        self.y_test = test_data[resp_nme].copy()
        self.w_test = test_data[weight_nme].copy()
        self.rand_seed = rand_seed if rand_seed is not None else np.random.randint(1, 10000)
        self.prop_test = prop_test
        self.model_nme = model_nme
        if self.model_nme.find('f') != -1:
            self.obj = 'count:poisson'
        elif self.model_nme.find('s') != -1:
            self.obj = 'reg:gamma'
        elif self.model_nme.find('bc') != -1:
            self.obj = 'reg:tweedie'
        else:
            # Fix: the original left self.obj unset for other names and
            # crashed later in cross_val_func with AttributeError.
            self.obj = 'reg:tweedie'
        self.int_p_list = int_p_list
        self.epochs = epochs

    def cross_val_func(self, trial):
        """One Optuna trial: mean KFold CV Tweedie deviance for sampled params."""
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
        hidden_dim = trial.suggest_int('hidden_dim', 8, 128)
        block_num = trial.suggest_int('block_num', 2, 10)
        batch_size = trial.suggest_float('batch_size', 200, 10000, step=200)
        if self.obj == 'reg:tweedie':
            # suggest_float replaces the deprecated suggest_uniform.
            tw_power = trial.suggest_float('tw_power', 0, 2.0)
        elif self.obj == 'count:poisson':
            tw_power = 1
        else:  # 'reg:gamma'
            tw_power = 2
        fold_num = int(1 / self.prop_test)
        kf = KFold(n_splits=fold_num, shuffle=True, random_state=self.rand_seed)
        loss = 0
        for fold, (train_idx, test_idx) in enumerate(kf.split(self.X_train)):
            cv_net = ResNetScikitLearn(
                model_nme=self.model_nme,
                input_dim=self.X_train.shape[1],
                epochs=self.epochs,
                learning_rate=learning_rate,
                hidden_dim=hidden_dim,
                block_num=block_num,
                batch_size=int(batch_size),
                tweedie_power=tw_power)
            cv_net.fit(self.X_train.iloc[train_idx],
                       self.y_train.iloc[train_idx],
                       self.w_train.iloc[train_idx])
            y_pred_fold = cv_net.predict(self.X_train.iloc[test_idx])
            loss += mean_tweedie_deviance(self.y_train.iloc[test_idx],
                                          y_pred_fold,
                                          sample_weight=self.w_train.iloc[test_idx],
                                          power=tw_power)
        return loss / fold_num

    def bayesopt(self, max_evals=100):
        """Run the study, then refit the best configuration on all training data."""
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=self.rand_seed))
        study.optimize(self.cross_val_func, n_trials=max_evals)
        self.best_params = study.best_params
        self.best_trial = study.best_trial
        best = dict(self.best_params)
        # Fix: build the final model directly from the best params. The
        # original built a default-sized network and then called set_params,
        # which never rebuilt the network, so the tuned architecture was
        # silently ignored.
        self.ResNet_best = ResNetScikitLearn(
            model_nme=self.model_nme,
            input_dim=self.X_train.shape[1],
            epochs=self.epochs,
            learning_rate=best.get('learning_rate', 0.01),
            hidden_dim=int(best.get('hidden_dim', 64)),
            block_num=int(best.get('block_num', 2)),
            batch_size=int(best.get('batch_size', 32)),
            tweedie_power=best.get('tw_power', 1.5))
        self.ResNet_best.fit(self.X_train, self.y_train, self.w_train)
@@ -0,0 +1,121 @@
1
+ from sklearn.model_selection import ShuffleSplit, cross_val_score # 1.2.2
2
+ from hyperopt import plotting, fmin, hp, tpe, Trials, STATUS_OK # 0.2.7
3
+ from sklearn.metrics import make_scorer, mean_tweedie_deviance # 1.2.2
4
+
5
+ import shap # 0.44.1
6
+ import xgboost as xgb # 1.7.0
7
+ import joblib
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np # 1.26.2
10
+ import pandas as pd # 2.2.3
11
+ import os
12
+ import re
13
+
14
class xgb_bayesopt:
    """Bayesian hyperparameter search (hyperopt TPE) for an XGBoost regressor.

    Parameters
    ----------
    train_data, test_data : pd.DataFrame
        Data frames holding response, weight and factor columns.
    model_nme : str
        Encodes the objective: a name containing 'f' -> Poisson frequency,
        's' -> Gamma severity, 'bc' -> Tweedie burning cost.
    resp_nme, weight_nme : str
        Response and weight column names.
    factor_nmes : list[str]
        Feature columns; ``cate_list`` marks the categorical subset.
    space_params : dict
        hyperopt search space; copied so the caller's dict is not mutated.
    int_p_list : list[str]
        Parameters cast to int before fitting.
    prop_test : float
        Validation share; also sets the CV split count as ``int(1/prop_test)``.
    """

    def __init__(self, train_data, test_data,
                 model_nme, resp_nme, weight_nme,
                 factor_nmes, space_params,
                 int_p_list=None, cate_list=None,
                 prop_test=0.25, rand_seed=None):
        # Fix: avoid mutable default arguments (shared across instances).
        if int_p_list is None:
            int_p_list = ['n_estimators', 'max_depth']
        if cate_list is None:
            cate_list = []
        self.train_data = train_data
        self.test_data = test_data
        self.resp_nme = resp_nme
        self.weight_nme = weight_nme
        self.factor_nmes = factor_nmes
        # Weighted actuals, kept for downstream reporting.
        self.train_data.loc[:, 'w_act'] = self.train_data[self.resp_nme] * \
            self.train_data[self.weight_nme]
        self.test_data.loc[:, 'w_act'] = self.test_data[self.resp_nme] * \
            self.test_data[self.weight_nme]
        self.cate_list = cate_list
        # Fix: copy the space so deleting keys below does not mutate the
        # caller's dict.
        self.space_params = dict(space_params)
        self.rand_seed = rand_seed if rand_seed is not None else np.random.randint(1, 10000)
        for cate in self.cate_list:
            self.train_data[cate] = self.train_data[cate].astype('category')
            self.test_data[cate] = self.test_data[cate].astype('category')
        self.prop_test = prop_test
        self.cv = ShuffleSplit(n_splits=int(1 / self.prop_test),
                               test_size=self.prop_test,
                               random_state=self.rand_seed)
        self.model_nme = model_nme
        if self.model_nme.find('f') != -1:
            self.obj = 'count:poisson'
        elif self.model_nme.find('s') != -1:
            self.obj = 'reg:gamma'
        elif self.model_nme.find('bc') != -1:
            self.obj = 'reg:tweedie'
        else:
            # Fix: the original left self.obj unset for other names and
            # crashed later; default to the most general objective.
            self.obj = 'reg:tweedie'

        if self.obj != 'reg:tweedie':
            # Fix: pop() instead of del so a space without this key is fine.
            self.space_params.pop('tweedie_variance_power', None)
        self.int_p_list = int_p_list
        # NOTE(review): gpu_hist/gpu_id/predictor are the pre-2.0 XGBoost GPU
        # options; kept as-is to match the pinned xgboost version.
        self.clf = xgb.XGBRegressor(objective=self.obj,
                                    random_state=self.rand_seed,
                                    subsample=0.9,
                                    tree_method='gpu_hist',
                                    gpu_id=0,
                                    enable_categorical=True,
                                    predictor='gpu_predictor')
        self.fit_params = {
            'sample_weight': self.train_data[self.weight_nme].values
        }

    def cross_val_xgb(self, params):
        """hyperopt objective: CV mean Tweedie deviance for one parameter set."""
        # Cast search-space floats that must be integers.
        for param_name in self.int_p_list:
            params[param_name] = int(params[param_name])
        self.clf.set_params(**params)
        if self.obj == 'reg:tweedie':
            tw_power = params['tweedie_variance_power']
        elif self.obj == 'count:poisson':
            tw_power = 1
        else:  # 'reg:gamma'
            tw_power = 2
        # greater_is_better=False makes the scorer negative, so acc <= 0 and
        # -acc below is the (positive) mean deviance to minimise.
        acc = cross_val_score(self.clf,
                              self.train_data[self.factor_nmes],
                              self.train_data[self.resp_nme].values,
                              fit_params=self.fit_params,
                              cv=self.cv,
                              scoring=make_scorer(mean_tweedie_deviance,
                                                  power=tw_power,
                                                  greater_is_better=False),
                              error_score='raise',
                              n_jobs=int(1 / self.prop_test)).mean()
        return {'loss': -acc, 'params': params, 'status': STATUS_OK}

    def bayesopt(self, max_evals=100):
        """Run TPE search, persist best params to CSV and refit on all data."""
        self.trials = Trials()
        self.best = fmin(self.cross_val_xgb, self.space_params,
                         algo=tpe.suggest,
                         max_evals=max_evals, trials=self.trials)
        for param_name in self.int_p_list:
            self.best[param_name] = int(self.best[param_name])
        out_dir = os.path.join(os.getcwd(), 'Results')
        # Fix: the original assumed ./Results already existed.
        os.makedirs(out_dir, exist_ok=True)
        pd.DataFrame(self.best, index=[0]).to_csv(
            os.path.join(out_dir, self.model_nme + '_bestparams_xgb.csv'))
        self.clf.set_params(**self.best)
        self.clf.fit(self.train_data[self.factor_nmes],
                     self.train_data[self.resp_nme],
                     **self.fit_params)

    def output_model(self):
        """Persist the fitted estimator (works after bayesopt or initial fit)."""
        out_dir = os.path.join(os.getcwd(), 'Results')
        os.makedirs(out_dir, exist_ok=True)
        joblib.dump(self.clf, os.path.join(out_dir, self.model_nme + '_xgb.pkl'))

    def pred(self, data):
        """Predict on the factor columns of ``data`` with the current estimator."""
        return self.clf.predict(data[self.factor_nmes])
120
+
121
+