ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,313 @@
1
+ """
2
+ CLI entry point generated from BayesOpt_AutoPricing.ipynb so the workflow can
3
+ run non‑interactively (e.g., via torchrun).
4
+
5
+ Example:
6
+ torchrun --standalone --nproc_per_node=2 \\
7
+ python BayesOpt_entry.py \\
8
+ --config-json config.json \\
9
+ --model-key ft --max-evals 50 --use-ft-ddp
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import os
17
+ from itertools import combinations
18
+ from pathlib import Path
19
+ from typing import Dict, List, Tuple
20
+
21
+ import pandas as pd
22
+ from sklearn.model_selection import train_test_split
23
+
24
+ import BayesOpt as ropt
25
+
26
+ PLOT_MODEL_LABELS: Dict[str, Tuple[str, str]] = {
27
+ "glm": ("GLM", "pred_glm"),
28
+ "xgb": ("Xgboost", "pred_xgb"),
29
+ "resn": ("ResNet", "pred_resn"),
30
+ "ft": ("FTTransformer", "pred_ft"),
31
+ "gnn": ("GNN", "pred_gnn"),
32
+ }
33
+
34
+
35
+ def _parse_args() -> argparse.Namespace:
36
+ parser = argparse.ArgumentParser(
37
+ description="Batch trainer generated from BayesOpt_AutoPricing notebook."
38
+ )
39
+ parser.add_argument(
40
+ "--config-json",
41
+ required=True,
42
+ help="Path to the JSON config describing datasets and feature columns.",
43
+ )
44
+ parser.add_argument(
45
+ "--model-keys",
46
+ nargs="+",
47
+ default=["ft"],
48
+ choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
49
+ help="Space-separated list of trainers to run (e.g., --model-keys glm xgb). Include 'all' to run every trainer.",
50
+ )
51
+ parser.add_argument(
52
+ "--max-evals",
53
+ type=int,
54
+ default=50,
55
+ help="Optuna trial count per dataset.",
56
+ )
57
+ parser.add_argument(
58
+ "--use-resn-ddp",
59
+ action="store_true",
60
+ help="Force ResNet trainer to use DistributedDataParallel.",
61
+ )
62
+ parser.add_argument(
63
+ "--use-ft-ddp",
64
+ action="store_true",
65
+ help="Force FT-Transformer trainer to use DistributedDataParallel.",
66
+ )
67
+ parser.add_argument(
68
+ "--use-resn-dp",
69
+ action="store_true",
70
+ help="Enable ResNet DataParallel fall-back regardless of config.",
71
+ )
72
+ parser.add_argument(
73
+ "--use-ft-dp",
74
+ action="store_true",
75
+ help="Enable FT-Transformer DataParallel fall-back regardless of config.",
76
+ )
77
+ parser.add_argument(
78
+ "--use-gnn-dp",
79
+ action="store_true",
80
+ help="Enable GNN DataParallel fall-back regardless of config.",
81
+ )
82
+ parser.add_argument(
83
+ "--use-gnn-ddp",
84
+ action="store_true",
85
+ help="Force GNN trainer to use DistributedDataParallel.",
86
+ )
87
+ parser.add_argument(
88
+ "--use-gnn-faiss",
89
+ action="store_true",
90
+ help="Force GNN to build graphs with FAISS ANN (CPU/GPU depending on availability).",
91
+ )
92
+ parser.add_argument(
93
+ "--gnn-faiss-cpu",
94
+ action="store_true",
95
+ help="Force FAISS ANN to run on CPU even if GPUs are visible.",
96
+ )
97
+ return parser.parse_args()
98
+
99
+
100
+ def _load_config(path: Path) -> Dict:
101
+ # JSON 配置中包含数据集路径、特征/标签字段以及绘图开关。
102
+ cfg = json.loads(path.read_text(encoding="utf-8"))
103
+ required = [
104
+ "data_dir",
105
+ "model_list",
106
+ "model_categories",
107
+ "target",
108
+ "weight",
109
+ "feature_list",
110
+ "categorical_features",
111
+ ]
112
+ missing = [key for key in required if key not in cfg]
113
+ if missing:
114
+ raise ValueError(f"Missing required keys in {path}: {missing}")
115
+ return cfg
116
+
117
+
118
+ def _set_env(env_overrides: Dict[str, str]) -> None:
119
+ # 仅在环境变量未设置时进行填充。
120
+ for key, value in env_overrides.items():
121
+ os.environ.setdefault(key, str(value))
122
+
123
+
124
+ def _build_model_names(prefixes: List[str], suffixes: List[str]) -> List[str]:
125
+ # 生成基础模型名与类别的笛卡尔积(如 prod/gl/)。
126
+ names: List[str] = []
127
+ for suffix in suffixes:
128
+ names.extend(f"{prefix}_{suffix}" for prefix in prefixes)
129
+ return names
130
+
131
+
132
+ def _dedupe_preserve_order(items: List[str]) -> List[str]:
133
+ # 去重但保留首个出现顺序。
134
+ seen = set()
135
+ unique_items: List[str] = []
136
+ for item in items:
137
+ if item not in seen:
138
+ unique_items.append(item)
139
+ seen.add(item)
140
+ return unique_items
141
+
142
+
143
+ def _parse_model_pairs(raw_pairs: List) -> List[Tuple[str, str]]:
144
+ # 兼容 [["glm","xgb"]] 或 "glm,xgb" 两种格式。
145
+ pairs: List[Tuple[str, str]] = []
146
+ for pair in raw_pairs:
147
+ if isinstance(pair, (list, tuple)) and len(pair) == 2:
148
+ pairs.append((str(pair[0]), str(pair[1])))
149
+ elif isinstance(pair, str):
150
+ parts = [p.strip() for p in pair.split(",") if p.strip()]
151
+ if len(parts) == 2:
152
+ pairs.append((parts[0], parts[1]))
153
+ return pairs
154
+
155
+
156
def _plot_curves_for_model(model: ropt.BayesOptModel, trained_keys: List[str], cfg: Dict) -> None:
    """Emit one-way, lift and double-lift plots for one trained model.

    Plot selection comes from cfg["plot"]; the flat legacy booleans
    ("plot_lift_glm", ...) are honoured as a fallback when the "plot"
    section does not say otherwise.
    """
    plot_cfg = cfg.get("plot", {})
    # Legacy flat flags kept for backwards compatibility with older configs.
    legacy_lift_flags = {
        "glm": cfg.get("plot_lift_glm", False),
        "xgb": cfg.get("plot_lift_xgb", False),
        "resn": cfg.get("plot_lift_resn", False),
        "ft": cfg.get("plot_lift_ft", False),
    }
    # Plotting is on when plot.enable says so, defaulting to "on if any
    # legacy flag is set".
    plot_enabled = plot_cfg.get("enable", any(legacy_lift_flags.values()))
    if not plot_enabled:
        return

    n_bins = int(plot_cfg.get("n_bins", 10))
    oneway_enabled = plot_cfg.get("oneway", True)

    # Only keys that were actually trained AND have a label mapping can be
    # plotted; order of first appearance is preserved.
    available_models = _dedupe_preserve_order(
        [m for m in trained_keys if m in PLOT_MODEL_LABELS]
    )

    if oneway_enabled:
        model.plot_oneway(n_bins=n_bins)

    if not available_models:
        return

    # Precedence: explicit plot.lift_models wins; otherwise fall back to the
    # legacy flags; if none of those are set either, plot every available model.
    lift_models = plot_cfg.get("lift_models")
    if lift_models is None:
        lift_models = [
            m for m, enabled in legacy_lift_flags.items() if enabled]
        if not lift_models:
            lift_models = available_models
    lift_models = _dedupe_preserve_order(
        [m for m in lift_models if m in available_models]
    )

    for model_key in lift_models:
        label, pred_nme = PLOT_MODEL_LABELS[model_key]
        model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=n_bins)

    # Double-lift comparisons need at least two trained models.
    if not plot_cfg.get("double_lift", True) or len(available_models) < 2:
        return

    raw_pairs = plot_cfg.get("double_lift_pairs")
    if raw_pairs:
        # Keep only pairs whose two members are distinct, available models.
        pairs = [
            (a, b)
            for a, b in _parse_model_pairs(raw_pairs)
            if a in available_models and b in available_models and a != b
        ]
    else:
        # No explicit pairs: compare every combination of available models.
        pairs = list(combinations(available_models, 2))

    for first, second in pairs:
        model.plot_dlift([first, second], n_bins=n_bins)
210
+
211
+
212
def train_from_config(args: argparse.Namespace) -> None:
    """Run the full training workflow described by the CLI arguments.

    Loads the JSON config, trains every requested trainer on every dataset
    generated from model_list x model_categories, saves each fitted trainer,
    and finally draws the configured diagnostic plots.

    Raises:
        ValueError: no model names could be generated, or a requested
            trainer key is unavailable for a dataset.
        FileNotFoundError: a dataset CSV is missing from data_dir.
    """
    cfg = _load_config(Path(args.config_json))

    # Fill environment variables from the config without overriding values
    # already set by the caller.
    _set_env(cfg.get("env", {}))

    data_dir = Path(cfg["data_dir"]).resolve()
    data_dir.mkdir(parents=True, exist_ok=True)

    prop_test = cfg.get("prop_test", 0.25)
    rand_seed = cfg.get("rand_seed", 13)
    epochs = cfg.get("epochs", 50)

    model_names = _build_model_names(
        cfg["model_list"], cfg["model_categories"])
    if not model_names:
        raise ValueError(
            "No model names generated from model_list/model_categories.")

    results: Dict[str, ropt.BayesOptModel] = {}
    trained_keys_by_model: Dict[str, List[str]] = {}

    for model_name in model_names:
        # Per-dataset training loop: load the CSV, split train/test, then
        # train the requested models.
        csv_path = data_dir / f"{model_name}.csv"
        if not csv_path.exists():
            raise FileNotFoundError(f"Missing dataset: {csv_path}")

        print(f"\n=== Processing model {model_name} ===")
        raw = pd.read_csv(csv_path, low_memory=False)
        raw.fillna(0, inplace=True)

        train_df, test_df = train_test_split(
            raw, test_size=prop_test, random_state=rand_seed
        )

        # CLI flags take precedence; otherwise fall back to config values.
        # Note ft data-parallel defaults to True, the others to False.
        use_resn_dp = args.use_resn_dp or cfg.get(
            "use_resn_data_parallel", False)
        use_ft_dp = args.use_ft_dp or cfg.get("use_ft_data_parallel", True)
        use_resn_ddp = args.use_resn_ddp or cfg.get("use_resn_ddp", False)
        use_ft_ddp = args.use_ft_ddp or cfg.get("use_ft_ddp", False)
        use_gnn_dp = args.use_gnn_dp or cfg.get("use_gnn_data_parallel", False)
        use_gnn_ddp = args.use_gnn_ddp or cfg.get("use_gnn_ddp", False)
        use_gnn_faiss = args.use_gnn_faiss or cfg.get("use_gnn_faiss", False)
        # FAISS runs on GPU unless the config disables it or the
        # --gnn-faiss-cpu flag forces CPU.
        use_gnn_faiss_gpu = cfg.get(
            "use_gnn_faiss_gpu", True) and (not args.gnn_faiss_cpu)

        model = ropt.BayesOptModel(
            train_df,
            test_df,
            model_name,
            cfg["target"],
            cfg["weight"],
            cfg["feature_list"],
            cate_list=cfg["categorical_features"],
            prop_test=prop_test,
            rand_seed=rand_seed,
            epochs=epochs,
            use_resn_data_parallel=use_resn_dp,
            use_ft_data_parallel=use_ft_dp,
            use_resn_ddp=use_resn_ddp,
            use_ft_ddp=use_ft_ddp,
            use_gnn_data_parallel=use_gnn_dp,
            use_gnn_ddp=use_gnn_ddp,
            use_gnn_faiss=use_gnn_faiss,
            use_gnn_faiss_gpu=use_gnn_faiss_gpu,
        )

        # 'all' expands to every known trainer key.
        if "all" in args.model_keys:
            requested_keys = ["glm", "xgb", "resn", "ft", "gnn"]
        else:
            requested_keys = args.model_keys
        requested_keys = _dedupe_preserve_order(requested_keys)
        missing = [key for key in requested_keys if key not in model.trainers]
        if missing:
            raise ValueError(
                f"Trainer(s) {missing} not available for {model_name}")

        for key in requested_keys:
            print(
                f"Optimizing {key} for {model_name} (max_evals={args.max_evals})")
            model.optimize_model(key, max_evals=args.max_evals)
            model.trainers[key].save()
            # Release GPU memory between trainers.
            ropt.free_cuda()

        results[model_name] = model
        trained_keys_by_model[model_name] = requested_keys

    # Plot only after every dataset has been trained.
    for name, model in results.items():
        _plot_curves_for_model(
            model,
            trained_keys_by_model.get(name, []),
            cfg,
        )
305
+
306
+
307
def main() -> None:
    """CLI entry point: parse the arguments and launch the training run."""
    train_from_config(_parse_args())


if __name__ == "__main__":
    main()
@@ -0,0 +1,359 @@
1
+ from random import sample
2
+ import numpy as np # 1.26.2
3
+ import pandas as pd # 2.2.3
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import os
8
+ import re
9
+ import optuna
10
+ import xgboost as xgb # 1.7.0
11
+ import joblib
12
+
13
+ from torch.utils.data import DataLoader, TensorDataset
14
+ from sklearn.model_selection import KFold, train_test_split, ShuffleSplit, cross_val_score # 1.2.2
15
+ from torch.utils.data import DataLoader, TensorDataset
16
+ from sklearn.preprocessing import StandardScaler
17
+ from sklearn.metrics import make_scorer, mean_tweedie_deviance
18
+
19
# Torch implementation of the Tweedie deviance, mirroring sklearn's
# mean_tweedie_deviance special cases.
# Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#mean-poisson-gamma-and-tweedie-deviances
def tweedie_loss(pred, target, p=1.5):
    """Element-wise Tweedie deviance between ``pred`` and ``target``.

    Special cases: p=0 Gaussian (squared error), p=1 Poisson, p=2 Gamma;
    any other p uses the generic compound-Poisson form. Predictions are
    clamped to a small positive floor for numerical stability; a small eps
    is also added inside the logs so exact zeros do not produce -inf.

    Returns a tensor of the same shape as the inputs (always >= 0 up to
    the eps perturbation).
    """
    eps = 1e-6
    pred_clamped = torch.clamp(pred, min=eps)
    if p == 1:
        # Poisson deviance: 2 * (t*log(t/mu) - t + mu).
        term1 = target * torch.log(target / pred_clamped + eps)
        # BUG FIX: the original used term2 = -target + pred_clamped, which
        # (combined with the shared "term1 - term2 + term3" return) flipped
        # the sign of the linear part and produced negative deviances.
        term2 = target - pred_clamped
        term3 = 0
    elif p == 0:
        # Gaussian deviance: (t - mu)^2.
        term1 = 0.5 * torch.pow(target - pred_clamped, 2)
        term2 = 0
        term3 = 0
    elif p == 2:
        # Gamma deviance: 2 * (log(mu/t) + t/mu - 1).
        term1 = torch.log(pred_clamped / target + eps)
        term2 = -target / pred_clamped + 1
        term3 = 0
    else:
        # Generic Tweedie deviance for p not in {0, 1, 2}.
        term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
        term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
        term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
    # Tweedie negative log-likelihood (up to a constant).
    return 2 * (term1 - term2 + term3)
47
+
48
class xgb_bayesopt:
    """Optuna-based Bayesian hyper-parameter search for an XGBoost regressor.

    The objective (Poisson / Gamma / Tweedie) is inferred from the model
    name, cross-validation uses a ShuffleSplit whose fold count is derived
    from ``prop_test``, and ``bayesopt`` refits the best trial on the full
    training set and stores predictions on both datasets.
    """

    def __init__(self, train_data, test_data,
                 model_nme, resp_nme, weight_nme, factor_nmes,
                 int_p_list=None, cate_list=None,
                 prop_test=0.25, rand_seed=None):
        """Store the data and build the (not yet tuned) XGBoost estimator.

        train_data / test_data: pandas DataFrames (mutated in place:
            categorical columns are cast, predictions written later).
        model_nme: model name; drives the objective choice below.
        resp_nme / weight_nme: response and sample-weight column names.
        factor_nmes: feature column names.
        int_p_list: names of integer-valued hyper-parameters
            (defaults to ['n_estimators', 'max_depth']).
        cate_list: categorical feature columns (cast to 'category' dtype).
        prop_test: held-out share per CV split; rand_seed: RNG seed.
        """
        # BUG FIX: use None defaults instead of shared mutable defaults.
        if int_p_list is None:
            int_p_list = ['n_estimators', 'max_depth']
        if cate_list is None:
            cate_list = []
        self.train_data = train_data
        self.test_data = test_data
        self.resp_nme = resp_nme
        self.weight_nme = weight_nme
        self.factor_nmes = factor_nmes
        self.cate_list = cate_list
        self.rand_seed = rand_seed if rand_seed is not None else np.random.randint(
            1, 10000)
        # XGBoost consumes categoricals natively when enable_categorical=True.
        for cate in self.cate_list:
            self.train_data[cate] = self.train_data[cate].astype('category')
            self.test_data[cate] = self.test_data[cate].astype('category')
        self.prop_test = prop_test
        # Fold count mirrors the test share (e.g. 0.25 -> 4 splits).
        self.cv = ShuffleSplit(n_splits=int(1 / self.prop_test),
                               test_size=self.prop_test,
                               random_state=self.rand_seed)
        self.model_nme = model_nme
        # Naming convention: 'f' -> frequency (Poisson), 's' -> severity
        # (Gamma), 'bc' -> burning cost (Tweedie).
        if self.model_nme.find('f') != -1:
            self.obj = 'count:poisson'
        elif self.model_nme.find('s') != -1:
            self.obj = 'reg:gamma'
        elif self.model_nme.find('bc') != -1:
            self.obj = 'reg:tweedie'
        else:
            # The original silently left self.obj unset and crashed later
            # with AttributeError; fail fast with a clear message instead.
            raise ValueError(
                f"Cannot infer objective from model name: {model_nme!r}")
        self.int_p_list = int_p_list
        self.clf = xgb.XGBRegressor(objective=self.obj,
                                    random_state=self.rand_seed,
                                    subsample=0.9,
                                    tree_method='gpu_hist',
                                    gpu_id=0,
                                    enable_categorical=True,
                                    predictor='gpu_predictor')
        self.fit_params = {
            'sample_weight': self.train_data[self.weight_nme].values
        }

    def cross_val_xgb(self, trial):
        """Optuna objective: mean CV Tweedie deviance for a sampled config."""
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
        gamma = trial.suggest_float('gamma', 0, 10000)
        max_depth = trial.suggest_int('max_depth', 3, 25)
        n_estimators = trial.suggest_int('n_estimators', 10, 500, step=10)
        min_child_weight = trial.suggest_float('min_child_weight', 1, 10000)
        reg_alpha = trial.suggest_float('reg_alpha', 1e-10, 1, log=True)
        reg_lambda = trial.suggest_float('reg_lambda', 1e-10, 1, log=True)
        # The deviance power used for scoring follows the objective; it is
        # only a tunable for the Tweedie objective.
        if self.obj == 'reg:tweedie':
            tweedie_variance_power = trial.suggest_float('tweedie_variance_power', 1, 2)
        elif self.obj == 'count:poisson':
            tweedie_variance_power = 1
        elif self.obj == 'reg:gamma':
            tweedie_variance_power = 2
        params = {
            'learning_rate': learning_rate,
            'gamma': gamma,
            'max_depth': int(max_depth),
            'n_estimators': int(n_estimators),
            'min_child_weight': min_child_weight,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
        }
        # Only the Tweedie objective accepts tweedie_variance_power.
        if self.obj == 'reg:tweedie':
            params['tweedie_variance_power'] = tweedie_variance_power
        self.clf.set_params(**params)
        acc = cross_val_score(self.clf,
                              self.train_data[self.factor_nmes],
                              self.train_data[self.resp_nme].values,
                              fit_params=self.fit_params,
                              cv=self.cv,
                              scoring=make_scorer(mean_tweedie_deviance,
                                                  power=tweedie_variance_power,
                                                  greater_is_better=False),
                              error_score='raise',
                              n_jobs=int(1 / self.prop_test)).mean()
        # cross_val_score negates the scorer, so negate back to a deviance.
        return -acc

    def bayesopt(self, max_evals=100):
        """Run the study, refit the best model, and store predictions."""
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=self.rand_seed))
        study.optimize(self.cross_val_xgb, n_trials=max_evals)
        self.best_params = study.best_params
        self.best_trial = study.best_trial
        self.clf.set_params(**self.best_params)
        # BUG FIX: the original line ended with '**self.fit_params))' — an
        # unbalanced parenthesis that made the module unimportable.
        self.clf.fit(self.train_data[self.factor_nmes],
                     self.train_data[self.resp_nme].values,
                     **self.fit_params)
        self.train_data['pred'] = self.clf.predict(
            self.train_data[self.factor_nmes])
        self.test_data['pred'] = self.clf.predict(
            self.test_data[self.factor_nmes])
153
+
154
+ # 定义ResNet模型
155
+
156
class ResBlock(nn.Module):
    """Residual block: Linear -> ReLU -> Linear plus a skip connection."""

    def __init__(self, dim):
        super().__init__()
        # Two affine layers separated by a ReLU; the input is added back
        # before the final activation in forward().
        self.block = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
        )

    def forward(self, x):
        # Skip connection, then ReLU over the sum.
        transformed = self.block(x)
        return F.relu(transformed + x)
169
+
170
class ResNetSequential(nn.Module):
    """Whole network as one nn.Sequential: input fc -> ReLU -> block_num
    ResBlocks -> output fc -> Softplus (keeps predictions positive)."""

    def __init__(self, input_dim, hidden_dim=64, block_num=2):
        super().__init__()
        self.net = nn.Sequential()
        # Explicit module names keep state_dict keys stable.
        self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
        self.net.add_module('ReLU1', nn.ReLU())
        for idx in range(1, block_num + 1):
            self.net.add_module('ResBlk_' + str(idx), ResBlock(hidden_dim))
        self.net.add_module('fc2', nn.Linear(hidden_dim, 1))
        self.net.add_module('softplus', nn.Softplus())

    def forward(self, x):
        return self.net(x)
184
+
185
class ResNetScikitLearn:
    """Minimal scikit-learn-style wrapper around ResNetSequential.

    Trains with a weighted Tweedie deviance loss; the Tweedie power is
    inferred from the model name ('f' -> Poisson, 's' -> Gamma, otherwise
    the supplied tweedie_power).
    """

    def __init__(self, model_nme, input_dim, hidden_dim=64,
                 block_num=2, batch_size=32, epochs=100,
                 tweedie_power=1.5, learning_rate=0.01):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.block_num = block_num
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.resnet = ResNetSequential(
            self.input_dim,
            self.hidden_dim,
            self.block_num,
        ).to(self.device)
        self.batch_size = batch_size
        self.epochs = epochs
        self.model_nme = model_nme
        # Infer the Tweedie power from the model-name convention.
        if self.model_nme.find('f') != -1:
            self.tw_power = 1
        elif self.model_nme.find('s') != -1:
            self.tw_power = 2
        else:
            self.tw_power = tweedie_power
        self.learning_rate = learning_rate

    def fit(self, X_train, y_train, w_train=None):
        """Train on DataFrame/Series inputs; w_train defaults to unit weights."""
        X_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(self.device)
        y_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(self.device)
        # BUG FIX: the original built a 2-tensor dataset when w_train was
        # None but the batch loop still unpacked three values, crashing on
        # any unweighted fit. Unit weights make both paths identical.
        if w_train is not None:
            w_tensor = torch.tensor(w_train.values, dtype=torch.float32).view(-1, 1).to(self.device)
        else:
            w_tensor = torch.ones_like(y_tensor)
        dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        optimizer = torch.optim.Adam(self.resnet.parameters(), lr=self.learning_rate)

        for epoch in range(1, self.epochs + 1):
            self.resnet.train()
            total_loss = 0.0
            total_weight = 0.0
            for X_batch, y_batch, w_batch in dataloader:
                X_batch = X_batch.to(self.device)
                y_batch = y_batch.to(self.device)
                w_batch = w_batch.to(self.device)
                optimizer.zero_grad()
                y_pred = self.resnet(X_batch)
                # Weighted mean of the per-row Tweedie deviance.
                loss_values = tweedie_loss(y_pred, y_batch, p=self.tw_power).view(-1)
                weighted_loss = (loss_values * w_batch.view(-1)).sum() / w_batch.sum()
                weighted_loss.backward()
                optimizer.step()
                # Running totals (kept for parity with the original; not
                # currently reported anywhere).
                total_loss += weighted_loss.item() * w_batch.sum().item()
                total_weight += w_batch.sum().item()

    def predict(self, X_test):
        """Return flat numpy predictions for a feature DataFrame."""
        self.resnet.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(self.device)
            y_pred = self.resnet(X_tensor).cpu().numpy()
        return y_pred.flatten()

    def set_params(self, params):
        """Assign hyper-parameters by name; unknown names raise ValueError.

        NOTE(review): this only sets attributes — it does not rebuild
        self.resnet, so changing hidden_dim/block_num after __init__ has no
        effect on the already-constructed network (matches original behavior).
        """
        for key, value in params.items():
            if hasattr(self, key):
                setattr(self, key, value)
            else:
                raise ValueError(f"Parameter {key} not found in model.")
254
+
255
class ResNetBayesOpt:
    """Optuna hyper-parameter search wrapper around ResNetScikitLearn.

    __init__ one-hot encodes categoricals and standard-scales numeric
    features once; cross_val_resn scores a trial with K-fold CV on the
    weighted Tweedie deviance; bayesopt refits the best configuration on
    the full training set and stores predictions on both datasets.
    """

    def __init__(self, train_data, test_data,
                 model_nme, resp_nme, weight_nme, factor_nmes,
                 int_p_list=['hidden_dim', 'block_num', 'batch_size'],
                 cate_list=[], prop_test=0.25, rand_seed=None, epochs=100):
        # train_data / test_data: pandas DataFrames ('pred' columns are
        # written onto them by bayesopt()).
        # model_nme: model name; drives the objective choice below.
        # resp_nme / weight_nme: response and sample-weight column names.
        # factor_nmes: feature column names.
        # int_p_list: integer-valued hyper-parameters; cate_list: categorical columns.
        # prop_test: test share; rand_seed: RNG seed; epochs: epochs per fit.
        # NOTE(review): the list defaults are shared mutable objects — they
        # are only read here, but treat them as read-only.
        self.train_data = train_data
        self.test_data = test_data
        self.resp_nme = resp_nme
        self.weight_nme = weight_nme
        self.factor_nmes = factor_nmes
        self.cate_list = cate_list
        # Numeric features = everything not declared categorical.
        self.num_features = [nme for nme in self.factor_nmes if nme not in self.cate_list]
        # Work on copies restricted to the needed columns.
        self.train_oht_scl_data = self.train_data[self.factor_nmes +\
            [self.weight_nme]+[self.resp_nme]].copy()
        self.test_oht_scl_data = self.test_data[self.factor_nmes +\
            [self.weight_nme]+[self.resp_nme]].copy()
        # One-hot encode categoricals, dropping the first level of each.
        self.train_oht_scl_data = pd.get_dummies(self.train_oht_scl_data, columns=self.cate_list,
                                                 drop_first=True, dtype=np.int8)
        self.test_oht_scl_data = pd.get_dummies(self.test_oht_scl_data, columns=self.cate_list,
                                                drop_first=True, dtype=np.int8)
        # Scale each numeric column; the scaler is fit on train and reused
        # on test to avoid leakage.
        for num_chr in self.num_features:
            scaler = StandardScaler()
            self.train_oht_scl_data[num_chr] = scaler.fit_transform(
                self.train_oht_scl_data[num_chr].values.reshape(-1, 1))
            self.test_oht_scl_data[num_chr] = scaler.transform(
                self.test_oht_scl_data[num_chr].values.reshape(-1, 1))

        # Model inputs = every remaining column except response and weight.
        self.var_nmes = [nme for nme in self.train_oht_scl_data.columns if nme not in [self.resp_nme, self.weight_nme]]
        self.rand_seed = rand_seed if rand_seed is not None else np.random.randint(
            1, 10000)
        self.prop_test = prop_test
        self.model_nme = model_nme
        # Naming convention: 'f' -> Poisson, 's' -> Gamma, 'bc' -> Tweedie.
        # NOTE(review): if the name matches none of these, self.obj is left
        # unset and cross_val_resn raises AttributeError — confirm the
        # naming convention with callers.
        if self.model_nme.find('f') != -1:
            self.obj = 'count:poisson'
        elif self.model_nme.find('s') != -1:
            self.obj = 'reg:gamma'
        elif self.model_nme.find('bc') != -1:
            self.obj = 'reg:tweedie'
        self.int_p_list = int_p_list
        self.epochs = epochs

    def cross_val_resn(self, trial):
        """Optuna objective: mean K-fold Tweedie deviance for one trial."""
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
        hidden_dim = trial.suggest_int('hidden_dim', 8, 128)
        block_num = trial.suggest_int('block_num', 2, 10)
        # batch_size is sampled on a 200-step grid and cast to int below.
        batch_size = trial.suggest_float('batch_size', 200, 10000, step=200)
        # The deviance power follows the objective; only Tweedie tunes it.
        # NOTE(review): suggest_uniform is deprecated in recent Optuna;
        # suggest_float is the modern equivalent.
        if self.obj == 'reg:tweedie':
            tw_power = trial.suggest_uniform('tw_power', 0, 2.0)
        elif self.obj == 'count:poisson':
            tw_power = 1
        elif self.obj == 'reg:gamma':
            tw_power = 2
        fold_num = int(1/self.prop_test)
        kf = KFold(n_splits=fold_num, shuffle=True, random_state=self.rand_seed)
        loss = 0
        for fold, (train_idx, test_idx) in enumerate(kf.split(self.train_oht_scl_data[self.var_nmes])):
            # Fresh network per fold.
            cv_net = ResNetScikitLearn(
                model_nme=self.model_nme,
                input_dim= self.train_oht_scl_data[self.var_nmes].shape[1],
                epochs=self.epochs,
                learning_rate=learning_rate,
                hidden_dim=hidden_dim,
                block_num=block_num,
                batch_size=int(batch_size),
                tweedie_power=tw_power)
            # Train on this fold's training rows (weighted).
            cv_net.fit(self.train_oht_scl_data[self.var_nmes].iloc[train_idx],
                       self.train_oht_scl_data[self.resp_nme].iloc[train_idx],
                       self.train_oht_scl_data[self.weight_nme].iloc[train_idx])
            # Score on the held-out rows.
            y_pred_fold = cv_net.predict(self.train_oht_scl_data[self.var_nmes].iloc[test_idx])
            # Accumulate the weighted deviance over folds.
            loss += mean_tweedie_deviance(self.train_oht_scl_data[self.resp_nme].iloc[test_idx],
                                          y_pred_fold,
                                          sample_weight=self.train_oht_scl_data[self.weight_nme].iloc[test_idx],
                                          power=tw_power)
        return loss / fold_num

    def bayesopt(self, max_evals=100):
        """Run the study, refit the best configuration, store predictions."""
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=self.rand_seed))
        study.optimize(self.cross_val_resn, n_trials=max_evals)
        self.best_params = study.best_params
        self.best_trial = study.best_trial
        # NOTE(review): set_params only assigns attributes after the network
        # is already built, so width/depth best-params do not change the
        # constructed net here — confirm this matches intent.
        self.clf = ResNetScikitLearn(
            model_nme=self.model_nme,
            input_dim=self.train_oht_scl_data[self.var_nmes].shape[1])
        self.clf.set_params(self.best_params)
        self.clf.fit(self.train_oht_scl_data[self.var_nmes],
                     self.train_oht_scl_data[self.resp_nme],
                     self.train_oht_scl_data[self.weight_nme])
        self.train_data['pred'] = self.clf.predict(
            self.train_oht_scl_data[self.var_nmes])
        self.test_data['pred'] = self.clf.predict(
            self.test_oht_scl_data[self.var_nmes])