ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,249 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional
4
+ from pydantic import BaseModel, Field, validator
5
+
6
class DataConfig(BaseModel):
    """Column names and sampling options describing the modelling dataset."""
    resp_nme: str  # response (target) column name
    weight_nme: str  # exposure/weight column name
    factor_nmes: List[str]  # all predictor columns (numeric + categorical)
    cate_list: Optional[List[str]] = None  # subset of factor_nmes treated as categorical
    binary_resp_nme: Optional[str] = None  # optional binary response column
    task_type: str = 'regression'  # task flavour; other accepted values not visible here — confirm with consumers
    prop_test: float = 0.25  # proportion of data held out for testing
    rand_seed: Optional[int] = None  # seed for reproducibility; None means unseeded
15
+
16
class DistributedConfig(BaseModel):
    """GPU / multi-GPU execution switches, one pair of flags per model family.

    `*_data_parallel` selects torch DataParallel-style execution, `*_ddp`
    selects DistributedDataParallel — presumably mutually exclusive per
    family (resn / ft / gnn); verify against the trainer code.
    """
    use_gpu: bool = True  # master switch for GPU execution
    use_resn_data_parallel: bool = False
    use_ft_data_parallel: bool = False
    use_gnn_data_parallel: bool = False
    use_resn_ddp: bool = False
    use_ft_ddp: bool = False
    use_gnn_ddp: bool = False
    # DDP Timeout settings can be passed via env, but good to have here if needed
25
+
26
class GNNConfig(BaseModel):
    """Settings for building the KNN graph used by the GNN models."""
    use_approx_knn: bool = True  # allow approximate KNN for large node counts
    approx_knn_threshold: int = 50000  # node count above which approximate KNN kicks in
    graph_cache: Optional[str] = None  # optional path for caching the built graph
    max_gpu_knn_nodes: Optional[int] = 200000  # cap on nodes for GPU KNN; None = uncapped
    knn_gpu_mem_ratio: float = 0.9  # fraction of GPU memory budgeted for KNN
    knn_gpu_mem_overhead: float = 2.0  # safety multiplier on estimated KNN memory use
33
+
34
class RegionConfig(BaseModel):
    """Region-effect feature settings (province/city credibility encoding)."""
    province_col: Optional[str] = None  # province column; None disables the province effect
    city_col: Optional[str] = None  # city column; None disables the city effect
    effect_alpha: float = 50.0  # shrinkage strength for the region effect — TODO confirm units with consumer
38
+
39
class GeoTokenConfig(BaseModel):
    """Hyperparameters for the geo-token embedding model."""
    feature_nmes: Optional[List[str]] = None  # geo feature columns; None = geo tokens disabled, presumably
    hidden_dim: int = 32  # embedding/hidden width
    layers: int = 2  # number of layers
    dropout: float = 0.1
    k_neighbors: int = 10  # neighbors per node in the geo graph
    learning_rate: float = 1e-3
    epochs: int = 50
47
+
48
class OptunaConfig(BaseModel):
    """Optuna study persistence and best-parameter reuse options."""
    storage: Optional[str] = None  # Optuna storage URL; None = in-memory study
    study_prefix: Optional[str] = None  # prefix for study names
    best_params_files: Optional[Dict[str, str]] = None  # model name -> saved best-params file
    reuse_best_params: bool = False  # skip search and load best params when available
53
+
54
class FTConfig(BaseModel):
    """FT-Transformer usage settings."""
    role: str = "model"  # "model", "embedding", "unsupervised_embedding"
    feature_prefix: str = "ft_emb"  # column prefix for emitted embedding features
    num_numeric_tokens: Optional[int] = None  # token count for numeric features; None = library default, presumably
58
+
59
class BayesOptConfig(BaseModel):
    """Top-level configuration for the BayesOpt training pipeline.

    Settings are grouped into nested sub-models (data, dist, gnn, region,
    geo, optuna, ft).  Flat read-only properties mirror the nested fields
    so legacy callers that used a flat attribute namespace keep working,
    and `from_legacy_dict` builds the nested structure from a flat dict.
    """
    # Core Data & Task
    data: DataConfig

    # Model Names & Meta
    model_nme: str

    # Training Hyperparameters
    epochs: int = 100
    xgb_max_depth_max: int = 25  # upper bound of the XGBoost max_depth search range
    xgb_n_estimators_max: int = 500  # upper bound of the XGBoost n_estimators search range
    resn_weight_decay: float = 1e-4

    # Sub-component Configs (each defaults to an all-default instance)
    dist: DistributedConfig = Field(default_factory=DistributedConfig)
    gnn: GNNConfig = Field(default_factory=GNNConfig)
    region: RegionConfig = Field(default_factory=RegionConfig)
    geo: GeoTokenConfig = Field(default_factory=GeoTokenConfig)
    optuna: OptunaConfig = Field(default_factory=OptunaConfig)
    ft: FTConfig = Field(default_factory=FTConfig)

    # Ensemble & output
    output_dir: Optional[str] = None
    final_ensemble: bool = False
    final_ensemble_k: int = 3  # number of members in the final ensemble
    final_refit: bool = True

    # Flattened accessors for backward compatibility.
    # These are plain read-only properties delegating to the nested models;
    # assignments must go through the nested fields.
    @property
    def resp_nme(self): return self.data.resp_nme
    @property
    def weight_nme(self): return self.data.weight_nme
    @property
    def factor_nmes(self): return self.data.factor_nmes
    @property
    def task_type(self): return self.data.task_type
    @property
    def cate_list(self): return self.data.cate_list
    @property
    def binary_resp_nme(self): return self.data.binary_resp_nme
    @property
    def prop_test(self): return self.data.prop_test
    @property
    def rand_seed(self): return self.data.rand_seed

    @property
    def use_gpu(self): return self.dist.use_gpu
    @property
    def use_resn_data_parallel(self): return self.dist.use_resn_data_parallel
    @property
    def use_ft_data_parallel(self): return self.dist.use_ft_data_parallel
    @property
    def use_gnn_data_parallel(self): return self.dist.use_gnn_data_parallel
    @property
    def use_resn_ddp(self): return self.dist.use_resn_ddp
    @property
    def use_ft_ddp(self): return self.dist.use_ft_ddp
    @property
    def use_gnn_ddp(self): return self.dist.use_gnn_ddp

    @property
    def gnn_use_approx_knn(self): return self.gnn.use_approx_knn
    @property
    def gnn_approx_knn_threshold(self): return self.gnn.approx_knn_threshold
    @property
    def gnn_graph_cache(self): return self.gnn.graph_cache
    @property
    def gnn_max_gpu_knn_nodes(self): return self.gnn.max_gpu_knn_nodes
    @property
    def gnn_knn_gpu_mem_ratio(self): return self.gnn.knn_gpu_mem_ratio
    @property
    def gnn_knn_gpu_mem_overhead(self): return self.gnn.knn_gpu_mem_overhead

    @property
    def region_province_col(self): return self.region.province_col
    @property
    def region_city_col(self): return self.region.city_col
    @property
    def region_effect_alpha(self): return self.region.effect_alpha

    @property
    def geo_feature_nmes(self): return self.geo.feature_nmes
    @property
    def geo_token_hidden_dim(self): return self.geo.hidden_dim
    @property
    def geo_token_layers(self): return self.geo.layers
    @property
    def geo_token_dropout(self): return self.geo.dropout
    @property
    def geo_token_k_neighbors(self): return self.geo.k_neighbors
    @property
    def geo_token_learning_rate(self): return self.geo.learning_rate
    @property
    def geo_token_epochs(self): return self.geo.epochs

    @property
    def optuna_storage(self): return self.optuna.storage
    @property
    def optuna_study_prefix(self): return self.optuna.study_prefix
    @property
    def best_params_files(self): return self.optuna.best_params_files
    @property
    def reuse_best_params(self): return self.optuna.reuse_best_params

    @property
    def ft_role(self): return self.ft.role
    @property
    def ft_feature_prefix(self): return self.ft.feature_prefix
    @property
    def ft_num_numeric_tokens(self): return self.ft.num_numeric_tokens

    @classmethod
    def from_legacy_dict(cls, d: Dict[str, Any]) -> 'BayesOptConfig':
        """Map flat dictionary to nested Pydantic structure.

        Keys use the legacy flat names (e.g. ``gnn_graph_cache``); missing
        keys fall back to the same defaults as the nested models.  Missing
        required keys (``resp_nme``, ``weight_nme``) surface as pydantic
        validation errors because ``d.get`` yields ``None`` for them.
        """
        data = DataConfig(
            resp_nme=d.get('resp_nme'),
            weight_nme=d.get('weight_nme'),
            factor_nmes=d.get('factor_nmes', []),
            cate_list=d.get('cate_list'),
            binary_resp_nme=d.get('binary_resp_nme'),
            task_type=d.get('task_type', 'regression'),
            prop_test=d.get('prop_test', 0.25),
            rand_seed=d.get('rand_seed')
        )

        dist = DistributedConfig(
            use_gpu=d.get('use_gpu', True),
            use_resn_data_parallel=d.get('use_resn_data_parallel', False),
            use_ft_data_parallel=d.get('use_ft_data_parallel', False),
            use_gnn_data_parallel=d.get('use_gnn_data_parallel', False),
            use_resn_ddp=d.get('use_resn_ddp', False),
            use_ft_ddp=d.get('use_ft_ddp', False),
            use_gnn_ddp=d.get('use_gnn_ddp', False),
        )

        gnn = GNNConfig(
            use_approx_knn=d.get('gnn_use_approx_knn', True),
            approx_knn_threshold=d.get('gnn_approx_knn_threshold', 50000),
            graph_cache=d.get('gnn_graph_cache'),
            max_gpu_knn_nodes=d.get('gnn_max_gpu_knn_nodes', 200000),
            knn_gpu_mem_ratio=d.get('gnn_knn_gpu_mem_ratio', 0.9),
            knn_gpu_mem_overhead=d.get('gnn_knn_gpu_mem_overhead', 2.0),
        )

        region = RegionConfig(
            province_col=d.get('region_province_col'),
            city_col=d.get('region_city_col'),
            effect_alpha=d.get('region_effect_alpha', 50.0)
        )

        geo = GeoTokenConfig(
            feature_nmes=d.get('geo_feature_nmes'),
            hidden_dim=d.get('geo_token_hidden_dim', 32),
            layers=d.get('geo_token_layers', 2),
            dropout=d.get('geo_token_dropout', 0.1),
            k_neighbors=d.get('geo_token_k_neighbors', 10),
            learning_rate=d.get('geo_token_learning_rate', 1e-3),
            epochs=d.get('geo_token_epochs', 50)
        )

        optuna = OptunaConfig(
            storage=d.get('optuna_storage'),
            study_prefix=d.get('optuna_study_prefix'),
            best_params_files=d.get('best_params_files'),
            reuse_best_params=d.get('reuse_best_params', False)
        )

        ft = FTConfig(
            role=d.get('ft_role', 'model'),
            feature_prefix=d.get('ft_feature_prefix', 'ft_emb'),
            num_numeric_tokens=d.get('ft_num_numeric_tokens')
        )

        return cls(
            data=data,
            model_nme=d.get('model_nme', 'model'),
            epochs=d.get('epochs', 100),
            xgb_max_depth_max=d.get('xgb_max_depth_max', 25),
            xgb_n_estimators_max=d.get('xgb_n_estimators_max', 500),
            resn_weight_decay=d.get('resn_weight_decay', 1e-4),
            dist=dist,
            gnn=gnn,
            region=region,
            geo=geo,
            optuna=optuna,
            ft=ft,
            output_dir=d.get('output_dir'),
            final_ensemble=d.get('final_ensemble', False),
            final_ensemble_k=d.get('final_ensemble_k', 3),
            final_refit=d.get('final_refit', True)
        )
@@ -0,0 +1,254 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from dataclasses import dataclass
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from sklearn.preprocessing import StandardScaler
13
+
14
+ from .utils import IOUtils
15
+ from .config import BayesOptConfig
16
+
17
+
18
+ # NOTE: Some CSV exports may contain invisible BOM characters or leading/trailing
19
+ # spaces in column names. Pandas requires exact matches, so we normalize a few
20
+ # "required" column names (response/weight/binary response) before validating.
21
+
22
+
23
+ def _clean_column_name(name: Any) -> Any:
24
+ if not isinstance(name, str):
25
+ return name
26
+ return name.replace("\ufeff", "").strip()
27
+
28
+
29
def _normalize_required_columns(
    df: pd.DataFrame, required: List[Optional[str]], *, df_label: str
) -> None:
    """Normalize column names in-place so required columns match exactly.

    CSV exports may carry BOM characters or stray whitespace in headers.
    Two passes are applied:

    1. Rename any column whose cleaned name differs, provided the cleaned
       name is not already taken (by an existing column or by an earlier
       rename in the same pass — guarding the latter is the fix here:
       previously two dirty columns could both be renamed to the same
       cleaned name, silently creating duplicate columns).
    2. For each still-missing required name, look for a unique
       case/space-insensitive match and rename it; ambiguity raises.

    Parameters
    ----------
    df : DataFrame mutated in place (rename with ``inplace=True``).
    required : candidate required names; non-strings/blanks are ignored.
    df_label : label used in error messages (keyword-only).

    Raises
    ------
    KeyError
        If several columns match one required name case-insensitively.
    """
    required_names = [r for r in required if isinstance(r, str) and r.strip()]
    if not required_names:
        return

    mapping: Dict[Any, Any] = {}
    # Track every name already claimed (existing columns plus rename targets)
    # so two dirty columns can never collapse onto the same cleaned name.
    taken = set(df.columns)
    for col in df.columns:
        cleaned = _clean_column_name(col)
        if cleaned != col and cleaned not in taken:
            mapping[col] = cleaned
            taken.add(cleaned)
    if mapping:
        df.rename(columns=mapping, inplace=True)

    existing = set(df.columns)
    for req in required_names:
        if req in existing:
            continue
        candidates = [
            col
            for col in df.columns
            if isinstance(col, str) and _clean_column_name(col).lower() == req.lower()
        ]
        if len(candidates) == 1 and req not in existing:
            df.rename(columns={candidates[0]: req}, inplace=True)
            existing = set(df.columns)
        elif len(candidates) > 1:
            raise KeyError(
                f"{df_label} has multiple columns matching required {req!r} "
                f"(case/space-insensitive): {candidates}"
            )
62
+
63
+
64
+ # ===== Core components and training wrappers =================================
65
+
66
+ # =============================================================================
67
+ # Config, preprocessing, and trainer base types
68
+ # =============================================================================
69
+ # BayesOptConfig is now imported from .config
70
+
71
+
72
+
73
class OutputManager:
    """Centralize output locations for plots, results, and saved models.

    All paths hang off a single root directory (defaults to the current
    working directory); the ``*_path`` helpers ensure the parent directory
    exists before handing back a string path.
    """

    def __init__(self, root: Optional[str] = None, model_name: str = "model") -> None:
        base = Path(root) if root else Path(os.getcwd())
        self.root = base
        self.model_name = model_name
        self.plot_dir = base / 'plot'
        self.result_dir = base / 'Results'
        self.model_dir = base / 'model'

    def _prepare(self, path: Path) -> str:
        # Make sure the containing directory exists, then return a plain str.
        target = str(path)
        IOUtils.ensure_parent_dir(target)
        return target

    def plot_path(self, filename: str) -> str:
        """Path under the plot directory, parent dir created on demand."""
        return self._prepare(self.plot_dir / filename)

    def result_path(self, filename: str) -> str:
        """Path under the results directory, parent dir created on demand."""
        return self._prepare(self.result_dir / filename)

    def model_path(self, filename: str) -> str:
        """Path under the model directory, parent dir created on demand."""
        return self._prepare(self.model_dir / filename)
95
+
96
+
97
+ class VersionManager:
98
+ """Lightweight versioning: save config and best-params snapshots for traceability."""
99
+
100
+ def __init__(self, output: OutputManager) -> None:
101
+ self.output = output
102
+ self.version_dir = Path(self.output.result_dir) / "versions"
103
+ IOUtils.ensure_parent_dir(str(self.version_dir))
104
+
105
+ def save(self, tag: str, payload: Dict[str, Any]) -> str:
106
+ safe_tag = tag.replace(" ", "_")
107
+ ts = datetime.now().strftime("%Y%m%d_%H%M%S")
108
+ path = self.version_dir / f"{ts}_{safe_tag}.json"
109
+ IOUtils.ensure_parent_dir(str(path))
110
+ with open(path, "w", encoding="utf-8") as f:
111
+ json.dump(payload, f, ensure_ascii=False, indent=2, default=str)
112
+ print(f"[Version] Saved snapshot: {path}")
113
+ return str(path)
114
+
115
+ def load_latest(self, tag: str) -> Optional[Dict[str, Any]]:
116
+ """Load the latest snapshot for a tag (sorted by timestamp prefix)."""
117
+ safe_tag = tag.replace(" ", "_")
118
+ pattern = f"*_{safe_tag}.json"
119
+ candidates = sorted(self.version_dir.glob(pattern))
120
+ if not candidates:
121
+ return None
122
+ path = candidates[-1]
123
+ try:
124
+ return json.loads(path.read_text(encoding="utf-8"))
125
+ except Exception as exc:
126
+ print(f"[Version] Failed to load snapshot {path}: {exc}")
127
+ return None
128
+
129
+
130
class DatasetPreprocessor:
    """Prepare shared train/test views for trainers.

    ``run()`` produces three aligned views of the data on this object:
    the raw frames (with weighted-actual helper columns), an unscaled
    one-hot-encoded frame pair, and a standardized one-hot frame pair.
    """

    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame,
                 config: BayesOptConfig) -> None:
        self.config = config
        # NOTE(review): shallow copies — whole-column assignments below rebind
        # columns on the copy only, but untouched column data is still shared
        # with the caller's frames; confirm callers don't mutate in place.
        self.train_data = train_df.copy(deep=False)
        self.test_data = test_df.copy(deep=False)
        self.num_features: List[str] = []  # non-categorical factor columns
        self.train_oht_data: Optional[pd.DataFrame] = None  # unscaled one-hot view
        self.test_oht_data: Optional[pd.DataFrame] = None
        self.train_oht_scl_data: Optional[pd.DataFrame] = None  # standardized one-hot view
        self.test_oht_scl_data: Optional[pd.DataFrame] = None
        self.var_nmes: List[str] = []  # encoded feature columns (excl. weight/response)
        self.cat_categories_for_shap: Dict[str, List[Any]] = {}  # category levels per cat col

    def run(self) -> "DatasetPreprocessor":
        """Run preprocessing: categorical encoding, target clipping, numeric scaling.

        Returns self so it can be chained. Raises KeyError when required
        columns are missing from the train frame (test columns are filled
        with defaults instead: weight=1.0, responses=NaN).
        """
        cfg = self.config
        # Repair BOM/whitespace-damaged headers before validating presence.
        _normalize_required_columns(
            self.train_data,
            [cfg.resp_nme, cfg.weight_nme, cfg.binary_resp_nme],
            df_label="Train data",
        )
        _normalize_required_columns(
            self.test_data,
            [cfg.resp_nme, cfg.weight_nme, cfg.binary_resp_nme],
            df_label="Test data",
        )
        # Train data must carry response and weight; fail loudly if not.
        missing_train = [
            col for col in (cfg.resp_nme, cfg.weight_nme)
            if col not in self.train_data.columns
        ]
        if missing_train:
            raise KeyError(
                f"Train data missing required columns: {missing_train}. "
                f"Available columns (first 50): {list(self.train_data.columns)[:50]}"
            )
        if cfg.binary_resp_nme and cfg.binary_resp_nme not in self.train_data.columns:
            raise KeyError(
                f"Train data missing binary response column: {cfg.binary_resp_nme}. "
                f"Available columns (first 50): {list(self.train_data.columns)[:50]}"
            )

        # Test data is allowed to lack labels (pure scoring); fill defaults.
        test_has_resp = cfg.resp_nme in self.test_data.columns
        test_has_weight = cfg.weight_nme in self.test_data.columns
        test_has_binary = bool(
            cfg.binary_resp_nme and cfg.binary_resp_nme in self.test_data.columns
        )
        if not test_has_weight:
            self.test_data[cfg.weight_nme] = 1.0
        if not test_has_resp:
            self.test_data[cfg.resp_nme] = np.nan
        if cfg.binary_resp_nme and cfg.binary_resp_nme not in self.test_data.columns:
            self.test_data[cfg.binary_resp_nme] = np.nan

        # Precompute weighted actuals for plots and validation checks.
        self.train_data.loc[:, 'w_act'] = self.train_data[cfg.resp_nme] * \
            self.train_data[cfg.weight_nme]
        if test_has_resp:
            self.test_data.loc[:, 'w_act'] = self.test_data[cfg.resp_nme] * \
                self.test_data[cfg.weight_nme]
        if cfg.binary_resp_nme:
            self.train_data.loc[:, 'w_binary_act'] = self.train_data[cfg.binary_resp_nme] * \
                self.train_data[cfg.weight_nme]
            if test_has_binary:
                self.test_data.loc[:, 'w_binary_act'] = self.test_data[cfg.binary_resp_nme] * \
                    self.test_data[cfg.weight_nme]
        # High-quantile clipping absorbs outliers; removing it lets extremes dominate loss.
        # NOTE: despite the name, q99 holds the 99.9th percentile (quantile(0.999)).
        q99 = self.train_data[cfg.resp_nme].quantile(0.999)
        self.train_data[cfg.resp_nme] = self.train_data[cfg.resp_nme].clip(
            upper=q99)
        cate_list = list(cfg.cate_list or [])
        if cate_list:
            for cate in cate_list:
                # Cast both frames to pandas category dtype; record the train
                # levels so SHAP tooling can reconstruct the same encoding.
                self.train_data[cate] = self.train_data[cate].astype(
                    'category')
                self.test_data[cate] = self.test_data[cate].astype('category')
                cats = self.train_data[cate].cat.categories
                self.cat_categories_for_shap[cate] = list(cats)
        self.num_features = [
            nme for nme in cfg.factor_nmes if nme not in cate_list]
        train_oht = self.train_data[cfg.factor_nmes +
                                    [cfg.weight_nme] + [cfg.resp_nme]].copy()
        test_oht = self.test_data[cfg.factor_nmes +
                                  [cfg.weight_nme] + [cfg.resp_nme]].copy()
        # One-hot encode categoricals; drop_first avoids the dummy trap.
        train_oht = pd.get_dummies(
            train_oht,
            columns=cate_list,
            drop_first=True,
            dtype=np.int8
        )
        test_oht = pd.get_dummies(
            test_oht,
            columns=cate_list,
            drop_first=True,
            dtype=np.int8
        )

        # Fill missing dummy columns when reindexing to align train/test columns.
        test_oht = test_oht.reindex(columns=train_oht.columns, fill_value=0)

        # Keep unscaled one-hot data for fold-specific scaling to avoid leakage.
        self.train_oht_data = train_oht
        self.test_oht_data = test_oht

        train_oht_scaled = train_oht.copy(deep=False)
        test_oht_scaled = test_oht.copy(deep=False)
        for num_chr in self.num_features:
            # Scale per column so features are on comparable ranges for NN stability.
            # Scaler is fit on the full train column; the unscaled frames above
            # exist precisely so folds can re-fit without this leakage.
            scaler = StandardScaler()
            train_oht_scaled[num_chr] = scaler.fit_transform(
                train_oht_scaled[num_chr].values.reshape(-1, 1)).astype(np.float32)
            test_oht_scaled[num_chr] = scaler.transform(
                test_oht_scaled[num_chr].values.reshape(-1, 1)).astype(np.float32)
        # Fill missing dummy columns when reindexing to align train/test columns.
        test_oht_scaled = test_oht_scaled.reindex(
            columns=train_oht_scaled.columns, fill_value=0)
        self.train_oht_scl_data = train_oht_scaled
        self.test_oht_scl_data = test_oht_scaled
        # Feature columns = everything except the weight and response columns.
        excluded = {cfg.weight_nme, cfg.resp_nme}
        self.var_nmes = [
            col for col in train_oht_scaled.columns if col not in excluded
        ]
        return self