ins-pricing 0.4.5-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (93)
  1. ins_pricing/README.md +48 -22
  2. ins_pricing/__init__.py +142 -90
  3. ins_pricing/cli/BayesOpt_entry.py +58 -46
  4. ins_pricing/cli/BayesOpt_incremental.py +77 -110
  5. ins_pricing/cli/Explain_Run.py +42 -23
  6. ins_pricing/cli/Explain_entry.py +551 -577
  7. ins_pricing/cli/Pricing_Run.py +42 -23
  8. ins_pricing/cli/bayesopt_entry_runner.py +51 -16
  9. ins_pricing/cli/utils/bootstrap.py +23 -0
  10. ins_pricing/cli/utils/cli_common.py +256 -256
  11. ins_pricing/cli/utils/cli_config.py +379 -360
  12. ins_pricing/cli/utils/import_resolver.py +375 -358
  13. ins_pricing/cli/utils/notebook_utils.py +256 -242
  14. ins_pricing/cli/watchdog_run.py +216 -198
  15. ins_pricing/frontend/__init__.py +10 -10
  16. ins_pricing/frontend/app.py +132 -61
  17. ins_pricing/frontend/config_builder.py +33 -0
  18. ins_pricing/frontend/example_config.json +11 -0
  19. ins_pricing/frontend/example_workflows.py +1 -1
  20. ins_pricing/frontend/runner.py +340 -388
  21. ins_pricing/governance/__init__.py +20 -20
  22. ins_pricing/governance/release.py +159 -159
  23. ins_pricing/modelling/README.md +1 -1
  24. ins_pricing/modelling/__init__.py +147 -92
  25. ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
  26. ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
  27. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
  28. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
  29. ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
  30. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
  31. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
  32. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
  33. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
  34. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
  35. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
  36. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
  37. ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
  38. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
  39. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
  40. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
  41. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
  42. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
  43. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
  44. ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
  45. ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
  46. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
  47. ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
  48. ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
  49. ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
  50. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
  51. ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
  52. ins_pricing/modelling/explain/__init__.py +55 -55
  53. ins_pricing/modelling/explain/metrics.py +27 -174
  54. ins_pricing/modelling/explain/permutation.py +237 -237
  55. ins_pricing/modelling/plotting/__init__.py +40 -36
  56. ins_pricing/modelling/plotting/compat.py +228 -0
  57. ins_pricing/modelling/plotting/curves.py +572 -572
  58. ins_pricing/modelling/plotting/diagnostics.py +163 -163
  59. ins_pricing/modelling/plotting/geo.py +362 -362
  60. ins_pricing/modelling/plotting/importance.py +121 -121
  61. ins_pricing/pricing/__init__.py +27 -27
  62. ins_pricing/pricing/factors.py +67 -56
  63. ins_pricing/production/__init__.py +35 -25
  64. ins_pricing/production/{predict.py → inference.py} +140 -57
  65. ins_pricing/production/monitoring.py +8 -21
  66. ins_pricing/reporting/__init__.py +11 -11
  67. ins_pricing/setup.py +1 -1
  68. ins_pricing/tests/production/test_inference.py +90 -0
  69. ins_pricing/utils/__init__.py +112 -78
  70. ins_pricing/utils/device.py +258 -237
  71. ins_pricing/utils/features.py +53 -0
  72. ins_pricing/utils/io.py +72 -0
  73. ins_pricing/utils/logging.py +34 -1
  74. ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
  75. ins_pricing/utils/metrics.py +158 -24
  76. ins_pricing/utils/numerics.py +76 -0
  77. ins_pricing/utils/paths.py +9 -1
  78. ins_pricing/utils/profiling.py +8 -4
  79. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
  80. ins_pricing-0.5.1.dist-info/RECORD +132 -0
  81. ins_pricing/modelling/core/BayesOpt.py +0 -146
  82. ins_pricing/modelling/core/__init__.py +0 -1
  83. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
  84. ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
  85. ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
  86. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
  87. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
  88. ins_pricing/modelling/core/bayesopt/utils.py +0 -105
  89. ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
  90. ins_pricing/tests/production/test_predict.py +0 -233
  91. ins_pricing-0.4.5.dist-info/RECORD +0 -130
  92. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
  93. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
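Most of the changes in 0.5.1 are a package-layout migration: the modelling/core/bayesopt tree moves up to modelling/bayesopt, shared helpers (losses, io, metrics, features) are consolidated under ins_pricing/utils, and production/predict.py is renamed to production/inference.py. A minimal before/after import sketch, assuming the moved modules keep exporting the same public names (illustrative only, not taken from the package documentation):

```python
# 0.4.5 module paths (removed in 0.5.1)
# from ins_pricing.modelling.core.bayesopt.models.model_resn import ResNetSklearn
# from ins_pricing.modelling.core.bayesopt.utils.losses import normalize_loss_name

# 0.5.1 module paths
from ins_pricing.modelling.bayesopt.models.model_resn import ResNetSklearn
from ins_pricing.production import inference            # was production/predict.py
from ins_pricing.utils.losses import normalize_loss_name  # losses now live under ins_pricing/utils
```

The full diff reproduced below shows this migration applied inside one of those files.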
ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py
@@ -1,446 +1,454 @@
 from __future__ import annotations

 from typing import Dict, List, Optional

 import numpy as np
 import pandas as pd
 import torch
 import torch.nn as nn
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 from torch.utils.data import TensorDataset

-from ..utils import DistributedUtils, EPS, TorchTrainerMixin
-from ..utils.losses import (
+from ins_pricing.modelling.bayesopt.utils.distributed_utils import DistributedUtils
+from ins_pricing.modelling.bayesopt.utils.torch_trainer_mixin import TorchTrainerMixin
+from ins_pricing.utils import EPS, get_logger, log_print
+from ins_pricing.utils.losses import (
     infer_loss_name_from_model_name,
     normalize_loss_name,
     resolve_tweedie_power,
 )

+_logger = get_logger("ins_pricing.modelling.bayesopt.models.model_resn")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+

 # =============================================================================
 # ResNet model and sklearn-style wrapper
 # =============================================================================

 # ResNet model definition
 # Residual block: two linear layers + ReLU + residual connection
 # ResBlock inherits nn.Module
 class ResBlock(nn.Module):
     def __init__(self, dim: int, dropout: float = 0.1,
                  use_layernorm: bool = False, residual_scale: float = 0.1,
                  stochastic_depth: float = 0.0
                  ):
         super().__init__()
         self.use_layernorm = use_layernorm

         if use_layernorm:
             Norm = nn.LayerNorm  # Normalize the last dimension
         else:
             def Norm(d): return nn.BatchNorm1d(d)  # Keep a switch to try BN

         self.norm1 = Norm(dim)
         self.fc1 = nn.Linear(dim, dim, bias=True)
         self.act = nn.ReLU(inplace=True)
         self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
         # Enable post-second-layer norm if needed: self.norm2 = Norm(dim)
         self.fc2 = nn.Linear(dim, dim, bias=True)

         # Residual scaling to stabilize early training
         self.res_scale = nn.Parameter(
             torch.tensor(residual_scale, dtype=torch.float32)
         )
         self.stochastic_depth = max(0.0, float(stochastic_depth))

     def _drop_path(self, x: torch.Tensor) -> torch.Tensor:
         if self.stochastic_depth <= 0.0 or not self.training:
             return x
         keep_prob = 1.0 - self.stochastic_depth
         if keep_prob <= 0.0:
             return torch.zeros_like(x)
         shape = (x.shape[0],) + (1,) * (x.ndim - 1)
         random_tensor = keep_prob + torch.rand(
             shape, dtype=x.dtype, device=x.device)
         binary_tensor = torch.floor(random_tensor)
         return x * binary_tensor / keep_prob

     def forward(self, x):
         # Pre-activation structure
         out = self.norm1(x)
         out = self.fc1(out)
         out = self.act(out)
         out = self.dropout(out)
         # If a second norm is enabled: out = self.norm2(out)
         out = self.fc2(out)
         # Apply residual scaling then add
         out = self.res_scale * out
         out = self._drop_path(out)
         return x + out

 # ResNetSequential defines the full network


 class ResNetSequential(nn.Module):
     # Input shape: (batch, input_dim)
     # Network: FC + norm + ReLU, stack residual blocks, output Softplus

     def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
                  use_layernorm: bool = True, dropout: float = 0.1,
                  residual_scale: float = 0.1, stochastic_depth: float = 0.0,
                  task_type: str = 'regression'):
         super(ResNetSequential, self).__init__()

         self.net = nn.Sequential()
         self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))

         # Optional explicit normalization after the first layer:
         # For LayerNorm:
         # self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
         # Or BatchNorm:
         # self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))

         # If desired, insert ReLU before residual blocks:
         # self.net.add_module('relu1', nn.ReLU(inplace=True))

         # Residual blocks
         drop_path_rate = max(0.0, float(stochastic_depth))
         for i in range(block_num):
             if block_num > 1:
                 block_drop = drop_path_rate * (i / (block_num - 1))
             else:
                 block_drop = drop_path_rate
             self.net.add_module(
                 f'ResBlk_{i+1}',
                 ResBlock(
                     hidden_dim,
                     dropout=dropout,
                     use_layernorm=use_layernorm,
                     residual_scale=residual_scale,
                     stochastic_depth=block_drop)
             )

         self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))

         if task_type == 'classification':
             self.net.add_module('softplus', nn.Identity())
         else:
             self.net.add_module('softplus', nn.Softplus())

     def forward(self, x):
         if self.training and not hasattr(self, '_printed_device'):
-            print(f">>> ResNetSequential executing on device: {x.device}")
+            _log(f">>> ResNetSequential executing on device: {x.device}")
             self._printed_device = True
         return self.net(x)

 # Define the ResNet sklearn-style wrapper.


 class ResNetSklearn(TorchTrainerMixin, nn.Module):
     def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
                  block_num: int = 2, batch_num: int = 100, epochs: int = 100,
                  task_type: str = 'regression',
                  tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
                  use_layernorm: bool = True, dropout: float = 0.1,
                  residual_scale: float = 0.1,
                  stochastic_depth: float = 0.0,
                  weight_decay: float = 1e-4,
                  use_data_parallel: bool = True,
                  use_ddp: bool = False,
                  loss_name: Optional[str] = None):
         super(ResNetSklearn, self).__init__()

         self.use_ddp = use_ddp
         self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = (
             False, 0, 0, 1)

         if self.use_ddp:
             self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = DistributedUtils.setup_ddp()

         self.input_dim = input_dim
         self.hidden_dim = hidden_dim
         self.block_num = block_num
         self.batch_num = batch_num
         self.epochs = epochs
         self.task_type = task_type
         self.model_nme = model_nme
         self.learning_rate = learning_rate
         self.weight_decay = weight_decay
         self.patience = patience
         self.use_layernorm = use_layernorm
         self.dropout = dropout
         self.residual_scale = residual_scale
         self.stochastic_depth = max(0.0, float(stochastic_depth))
         self.loss_curve_path: Optional[str] = None
         self.training_history: Dict[str, List[float]] = {
             "train": [], "val": []}
         self.use_data_parallel = bool(use_data_parallel)

         # Device selection: cuda > mps > cpu
         if self.is_ddp_enabled:
             self.device = torch.device(f'cuda:{self.local_rank}')
         elif torch.cuda.is_available():
             self.device = torch.device('cuda')
         elif torch.backends.mps.is_available():
             self.device = torch.device('mps')
         else:
             self.device = torch.device('cpu')

         resolved_loss = normalize_loss_name(loss_name, self.task_type)
         if self.task_type == 'classification':
             self.loss_name = "logloss"
             self.tw_power = None
         else:
             if resolved_loss == "auto":
                 resolved_loss = infer_loss_name_from_model_name(self.model_nme)
             self.loss_name = resolved_loss
             if self.loss_name == "tweedie":
                 self.tw_power = float(tweedie_power) if tweedie_power is not None else 1.5
             else:
                 self.tw_power = resolve_tweedie_power(self.loss_name, default=1.5)

         # Build network (construct on CPU first)
         core = ResNetSequential(
             self.input_dim,
             self.hidden_dim,
             self.block_num,
             use_layernorm=self.use_layernorm,
             dropout=self.dropout,
             residual_scale=self.residual_scale,
             stochastic_depth=self.stochastic_depth,
             task_type=self.task_type
         )

         # ===== Multi-GPU: DataParallel vs DistributedDataParallel =====
         if self.is_ddp_enabled:
             core = core.to(self.device)
             core = DDP(core, device_ids=[
                 self.local_rank], output_device=self.local_rank)
             self.use_data_parallel = False
         elif use_data_parallel and (self.device.type == 'cuda') and (torch.cuda.device_count() > 1):
             if self.use_ddp and not self.is_ddp_enabled:
-                print(
+                _log(
                     ">>> DDP requested but not initialized; falling back to DataParallel.")
             core = nn.DataParallel(core, device_ids=list(
                 range(torch.cuda.device_count())))
             # DataParallel scatters inputs, but the primary device remains cuda:0.
             self.device = torch.device('cuda')
             self.use_data_parallel = True
         else:
             self.use_data_parallel = False

         self.resnet = core.to(self.device)

     # ================ Internal helpers ================
     @staticmethod
     def _validate_vector(arr, name: str, n_rows: int) -> None:
         if arr is None:
             return
         if isinstance(arr, pd.DataFrame):
             if arr.shape[1] != 1:
                 raise ValueError(f"{name} must be 1d (single column).")
             length = len(arr)
         else:
             arr_np = np.asarray(arr)
             if arr_np.ndim == 0:
                 raise ValueError(f"{name} must be 1d.")
             if arr_np.ndim > 2 or (arr_np.ndim == 2 and arr_np.shape[1] != 1):
                 raise ValueError(f"{name} must be 1d or Nx1.")
             length = arr_np.shape[0]
         if length != n_rows:
             raise ValueError(
                 f"{name} length {length} does not match X length {n_rows}."
             )

     def _validate_inputs(self, X, y, w, label: str) -> None:
         if X is None:
             raise ValueError(f"{label} X cannot be None.")
         n_rows = len(X)
         if y is None:
             raise ValueError(f"{label} y cannot be None.")
         self._validate_vector(y, f"{label} y", n_rows)
         self._validate_vector(w, f"{label} w", n_rows)

     def _build_train_val_tensors(self, X_train, y_train, w_train, X_val, y_val, w_val):
         self._validate_inputs(X_train, y_train, w_train, "train")
         if X_val is not None or y_val is not None or w_val is not None:
             if X_val is None or y_val is None:
                 raise ValueError("validation X and y must both be provided.")
             self._validate_inputs(X_val, y_val, w_val, "val")

         def _to_numpy(arr):
             if hasattr(arr, "to_numpy"):
                 return arr.to_numpy(dtype=np.float32, copy=False)
             return np.asarray(arr, dtype=np.float32)

         X_tensor = torch.as_tensor(_to_numpy(X_train))
         y_tensor = torch.as_tensor(_to_numpy(y_train)).view(-1, 1)
         w_tensor = (
             torch.as_tensor(_to_numpy(w_train)).view(-1, 1)
             if w_train is not None else torch.ones_like(y_tensor)
         )

         has_val = X_val is not None and y_val is not None
         if has_val:
             X_val_tensor = torch.as_tensor(_to_numpy(X_val))
             y_val_tensor = torch.as_tensor(_to_numpy(y_val)).view(-1, 1)
             w_val_tensor = (
                 torch.as_tensor(_to_numpy(w_val)).view(-1, 1)
                 if w_val is not None else torch.ones_like(y_val_tensor)
             )
         else:
             X_val_tensor = y_val_tensor = w_val_tensor = None
         return X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val

     def forward(self, x):
         # Handle SHAP NumPy input.
         if isinstance(x, np.ndarray):
             x_tensor = torch.as_tensor(x, dtype=torch.float32)
         else:
             x_tensor = x

         x_tensor = x_tensor.to(self.device)
         y_pred = self.resnet(x_tensor)
         return y_pred

     # ---------------- Training ----------------

     def fit(self, X_train, y_train, w_train=None,
             X_val=None, y_val=None, w_val=None, trial=None):

         X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val = \
             self._build_train_val_tensors(
                 X_train, y_train, w_train, X_val, y_val, w_val)

         dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
         dataloader, accum_steps = self._build_dataloader(
             dataset,
             N=X_tensor.shape[0],
             base_bs_gpu=(2048, 1024, 512),
             base_bs_cpu=(256, 128),
             min_bs=64,
             target_effective_cuda=2048,
             target_effective_cpu=1024
         )

         # Set sampler epoch at the start of each epoch to keep shuffling deterministic.
         if self.is_ddp_enabled and hasattr(dataloader.sampler, 'set_epoch'):
             self.dataloader_sampler = dataloader.sampler
         else:
             self.dataloader_sampler = None

         # === 4. Optimizer and AMP ===
         self.optimizer = torch.optim.Adam(
             self.resnet.parameters(),
             lr=self.learning_rate,
             weight_decay=float(self.weight_decay),
         )
         self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))

         X_val_dev = y_val_dev = w_val_dev = None
         val_dataloader = None
         if has_val:
             # Build validation DataLoader.
             val_dataset = TensorDataset(
                 X_val_tensor, y_val_tensor, w_val_tensor)
             # No backward pass in validation; batch size can be larger for throughput.
             val_dataloader = self._build_val_dataloader(
                 val_dataset, dataloader, accum_steps)
             # Validation usually does not need a DDP sampler because we validate on the main process
             # or aggregate results. For simplicity, keep validation on a single GPU or the main process.

         is_data_parallel = isinstance(self.resnet, nn.DataParallel)

         def forward_fn(batch):
             X_batch, y_batch, w_batch = batch

             if not is_data_parallel:
                 X_batch = X_batch.to(self.device, non_blocking=True)
             # Keep targets and weights on the main device for loss computation.
             y_batch = y_batch.to(self.device, non_blocking=True)
             w_batch = w_batch.to(self.device, non_blocking=True)

             y_pred = self.resnet(X_batch)
             return y_pred, y_batch, w_batch

         def val_forward_fn():
             total_loss = 0.0
             total_weight = 0.0
             for batch in val_dataloader:
                 X_b, y_b, w_b = batch
                 if not is_data_parallel:
                     X_b = X_b.to(self.device, non_blocking=True)
                 y_b = y_b.to(self.device, non_blocking=True)
                 w_b = w_b.to(self.device, non_blocking=True)

                 y_pred = self.resnet(X_b)

                 # Manually compute weighted loss for accurate aggregation.
                 losses = self._compute_losses(
                     y_pred, y_b, apply_softplus=False)

                 batch_weight_sum = torch.clamp(w_b.sum(), min=EPS)
                 batch_weighted_loss_sum = (losses * w_b.view(-1)).sum()

                 total_loss += batch_weighted_loss_sum.item()
                 total_weight += batch_weight_sum.item()

             return total_loss / max(total_weight, EPS)

         clip_fn = None
         if self.device.type == 'cuda':
             def clip_fn(): return (self.scaler.unscale_(self.optimizer),
                                    clip_grad_norm_(self.resnet.parameters(), max_norm=1.0))

         # Under DDP, only the main process prints logs and saves models.
         if self.is_ddp_enabled and not DistributedUtils.is_main_process():
             # Non-main processes skip validation callback logging (handled inside _train_model).
             pass

         best_state, history = self._train_model(
             self.resnet,
             dataloader,
             accum_steps,
             self.optimizer,
             self.scaler,
             forward_fn,
             val_forward_fn if has_val else None,
             apply_softplus=False,
             clip_fn=clip_fn,
             trial=trial,
             loss_curve_path=getattr(self, "loss_curve_path", None)
         )

         if has_val and best_state is not None:
             # Load state into unwrapped module to match how it was saved
             base_module = self.resnet.module if hasattr(self.resnet, "module") else self.resnet
             base_module.load_state_dict(best_state)
         self.training_history = history

     # ---------------- Prediction ----------------

     def predict(self, X_test):
         self.resnet.eval()
         if isinstance(X_test, pd.DataFrame):
             X_np = X_test.to_numpy(dtype=np.float32, copy=False)
         else:
             X_np = np.asarray(X_test, dtype=np.float32)

         inference_cm = getattr(torch, "inference_mode", torch.no_grad)
         with inference_cm():
             y_pred = self(X_np).cpu().numpy()

         if self.task_type == 'classification':
             y_pred = 1 / (1 + np.exp(-y_pred))  # Sigmoid converts logits to probabilities.
         else:
             y_pred = np.clip(y_pred, 1e-6, None)
         return y_pred.flatten()

     # ---------------- Set Params ----------------

     def set_params(self, params):
         for key, value in params.items():
             if hasattr(self, key):
                 setattr(self, key, value)
             else:
                 raise ValueError(f"Parameter {key} not found in model.")
         return self
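For orientation, here is a minimal usage sketch of the ResNetSklearn wrapper shown in the diff above. It relies only on the constructor, fit and predict signatures visible in the new file; the toy data, the model_nme value and the hyperparameter choices are illustrative assumptions, not package defaults or a documented example.

```python
import numpy as np
from ins_pricing.modelling.bayesopt.models.model_resn import ResNetSklearn

# Toy data: 1,000 rows, 12 numeric features, a non-negative target and unit weights.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 12)).astype(np.float32)
y = np.abs(rng.normal(size=1000)).astype(np.float32)
w = np.ones(1000, dtype=np.float32)

model = ResNetSklearn(
    model_nme="resn_example",   # hypothetical name; used to infer the loss when loss_name is None
    input_dim=X.shape[1],
    hidden_dim=64,
    block_num=2,
    epochs=20,
    task_type="regression",
    loss_name="tweedie",
    tweedie_power=1.5,
    use_ddp=False,
)

# Train on the first 800 rows, validate on the last 200 (weights are optional).
model.fit(X[:800], y[:800], w[:800],
          X_val=X[800:], y_val=y[800:], w_val=w[800:])

preds = model.predict(X[800:])  # 1-D NumPy array, clipped to be strictly positive
```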