ins-pricing 0.4.5-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. ins_pricing/README.md +48 -22
  2. ins_pricing/__init__.py +142 -90
  3. ins_pricing/cli/BayesOpt_entry.py +52 -50
  4. ins_pricing/cli/BayesOpt_incremental.py +39 -105
  5. ins_pricing/cli/Explain_Run.py +31 -23
  6. ins_pricing/cli/Explain_entry.py +532 -579
  7. ins_pricing/cli/Pricing_Run.py +31 -23
  8. ins_pricing/cli/bayesopt_entry_runner.py +11 -9
  9. ins_pricing/cli/utils/cli_common.py +256 -256
  10. ins_pricing/cli/utils/cli_config.py +375 -375
  11. ins_pricing/cli/utils/import_resolver.py +382 -365
  12. ins_pricing/cli/utils/notebook_utils.py +340 -340
  13. ins_pricing/cli/watchdog_run.py +209 -201
  14. ins_pricing/frontend/__init__.py +10 -10
  15. ins_pricing/frontend/example_workflows.py +1 -1
  16. ins_pricing/governance/__init__.py +20 -20
  17. ins_pricing/governance/release.py +159 -159
  18. ins_pricing/modelling/__init__.py +147 -92
  19. ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +2 -2
  20. ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
  21. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +562 -562
  22. ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +965 -964
  23. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
  24. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +482 -548
  25. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
  26. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +915 -913
  27. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +788 -785
  28. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +448 -446
  29. ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
  30. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1308 -1308
  31. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +3 -3
  32. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +197 -198
  33. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +344 -344
  34. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +283 -283
  35. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +346 -347
  36. ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
  37. ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
  38. ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
  39. ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
  40. ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
  41. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +623 -623
  42. ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
  43. ins_pricing/modelling/explain/__init__.py +55 -55
  44. ins_pricing/modelling/explain/metrics.py +27 -174
  45. ins_pricing/modelling/explain/permutation.py +237 -237
  46. ins_pricing/modelling/plotting/__init__.py +40 -36
  47. ins_pricing/modelling/plotting/compat.py +228 -0
  48. ins_pricing/modelling/plotting/curves.py +572 -572
  49. ins_pricing/modelling/plotting/diagnostics.py +163 -163
  50. ins_pricing/modelling/plotting/geo.py +362 -362
  51. ins_pricing/modelling/plotting/importance.py +121 -121
  52. ins_pricing/pricing/__init__.py +27 -27
  53. ins_pricing/production/__init__.py +35 -25
  54. ins_pricing/production/{predict.py → inference.py} +140 -57
  55. ins_pricing/production/monitoring.py +8 -21
  56. ins_pricing/reporting/__init__.py +11 -11
  57. ins_pricing/setup.py +1 -1
  58. ins_pricing/tests/production/test_inference.py +90 -0
  59. ins_pricing/utils/__init__.py +116 -83
  60. ins_pricing/utils/device.py +255 -255
  61. ins_pricing/utils/features.py +53 -0
  62. ins_pricing/utils/io.py +72 -0
  63. ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
  64. ins_pricing/utils/metrics.py +158 -24
  65. ins_pricing/utils/numerics.py +76 -0
  66. ins_pricing/utils/paths.py +9 -1
  67. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.0.dist-info}/METADATA +182 -182
  68. ins_pricing-0.5.0.dist-info/RECORD +131 -0
  69. ins_pricing/modelling/core/BayesOpt.py +0 -146
  70. ins_pricing/modelling/core/__init__.py +0 -1
  71. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
  72. ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
  73. ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
  74. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
  75. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
  76. ins_pricing/modelling/core/bayesopt/utils.py +0 -105
  77. ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
  78. ins_pricing/tests/production/test_predict.py +0 -233
  79. ins_pricing-0.4.5.dist-info/RECORD +0 -130
  80. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +0 -0
  81. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +0 -0
  82. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +0 -0
  83. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.0.dist-info}/WHEEL +0 -0
  84. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.0.dist-info}/top_level.txt +0 -0
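
Taken together, the renames above flatten modelling/core/bayesopt into modelling/bayesopt, promote the shared loss, metrics, and IO helpers into ins_pricing.utils, and rename production/predict.py to production/inference.py (with tests/production/test_predict.py replaced by test_inference.py). Below is a minimal sketch of how downstream imports change between 0.4.5 and 0.5.0, assuming modules are imported by their new absolute paths; the 0.4.5 paths are inferred from the relative imports removed in the diff further down, and any package-level re-exports are not shown.

# ins-pricing 0.4.5 (old layout), as implied by the relative imports removed below:
#   from ins_pricing.modelling.core.bayesopt.models.model_resn import ResNetSklearn
#   from ins_pricing.modelling.core.bayesopt.utils.losses import normalize_loss_name

# ins-pricing 0.5.0 (new layout), paths taken from the file list and the new import block below:
from ins_pricing.modelling.bayesopt.models.model_resn import ResNetSklearn
from ins_pricing.utils.losses import normalize_loss_name
from ins_pricing.production import inference  # predict.py was renamed to inference.py

The hunk that follows is the full before-and-after listing of model_resn.py; its +448/-446 line counts match entry 28 in the list above.
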
@@ -1,446 +1,448 @@
1
- from __future__ import annotations
2
-
3
- from typing import Dict, List, Optional
4
-
5
- import numpy as np
6
- import pandas as pd
7
- import torch
8
- import torch.nn as nn
9
- from torch.cuda.amp import GradScaler
10
- from torch.nn.parallel import DistributedDataParallel as DDP
11
- from torch.nn.utils import clip_grad_norm_
12
- from torch.utils.data import TensorDataset
13
-
14
- from ..utils import DistributedUtils, EPS, TorchTrainerMixin
15
- from ..utils.losses import (
16
- infer_loss_name_from_model_name,
17
- normalize_loss_name,
18
- resolve_tweedie_power,
19
- )
20
-
21
-
22
- # =============================================================================
23
- # ResNet model and sklearn-style wrapper
24
- # =============================================================================
25
-
26
- # ResNet model definition
27
- # Residual block: two linear layers + ReLU + residual connection
28
- # ResBlock inherits nn.Module
29
- class ResBlock(nn.Module):
30
- def __init__(self, dim: int, dropout: float = 0.1,
31
- use_layernorm: bool = False, residual_scale: float = 0.1,
32
- stochastic_depth: float = 0.0
33
- ):
34
- super().__init__()
35
- self.use_layernorm = use_layernorm
36
-
37
- if use_layernorm:
38
- Norm = nn.LayerNorm # Normalize the last dimension
39
- else:
40
- def Norm(d): return nn.BatchNorm1d(d) # Keep a switch to try BN
41
-
42
- self.norm1 = Norm(dim)
43
- self.fc1 = nn.Linear(dim, dim, bias=True)
44
- self.act = nn.ReLU(inplace=True)
45
- self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
46
- # Enable post-second-layer norm if needed: self.norm2 = Norm(dim)
47
- self.fc2 = nn.Linear(dim, dim, bias=True)
48
-
49
- # Residual scaling to stabilize early training
50
- self.res_scale = nn.Parameter(
51
- torch.tensor(residual_scale, dtype=torch.float32)
52
- )
53
- self.stochastic_depth = max(0.0, float(stochastic_depth))
54
-
55
- def _drop_path(self, x: torch.Tensor) -> torch.Tensor:
56
- if self.stochastic_depth <= 0.0 or not self.training:
57
- return x
58
- keep_prob = 1.0 - self.stochastic_depth
59
- if keep_prob <= 0.0:
60
- return torch.zeros_like(x)
61
- shape = (x.shape[0],) + (1,) * (x.ndim - 1)
62
- random_tensor = keep_prob + torch.rand(
63
- shape, dtype=x.dtype, device=x.device)
64
- binary_tensor = torch.floor(random_tensor)
65
- return x * binary_tensor / keep_prob
66
-
67
- def forward(self, x):
68
- # Pre-activation structure
69
- out = self.norm1(x)
70
- out = self.fc1(out)
71
- out = self.act(out)
72
- out = self.dropout(out)
73
- # If a second norm is enabled: out = self.norm2(out)
74
- out = self.fc2(out)
75
- # Apply residual scaling then add
76
- out = self.res_scale * out
77
- out = self._drop_path(out)
78
- return x + out
79
-
80
- # ResNetSequential defines the full network
81
-
82
-
83
- class ResNetSequential(nn.Module):
84
- # Input shape: (batch, input_dim)
85
- # Network: FC + norm + ReLU, stack residual blocks, output Softplus
86
-
87
- def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
88
- use_layernorm: bool = True, dropout: float = 0.1,
89
- residual_scale: float = 0.1, stochastic_depth: float = 0.0,
90
- task_type: str = 'regression'):
91
- super(ResNetSequential, self).__init__()
92
-
93
- self.net = nn.Sequential()
94
- self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
95
-
96
- # Optional explicit normalization after the first layer:
97
- # For LayerNorm:
98
- # self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
99
- # Or BatchNorm:
100
- # self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
101
-
102
- # If desired, insert ReLU before residual blocks:
103
- # self.net.add_module('relu1', nn.ReLU(inplace=True))
104
-
105
- # Residual blocks
106
- drop_path_rate = max(0.0, float(stochastic_depth))
107
- for i in range(block_num):
108
- if block_num > 1:
109
- block_drop = drop_path_rate * (i / (block_num - 1))
110
- else:
111
- block_drop = drop_path_rate
112
- self.net.add_module(
113
- f'ResBlk_{i+1}',
114
- ResBlock(
115
- hidden_dim,
116
- dropout=dropout,
117
- use_layernorm=use_layernorm,
118
- residual_scale=residual_scale,
119
- stochastic_depth=block_drop)
120
- )
121
-
122
- self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
123
-
124
- if task_type == 'classification':
125
- self.net.add_module('softplus', nn.Identity())
126
- else:
127
- self.net.add_module('softplus', nn.Softplus())
128
-
129
- def forward(self, x):
130
- if self.training and not hasattr(self, '_printed_device'):
131
- print(f">>> ResNetSequential executing on device: {x.device}")
132
- self._printed_device = True
133
- return self.net(x)
134
-
135
- # Define the ResNet sklearn-style wrapper.
136
-
137
-
138
- class ResNetSklearn(TorchTrainerMixin, nn.Module):
139
- def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
140
- block_num: int = 2, batch_num: int = 100, epochs: int = 100,
141
- task_type: str = 'regression',
142
- tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
143
- use_layernorm: bool = True, dropout: float = 0.1,
144
- residual_scale: float = 0.1,
145
- stochastic_depth: float = 0.0,
146
- weight_decay: float = 1e-4,
147
- use_data_parallel: bool = True,
148
- use_ddp: bool = False,
149
- loss_name: Optional[str] = None):
150
- super(ResNetSklearn, self).__init__()
151
-
152
- self.use_ddp = use_ddp
153
- self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = (
154
- False, 0, 0, 1)
155
-
156
- if self.use_ddp:
157
- self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = DistributedUtils.setup_ddp()
158
-
159
- self.input_dim = input_dim
160
- self.hidden_dim = hidden_dim
161
- self.block_num = block_num
162
- self.batch_num = batch_num
163
- self.epochs = epochs
164
- self.task_type = task_type
165
- self.model_nme = model_nme
166
- self.learning_rate = learning_rate
167
- self.weight_decay = weight_decay
168
- self.patience = patience
169
- self.use_layernorm = use_layernorm
170
- self.dropout = dropout
171
- self.residual_scale = residual_scale
172
- self.stochastic_depth = max(0.0, float(stochastic_depth))
173
- self.loss_curve_path: Optional[str] = None
174
- self.training_history: Dict[str, List[float]] = {
175
- "train": [], "val": []}
176
- self.use_data_parallel = bool(use_data_parallel)
177
-
178
- # Device selection: cuda > mps > cpu
179
- if self.is_ddp_enabled:
180
- self.device = torch.device(f'cuda:{self.local_rank}')
181
- elif torch.cuda.is_available():
182
- self.device = torch.device('cuda')
183
- elif torch.backends.mps.is_available():
184
- self.device = torch.device('mps')
185
- else:
186
- self.device = torch.device('cpu')
187
-
188
- resolved_loss = normalize_loss_name(loss_name, self.task_type)
189
- if self.task_type == 'classification':
190
- self.loss_name = "logloss"
191
- self.tw_power = None
192
- else:
193
- if resolved_loss == "auto":
194
- resolved_loss = infer_loss_name_from_model_name(self.model_nme)
195
- self.loss_name = resolved_loss
196
- if self.loss_name == "tweedie":
197
- self.tw_power = float(tweedie_power) if tweedie_power is not None else 1.5
198
- else:
199
- self.tw_power = resolve_tweedie_power(self.loss_name, default=1.5)
200
-
201
- # Build network (construct on CPU first)
202
- core = ResNetSequential(
203
- self.input_dim,
204
- self.hidden_dim,
205
- self.block_num,
206
- use_layernorm=self.use_layernorm,
207
- dropout=self.dropout,
208
- residual_scale=self.residual_scale,
209
- stochastic_depth=self.stochastic_depth,
210
- task_type=self.task_type
211
- )
212
-
213
- # ===== Multi-GPU: DataParallel vs DistributedDataParallel =====
214
- if self.is_ddp_enabled:
215
- core = core.to(self.device)
216
- core = DDP(core, device_ids=[
217
- self.local_rank], output_device=self.local_rank)
218
- self.use_data_parallel = False
219
- elif use_data_parallel and (self.device.type == 'cuda') and (torch.cuda.device_count() > 1):
220
- if self.use_ddp and not self.is_ddp_enabled:
221
- print(
222
- ">>> DDP requested but not initialized; falling back to DataParallel.")
223
- core = nn.DataParallel(core, device_ids=list(
224
- range(torch.cuda.device_count())))
225
- # DataParallel scatters inputs, but the primary device remains cuda:0.
226
- self.device = torch.device('cuda')
227
- self.use_data_parallel = True
228
- else:
229
- self.use_data_parallel = False
230
-
231
- self.resnet = core.to(self.device)
232
-
233
- # ================ Internal helpers ================
234
- @staticmethod
235
- def _validate_vector(arr, name: str, n_rows: int) -> None:
236
- if arr is None:
237
- return
238
- if isinstance(arr, pd.DataFrame):
239
- if arr.shape[1] != 1:
240
- raise ValueError(f"{name} must be 1d (single column).")
241
- length = len(arr)
242
- else:
243
- arr_np = np.asarray(arr)
244
- if arr_np.ndim == 0:
245
- raise ValueError(f"{name} must be 1d.")
246
- if arr_np.ndim > 2 or (arr_np.ndim == 2 and arr_np.shape[1] != 1):
247
- raise ValueError(f"{name} must be 1d or Nx1.")
248
- length = arr_np.shape[0]
249
- if length != n_rows:
250
- raise ValueError(
251
- f"{name} length {length} does not match X length {n_rows}."
252
- )
253
-
254
- def _validate_inputs(self, X, y, w, label: str) -> None:
255
- if X is None:
256
- raise ValueError(f"{label} X cannot be None.")
257
- n_rows = len(X)
258
- if y is None:
259
- raise ValueError(f"{label} y cannot be None.")
260
- self._validate_vector(y, f"{label} y", n_rows)
261
- self._validate_vector(w, f"{label} w", n_rows)
262
-
263
- def _build_train_val_tensors(self, X_train, y_train, w_train, X_val, y_val, w_val):
264
- self._validate_inputs(X_train, y_train, w_train, "train")
265
- if X_val is not None or y_val is not None or w_val is not None:
266
- if X_val is None or y_val is None:
267
- raise ValueError("validation X and y must both be provided.")
268
- self._validate_inputs(X_val, y_val, w_val, "val")
269
-
270
- def _to_numpy(arr):
271
- if hasattr(arr, "to_numpy"):
272
- return arr.to_numpy(dtype=np.float32, copy=False)
273
- return np.asarray(arr, dtype=np.float32)
274
-
275
- X_tensor = torch.as_tensor(_to_numpy(X_train))
276
- y_tensor = torch.as_tensor(_to_numpy(y_train)).view(-1, 1)
277
- w_tensor = (
278
- torch.as_tensor(_to_numpy(w_train)).view(-1, 1)
279
- if w_train is not None else torch.ones_like(y_tensor)
280
- )
281
-
282
- has_val = X_val is not None and y_val is not None
283
- if has_val:
284
- X_val_tensor = torch.as_tensor(_to_numpy(X_val))
285
- y_val_tensor = torch.as_tensor(_to_numpy(y_val)).view(-1, 1)
286
- w_val_tensor = (
287
- torch.as_tensor(_to_numpy(w_val)).view(-1, 1)
288
- if w_val is not None else torch.ones_like(y_val_tensor)
289
- )
290
- else:
291
- X_val_tensor = y_val_tensor = w_val_tensor = None
292
- return X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val
293
-
294
- def forward(self, x):
295
- # Handle SHAP NumPy input.
296
- if isinstance(x, np.ndarray):
297
- x_tensor = torch.as_tensor(x, dtype=torch.float32)
298
- else:
299
- x_tensor = x
300
-
301
- x_tensor = x_tensor.to(self.device)
302
- y_pred = self.resnet(x_tensor)
303
- return y_pred
304
-
305
- # ---------------- Training ----------------
306
-
307
- def fit(self, X_train, y_train, w_train=None,
308
- X_val=None, y_val=None, w_val=None, trial=None):
309
-
310
- X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val = \
311
- self._build_train_val_tensors(
312
- X_train, y_train, w_train, X_val, y_val, w_val)
313
-
314
- dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
315
- dataloader, accum_steps = self._build_dataloader(
316
- dataset,
317
- N=X_tensor.shape[0],
318
- base_bs_gpu=(2048, 1024, 512),
319
- base_bs_cpu=(256, 128),
320
- min_bs=64,
321
- target_effective_cuda=2048,
322
- target_effective_cpu=1024
323
- )
324
-
325
- # Set sampler epoch at the start of each epoch to keep shuffling deterministic.
326
- if self.is_ddp_enabled and hasattr(dataloader.sampler, 'set_epoch'):
327
- self.dataloader_sampler = dataloader.sampler
328
- else:
329
- self.dataloader_sampler = None
330
-
331
- # === 4. Optimizer and AMP ===
332
- self.optimizer = torch.optim.Adam(
333
- self.resnet.parameters(),
334
- lr=self.learning_rate,
335
- weight_decay=float(self.weight_decay),
336
- )
337
- self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))
338
-
339
- X_val_dev = y_val_dev = w_val_dev = None
340
- val_dataloader = None
341
- if has_val:
342
- # Build validation DataLoader.
343
- val_dataset = TensorDataset(
344
- X_val_tensor, y_val_tensor, w_val_tensor)
345
- # No backward pass in validation; batch size can be larger for throughput.
346
- val_dataloader = self._build_val_dataloader(
347
- val_dataset, dataloader, accum_steps)
348
- # Validation usually does not need a DDP sampler because we validate on the main process
349
- # or aggregate results. For simplicity, keep validation on a single GPU or the main process.
350
-
351
- is_data_parallel = isinstance(self.resnet, nn.DataParallel)
352
-
353
- def forward_fn(batch):
354
- X_batch, y_batch, w_batch = batch
355
-
356
- if not is_data_parallel:
357
- X_batch = X_batch.to(self.device, non_blocking=True)
358
- # Keep targets and weights on the main device for loss computation.
359
- y_batch = y_batch.to(self.device, non_blocking=True)
360
- w_batch = w_batch.to(self.device, non_blocking=True)
361
-
362
- y_pred = self.resnet(X_batch)
363
- return y_pred, y_batch, w_batch
364
-
365
- def val_forward_fn():
366
- total_loss = 0.0
367
- total_weight = 0.0
368
- for batch in val_dataloader:
369
- X_b, y_b, w_b = batch
370
- if not is_data_parallel:
371
- X_b = X_b.to(self.device, non_blocking=True)
372
- y_b = y_b.to(self.device, non_blocking=True)
373
- w_b = w_b.to(self.device, non_blocking=True)
374
-
375
- y_pred = self.resnet(X_b)
376
-
377
- # Manually compute weighted loss for accurate aggregation.
378
- losses = self._compute_losses(
379
- y_pred, y_b, apply_softplus=False)
380
-
381
- batch_weight_sum = torch.clamp(w_b.sum(), min=EPS)
382
- batch_weighted_loss_sum = (losses * w_b.view(-1)).sum()
383
-
384
- total_loss += batch_weighted_loss_sum.item()
385
- total_weight += batch_weight_sum.item()
386
-
387
- return total_loss / max(total_weight, EPS)
388
-
389
- clip_fn = None
390
- if self.device.type == 'cuda':
391
- def clip_fn(): return (self.scaler.unscale_(self.optimizer),
392
- clip_grad_norm_(self.resnet.parameters(), max_norm=1.0))
393
-
394
- # Under DDP, only the main process prints logs and saves models.
395
- if self.is_ddp_enabled and not DistributedUtils.is_main_process():
396
- # Non-main processes skip validation callback logging (handled inside _train_model).
397
- pass
398
-
399
- best_state, history = self._train_model(
400
- self.resnet,
401
- dataloader,
402
- accum_steps,
403
- self.optimizer,
404
- self.scaler,
405
- forward_fn,
406
- val_forward_fn if has_val else None,
407
- apply_softplus=False,
408
- clip_fn=clip_fn,
409
- trial=trial,
410
- loss_curve_path=getattr(self, "loss_curve_path", None)
411
- )
412
-
413
- if has_val and best_state is not None:
414
- # Load state into unwrapped module to match how it was saved
415
- base_module = self.resnet.module if hasattr(self.resnet, "module") else self.resnet
416
- base_module.load_state_dict(best_state)
417
- self.training_history = history
418
-
419
- # ---------------- Prediction ----------------
420
-
421
- def predict(self, X_test):
422
- self.resnet.eval()
423
- if isinstance(X_test, pd.DataFrame):
424
- X_np = X_test.to_numpy(dtype=np.float32, copy=False)
425
- else:
426
- X_np = np.asarray(X_test, dtype=np.float32)
427
-
428
- inference_cm = getattr(torch, "inference_mode", torch.no_grad)
429
- with inference_cm():
430
- y_pred = self(X_np).cpu().numpy()
431
-
432
- if self.task_type == 'classification':
433
- y_pred = 1 / (1 + np.exp(-y_pred)) # Sigmoid converts logits to probabilities.
434
- else:
435
- y_pred = np.clip(y_pred, 1e-6, None)
436
- return y_pred.flatten()
437
-
438
- # ---------------- Set Params ----------------
439
-
440
- def set_params(self, params):
441
- for key, value in params.items():
442
- if hasattr(self, key):
443
- setattr(self, key, value)
444
- else:
445
- raise ValueError(f"Parameter {key} not found in model.")
446
- return self
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.cuda.amp import GradScaler
10
+ from torch.nn.parallel import DistributedDataParallel as DDP
11
+ from torch.nn.utils import clip_grad_norm_
12
+ from torch.utils.data import TensorDataset
13
+
14
+ from ins_pricing.modelling.bayesopt.utils.distributed_utils import DistributedUtils
15
+ from ins_pricing.modelling.bayesopt.utils.torch_trainer_mixin import TorchTrainerMixin
16
+ from ins_pricing.utils import EPS
17
+ from ins_pricing.utils.losses import (
18
+ infer_loss_name_from_model_name,
19
+ normalize_loss_name,
20
+ resolve_tweedie_power,
21
+ )
22
+
23
+
24
+ # =============================================================================
25
+ # ResNet model and sklearn-style wrapper
26
+ # =============================================================================
27
+
28
+ # ResNet model definition
29
+ # Residual block: two linear layers + ReLU + residual connection
30
+ # ResBlock inherits nn.Module
31
+ class ResBlock(nn.Module):
32
+ def __init__(self, dim: int, dropout: float = 0.1,
33
+ use_layernorm: bool = False, residual_scale: float = 0.1,
34
+ stochastic_depth: float = 0.0
35
+ ):
36
+ super().__init__()
37
+ self.use_layernorm = use_layernorm
38
+
39
+ if use_layernorm:
40
+ Norm = nn.LayerNorm # Normalize the last dimension
41
+ else:
42
+ def Norm(d): return nn.BatchNorm1d(d) # Keep a switch to try BN
43
+
44
+ self.norm1 = Norm(dim)
45
+ self.fc1 = nn.Linear(dim, dim, bias=True)
46
+ self.act = nn.ReLU(inplace=True)
47
+ self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
48
+ # Enable post-second-layer norm if needed: self.norm2 = Norm(dim)
49
+ self.fc2 = nn.Linear(dim, dim, bias=True)
50
+
51
+ # Residual scaling to stabilize early training
52
+ self.res_scale = nn.Parameter(
53
+ torch.tensor(residual_scale, dtype=torch.float32)
54
+ )
55
+ self.stochastic_depth = max(0.0, float(stochastic_depth))
56
+
57
+ def _drop_path(self, x: torch.Tensor) -> torch.Tensor:
58
+ if self.stochastic_depth <= 0.0 or not self.training:
59
+ return x
60
+ keep_prob = 1.0 - self.stochastic_depth
61
+ if keep_prob <= 0.0:
62
+ return torch.zeros_like(x)
63
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1)
64
+ random_tensor = keep_prob + torch.rand(
65
+ shape, dtype=x.dtype, device=x.device)
66
+ binary_tensor = torch.floor(random_tensor)
67
+ return x * binary_tensor / keep_prob
68
+
69
+ def forward(self, x):
70
+ # Pre-activation structure
71
+ out = self.norm1(x)
72
+ out = self.fc1(out)
73
+ out = self.act(out)
74
+ out = self.dropout(out)
75
+ # If a second norm is enabled: out = self.norm2(out)
76
+ out = self.fc2(out)
77
+ # Apply residual scaling then add
78
+ out = self.res_scale * out
79
+ out = self._drop_path(out)
80
+ return x + out
81
+
82
+ # ResNetSequential defines the full network
83
+
84
+
85
+ class ResNetSequential(nn.Module):
86
+ # Input shape: (batch, input_dim)
87
+ # Network: FC + norm + ReLU, stack residual blocks, output Softplus
88
+
89
+ def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
90
+ use_layernorm: bool = True, dropout: float = 0.1,
91
+ residual_scale: float = 0.1, stochastic_depth: float = 0.0,
92
+ task_type: str = 'regression'):
93
+ super(ResNetSequential, self).__init__()
94
+
95
+ self.net = nn.Sequential()
96
+ self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
97
+
98
+ # Optional explicit normalization after the first layer:
99
+ # For LayerNorm:
100
+ # self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
101
+ # Or BatchNorm:
102
+ # self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
103
+
104
+ # If desired, insert ReLU before residual blocks:
105
+ # self.net.add_module('relu1', nn.ReLU(inplace=True))
106
+
107
+ # Residual blocks
108
+ drop_path_rate = max(0.0, float(stochastic_depth))
109
+ for i in range(block_num):
110
+ if block_num > 1:
111
+ block_drop = drop_path_rate * (i / (block_num - 1))
112
+ else:
113
+ block_drop = drop_path_rate
114
+ self.net.add_module(
115
+ f'ResBlk_{i+1}',
116
+ ResBlock(
117
+ hidden_dim,
118
+ dropout=dropout,
119
+ use_layernorm=use_layernorm,
120
+ residual_scale=residual_scale,
121
+ stochastic_depth=block_drop)
122
+ )
123
+
124
+ self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
125
+
126
+ if task_type == 'classification':
127
+ self.net.add_module('softplus', nn.Identity())
128
+ else:
129
+ self.net.add_module('softplus', nn.Softplus())
130
+
131
+ def forward(self, x):
132
+ if self.training and not hasattr(self, '_printed_device'):
133
+ print(f">>> ResNetSequential executing on device: {x.device}")
134
+ self._printed_device = True
135
+ return self.net(x)
136
+
137
+ # Define the ResNet sklearn-style wrapper.
138
+
139
+
140
+ class ResNetSklearn(TorchTrainerMixin, nn.Module):
141
+ def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
142
+ block_num: int = 2, batch_num: int = 100, epochs: int = 100,
143
+ task_type: str = 'regression',
144
+ tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
145
+ use_layernorm: bool = True, dropout: float = 0.1,
146
+ residual_scale: float = 0.1,
147
+ stochastic_depth: float = 0.0,
148
+ weight_decay: float = 1e-4,
149
+ use_data_parallel: bool = True,
150
+ use_ddp: bool = False,
151
+ loss_name: Optional[str] = None):
152
+ super(ResNetSklearn, self).__init__()
153
+
154
+ self.use_ddp = use_ddp
155
+ self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = (
156
+ False, 0, 0, 1)
157
+
158
+ if self.use_ddp:
159
+ self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = DistributedUtils.setup_ddp()
160
+
161
+ self.input_dim = input_dim
162
+ self.hidden_dim = hidden_dim
163
+ self.block_num = block_num
164
+ self.batch_num = batch_num
165
+ self.epochs = epochs
166
+ self.task_type = task_type
167
+ self.model_nme = model_nme
168
+ self.learning_rate = learning_rate
169
+ self.weight_decay = weight_decay
170
+ self.patience = patience
171
+ self.use_layernorm = use_layernorm
172
+ self.dropout = dropout
173
+ self.residual_scale = residual_scale
174
+ self.stochastic_depth = max(0.0, float(stochastic_depth))
175
+ self.loss_curve_path: Optional[str] = None
176
+ self.training_history: Dict[str, List[float]] = {
177
+ "train": [], "val": []}
178
+ self.use_data_parallel = bool(use_data_parallel)
179
+
180
+ # Device selection: cuda > mps > cpu
181
+ if self.is_ddp_enabled:
182
+ self.device = torch.device(f'cuda:{self.local_rank}')
183
+ elif torch.cuda.is_available():
184
+ self.device = torch.device('cuda')
185
+ elif torch.backends.mps.is_available():
186
+ self.device = torch.device('mps')
187
+ else:
188
+ self.device = torch.device('cpu')
189
+
190
+ resolved_loss = normalize_loss_name(loss_name, self.task_type)
191
+ if self.task_type == 'classification':
192
+ self.loss_name = "logloss"
193
+ self.tw_power = None
194
+ else:
195
+ if resolved_loss == "auto":
196
+ resolved_loss = infer_loss_name_from_model_name(self.model_nme)
197
+ self.loss_name = resolved_loss
198
+ if self.loss_name == "tweedie":
199
+ self.tw_power = float(tweedie_power) if tweedie_power is not None else 1.5
200
+ else:
201
+ self.tw_power = resolve_tweedie_power(self.loss_name, default=1.5)
202
+
203
+ # Build network (construct on CPU first)
204
+ core = ResNetSequential(
205
+ self.input_dim,
206
+ self.hidden_dim,
207
+ self.block_num,
208
+ use_layernorm=self.use_layernorm,
209
+ dropout=self.dropout,
210
+ residual_scale=self.residual_scale,
211
+ stochastic_depth=self.stochastic_depth,
212
+ task_type=self.task_type
213
+ )
214
+
215
+ # ===== Multi-GPU: DataParallel vs DistributedDataParallel =====
216
+ if self.is_ddp_enabled:
217
+ core = core.to(self.device)
218
+ core = DDP(core, device_ids=[
219
+ self.local_rank], output_device=self.local_rank)
220
+ self.use_data_parallel = False
221
+ elif use_data_parallel and (self.device.type == 'cuda') and (torch.cuda.device_count() > 1):
222
+ if self.use_ddp and not self.is_ddp_enabled:
223
+ print(
224
+ ">>> DDP requested but not initialized; falling back to DataParallel.")
225
+ core = nn.DataParallel(core, device_ids=list(
226
+ range(torch.cuda.device_count())))
227
+ # DataParallel scatters inputs, but the primary device remains cuda:0.
228
+ self.device = torch.device('cuda')
229
+ self.use_data_parallel = True
230
+ else:
231
+ self.use_data_parallel = False
232
+
233
+ self.resnet = core.to(self.device)
234
+
235
+ # ================ Internal helpers ================
236
+ @staticmethod
237
+ def _validate_vector(arr, name: str, n_rows: int) -> None:
238
+ if arr is None:
239
+ return
240
+ if isinstance(arr, pd.DataFrame):
241
+ if arr.shape[1] != 1:
242
+ raise ValueError(f"{name} must be 1d (single column).")
243
+ length = len(arr)
244
+ else:
245
+ arr_np = np.asarray(arr)
246
+ if arr_np.ndim == 0:
247
+ raise ValueError(f"{name} must be 1d.")
248
+ if arr_np.ndim > 2 or (arr_np.ndim == 2 and arr_np.shape[1] != 1):
249
+ raise ValueError(f"{name} must be 1d or Nx1.")
250
+ length = arr_np.shape[0]
251
+ if length != n_rows:
252
+ raise ValueError(
253
+ f"{name} length {length} does not match X length {n_rows}."
254
+ )
255
+
256
+ def _validate_inputs(self, X, y, w, label: str) -> None:
257
+ if X is None:
258
+ raise ValueError(f"{label} X cannot be None.")
259
+ n_rows = len(X)
260
+ if y is None:
261
+ raise ValueError(f"{label} y cannot be None.")
262
+ self._validate_vector(y, f"{label} y", n_rows)
263
+ self._validate_vector(w, f"{label} w", n_rows)
264
+
265
+ def _build_train_val_tensors(self, X_train, y_train, w_train, X_val, y_val, w_val):
266
+ self._validate_inputs(X_train, y_train, w_train, "train")
267
+ if X_val is not None or y_val is not None or w_val is not None:
268
+ if X_val is None or y_val is None:
269
+ raise ValueError("validation X and y must both be provided.")
270
+ self._validate_inputs(X_val, y_val, w_val, "val")
271
+
272
+ def _to_numpy(arr):
273
+ if hasattr(arr, "to_numpy"):
274
+ return arr.to_numpy(dtype=np.float32, copy=False)
275
+ return np.asarray(arr, dtype=np.float32)
276
+
277
+ X_tensor = torch.as_tensor(_to_numpy(X_train))
278
+ y_tensor = torch.as_tensor(_to_numpy(y_train)).view(-1, 1)
279
+ w_tensor = (
280
+ torch.as_tensor(_to_numpy(w_train)).view(-1, 1)
281
+ if w_train is not None else torch.ones_like(y_tensor)
282
+ )
283
+
284
+ has_val = X_val is not None and y_val is not None
285
+ if has_val:
286
+ X_val_tensor = torch.as_tensor(_to_numpy(X_val))
287
+ y_val_tensor = torch.as_tensor(_to_numpy(y_val)).view(-1, 1)
288
+ w_val_tensor = (
289
+ torch.as_tensor(_to_numpy(w_val)).view(-1, 1)
290
+ if w_val is not None else torch.ones_like(y_val_tensor)
291
+ )
292
+ else:
293
+ X_val_tensor = y_val_tensor = w_val_tensor = None
294
+ return X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val
295
+
296
+ def forward(self, x):
297
+ # Handle SHAP NumPy input.
298
+ if isinstance(x, np.ndarray):
299
+ x_tensor = torch.as_tensor(x, dtype=torch.float32)
300
+ else:
301
+ x_tensor = x
302
+
303
+ x_tensor = x_tensor.to(self.device)
304
+ y_pred = self.resnet(x_tensor)
305
+ return y_pred
306
+
307
+ # ---------------- Training ----------------
308
+
309
+ def fit(self, X_train, y_train, w_train=None,
310
+ X_val=None, y_val=None, w_val=None, trial=None):
311
+
312
+ X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val = \
313
+ self._build_train_val_tensors(
314
+ X_train, y_train, w_train, X_val, y_val, w_val)
315
+
316
+ dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
317
+ dataloader, accum_steps = self._build_dataloader(
318
+ dataset,
319
+ N=X_tensor.shape[0],
320
+ base_bs_gpu=(2048, 1024, 512),
321
+ base_bs_cpu=(256, 128),
322
+ min_bs=64,
323
+ target_effective_cuda=2048,
324
+ target_effective_cpu=1024
325
+ )
326
+
327
+ # Set sampler epoch at the start of each epoch to keep shuffling deterministic.
328
+ if self.is_ddp_enabled and hasattr(dataloader.sampler, 'set_epoch'):
329
+ self.dataloader_sampler = dataloader.sampler
330
+ else:
331
+ self.dataloader_sampler = None
332
+
333
+ # === 4. Optimizer and AMP ===
334
+ self.optimizer = torch.optim.Adam(
335
+ self.resnet.parameters(),
336
+ lr=self.learning_rate,
337
+ weight_decay=float(self.weight_decay),
338
+ )
339
+ self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))
340
+
341
+ X_val_dev = y_val_dev = w_val_dev = None
342
+ val_dataloader = None
343
+ if has_val:
344
+ # Build validation DataLoader.
345
+ val_dataset = TensorDataset(
346
+ X_val_tensor, y_val_tensor, w_val_tensor)
347
+ # No backward pass in validation; batch size can be larger for throughput.
348
+ val_dataloader = self._build_val_dataloader(
349
+ val_dataset, dataloader, accum_steps)
350
+ # Validation usually does not need a DDP sampler because we validate on the main process
351
+ # or aggregate results. For simplicity, keep validation on a single GPU or the main process.
352
+
353
+ is_data_parallel = isinstance(self.resnet, nn.DataParallel)
354
+
355
+ def forward_fn(batch):
356
+ X_batch, y_batch, w_batch = batch
357
+
358
+ if not is_data_parallel:
359
+ X_batch = X_batch.to(self.device, non_blocking=True)
360
+ # Keep targets and weights on the main device for loss computation.
361
+ y_batch = y_batch.to(self.device, non_blocking=True)
362
+ w_batch = w_batch.to(self.device, non_blocking=True)
363
+
364
+ y_pred = self.resnet(X_batch)
365
+ return y_pred, y_batch, w_batch
366
+
367
+ def val_forward_fn():
368
+ total_loss = 0.0
369
+ total_weight = 0.0
370
+ for batch in val_dataloader:
371
+ X_b, y_b, w_b = batch
372
+ if not is_data_parallel:
373
+ X_b = X_b.to(self.device, non_blocking=True)
374
+ y_b = y_b.to(self.device, non_blocking=True)
375
+ w_b = w_b.to(self.device, non_blocking=True)
376
+
377
+ y_pred = self.resnet(X_b)
378
+
379
+ # Manually compute weighted loss for accurate aggregation.
380
+ losses = self._compute_losses(
381
+ y_pred, y_b, apply_softplus=False)
382
+
383
+ batch_weight_sum = torch.clamp(w_b.sum(), min=EPS)
384
+ batch_weighted_loss_sum = (losses * w_b.view(-1)).sum()
385
+
386
+ total_loss += batch_weighted_loss_sum.item()
387
+ total_weight += batch_weight_sum.item()
388
+
389
+ return total_loss / max(total_weight, EPS)
390
+
391
+ clip_fn = None
392
+ if self.device.type == 'cuda':
393
+ def clip_fn(): return (self.scaler.unscale_(self.optimizer),
394
+ clip_grad_norm_(self.resnet.parameters(), max_norm=1.0))
395
+
396
+ # Under DDP, only the main process prints logs and saves models.
397
+ if self.is_ddp_enabled and not DistributedUtils.is_main_process():
398
+ # Non-main processes skip validation callback logging (handled inside _train_model).
399
+ pass
400
+
401
+ best_state, history = self._train_model(
402
+ self.resnet,
403
+ dataloader,
404
+ accum_steps,
405
+ self.optimizer,
406
+ self.scaler,
407
+ forward_fn,
408
+ val_forward_fn if has_val else None,
409
+ apply_softplus=False,
410
+ clip_fn=clip_fn,
411
+ trial=trial,
412
+ loss_curve_path=getattr(self, "loss_curve_path", None)
413
+ )
414
+
415
+ if has_val and best_state is not None:
416
+ # Load state into unwrapped module to match how it was saved
417
+ base_module = self.resnet.module if hasattr(self.resnet, "module") else self.resnet
418
+ base_module.load_state_dict(best_state)
419
+ self.training_history = history
420
+
421
+ # ---------------- Prediction ----------------
422
+
423
+ def predict(self, X_test):
424
+ self.resnet.eval()
425
+ if isinstance(X_test, pd.DataFrame):
426
+ X_np = X_test.to_numpy(dtype=np.float32, copy=False)
427
+ else:
428
+ X_np = np.asarray(X_test, dtype=np.float32)
429
+
430
+ inference_cm = getattr(torch, "inference_mode", torch.no_grad)
431
+ with inference_cm():
432
+ y_pred = self(X_np).cpu().numpy()
433
+
434
+ if self.task_type == 'classification':
435
+ y_pred = 1 / (1 + np.exp(-y_pred)) # Sigmoid converts logits to probabilities.
436
+ else:
437
+ y_pred = np.clip(y_pred, 1e-6, None)
438
+ return y_pred.flatten()
439
+
440
+ # ---------------- Set Params ----------------
441
+
442
+ def set_params(self, params):
443
+ for key, value in params.items():
444
+ if hasattr(self, key):
445
+ setattr(self, key, value)
446
+ else:
447
+ raise ValueError(f"Parameter {key} not found in model.")
448
+ return self
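
For orientation, here is a minimal usage sketch of the ResNetSklearn wrapper defined in the new file, using the 0.5.0 import path. The synthetic data, feature count, and hyperparameter values are illustrative assumptions only, not part of the package.

import numpy as np
from ins_pricing.modelling.bayesopt.models.model_resn import ResNetSklearn

# Illustrative data: 1,000 rows, 20 numeric features, a non-negative target, unit weights.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20)).astype(np.float32)
y = rng.gamma(shape=2.0, scale=1.0, size=1000).astype(np.float32)
w = np.ones(1000, dtype=np.float32)

model = ResNetSklearn(
    model_nme="resn_demo",   # identifier; also used to infer the loss when loss_name is omitted
    input_dim=X.shape[1],
    hidden_dim=64,
    block_num=2,
    task_type="regression",
    loss_name="tweedie",
    tweedie_power=1.5,
)
model.fit(X, y, w_train=w)   # pass X_val/y_val/w_val to track validation loss and keep the best state
y_pred = model.predict(X)    # regression outputs are clipped to be strictly positive
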