ins-pricing 0.4.4-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +74 -56
- ins_pricing/__init__.py +142 -90
- ins_pricing/cli/BayesOpt_entry.py +52 -50
- ins_pricing/cli/BayesOpt_incremental.py +832 -898
- ins_pricing/cli/Explain_Run.py +31 -23
- ins_pricing/cli/Explain_entry.py +532 -579
- ins_pricing/cli/Pricing_Run.py +31 -23
- ins_pricing/cli/bayesopt_entry_runner.py +1440 -1438
- ins_pricing/cli/utils/cli_common.py +256 -256
- ins_pricing/cli/utils/cli_config.py +375 -375
- ins_pricing/cli/utils/import_resolver.py +382 -365
- ins_pricing/cli/utils/notebook_utils.py +340 -340
- ins_pricing/cli/watchdog_run.py +209 -201
- ins_pricing/frontend/README.md +573 -419
- ins_pricing/frontend/__init__.py +10 -10
- ins_pricing/frontend/config_builder.py +1 -0
- ins_pricing/frontend/example_workflows.py +1 -1
- ins_pricing/governance/__init__.py +20 -20
- ins_pricing/governance/release.py +159 -159
- ins_pricing/modelling/README.md +67 -0
- ins_pricing/modelling/__init__.py +147 -92
- ins_pricing/modelling/bayesopt/README.md +59 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +562 -550
- ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +965 -962
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +482 -548
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +915 -913
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +788 -785
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +448 -446
- ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1308 -1308
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +3 -3
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +197 -198
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +344 -344
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +283 -283
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +346 -347
- ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
- ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
- ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
- ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
- ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +623 -623
- ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
- ins_pricing/modelling/explain/__init__.py +55 -55
- ins_pricing/modelling/explain/metrics.py +27 -174
- ins_pricing/modelling/explain/permutation.py +237 -237
- ins_pricing/modelling/plotting/__init__.py +40 -36
- ins_pricing/modelling/plotting/compat.py +228 -0
- ins_pricing/modelling/plotting/curves.py +572 -572
- ins_pricing/modelling/plotting/diagnostics.py +163 -163
- ins_pricing/modelling/plotting/geo.py +362 -362
- ins_pricing/modelling/plotting/importance.py +121 -121
- ins_pricing/pricing/__init__.py +27 -27
- ins_pricing/production/__init__.py +35 -25
- ins_pricing/production/{predict.py → inference.py} +140 -57
- ins_pricing/production/monitoring.py +8 -21
- ins_pricing/reporting/__init__.py +11 -11
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/production/test_inference.py +90 -0
- ins_pricing/utils/__init__.py +116 -83
- ins_pricing/utils/device.py +255 -255
- ins_pricing/utils/features.py +53 -0
- ins_pricing/utils/io.py +72 -0
- ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
- ins_pricing/utils/metrics.py +158 -24
- ins_pricing/utils/numerics.py +76 -0
- ins_pricing/utils/paths.py +9 -1
- {ins_pricing-0.4.4.dist-info → ins_pricing-0.5.0.dist-info}/METADATA +55 -35
- ins_pricing-0.5.0.dist-info/RECORD +131 -0
- ins_pricing/CHANGELOG.md +0 -272
- ins_pricing/RELEASE_NOTES_0.2.8.md +0 -344
- ins_pricing/docs/LOSS_FUNCTIONS.md +0 -78
- ins_pricing/docs/modelling/BayesOpt_USAGE.md +0 -945
- ins_pricing/docs/modelling/README.md +0 -34
- ins_pricing/frontend/QUICKSTART.md +0 -152
- ins_pricing/modelling/core/BayesOpt.py +0 -146
- ins_pricing/modelling/core/__init__.py +0 -1
- ins_pricing/modelling/core/bayesopt/PHASE2_REFACTORING_SUMMARY.md +0 -449
- ins_pricing/modelling/core/bayesopt/PHASE3_REFACTORING_SUMMARY.md +0 -406
- ins_pricing/modelling/core/bayesopt/REFACTORING_SUMMARY.md +0 -247
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
- ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
- ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
- ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
- ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
- ins_pricing/modelling/core/bayesopt/utils.py +0 -105
- ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
- ins_pricing/tests/production/test_predict.py +0 -233
- ins_pricing-0.4.4.dist-info/RECORD +0 -137
- /ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +0 -0
- /ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +0 -0
- /ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +0 -0
- {ins_pricing-0.4.4.dist-info → ins_pricing-0.5.0.dist-info}/WHEEL +0 -0
- {ins_pricing-0.4.4.dist-info → ins_pricing-0.5.0.dist-info}/top_level.txt +0 -0
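The 0.5.0 layout flattens `modelling/core/bayesopt` to `modelling/bayesopt`, moves the loss helpers to `ins_pricing/utils/losses.py`, and renames `production/predict.py` to `production/inference.py`. A hedged migration sketch for downstream imports, based only on the renames listed above (confirm the exact public API against the installed 0.5.0 package):

```python
# 0.4.4 paths (removed in 0.5.0):
#   from ins_pricing.modelling.core.bayesopt.utils.torch_trainer_mixin import TorchTrainerMixin
#   from ins_pricing.production import predict

# 0.5.0 paths (as they appear in this diff):
from ins_pricing.modelling.bayesopt.utils.torch_trainer_mixin import TorchTrainerMixin
from ins_pricing.production import inference
```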
ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py

@@ -1,446 +1,448 @@
-from __future__ import annotations
-
-from typing import Dict, List, Optional
-
-import numpy as np
-import pandas as pd
-import torch
-import torch.nn as nn
-from torch.cuda.amp import GradScaler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.nn.utils import clip_grad_norm_
-from torch.utils.data import TensorDataset
-
[… removed lines 14-446 (the 0.4.4 file body) are truncated to fragments in the source diff view …]
+from __future__ import annotations
+
+from typing import Dict, List, Optional
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.nn.utils import clip_grad_norm_
+from torch.utils.data import TensorDataset
+
+from ins_pricing.modelling.bayesopt.utils.distributed_utils import DistributedUtils
+from ins_pricing.modelling.bayesopt.utils.torch_trainer_mixin import TorchTrainerMixin
+from ins_pricing.utils import EPS
+from ins_pricing.utils.losses import (
+    infer_loss_name_from_model_name,
+    normalize_loss_name,
+    resolve_tweedie_power,
+)
+
+
+# =============================================================================
+# ResNet model and sklearn-style wrapper
+# =============================================================================
+
+# ResNet model definition
+# Residual block: two linear layers + ReLU + residual connection
+# ResBlock inherits nn.Module
+class ResBlock(nn.Module):
+    def __init__(self, dim: int, dropout: float = 0.1,
+                 use_layernorm: bool = False, residual_scale: float = 0.1,
+                 stochastic_depth: float = 0.0
+                 ):
+        super().__init__()
+        self.use_layernorm = use_layernorm
+
+        if use_layernorm:
+            Norm = nn.LayerNorm  # Normalize the last dimension
+        else:
+            def Norm(d): return nn.BatchNorm1d(d)  # Keep a switch to try BN
+
+        self.norm1 = Norm(dim)
+        self.fc1 = nn.Linear(dim, dim, bias=True)
+        self.act = nn.ReLU(inplace=True)
+        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+        # Enable post-second-layer norm if needed: self.norm2 = Norm(dim)
+        self.fc2 = nn.Linear(dim, dim, bias=True)
+
+        # Residual scaling to stabilize early training
+        self.res_scale = nn.Parameter(
+            torch.tensor(residual_scale, dtype=torch.float32)
+        )
+        self.stochastic_depth = max(0.0, float(stochastic_depth))
+
+    def _drop_path(self, x: torch.Tensor) -> torch.Tensor:
+        if self.stochastic_depth <= 0.0 or not self.training:
+            return x
+        keep_prob = 1.0 - self.stochastic_depth
+        if keep_prob <= 0.0:
+            return torch.zeros_like(x)
+        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+        random_tensor = keep_prob + torch.rand(
+            shape, dtype=x.dtype, device=x.device)
+        binary_tensor = torch.floor(random_tensor)
+        return x * binary_tensor / keep_prob
+
+    def forward(self, x):
+        # Pre-activation structure
+        out = self.norm1(x)
+        out = self.fc1(out)
+        out = self.act(out)
+        out = self.dropout(out)
+        # If a second norm is enabled: out = self.norm2(out)
+        out = self.fc2(out)
+        # Apply residual scaling then add
+        out = self.res_scale * out
+        out = self._drop_path(out)
+        return x + out
+
+# ResNetSequential defines the full network
+
+
+class ResNetSequential(nn.Module):
+    # Input shape: (batch, input_dim)
+    # Network: FC + norm + ReLU, stack residual blocks, output Softplus
+
+    def __init__(self, input_dim: int, hidden_dim: int = 64, block_num: int = 2,
+                 use_layernorm: bool = True, dropout: float = 0.1,
+                 residual_scale: float = 0.1, stochastic_depth: float = 0.0,
+                 task_type: str = 'regression'):
+        super(ResNetSequential, self).__init__()
+
+        self.net = nn.Sequential()
+        self.net.add_module('fc1', nn.Linear(input_dim, hidden_dim))
+
+        # Optional explicit normalization after the first layer:
+        # For LayerNorm:
+        # self.net.add_module('norm1', nn.LayerNorm(hidden_dim))
+        # Or BatchNorm:
+        # self.net.add_module('norm1', nn.BatchNorm1d(hidden_dim))
+
+        # If desired, insert ReLU before residual blocks:
+        # self.net.add_module('relu1', nn.ReLU(inplace=True))
+
+        # Residual blocks
+        drop_path_rate = max(0.0, float(stochastic_depth))
+        for i in range(block_num):
+            if block_num > 1:
+                block_drop = drop_path_rate * (i / (block_num - 1))
+            else:
+                block_drop = drop_path_rate
+            self.net.add_module(
+                f'ResBlk_{i+1}',
+                ResBlock(
+                    hidden_dim,
+                    dropout=dropout,
+                    use_layernorm=use_layernorm,
+                    residual_scale=residual_scale,
+                    stochastic_depth=block_drop)
+            )
+
+        self.net.add_module('fc_out', nn.Linear(hidden_dim, 1))
+
+        if task_type == 'classification':
+            self.net.add_module('softplus', nn.Identity())
+        else:
+            self.net.add_module('softplus', nn.Softplus())
+
+    def forward(self, x):
+        if self.training and not hasattr(self, '_printed_device'):
+            print(f">>> ResNetSequential executing on device: {x.device}")
+            self._printed_device = True
+        return self.net(x)
+
+# Define the ResNet sklearn-style wrapper.
+
+
+class ResNetSklearn(TorchTrainerMixin, nn.Module):
+    def __init__(self, model_nme: str, input_dim: int, hidden_dim: int = 64,
+                 block_num: int = 2, batch_num: int = 100, epochs: int = 100,
+                 task_type: str = 'regression',
+                 tweedie_power: float = 1.5, learning_rate: float = 0.01, patience: int = 10,
+                 use_layernorm: bool = True, dropout: float = 0.1,
+                 residual_scale: float = 0.1,
+                 stochastic_depth: float = 0.0,
+                 weight_decay: float = 1e-4,
+                 use_data_parallel: bool = True,
+                 use_ddp: bool = False,
+                 loss_name: Optional[str] = None):
+        super(ResNetSklearn, self).__init__()
+
+        self.use_ddp = use_ddp
+        self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = (
+            False, 0, 0, 1)
+
+        if self.use_ddp:
+            self.is_ddp_enabled, self.local_rank, self.rank, self.world_size = DistributedUtils.setup_ddp()
+
+        self.input_dim = input_dim
+        self.hidden_dim = hidden_dim
+        self.block_num = block_num
+        self.batch_num = batch_num
+        self.epochs = epochs
+        self.task_type = task_type
+        self.model_nme = model_nme
+        self.learning_rate = learning_rate
+        self.weight_decay = weight_decay
+        self.patience = patience
+        self.use_layernorm = use_layernorm
+        self.dropout = dropout
+        self.residual_scale = residual_scale
+        self.stochastic_depth = max(0.0, float(stochastic_depth))
+        self.loss_curve_path: Optional[str] = None
+        self.training_history: Dict[str, List[float]] = {
+            "train": [], "val": []}
+        self.use_data_parallel = bool(use_data_parallel)
+
+        # Device selection: cuda > mps > cpu
+        if self.is_ddp_enabled:
+            self.device = torch.device(f'cuda:{self.local_rank}')
+        elif torch.cuda.is_available():
+            self.device = torch.device('cuda')
+        elif torch.backends.mps.is_available():
+            self.device = torch.device('mps')
+        else:
+            self.device = torch.device('cpu')
+
+        resolved_loss = normalize_loss_name(loss_name, self.task_type)
+        if self.task_type == 'classification':
+            self.loss_name = "logloss"
+            self.tw_power = None
+        else:
+            if resolved_loss == "auto":
+                resolved_loss = infer_loss_name_from_model_name(self.model_nme)
+            self.loss_name = resolved_loss
+            if self.loss_name == "tweedie":
+                self.tw_power = float(tweedie_power) if tweedie_power is not None else 1.5
+            else:
+                self.tw_power = resolve_tweedie_power(self.loss_name, default=1.5)
+
+        # Build network (construct on CPU first)
+        core = ResNetSequential(
+            self.input_dim,
+            self.hidden_dim,
+            self.block_num,
+            use_layernorm=self.use_layernorm,
+            dropout=self.dropout,
+            residual_scale=self.residual_scale,
+            stochastic_depth=self.stochastic_depth,
+            task_type=self.task_type
+        )
+
+        # ===== Multi-GPU: DataParallel vs DistributedDataParallel =====
+        if self.is_ddp_enabled:
+            core = core.to(self.device)
+            core = DDP(core, device_ids=[
+                       self.local_rank], output_device=self.local_rank)
+            self.use_data_parallel = False
+        elif use_data_parallel and (self.device.type == 'cuda') and (torch.cuda.device_count() > 1):
+            if self.use_ddp and not self.is_ddp_enabled:
+                print(
+                    ">>> DDP requested but not initialized; falling back to DataParallel.")
+            core = nn.DataParallel(core, device_ids=list(
+                range(torch.cuda.device_count())))
+            # DataParallel scatters inputs, but the primary device remains cuda:0.
+            self.device = torch.device('cuda')
+            self.use_data_parallel = True
+        else:
+            self.use_data_parallel = False
+
+        self.resnet = core.to(self.device)
+
+    # ================ Internal helpers ================
+    @staticmethod
+    def _validate_vector(arr, name: str, n_rows: int) -> None:
+        if arr is None:
+            return
+        if isinstance(arr, pd.DataFrame):
+            if arr.shape[1] != 1:
+                raise ValueError(f"{name} must be 1d (single column).")
+            length = len(arr)
+        else:
+            arr_np = np.asarray(arr)
+            if arr_np.ndim == 0:
+                raise ValueError(f"{name} must be 1d.")
+            if arr_np.ndim > 2 or (arr_np.ndim == 2 and arr_np.shape[1] != 1):
+                raise ValueError(f"{name} must be 1d or Nx1.")
+            length = arr_np.shape[0]
+        if length != n_rows:
+            raise ValueError(
+                f"{name} length {length} does not match X length {n_rows}."
+            )
+
+    def _validate_inputs(self, X, y, w, label: str) -> None:
+        if X is None:
+            raise ValueError(f"{label} X cannot be None.")
+        n_rows = len(X)
+        if y is None:
+            raise ValueError(f"{label} y cannot be None.")
+        self._validate_vector(y, f"{label} y", n_rows)
+        self._validate_vector(w, f"{label} w", n_rows)
+
+    def _build_train_val_tensors(self, X_train, y_train, w_train, X_val, y_val, w_val):
+        self._validate_inputs(X_train, y_train, w_train, "train")
+        if X_val is not None or y_val is not None or w_val is not None:
+            if X_val is None or y_val is None:
+                raise ValueError("validation X and y must both be provided.")
+            self._validate_inputs(X_val, y_val, w_val, "val")
+
+        def _to_numpy(arr):
+            if hasattr(arr, "to_numpy"):
+                return arr.to_numpy(dtype=np.float32, copy=False)
+            return np.asarray(arr, dtype=np.float32)
+
+        X_tensor = torch.as_tensor(_to_numpy(X_train))
+        y_tensor = torch.as_tensor(_to_numpy(y_train)).view(-1, 1)
+        w_tensor = (
+            torch.as_tensor(_to_numpy(w_train)).view(-1, 1)
+            if w_train is not None else torch.ones_like(y_tensor)
+        )
+
+        has_val = X_val is not None and y_val is not None
+        if has_val:
+            X_val_tensor = torch.as_tensor(_to_numpy(X_val))
+            y_val_tensor = torch.as_tensor(_to_numpy(y_val)).view(-1, 1)
+            w_val_tensor = (
+                torch.as_tensor(_to_numpy(w_val)).view(-1, 1)
+                if w_val is not None else torch.ones_like(y_val_tensor)
+            )
+        else:
+            X_val_tensor = y_val_tensor = w_val_tensor = None
+        return X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val
+
+    def forward(self, x):
+        # Handle SHAP NumPy input.
+        if isinstance(x, np.ndarray):
+            x_tensor = torch.as_tensor(x, dtype=torch.float32)
+        else:
+            x_tensor = x
+
+        x_tensor = x_tensor.to(self.device)
+        y_pred = self.resnet(x_tensor)
+        return y_pred
+
+    # ---------------- Training ----------------
+
+    def fit(self, X_train, y_train, w_train=None,
+            X_val=None, y_val=None, w_val=None, trial=None):
+
+        X_tensor, y_tensor, w_tensor, X_val_tensor, y_val_tensor, w_val_tensor, has_val = \
+            self._build_train_val_tensors(
+                X_train, y_train, w_train, X_val, y_val, w_val)
+
+        dataset = TensorDataset(X_tensor, y_tensor, w_tensor)
+        dataloader, accum_steps = self._build_dataloader(
+            dataset,
+            N=X_tensor.shape[0],
+            base_bs_gpu=(2048, 1024, 512),
+            base_bs_cpu=(256, 128),
+            min_bs=64,
+            target_effective_cuda=2048,
+            target_effective_cpu=1024
+        )
+
+        # Set sampler epoch at the start of each epoch to keep shuffling deterministic.
+        if self.is_ddp_enabled and hasattr(dataloader.sampler, 'set_epoch'):
+            self.dataloader_sampler = dataloader.sampler
+        else:
+            self.dataloader_sampler = None
+
+        # === 4. Optimizer and AMP ===
+        self.optimizer = torch.optim.Adam(
+            self.resnet.parameters(),
+            lr=self.learning_rate,
+            weight_decay=float(self.weight_decay),
+        )
+        self.scaler = GradScaler(enabled=(self.device.type == 'cuda'))
+
+        X_val_dev = y_val_dev = w_val_dev = None
+        val_dataloader = None
+        if has_val:
+            # Build validation DataLoader.
+            val_dataset = TensorDataset(
+                X_val_tensor, y_val_tensor, w_val_tensor)
+            # No backward pass in validation; batch size can be larger for throughput.
+            val_dataloader = self._build_val_dataloader(
+                val_dataset, dataloader, accum_steps)
+            # Validation usually does not need a DDP sampler because we validate on the main process
+            # or aggregate results. For simplicity, keep validation on a single GPU or the main process.
+
+        is_data_parallel = isinstance(self.resnet, nn.DataParallel)
+
+        def forward_fn(batch):
+            X_batch, y_batch, w_batch = batch
+
+            if not is_data_parallel:
+                X_batch = X_batch.to(self.device, non_blocking=True)
+            # Keep targets and weights on the main device for loss computation.
+            y_batch = y_batch.to(self.device, non_blocking=True)
+            w_batch = w_batch.to(self.device, non_blocking=True)
+
+            y_pred = self.resnet(X_batch)
+            return y_pred, y_batch, w_batch
+
+        def val_forward_fn():
+            total_loss = 0.0
+            total_weight = 0.0
+            for batch in val_dataloader:
+                X_b, y_b, w_b = batch
+                if not is_data_parallel:
+                    X_b = X_b.to(self.device, non_blocking=True)
+                y_b = y_b.to(self.device, non_blocking=True)
+                w_b = w_b.to(self.device, non_blocking=True)
+
+                y_pred = self.resnet(X_b)
+
+                # Manually compute weighted loss for accurate aggregation.
+                losses = self._compute_losses(
+                    y_pred, y_b, apply_softplus=False)
+
+                batch_weight_sum = torch.clamp(w_b.sum(), min=EPS)
+                batch_weighted_loss_sum = (losses * w_b.view(-1)).sum()
+
+                total_loss += batch_weighted_loss_sum.item()
+                total_weight += batch_weight_sum.item()
+
+            return total_loss / max(total_weight, EPS)
+
+        clip_fn = None
+        if self.device.type == 'cuda':
+            def clip_fn(): return (self.scaler.unscale_(self.optimizer),
+                                   clip_grad_norm_(self.resnet.parameters(), max_norm=1.0))
+
+        # Under DDP, only the main process prints logs and saves models.
+        if self.is_ddp_enabled and not DistributedUtils.is_main_process():
+            # Non-main processes skip validation callback logging (handled inside _train_model).
+            pass
+
+        best_state, history = self._train_model(
+            self.resnet,
+            dataloader,
+            accum_steps,
+            self.optimizer,
+            self.scaler,
+            forward_fn,
+            val_forward_fn if has_val else None,
+            apply_softplus=False,
+            clip_fn=clip_fn,
+            trial=trial,
+            loss_curve_path=getattr(self, "loss_curve_path", None)
+        )
+
+        if has_val and best_state is not None:
+            # Load state into unwrapped module to match how it was saved
+            base_module = self.resnet.module if hasattr(self.resnet, "module") else self.resnet
+            base_module.load_state_dict(best_state)
+        self.training_history = history
+
+    # ---------------- Prediction ----------------
+
+    def predict(self, X_test):
+        self.resnet.eval()
+        if isinstance(X_test, pd.DataFrame):
+            X_np = X_test.to_numpy(dtype=np.float32, copy=False)
+        else:
+            X_np = np.asarray(X_test, dtype=np.float32)
+
+        inference_cm = getattr(torch, "inference_mode", torch.no_grad)
+        with inference_cm():
+            y_pred = self(X_np).cpu().numpy()
+
+        if self.task_type == 'classification':
+            y_pred = 1 / (1 + np.exp(-y_pred))  # Sigmoid converts logits to probabilities.
+        else:
+            y_pred = np.clip(y_pred, 1e-6, None)
+        return y_pred.flatten()
+
+    # ---------------- Set Params ----------------
+
+    def set_params(self, params):
+        for key, value in params.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+            else:
+                raise ValueError(f"Parameter {key} not found in model.")
+        return self
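For context, a minimal usage sketch of the `ResNetSklearn` wrapper shown in the diff above. The data and model name are hypothetical, and it assumes ins_pricing 0.5.0 is installed and exposes the class at the renamed module path from the file list:

```python
# Hypothetical smoke test of the 0.5.0 ResNet wrapper; data and names are illustrative only.
import numpy as np
from ins_pricing.modelling.bayesopt.models.model_resn import ResNetSklearn

rng = np.random.default_rng(0)
X = rng.random((1000, 20), dtype=np.float32)               # 20 numeric features
y = rng.gamma(2.0, 100.0, size=1000).astype(np.float32)    # positive, severity-like target

model = ResNetSklearn(model_nme="demo_severity", input_dim=20,
                      hidden_dim=64, block_num=2, epochs=10,
                      task_type="regression", loss_name="tweedie")
model.fit(X, y)            # optionally pass X_val/y_val and sample weights w_train
preds = model.predict(X)   # 1-D array, clipped to stay strictly positive
```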