ins-pricing 0.3.2-py3-none-any.whl → 0.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/cli/BayesOpt_entry.py +32 -0
- ins_pricing/cli/utils/import_resolver.py +29 -3
- ins_pricing/cli/utils/notebook_utils.py +3 -2
- ins_pricing/docs/modelling/BayesOpt_USAGE.md +3 -3
- ins_pricing/modelling/core/bayesopt/__init__.py +4 -0
- ins_pricing/modelling/core/bayesopt/config_preprocess.py +12 -0
- ins_pricing/modelling/core/bayesopt/core.py +21 -8
- ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +38 -12
- ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +16 -6
- ins_pricing/modelling/core/bayesopt/models/model_gnn.py +16 -6
- ins_pricing/modelling/core/bayesopt/models/model_resn.py +16 -7
- ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +2 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +25 -8
- ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +14 -11
- ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +29 -10
- ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +28 -12
- ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +13 -14
- ins_pricing/modelling/core/bayesopt/utils/losses.py +129 -0
- ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +18 -3
- ins_pricing/modelling/core/bayesopt/utils/torch_trainer_mixin.py +24 -3
- ins_pricing/production/predict.py +38 -9
- ins_pricing/setup.py +1 -1
- ins_pricing/utils/metrics.py +27 -3
- ins_pricing/utils/torch_compat.py +40 -0
- {ins_pricing-0.3.2.dist-info → ins_pricing-0.3.4.dist-info}/METADATA +162 -162
- {ins_pricing-0.3.2.dist-info → ins_pricing-0.3.4.dist-info}/RECORD +28 -27
- {ins_pricing-0.3.2.dist-info → ins_pricing-0.3.4.dist-info}/WHEEL +0 -0
- {ins_pricing-0.3.2.dist-info → ins_pricing-0.3.4.dist-info}/top_level.txt +0 -0

ins_pricing/cli/BayesOpt_entry.py

@@ -6,6 +6,8 @@ The main implementation lives in bayesopt_entry_runner.py.
 from __future__ import annotations
 
 from pathlib import Path
+import json
+import os
 import sys
 
 if __package__ in {None, ""}:
@@ -13,6 +15,36 @@ if __package__ in {None, ""}:
     if str(repo_root) not in sys.path:
         sys.path.insert(0, str(repo_root))
 
+def _apply_env_from_config(argv: list[str]) -> None:
+    if "--config-json" not in argv:
+        return
+    idx = argv.index("--config-json")
+    if idx + 1 >= len(argv):
+        return
+    raw_path = argv[idx + 1]
+    try:
+        cfg_path = Path(raw_path).expanduser()
+        if not cfg_path.is_absolute():
+            cfg_path = cfg_path.resolve()
+        if not cfg_path.exists():
+            script_dir = Path(__file__).resolve().parents[1]
+            candidate = (script_dir / raw_path).resolve()
+            if candidate.exists():
+                cfg_path = candidate
+        if not cfg_path.exists():
+            return
+        cfg = json.loads(cfg_path.read_text(encoding="utf-8", errors="replace"))
+        env = cfg.get("env", {})
+        if isinstance(env, dict):
+            for key, value in env.items():
+                if key is None:
+                    continue
+                os.environ.setdefault(str(key), str(value))
+    except Exception:
+        return
+
+_apply_env_from_config(sys.argv)
+
 try:
     from .bayesopt_entry_runner import main
 except Exception:  # pragma: no cover
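For orientation, a minimal sketch of a config file that the new `_apply_env_from_config` hook would act on; the specific `env` keys below are illustrative only (any key/value pairs are applied with `os.environ.setdefault`, so variables already set in the shell are not overridden):

# Hypothetical config: only the "env" handling shown above is taken from this diff.
import json
from pathlib import Path

cfg = {
    "env": {
        "BAYESOPT_DEBUG_IMPORTS": "1",   # example: turn on the new import diagnostics
        "OMP_NUM_THREADS": "8",          # example: any generic runtime knob
    },
    "runner": {"mode": "entry"},
}
Path("bayesopt_config.json").write_text(json.dumps(cfg, indent=2), encoding="utf-8")
# Passing --config-json bayesopt_config.json to the CLI entry point now exports the
# "env" block before bayesopt_entry_runner.main is imported.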

ins_pricing/cli/utils/import_resolver.py

@@ -13,6 +13,7 @@ Usage:
 from __future__ import annotations
 
 import importlib
+import os
 import sys
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -70,14 +71,39 @@ class ResolvedImports:
     plot_loss_curve: Optional[Callable] = None
 
 
+def _debug_imports_enabled() -> bool:
+    value = os.environ.get("BAYESOPT_DEBUG_IMPORTS")
+    if value is None:
+        return False
+    return str(value).strip().lower() in {"1", "true", "yes", "y", "on"}
+
+
 def _try_import(module_path: str, attr_name: Optional[str] = None) -> Optional[Any]:
     """Attempt to import a module or attribute, returning None on failure."""
     try:
         module = importlib.import_module(module_path)
         if attr_name:
-
-
-
+            result = getattr(module, attr_name, None)
+        else:
+            result = module
+        if _debug_imports_enabled():
+            origin = getattr(module, "__file__", None)
+            origin = origin or getattr(module, "__path__", None)
+            print(
+                f"[BAYESOPT_DEBUG_IMPORTS] imported {module_path}"
+                f"{'::' + attr_name if attr_name else ''} from {origin}",
+                file=sys.stderr,
+                flush=True,
+            )
+        return result
+    except Exception as exc:
+        if _debug_imports_enabled():
+            print(
+                f"[BAYESOPT_DEBUG_IMPORTS] failed import {module_path}"
+                f"{'::' + attr_name if attr_name else ''}: {exc.__class__.__name__}: {exc}",
+                file=sys.stderr,
+                flush=True,
+            )
         return None
 
 
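A short usage note for the new debug switch (the accepted spellings come directly from `_debug_imports_enabled`; the sample output line is only indicative of the format):

# Enable import tracing for the resolver; truthy spellings: "1", "true", "yes", "y", "on".
import os

os.environ["BAYESOPT_DEBUG_IMPORTS"] = "1"
# Each _try_import call then prints a stderr line shaped like
#   [BAYESOPT_DEBUG_IMPORTS] imported <module_path>::<attr> from <origin>
# and failed imports report the exception class and message instead.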

ins_pricing/cli/utils/notebook_utils.py

@@ -8,9 +8,9 @@ from pathlib import Path
 from typing import Iterable, List, Optional, Sequence, cast
 
 try:
-    from .cli_config import add_config_json_arg  # type: ignore
+    from .cli_config import add_config_json_arg, set_env  # type: ignore
 except Exception:  # pragma: no cover
-    from cli_config import add_config_json_arg  # type: ignore
+    from cli_config import add_config_json_arg, set_env  # type: ignore
 
 
 def _find_ins_pricing_dir(cwd: Optional[Path] = None) -> Path:
@@ -261,6 +261,7 @@ def run_from_config(config_json: str | Path) -> subprocess.CompletedProcess:
     if not config_path.is_absolute():
         config_path = (pkg_dir / config_path).resolve() if (pkg_dir / config_path).exists() else config_path.resolve()
     raw = json.loads(config_path.read_text(encoding="utf-8", errors="replace"))
+    set_env(raw.get("env", {}))
     runner = cast(dict, raw.get("runner") or {})
 
     mode = str(runner.get("mode") or "entry").strip().lower()

ins_pricing/docs/modelling/BayesOpt_USAGE.md

@@ -75,13 +75,13 @@ Under `ins_pricing/modelling/core/bayesopt/`:
 
 1) **Tools and utilities**
 
-- `IOUtils / TrainingUtils / PlotUtils`: I/O, training utilities (batch size,
+- `IOUtils / TrainingUtils / PlotUtils`: I/O, training utilities (batch size, loss functions, free_cuda), plotting helpers
 - `DistributedUtils`: DDP init, rank/world_size helpers
 
 2) **TorchTrainerMixin (common components for torch tabular training)**
 
 - DataLoader: `_build_dataloader()` / `_build_val_dataloader()` (prints batch/accum/workers)
-- Loss: `_compute_losses()` / `_compute_weighted_loss()` (regression
+- Loss: `_compute_losses()` / `_compute_weighted_loss()` (regression supports tweedie/poisson/gamma/mse/mae; classification uses BCEWithLogits)
 - Early stop: `_early_stop_update()`
 
 3) **Sklearn-style model classes (core training objects)**
@@ -292,7 +292,7 @@ FT role is controlled by `ft_role` (from config or CLI `--ft-role`):
 ### 4.1 Supervised models (GLM/XGB/ResNet/FT-as-model)
 
 - `TrainerBase.tune()` calls each trainer's `cross_val()` and minimizes validation metric (default direction `minimize`)
-- Regression
+- Regression loss is configurable (tweedie/poisson/gamma/mse/mae); classification uses logloss
 
 ### 4.2 FT self-supervised (`unsupervised_embedding`)
 
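As a worked example of the configurable regression loss described in the updated usage notes, a hypothetical config fragment (only the `loss_name` values are defined by this release; the surrounding keys follow the `BayesOptConfig` docstring further down in this diff):

# Illustrative only: loss_name accepts auto / tweedie / poisson / gamma / mse / mae for regression.
cfg = {
    "model_nme": "example_f_model",   # hypothetical name; with "auto" the 'f' would imply poisson
    "task_type": "regression",
    "loss_name": "tweedie",           # or "auto" to infer the loss from the model name as before
}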

ins_pricing/modelling/core/bayesopt/config_preprocess.py

@@ -12,6 +12,7 @@ import pandas as pd
 from sklearn.preprocessing import StandardScaler
 
 from .utils import IOUtils
+from .utils.losses import normalize_loss_name
 from ....exceptions import ConfigurationError, DataValidationError
 
 # NOTE: Some CSV exports may contain invisible BOM characters or leading/trailing
@@ -81,6 +82,7 @@ class BayesOptConfig:
         task_type: Either 'regression' or 'classification'
         binary_resp_nme: Column name for binary response (optional)
         cate_list: List of categorical feature column names
+        loss_name: Regression loss ('auto', 'tweedie', 'poisson', 'gamma', 'mse', 'mae')
         prop_test: Proportion of data for validation (0.0-1.0)
         rand_seed: Random seed for reproducibility
         epochs: Number of training epochs
@@ -117,6 +119,7 @@ class BayesOptConfig:
     task_type: str = 'regression'
     binary_resp_nme: Optional[str] = None
     cate_list: Optional[List[str]] = None
+    loss_name: str = "auto"
 
     # Training configuration
     prop_test: float = 0.25
@@ -207,6 +210,15 @@
             errors.append(
                 f"task_type must be one of {valid_task_types}, got '{self.task_type}'"
             )
+        # Validate loss_name
+        try:
+            normalized_loss = normalize_loss_name(self.loss_name, self.task_type)
+            if self.task_type == "classification" and normalized_loss not in {"auto", "logloss", "bce"}:
+                errors.append(
+                    "loss_name must be 'auto', 'logloss', or 'bce' for classification tasks."
+                )
+        except ValueError as exc:
+            errors.append(str(exc))
 
         # Validate prop_test
         if not 0.0 < self.prop_test < 1.0:
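The new `utils/losses.py` module itself is not shown in these hunks, so the following is only a sketch of what `normalize_loss_name` might look like, inferred from the validator above (which lower-cases the name, catches `ValueError`, and then applies the classification-only restriction separately):

# Sketch, not the shipped implementation (bayesopt/utils/losses.py is +129 lines, not shown here).
from typing import Optional

_KNOWN_LOSSES = {"auto", "tweedie", "poisson", "gamma", "mse", "mae", "logloss", "bce"}

def normalize_loss_name(loss_name: Optional[str], task_type: str) -> str:
    """Return a canonical lower-case loss name, raising ValueError for unknown values."""
    # task_type is accepted for parity with the real helper; task-specific checks are
    # applied by the caller in this sketch.
    name = (loss_name or "auto").strip().lower()
    if name not in _KNOWN_LOSSES:
        raise ValueError(
            f"loss_name must be one of {sorted(_KNOWN_LOSSES)}, got '{loss_name}'"
        )
    return name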

ins_pricing/modelling/core/bayesopt/core.py

@@ -17,6 +17,12 @@ from .model_plotting_mixin import BayesOptPlottingMixin
 from .models import GraphNeuralNetSklearn
 from .trainers import FTTrainer, GLMTrainer, GNNTrainer, ResNetTrainer, XGBTrainer
 from .utils import EPS, infer_factor_and_cate_list, set_global_seed
+from .utils.losses import (
+    infer_loss_name_from_model_name,
+    normalize_loss_name,
+    resolve_tweedie_power,
+    resolve_xgb_objective,
+)
 
 
 class _CVSplitter:
@@ -293,6 +299,14 @@ class BayesOptModel(BayesOptPlottingMixin, BayesOptExplainMixin):
         self.config = cfg
         self.model_nme = cfg.model_nme
         self.task_type = cfg.task_type
+        normalized_loss = normalize_loss_name(getattr(cfg, "loss_name", None), self.task_type)
+        if self.task_type == "classification":
+            self.loss_name = "logloss" if normalized_loss == "auto" else normalized_loss
+        else:
+            if normalized_loss == "auto":
+                self.loss_name = infer_loss_name_from_model_name(self.model_nme)
+            else:
+                self.loss_name = normalized_loss
         self.resp_nme = cfg.resp_nme
         self.weight_nme = cfg.weight_nme
         self.factor_nmes = cfg.factor_nmes
@@ -339,14 +353,7 @@ class BayesOptModel(BayesOptPlottingMixin, BayesOptExplainMixin):
         if self.task_type == 'classification':
             self.obj = 'binary:logistic'
         else:  # regression task
-
-                self.obj = 'count:poisson'
-            elif 's' in self.model_nme:
-                self.obj = 'reg:gamma'
-            elif 'bc' in self.model_nme:
-                self.obj = 'reg:tweedie'
-            else:
-                self.obj = 'reg:tweedie'
+            self.obj = resolve_xgb_objective(self.loss_name)
         self.fit_params = {
             'sample_weight': self.train_data[self.weight_nme].values
         }
@@ -426,6 +433,11 @@ class BayesOptModel(BayesOptPlottingMixin, BayesOptExplainMixin):
     def default_tweedie_power(self, obj: Optional[str] = None) -> Optional[float]:
         if self.task_type == 'classification':
             return None
+        loss_name = getattr(self, "loss_name", None)
+        if loss_name:
+            resolved = resolve_tweedie_power(str(loss_name), default=1.5)
+            if resolved is not None:
+                return resolved
         objective = obj or getattr(self, "obj", None)
         if objective == 'count:poisson':
             return 1.0
@@ -503,6 +515,7 @@ class BayesOptModel(BayesOptPlottingMixin, BayesOptExplainMixin):
             patience=5,
             task_type=self.task_type,
             tweedie_power=tw_power,
+            loss_name=self.loss_name,
             use_data_parallel=False,
             use_ddp=False,
             use_approx_knn=self.config.gnn_use_approx_knn,
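Here the removed name-based branching ('f' → poisson, 's' → gamma, otherwise tweedie) is folded into the new helpers. Their bodies are not visible in this diff, so the following is a reconstruction of the implied mappings, with signatures taken from the call sites and everything else assumed:

# Sketch of the loss helpers as implied by the removed code and the call sites above.
from typing import Optional

def infer_loss_name_from_model_name(model_nme: str) -> str:
    # Mirrors the old heuristic: frequency-style names ('f') -> poisson,
    # severity-style names ('s') -> gamma, everything else -> tweedie.
    if 'f' in model_nme:
        return "poisson"
    if 's' in model_nme:
        return "gamma"
    return "tweedie"

def resolve_xgb_objective(loss_name: str) -> str:
    # The first three targets appear in the removed XGBoost branch; mse/mae are assumptions.
    return {
        "poisson": "count:poisson",
        "gamma": "reg:gamma",
        "tweedie": "reg:tweedie",
        "mse": "reg:squarederror",
        "mae": "reg:absoluteerror",
    }.get(loss_name, "reg:tweedie")

def resolve_tweedie_power(loss_name: str, default: float = 1.5) -> Optional[float]:
    # Poisson and gamma sit at the Tweedie endpoints; mse/mae carry no power (None).
    return {"poisson": 1.0, "gamma": 2.0, "tweedie": default}.get(loss_name)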

ins_pricing/modelling/core/bayesopt/models/model_ft_components.py

@@ -106,31 +106,58 @@ class ScaledTransformerEncoderLayer(nn.Module):
         self.res_scale_attn = residual_scale_attn
         self.res_scale_ffn = residual_scale_ffn
 
-    def forward(self, src, src_mask=None, src_key_padding_mask=None):
+    def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal: Optional[bool] = None, **_kwargs):
         # Input tensor shape: (batch, seq_len, d_model).
         x = src
 
         if self.norm_first:
             # Pre-norm before attention.
-            x = x + self._sa_block(
-
+            x = x + self._sa_block(
+                self.norm1(x),
+                src_mask,
+                src_key_padding_mask,
+                is_causal=is_causal,
+            )
             x = x + self._ff_block(self.norm2(x))
         else:
             # Post-norm (usually disabled).
             x = self.norm1(
-                x + self._sa_block(
+                x + self._sa_block(
+                    x,
+                    src_mask,
+                    src_key_padding_mask,
+                    is_causal=is_causal,
+                )
+            )
             x = self.norm2(x + self._ff_block(x))
 
         return x
 
-    def _sa_block(self, x, attn_mask, key_padding_mask):
+    def _sa_block(self, x, attn_mask, key_padding_mask, *, is_causal: Optional[bool] = None):
         # Self-attention with residual scaling.
-
-
-
-
-
-
+        if is_causal is None:
+            attn_out, _ = self.self_attn(
+                x, x, x,
+                attn_mask=attn_mask,
+                key_padding_mask=key_padding_mask,
+                need_weights=False,
+            )
+        else:
+            try:
+                attn_out, _ = self.self_attn(
+                    x, x, x,
+                    attn_mask=attn_mask,
+                    key_padding_mask=key_padding_mask,
+                    need_weights=False,
+                    is_causal=is_causal,
+                )
+            except TypeError:
+                attn_out, _ = self.self_attn(
+                    x, x, x,
+                    attn_mask=attn_mask,
+                    key_padding_mask=key_padding_mask,
+                    need_weights=False,
+                )
         return self.res_scale_attn * self.dropout1(attn_out)
 
     def _ff_block(self, x):
@@ -313,4 +340,3 @@ class MaskedTabularDataset(Dataset):
             None if self.X_cat_true is None else self.X_cat_true[idx],
             None if self.cat_mask is None else self.cat_mask[idx],
         )
-

ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py

@@ -16,6 +16,11 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 
 from ..utils import DistributedUtils, EPS, TorchTrainerMixin
+from ..utils.losses import (
+    infer_loss_name_from_model_name,
+    normalize_loss_name,
+    resolve_tweedie_power,
+)
 from .model_ft_components import FTTransformerCore, MaskedTabularDataset, TabularDataset
 
 
@@ -159,7 +164,8 @@ class FTTransformerSklearn(TorchTrainerMixin, nn.Module):
         weight_decay: float = 0.0,
         use_data_parallel: bool = True,
         use_ddp: bool = False,
-        num_numeric_tokens: Optional[int] = None
+        num_numeric_tokens: Optional[int] = None,
+        loss_name: Optional[str] = None
     ):
         super().__init__()
 
@@ -187,14 +193,18 @@ class FTTransformerSklearn(TorchTrainerMixin, nn.Module):
         self.weight_decay = weight_decay
         self.task_type = task_type
         self.patience = patience
+        resolved_loss = normalize_loss_name(loss_name, self.task_type)
         if self.task_type == 'classification':
+            self.loss_name = "logloss"
             self.tw_power = None  # No Tweedie power for classification.
-        elif 'f' in self.model_nme:
-            self.tw_power = 1.0
-        elif 's' in self.model_nme:
-            self.tw_power = 2.0
         else:
-
+            if resolved_loss == "auto":
+                resolved_loss = infer_loss_name_from_model_name(self.model_nme)
+            self.loss_name = resolved_loss
+            if self.loss_name == "tweedie":
+                self.tw_power = float(tweedie_power) if tweedie_power is not None else 1.5
+            else:
+                self.tw_power = resolve_tweedie_power(self.loss_name, default=1.5)
 
         if self.is_ddp_enabled:
             self.device = torch.device(f"cuda:{self.local_rank}")

ins_pricing/modelling/core/bayesopt/models/model_gnn.py

@@ -17,6 +17,11 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 
 from ..utils import DistributedUtils, EPS, IOUtils, TorchTrainerMixin
+from ..utils.losses import (
+    infer_loss_name_from_model_name,
+    normalize_loss_name,
+    resolve_tweedie_power,
+)
 
 try:
     from torch_geometric.nn import knn_graph
@@ -109,7 +114,8 @@ class GraphNeuralNetSklearn(TorchTrainerMixin, nn.Module):
            max_gpu_knn_nodes: Optional[int] = None,
            knn_gpu_mem_ratio: float = 0.9,
            knn_gpu_mem_overhead: float = 2.0,
-           knn_cpu_jobs: Optional[int] = -1
+           knn_cpu_jobs: Optional[int] = -1,
+           loss_name: Optional[str] = None) -> None:
         super().__init__()
         self.model_nme = model_nme
         self.input_dim = input_dim
@@ -139,14 +145,18 @@ class GraphNeuralNetSklearn(TorchTrainerMixin, nn.Module):
         self._adj_cache_key: Optional[Tuple[Any, ...]] = None
         self._adj_cache_tensor: Optional[torch.Tensor] = None
 
+        resolved_loss = normalize_loss_name(loss_name, self.task_type)
         if self.task_type == 'classification':
+            self.loss_name = "logloss"
             self.tw_power = None
-        elif 'f' in self.model_nme:
-            self.tw_power = 1.0
-        elif 's' in self.model_nme:
-            self.tw_power = 2.0
         else:
-
+            if resolved_loss == "auto":
+                resolved_loss = infer_loss_name_from_model_name(self.model_nme)
+            self.loss_name = resolved_loss
+            if self.loss_name == "tweedie":
+                self.tw_power = float(tweedie_power) if tweedie_power is not None else 1.5
+            else:
+                self.tw_power = resolve_tweedie_power(self.loss_name, default=1.5)
 
         self.ddp_enabled = False
         self.local_rank = int(os.environ.get("LOCAL_RANK", 0))

ins_pricing/modelling/core/bayesopt/models/model_resn.py

@@ -12,6 +12,11 @@ from torch.nn.utils import clip_grad_norm_
 from torch.utils.data import TensorDataset
 
 from ..utils import DistributedUtils, EPS, TorchTrainerMixin
+from ..utils.losses import (
+    infer_loss_name_from_model_name,
+    normalize_loss_name,
+    resolve_tweedie_power,
+)
 
 
 # =============================================================================
@@ -140,7 +145,8 @@ class ResNetSklearn(TorchTrainerMixin, nn.Module):
            stochastic_depth: float = 0.0,
            weight_decay: float = 1e-4,
            use_data_parallel: bool = True,
-           use_ddp: bool = False
+           use_ddp: bool = False,
+           loss_name: Optional[str] = None):
         super(ResNetSklearn, self).__init__()
 
         self.use_ddp = use_ddp
@@ -179,15 +185,18 @@ class ResNetSklearn(TorchTrainerMixin, nn.Module):
         else:
             self.device = torch.device('cpu')
 
-
+        resolved_loss = normalize_loss_name(loss_name, self.task_type)
         if self.task_type == 'classification':
+            self.loss_name = "logloss"
             self.tw_power = None
-        elif 'f' in self.model_nme:
-            self.tw_power = 1
-        elif 's' in self.model_nme:
-            self.tw_power = 2
         else:
-
+            if resolved_loss == "auto":
+                resolved_loss = infer_loss_name_from_model_name(self.model_nme)
+            self.loss_name = resolved_loss
+            if self.loss_name == "tweedie":
+                self.tw_power = float(tweedie_power) if tweedie_power is not None else 1.5
+            else:
+                self.tw_power = resolve_tweedie_power(self.loss_name, default=1.5)
 
         # Build network (construct on CPU first)
         core = ResNetSequential(

ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py

@@ -578,6 +578,7 @@ class TrainerBase:
             "n_layers": getattr(self.model, "n_layers", 4),
             "dropout": getattr(self.model, "dropout", 0.1),
             "task_type": getattr(self.model, "task_type", "regression"),
+            "loss_name": getattr(self.model, "loss_name", None),
             "tw_power": getattr(self.model, "tw_power", 1.5),
             "num_geo": getattr(self.model, "num_geo", 0),
             "num_numeric_tokens": getattr(self.model, "num_numeric_tokens", None),
@@ -638,6 +639,7 @@
             n_layers=model_config.get("n_layers", 4),
             dropout=model_config.get("dropout", 0.1),
             task_type=model_config.get("task_type", "regression"),
+            loss_name=model_config.get("loss_name", None),
             tweedie_power=model_config.get("tw_power", 1.5),
             num_numeric_tokens=model_config.get("num_numeric_tokens"),
             use_data_parallel=False,

ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py

@@ -5,11 +5,12 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 import optuna
 import pandas as pd
-from sklearn.metrics import log_loss
+from sklearn.metrics import log_loss
 from sklearn.model_selection import GroupKFold, TimeSeriesSplit
 
 from .trainer_base import TrainerBase
 from ..models import FTTransformerSklearn
+from ..utils.losses import regression_loss
 
 class FTTrainer(TrainerBase):
     def __init__(self, context: "BayesOptModel") -> None:
@@ -67,6 +68,7 @@ class FTTrainer(TrainerBase):
 
     def cross_val_unsupervised(self, trial: Optional[optuna.trial.Trial]) -> float:
         """Optuna objective A: minimize validation loss for masked reconstruction."""
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
         param_space: Dict[str, Callable[[optuna.trial.Trial], Any]] = {
             "learning_rate": lambda t: t.suggest_float('learning_rate', 1e-5, 5e-3, log=True),
             "d_model": lambda t: t.suggest_int('d_model', 16, 128, step=16),
@@ -159,6 +161,7 @@ class FTTrainer(TrainerBase):
             use_data_parallel=self.ctx.config.use_ft_data_parallel,
             use_ddp=self.ctx.config.use_ft_ddp,
             num_numeric_tokens=num_numeric_tokens,
+            loss_name=loss_name,
         )
         model.set_params(model_params)
         try:
@@ -191,7 +194,8 @@ class FTTrainer(TrainerBase):
             "dropout": lambda t: t.suggest_float('dropout', 0.0, 0.2),
             "weight_decay": lambda t: t.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
         }
-
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
+        if self.ctx.task_type == 'regression' and loss_name == 'tweedie':
             param_space["tw_power"] = lambda t: t.suggest_float(
                 'tw_power', 1.0, 2.0)
         geo_enabled = bool(
@@ -231,10 +235,12 @@ class FTTrainer(TrainerBase):
         tw_power = params.get("tw_power")
         if self.ctx.task_type == 'regression':
             base_tw = self.ctx.default_tweedie_power()
-            if
-                tw_power = base_tw
-            elif
+            if loss_name == "tweedie":
+                tw_power = base_tw if tw_power is None else tw_power
+            elif loss_name in ("poisson", "gamma"):
                 tw_power = base_tw
+            else:
+                tw_power = None
         metric_ctx["tw_power"] = tw_power
 
         adaptive_heads, _ = self._resolve_adaptive_heads(
@@ -259,6 +265,7 @@ class FTTrainer(TrainerBase):
             use_data_parallel=self.ctx.config.use_ft_data_parallel,
             use_ddp=self.ctx.config.use_ft_ddp,
             num_numeric_tokens=num_numeric_tokens,
+            loss_name=loss_name,
         ).set_params({"_geo_params": geo_params_local} if geo_enabled else {})
 
         def fit_predict(model, X_train, y_train, w_train, X_val, y_val, w_val, trial_obj):
@@ -286,11 +293,12 @@ class FTTrainer(TrainerBase):
 
         def metric_fn(y_true, y_pred, weight):
             if self.ctx.task_type == 'regression':
-                return
+                return regression_loss(
                     y_true,
                     y_pred,
-
-
+                    weight,
+                    loss_name=loss_name,
+                    tweedie_power=metric_ctx.get("tw_power", 1.5),
                 )
             return log_loss(y_true, y_pred, sample_weight=weight)
 
@@ -313,6 +321,7 @@ class FTTrainer(TrainerBase):
     def train(self) -> None:
         if not self.best_params:
             raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
         resolved_params = dict(self.best_params)
         d_model_value = resolved_params.get("d_model", 64)
         adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
@@ -342,6 +351,7 @@ class FTTrainer(TrainerBase):
             use_ddp=self.ctx.config.use_ft_ddp,
             num_numeric_tokens=self._resolve_numeric_tokens(),
             weight_decay=float(resolved_params.get("weight_decay", 0.0)),
+            loss_name=loss_name,
         )
         tmp_model.set_params(resolved_params)
         geo_train_full = self.ctx.train_geo_tokens
@@ -375,6 +385,7 @@ class FTTrainer(TrainerBase):
             use_ddp=self.ctx.config.use_ft_ddp,
             num_numeric_tokens=self._resolve_numeric_tokens(),
             weight_decay=float(resolved_params.get("weight_decay", 0.0)),
+            loss_name=loss_name,
         )
         if refit_epochs is not None:
             self.model.epochs = int(refit_epochs)
@@ -408,6 +419,7 @@ class FTTrainer(TrainerBase):
     def ensemble_predict(self, k: int) -> None:
         if not self.best_params:
             raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
         k = max(2, int(k))
         X_all = self.ctx.train_data[self.ctx.factor_nmes]
         y_all = self.ctx.train_data[self.ctx.resp_nme]
@@ -446,6 +458,7 @@ class FTTrainer(TrainerBase):
             use_ddp=self.ctx.config.use_ft_ddp,
             num_numeric_tokens=self._resolve_numeric_tokens(),
             weight_decay=float(resolved_params.get("weight_decay", 0.0)),
+            loss_name=loss_name,
         )
         model.set_params(resolved_params)
 
@@ -541,6 +554,7 @@ class FTTrainer(TrainerBase):
         return splitter, None, oof_folds
 
     def _build_ft_feature_model(self, resolved_params: Dict[str, Any]) -> FTTransformerSklearn:
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
         model = FTTransformerSklearn(
             model_nme=self.ctx.model_nme,
             num_cols=self.ctx.num_features,
@@ -549,6 +563,7 @@ class FTTrainer(TrainerBase):
             use_data_parallel=self.ctx.config.use_ft_data_parallel,
             use_ddp=self.ctx.config.use_ft_ddp,
             num_numeric_tokens=self._resolve_numeric_tokens(),
+            loss_name=loss_name,
         )
         adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
             d_model=resolved_params.get("d_model", model.d_model),
@@ -702,6 +717,7 @@ class FTTrainer(TrainerBase):
             num_loss_weight: float = 1.0,
             cat_loss_weight: float = 1.0) -> None:
         """Self-supervised pretraining (masked reconstruction) and cache embeddings."""
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
         self.model = FTTransformerSklearn(
             model_nme=self.ctx.model_nme,
             num_cols=self.ctx.num_features,
@@ -710,6 +726,7 @@ class FTTrainer(TrainerBase):
             use_data_parallel=self.ctx.config.use_ft_data_parallel,
             use_ddp=self.ctx.config.use_ft_ddp,
             num_numeric_tokens=self._resolve_numeric_tokens(),
+            loss_name=loss_name,
        )
         resolved_params = dict(params or {})
         # Reuse supervised tuning structure params unless explicitly overridden.
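The fold metric in trainer_ft.py now routes through `regression_loss(y_true, y_pred, weight, loss_name=..., tweedie_power=...)`. Its body is not part of the visible hunks, so here is only a rough, scikit-learn-based sketch of a compatible implementation (assumed, not the shipped code):

# Sketch only: a regression_loss compatible with the call signature used in trainer_ft.py.
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance

def regression_loss(y_true, y_pred, weight=None, *, loss_name="tweedie", tweedie_power=1.5):
    if loss_name in {"tweedie", "poisson", "gamma"}:
        power = {"poisson": 1.0, "gamma": 2.0}.get(loss_name, tweedie_power or 1.5)
        # Deviance-based losses require strictly positive predictions.
        y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-10, None)
        return mean_tweedie_deviance(y_true, y_pred, sample_weight=weight, power=power)
    if loss_name == "mse":
        return mean_squared_error(y_true, y_pred, sample_weight=weight)
    if loss_name == "mae":
        return mean_absolute_error(y_true, y_pred, sample_weight=weight)
    raise ValueError(f"Unsupported regression loss: {loss_name}")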
|