ins-pricing 0.4.5-py3-none-any.whl → 0.5.1-py3-none-any.whl
- ins_pricing/README.md +48 -22
- ins_pricing/__init__.py +142 -90
- ins_pricing/cli/BayesOpt_entry.py +58 -46
- ins_pricing/cli/BayesOpt_incremental.py +77 -110
- ins_pricing/cli/Explain_Run.py +42 -23
- ins_pricing/cli/Explain_entry.py +551 -577
- ins_pricing/cli/Pricing_Run.py +42 -23
- ins_pricing/cli/bayesopt_entry_runner.py +51 -16
- ins_pricing/cli/utils/bootstrap.py +23 -0
- ins_pricing/cli/utils/cli_common.py +256 -256
- ins_pricing/cli/utils/cli_config.py +379 -360
- ins_pricing/cli/utils/import_resolver.py +375 -358
- ins_pricing/cli/utils/notebook_utils.py +256 -242
- ins_pricing/cli/watchdog_run.py +216 -198
- ins_pricing/frontend/__init__.py +10 -10
- ins_pricing/frontend/app.py +132 -61
- ins_pricing/frontend/config_builder.py +33 -0
- ins_pricing/frontend/example_config.json +11 -0
- ins_pricing/frontend/example_workflows.py +1 -1
- ins_pricing/frontend/runner.py +340 -388
- ins_pricing/governance/__init__.py +20 -20
- ins_pricing/governance/release.py +159 -159
- ins_pricing/modelling/README.md +1 -1
- ins_pricing/modelling/__init__.py +147 -92
- ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
- ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
- ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
- ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
- ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
- ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
- ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
- ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
- ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
- ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
- ins_pricing/modelling/explain/__init__.py +55 -55
- ins_pricing/modelling/explain/metrics.py +27 -174
- ins_pricing/modelling/explain/permutation.py +237 -237
- ins_pricing/modelling/plotting/__init__.py +40 -36
- ins_pricing/modelling/plotting/compat.py +228 -0
- ins_pricing/modelling/plotting/curves.py +572 -572
- ins_pricing/modelling/plotting/diagnostics.py +163 -163
- ins_pricing/modelling/plotting/geo.py +362 -362
- ins_pricing/modelling/plotting/importance.py +121 -121
- ins_pricing/pricing/__init__.py +27 -27
- ins_pricing/pricing/factors.py +67 -56
- ins_pricing/production/__init__.py +35 -25
- ins_pricing/production/{predict.py → inference.py} +140 -57
- ins_pricing/production/monitoring.py +8 -21
- ins_pricing/reporting/__init__.py +11 -11
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/production/test_inference.py +90 -0
- ins_pricing/utils/__init__.py +112 -78
- ins_pricing/utils/device.py +258 -237
- ins_pricing/utils/features.py +53 -0
- ins_pricing/utils/io.py +72 -0
- ins_pricing/utils/logging.py +34 -1
- ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
- ins_pricing/utils/metrics.py +158 -24
- ins_pricing/utils/numerics.py +76 -0
- ins_pricing/utils/paths.py +9 -1
- ins_pricing/utils/profiling.py +8 -4
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
- ins_pricing-0.5.1.dist-info/RECORD +132 -0
- ins_pricing/modelling/core/BayesOpt.py +0 -146
- ins_pricing/modelling/core/__init__.py +0 -1
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
- ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
- ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
- ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
- ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
- ins_pricing/modelling/core/bayesopt/utils.py +0 -105
- ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
- ins_pricing/tests/production/test_predict.py +0 -233
- ins_pricing-0.4.5.dist-info/RECORD +0 -130
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
ins_pricing/utils/logging.py
CHANGED
@@ -18,7 +18,7 @@ from __future__ import annotations
 import logging
 import os
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, Union


 @lru_cache(maxsize=1)
@@ -72,3 +72,36 @@ def configure_logging(
     formatter = logging.Formatter(format_string)
     for handler in logger.handlers:
         handler.setFormatter(formatter)
+
+
+def log_print(
+    logger: logging.Logger,
+    *args,
+    level: Optional[Union[int, str]] = None,
+    **kwargs,
+) -> None:
+    """Print-like helper that routes messages to a logger.
+
+    This preserves basic print semantics (sep/end) while ignoring file/flush,
+    and it auto-detects severity when level is not provided.
+    """
+    sep = kwargs.get("sep", " ")
+    msg = sep.join(str(arg) for arg in args)
+    if not msg:
+        return
+
+    if level is None:
+        lowered = msg.lstrip().lower()
+        if lowered.startswith(("warn", "[warn]", "warning")):
+            level_value = logging.WARNING
+        elif lowered.startswith(("error", "[error]", "err")):
+            level_value = logging.ERROR
+        else:
+            level_value = logging.INFO
+    else:
+        if isinstance(level, str):
+            level_value = getattr(logging, level.upper(), logging.INFO)
+        else:
+            level_value = int(level)
+
+    logger.log(level_value, msg)
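The new `log_print` is a drop-in bridge for code migrating from bare `print` calls to the logging module. A minimal usage sketch (logger name and messages are illustrative, not from the package):

import logging

from ins_pricing.utils.logging import log_print

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")  # arbitrary name for this sketch

# With level omitted, severity is inferred from the message prefix.
log_print(logger, "[WARN] falling back to CPU")   # routed at WARNING
log_print(logger, "error: config file missing")   # routed at ERROR
log_print(logger, "fit complete", "in 12s")       # routed at INFO, joined by sep=" "

# An explicit level (int or name) overrides the auto-detection.
log_print(logger, "verbose detail", level="DEBUG")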
ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py
CHANGED

@@ -1,129 +1,125 @@
-"""Loss selection and regression loss utilities."""
-
-from __future__ import annotations
-
-from typing import Optional
-
-import numpy as np
-
-from …
[old lines 10–125 are not legible in this diff view]
-    total = float(np.sum(weight))
-    if total <= 0:
-        return float(np.mean(values))
-    return float(np.sum(values * weight) / total)
+"""Loss selection and regression loss utilities."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import numpy as np
+
+from ins_pricing.utils.metrics import gamma_deviance, poisson_deviance, tweedie_deviance
+
+LOSS_ALIASES = {
+    "poisson_deviance": "poisson",
+    "gamma_deviance": "gamma",
+    "tweedie_deviance": "tweedie",
+    "l2": "mse",
+    "l1": "mae",
+    "absolute": "mae",
+    "gaussian": "mse",
+    "normal": "mse",
+}
+
+REGRESSION_LOSSES = {"tweedie", "poisson", "gamma", "mse", "mae"}
+CLASSIFICATION_LOSSES = {"logloss", "bce"}
+
+
+def normalize_loss_name(loss_name: Optional[str], task_type: str) -> str:
+    """Normalize the loss name and validate against supported values."""
+    name = str(loss_name or "auto").strip().lower()
+    if not name or name == "auto":
+        return "auto"
+    name = LOSS_ALIASES.get(name, name)
+    if task_type == "classification":
+        if name not in CLASSIFICATION_LOSSES:
+            raise ValueError(
+                f"Unsupported classification loss '{loss_name}'. "
+                f"Supported: {sorted(CLASSIFICATION_LOSSES)}"
+            )
+    else:
+        if name not in REGRESSION_LOSSES:
+            raise ValueError(
+                f"Unsupported regression loss '{loss_name}'. "
+                f"Supported: {sorted(REGRESSION_LOSSES)}"
+            )
+    return name
+
+
+def infer_loss_name_from_model_name(model_name: str) -> str:
+    """Preserve legacy heuristic for loss selection based on model name."""
+    name = str(model_name or "")
+    if "f" in name:
+        return "poisson"
+    if "s" in name:
+        return "gamma"
+    return "tweedie"
+
+
+def resolve_tweedie_power(loss_name: str, default: float = 1.5) -> Optional[float]:
+    """Resolve Tweedie power based on loss name."""
+    if loss_name == "poisson":
+        return 1.0
+    if loss_name == "gamma":
+        return 2.0
+    if loss_name == "tweedie":
+        return float(default)
+    return None
+
+
+def resolve_xgb_objective(loss_name: str) -> str:
+    """Map regression loss name to XGBoost objective."""
+    name = loss_name if loss_name != "auto" else "tweedie"
+    mapping = {
+        "tweedie": "reg:tweedie",
+        "poisson": "count:poisson",
+        "gamma": "reg:gamma",
+        "mse": "reg:squarederror",
+        "mae": "reg:absoluteerror",
+    }
+    return mapping.get(name, "reg:tweedie")
+
+
+def regression_loss(
+    y_true,
+    y_pred,
+    sample_weight=None,
+    *,
+    loss_name: str,
+    tweedie_power: Optional[float] = 1.5,
+    eps: float = 1e-8,
+) -> float:
+    """Compute weighted regression loss based on configured loss name."""
+    name = normalize_loss_name(loss_name, task_type="regression")
+    if name == "auto":
+        name = "tweedie"
+
+    y_t = np.asarray(y_true, dtype=float).reshape(-1)
+    y_p = np.asarray(y_pred, dtype=float).reshape(-1)
+    w = None if sample_weight is None else np.asarray(sample_weight, dtype=float).reshape(-1)
+
+    if name == "mse":
+        err = (y_t - y_p) ** 2
+        return _weighted_mean(err, w)
+    if name == "mae":
+        err = np.abs(y_t - y_p)
+        return _weighted_mean(err, w)
+    if name == "poisson":
+        return poisson_deviance(y_t, y_p, sample_weight=w, eps=eps)
+    if name == "gamma":
+        return gamma_deviance(y_t, y_p, sample_weight=w, eps=eps)
+
+    power = 1.5 if tweedie_power is None else float(tweedie_power)
+    return tweedie_deviance(y_t, y_p, sample_weight=w, power=power, eps=eps)
+
+
+def loss_requires_positive(loss_name: str) -> bool:
+    """Return True if the loss requires positive predictions."""
+    return loss_name in {"tweedie", "poisson", "gamma"}
+
+
+def _weighted_mean(values: np.ndarray, weight: Optional[np.ndarray]) -> float:
+    if weight is None:
+        return float(np.mean(values))
+    total = float(np.sum(weight))
+    if total <= 0:
+        return float(np.mean(values))
+    return float(np.sum(values * weight) / total)
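The relocated module now funnels alias resolution, validation, and evaluation through one path. A usage sketch of how the pieces compose (arrays are illustrative):

import numpy as np

from ins_pricing.utils.losses import (
    normalize_loss_name,
    regression_loss,
    resolve_tweedie_power,
    resolve_xgb_objective,
)

name = normalize_loss_name("poisson_deviance", task_type="regression")
print(name)                         # "poisson", via LOSS_ALIASES
print(resolve_tweedie_power(name))  # 1.0
print(resolve_xgb_objective(name))  # "count:poisson"

y_true = np.array([0.0, 1.0, 3.0])
y_pred = np.array([0.5, 1.2, 2.5])
w = np.array([1.0, 2.0, 1.0])

# Weighted Poisson deviance, delegated to ins_pricing.utils.metrics.
print(regression_loss(y_true, y_pred, sample_weight=w, loss_name="poisson"))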
ins_pricing/utils/metrics.py
CHANGED
@@ -3,7 +3,7 @@
 This module consolidates metric computation used across:
 - pricing/monitoring.py: PSI for feature drift
 - production/drift.py: PSI wrapper for production monitoring
-- modelling/…
+- modelling/bayesopt/: Model evaluation metrics
 
 Example:
     >>> from ins_pricing.utils import psi_report, MetricFactory
@@ -16,23 +16,15 @@ Example:
 
 from __future__ import annotations
 
-from typing import …
+from typing import Iterable, List, Optional
 
 import numpy as np
 import pandas as pd
 
 try:
-    from sklearn.metrics import (
-        log_loss,
-        mean_absolute_error,
-        mean_squared_error,
-        mean_tweedie_deviance,
-    )
-except ImportError:
-    log_loss = None
-    mean_absolute_error = None
-    mean_squared_error = None
-    mean_tweedie_deviance = None
+    from sklearn.metrics import roc_auc_score
+except Exception:  # pragma: no cover - optional dependency
+    roc_auc_score = None
 
 
 # =============================================================================
@@ -190,6 +182,152 @@ def psi_report(
 # =============================================================================
 
 
+def _to_numpy(arr) -> np.ndarray:
+    out = np.asarray(arr, dtype=float)
+    return out.reshape(-1)
+
+
+def _align(y_true, y_pred, sample_weight=None):
+    y_t = _to_numpy(y_true)
+    y_p = _to_numpy(y_pred)
+    if y_t.shape[0] != y_p.shape[0]:
+        raise ValueError("y_true and y_pred must have the same length.")
+    if sample_weight is None:
+        return y_t, y_p, None
+    w = _to_numpy(sample_weight)
+    if w.shape[0] != y_t.shape[0]:
+        raise ValueError("sample_weight must have the same length as y_true.")
+    return y_t, y_p, w
+
+
+def _weighted_mean(values: np.ndarray, weight: Optional[np.ndarray]) -> float:
+    if weight is None:
+        return float(np.mean(values))
+    total = float(np.sum(weight))
+    if total <= 0:
+        return float(np.mean(values))
+    return float(np.sum(values * weight) / total)
+
+
+def rmse(y_true, y_pred, sample_weight=None) -> float:
+    y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+    err = (y_t - y_p) ** 2
+    return float(np.sqrt(_weighted_mean(err, w)))
+
+
+def mae(y_true, y_pred, sample_weight=None) -> float:
+    y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+    err = np.abs(y_t - y_p)
+    return _weighted_mean(err, w)
+
+
+def mape(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+    y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+    denom = np.maximum(np.abs(y_t), eps)
+    err = np.abs((y_t - y_p) / denom)
+    return _weighted_mean(err, w)
+
+
+def r2_score(y_true, y_pred, sample_weight=None) -> float:
+    y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+    if w is None:
+        y_mean = float(np.mean(y_t))
+        sse = float(np.sum((y_t - y_p) ** 2))
+        sst = float(np.sum((y_t - y_mean) ** 2))
+    else:
+        w_sum = float(np.sum(w))
+        y_mean = float(np.sum(w * y_t) / w_sum) if w_sum > 0 else float(np.mean(y_t))
+        sse = float(np.sum(w * (y_t - y_p) ** 2))
+        sst = float(np.sum(w * (y_t - y_mean) ** 2))
+    if sst <= 0:
+        return 0.0
+    return 1.0 - sse / sst
+
+
+def logloss(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+    y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+    p = np.clip(y_p, eps, 1 - eps)
+    loss = -(y_t * np.log(p) + (1 - y_t) * np.log(1 - p))
+    return _weighted_mean(loss, w)
+
+
+def tweedie_deviance(
+    y_true,
+    y_pred,
+    sample_weight=None,
+    *,
+    power: float = 1.5,
+    eps: float = 1e-8,
+) -> float:
+    if power < 0:
+        raise ValueError("power must be >= 0.")
+    y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+    y_p = np.clip(y_p, eps, None)
+    y_t_safe = np.clip(y_t, eps, None)
+
+    if power == 0:
+        dev = (y_t - y_p) ** 2
+    elif power == 1:
+        dev = 2 * (y_t_safe * np.log(y_t_safe / y_p) - (y_t_safe - y_p))
+    elif power == 2:
+        ratio = y_t_safe / y_p
+        dev = 2 * ((ratio - 1) - np.log(ratio))
+    else:
+        term1 = np.power(y_t_safe, 2 - power) / ((1 - power) * (2 - power))
+        term2 = y_t_safe * np.power(y_p, 1 - power) / (1 - power)
+        term3 = np.power(y_p, 2 - power) / (2 - power)
+        dev = 2 * (term1 - term2 + term3)
+    return _weighted_mean(dev, w)
+
+
+def poisson_deviance(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+    return tweedie_deviance(
+        y_true, y_pred, sample_weight=sample_weight, power=1.0, eps=eps
+    )
+
+
+def gamma_deviance(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+    return tweedie_deviance(
+        y_true, y_pred, sample_weight=sample_weight, power=2.0, eps=eps
+    )
+
+
+def auc_score(y_true, y_pred, sample_weight=None) -> float:
+    if roc_auc_score is None:
+        raise RuntimeError("auc requires scikit-learn.")
+    y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+    return float(roc_auc_score(y_t, y_p, sample_weight=w))
+
+
+def resolve_metric(metric, *, task_type: Optional[str] = None, higher_is_better: Optional[bool] = None):
+    if callable(metric):
+        if higher_is_better is None:
+            raise ValueError("higher_is_better must be provided for custom metric.")
+        return metric, bool(higher_is_better), getattr(metric, "__name__", "custom")
+
+    name = str(metric or "auto").lower()
+    if name == "auto":
+        name = "logloss" if task_type == "classification" else "rmse"
+
+    mapping = {
+        "rmse": (rmse, False),
+        "mae": (mae, False),
+        "mape": (mape, False),
+        "r2": (r2_score, True),
+        "logloss": (logloss, False),
+        "poisson": (poisson_deviance, False),
+        "gamma": (gamma_deviance, False),
+        "tweedie": (tweedie_deviance, False),
+        "auc": (auc_score, True),
+    }
+    if name not in mapping:
+        raise ValueError(f"Unsupported metric: {metric}")
+    fn, hib = mapping[name]
+    if higher_is_better is not None:
+        hib = bool(higher_is_better)
+    return fn, hib, name
+
+
 class MetricFactory:
     """Factory for computing evaluation metrics consistently across all trainers.
 
@@ -240,25 +378,21 @@ class MetricFactory:
         Returns:
             Computed metric value (lower is better)
         """
-        if log_loss is None or mean_tweedie_deviance is None:
-            raise ImportError("sklearn is required for metric computation")
-
         y_pred = np.asarray(y_pred)
         y_true = np.asarray(y_true)
 
         if self.task_type == "classification":
             y_pred_clipped = np.clip(y_pred, self.clip_min, self.clip_max)
-            return float(…
+            return float(logloss(y_true, y_pred_clipped, sample_weight=sample_weight))
 
         loss_name = str(self.loss_name or "tweedie").strip().lower()
         if loss_name in {"mse", "mae"}:
-            if mean_squared_error is None or mean_absolute_error is None:
-                raise ImportError("sklearn is required for metric computation")
+            y_t, y_p, w = _align(y_true, y_pred, sample_weight)
             if loss_name == "mse":
[4 deleted lines not legible in this diff view]
+                err = (y_t - y_p) ** 2
+                return _weighted_mean(err, w)
+            err = np.abs(y_t - y_p)
+            return _weighted_mean(err, w)
 
         y_pred_safe = np.maximum(y_pred, self.clip_min)
         power = self.tweedie_power
@@ -267,7 +401,7 @@
         elif loss_name == "gamma":
             power = 2.0
         return float(
-            mean_tweedie_deviance(
+            tweedie_deviance(
                 y_true,
                 y_pred_safe,
                 sample_weight=sample_weight,
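For reference, the general branch of `tweedie_deviance` implements d_p(y, mu) = 2 * (y^(2-p) / ((1-p)(2-p)) - y * mu^(1-p) / (1-p) + mu^(2-p) / (2-p)), with p = 1 and p = 2 handled as the Poisson and Gamma limit forms. A usage sketch of the new `resolve_metric` dispatcher (data is illustrative):

import numpy as np

from ins_pricing.utils.metrics import resolve_metric, tweedie_deviance

# "auto" resolves to rmse for regression tasks, logloss for classification.
fn, higher_is_better, name = resolve_metric("auto", task_type="regression")
print(name, higher_is_better)   # rmse False

y_true = np.array([1.0, 2.0, 4.0])
y_pred = np.array([1.5, 2.0, 3.0])
print(fn(y_true, y_pred))       # RMSE over the toy arrays

# Deviance with an explicit Tweedie power (keyword-only argument).
print(tweedie_deviance(y_true, y_pred, power=1.5))

# Custom callables must declare their direction explicitly.
fn, hib, name = resolve_metric(lambda t, p: float(np.max(np.abs(t - p))), higher_is_better=False)
print(name)                     # "<lambda>"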
ins_pricing/utils/numerics.py
ADDED

@@ -0,0 +1,76 @@
+"""Numerical utilities shared across ins_pricing.
+
+This module centralizes small, dependency-light numerical helpers so that
+other subpackages can reuse them without importing bayesopt-specific code.
+"""
+
+from __future__ import annotations
+
+import random
+import numpy as np
+
+try:
+    import torch
+    _TORCH_AVAILABLE = True
+except Exception:  # pragma: no cover - optional dependency
+    torch = None  # type: ignore[assignment]
+    _TORCH_AVAILABLE = False
+
+
+EPS = 1e-8
+"""Small epsilon value for numerical stability."""
+
+
+def set_global_seed(seed: int) -> None:
+    """Set random seed for reproducibility across numpy/python/torch."""
+    random.seed(seed)
+    np.random.seed(seed)
+    if _TORCH_AVAILABLE:
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+
+
+def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
+    """Compute adaptive batch size based on data size and learning rate."""
+    estimated = int((learning_rate / 1e-4) ** 0.5 * (data_size / max(batch_num, 1)))
+    return max(1, min(int(data_size), max(int(minimum), estimated)))
+
+
+def tweedie_loss(
+    pred,
+    target,
+    *,
+    p: float = 1.5,
+    eps: float = 1e-6,
+    max_clip: float = 1e6,
+):
+    """Compute Tweedie deviance loss for PyTorch tensors."""
+    if not _TORCH_AVAILABLE:
+        raise ImportError("tweedie_loss requires torch to be installed.")
+
+    pred_clamped = torch.clamp(pred, min=eps)
+
+    if p == 1:
+        term1 = target * torch.log(target / pred_clamped + eps)
+        term2 = -target + pred_clamped
+        term3 = 0
+    elif p == 0:
+        term1 = 0.5 * torch.pow(target - pred_clamped, 2)
+        term2 = 0
+        term3 = 0
+    elif p == 2:
+        term1 = torch.log(pred_clamped / target + eps)
+        term2 = -target / pred_clamped + 1
+        term3 = 0
+    else:
+        term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
+        term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
+        term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
+
+    return torch.nan_to_num(
+        2 * (term1 - term2 + term3),
+        nan=eps,
+        posinf=max_clip,
+        neginf=-max_clip,
+    )
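A usage sketch for the new module (argument values are illustrative; the printed batch size follows from the formula above):

from ins_pricing.utils.numerics import compute_batch_size, set_global_seed, tweedie_loss

set_global_seed(42)  # seeds python's random, numpy, and torch (when installed)

# Heuristic batch size: scales with sqrt(lr / 1e-4) and rows-per-batch,
# floored at `minimum` and capped at the dataset size.
print(compute_batch_size(data_size=100_000, learning_rate=1e-3, batch_num=64, minimum=256))
# -> 4941 for these arguments

try:
    import torch

    pred = torch.tensor([0.5, 1.2, 2.5])
    target = torch.tensor([0.0, 1.0, 3.0])
    # Element-wise Tweedie deviance; reduce explicitly if a scalar is wanted.
    print(tweedie_loss(pred, target, p=1.5).mean())
except ImportError:
    pass  # tweedie_loss itself raises ImportError when torch is absent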
ins_pricing/utils/paths.py
CHANGED
@@ -217,6 +217,7 @@ def load_dataset(
     data_format: str = "auto",
     dtype_map: Optional[Dict[str, Any]] = None,
     low_memory: bool = False,
+    chunksize: Optional[int] = None,
 ) -> pd.DataFrame:
     """Load a dataset from various formats.
 
@@ -225,6 +226,7 @@ def load_dataset(
         data_format: Format ('csv', 'parquet', 'feather', 'auto')
         dtype_map: Column type mapping
         low_memory: Whether to use low memory mode for CSV
+        chunksize: Optional chunk size for CSV streaming
 
     Returns:
         Loaded DataFrame
@@ -238,7 +240,13 @@ def load_dataset(
     elif fmt == "feather":
         df = pd.read_feather(path)
     elif fmt == "csv":
-        df = pd.read_csv(path, low_memory=low_memory, dtype=dtype_map or None)
+        if chunksize is not None:
+            chunks = []
+            for chunk in pd.read_csv(path, low_memory=low_memory, dtype=dtype_map or None, chunksize=chunksize):
+                chunks.append(chunk)
+            df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
+        else:
+            df = pd.read_csv(path, low_memory=low_memory, dtype=dtype_map or None)
     else:
         raise ValueError(f"Unsupported data_format: {data_format}")
 
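A usage sketch of the chunked CSV path (file path and dtype mapping are illustrative; the first positional argument is assumed to be the dataset path, as the function body suggests):

from ins_pricing.utils.paths import load_dataset

# With chunksize set, pandas streams the CSV in `chunksize`-row blocks that are
# concatenated afterwards, bounding parser memory at the cost of a final concat.
df = load_dataset(
    "data/policies.csv",            # illustrative path
    data_format="csv",
    dtype_map={"policy_id": "str"},
    chunksize=100_000,
)

# Omitting chunksize preserves the previous single-shot pd.read_csv behaviour.
df_small = load_dataset("data/policies.csv", data_format="csv")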