ins-pricing 0.4.5-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. ins_pricing/README.md +48 -22
  2. ins_pricing/__init__.py +142 -90
  3. ins_pricing/cli/BayesOpt_entry.py +58 -46
  4. ins_pricing/cli/BayesOpt_incremental.py +77 -110
  5. ins_pricing/cli/Explain_Run.py +42 -23
  6. ins_pricing/cli/Explain_entry.py +551 -577
  7. ins_pricing/cli/Pricing_Run.py +42 -23
  8. ins_pricing/cli/bayesopt_entry_runner.py +51 -16
  9. ins_pricing/cli/utils/bootstrap.py +23 -0
  10. ins_pricing/cli/utils/cli_common.py +256 -256
  11. ins_pricing/cli/utils/cli_config.py +379 -360
  12. ins_pricing/cli/utils/import_resolver.py +375 -358
  13. ins_pricing/cli/utils/notebook_utils.py +256 -242
  14. ins_pricing/cli/watchdog_run.py +216 -198
  15. ins_pricing/frontend/__init__.py +10 -10
  16. ins_pricing/frontend/app.py +132 -61
  17. ins_pricing/frontend/config_builder.py +33 -0
  18. ins_pricing/frontend/example_config.json +11 -0
  19. ins_pricing/frontend/example_workflows.py +1 -1
  20. ins_pricing/frontend/runner.py +340 -388
  21. ins_pricing/governance/__init__.py +20 -20
  22. ins_pricing/governance/release.py +159 -159
  23. ins_pricing/modelling/README.md +1 -1
  24. ins_pricing/modelling/__init__.py +147 -92
  25. ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
  26. ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
  27. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
  28. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
  29. ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
  30. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
  31. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
  32. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
  33. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
  34. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
  35. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
  36. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
  37. ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
  38. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
  39. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
  40. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
  41. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
  42. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
  43. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
  44. ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
  45. ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
  46. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
  47. ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
  48. ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
  49. ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
  50. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
  51. ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
  52. ins_pricing/modelling/explain/__init__.py +55 -55
  53. ins_pricing/modelling/explain/metrics.py +27 -174
  54. ins_pricing/modelling/explain/permutation.py +237 -237
  55. ins_pricing/modelling/plotting/__init__.py +40 -36
  56. ins_pricing/modelling/plotting/compat.py +228 -0
  57. ins_pricing/modelling/plotting/curves.py +572 -572
  58. ins_pricing/modelling/plotting/diagnostics.py +163 -163
  59. ins_pricing/modelling/plotting/geo.py +362 -362
  60. ins_pricing/modelling/plotting/importance.py +121 -121
  61. ins_pricing/pricing/__init__.py +27 -27
  62. ins_pricing/pricing/factors.py +67 -56
  63. ins_pricing/production/__init__.py +35 -25
  64. ins_pricing/production/{predict.py → inference.py} +140 -57
  65. ins_pricing/production/monitoring.py +8 -21
  66. ins_pricing/reporting/__init__.py +11 -11
  67. ins_pricing/setup.py +1 -1
  68. ins_pricing/tests/production/test_inference.py +90 -0
  69. ins_pricing/utils/__init__.py +112 -78
  70. ins_pricing/utils/device.py +258 -237
  71. ins_pricing/utils/features.py +53 -0
  72. ins_pricing/utils/io.py +72 -0
  73. ins_pricing/utils/logging.py +34 -1
  74. ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
  75. ins_pricing/utils/metrics.py +158 -24
  76. ins_pricing/utils/numerics.py +76 -0
  77. ins_pricing/utils/paths.py +9 -1
  78. ins_pricing/utils/profiling.py +8 -4
  79. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
  80. ins_pricing-0.5.1.dist-info/RECORD +132 -0
  81. ins_pricing/modelling/core/BayesOpt.py +0 -146
  82. ins_pricing/modelling/core/__init__.py +0 -1
  83. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
  84. ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
  85. ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
  86. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
  87. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
  88. ins_pricing/modelling/core/bayesopt/utils.py +0 -105
  89. ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
  90. ins_pricing/tests/production/test_predict.py +0 -233
  91. ins_pricing-0.4.5.dist-info/RECORD +0 -130
  92. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
  93. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,7 @@ from __future__ import annotations
  import logging
  import os
  from functools import lru_cache
- from typing import Optional
+ from typing import Optional, Union
 
 
  @lru_cache(maxsize=1)
@@ -72,3 +72,36 @@ def configure_logging(
      formatter = logging.Formatter(format_string)
      for handler in logger.handlers:
          handler.setFormatter(formatter)
+
+
+ def log_print(
+     logger: logging.Logger,
+     *args,
+     level: Optional[Union[int, str]] = None,
+     **kwargs,
+ ) -> None:
+     """Print-like helper that routes messages to a logger.
+
+     This preserves basic print semantics (sep/end) while ignoring file/flush,
+     and it auto-detects severity when level is not provided.
+     """
+     sep = kwargs.get("sep", " ")
+     msg = sep.join(str(arg) for arg in args)
+     if not msg:
+         return
+
+     if level is None:
+         lowered = msg.lstrip().lower()
+         if lowered.startswith(("warn", "[warn]", "warning")):
+             level_value = logging.WARNING
+         elif lowered.startswith(("error", "[error]", "err")):
+             level_value = logging.ERROR
+         else:
+             level_value = logging.INFO
+     else:
+         if isinstance(level, str):
+             level_value = getattr(logging, level.upper(), logging.INFO)
+         else:
+             level_value = int(level)
+
+     logger.log(level_value, msg)
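A minimal usage sketch for the new `log_print` helper above (the hunk matches `ins_pricing/utils/logging.py`, entry 73 in the file list, so that import path is assumed); the logger name and messages are illustrative:

```python
import logging

# Hypothetical usage of the helper added in this release; the import path
# assumes the hunk above lands in ins_pricing/utils/logging.py.
from ins_pricing.utils.logging import log_print

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("demo")

log_print(logger, "fit complete:", 0.42)             # auto-detected -> INFO
log_print(logger, "WARNING: high PSI on feature x")  # prefix routes -> WARNING
log_print(logger, "verbose detail", level="debug")   # explicit string level
```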
@@ -1,129 +1,125 @@
- """Loss selection and regression loss utilities."""
-
- from __future__ import annotations
-
- from typing import Optional
-
- import numpy as np
-
- from ....explain.metrics import (
-     gamma_deviance,
-     poisson_deviance,
-     tweedie_deviance,
- )
-
- LOSS_ALIASES = {
-     "poisson_deviance": "poisson",
-     "gamma_deviance": "gamma",
-     "tweedie_deviance": "tweedie",
-     "l2": "mse",
-     "l1": "mae",
-     "absolute": "mae",
-     "gaussian": "mse",
-     "normal": "mse",
- }
-
- REGRESSION_LOSSES = {"tweedie", "poisson", "gamma", "mse", "mae"}
- CLASSIFICATION_LOSSES = {"logloss", "bce"}
-
-
- def normalize_loss_name(loss_name: Optional[str], task_type: str) -> str:
-     """Normalize the loss name and validate against supported values."""
-     name = str(loss_name or "auto").strip().lower()
-     if not name or name == "auto":
-         return "auto"
-     name = LOSS_ALIASES.get(name, name)
-     if task_type == "classification":
-         if name not in CLASSIFICATION_LOSSES:
-             raise ValueError(
-                 f"Unsupported classification loss '{loss_name}'. "
-                 f"Supported: {sorted(CLASSIFICATION_LOSSES)}"
-             )
-     else:
-         if name not in REGRESSION_LOSSES:
-             raise ValueError(
-                 f"Unsupported regression loss '{loss_name}'. "
-                 f"Supported: {sorted(REGRESSION_LOSSES)}"
-             )
-     return name
-
-
- def infer_loss_name_from_model_name(model_name: str) -> str:
-     """Preserve legacy heuristic for loss selection based on model name."""
-     name = str(model_name or "")
-     if "f" in name:
-         return "poisson"
-     if "s" in name:
-         return "gamma"
-     return "tweedie"
-
-
- def resolve_tweedie_power(loss_name: str, default: float = 1.5) -> Optional[float]:
-     """Resolve Tweedie power based on loss name."""
-     if loss_name == "poisson":
-         return 1.0
-     if loss_name == "gamma":
-         return 2.0
-     if loss_name == "tweedie":
-         return float(default)
-     return None
-
-
- def resolve_xgb_objective(loss_name: str) -> str:
-     """Map regression loss name to XGBoost objective."""
-     name = loss_name if loss_name != "auto" else "tweedie"
-     mapping = {
-         "tweedie": "reg:tweedie",
-         "poisson": "count:poisson",
-         "gamma": "reg:gamma",
-         "mse": "reg:squarederror",
-         "mae": "reg:absoluteerror",
-     }
-     return mapping.get(name, "reg:tweedie")
-
-
- def regression_loss(
-     y_true,
-     y_pred,
-     sample_weight=None,
-     *,
-     loss_name: str,
-     tweedie_power: Optional[float] = 1.5,
-     eps: float = 1e-8,
- ) -> float:
-     """Compute weighted regression loss based on configured loss name."""
-     name = normalize_loss_name(loss_name, task_type="regression")
-     if name == "auto":
-         name = "tweedie"
-
-     y_t = np.asarray(y_true, dtype=float).reshape(-1)
-     y_p = np.asarray(y_pred, dtype=float).reshape(-1)
-     w = None if sample_weight is None else np.asarray(sample_weight, dtype=float).reshape(-1)
-
-     if name == "mse":
-         err = (y_t - y_p) ** 2
-         return _weighted_mean(err, w)
-     if name == "mae":
-         err = np.abs(y_t - y_p)
-         return _weighted_mean(err, w)
-     if name == "poisson":
-         return poisson_deviance(y_t, y_p, sample_weight=w, eps=eps)
-     if name == "gamma":
-         return gamma_deviance(y_t, y_p, sample_weight=w, eps=eps)
-
-     power = 1.5 if tweedie_power is None else float(tweedie_power)
-     return tweedie_deviance(y_t, y_p, sample_weight=w, power=power, eps=eps)
-
-
- def loss_requires_positive(loss_name: str) -> bool:
-     """Return True if the loss requires positive predictions."""
-     return loss_name in {"tweedie", "poisson", "gamma"}
-
-
- def _weighted_mean(values: np.ndarray, weight: Optional[np.ndarray]) -> float:
-     if weight is None:
-         return float(np.mean(values))
-     total = float(np.sum(weight))
-     if total <= 0:
-         return float(np.mean(values))
-     return float(np.sum(values * weight) / total)
+ """Loss selection and regression loss utilities."""
+
+ from __future__ import annotations
+
+ from typing import Optional
+
+ import numpy as np
+
+ from ins_pricing.utils.metrics import gamma_deviance, poisson_deviance, tweedie_deviance
+
+ LOSS_ALIASES = {
+     "poisson_deviance": "poisson",
+     "gamma_deviance": "gamma",
+     "tweedie_deviance": "tweedie",
+     "l2": "mse",
+     "l1": "mae",
+     "absolute": "mae",
+     "gaussian": "mse",
+     "normal": "mse",
+ }
+
+ REGRESSION_LOSSES = {"tweedie", "poisson", "gamma", "mse", "mae"}
+ CLASSIFICATION_LOSSES = {"logloss", "bce"}
+
+
+ def normalize_loss_name(loss_name: Optional[str], task_type: str) -> str:
+     """Normalize the loss name and validate against supported values."""
+     name = str(loss_name or "auto").strip().lower()
+     if not name or name == "auto":
+         return "auto"
+     name = LOSS_ALIASES.get(name, name)
+     if task_type == "classification":
+         if name not in CLASSIFICATION_LOSSES:
+             raise ValueError(
+                 f"Unsupported classification loss '{loss_name}'. "
+                 f"Supported: {sorted(CLASSIFICATION_LOSSES)}"
+             )
+     else:
+         if name not in REGRESSION_LOSSES:
+             raise ValueError(
+                 f"Unsupported regression loss '{loss_name}'. "
+                 f"Supported: {sorted(REGRESSION_LOSSES)}"
+             )
+     return name
+
+
+ def infer_loss_name_from_model_name(model_name: str) -> str:
+     """Preserve legacy heuristic for loss selection based on model name."""
+     name = str(model_name or "")
+     if "f" in name:
+         return "poisson"
+     if "s" in name:
+         return "gamma"
+     return "tweedie"
+
+
+ def resolve_tweedie_power(loss_name: str, default: float = 1.5) -> Optional[float]:
+     """Resolve Tweedie power based on loss name."""
+     if loss_name == "poisson":
+         return 1.0
+     if loss_name == "gamma":
+         return 2.0
+     if loss_name == "tweedie":
+         return float(default)
+     return None
+
+
+ def resolve_xgb_objective(loss_name: str) -> str:
+     """Map regression loss name to XGBoost objective."""
+     name = loss_name if loss_name != "auto" else "tweedie"
+     mapping = {
+         "tweedie": "reg:tweedie",
+         "poisson": "count:poisson",
+         "gamma": "reg:gamma",
+         "mse": "reg:squarederror",
+         "mae": "reg:absoluteerror",
+     }
+     return mapping.get(name, "reg:tweedie")
+
+
+ def regression_loss(
+     y_true,
+     y_pred,
+     sample_weight=None,
+     *,
+     loss_name: str,
+     tweedie_power: Optional[float] = 1.5,
+     eps: float = 1e-8,
+ ) -> float:
+     """Compute weighted regression loss based on configured loss name."""
+     name = normalize_loss_name(loss_name, task_type="regression")
+     if name == "auto":
+         name = "tweedie"
+
+     y_t = np.asarray(y_true, dtype=float).reshape(-1)
+     y_p = np.asarray(y_pred, dtype=float).reshape(-1)
+     w = None if sample_weight is None else np.asarray(sample_weight, dtype=float).reshape(-1)
+
+     if name == "mse":
+         err = (y_t - y_p) ** 2
+         return _weighted_mean(err, w)
+     if name == "mae":
+         err = np.abs(y_t - y_p)
+         return _weighted_mean(err, w)
+     if name == "poisson":
+         return poisson_deviance(y_t, y_p, sample_weight=w, eps=eps)
+     if name == "gamma":
+         return gamma_deviance(y_t, y_p, sample_weight=w, eps=eps)
+
+     power = 1.5 if tweedie_power is None else float(tweedie_power)
+     return tweedie_deviance(y_t, y_p, sample_weight=w, power=power, eps=eps)
+
+
+ def loss_requires_positive(loss_name: str) -> bool:
+     """Return True if the loss requires positive predictions."""
+     return loss_name in {"tweedie", "poisson", "gamma"}
+
+
+ def _weighted_mean(values: np.ndarray, weight: Optional[np.ndarray]) -> float:
+     if weight is None:
+         return float(np.mean(values))
+     total = float(np.sum(weight))
+     if total <= 0:
+         return float(np.mean(values))
+     return float(np.sum(values * weight) / total)
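The rewrite above accompanies the file move in entry 74, which relocates the module to `ins_pricing/utils/losses.py` and swaps the relative `....explain.metrics` import for the new `ins_pricing.utils.metrics` path while leaving the public API unchanged. A short sketch of how the pieces compose, with made-up arrays:

```python
import numpy as np

# The rename in this release makes the module importable outside bayesopt.
from ins_pricing.utils.losses import (
    normalize_loss_name,
    regression_loss,
    resolve_xgb_objective,
)

y_true = np.array([0.0, 1.2, 3.4])
y_pred = np.array([0.1, 1.0, 3.0])

# Aliases collapse to canonical names, which then map onto XGBoost objectives.
name = normalize_loss_name("poisson_deviance", task_type="regression")
print(name)                         # "poisson"
print(resolve_xgb_objective(name))  # "count:poisson"

# Weighted deviance evaluated under the canonical loss name.
print(regression_loss(y_true, y_pred, loss_name=name))
```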
@@ -3,7 +3,7 @@
  This module consolidates metric computation used across:
  - pricing/monitoring.py: PSI for feature drift
  - production/drift.py: PSI wrapper for production monitoring
- - modelling/core/bayesopt/: Model evaluation metrics
+ - modelling/bayesopt/: Model evaluation metrics
 
  Example:
      >>> from ins_pricing.utils import psi_report, MetricFactory
@@ -16,23 +16,15 @@ Example:
 
  from __future__ import annotations
 
- from typing import Any, Iterable, List, Optional
+ from typing import Iterable, List, Optional
 
  import numpy as np
  import pandas as pd
 
  try:
-     from sklearn.metrics import (
-         log_loss,
-         mean_absolute_error,
-         mean_squared_error,
-         mean_tweedie_deviance,
-     )
- except ImportError:
-     log_loss = None
-     mean_absolute_error = None
-     mean_squared_error = None
-     mean_tweedie_deviance = None
+     from sklearn.metrics import roc_auc_score
+ except Exception:  # pragma: no cover - optional dependency
+     roc_auc_score = None
 
 
  # =============================================================================
@@ -190,6 +182,152 @@ def psi_report(
  # =============================================================================
 
 
+ def _to_numpy(arr) -> np.ndarray:
+     out = np.asarray(arr, dtype=float)
+     return out.reshape(-1)
+
+
+ def _align(y_true, y_pred, sample_weight=None):
+     y_t = _to_numpy(y_true)
+     y_p = _to_numpy(y_pred)
+     if y_t.shape[0] != y_p.shape[0]:
+         raise ValueError("y_true and y_pred must have the same length.")
+     if sample_weight is None:
+         return y_t, y_p, None
+     w = _to_numpy(sample_weight)
+     if w.shape[0] != y_t.shape[0]:
+         raise ValueError("sample_weight must have the same length as y_true.")
+     return y_t, y_p, w
+
+
+ def _weighted_mean(values: np.ndarray, weight: Optional[np.ndarray]) -> float:
+     if weight is None:
+         return float(np.mean(values))
+     total = float(np.sum(weight))
+     if total <= 0:
+         return float(np.mean(values))
+     return float(np.sum(values * weight) / total)
+
+
+ def rmse(y_true, y_pred, sample_weight=None) -> float:
+     y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+     err = (y_t - y_p) ** 2
+     return float(np.sqrt(_weighted_mean(err, w)))
+
+
+ def mae(y_true, y_pred, sample_weight=None) -> float:
+     y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+     err = np.abs(y_t - y_p)
+     return _weighted_mean(err, w)
+
+
+ def mape(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+     y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+     denom = np.maximum(np.abs(y_t), eps)
+     err = np.abs((y_t - y_p) / denom)
+     return _weighted_mean(err, w)
+
+
+ def r2_score(y_true, y_pred, sample_weight=None) -> float:
+     y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+     if w is None:
+         y_mean = float(np.mean(y_t))
+         sse = float(np.sum((y_t - y_p) ** 2))
+         sst = float(np.sum((y_t - y_mean) ** 2))
+     else:
+         w_sum = float(np.sum(w))
+         y_mean = float(np.sum(w * y_t) / w_sum) if w_sum > 0 else float(np.mean(y_t))
+         sse = float(np.sum(w * (y_t - y_p) ** 2))
+         sst = float(np.sum(w * (y_t - y_mean) ** 2))
+     if sst <= 0:
+         return 0.0
+     return 1.0 - sse / sst
+
+
+ def logloss(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+     y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+     p = np.clip(y_p, eps, 1 - eps)
+     loss = -(y_t * np.log(p) + (1 - y_t) * np.log(1 - p))
+     return _weighted_mean(loss, w)
+
+
+ def tweedie_deviance(
+     y_true,
+     y_pred,
+     sample_weight=None,
+     *,
+     power: float = 1.5,
+     eps: float = 1e-8,
+ ) -> float:
+     if power < 0:
+         raise ValueError("power must be >= 0.")
+     y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+     y_p = np.clip(y_p, eps, None)
+     y_t_safe = np.clip(y_t, eps, None)
+
+     if power == 0:
+         dev = (y_t - y_p) ** 2
+     elif power == 1:
+         dev = 2 * (y_t_safe * np.log(y_t_safe / y_p) - (y_t_safe - y_p))
+     elif power == 2:
+         ratio = y_t_safe / y_p
+         dev = 2 * ((ratio - 1) - np.log(ratio))
+     else:
+         term1 = np.power(y_t_safe, 2 - power) / ((1 - power) * (2 - power))
+         term2 = y_t_safe * np.power(y_p, 1 - power) / (1 - power)
+         term3 = np.power(y_p, 2 - power) / (2 - power)
+         dev = 2 * (term1 - term2 + term3)
+     return _weighted_mean(dev, w)
+
+
+ def poisson_deviance(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+     return tweedie_deviance(
+         y_true, y_pred, sample_weight=sample_weight, power=1.0, eps=eps
+     )
+
+
+ def gamma_deviance(y_true, y_pred, sample_weight=None, eps: float = 1e-8) -> float:
+     return tweedie_deviance(
+         y_true, y_pred, sample_weight=sample_weight, power=2.0, eps=eps
+     )
+
+
+ def auc_score(y_true, y_pred, sample_weight=None) -> float:
+     if roc_auc_score is None:
+         raise RuntimeError("auc requires scikit-learn.")
+     y_t, y_p, w = _align(y_true, y_pred, sample_weight)
+     return float(roc_auc_score(y_t, y_p, sample_weight=w))
+
+
+ def resolve_metric(metric, *, task_type: Optional[str] = None, higher_is_better: Optional[bool] = None):
+     if callable(metric):
+         if higher_is_better is None:
+             raise ValueError("higher_is_better must be provided for custom metric.")
+         return metric, bool(higher_is_better), getattr(metric, "__name__", "custom")
+
+     name = str(metric or "auto").lower()
+     if name == "auto":
+         name = "logloss" if task_type == "classification" else "rmse"
+
+     mapping = {
+         "rmse": (rmse, False),
+         "mae": (mae, False),
+         "mape": (mape, False),
+         "r2": (r2_score, True),
+         "logloss": (logloss, False),
+         "poisson": (poisson_deviance, False),
+         "gamma": (gamma_deviance, False),
+         "tweedie": (tweedie_deviance, False),
+         "auc": (auc_score, True),
+     }
+     if name not in mapping:
+         raise ValueError(f"Unsupported metric: {metric}")
+     fn, hib = mapping[name]
+     if higher_is_better is not None:
+         hib = bool(higher_is_better)
+     return fn, hib, name
+
+
  class MetricFactory:
      """Factory for computing evaluation metrics consistently across all trainers.
 
@@ -240,25 +378,21 @@ class MetricFactory:
          Returns:
              Computed metric value (lower is better)
          """
-         if log_loss is None or mean_tweedie_deviance is None:
-             raise ImportError("sklearn is required for metric computation")
-
          y_pred = np.asarray(y_pred)
          y_true = np.asarray(y_true)
 
          if self.task_type == "classification":
              y_pred_clipped = np.clip(y_pred, self.clip_min, self.clip_max)
-             return float(log_loss(y_true, y_pred_clipped, sample_weight=sample_weight))
+             return float(logloss(y_true, y_pred_clipped, sample_weight=sample_weight))
 
          loss_name = str(self.loss_name or "tweedie").strip().lower()
          if loss_name in {"mse", "mae"}:
-             if mean_squared_error is None or mean_absolute_error is None:
-                 raise ImportError("sklearn is required for metric computation")
+             y_t, y_p, w = _align(y_true, y_pred, sample_weight)
              if loss_name == "mse":
-                 return float(mean_squared_error(
-                     y_true, y_pred, sample_weight=sample_weight))
-             return float(mean_absolute_error(
-                 y_true, y_pred, sample_weight=sample_weight))
+                 err = (y_t - y_p) ** 2
+                 return _weighted_mean(err, w)
+             err = np.abs(y_t - y_p)
+             return _weighted_mean(err, w)
 
          y_pred_safe = np.maximum(y_pred, self.clip_min)
          power = self.tweedie_power
@@ -267,7 +401,7 @@ class MetricFactory:
          elif loss_name == "gamma":
              power = 2.0
          return float(
-             mean_tweedie_deviance(
+             tweedie_deviance(
                  y_true,
                  y_pred_safe,
                  sample_weight=sample_weight,
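The hunk above replaces the scikit-learn-backed paths in `MetricFactory` with in-house NumPy implementations and adds a `resolve_metric` registry. A sketch of the resolution contract, with made-up data (the module is `ins_pricing/utils/metrics.py`, the path the relocated losses module imports from):

```python
import numpy as np

# Both helpers come from the metrics hunk above.
from ins_pricing.utils.metrics import resolve_metric, tweedie_deviance

y_true = np.array([1.0, 2.0, 0.5])
y_pred = np.array([1.1, 1.8, 0.6])
weights = np.array([1.0, 2.0, 1.0])

# resolve_metric returns (callable, higher_is_better, canonical_name).
fn, higher_is_better, name = resolve_metric("rmse")
print(name, higher_is_better, fn(y_true, y_pred, sample_weight=weights))

# "auto" picks logloss for classification and rmse otherwise.
fn, _, name = resolve_metric("auto", task_type="classification")
print(name)  # "logloss"

# Deviances take an explicit Tweedie power (1.0 = Poisson, 2.0 = gamma).
print(tweedie_deviance(y_true, y_pred, power=1.5))
```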
@@ -0,0 +1,76 @@
+ """Numerical utilities shared across ins_pricing.
+
+ This module centralizes small, dependency-light numerical helpers so that
+ other subpackages can reuse them without importing bayesopt-specific code.
+ """
+
+ from __future__ import annotations
+
+ import random
+ import numpy as np
+
+ try:
+     import torch
+     _TORCH_AVAILABLE = True
+ except Exception:  # pragma: no cover - optional dependency
+     torch = None  # type: ignore[assignment]
+     _TORCH_AVAILABLE = False
+
+
+ EPS = 1e-8
+ """Small epsilon value for numerical stability."""
+
+
+ def set_global_seed(seed: int) -> None:
+     """Set random seed for reproducibility across numpy/python/torch."""
+     random.seed(seed)
+     np.random.seed(seed)
+     if _TORCH_AVAILABLE:
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+
+ def compute_batch_size(data_size: int, learning_rate: float, batch_num: int, minimum: int) -> int:
+     """Compute adaptive batch size based on data size and learning rate."""
+     estimated = int((learning_rate / 1e-4) ** 0.5 * (data_size / max(batch_num, 1)))
+     return max(1, min(int(data_size), max(int(minimum), estimated)))
+
+
+ def tweedie_loss(
+     pred,
+     target,
+     *,
+     p: float = 1.5,
+     eps: float = 1e-6,
+     max_clip: float = 1e6,
+ ):
+     """Compute Tweedie deviance loss for PyTorch tensors."""
+     if not _TORCH_AVAILABLE:
+         raise ImportError("tweedie_loss requires torch to be installed.")
+
+     pred_clamped = torch.clamp(pred, min=eps)
+
+     if p == 1:
+         term1 = target * torch.log(target / pred_clamped + eps)
+         term2 = -target + pred_clamped
+         term3 = 0
+     elif p == 0:
+         term1 = 0.5 * torch.pow(target - pred_clamped, 2)
+         term2 = 0
+         term3 = 0
+     elif p == 2:
+         term1 = torch.log(pred_clamped / target + eps)
+         term2 = -target / pred_clamped + 1
+         term3 = 0
+     else:
+         term1 = torch.pow(target, 2 - p) / ((1 - p) * (2 - p))
+         term2 = target * torch.pow(pred_clamped, 1 - p) / (1 - p)
+         term3 = torch.pow(pred_clamped, 2 - p) / (2 - p)
+
+     return torch.nan_to_num(
+         2 * (term1 - term2 + term3),
+         nan=eps,
+         posinf=max_clip,
+         neginf=-max_clip,
+     )
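The new `ins_pricing/utils/numerics.py` (entry 76) guards its torch dependency, so only `tweedie_loss` hard-requires it. A sketch with toy tensors:

```python
import torch  # optional for the module itself, required for tweedie_loss

from ins_pricing.utils.numerics import (
    compute_batch_size,
    set_global_seed,
    tweedie_loss,
)

set_global_seed(42)  # seeds Python's random, numpy, and torch (if present)

# Batch size scales with sqrt(learning_rate / 1e-4), floored at `minimum`
# and capped at the dataset size.
print(compute_batch_size(data_size=100_000, learning_rate=4e-4,
                         batch_num=64, minimum=256))  # 3125

pred = torch.tensor([0.8, 1.5, 2.2])
target = torch.tensor([1.0, 1.0, 3.0])
# Element-wise Tweedie deviance; reduce it yourself (e.g. .mean()).
print(tweedie_loss(pred, target, p=1.5).mean())
```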
@@ -217,6 +217,7 @@ def load_dataset(
      data_format: str = "auto",
      dtype_map: Optional[Dict[str, Any]] = None,
      low_memory: bool = False,
+     chunksize: Optional[int] = None,
  ) -> pd.DataFrame:
      """Load a dataset from various formats.
 
@@ -225,6 +226,7 @@ def load_dataset(
          data_format: Format ('csv', 'parquet', 'feather', 'auto')
          dtype_map: Column type mapping
          low_memory: Whether to use low memory mode for CSV
+         chunksize: Optional chunk size for CSV streaming
 
      Returns:
          Loaded DataFrame
@@ -238,7 +240,13 @@ def load_dataset(
      elif fmt == "feather":
          df = pd.read_feather(path)
      elif fmt == "csv":
-         df = pd.read_csv(path, low_memory=low_memory, dtype=dtype_map or None)
+         if chunksize is not None:
+             chunks = []
+             for chunk in pd.read_csv(path, low_memory=low_memory, dtype=dtype_map or None, chunksize=chunksize):
+                 chunks.append(chunk)
+             df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
+         else:
+             df = pd.read_csv(path, low_memory=low_memory, dtype=dtype_map or None)
      else:
          raise ValueError(f"Unsupported data_format: {data_format}")
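Finally, a sketch of the new `chunksize` pass-through in `load_dataset`; the hunk does not name its module, so the import path and file name below are assumptions for illustration:

```python
# Assumed module path; the hunk above shows load_dataset but not its file.
from ins_pricing.utils.io import load_dataset

# Stream a large CSV in 100k-row chunks; the chunks are concatenated, so the
# final DataFrame matches a plain read and only parsing memory peaks lower.
df = load_dataset("policies.csv", data_format="csv", chunksize=100_000)

# Omitting chunksize preserves the 0.4.5 behaviour exactly.
df_small = load_dataset("policies.csv", data_format="csv")
```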