ins-pricing 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. ins_pricing/README.md +48 -22
  2. ins_pricing/__init__.py +142 -90
  3. ins_pricing/cli/BayesOpt_entry.py +58 -46
  4. ins_pricing/cli/BayesOpt_incremental.py +77 -110
  5. ins_pricing/cli/Explain_Run.py +42 -23
  6. ins_pricing/cli/Explain_entry.py +551 -577
  7. ins_pricing/cli/Pricing_Run.py +42 -23
  8. ins_pricing/cli/bayesopt_entry_runner.py +51 -16
  9. ins_pricing/cli/utils/bootstrap.py +23 -0
  10. ins_pricing/cli/utils/cli_common.py +256 -256
  11. ins_pricing/cli/utils/cli_config.py +379 -360
  12. ins_pricing/cli/utils/import_resolver.py +375 -358
  13. ins_pricing/cli/utils/notebook_utils.py +256 -242
  14. ins_pricing/cli/watchdog_run.py +216 -198
  15. ins_pricing/frontend/__init__.py +10 -10
  16. ins_pricing/frontend/app.py +132 -61
  17. ins_pricing/frontend/config_builder.py +33 -0
  18. ins_pricing/frontend/example_config.json +11 -0
  19. ins_pricing/frontend/example_workflows.py +1 -1
  20. ins_pricing/frontend/runner.py +340 -388
  21. ins_pricing/governance/__init__.py +20 -20
  22. ins_pricing/governance/release.py +159 -159
  23. ins_pricing/modelling/README.md +1 -1
  24. ins_pricing/modelling/__init__.py +147 -92
  25. ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
  26. ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
  27. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
  28. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
  29. ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
  30. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
  31. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
  32. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
  33. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
  34. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
  35. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
  36. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
  37. ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
  38. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
  39. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
  40. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
  41. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
  42. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
  43. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
  44. ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
  45. ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
  46. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
  47. ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
  48. ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
  49. ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
  50. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
  51. ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
  52. ins_pricing/modelling/explain/__init__.py +55 -55
  53. ins_pricing/modelling/explain/metrics.py +27 -174
  54. ins_pricing/modelling/explain/permutation.py +237 -237
  55. ins_pricing/modelling/plotting/__init__.py +40 -36
  56. ins_pricing/modelling/plotting/compat.py +228 -0
  57. ins_pricing/modelling/plotting/curves.py +572 -572
  58. ins_pricing/modelling/plotting/diagnostics.py +163 -163
  59. ins_pricing/modelling/plotting/geo.py +362 -362
  60. ins_pricing/modelling/plotting/importance.py +121 -121
  61. ins_pricing/pricing/__init__.py +27 -27
  62. ins_pricing/pricing/factors.py +67 -56
  63. ins_pricing/production/__init__.py +35 -25
  64. ins_pricing/production/{predict.py → inference.py} +140 -57
  65. ins_pricing/production/monitoring.py +8 -21
  66. ins_pricing/reporting/__init__.py +11 -11
  67. ins_pricing/setup.py +1 -1
  68. ins_pricing/tests/production/test_inference.py +90 -0
  69. ins_pricing/utils/__init__.py +112 -78
  70. ins_pricing/utils/device.py +258 -237
  71. ins_pricing/utils/features.py +53 -0
  72. ins_pricing/utils/io.py +72 -0
  73. ins_pricing/utils/logging.py +34 -1
  74. ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
  75. ins_pricing/utils/metrics.py +158 -24
  76. ins_pricing/utils/numerics.py +76 -0
  77. ins_pricing/utils/paths.py +9 -1
  78. ins_pricing/utils/profiling.py +8 -4
  79. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
  80. ins_pricing-0.5.1.dist-info/RECORD +132 -0
  81. ins_pricing/modelling/core/BayesOpt.py +0 -146
  82. ins_pricing/modelling/core/__init__.py +0 -1
  83. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
  84. ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
  85. ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
  86. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
  87. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
  88. ins_pricing/modelling/core/bayesopt/utils.py +0 -105
  89. ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
  90. ins_pricing/tests/production/test_predict.py +0 -233
  91. ins_pricing-0.4.5.dist-info/RECORD +0 -130
  92. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
  93. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
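The largest structural change in 0.5.x is the flattening of ins_pricing.modelling.core.bayesopt into ins_pricing.modelling.bayesopt and the rename of production/predict.py to production/inference.py. A minimal compatibility-import sketch follows, assuming only the module paths visible in the file list above; the aliases and the try/except shape are illustrative, and the names re-exported by these modules are not confirmed by this diff.

# Hypothetical shim for downstream code that must run against both layouts.
# Module paths are taken from the file list above; everything else is an assumption.
try:
    # 0.5.x layout: bayesopt moved up one level, predict.py renamed to inference.py.
    from ins_pricing.modelling.bayesopt import core as bayesopt_core
    from ins_pricing.production import inference as production_api
except ImportError:
    # 0.4.x layout.
    from ins_pricing.modelling.core.bayesopt import core as bayesopt_core
    from ins_pricing.production import predict as production_api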
ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py
@@ -8,20 +8,36 @@ import pandas as pd
 from sklearn.metrics import log_loss
 from sklearn.model_selection import GroupKFold, TimeSeriesSplit
 
-from .trainer_base import TrainerBase
-from ..models import FTTransformerSklearn
-from ..utils.losses import regression_loss
-
-
-class FTTrainer(TrainerBase):
-    def __init__(self, context: "BayesOptModel") -> None:
-        if context.task_type == 'classification':
-            super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
-        else:
-            super().__init__(context, 'FTTransformer', 'FTTransformer')
-        self.model: Optional[FTTransformerSklearn] = None
-        self.enable_distributed_optuna = bool(context.config.use_ft_ddp)
-        self._cv_geo_warned = False
+from ins_pricing.modelling.bayesopt.trainers.trainer_base import TrainerBase
+from ins_pricing.modelling.bayesopt.models import FTTransformerSklearn
+from ins_pricing.utils.losses import regression_loss
+from ins_pricing.utils import get_logger, log_print
+
+_logger = get_logger("ins_pricing.trainer.ft")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
+
+class FTTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        if context.task_type == 'classification':
+            super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
+        else:
+            super().__init__(context, 'FTTransformer', 'FTTransformer')
+        self.model: Optional[FTTransformerSklearn] = None
+        self.enable_distributed_optuna = bool(context.config.use_ft_ddp)
+        self._cv_geo_warned = False
+
+    def _maybe_cleanup_gpu(self, model: Optional[FTTransformerSklearn]) -> None:
+        if not bool(getattr(self.ctx.config, "ft_cleanup_per_fold", False)):
+            return
+        if model is not None:
+            getattr(getattr(model, "ft", None), "to",
+                    lambda *_args, **_kwargs: None)("cpu")
+        synchronize = bool(getattr(self.ctx.config, "ft_cleanup_synchronize", False))
+        self._clean_gpu(synchronize=synchronize)
 
     def _resolve_numeric_tokens(self) -> int:
         requested = getattr(self.ctx.config, "ft_num_numeric_tokens", None)
@@ -121,7 +137,7 @@ class FTTrainer(TrainerBase):
         if built is not None:
             geo_train, geo_val, _, _ = built
         elif not self._cv_geo_warned:
-            print(
+            _log(
                 "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
                 flush=True,
             )
@@ -168,22 +184,20 @@ class FTTrainer(TrainerBase):
         )
         model = self._apply_dataloader_overrides(model)
         model.set_params(model_params)
-        try:
-            return float(model.fit_unsupervised(
-                X_train,
-                X_val=X_val,
-                trial=trial,
-                geo_train=geo_train,
-                geo_val=geo_val,
-                mask_prob_num=mask_prob_num,
-                mask_prob_cat=mask_prob_cat,
-                num_loss_weight=num_loss_weight,
-                cat_loss_weight=cat_loss_weight
-            ))
-        finally:
-            getattr(getattr(model, "ft", None), "to",
-                    lambda *_args, **_kwargs: None)("cpu")
-            self._clean_gpu()
+        try:
+            return float(model.fit_unsupervised(
+                X_train,
+                X_val=X_val,
+                trial=trial,
+                geo_train=geo_train,
+                geo_val=geo_val,
+                mask_prob_num=mask_prob_num,
+                mask_prob_cat=mask_prob_cat,
+                num_loss_weight=num_loss_weight,
+                cat_loss_weight=cat_loss_weight
+            ))
+        finally:
+            self._maybe_cleanup_gpu(model)
 
     def cross_val(self, trial: optuna.trial.Trial) -> float:
         # FT-Transformer CV also focuses on memory control:
@@ -229,7 +243,7 @@
             token_count += 1
         approx_units = d_model * n_layers * max(1, token_count)
         if approx_units > 12_000_000:
-            print(
+            _log(
                 f"[FTTrainer] Trial pruned early: d_model={d_model}, n_layers={n_layers} -> approx_units={approx_units}")
             raise optuna.TrialPruned(
                 "config exceeds safe memory budget; prune before training")
@@ -285,7 +299,7 @@
         if built is not None:
             geo_train, geo_val, _, _ = built
         elif not self._cv_geo_warned:
-            print(
+            _log(
                 "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
                 flush=True,
             )
@@ -338,7 +352,7 @@
             requested_heads=resolved_params.get("n_heads")
         )
         if heads_adjusted:
-            print(f"[FTTrainer] Auto-adjusted n_heads from "
+            _log(f"[FTTrainer] Auto-adjusted n_heads from "
                   f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                   f"(d_model={d_model_value}).")
             resolved_params["n_heads"] = adaptive_heads
@@ -378,13 +392,11 @@
             geo_train=geo_train,
             geo_val=geo_val,
         )
-        refit_epochs = self._resolve_best_epoch(
-            getattr(tmp_model, "training_history", None),
-            default_epochs=int(self.ctx.epochs),
-        )
-        getattr(getattr(tmp_model, "ft", None), "to",
-                lambda *_args, **_kwargs: None)("cpu")
-        self._clean_gpu()
+        refit_epochs = self._resolve_best_epoch(
+            getattr(tmp_model, "training_history", None),
+            default_epochs=int(self.ctx.epochs),
+        )
+        self._maybe_cleanup_gpu(tmp_model)
 
         self.model = FTTransformerSklearn(
             model_nme=self.ctx.model_nme,
@@ -451,7 +463,7 @@
 
         split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
         if split_iter is None:
-            print(
+            _log(
                 f"[FT Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
                 flush=True,
             )
@@ -494,15 +506,13 @@
 
             pred_train = model.predict(X_all, geo_tokens=geo_train_full)
             pred_test = model.predict(X_test, geo_tokens=geo_test_full)
-            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
-            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
-            getattr(getattr(model, "ft", None), "to",
-                    lambda *_args, **_kwargs: None)("cpu")
-            self._clean_gpu()
-            split_count += 1
+            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
+            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
+            self._maybe_cleanup_gpu(model)
+            split_count += 1
 
         if split_count < 1:
-            print(
+            _log(
                 f"[FT Ensemble] no CV splits generated; skip ensemble.",
                 flush=True,
             )
@@ -591,7 +601,7 @@
             requested_heads=resolved_params.get("n_heads"),
         )
         if heads_adjusted:
-            print(
+            _log(
                 f"[FTTrainer] Auto-adjusted n_heads from "
                 f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                 f"(d_model={resolved_params.get('d_model', model.d_model)})."
@@ -652,11 +662,9 @@
             if preds_train is None:
                 preds_train = np.empty(
                     (len(X_all),) + fold_pred.shape[1:], dtype=fold_pred.dtype)
-            preds_train[val_idx] = fold_pred
-
-            getattr(getattr(model, "ft", None), "to",
-                    lambda *_a, **_k: None)("cpu")
-            self._clean_gpu()
+            preds_train[val_idx] = fold_pred
+
+            self._maybe_cleanup_gpu(model)
 
         if preds_train is None:
             return None
@@ -773,7 +781,7 @@
             requested_heads=resolved_params.get("n_heads")
        )
         if heads_adjusted:
-            print(f"[FTTrainer] Auto-adjusted n_heads from "
+            _log(f"[FTTrainer] Auto-adjusted n_heads from "
                   f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                   f"(d_model={resolved_params.get('d_model', self.model.d_model)}).")
             resolved_params["n_heads"] = adaptive_heads
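In 0.4.5 the FT trainer always moved model.ft back to the CPU and called self._clean_gpu() after every fold; in 0.5.1 that cleanup runs only through the new _maybe_cleanup_gpu helper, gated by two config flags read with getattr. A minimal sketch of those flags follows; the flag names come from the hunks above, while the config class here is a hypothetical stand-in for whatever BayesOptModel.config actually is.

# Sketch only: ExampleConfig stands in for the real config object.
class ExampleConfig:
    ft_cleanup_per_fold = True       # opt in to per-fold CPU offload and GPU cache cleanup
    ft_cleanup_synchronize = False   # leave device synchronization off during cleanup

cfg = ExampleConfig()
print(bool(getattr(cfg, "ft_cleanup_per_fold", False)))      # True  -> cleanup runs each fold
print(bool(getattr(cfg, "ft_cleanup_synchronize", False)))   # False -> _clean_gpu() without sync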
ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py
@@ -1,198 +1,203 @@
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-import optuna
-import pandas as pd
-import statsmodels.api as sm
-from sklearn.metrics import log_loss
-
-from .trainer_base import TrainerBase
-from ..utils import EPS
-from ..utils.losses import regression_loss
-
-class GLMTrainer(TrainerBase):
-    def __init__(self, context: "BayesOptModel") -> None:
-        super().__init__(context, 'GLM', 'GLM')
-        self.model = None
-
-    def _select_family(self, tweedie_power: Optional[float] = None):
-        if self.ctx.task_type == 'classification':
-            return sm.families.Binomial()
-        loss_name = getattr(self.ctx, "loss_name", "tweedie")
-        if loss_name == "poisson":
-            return sm.families.Poisson()
-        if loss_name == "gamma":
-            return sm.families.Gamma()
-        if loss_name in {"mse", "mae"}:
-            return sm.families.Gaussian()
-        power = tweedie_power if tweedie_power is not None else 1.5
-        return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
-
-    def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
-        # Add intercept to the statsmodels design matrix.
-        X = data[self.ctx.var_nmes]
-        return sm.add_constant(X, has_constant='add')
-
-    def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
-        if isinstance(family, sm.families.Poisson):
-            return 1.0
-        if isinstance(family, sm.families.Gamma):
-            return 2.0
-        if isinstance(family, sm.families.Tweedie):
-            return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
-        return 1.5
-
-    def cross_val(self, trial: optuna.trial.Trial) -> float:
-        param_space = {
-            "alpha": lambda t: t.suggest_float('alpha', 1e-6, 1e2, log=True),
-            "l1_ratio": lambda t: t.suggest_float('l1_ratio', 0.0, 1.0)
-        }
-        loss_name = getattr(self.ctx, "loss_name", "tweedie")
-        if self.ctx.task_type == 'regression' and loss_name == 'tweedie':
-            param_space["tweedie_power"] = lambda t: t.suggest_float(
-                'tweedie_power', 1.0, 2.0)
-
-        def data_provider():
-            data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
-            assert data is not None, "Preprocessed training data is missing."
-            return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
-
-        def preprocess_fn(X_train, X_val):
-            X_train_s, X_val_s, _ = self._standardize_fold(
-                X_train, X_val, self.ctx.num_features)
-            return self._prepare_design(X_train_s), self._prepare_design(X_val_s)
-
-        metric_ctx: Dict[str, Any] = {}
-
-        def model_builder(params):
-            family = self._select_family(params.get("tweedie_power"))
-            metric_ctx["family"] = family
-            metric_ctx["tweedie_power"] = params.get("tweedie_power")
-            return {
-                "family": family,
-                "alpha": params["alpha"],
-                "l1_ratio": params["l1_ratio"],
-                "tweedie_power": params.get("tweedie_power")
-            }
-
-        def fit_predict(model_cfg, X_train, y_train, w_train, X_val, y_val, w_val, _trial):
-            glm = sm.GLM(y_train, X_train,
-                         family=model_cfg["family"],
-                         freq_weights=w_train)
-            result = glm.fit_regularized(
-                alpha=model_cfg["alpha"],
-                L1_wt=model_cfg["l1_ratio"],
-                maxiter=200
-            )
-            return result.predict(X_val)
-
-        def metric_fn(y_true, y_pred, weight):
-            if self.ctx.task_type == 'classification':
-                y_pred_clipped = np.clip(y_pred, EPS, 1 - EPS)
-                return log_loss(y_true, y_pred_clipped, sample_weight=weight)
-            return regression_loss(
-                y_true,
-                y_pred,
-                weight,
-                loss_name=loss_name,
-                tweedie_power=metric_ctx.get("tweedie_power"),
-            )
-
-        return self.cross_val_generic(
-            trial=trial,
-            hyperparameter_space=param_space,
-            data_provider=data_provider,
-            model_builder=model_builder,
-            metric_fn=metric_fn,
-            preprocess_fn=preprocess_fn,
-            fit_predict_fn=fit_predict
-        )
-
-    def train(self) -> None:
-        if not self.best_params:
-            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
-        tweedie_power = self.best_params.get('tweedie_power')
-        family = self._select_family(tweedie_power)
-
-        X_train = self._prepare_design(self.ctx.train_oht_scl_data)
-        y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
-        w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
-
-        glm = sm.GLM(y_train, X_train, family=family,
-                     freq_weights=w_train)
-        self.model = glm.fit_regularized(
-            alpha=self.best_params['alpha'],
-            L1_wt=self.best_params['l1_ratio'],
-            maxiter=300
-        )
-
-        self.ctx.glm_best = self.model
-        self.ctx.model_label += [self.label]
-        self._predict_and_cache(
-            self.model,
-            'glm',
-            design_fn=lambda train: self._prepare_design(
-                self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
-            )
-        )
-
-    def ensemble_predict(self, k: int) -> None:
-        if not self.best_params:
-            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
-        k = max(2, int(k))
-        data = self.ctx.train_oht_scl_data
-        if data is None:
-            raise RuntimeError("Missing standardized data for GLM ensemble.")
-        X_all = data[self.ctx.var_nmes]
-        y_all = data[self.ctx.resp_nme]
-        w_all = data[self.ctx.weight_nme]
-        X_test = self.ctx.test_oht_scl_data
-        if X_test is None:
-            raise RuntimeError("Missing standardized test data for GLM ensemble.")
-
-        n_samples = len(X_all)
-        X_all_design = self._prepare_design(data)
-        X_test_design = self._prepare_design(X_test)
-        tweedie_power = self.best_params.get('tweedie_power')
-        family = self._select_family(tweedie_power)
-
-        split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
-        if split_iter is None:
-            print(
-                f"[GLM Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
-                flush=True,
-            )
-            return
-        preds_train_sum = np.zeros(n_samples, dtype=np.float64)
-        preds_test_sum = np.zeros(len(X_test_design), dtype=np.float64)
-
-        split_count = 0
-        for train_idx, _val_idx in split_iter:
-            X_train = X_all_design.iloc[train_idx]
-            y_train = y_all.iloc[train_idx]
-            w_train = w_all.iloc[train_idx]
-
-            glm = sm.GLM(y_train, X_train, family=family, freq_weights=w_train)
-            result = glm.fit_regularized(
-                alpha=self.best_params['alpha'],
-                L1_wt=self.best_params['l1_ratio'],
-                maxiter=300
-            )
-            pred_train = result.predict(X_all_design)
-            pred_test = result.predict(X_test_design)
-            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
-            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
-            split_count += 1
-
-        if split_count < 1:
-            print(
-                f"[GLM Ensemble] no CV splits generated; skip ensemble.",
-                flush=True,
-            )
-            return
-        preds_train = preds_train_sum / float(split_count)
-        preds_test = preds_test_sum / float(split_count)
-        self._cache_predictions("glm", preds_train, preds_test)
-
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import optuna
+import pandas as pd
+import statsmodels.api as sm
+from sklearn.metrics import log_loss
+
+from ins_pricing.modelling.bayesopt.trainers.trainer_base import TrainerBase
+from ins_pricing.utils import EPS, get_logger, log_print
+from ins_pricing.utils.losses import regression_loss
+
+_logger = get_logger("ins_pricing.trainer.glm")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
+class GLMTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        super().__init__(context, 'GLM', 'GLM')
+        self.model = None
+
+    def _select_family(self, tweedie_power: Optional[float] = None):
+        if self.ctx.task_type == 'classification':
+            return sm.families.Binomial()
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
+        if loss_name == "poisson":
+            return sm.families.Poisson()
+        if loss_name == "gamma":
+            return sm.families.Gamma()
+        if loss_name in {"mse", "mae"}:
+            return sm.families.Gaussian()
+        power = tweedie_power if tweedie_power is not None else 1.5
+        return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
+
+    def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
+        # Add intercept to the statsmodels design matrix.
+        X = data[self.ctx.var_nmes]
+        return sm.add_constant(X, has_constant='add')
+
+    def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
+        if isinstance(family, sm.families.Poisson):
+            return 1.0
+        if isinstance(family, sm.families.Gamma):
+            return 2.0
+        if isinstance(family, sm.families.Tweedie):
+            return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
+        return 1.5
+
+    def cross_val(self, trial: optuna.trial.Trial) -> float:
+        param_space = {
+            "alpha": lambda t: t.suggest_float('alpha', 1e-6, 1e2, log=True),
+            "l1_ratio": lambda t: t.suggest_float('l1_ratio', 0.0, 1.0)
+        }
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
+        if self.ctx.task_type == 'regression' and loss_name == 'tweedie':
+            param_space["tweedie_power"] = lambda t: t.suggest_float(
+                'tweedie_power', 1.0, 2.0)
+
+        def data_provider():
+            data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
+            assert data is not None, "Preprocessed training data is missing."
+            return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
+
+        def preprocess_fn(X_train, X_val):
+            X_train_s, X_val_s, _ = self._standardize_fold(
+                X_train, X_val, self.ctx.num_features)
+            return self._prepare_design(X_train_s), self._prepare_design(X_val_s)
+
+        metric_ctx: Dict[str, Any] = {}
+
+        def model_builder(params):
+            family = self._select_family(params.get("tweedie_power"))
+            metric_ctx["family"] = family
+            metric_ctx["tweedie_power"] = params.get("tweedie_power")
+            return {
+                "family": family,
+                "alpha": params["alpha"],
+                "l1_ratio": params["l1_ratio"],
+                "tweedie_power": params.get("tweedie_power")
+            }
+
+        def fit_predict(model_cfg, X_train, y_train, w_train, X_val, y_val, w_val, _trial):
+            glm = sm.GLM(y_train, X_train,
+                         family=model_cfg["family"],
+                         freq_weights=w_train)
+            result = glm.fit_regularized(
+                alpha=model_cfg["alpha"],
+                L1_wt=model_cfg["l1_ratio"],
+                maxiter=200
+            )
+            return result.predict(X_val)
+
+        def metric_fn(y_true, y_pred, weight):
+            if self.ctx.task_type == 'classification':
+                y_pred_clipped = np.clip(y_pred, EPS, 1 - EPS)
+                return log_loss(y_true, y_pred_clipped, sample_weight=weight)
+            return regression_loss(
+                y_true,
+                y_pred,
+                weight,
+                loss_name=loss_name,
+                tweedie_power=metric_ctx.get("tweedie_power"),
+            )
+
+        return self.cross_val_generic(
+            trial=trial,
+            hyperparameter_space=param_space,
+            data_provider=data_provider,
+            model_builder=model_builder,
+            metric_fn=metric_fn,
+            preprocess_fn=preprocess_fn,
+            fit_predict_fn=fit_predict
+        )
+
+    def train(self) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
+        tweedie_power = self.best_params.get('tweedie_power')
+        family = self._select_family(tweedie_power)
+
+        X_train = self._prepare_design(self.ctx.train_oht_scl_data)
+        y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
+        w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+
+        glm = sm.GLM(y_train, X_train, family=family,
+                     freq_weights=w_train)
+        self.model = glm.fit_regularized(
+            alpha=self.best_params['alpha'],
+            L1_wt=self.best_params['l1_ratio'],
+            maxiter=300
+        )
+
+        self.ctx.glm_best = self.model
+        self.ctx.model_label += [self.label]
+        self._predict_and_cache(
+            self.model,
+            'glm',
+            design_fn=lambda train: self._prepare_design(
+                self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
+            )
+        )
+
+    def ensemble_predict(self, k: int) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
+        k = max(2, int(k))
+        data = self.ctx.train_oht_scl_data
+        if data is None:
+            raise RuntimeError("Missing standardized data for GLM ensemble.")
+        X_all = data[self.ctx.var_nmes]
+        y_all = data[self.ctx.resp_nme]
+        w_all = data[self.ctx.weight_nme]
+        X_test = self.ctx.test_oht_scl_data
+        if X_test is None:
+            raise RuntimeError("Missing standardized test data for GLM ensemble.")
+
+        n_samples = len(X_all)
+        X_all_design = self._prepare_design(data)
+        X_test_design = self._prepare_design(X_test)
+        tweedie_power = self.best_params.get('tweedie_power')
+        family = self._select_family(tweedie_power)
+
+        split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
+        if split_iter is None:
+            _log(
+                f"[GLM Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train_sum = np.zeros(n_samples, dtype=np.float64)
+        preds_test_sum = np.zeros(len(X_test_design), dtype=np.float64)
+
+        split_count = 0
+        for train_idx, _val_idx in split_iter:
+            X_train = X_all_design.iloc[train_idx]
+            y_train = y_all.iloc[train_idx]
+            w_train = w_all.iloc[train_idx]
+
+            glm = sm.GLM(y_train, X_train, family=family, freq_weights=w_train)
+            result = glm.fit_regularized(
+                alpha=self.best_params['alpha'],
+                L1_wt=self.best_params['l1_ratio'],
+                maxiter=300
+            )
+            pred_train = result.predict(X_all_design)
+            pred_test = result.predict(X_test_design)
+            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
+            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
+            split_count += 1
+
+        if split_count < 1:
+            _log(
+                f"[GLM Ensemble] no CV splits generated; skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train = preds_train_sum / float(split_count)
+        preds_test = preds_test_sum / float(split_count)
+        self._cache_predictions("glm", preds_train, preds_test)
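Both trainers now route console output through the package logger instead of bare print calls. A minimal sketch of the same pattern follows, using get_logger and log_print only as they are imported and called in the hunks above; any behavior of log_print beyond forwarding print-style arguments to the given logger is an assumption.

from ins_pricing.utils import get_logger, log_print

_logger = get_logger("ins_pricing.trainer.example")


def _log(*args, **kwargs) -> None:
    # Forward print-style arguments (including flush=True) to the shared logger,
    # mirroring the helpers added to trainer_ft.py and trainer_glm.py.
    log_print(_logger, *args, **kwargs)


_log("[Example] messages now go through the ins_pricing logger.", flush=True)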