ins-pricing 0.4.3-py3-none-any.whl → 0.4.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +66 -74
- ins_pricing/cli/BayesOpt_incremental.py +904 -904
- ins_pricing/cli/bayesopt_entry_runner.py +1442 -1442
- ins_pricing/frontend/README.md +573 -419
- ins_pricing/frontend/config_builder.py +1 -0
- ins_pricing/modelling/README.md +67 -0
- ins_pricing/modelling/core/bayesopt/README.md +59 -0
- ins_pricing/modelling/core/bayesopt/config_preprocess.py +12 -0
- ins_pricing/modelling/core/bayesopt/core.py +3 -1
- ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +830 -809
- ins_pricing/setup.py +1 -1
- {ins_pricing-0.4.3.dist-info → ins_pricing-0.4.5.dist-info}/METADATA +182 -162
- {ins_pricing-0.4.3.dist-info → ins_pricing-0.4.5.dist-info}/RECORD +15 -22
- ins_pricing/CHANGELOG.md +0 -272
- ins_pricing/RELEASE_NOTES_0.2.8.md +0 -344
- ins_pricing/docs/LOSS_FUNCTIONS.md +0 -78
- ins_pricing/docs/modelling/BayesOpt_USAGE.md +0 -945
- ins_pricing/docs/modelling/README.md +0 -34
- ins_pricing/frontend/QUICKSTART.md +0 -152
- ins_pricing/modelling/core/bayesopt/PHASE2_REFACTORING_SUMMARY.md +0 -449
- ins_pricing/modelling/core/bayesopt/PHASE3_REFACTORING_SUMMARY.md +0 -406
- ins_pricing/modelling/core/bayesopt/REFACTORING_SUMMARY.md +0 -247
- {ins_pricing-0.4.3.dist-info → ins_pricing-0.4.5.dist-info}/WHEEL +0 -0
- {ins_pricing-0.4.3.dist-info → ins_pricing-0.4.5.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/ins_pricing/modelling/README.md
@@ -0,0 +1,67 @@
+# Modelling
+
+This directory contains reusable training tooling and frameworks centered on BayesOpt.
+
+## Key locations
+
+- `core/bayesopt/` - core training/tuning package
+- `explain/` - explainability helpers
+- `plotting/` - plotting utilities
+- `ins_pricing/cli/` - CLI entry points
+- `examples/` - example configs and notebooks (repo only)
+
+## Common usage
+
+- CLI training: `python ins_pricing/cli/BayesOpt_entry.py --config-json config_template.json`
+- Notebook API: `from ins_pricing.modelling import BayesOptModel`
+
+## Explainability
+
+- CLI: `python ins_pricing/cli/Explain_entry.py --config-json config_explain_template.json`
+- Notebook: `examples/04 Explain_Run.ipynb`
+
+## Loss functions
+
+Configure the regression/classification loss with `loss_name` in the BayesOpt config.
+
+Supported `loss_name` values:
+- `auto` (default): legacy behavior based on model name
+- `tweedie`: Tweedie deviance
+- `poisson`: Poisson deviance
+- `gamma`: Gamma deviance
+- `mse`: mean squared error
+- `mae`: mean absolute error
+
+Mapping summary:
+- Tweedie deviance -> `tweedie`
+- Poisson deviance -> `poisson`
+- Gamma deviance -> `gamma`
+- Mean squared error -> `mse`
+- Mean absolute error -> `mae`
+- Classification log loss -> `logloss` (classification only)
+- Classification BCE -> `bce` (classification only)
+
+Classification tasks:
+- `loss_name` can be `auto`, `logloss`, or `bce`.
+- Training uses `BCEWithLogits` for torch models; evaluation uses log loss.
+
+Where to set `loss_name`:
+
+```json
+{
+  "task_type": "regression",
+  "loss_name": "mse"
+}
+```
+
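The same key drives classification runs; a minimal sketch using only values documented above (the `task_type` string is assumed to mirror the regression example):

```json
{
  "task_type": "classification",
  "loss_name": "logloss"
}
```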
+Behavior notes:
+- When `loss_name` is `mse` or `mae`, tuning does not sample Tweedie power.
+- When `loss_name` is `poisson` or `gamma`, power is fixed (1.0 / 2.0).
+- When `loss_name` is `tweedie`, power is sampled as usual.
+- XGBoost objective is selected from the loss name.
+
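The last bullet states that the XGBoost objective is selected from the loss name, but the mapping itself is not shown in this diff. The sketch below is one plausible implementation using standard XGBoost objective strings; it is not the package's actual internal logic.

```python
# Illustrative loss_name -> XGBoost objective mapping (assumed, not taken
# from the package). Objective strings are standard XGBoost names.
XGB_OBJECTIVES = {
    "tweedie": "reg:tweedie",
    "poisson": "count:poisson",
    "gamma": "reg:gamma",
    "mse": "reg:squarederror",
    "mae": "reg:absoluteerror",
    "logloss": "binary:logistic",  # classification only
    "bce": "binary:logistic",      # classification only
}

def xgb_objective(loss_name: str) -> str:
    """Return an XGBoost objective for a configured loss_name."""
    try:
        return XGB_OBJECTIVES[loss_name]
    except KeyError:
        raise ValueError(f"Unsupported loss_name: {loss_name!r}")
```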
+## Notes
+
+- Models load from `output_dir/model` by default (override with `explain.model_dir`).
+- Training outputs are written to `plot/`, `Results/`, and `model/` under `output_dir`.
+- Keep large data and secrets outside the repo; use environment variables or `.env` files.
--- /dev/null
+++ b/ins_pricing/modelling/core/bayesopt/README.md
@@ -0,0 +1,59 @@
+# BayesOpt
+
+BayesOpt is the training/tuning core for GLM, XGBoost, ResNet, FT-Transformer, and GNN workflows.
+It supports JSON-driven CLI runs and a Python API for notebooks/scripts.
+
+## Recommended API (config-based)
+
+```python
+from ins_pricing.modelling.core.bayesopt import BayesOptConfig
+from ins_pricing.modelling import BayesOptModel
+
+config = BayesOptConfig(
+    model_nme="my_model",
+    resp_nme="target",
+    weight_nme="weight",
+    factor_nmes=["f1", "f2"],
+    cate_list=["f2"],
+    task_type="regression",
+    epochs=50,
+    output_dir="./Results",
+)
+
+model = BayesOptModel(train_data, test_data, config=config)
+model.optimize_model("xgb", max_evals=50)
+```
+
+## Load config from file
+
+```python
+from ins_pricing.modelling.core.bayesopt import BayesOptConfig
+from ins_pricing.modelling import BayesOptModel
+
+config = BayesOptConfig.from_file("config.json")
+model = BayesOptModel(train_data, test_data, config=config)
+```
+
+## CLI entry
+
+```bash
+python ins_pricing/cli/BayesOpt_entry.py --config-json config_template.json
+```
+
+## FT roles
+
+- `model`: FT is a prediction model (writes `pred_ft`).
+- `embedding`: FT trains with labels but exports embeddings (`pred_<prefix>_*`).
+- `unsupervised_embedding`: FT trains without labels and exports embeddings.
+
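Per the `BayesOptConfig` docstring change further down in this diff, the role is selected through the `ft_role` config key; a minimal sketch (other keys as in the API example above):

```json
{
  "model_nme": "my_model",
  "task_type": "regression",
  "ft_role": "embedding"
}
```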
+## Output layout
+
+`output_dir/` contains:
+- `plot/` plots and diagnostics
+- `Results/` metrics, params, and snapshots
+- `model/` saved models
+
+## Notes
+
+- Relative paths in config are resolved from the config file directory.
+- For multi-GPU, use `torchrun` and set `runner.nproc_per_node` in config.
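A sketch of that multi-GPU launch; the GPU count is an assumption, and the `runner.nproc_per_node` nesting follows the note above rather than a verified schema:

```bash
# Assumes 4 local GPUs; mirror the value in the config's runner block,
# e.g. "runner": {"nproc_per_node": 4}.
torchrun --nproc_per_node=4 ins_pricing/cli/BayesOpt_entry.py --config-json config_template.json
```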
--- a/ins_pricing/modelling/core/bayesopt/config_preprocess.py
+++ b/ins_pricing/modelling/core/bayesopt/config_preprocess.py
@@ -97,6 +97,7 @@ class BayesOptConfig:
         use_gnn_ddp: Use DDP for GNN
         ft_role: FT-Transformer role ('model', 'embedding', 'unsupervised_embedding')
         cv_strategy: CV strategy ('random', 'group', 'time', 'stratified')
+        build_oht: Whether to build one-hot encoded features (default True)
 
     Example:
         >>> config = BayesOptConfig(
@@ -192,6 +193,7 @@ class BayesOptConfig:
     preprocess_artifact_path: Optional[str] = None
     plot_path_style: str = "nested"
     bo_sample_limit: Optional[int] = None
+    build_oht: bool = True
     cache_predictions: bool = False
     prediction_cache_dir: Optional[str] = None
     prediction_cache_format: str = "parquet"
@@ -465,6 +467,16 @@ class DatasetPreprocessor:
         self.num_features = [
             nme for nme in cfg.factor_nmes if nme not in cate_list]
 
+        build_oht = bool(getattr(cfg, "build_oht", True))
+        if not build_oht:
+            print("[Preprocess] build_oht=False; skip one-hot features.", flush=True)
+            self.train_oht_data = None
+            self.test_oht_data = None
+            self.train_oht_scl_data = None
+            self.test_oht_scl_data = None
+            self.var_nmes = list(cfg.factor_nmes)
+            return self
+
         # Memory optimization: Single copy + in-place operations
         train_oht = self.train_data[cfg.factor_nmes +
                                     [cfg.weight_nme] + [cfg.resp_nme]].copy()
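Since `build_oht` defaults to `True`, existing configs keep building one-hot features; skipping them for workflows that do not need them would presumably be a one-line config change. A sketch:

```json
{
  "task_type": "regression",
  "build_oht": false
}
```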
--- a/ins_pricing/modelling/core/bayesopt/core.py
+++ b/ins_pricing/modelling/core/bayesopt/core.py
@@ -201,6 +201,8 @@ class BayesOptModel(BayesOptPlottingMixin, BayesOptExplainMixin):
             raise ValueError("weight_nme is required when not using config parameter")
 
         # Infer categorical features if needed
+        # Only use user-specified categorical list for one-hot; do not auto-infer.
+        user_cate_list = [] if cate_list is None else list(cate_list)
         inferred_factors, inferred_cats = infer_factor_and_cate_list(
             train_df=train_data,
             test_df=test_data,
@@ -208,7 +210,7 @@
             weight_nme=weight_nme,
             binary_resp_nme=binary_resp_nme,
             factor_nmes=factor_nmes,
-            cate_list=cate_list,
+            cate_list=user_cate_list,
             infer_categorical_max_unique=int(infer_categorical_max_unique),
             infer_categorical_max_ratio=float(infer_categorical_max_ratio),
         )
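The visible effect of this hunk: `cate_list=None` now yields an empty categorical list rather than auto-inferred categories. A sketch of the non-config constructor path with hypothetical column names (parameter names beyond those shown in this diff are assumptions):

```python
# Hypothetical columns; illustrates the 0.4.5 behavior change. From this
# version on, only the explicitly passed cate_list is one-hot encoded.
model = BayesOptModel(
    train_data,
    test_data,
    resp_nme="target",
    weight_nme="weight",
    factor_nmes=["age", "region"],
    cate_list=["region"],  # None would now mean "no categorical features"
)
```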