pyiblm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyiblm-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Your Name
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pyiblm-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyiblm
3
+ Version: 0.1.0
4
+ Summary: Interpretable Boosted Linear Model (IBLM): A transparent machine learning approach combining generalized linear models with gradient boosting
5
+ License: MIT
6
+ Author: Your Name
7
+ Author-email: you@example.com
8
+ Requires-Python: >=3.12,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Provides-Extra: all
14
+ Provides-Extra: explainability
15
+ Provides-Extra: visualization
16
+ Requires-Dist: altair (>=5.4.0,<6.0.0) ; extra == "visualization" or extra == "all"
17
+ Requires-Dist: altair-saver (>=0.5.0,<0.6.0) ; extra == "visualization" or extra == "all"
18
+ Requires-Dist: joblib (>=1.3.0,<2.0.0)
19
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
20
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
21
+ Requires-Dist: plotnine (>=0.15.0,<0.16.0) ; extra == "visualization" or extra == "all"
22
+ Requires-Dist: pydantic (>=2.10.0,<3.0.0)
23
+ Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
24
+ Requires-Dist: shap (>=0.45.0,<0.46.0) ; extra == "explainability" or extra == "all"
25
+ Requires-Dist: statsmodels (>=0.14.0,<0.15.0)
26
+ Requires-Dist: vl-convert-python (>=1.6.1,<2.0.0) ; extra == "visualization" or extra == "all"
27
+ Requires-Dist: xgboost (>=2.1.0,<3.0.0)
28
+ Description-Content-Type: text/markdown
29
+
30
+ # PyIBLM: Interpretable Boosted Linear Model
31
+
32
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
34
+
35
+ **PyIBLM** is a Python package implementing the Interpretable Boosted Linear Model (IBLM), a transparent machine learning approach that combines the interpretability of Generalized Linear Models (GLMs) with the predictive power of gradient boosting.
36
+
37
+ ## Features
38
+
39
+ - 🎯 **Interpretable by design**: Combines GLM transparency with boosting performance
40
+ - 📊 **Multiple model families**: Poisson, Tweedie, Gaussian, and more (via statsmodels)
41
+ - 🚀 **Gradient boosting integration**: Uses scikit-learn's HistGradientBoostingRegressor and XGBoost
42
+ - 📈 **SHAP explanations**: Built-in feature importance and contribution analysis
43
+ - 🔍 **Comprehensive diagnostics**: Pinball scores, deviance metrics, and model comparisons
44
+ - 📉 **Visualization tools**: Beta corrections, density plots, and correction corridors
45
+
46
+ ## Installation
47
+
48
+ ### Basic Installation
49
+ ```bash
50
+ pip install pyiblm
51
+ ```
52
+
53
+ ### With Visualization Support
54
+ ```bash
55
+ pip install pyiblm[visualization]
56
+ ```
57
+
58
+ ### With Explainability Features
59
+ ```bash
60
+ pip install pyiblm[explainability]
61
+ ```
62
+
63
+ ### Full Installation
64
+ ```bash
65
+ pip install pyiblm[all]
66
+ ```
67
+
68
+ ## Quick Start
69
+
70
+ ```python
71
+ from pyBLM import (
72
+ IBLMModel,
73
+ BoosterConfig,
74
+ GLMConfig,
75
+ TrainingConfig,
76
+ load_freMTPL2freq,
77
+ )
78
+
79
+ # Load example data
80
+ data = load_freMTPL2freq("data/freMTPL2freq.csv")
81
+ train, validate, test = data.split_into_train_validate_test(seed=123)
82
+
83
+ # Configure the model
84
+ config = TrainingConfig(
85
+ response="ClaimRate",
86
+ glm=GLMConfig(family="poisson"),
87
+ booster=BoosterConfig(
88
+ nrounds=500,
89
+ early_stopping_rounds=20,
90
+ params={"max_depth": 3, "eta": 0.025},
91
+ ),
92
+ )
93
+
94
+ # Train the model
95
+ model = IBLMModel(config).fit(train, validate)
96
+
97
+ # Make predictions
98
+ predictions = model.predict(test)
99
+
100
+ # Get GLM parameters
101
+ glm_params = model.get_glm_params()
102
+ print(glm_params)
103
+ ```
104
+
105
+ ## Core Components
106
+
107
+ ### Model Classes
108
+ - **`IBLMModel`**: Main model class combining GLM and gradient boosting
109
+ - **`BoosterConfig`**: Configuration for the gradient boosting component
110
+ - **`GLMConfig`**: Configuration for the GLM component
111
+ - **`TrainingConfig`**: Overall training configuration
112
+
113
+ ### Data Handling
114
+ - **`load_freMTPL2freq()`**: Load example insurance dataset
115
+ - **`FeaturePreprocessor`**: Automatic feature encoding and preprocessing
116
+
117
+ ### Evaluation
118
+ - **`poisson_deviance()`**: Compute Poisson deviance
119
+ - **`get_pinball_scores()`**: Multi-model pinball loss comparison
120
+ - **`calculate_deviance()`**: Family-based deviance calculation
121
+
122
+ ### Explanation & Visualization
123
+ - **`explain()`**: Generate explanation object with SHAP values
124
+ - **`IBLMPlotter`**: Visualization utilities for model interpretation
125
+ - **`correction_corridor()`**: Visualize model correction patterns
126
+ - **`extract_booster_shap()`**: Extract SHAP values from booster
127
+
128
+ ## Documentation
129
+
130
+ For detailed documentation and tutorials, see:
131
+ - `examples/` - Example scripts and use cases
132
+ - `dev.ipynb` - Development notebook with comprehensive example
133
+
134
+ ## Development
135
+
136
+ This package is actively developed. Contributions are welcome!
137
+
138
+ ### Development Setup
139
+ ```bash
140
+ git clone https://github.com/ZZhouGit/pyBLM.git
141
+ cd pyBLM
142
+ python -m venv .venv
143
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
144
+ pip install -e ".[all]"
145
+ poetry install --with dev
146
+ ```
147
+
148
+ ### Running Tests
149
+ ```bash
150
+ pytest tests/
151
+ ```
152
+
153
+ ### Development Notebook
154
+ Open `dev.ipynb` in Jupyter to see comprehensive examples:
155
+ ```bash
156
+ jupyter notebook dev.ipynb
157
+ ```
158
+
159
+ ## Requirements
160
+
161
+ - Python 3.12+
162
+ - pandas >= 2.0.0
163
+ - numpy >= 2.0.0
164
+ - scikit-learn >= 1.5.0
165
+ - xgboost >= 2.1.0
166
+ - pydantic >= 2.10.0
167
+ - statsmodels >= 0.14.0
168
+
169
+ Optional dependencies:
170
+ - plotnine >= 0.15.0 (for visualization)
171
+ - altair >= 5.4.0 (for interactive plots)
172
+ - shap >= 0.45.0 (for SHAP explanations)
173
+
174
+ ## Citation
175
+
176
+ If you use PyIBLM in your research, please cite:
177
+
178
+ ```bibtex
179
+ @software{pyiblm2025,
180
+ title={PyIBLM: Interpretable Boosted Linear Models},
181
+ author={Your Name},
182
+ year={2025},
183
+ url={https://github.com/ZZhouGit/pyBLM},
184
+ }
185
+ ```
186
+
187
+ ## License
188
+
189
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
190
+
191
+ ## Authors
192
+
193
+ - Your Name
194
+
195
+ ## Acknowledgments
196
+
197
+ Built with [scikit-learn](https://scikit-learn.org/), [XGBoost](https://xgboost.readthedocs.io/), [SHAP](https://github.com/shap/shap), and [statsmodels](https://www.statsmodels.org/).
198
+
pyiblm-0.1.0/README.md ADDED
@@ -0,0 +1,168 @@
1
+ # PyIBLM: Interpretable Boosted Linear Model
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
5
+
6
+ **PyIBLM** is a Python package implementing the Interpretable Boosted Linear Model (IBLM), a transparent machine learning approach that combines the interpretability of Generalized Linear Models (GLMs) with the predictive power of gradient boosting.
7
+
8
+ ## Features
9
+
10
+ - 🎯 **Interpretable by design**: Combines GLM transparency with boosting performance
11
+ - 📊 **Multiple model families**: Poisson, Tweedie, Gaussian, and more (via statsmodels)
12
+ - 🚀 **Gradient boosting integration**: Uses scikit-learn's HistGradientBoostingRegressor and XGBoost
13
+ - 📈 **SHAP explanations**: Built-in feature importance and contribution analysis
14
+ - 🔍 **Comprehensive diagnostics**: Pinball scores, deviance metrics, and model comparisons
15
+ - 📉 **Visualization tools**: Beta corrections, density plots, and correction corridors
16
+
17
+ ## Installation
18
+
19
+ ### Basic Installation
20
+ ```bash
21
+ pip install pyiblm
22
+ ```
23
+
24
+ ### With Visualization Support
25
+ ```bash
26
+ pip install pyiblm[visualization]
27
+ ```
28
+
29
+ ### With Explainability Features
30
+ ```bash
31
+ pip install pyiblm[explainability]
32
+ ```
33
+
34
+ ### Full Installation
35
+ ```bash
36
+ pip install pyiblm[all]
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from pyBLM import (
43
+ IBLMModel,
44
+ BoosterConfig,
45
+ GLMConfig,
46
+ TrainingConfig,
47
+ load_freMTPL2freq,
48
+ )
49
+
50
+ # Load example data
51
+ data = load_freMTPL2freq("data/freMTPL2freq.csv")
52
+ train, validate, test = data.split_into_train_validate_test(seed=123)
53
+
54
+ # Configure the model
55
+ config = TrainingConfig(
56
+ response="ClaimRate",
57
+ glm=GLMConfig(family="poisson"),
58
+ booster=BoosterConfig(
59
+ nrounds=500,
60
+ early_stopping_rounds=20,
61
+ params={"max_depth": 3, "eta": 0.025},
62
+ ),
63
+ )
64
+
65
+ # Train the model
66
+ model = IBLMModel(config).fit(train, validate)
67
+
68
+ # Make predictions
69
+ predictions = model.predict(test)
70
+
71
+ # Get GLM parameters
72
+ glm_params = model.get_glm_params()
73
+ print(glm_params)
74
+ ```
75
+
76
+ ## Core Components
77
+
78
+ ### Model Classes
79
+ - **`IBLMModel`**: Main model class combining GLM and gradient boosting
80
+ - **`BoosterConfig`**: Configuration for the gradient boosting component
81
+ - **`GLMConfig`**: Configuration for the GLM component
82
+ - **`TrainingConfig`**: Overall training configuration
83
+
84
+ ### Data Handling
85
+ - **`load_freMTPL2freq()`**: Load example insurance dataset
86
+ - **`FeaturePreprocessor`**: Automatic feature encoding and preprocessing
87
+
88
+ ### Evaluation
89
+ - **`poisson_deviance()`**: Compute Poisson deviance
90
+ - **`get_pinball_scores()`**: Multi-model pinball loss comparison
91
+ - **`calculate_deviance()`**: Family-based deviance calculation
92
+
93
+ ### Explanation & Visualization
94
+ - **`explain()`**: Generate explanation object with SHAP values
95
+ - **`IBLMPlotter`**: Visualization utilities for model interpretation
96
+ - **`correction_corridor()`**: Visualize model correction patterns
97
+ - **`extract_booster_shap()`**: Extract SHAP values from booster
98
+
99
+ ## Documentation
100
+
101
+ For detailed documentation and tutorials, see:
102
+ - `examples/` - Example scripts and use cases
103
+ - `dev.ipynb` - Development notebook with comprehensive example
104
+
105
+ ## Development
106
+
107
+ This package is actively developed. Contributions are welcome!
108
+
109
+ ### Development Setup
110
+ ```bash
111
+ git clone https://github.com/ZZhouGit/pyBLM.git
112
+ cd pyBLM
113
+ python -m venv .venv
114
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
115
+ pip install -e ".[all]"
116
+ poetry install --with dev
117
+ ```
118
+
119
+ ### Running Tests
120
+ ```bash
121
+ pytest tests/
122
+ ```
123
+
124
+ ### Development Notebook
125
+ Open `dev.ipynb` in Jupyter to see comprehensive examples:
126
+ ```bash
127
+ jupyter notebook dev.ipynb
128
+ ```
129
+
130
+ ## Requirements
131
+
132
+ - Python 3.12+
133
+ - pandas >= 2.0.0
134
+ - numpy >= 2.0.0
135
+ - scikit-learn >= 1.5.0
136
+ - xgboost >= 2.1.0
137
+ - pydantic >= 2.10.0
138
+ - statsmodels >= 0.14.0
139
+
140
+ Optional dependencies:
141
+ - plotnine >= 0.15.0 (for visualization)
142
+ - altair >= 5.4.0 (for interactive plots)
143
+ - shap >= 0.45.0 (for SHAP explanations)
144
+
145
+ ## Citation
146
+
147
+ If you use PyIBLM in your research, please cite:
148
+
149
+ ```bibtex
150
+ @software{pyiblm2025,
151
+ title={PyIBLM: Interpretable Boosted Linear Models},
152
+ author={Your Name},
153
+ year={2025},
154
+ url={https://github.com/ZZhouGit/pyBLM},
155
+ }
156
+ ```
157
+
158
+ ## License
159
+
160
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
161
+
162
+ ## Authors
163
+
164
+ - Your Name
165
+
166
+ ## Acknowledgments
167
+
168
+ Built with [scikit-learn](https://scikit-learn.org/), [XGBoost](https://xgboost.readthedocs.io/), [SHAP](https://github.com/shap/shap), and [statsmodels](https://www.statsmodels.org/).
@@ -0,0 +1,96 @@
1
+ [tool.poetry]
2
+ name = "pyiblm"
3
+ version = "0.1.0"
4
+ description = "Interpretable Boosted Linear Model (IBLM): A transparent machine learning approach combining generalized linear models with gradient boosting"
5
+ authors = ["Your Name <you@example.com>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+
9
+ [[tool.poetry.packages]]
10
+ include = "pyBLM"
11
+ from = "src"
12
+
13
+ [tool.poetry.dependencies]
14
+ python = "^3.12"
15
+ pandas = "^2.0.0"
16
+ numpy = "^2.0.0"
17
+ scikit-learn = "^1.5.0"
18
+ xgboost = "^2.1.0"
19
+ pydantic = "^2.10.0"
20
+ statsmodels = "^0.14.0"
21
+ joblib = "^1.3.0"
22
+
23
+ # Optional visualization dependencies
24
+ plotnine = {version = "^0.15.0", optional = true}
25
+ altair = {version = "^5.4.0", optional = true}
26
+ altair-saver = {version = "^0.5.0", optional = true}
27
+ vl-convert-python = {version = "^1.6.1", optional = true}
28
+ shap = {version = "^0.45.0", optional = true}
29
+
30
+
31
+ [tool.poetry.extras]
32
+ visualization = ["plotnine", "altair", "altair-saver", "vl-convert-python"]
33
+ explainability = ["shap"]
34
+ all = ["plotnine", "altair", "altair-saver", "vl-convert-python", "shap"]
35
+
36
+ [tool.poetry.group.dev.dependencies]
37
+ pytest = "^9.0.2"
38
+ ipykernel = "^7.2.0"
39
+ jupyter = "^1.0.0"
40
+ black = "^24.1.0"
41
+ isort = "^5.13.0"
42
+ flake8 = "^7.0.0"
43
+
44
+ [tool.black]
45
+ line-length = 100
46
+ target-version = ["py312"]
47
+ include = '\.pyi?$'
48
+ extend-exclude = '''
49
+ /(
50
+ # directories
51
+ \.eggs
52
+ | \.git
53
+ | \.hg
54
+ | \.mypy_cache
55
+ | \.tox
56
+ | \.venv
57
+ | build
58
+ | dist
59
+ )/
60
+ '''
61
+
62
+ [tool.isort]
63
+ profile = "black"
64
+ line_length = 100
65
+ py_version = 312
66
+ skip_gitignore = true
67
+
68
+ [tool.pytest.ini_options]
69
+ testpaths = ["tests"]
70
+ python_files = "test_*.py"
71
+ python_classes = "Test*"
72
+ python_functions = "test_*"
73
+ addopts = "--strict-markers -v"
74
+ markers = [
75
+ "unit: Unit tests",
76
+ "integration: Integration tests",
77
+ "slow: Slow tests",
78
+ ]
79
+
80
+ [tool.coverage.run]
81
+ source = ["src/pyBLM"]
82
+ omit = ["*/tests/*", "*/test_*.py"]
83
+
84
+ [tool.coverage.report]
85
+ exclude_lines = [
86
+ "pragma: no cover",
87
+ "def __repr__",
88
+ "raise AssertionError",
89
+ "raise NotImplementedError",
90
+ "if __name__ == .__main__.:",
91
+ "if TYPE_CHECKING:",
92
+ ]
93
+
94
+ [build-system]
95
+ requires = ["poetry-core"]
96
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,39 @@
1
"""Public API of the pyBLM package.

Re-exports the configuration, dataset, modelling, metrics, explanation,
plotting and validation entry points under the package root.
"""
from .config import BoosterConfig, GLMConfig, TrainingConfig
from .dataset import load_freMTPL2freq, split_into_train_validate_test
from .metrics import calculate_deviance, get_pinball_scores, poisson_deviance
from .model import IBLMModel
from .explain import (
    Explanation,
    correction_corridor,
    data_to_onehot,
    detect_outliers,
    explain,
    explain_iblm,
    extract_booster_shap,
)
from .plotting import IBLMPlotter
from .validation import check_data_variability, check_iblm_model, check_required_names

# Explicit public surface for `from pyBLM import *` and documentation tools.
__all__ = [
    # configuration
    "BoosterConfig",
    "GLMConfig",
    "TrainingConfig",
    # data handling
    "load_freMTPL2freq",
    "split_into_train_validate_test",
    # modelling
    "IBLMModel",
    # explanation
    "Explanation",
    "correction_corridor",
    "data_to_onehot",
    "detect_outliers",
    "explain",
    "explain_iblm",
    "extract_booster_shap",
    # metrics
    "calculate_deviance",
    "get_pinball_scores",
    "poisson_deviance",
    # plotting
    "IBLMPlotter",
    # validation
    "check_data_variability",
    "check_iblm_model",
    "check_required_names",
]
39
+
@@ -0,0 +1,60 @@
1
+ """Configuration objects for the pyBLM implementation.
2
+
3
+ These classes are Pydantic models that validate user input while keeping
4
+ parameters discoverable and IDE friendly.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Dict, Optional, Literal
9
+
10
+ from pydantic import BaseModel, Field, field_validator
11
+
12
+
13
class GLMConfig(BaseModel):
    """Settings for the GLM component."""

    # Distribution family handed to the GLM backend.
    family: Literal["poisson", "gamma", "tweedie", "gaussian"] = "poisson"
    # Tweedie variance power; must lie strictly inside (1, 2).
    tweedie_power: float = 1.5
    # Whether to strip the fitted GLM — presumably drops heavyweight fitted
    # state to shrink the saved model; confirm against the model module.
    strip_glm: bool = True

    @field_validator("tweedie_power")
    @classmethod
    def _validate_power(cls, value: float) -> float:
        """Accept only variance powers in the open interval (1, 2)."""
        if 1 < value < 2:
            return value
        raise ValueError("tweedie_power should be in (1, 2) to ensure a valid Tweedie distribution")
26
+
27
+
28
class BoosterConfig(BaseModel):
    """Settings forwarded to the XGBoost booster."""

    # Number of boosting rounds to run; must be positive.
    nrounds: int = 500
    # Raw parameter dict passed through to XGBoost untouched.
    params: Dict[str, Any] = Field(default_factory=dict)
    # Rounds without improvement before training stops early.
    early_stopping_rounds: int = 25
    # Verbosity of per-round evaluation logging.
    verbose_eval: int = 0
    # Log every N-th round when verbose evaluation is on.
    print_every_n: int = 50
    # Whether the evaluation metric should be maximized (None = backend default).
    maximize: Optional[bool] = None
    # Name of a custom evaluation metric, if any.
    custom_metric: Optional[str] = None
    # Optional objective override for the booster.
    objective: Optional[str] = None

    @field_validator("nrounds")
    @classmethod
    def _validate_nrounds(cls, value: int) -> int:
        """Require at least one boosting round."""
        if value > 0:
            return value
        raise ValueError("nrounds must be positive")
46
+
47
+
48
class TrainingConfig(BaseModel):
    """Top level training configuration.

    Bundles the response column name with the GLM and booster settings.
    """

    # Name of the response (target) column in the training data.
    response: str
    # Sub-configurations. Use default_factory so every TrainingConfig gets a
    # fresh instance, rather than instantiating one shared default object at
    # class-definition time (the pydantic-recommended idiom for model-typed
    # defaults).
    glm: GLMConfig = Field(default_factory=GLMConfig)
    booster: BoosterConfig = Field(default_factory=BoosterConfig)

    @field_validator("response")
    @classmethod
    def _non_empty_response(cls, value: str) -> str:
        """Reject an empty response column name."""
        if not value:
            raise ValueError("response must be a non-empty column name")
        return value
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ """Small dataset helpers used by pyBLM."""
4
+
5
+ from pathlib import Path
6
+ from typing import Optional, Tuple
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
+ def split_into_train_validate_test(
13
+ df: pd.DataFrame,
14
+ train_prop: float = 0.7,
15
+ validate_prop: float = 0.15,
16
+ test_prop: float = 0.15,
17
+ seed: Optional[int] = None,
18
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
19
+ """Randomly split a dataframe into train/validate/test subsets.
20
+
21
+ Unnamed index columns are dropped automatically.
22
+
23
+ Returns:
24
+ Tuple of (train_df, validate_df, test_df)
25
+ """
26
+ df = _drop_unnamed_columns(df)
27
+ if not np.isclose(train_prop + validate_prop + test_prop, 1.0):
28
+ raise ValueError("train_prop + validate_prop + test_prop must sum to 1")
29
+
30
+ rng = np.random.default_rng(seed)
31
+ buckets = rng.choice(
32
+ ["train", "validate", "test"],
33
+ size=len(df),
34
+ replace=True,
35
+ p=[train_prop, validate_prop, test_prop],
36
+ )
37
+
38
+ train_df = df[buckets == "train"].copy()
39
+ validate_df = df[buckets == "validate"].copy()
40
+ test_df = df[buckets == "test"].copy()
41
+
42
+ return train_df, validate_df, test_df
43
+
44
+
45
+ # Register the method on pandas DataFrame when this module is imported
46
+ # This makes it available as df.split_into_train_validate_test()
47
+ if not hasattr(pd.DataFrame, 'split_into_train_validate_test'):
48
+ pd.DataFrame.split_into_train_validate_test = split_into_train_validate_test
49
+
50
+
51
def load_freMTPL2freq(path: Optional[str | Path] = None) -> pd.DataFrame:
    """Load the preprocessed freMTPL2freq CSV from disk.

    The default location is data/freMTPL2freq.csv relative to the repository
    root. Unnamed index columns are removed; object columns are left as-is for
    callers to cast to category if desired.

    Args:
        path: Optional explicit CSV location; overrides the default.

    Returns:
        DataFrame with a derived ``ClaimRate`` column (claims per unit of
        exposure, winsorized at the 99.9th percentile) when the raw claim
        columns are present; the identifier/raw columns are then dropped.
    """
    resolved = Path(path) if path else Path(__file__).resolve().parents[2] / "data" / "freMTPL2freq.csv"
    df = pd.read_csv(resolved)
    df = _drop_unnamed_columns(df)

    # Convert ClaimNb to a rate per exposure and winsorize heavy tails.
    if "ClaimNb" in df.columns and "Exposure" in df.columns:
        with np.errstate(divide="ignore", invalid="ignore"):
            # Zero exposure would divide to inf; map it to NaN first, then
            # fill with 0 below.
            df["ClaimRate"] = df["ClaimNb"] / df["Exposure"].replace(0, np.nan)
        df["ClaimRate"] = df["ClaimRate"].fillna(0)
        cap = df["ClaimRate"].quantile(0.999)
        df["ClaimRate"] = df["ClaimRate"].clip(upper=cap)
        # Guard the vehicle-age winsorization: the original indexed the
        # column unconditionally and raised KeyError when it was absent.
        if "VehAge" in df.columns:
            df["VehAge"] = df["VehAge"].clip(upper=50)
        drop_cols = [c for c in ("IDpol", "Exposure", "ClaimNb") if c in df.columns]
        df = df.drop(columns=drop_cols)

    return df
75
+
76
+
77
+ def _drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
78
+ cols_to_drop = [c for c in df.columns if c.startswith("Unnamed") or c == ""]
79
+ if cols_to_drop:
80
+ df = df.drop(columns=cols_to_drop)
81
+ return df