pyiblm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyiblm-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Your Name
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pyiblm-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyiblm
3
+ Version: 0.1.0
4
+ Summary: Interpretable Boosted Linear Model (IBLM): A transparent machine learning approach combining generalized linear models with gradient boosting
5
+ License: MIT
6
+ Author: Your Name
7
+ Author-email: you@example.com
8
+ Requires-Python: >=3.12,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Provides-Extra: all
14
+ Provides-Extra: explainability
15
+ Provides-Extra: visualization
16
+ Requires-Dist: altair (>=5.4.0,<6.0.0) ; extra == "visualization" or extra == "all"
17
+ Requires-Dist: altair-saver (>=0.5.0,<0.6.0) ; extra == "visualization" or extra == "all"
18
+ Requires-Dist: joblib (>=1.3.0,<2.0.0)
19
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
20
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
21
+ Requires-Dist: plotnine (>=0.15.0,<0.16.0) ; extra == "visualization" or extra == "all"
22
+ Requires-Dist: pydantic (>=2.10.0,<3.0.0)
23
+ Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
24
+ Requires-Dist: shap (>=0.45.0,<0.46.0) ; extra == "explainability" or extra == "all"
25
+ Requires-Dist: statsmodels (>=0.14.0,<0.15.0)
26
+ Requires-Dist: vl-convert-python (>=1.6.1,<2.0.0) ; extra == "visualization" or extra == "all"
27
+ Requires-Dist: xgboost (>=2.1.0,<3.0.0)
28
+ Description-Content-Type: text/markdown
29
+
30
+ # PyIBLM: Interpretable Boosted Linear Model
31
+
32
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
34
+
35
+ **PyIBLM** is a Python package implementing the Interpretable Boosted Linear Model (IBLM), a transparent machine learning approach that combines the interpretability of Generalized Linear Models (GLMs) with the predictive power of gradient boosting.
36
+
37
+ ## Features
38
+
39
+ - 🎯 **Interpretable by design**: Combines GLM transparency with boosting performance
40
+ - 📊 **Multiple model families**: Poisson, Tweedie, Gaussian, and more (via statsmodels)
41
+ - 🚀 **Gradient boosting integration**: Uses scikit-learn's HistGradientBoostingRegressor and XGBoost
42
+ - 📈 **SHAP explanations**: Built-in feature importance and contribution analysis
43
+ - 🔍 **Comprehensive diagnostics**: Pinball scores, deviance metrics, and model comparisons
44
+ - 📉 **Visualization tools**: Beta corrections, density plots, and correction corridors
45
+
46
+ ## Installation
47
+
48
+ ### Basic Installation
49
+ ```bash
50
+ pip install pyiblm
51
+ ```
52
+
53
+ ### With Visualization Support
54
+ ```bash
55
+ pip install pyiblm[visualization]
56
+ ```
57
+
58
+ ### With Explainability Features
59
+ ```bash
60
+ pip install pyiblm[explainability]
61
+ ```
62
+
63
+ ### Full Installation
64
+ ```bash
65
+ pip install pyiblm[all]
66
+ ```
67
+
68
+ ## Quick Start
69
+
70
+ ```python
71
+ from pyBLM import (
72
+ IBLMModel,
73
+ BoosterConfig,
74
+ GLMConfig,
75
+ TrainingConfig,
76
+ load_freMTPL2freq,
77
+ )
78
+
79
+ # Load example data
80
+ data = load_freMTPL2freq("data/freMTPL2freq.csv")
81
+ train, validate, test = data.split_into_train_validate_test(seed=123)
82
+
83
+ # Configure the model
84
+ config = TrainingConfig(
85
+ response="ClaimRate",
86
+ glm=GLMConfig(family="poisson"),
87
+ booster=BoosterConfig(
88
+ nrounds=500,
89
+ early_stopping_rounds=20,
90
+ params={"max_depth": 3, "eta": 0.025},
91
+ ),
92
+ )
93
+
94
+ # Train the model
95
+ model = IBLMModel(config).fit(train, validate)
96
+
97
+ # Make predictions
98
+ predictions = model.predict(test)
99
+
100
+ # Get GLM parameters
101
+ glm_params = model.get_glm_params()
102
+ print(glm_params)
103
+ ```
104
+
105
+ ## Core Components
106
+
107
+ ### Model Classes
108
+ - **`IBLMModel`**: Main model class combining GLM and gradient boosting
109
+ - **`BoosterConfig`**: Configuration for the gradient boosting component
110
+ - **`GLMConfig`**: Configuration for the GLM component
111
+ - **`TrainingConfig`**: Overall training configuration
112
+
113
+ ### Data Handling
114
+ - **`load_freMTPL2freq()`**: Load example insurance dataset
115
+ - **`FeaturePreprocessor`**: Automatic feature encoding and preprocessing
116
+
117
+ ### Evaluation
118
+ - **`poisson_deviance()`**: Compute Poisson deviance
119
+ - **`get_pinball_scores()`**: Multi-model pinball loss comparison
120
+ - **`calculate_deviance()`**: Family-based deviance calculation
121
+
122
+ ### Explanation & Visualization
123
+ - **`explain()`**: Generate explanation object with SHAP values
124
+ - **`IBLMPlotter`**: Visualization utilities for model interpretation
125
+ - **`correction_corridor()`**: Visualize model correction patterns
126
+ - **`extract_booster_shap()`**: Extract SHAP values from booster
127
+
128
+ ## Documentation
129
+
130
+ For detailed documentation and tutorials, see:
131
+ - `examples/` - Example scripts and use cases
132
+ - `dev.ipynb` - Development notebook with comprehensive example
133
+
134
+ ## Development
135
+
136
+ This package is actively developed. Contributions are welcome!
137
+
138
+ ### Development Setup
139
+ ```bash
140
+ git clone https://github.com/ZZhouGit/pyBLM.git
141
+ cd pyBLM
142
+ python -m venv .venv
143
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
144
+ pip install -e ".[all]"
145
+ poetry install --with dev
146
+ ```
147
+
148
+ ### Running Tests
149
+ ```bash
150
+ pytest tests/
151
+ ```
152
+
153
+ ### Development Notebook
154
+ Open `dev.ipynb` in Jupyter to see comprehensive examples:
155
+ ```bash
156
+ jupyter notebook dev.ipynb
157
+ ```
158
+
159
+ ## Requirements
160
+
161
+ - Python 3.12+
162
+ - pandas >= 2.0.0
163
+ - numpy >= 2.0.0
164
+ - scikit-learn >= 1.5.0
165
+ - xgboost >= 2.1.0
166
+ - pydantic >= 2.10.0
167
+ - statsmodels >= 0.14.0
168
+
169
+ Optional dependencies:
170
+ - plotnine >= 0.15.0 (for visualization)
171
+ - altair >= 5.4.0 (for interactive plots)
172
+ - shap >= 0.45.0 (for SHAP explanations)
173
+
174
+ ## Citation
175
+
176
+ If you use PyIBLM in your research, please cite:
177
+
178
+ ```bibtex
179
+ @software{pyiblm2025,
180
+ title={PyIBLM: Interpretable Boosted Linear Models},
181
+ author={Your Name},
182
+ year={2025},
183
+ url={https://github.com/ZZhouGit/pyBLM},
184
+ }
185
+ ```
186
+
187
+ ## License
188
+
189
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
190
+
191
+ ## Authors
192
+
193
+ - Your Name
194
+
195
+ ## Acknowledgments
196
+
197
+ Built with [scikit-learn](https://scikit-learn.org/), [XGBoost](https://xgboost.readthedocs.io/), [SHAP](https://github.com/shap/shap), and [statsmodels](https://www.statsmodels.org/).
198
+
pyiblm-0.1.0/README.md ADDED
@@ -0,0 +1,168 @@
1
+ # PyIBLM: Interpretable Boosted Linear Model
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
5
+
6
+ **PyIBLM** is a Python package implementing the Interpretable Boosted Linear Model (IBLM), a transparent machine learning approach that combines the interpretability of Generalized Linear Models (GLMs) with the predictive power of gradient boosting.
7
+
8
+ ## Features
9
+
10
+ - 🎯 **Interpretable by design**: Combines GLM transparency with boosting performance
11
+ - 📊 **Multiple model families**: Poisson, Tweedie, Gaussian, and more (via statsmodels)
12
+ - 🚀 **Gradient boosting integration**: Uses scikit-learn's HistGradientBoostingRegressor and XGBoost
13
+ - 📈 **SHAP explanations**: Built-in feature importance and contribution analysis
14
+ - 🔍 **Comprehensive diagnostics**: Pinball scores, deviance metrics, and model comparisons
15
+ - 📉 **Visualization tools**: Beta corrections, density plots, and correction corridors
16
+
17
+ ## Installation
18
+
19
+ ### Basic Installation
20
+ ```bash
21
+ pip install pyiblm
22
+ ```
23
+
24
+ ### With Visualization Support
25
+ ```bash
26
+ pip install pyiblm[visualization]
27
+ ```
28
+
29
+ ### With Explainability Features
30
+ ```bash
31
+ pip install pyiblm[explainability]
32
+ ```
33
+
34
+ ### Full Installation
35
+ ```bash
36
+ pip install pyiblm[all]
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from pyBLM import (
43
+ IBLMModel,
44
+ BoosterConfig,
45
+ GLMConfig,
46
+ TrainingConfig,
47
+ load_freMTPL2freq,
48
+ )
49
+
50
+ # Load example data
51
+ data = load_freMTPL2freq("data/freMTPL2freq.csv")
52
+ train, validate, test = data.split_into_train_validate_test(seed=123)
53
+
54
+ # Configure the model
55
+ config = TrainingConfig(
56
+ response="ClaimRate",
57
+ glm=GLMConfig(family="poisson"),
58
+ booster=BoosterConfig(
59
+ nrounds=500,
60
+ early_stopping_rounds=20,
61
+ params={"max_depth": 3, "eta": 0.025},
62
+ ),
63
+ )
64
+
65
+ # Train the model
66
+ model = IBLMModel(config).fit(train, validate)
67
+
68
+ # Make predictions
69
+ predictions = model.predict(test)
70
+
71
+ # Get GLM parameters
72
+ glm_params = model.get_glm_params()
73
+ print(glm_params)
74
+ ```
75
+
76
+ ## Core Components
77
+
78
+ ### Model Classes
79
+ - **`IBLMModel`**: Main model class combining GLM and gradient boosting
80
+ - **`BoosterConfig`**: Configuration for the gradient boosting component
81
+ - **`GLMConfig`**: Configuration for the GLM component
82
+ - **`TrainingConfig`**: Overall training configuration
83
+
84
+ ### Data Handling
85
+ - **`load_freMTPL2freq()`**: Load example insurance dataset
86
+ - **`FeaturePreprocessor`**: Automatic feature encoding and preprocessing
87
+
88
+ ### Evaluation
89
+ - **`poisson_deviance()`**: Compute Poisson deviance
90
+ - **`get_pinball_scores()`**: Multi-model pinball loss comparison
91
+ - **`calculate_deviance()`**: Family-based deviance calculation
92
+
93
+ ### Explanation & Visualization
94
+ - **`explain()`**: Generate explanation object with SHAP values
95
+ - **`IBLMPlotter`**: Visualization utilities for model interpretation
96
+ - **`correction_corridor()`**: Visualize model correction patterns
97
+ - **`extract_booster_shap()`**: Extract SHAP values from booster
98
+
99
+ ## Documentation
100
+
101
+ For detailed documentation and tutorials, see:
102
+ - `examples/` - Example scripts and use cases
103
+ - `dev.ipynb` - Development notebook with comprehensive example
104
+
105
+ ## Development
106
+
107
+ This package is actively developed. Contributions are welcome!
108
+
109
+ ### Development Setup
110
+ ```bash
111
+ git clone https://github.com/ZZhouGit/pyBLM.git
112
+ cd pyBLM
113
+ python -m venv .venv
114
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
115
+ pip install -e ".[all]"
116
+ poetry install --with dev
117
+ ```
118
+
119
+ ### Running Tests
120
+ ```bash
121
+ pytest tests/
122
+ ```
123
+
124
+ ### Development Notebook
125
+ Open `dev.ipynb` in Jupyter to see comprehensive examples:
126
+ ```bash
127
+ jupyter notebook dev.ipynb
128
+ ```
129
+
130
+ ## Requirements
131
+
132
+ - Python 3.12+
133
+ - pandas >= 2.0.0
134
+ - numpy >= 2.0.0
135
+ - scikit-learn >= 1.5.0
136
+ - xgboost >= 2.1.0
137
+ - pydantic >= 2.10.0
138
+ - statsmodels >= 0.14.0
139
+
140
+ Optional dependencies:
141
+ - plotnine >= 0.15.0 (for visualization)
142
+ - altair >= 5.4.0 (for interactive plots)
143
+ - shap >= 0.45.0 (for SHAP explanations)
144
+
145
+ ## Citation
146
+
147
+ If you use PyIBLM in your research, please cite:
148
+
149
+ ```bibtex
150
+ @software{pyiblm2025,
151
+ title={PyIBLM: Interpretable Boosted Linear Models},
152
+ author={Your Name},
153
+ year={2025},
154
+ url={https://github.com/ZZhouGit/pyBLM},
155
+ }
156
+ ```
157
+
158
+ ## License
159
+
160
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
161
+
162
+ ## Authors
163
+
164
+ - Your Name
165
+
166
+ ## Acknowledgments
167
+
168
+ Built with [scikit-learn](https://scikit-learn.org/), [XGBoost](https://xgboost.readthedocs.io/), [SHAP](https://github.com/shap/shap), and [statsmodels](https://www.statsmodels.org/).
@@ -0,0 +1,96 @@
1
+ [tool.poetry]
2
+ name = "pyiblm"
3
+ version = "0.1.0"
4
+ description = "Interpretable Boosted Linear Model (IBLM): A transparent machine learning approach combining generalized linear models with gradient boosting"
5
+ authors = ["Your Name <you@example.com>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+
9
+ [[tool.poetry.packages]]
10
+ include = "pyBLM"
11
+ from = "src"
12
+
13
+ [tool.poetry.dependencies]
14
+ python = "^3.12"
15
+ pandas = "^2.0.0"
16
+ numpy = "^2.0.0"
17
+ scikit-learn = "^1.5.0"
18
+ xgboost = "^2.1.0"
19
+ pydantic = "^2.10.0"
20
+ statsmodels = "^0.14.0"
21
+ joblib = "^1.3.0"
22
+
23
+ # Optional visualization dependencies
24
+ plotnine = {version = "^0.15.0", optional = true}
25
+ altair = {version = "^5.4.0", optional = true}
26
+ altair-saver = {version = "^0.5.0", optional = true}
27
+ vl-convert-python = {version = "^1.6.1", optional = true}
28
+ shap = {version = "^0.45.0", optional = true}
29
+
30
+
31
+ [tool.poetry.extras]
32
+ visualization = ["plotnine", "altair", "altair-saver", "vl-convert-python"]
33
+ explainability = ["shap"]
34
+ all = ["plotnine", "altair", "altair-saver", "vl-convert-python", "shap"]
35
+
36
+ [tool.poetry.group.dev.dependencies]
37
+ pytest = "^9.0.2"
38
+ ipykernel = "^7.2.0"
39
+ jupyter = "^1.0.0"
40
+ black = "^24.1.0"
41
+ isort = "^5.13.0"
42
+ flake8 = "^7.0.0"
43
+
44
+ [tool.black]
45
+ line-length = 100
46
+ target-version = ["py312"]
47
+ include = '\.pyi?$'
48
+ extend-exclude = '''
49
+ /(
50
+ # directories
51
+ \.eggs
52
+ | \.git
53
+ | \.hg
54
+ | \.mypy_cache
55
+ | \.tox
56
+ | \.venv
57
+ | build
58
+ | dist
59
+ )/
60
+ '''
61
+
62
+ [tool.isort]
63
+ profile = "black"
64
+ line_length = 100
65
+ py_version = 312
66
+ skip_gitignore = true
67
+
68
+ [tool.pytest.ini_options]
69
+ testpaths = ["tests"]
70
+ python_files = "test_*.py"
71
+ python_classes = "Test*"
72
+ python_functions = "test_*"
73
+ addopts = "--strict-markers -v"
74
+ markers = [
75
+ "unit: Unit tests",
76
+ "integration: Integration tests",
77
+ "slow: Slow tests",
78
+ ]
79
+
80
+ [tool.coverage.run]
81
+ source = ["src/pyBLM"]
82
+ omit = ["*/tests/*", "*/test_*.py"]
83
+
84
+ [tool.coverage.report]
85
+ exclude_lines = [
86
+ "pragma: no cover",
87
+ "def __repr__",
88
+ "raise AssertionError",
89
+ "raise NotImplementedError",
90
+ "if __name__ == .__main__.:",
91
+ "if TYPE_CHECKING:",
92
+ ]
93
+
94
+ [build-system]
95
+ requires = ["poetry-core"]
96
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,39 @@
1
"""Public API of the pyBLM package.

Re-exports the configuration, dataset, modelling, metrics, explanation,
plotting and validation entry points under the package root.
"""
from .config import BoosterConfig, GLMConfig, TrainingConfig
from .dataset import load_freMTPL2freq, split_into_train_validate_test
from .metrics import calculate_deviance, get_pinball_scores, poisson_deviance
from .model import IBLMModel
from .explain import (
    Explanation,
    correction_corridor,
    data_to_onehot,
    detect_outliers,
    explain,
    explain_iblm,
    extract_booster_shap,
)
from .plotting import IBLMPlotter
from .validation import check_data_variability, check_iblm_model, check_required_names

# Explicit public surface for `from pyBLM import *` and documentation tools.
__all__ = [
    # configuration
    "BoosterConfig",
    "GLMConfig",
    "TrainingConfig",
    # data handling
    "load_freMTPL2freq",
    "split_into_train_validate_test",
    # modelling
    "IBLMModel",
    # explanation
    "Explanation",
    "correction_corridor",
    "data_to_onehot",
    "detect_outliers",
    "explain",
    "explain_iblm",
    "extract_booster_shap",
    # metrics
    "calculate_deviance",
    "get_pinball_scores",
    "poisson_deviance",
    # plotting
    "IBLMPlotter",
    # validation
    "check_data_variability",
    "check_iblm_model",
    "check_required_names",
]
39
+
@@ -0,0 +1,60 @@
1
+ """Configuration objects for the pyBLM implementation.
2
+
3
+ These classes are Pydantic models that validate user input while keeping
4
+ parameters discoverable and IDE friendly.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Dict, Optional, Literal
9
+
10
+ from pydantic import BaseModel, Field, field_validator
11
+
12
+
13
class GLMConfig(BaseModel):
    """Settings for the GLM component."""

    # Distribution family handed to the GLM backend.
    family: Literal["poisson", "gamma", "tweedie", "gaussian"] = "poisson"
    # Tweedie variance power; must lie strictly inside (1, 2).
    tweedie_power: float = 1.5
    # Whether to strip the fitted GLM — presumably drops heavyweight fitted
    # state to shrink the saved model; confirm against the model module.
    strip_glm: bool = True

    @field_validator("tweedie_power")
    @classmethod
    def _validate_power(cls, value: float) -> float:
        """Accept only variance powers in the open interval (1, 2)."""
        if 1 < value < 2:
            return value
        raise ValueError("tweedie_power should be in (1, 2) to ensure a valid Tweedie distribution")
26
+
27
+
28
class BoosterConfig(BaseModel):
    """Settings forwarded to the XGBoost booster."""

    # Number of boosting rounds to run; must be positive.
    nrounds: int = 500
    # Raw parameter dict passed through to XGBoost untouched.
    params: Dict[str, Any] = Field(default_factory=dict)
    # Rounds without improvement before training stops early.
    early_stopping_rounds: int = 25
    # Verbosity of per-round evaluation logging.
    verbose_eval: int = 0
    # Log every N-th round when verbose evaluation is on.
    print_every_n: int = 50
    # Whether the evaluation metric should be maximized (None = backend default).
    maximize: Optional[bool] = None
    # Name of a custom evaluation metric, if any.
    custom_metric: Optional[str] = None
    # Optional objective override for the booster.
    objective: Optional[str] = None

    @field_validator("nrounds")
    @classmethod
    def _validate_nrounds(cls, value: int) -> int:
        """Require at least one boosting round."""
        if value > 0:
            return value
        raise ValueError("nrounds must be positive")
46
+
47
+
48
class TrainingConfig(BaseModel):
    """Top level training configuration.

    Bundles the response column name with the GLM and booster settings.
    """

    # Name of the response (target) column in the training data.
    response: str
    # Sub-configurations. Use default_factory so every TrainingConfig gets a
    # fresh instance, rather than instantiating one shared default object at
    # class-definition time (the pydantic-recommended idiom for model-typed
    # defaults).
    glm: GLMConfig = Field(default_factory=GLMConfig)
    booster: BoosterConfig = Field(default_factory=BoosterConfig)

    @field_validator("response")
    @classmethod
    def _non_empty_response(cls, value: str) -> str:
        """Reject an empty response column name."""
        if not value:
            raise ValueError("response must be a non-empty column name")
        return value
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ """Small dataset helpers used by pyBLM."""
4
+
5
+ from pathlib import Path
6
+ from typing import Optional, Tuple
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
+ def split_into_train_validate_test(
13
+ df: pd.DataFrame,
14
+ train_prop: float = 0.7,
15
+ validate_prop: float = 0.15,
16
+ test_prop: float = 0.15,
17
+ seed: Optional[int] = None,
18
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
19
+ """Randomly split a dataframe into train/validate/test subsets.
20
+
21
+ Unnamed index columns are dropped automatically.
22
+
23
+ Returns:
24
+ Tuple of (train_df, validate_df, test_df)
25
+ """
26
+ df = _drop_unnamed_columns(df)
27
+ if not np.isclose(train_prop + validate_prop + test_prop, 1.0):
28
+ raise ValueError("train_prop + validate_prop + test_prop must sum to 1")
29
+
30
+ rng = np.random.default_rng(seed)
31
+ buckets = rng.choice(
32
+ ["train", "validate", "test"],
33
+ size=len(df),
34
+ replace=True,
35
+ p=[train_prop, validate_prop, test_prop],
36
+ )
37
+
38
+ train_df = df[buckets == "train"].copy()
39
+ validate_df = df[buckets == "validate"].copy()
40
+ test_df = df[buckets == "test"].copy()
41
+
42
+ return train_df, validate_df, test_df
43
+
44
+
45
+ # Register the method on pandas DataFrame when this module is imported
46
+ # This makes it available as df.split_into_train_validate_test()
47
+ if not hasattr(pd.DataFrame, 'split_into_train_validate_test'):
48
+ pd.DataFrame.split_into_train_validate_test = split_into_train_validate_test
49
+
50
+
51
def load_freMTPL2freq(path: Optional[str | Path] = None) -> pd.DataFrame:
    """Load the preprocessed freMTPL2freq CSV from disk.

    The default location is data/freMTPL2freq.csv relative to the repository
    root. Unnamed index columns are removed; object columns are left as-is for
    callers to cast to category if desired.

    Args:
        path: Optional explicit CSV location; overrides the default.

    Returns:
        DataFrame with a derived ``ClaimRate`` column (claims per unit of
        exposure, winsorized at the 99.9th percentile) when the raw claim
        columns are present; the identifier/raw columns are then dropped.
    """
    resolved = Path(path) if path else Path(__file__).resolve().parents[2] / "data" / "freMTPL2freq.csv"
    df = pd.read_csv(resolved)
    df = _drop_unnamed_columns(df)

    # Convert ClaimNb to a rate per exposure and winsorize heavy tails.
    if "ClaimNb" in df.columns and "Exposure" in df.columns:
        with np.errstate(divide="ignore", invalid="ignore"):
            # Zero exposure would divide to inf; map it to NaN first, then
            # fill with 0 below.
            df["ClaimRate"] = df["ClaimNb"] / df["Exposure"].replace(0, np.nan)
        df["ClaimRate"] = df["ClaimRate"].fillna(0)
        cap = df["ClaimRate"].quantile(0.999)
        df["ClaimRate"] = df["ClaimRate"].clip(upper=cap)
        # Guard the vehicle-age winsorization: the original indexed the
        # column unconditionally and raised KeyError when it was absent.
        if "VehAge" in df.columns:
            df["VehAge"] = df["VehAge"].clip(upper=50)
        drop_cols = [c for c in ("IDpol", "Exposure", "ClaimNb") if c in df.columns]
        df = df.drop(columns=drop_cols)

    return df
75
+
76
+
77
+ def _drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
78
+ cols_to_drop = [c for c in df.columns if c.startswith("Unnamed") or c == ""]
79
+ if cols_to_drop:
80
+ df = df.drop(columns=cols_to_drop)
81
+ return df