imbreg 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imbreg-0.1.0/LICENSE +21 -0
- imbreg-0.1.0/MANIFEST.in +9 -0
- imbreg-0.1.0/PKG-INFO +153 -0
- imbreg-0.1.0/README.md +123 -0
- imbreg-0.1.0/imbreg/__init__.py +79 -0
- imbreg-0.1.0/imbreg/data_loader.py +427 -0
- imbreg-0.1.0/imbreg/metrics.py +338 -0
- imbreg-0.1.0/imbreg/models.py +119 -0
- imbreg-0.1.0/imbreg/plots.py +227 -0
- imbreg-0.1.0/imbreg/resampling.py +384 -0
- imbreg-0.1.0/imbreg/stratification.py +448 -0
- imbreg-0.1.0/imbreg/utils.py +165 -0
- imbreg-0.1.0/imbreg/validation.py +423 -0
- imbreg-0.1.0/imbreg.egg-info/PKG-INFO +153 -0
- imbreg-0.1.0/imbreg.egg-info/SOURCES.txt +18 -0
- imbreg-0.1.0/imbreg.egg-info/dependency_links.txt +1 -0
- imbreg-0.1.0/imbreg.egg-info/requires.txt +8 -0
- imbreg-0.1.0/imbreg.egg-info/top_level.txt +1 -0
- imbreg-0.1.0/pyproject.toml +43 -0
- imbreg-0.1.0/setup.cfg +4 -0
imbreg-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Gabriel Oliveros
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
imbreg-0.1.0/MANIFEST.in
ADDED
imbreg-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: imbreg
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python library for Imbalanced Regression with SMOGN, stratified CV, and utility-based metrics.
|
|
5
|
+
Author: Gabriel Oliveros
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/goliverosj/imbreg
|
|
8
|
+
Project-URL: Issues, https://github.com/goliverosj/imbreg/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: pandas
|
|
23
|
+
Requires-Dist: scipy
|
|
24
|
+
Requires-Dist: scikit-learn
|
|
25
|
+
Requires-Dist: xgboost
|
|
26
|
+
Requires-Dist: matplotlib
|
|
27
|
+
Requires-Dist: seaborn
|
|
28
|
+
Requires-Dist: plotly
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# imbreg
|
|
32
|
+
|
|
33
|
+

|
|
34
|
+

|
|
35
|
+

|
|
36
|
+
|
|
37
|
+
**imbreg** is a powerful Python library specifically designed to tackle the **Imbalanced Regression** problem. It facilitates the processing of datasets with missing values, applies advanced synthetic over-sampling techniques like SMOGN (Synthetic Minority Over-sampling Technique for Regression with Gaussian Noise), evaluates predictive models using utility-based metrics, and manages stratified cross-validation partitioning.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Key Features
|
|
42
|
+
|
|
43
|
+
- **SMOGN Resampling (DIBS):** Generates synthetic examples for extreme minority values in continuous domains using the DIBS strategy (a combination of SmoteR interpolation and GaussNoise perturbation).
|
|
44
|
+
- **Stratified Partitioning:** Implements purely stratified cross-validation (CV) algorithms to ensure that extreme values are evenly distributed across folds.
|
|
45
|
+
- **Robust Data Imputation:** Native integration with iterative algorithms (Scikit-Learn IterativeImputer) that prevents data leakage between training and test partitions.
|
|
46
|
+
- **Advanced Utility-based Metrics:** Precise calculation of specialized metrics for imbalanced regression:
|
|
47
|
+
- **Utility-based F1-Score** ($\beta$-measure).
|
|
48
|
+
- **SERA** (Squared Error Relevance Area).
|
|
49
|
+
- **Dataset Loading (KEEL/CSV/ARFF):** A smart data loader that infers categorical variables, caps decimals, maps ranges, and cleans noisy values automatically.
|
|
50
|
+
- **Data Visualization:** Built-in 2D and 3D plotting modules (using Plotly, Seaborn) to visually analyze the relevance of the target variable and the impact of noise/distribution.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Requirements and Installation
|
|
55
|
+
|
|
56
|
+
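To use this library, ensure you have Python 3.9 or higher installed. The dependencies come from the standard data-science ecosystem (NumPy, pandas, SciPy, scikit-learn, XGBoost, Matplotlib, Seaborn, and Plotly) and are installed automatically by `pip`.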
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install imbreg
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Quickstart Guide
|
|
65
|
+
|
|
66
|
+
Here is a quick snippet of how to use the core functions:
|
|
67
|
+
|
|
68
|
+
### 1. Generate Partitions (Cross-Validation)
|
|
69
|
+
|
|
70
|
+
The `cv_partitions` function will take care of reading your original dataset, cleaning it, performing missing data imputation, and injecting SMOGN oversampling automatically into each repetition.
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from imbreg import cv_partitions
|
|
74
|
+
|
|
75
|
+
cv_partitions(
|
|
76
|
+
ds_name="my_dataset.csv",
|
|
77
|
+
ds_location="raw_data/",
|
|
78
|
+
times=1, # Number of repetitions
|
|
79
|
+
folds=10, # Number of partitions (k-fold)
|
|
80
|
+
strat=True, # Enable stratification
|
|
81
|
+
smogn=True, # Apply SMOGN during training
|
|
82
|
+
impute=True, # Impute missing values (NaNs)
|
|
83
|
+
out_dir="Output/" # Output directory for raw data partitions
|
|
84
|
+
)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 2. Evaluate Predictions
|
|
88
|
+
|
|
89
|
+
Once the fold files have been written to disk, `evaluate_folds` trains the selected model on them and returns a results summary that includes the SERA and utility-based F1 metrics.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from imbreg import evaluate_folds
|
|
93
|
+
|
|
94
|
+
results = evaluate_folds(
|
|
95
|
+
output_dir="Output/", # Directory containing the generated folds
|
|
96
|
+
dataset="my_dataset",
|
|
97
|
+
model_type="rf", # 'rf' (Random Forest), 'et' (Extra Trees), 'xgb' (XGBoost)
|
|
98
|
+
n_reps=1,
|
|
99
|
+
n_folds=10,
|
|
100
|
+
use_imputation=True,
|
|
101
|
+
use_smogn=True,
|
|
102
|
+
thr_rel=0.8 # Relevance threshold to define "rare" cases
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# You can export these results to a flat structure using the built-in exporter
|
|
106
|
+
from imbreg.validation import export_experiment_summaries
|
|
107
|
+
export_experiment_summaries(results, output_dir="Results/", dataset_name="my_dataset", flat_output=True)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### 3. Visualize the Data
|
|
111
|
+
|
|
112
|
+
Analyze the relevance curve of your target variable:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import matplotlib.pyplot as plt
|
|
116
|
+
from imbreg import read_dataset, phi_control, plot_target_distribution
|
|
117
|
+
|
|
118
|
+
# Load dataset and create relevance control structure
|
|
119
|
+
df = read_dataset("my_dataset.csv", "raw_data/")
|
|
120
|
+
ctrl = phi_control(df["y"].values, method="extremes")
|
|
121
|
+
|
|
122
|
+
# Visualize distribution vs relevance
|
|
123
|
+
fig = plot_target_distribution(df, target_col="y", phi_ctrl=ctrl, thr_rel=0.8)
|
|
124
|
+
plt.show()
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Project Structure
|
|
130
|
+
|
|
131
|
+
```text
|
|
132
|
+
imbreg/
|
|
133
|
+
│
|
|
134
|
+
├── data_loader.py # I/O functions (CSV/KEEL/ARFF) and imputation wrappers
|
|
135
|
+
├── metrics.py # Mathematical evaluation functions (Utility F1, SERA, Bumps)
|
|
136
|
+
├── models.py # Training and prediction wrappers (RF, ET, XGBoost)
|
|
137
|
+
├── plots.py # Advanced visualizations (Histograms, Scatters, Prediction Error)
|
|
138
|
+
├── resampling.py # Core engine for the DIBS strategy (SMOGN for regression)
|
|
139
|
+
├── stratification.py # Phi function (relevance) and K-Folds generators
|
|
140
|
+
├── utils.py # Math operations, distance metrics, and internal helpers
|
|
141
|
+
└── validation.py # Cross-validation evaluation pipeline and result export
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Folder Architecture for Experiments
|
|
145
|
+
|
|
146
|
+
When running the full validation pipeline, the project enforces a clean separation of concerns:
|
|
147
|
+
|
|
148
|
+
- **`Output/`**: Stores all heavy, raw data partitions generated by cross-validation and SMOGN.
|
|
149
|
+
- **`Results/`**: A flat, clean directory containing only the final `.txt` and `.csv` summary metrics.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
**Author: Gabriel Oliveros**
|
imbreg-0.1.0/README.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# imbreg
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+
|
|
7
|
+
**imbreg** is a powerful Python library specifically designed to tackle the **Imbalanced Regression** problem. It facilitates the processing of datasets with missing values, applies advanced synthetic over-sampling techniques like SMOGN (Synthetic Minority Over-sampling Technique for Regression with Gaussian Noise), evaluates predictive models using utility-based metrics, and manages stratified cross-validation partitioning.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Key Features
|
|
12
|
+
|
|
13
|
+
- **SMOGN Resampling (DIBS):** Generates synthetic examples for extreme minority values in continuous domains using the DIBS strategy (a combination of SmoteR interpolation and GaussNoise perturbation).
|
|
14
|
+
- **Stratified Partitioning:** Implements purely stratified cross-validation (CV) algorithms to ensure that extreme values are evenly distributed across folds.
|
|
15
|
+
- **Robust Data Imputation:** Native integration with iterative algorithms (Scikit-Learn IterativeImputer) that prevents data leakage between training and test partitions.
|
|
16
|
+
- **Advanced Utility-based Metrics:** Precise calculation of specialized metrics for imbalanced regression:
|
|
17
|
+
- **Utility-based F1-Score** ($\beta$-measure).
|
|
18
|
+
- **SERA** (Squared Error Relevance Area).
|
|
19
|
+
- **Dataset Loading (KEEL/CSV/ARFF):** A smart data loader that infers categorical variables, caps decimals, maps ranges, and cleans noisy values automatically.
|
|
20
|
+
- **Data Visualization:** Built-in 2D and 3D plotting modules (using Plotly, Seaborn) to visually analyze the relevance of the target variable and the impact of noise/distribution.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Requirements and Installation
|
|
25
|
+
|
|
26
|
+
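To use this library, ensure you have Python 3.9 or higher installed. The dependencies come from the standard data-science ecosystem (NumPy, pandas, SciPy, scikit-learn, XGBoost, Matplotlib, Seaborn, and Plotly) and are installed automatically by `pip`.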
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install imbreg
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Quickstart Guide
|
|
35
|
+
|
|
36
|
+
Here is a quick snippet of how to use the core functions:
|
|
37
|
+
|
|
38
|
+
### 1. Generate Partitions (Cross-Validation)
|
|
39
|
+
|
|
40
|
+
The `cv_partitions` function will take care of reading your original dataset, cleaning it, performing missing data imputation, and injecting SMOGN oversampling automatically into each repetition.
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from imbreg import cv_partitions
|
|
44
|
+
|
|
45
|
+
cv_partitions(
|
|
46
|
+
ds_name="my_dataset.csv",
|
|
47
|
+
ds_location="raw_data/",
|
|
48
|
+
times=1, # Number of repetitions
|
|
49
|
+
folds=10, # Number of partitions (k-fold)
|
|
50
|
+
strat=True, # Enable stratification
|
|
51
|
+
smogn=True, # Apply SMOGN during training
|
|
52
|
+
impute=True, # Impute missing values (NaNs)
|
|
53
|
+
out_dir="Output/" # Output directory for raw data partitions
|
|
54
|
+
)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 2. Evaluate Predictions
|
|
58
|
+
|
|
59
|
+
Once the fold files have been written to disk, `evaluate_folds` trains the selected model on them and returns a results summary that includes the SERA and utility-based F1 metrics.
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from imbreg import evaluate_folds
|
|
63
|
+
|
|
64
|
+
results = evaluate_folds(
|
|
65
|
+
output_dir="Output/", # Directory containing the generated folds
|
|
66
|
+
dataset="my_dataset",
|
|
67
|
+
model_type="rf", # 'rf' (Random Forest), 'et' (Extra Trees), 'xgb' (XGBoost)
|
|
68
|
+
n_reps=1,
|
|
69
|
+
n_folds=10,
|
|
70
|
+
use_imputation=True,
|
|
71
|
+
use_smogn=True,
|
|
72
|
+
thr_rel=0.8 # Relevance threshold to define "rare" cases
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# You can export these results to a flat structure using the built-in exporter
|
|
76
|
+
from imbreg.validation import export_experiment_summaries
|
|
77
|
+
export_experiment_summaries(results, output_dir="Results/", dataset_name="my_dataset", flat_output=True)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### 3. Visualize the Data
|
|
81
|
+
|
|
82
|
+
Analyze the relevance curve of your target variable:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import matplotlib.pyplot as plt
|
|
86
|
+
from imbreg import read_dataset, phi_control, plot_target_distribution
|
|
87
|
+
|
|
88
|
+
# Load dataset and create relevance control structure
|
|
89
|
+
df = read_dataset("my_dataset.csv", "raw_data/")
|
|
90
|
+
ctrl = phi_control(df["y"].values, method="extremes")
|
|
91
|
+
|
|
92
|
+
# Visualize distribution vs relevance
|
|
93
|
+
fig = plot_target_distribution(df, target_col="y", phi_ctrl=ctrl, thr_rel=0.8)
|
|
94
|
+
plt.show()
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Project Structure
|
|
100
|
+
|
|
101
|
+
```text
|
|
102
|
+
imbreg/
|
|
103
|
+
│
|
|
104
|
+
├── data_loader.py # I/O functions (CSV/KEEL/ARFF) and imputation wrappers
|
|
105
|
+
├── metrics.py # Mathematical evaluation functions (Utility F1, SERA, Bumps)
|
|
106
|
+
├── models.py # Training and prediction wrappers (RF, ET, XGBoost)
|
|
107
|
+
├── plots.py # Advanced visualizations (Histograms, Scatters, Prediction Error)
|
|
108
|
+
├── resampling.py # Core engine for the DIBS strategy (SMOGN for regression)
|
|
109
|
+
├── stratification.py # Phi function (relevance) and K-Folds generators
|
|
110
|
+
├── utils.py # Math operations, distance metrics, and internal helpers
|
|
111
|
+
└── validation.py # Cross-validation evaluation pipeline and result export
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Folder Architecture for Experiments
|
|
115
|
+
|
|
116
|
+
When running the full validation pipeline, the project enforces a clean separation of concerns:
|
|
117
|
+
|
|
118
|
+
- **`Output/`**: Stores all heavy, raw data partitions generated by cross-validation and SMOGN.
|
|
119
|
+
- **`Results/`**: A flat, clean directory containing only the final `.txt` and `.csv` summary metrics.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
**Author: Gabriel Oliveros**
|
imbreg-0.1.0/imbreg/__init__.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""
|
|
2
|
+
imbreg - Imbalanced Regression library.
|
|
3
|
+
|
|
4
|
+
Public API
|
|
5
|
+
----------
|
|
6
|
+
Phi relevance
|
|
7
|
+
phi_control - build relevance control structure
|
|
8
|
+
phi - evaluate relevance for target values
|
|
9
|
+
|
|
10
|
+
Resampling
|
|
11
|
+
dibs_regress - DIBS strategy resampling (SmoteR + GaussNoise)
|
|
12
|
+
|
|
13
|
+
Stratification / CV
|
|
14
|
+
cv_partitions - repeated K-fold CV with optional SMOGN + imputation
|
|
15
|
+
make_folds - generate fold indices (stratified or random)
|
|
16
|
+
|
|
17
|
+
Data I/O
|
|
18
|
+
read_dataset - read KEEL-style .dat, csv and arff datasets
|
|
19
|
+
write_dataset - write datasets (CSV/KEEL)
|
|
20
|
+
get_percentages - compute % of rare cases per dataset
|
|
21
|
+
split_features_target - convenience X / y split
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from .stratification import phi_control, phi, cv_partitions, make_folds
|
|
25
|
+
from .resampling import dibs_regress, safe_dibs_regress
|
|
26
|
+
from .data_loader import (
|
|
27
|
+
read_dataset,
|
|
28
|
+
write_dataset,
|
|
29
|
+
get_percentages,
|
|
30
|
+
split_features_target,
|
|
31
|
+
encode_categoricals,
|
|
32
|
+
impute_train,
|
|
33
|
+
impute_test,
|
|
34
|
+
)
|
|
35
|
+
from .plots import (
|
|
36
|
+
plot_target_distribution,
|
|
37
|
+
plot_scatter_2d,
|
|
38
|
+
plot_scatter_3d,
|
|
39
|
+
plot_prediction_error,
|
|
40
|
+
)
|
|
41
|
+
from .metrics import utility_f1_score, sera_score
|
|
42
|
+
from .validation import evaluate_folds, export_experiment_summaries, evaluate_predictions_from_files
|
|
43
|
+
|
|
44
|
+
__version__ = "0.1.0"
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
# Phi relevance
|
|
48
|
+
"phi_control",
|
|
49
|
+
"phi",
|
|
50
|
+
# Resampling
|
|
51
|
+
"dibs_regress",
|
|
52
|
+
"safe_dibs_regress",
|
|
53
|
+
# CV partitioning
|
|
54
|
+
"cv_partitions",
|
|
55
|
+
"make_folds",
|
|
56
|
+
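# Models, then Data I/O. NOTE(review): the three model names below are not imported in this module, so "from imbreg import *" will fail on them.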
|
|
57
|
+
"train_extra_trees",
|
|
58
|
+
"train_xgboost",
|
|
59
|
+
"predict_model",
|
|
60
|
+
"read_dataset",
|
|
61
|
+
"write_dataset",
|
|
62
|
+
"get_percentages",
|
|
63
|
+
"split_features_target",
|
|
64
|
+
"encode_categoricals",
|
|
65
|
+
# Imputation
|
|
66
|
+
"impute_train",
|
|
67
|
+
"impute_test",
|
|
68
|
+
# Visualization
|
|
69
|
+
"plot_target_distribution",
|
|
70
|
+
"plot_scatter_2d",
|
|
71
|
+
"plot_scatter_3d",
|
|
72
|
+
"plot_prediction_error",
|
|
73
|
+
# Metrics / Validation
|
|
74
|
+
"utility_f1_score",
|
|
75
|
+
"sera_score",
|
|
76
|
+
"evaluate_folds",
|
|
77
|
+
"export_experiment_summaries",
|
|
78
|
+
"evaluate_predictions_from_files",
|
|
79
|
+
]
|