imbreg 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imbreg/__init__.py +79 -0
- imbreg/data_loader.py +427 -0
- imbreg/metrics.py +338 -0
- imbreg/models.py +119 -0
- imbreg/plots.py +227 -0
- imbreg/resampling.py +384 -0
- imbreg/stratification.py +448 -0
- imbreg/utils.py +165 -0
- imbreg/validation.py +423 -0
- imbreg-0.1.0.dist-info/METADATA +153 -0
- imbreg-0.1.0.dist-info/RECORD +14 -0
- imbreg-0.1.0.dist-info/WHEEL +5 -0
- imbreg-0.1.0.dist-info/licenses/LICENSE +21 -0
- imbreg-0.1.0.dist-info/top_level.txt +1 -0
imbreg/__init__.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""
|
|
2
|
+
imbreg - Imbalanced Regression library.
|
|
3
|
+
|
|
4
|
+
Public API
|
|
5
|
+
----------
|
|
6
|
+
Phi relevance
|
|
7
|
+
phi_control - build relevance control structure
|
|
8
|
+
phi - evaluate relevance for target values
|
|
9
|
+
|
|
10
|
+
Resampling
|
|
11
|
+
dibs_regress - DIBS strategy resampling (SmoteR + GaussNoise)
|
|
12
|
+
|
|
13
|
+
Stratification / CV
|
|
14
|
+
cv_partitions - repeated K-fold CV with optional SMOGN + imputation
|
|
15
|
+
make_folds - generate fold indices (stratified or random)
|
|
16
|
+
|
|
17
|
+
Data I/O
|
|
18
|
+
read_dataset - read KEEL-style .dat, csv and arff datasets
|
|
19
|
+
write_dataset - write datasets (CSV/KEEL)
|
|
20
|
+
get_percentages - compute % of rare cases per dataset
|
|
21
|
+
split_features_target - convenience X / y split
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from .stratification import phi_control, phi, cv_partitions, make_folds
|
|
25
|
+
from .resampling import dibs_regress, safe_dibs_regress
|
|
26
|
+
from .data_loader import (
|
|
27
|
+
read_dataset,
|
|
28
|
+
write_dataset,
|
|
29
|
+
get_percentages,
|
|
30
|
+
split_features_target,
|
|
31
|
+
encode_categoricals,
|
|
32
|
+
impute_train,
|
|
33
|
+
impute_test,
|
|
34
|
+
)
|
|
35
|
+
from .plots import (
|
|
36
|
+
plot_target_distribution,
|
|
37
|
+
plot_scatter_2d,
|
|
38
|
+
plot_scatter_3d,
|
|
39
|
+
plot_prediction_error,
|
|
40
|
+
)
|
|
41
|
+
from .metrics import utility_f1_score, sera_score
|
|
42
|
+
from .validation import evaluate_folds, export_experiment_summaries, evaluate_predictions_from_files
|
|
43
|
+
|
|
44
|
+
__version__ = "0.1.0"
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
# Phi relevance
|
|
48
|
+
"phi_control",
|
|
49
|
+
"phi",
|
|
50
|
+
# Resampling
|
|
51
|
+
"dibs_regress",
|
|
52
|
+
"safe_dibs_regress",
|
|
53
|
+
# CV partitioning
|
|
54
|
+
"cv_partitions",
|
|
55
|
+
"make_folds",
|
|
56
|
+
# Data I/O
|
|
57
|
+
"train_extra_trees",
|
|
58
|
+
"train_xgboost",
|
|
59
|
+
"predict_model",
|
|
60
|
+
"read_dataset",
|
|
61
|
+
"write_dataset",
|
|
62
|
+
"get_percentages",
|
|
63
|
+
"split_features_target",
|
|
64
|
+
"encode_categoricals",
|
|
65
|
+
# Imputation
|
|
66
|
+
"impute_train",
|
|
67
|
+
"impute_test",
|
|
68
|
+
# Visualization
|
|
69
|
+
"plot_target_distribution",
|
|
70
|
+
"plot_scatter_2d",
|
|
71
|
+
"plot_scatter_3d",
|
|
72
|
+
"plot_prediction_error",
|
|
73
|
+
# Metrics / Validation
|
|
74
|
+
"utility_f1_score",
|
|
75
|
+
"sera_score",
|
|
76
|
+
"evaluate_folds",
|
|
77
|
+
"export_experiment_summaries",
|
|
78
|
+
"evaluate_predictions_from_files",
|
|
79
|
+
]
|
imbreg/data_loader.py
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
"""
|
|
2
|
+
data_loader.py - KEEL dataset I/O, imputation helpers, and dataset utilities.
|
|
3
|
+
"""
|
|
4
|
+
import re
|
|
5
|
+
from io import StringIO
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from .utils import (
|
|
11
|
+
align_categories,
|
|
12
|
+
apply_decimal_map,
|
|
13
|
+
apply_range_cap,
|
|
14
|
+
decimal_map_from_reference,
|
|
15
|
+
range_map_from_reference,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ── KEEL reader ───────────────────────────────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
def read_dataset(name, location="", infer_categoricals=True, max_levels=5, max_decimals=6):
|
|
22
|
+
"""
|
|
23
|
+
Reads a dataset (.dat KEEL, .arff Weka, or .csv).
|
|
24
|
+
|
|
25
|
+
Conventions:
|
|
26
|
+
- For .dat/.arff: Lines starting with '@' or '%' are skipped. Missing values are '?'.
|
|
27
|
+
- For .csv: The first row is assumed to be the header. Missing values can be empty or '?'.
|
|
28
|
+
- The LAST column is treated as the regression target and renamed to 'y'.
|
|
29
|
+
- Low-cardinality numeric columns are cast to `pd.Categorical` if `infer_categoricals` is True.
|
|
30
|
+
|
|
31
|
+
Parameters
|
|
32
|
+
----------
|
|
33
|
+
name : str
|
|
34
|
+
The name of the dataset file (e.g., 'abalone.dat', 'data.csv', 'data.arff').
|
|
35
|
+
location : str or Path, default=''
|
|
36
|
+
The sub-path or directory containing the file.
|
|
37
|
+
infer_categoricals : bool, default=True
|
|
38
|
+
If True, automatically converts low cardinality numeric columns into categoricals.
|
|
39
|
+
max_levels : int, default=5
|
|
40
|
+
Maximum unique numeric values a column can have before being promoted to
|
|
41
|
+
categorical.
|
|
42
|
+
max_decimals : int or None, default=6
|
|
43
|
+
If set, numeric columns are capped (rounded) at this precision.
|
|
44
|
+
|
|
45
|
+
Returns
|
|
46
|
+
-------
|
|
47
|
+
pandas.DataFrame
|
|
48
|
+
The data loaded and structurally formatted.
|
|
49
|
+
|
|
50
|
+
Raises
|
|
51
|
+
------
|
|
52
|
+
ValueError
|
|
53
|
+
If no data lines are found.
|
|
54
|
+
"""
|
|
55
|
+
path = Path(".") / location / name
|
|
56
|
+
|
|
57
|
+
is_csv = path.suffix.lower() == '.csv'
|
|
58
|
+
|
|
59
|
+
if is_csv:
|
|
60
|
+
raw = pd.read_csv(path, na_values=["?", "NA"], skipinitialspace=True)
|
|
61
|
+
# Rename target column (assume last column is target)
|
|
62
|
+
cols = list(raw.columns)
|
|
63
|
+
cols[-1] = "y"
|
|
64
|
+
raw.columns = cols
|
|
65
|
+
else:
|
|
66
|
+
with open(path, "r", encoding="utf-8", errors="replace") as fh:
|
|
67
|
+
lines = fh.readlines()
|
|
68
|
+
|
|
69
|
+
data_lines = [
|
|
70
|
+
ln for ln in lines if not ln.strip().startswith(("@", "%")) and ln.strip()
|
|
71
|
+
]
|
|
72
|
+
if not data_lines:
|
|
73
|
+
raise ValueError(f"No data lines found in {path}")
|
|
74
|
+
|
|
75
|
+
raw = pd.read_csv(
|
|
76
|
+
StringIO("".join(data_lines)),
|
|
77
|
+
header=None,
|
|
78
|
+
na_values=["?"],
|
|
79
|
+
skipinitialspace=True,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Rename columns: predictors V1..V(n-1), target 'y'
|
|
83
|
+
n_cols = raw.shape[1]
|
|
84
|
+
raw.columns = [f"V{i + 1}" for i in range(n_cols - 1)] + ["y"]
|
|
85
|
+
|
|
86
|
+
# 1) String predictors → Categorical
|
|
87
|
+
for i in range(raw.shape[1] - 1):
|
|
88
|
+
if pd.api.types.is_object_dtype(raw.iloc[:, i]) or pd.api.types.is_string_dtype(raw.iloc[:, i]) or raw.iloc[:, i].dtype == str:
|
|
89
|
+
col_name = raw.columns[i]
|
|
90
|
+
raw[col_name] = raw.iloc[:, i].astype(str).str.strip().astype("category")
|
|
91
|
+
|
|
92
|
+
# 2) Low-cardinality numeric predictors → Categorical (mirrors R factor)
|
|
93
|
+
if infer_categoricals:
|
|
94
|
+
for col in raw.columns[:-1]:
|
|
95
|
+
if pd.api.types.is_numeric_dtype(raw[col]) and not isinstance(raw[col].dtype, pd.CategoricalDtype):
|
|
96
|
+
n_uniq = raw[col].dropna().nunique()
|
|
97
|
+
if 1 < n_uniq <= max_levels:
|
|
98
|
+
raw[col] = raw[col].astype("category")
|
|
99
|
+
|
|
100
|
+
# 3) Cap decimals on remaining numeric columns (including y)
|
|
101
|
+
if max_decimals is not None:
|
|
102
|
+
for col in raw.columns:
|
|
103
|
+
if pd.api.types.is_numeric_dtype(raw[col]) and not isinstance(raw[col].dtype, pd.CategoricalDtype):
|
|
104
|
+
raw[col] = raw[col].round(max_decimals)
|
|
105
|
+
|
|
106
|
+
return raw
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ── KEEL writer ───────────────────────────────────────────────────────────────
|
|
110
|
+
|
|
111
|
+
def write_dataset(df, file_path, template_path=None, na_symbol="<null>", sep=",", dec=".", out_format="dat"):
|
|
112
|
+
"""
|
|
113
|
+
Writes a DataFrame as a dataset file (.dat, .arff, or .csv).
|
|
114
|
+
|
|
115
|
+
For .dat and .arff, the header is preserved up to and including the
|
|
116
|
+
'@data' identifier from the template file prior to appending row values,
|
|
117
|
+
or generated dynamically if no template is found.
|
|
118
|
+
|
|
119
|
+
Parameters
|
|
120
|
+
----------
|
|
121
|
+
df : pandas.DataFrame
|
|
122
|
+
The data payload intended for serialization.
|
|
123
|
+
file_path : str or pathlib.Path
|
|
124
|
+
The path where the resulting file will be written.
|
|
125
|
+
template_path : str or pathlib.Path, optional
|
|
126
|
+
A path pointing to the original partition containing the original meta-headers.
|
|
127
|
+
na_symbol : str, default='<null>'
|
|
128
|
+
The token representation for missing values.
|
|
129
|
+
sep : str, default=','
|
|
130
|
+
The separator / delimiter string used in the writing process.
|
|
131
|
+
dec : str, default='.'
|
|
132
|
+
The decimal separator to encode floating point data properly.
|
|
133
|
+
out_format : str, default='dat'
|
|
134
|
+
The format of the output ('dat', 'arff', 'csv').
|
|
135
|
+
|
|
136
|
+
Returns
|
|
137
|
+
-------
|
|
138
|
+
None
|
|
139
|
+
"""
|
|
140
|
+
file_path = Path(file_path)
|
|
141
|
+
|
|
142
|
+
# Make sure we enforce the requested extension
|
|
143
|
+
if file_path.suffix != f".{out_format}":
|
|
144
|
+
file_path = file_path.with_suffix(f".{out_format}")
|
|
145
|
+
|
|
146
|
+
# If format is CSV, use simple Pandas exporter
|
|
147
|
+
if out_format.lower() == "csv":
|
|
148
|
+
df.to_csv(file_path, index=False, na_rep=na_symbol)
|
|
149
|
+
return
|
|
150
|
+
|
|
151
|
+
# For KEEL/ARFF formats, extract or build the block header
|
|
152
|
+
header = None
|
|
153
|
+
if template_path and Path(template_path).exists():
|
|
154
|
+
with open(template_path, "r", encoding="utf-8", errors="replace") as fh:
|
|
155
|
+
tpl_lines = fh.readlines()
|
|
156
|
+
|
|
157
|
+
data_idx = None
|
|
158
|
+
for i, line in enumerate(tpl_lines):
|
|
159
|
+
if re.match(r"^\s*@data\s*$", line, re.IGNORECASE):
|
|
160
|
+
data_idx = i
|
|
161
|
+
break
|
|
162
|
+
|
|
163
|
+
if data_idx is not None:
|
|
164
|
+
header = tpl_lines[: data_idx + 1]
|
|
165
|
+
|
|
166
|
+
if header is None:
|
|
167
|
+
# Fallback for CSV templates: Generate a valid basic KEEL/ARFF header dynamically
|
|
168
|
+
header = [f"@relation {file_path.stem}\n"]
|
|
169
|
+
for col in df.columns:
|
|
170
|
+
header.append(f"@attribute {col} real\n")
|
|
171
|
+
header.append("@data\n")
|
|
172
|
+
|
|
173
|
+
# Build output rows
|
|
174
|
+
out = df.copy()
|
|
175
|
+
rows = []
|
|
176
|
+
for _, row in out.iterrows():
|
|
177
|
+
vals = []
|
|
178
|
+
for v in row:
|
|
179
|
+
if pd.isna(v) if not isinstance(v, str) else v in ("nan", "None", "<NA>"):
|
|
180
|
+
vals.append(na_symbol)
|
|
181
|
+
elif isinstance(v, float):
|
|
182
|
+
# No scientific notation, replace decimal separator if needed
|
|
183
|
+
s = f"{v:.10g}"
|
|
184
|
+
if dec != ".":
|
|
185
|
+
s = s.replace(".", dec)
|
|
186
|
+
vals.append(s)
|
|
187
|
+
else:
|
|
188
|
+
vals.append(str(v))
|
|
189
|
+
rows.append(sep.join(vals))
|
|
190
|
+
|
|
191
|
+
with open(file_path, "w", encoding="utf-8", newline="") as fh:
|
|
192
|
+
fh.writelines(header)
|
|
193
|
+
fh.write("\n".join(rows) + "\n")
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ── Scikit-learn Iterative Imputation ──────────────────────────────────────────
|
|
197
|
+
|
|
198
|
+
def impute_train(train_df, target_col="y", max_decimals_cap=6, random_state=None):
|
|
199
|
+
"""
|
|
200
|
+
Fits a scikit-learn iterative imputation kernel exclusively on training predictors.
|
|
201
|
+
|
|
202
|
+
Fits the kernel dynamically on purely continuous or purely categorical data,
|
|
203
|
+
while masking out the regression target to avoid data leakage metrics.
|
|
204
|
+
|
|
205
|
+
Parameters
|
|
206
|
+
----------
|
|
207
|
+
train_df : pandas.DataFrame
|
|
208
|
+
The full training dataset layout.
|
|
209
|
+
target_col : str, default='y'
|
|
210
|
+
The target column to explicitly exclude from the MICE calculation.
|
|
211
|
+
max_decimals_cap : int or None, default=6
|
|
212
|
+
Rounds outputs structurally ensuring the precision logic remains stable across
|
|
213
|
+
iterations of mice imputation.
|
|
214
|
+
random_state : int or None, default=None
|
|
215
|
+
Ensures imputation reproducibility across instances and permutations.
|
|
216
|
+
|
|
217
|
+
Returns
|
|
218
|
+
-------
|
|
219
|
+
dict
|
|
220
|
+
A mapping containing:
|
|
221
|
+
- 'kernel' (`sklearn.impute.IterativeImputer`): the fitted model block used for downstream targets.
|
|
222
|
+
- 'train_imp' (`pandas.DataFrame`): the iteratively imputed layout identical to `train_df`.
|
|
223
|
+
- 'dec_map' (dict): numeric precision mappings computed from the un-imputed references.
|
|
224
|
+
- 'range_map' (dict): feature scale caps defining numeric thresholds mappings.
|
|
225
|
+
|
|
226
|
+
Raises
|
|
227
|
+
------
|
|
228
|
+
ImportError
|
|
229
|
+
If `scikit-learn` is missing internally.
|
|
230
|
+
"""
|
|
231
|
+
try:
|
|
232
|
+
from sklearn.experimental import enable_iterative_imputer # noqa: F401
|
|
233
|
+
from sklearn.impute import IterativeImputer # noqa: F401
|
|
234
|
+
from sklearn.ensemble import ExtraTreesRegressor
|
|
235
|
+
import pandas as pd
|
|
236
|
+
except ImportError:
|
|
237
|
+
raise ImportError(
|
|
238
|
+
"scikit-learn is required for imputation. Install with: pip install scikit-learn"
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
train_x = train_df.drop(columns=[target_col])
|
|
242
|
+
|
|
243
|
+
dec_map = decimal_map_from_reference(train_x, max_decimals_cap=max_decimals_cap)
|
|
244
|
+
range_map = range_map_from_reference(train_x)
|
|
245
|
+
|
|
246
|
+
kernel = IterativeImputer(
|
|
247
|
+
estimator=ExtraTreesRegressor(n_estimators=10, random_state=random_state),
|
|
248
|
+
max_iter=3,
|
|
249
|
+
random_state=random_state
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
train_x_imp_array = kernel.fit_transform(train_x)
|
|
253
|
+
train_x_imp = pd.DataFrame(train_x_imp_array, columns=train_x.columns, index=train_x.index)
|
|
254
|
+
|
|
255
|
+
train_x_imp = apply_range_cap(train_x_imp, range_map)
|
|
256
|
+
train_x_imp = apply_decimal_map(train_x_imp, dec_map)
|
|
257
|
+
|
|
258
|
+
train_imp = train_x_imp.copy()
|
|
259
|
+
train_imp[target_col] = train_df[target_col].values
|
|
260
|
+
# Restore original column order
|
|
261
|
+
train_imp = train_imp[list(train_df.columns)]
|
|
262
|
+
|
|
263
|
+
return {
|
|
264
|
+
"kernel": kernel,
|
|
265
|
+
"train_imp": train_imp,
|
|
266
|
+
"dec_map": dec_map,
|
|
267
|
+
"range_map": range_map,
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def impute_test(train_imp_df, test_df, dec_map, range_map, kernel, target_col="y"):
|
|
272
|
+
"""
|
|
273
|
+
Applies the pre-fitted imputation kernel block explicitly onto the test predictors.
|
|
274
|
+
|
|
275
|
+
This ensures no data leakage occurs from test feature spaces onto the training
|
|
276
|
+
features. Applies previous categorical alignments mapping values independently.
|
|
277
|
+
|
|
278
|
+
Parameters
|
|
279
|
+
----------
|
|
280
|
+
train_imp_df : pandas.DataFrame
|
|
281
|
+
The imputed layout of the corresponding train set.
|
|
282
|
+
test_df : pandas.DataFrame
|
|
283
|
+
The missing value-prone test layout containing identical structures.
|
|
284
|
+
dec_map : dict
|
|
285
|
+
A map defining categorical rounding sizes.
|
|
286
|
+
range_map : dict
|
|
287
|
+
A map describing limit scaling boundaries.
|
|
288
|
+
kernel : sklearn.impute.IterativeImputer
|
|
289
|
+
The underlying iterative mice engine holding the target parameters.
|
|
290
|
+
target_col : str, default='y'
|
|
291
|
+
String column identifier to avoid metric imputation leaks.
|
|
292
|
+
|
|
293
|
+
Returns
|
|
294
|
+
-------
|
|
295
|
+
pandas.DataFrame
|
|
296
|
+
The fully iteratively processed `test_df` preserving its baseline structure.
|
|
297
|
+
|
|
298
|
+
Raises
|
|
299
|
+
------
|
|
300
|
+
ImportError
|
|
301
|
+
When unable to import `scikit-learn`.
|
|
302
|
+
"""
|
|
303
|
+
try:
|
|
304
|
+
from sklearn.experimental import enable_iterative_imputer # noqa: F401
|
|
305
|
+
from sklearn.impute import IterativeImputer # noqa: F401
|
|
306
|
+
import pandas as pd
|
|
307
|
+
except ImportError:
|
|
308
|
+
raise ImportError(
|
|
309
|
+
"scikit-learn is required for imputation. Install with: pip install scikit-learn"
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
train_x_imp = train_imp_df.drop(columns=[target_col])
|
|
313
|
+
test_x = test_df.drop(columns=[target_col])
|
|
314
|
+
test_x = align_categories(train_x_imp, test_x)
|
|
315
|
+
|
|
316
|
+
# Modification to prevent failure when the test set has no NaN values during imputation
|
|
317
|
+
|
|
318
|
+
# Columns containing NaN values in the test set
|
|
319
|
+
test_nan_cols = set(test_x.columns[test_x.isna().any()])
|
|
320
|
+
|
|
321
|
+
# Only perform imputation if there is a real intersection
|
|
322
|
+
if len(test_nan_cols) > 0:
|
|
323
|
+
test_x_imp_array = kernel.transform(test_x)
|
|
324
|
+
test_x_imp = pd.DataFrame(test_x_imp_array, columns=test_x.columns, index=test_x.index)
|
|
325
|
+
else:
|
|
326
|
+
test_x_imp = test_x.copy()
|
|
327
|
+
|
|
328
|
+
test_x_imp = apply_range_cap(test_x_imp, range_map)
|
|
329
|
+
test_x_imp = apply_decimal_map(test_x_imp, dec_map)
|
|
330
|
+
|
|
331
|
+
test_imp = test_x_imp.copy()
|
|
332
|
+
test_imp[target_col] = test_df[target_col].values
|
|
333
|
+
test_imp = test_imp[list(test_df.columns)]
|
|
334
|
+
|
|
335
|
+
return test_imp
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
# ── Utilities ─────────────────────────────────────────────────────────────────
|
|
339
|
+
|
|
340
|
+
def split_features_target(df, target_col="y"):
|
|
341
|
+
"""
|
|
342
|
+
Separates a DataFrame into its features (X) and target variable (y).
|
|
343
|
+
|
|
344
|
+
Convenience functional wrapper that logically splits data.
|
|
345
|
+
|
|
346
|
+
Parameters
|
|
347
|
+
----------
|
|
348
|
+
df : pandas.DataFrame
|
|
349
|
+
The full dataset containing predictors and target.
|
|
350
|
+
target_col : str, default='y'
|
|
351
|
+
The name of the column explicitly denoting the regression target.
|
|
352
|
+
|
|
353
|
+
Returns
|
|
354
|
+
-------
|
|
355
|
+
tuple of (pandas.DataFrame, pandas.Series)
|
|
356
|
+
A structural tuple mapping `(X, y)` subsets sequentially.
|
|
357
|
+
"""
|
|
358
|
+
return df.drop(columns=[target_col]), df[target_col]
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def encode_categoricals(df, target_col="y", drop_first=True):
|
|
362
|
+
"""
|
|
363
|
+
Identifies and one-hot encodes all categorical/string predictors in a DataFrame.
|
|
364
|
+
|
|
365
|
+
Parameters
|
|
366
|
+
----------
|
|
367
|
+
df : pandas.DataFrame
|
|
368
|
+
The dataset to process.
|
|
369
|
+
target_col : str, default='y'
|
|
370
|
+
The name of the target column, which should be excluded from encoding.
|
|
371
|
+
drop_first : bool, default=True
|
|
372
|
+
Whether to drop the first category column to avoid the collinearity trap.
|
|
373
|
+
|
|
374
|
+
Returns
|
|
375
|
+
-------
|
|
376
|
+
pandas.DataFrame
|
|
377
|
+
The DataFrame with encoded features.
|
|
378
|
+
"""
|
|
379
|
+
import pandas as pd
|
|
380
|
+
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
|
|
381
|
+
if target_col in cat_cols:
|
|
382
|
+
cat_cols.remove(target_col)
|
|
383
|
+
|
|
384
|
+
if cat_cols:
|
|
385
|
+
print(f"[Encoding] Categorical variables detected. Applying pd.get_dummies to: {cat_cols}")
|
|
386
|
+
df = pd.get_dummies(df, columns=cat_cols, drop_first=drop_first, dtype=int)
|
|
387
|
+
|
|
388
|
+
return df
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def get_percentages(ds_names, ds_location, phi_thr=0.8):
|
|
392
|
+
"""
|
|
393
|
+
Computes the percentage of 'rare' (extreme) cases per dataset.
|
|
394
|
+
|
|
395
|
+
This function analyzes the target variable distribution and checks what percentage
|
|
396
|
+
of instances exceed the relevance threshold. It is essential for quantifying the
|
|
397
|
+
imbalance severity.
|
|
398
|
+
|
|
399
|
+
Parameters
|
|
400
|
+
----------
|
|
401
|
+
ds_names : list of str
|
|
402
|
+
A list containing the names of the dataset files (e.g., ['abalone.dat', 'bos.dat']).
|
|
403
|
+
ds_location : str or Path
|
|
404
|
+
The directory path where the dataset files are located.
|
|
405
|
+
phi_thr : float, default=0.8
|
|
406
|
+
The relevance threshold [0, 1] above which an instance is considered 'rare' or extreme.
|
|
407
|
+
|
|
408
|
+
Returns
|
|
409
|
+
-------
|
|
410
|
+
pd.DataFrame
|
|
411
|
+
A DataFrame with two columns: 'dataset' (the name of the file) and
|
|
412
|
+
'relevant_pct' (the percentage of rare cases, float).
|
|
413
|
+
|
|
414
|
+
Notes
|
|
415
|
+
-----
|
|
416
|
+
Uses `phi_control` with method "extremes" and `phi` internally.
|
|
417
|
+
"""
|
|
418
|
+
from .stratification import phi_control, phi # local import avoids circular dep
|
|
419
|
+
|
|
420
|
+
rows = []
|
|
421
|
+
for nm in ds_names:
|
|
422
|
+
d = read_dataset(nm, ds_location)
|
|
423
|
+
ctrl = phi_control(d["y"].values, method="extremes")
|
|
424
|
+
p = phi(d["y"].values, ctrl)
|
|
425
|
+
pct = float((p >= phi_thr).sum()) / len(d) * 100
|
|
426
|
+
rows.append({"dataset": nm, "relevant_pct": pct})
|
|
427
|
+
return pd.DataFrame(rows)
|