imbreg 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imbreg/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ """
2
+ imbreg - Imbalanced Regression library.
3
+
4
+ Public API
5
+ ----------
6
+ Phi relevance
7
+ phi_control - build relevance control structure
8
+ phi - evaluate relevance for target values
9
+
10
+ Resampling
11
+ dibs_regress - DIBS strategy resampling (SmoteR + GaussNoise)
12
+
13
+ Stratification / CV
14
+ cv_partitions - repeated K-fold CV with optional SMOGN + imputation
15
+ make_folds - generate fold indices (stratified or random)
16
+
17
+ Data I/O
18
+ read_dataset - read KEEL-style .dat, csv and arff datasets
19
+ write_dataset - write datasets (CSV/KEEL)
20
+ get_percentages - compute % of rare cases per dataset
21
+ split_features_target - convenience X / y split
22
+ """
23
+
24
+ from .stratification import phi_control, phi, cv_partitions, make_folds
25
+ from .resampling import dibs_regress, safe_dibs_regress
26
+ from .data_loader import (
27
+ read_dataset,
28
+ write_dataset,
29
+ get_percentages,
30
+ split_features_target,
31
+ encode_categoricals,
32
+ impute_train,
33
+ impute_test,
34
+ )
35
+ from .plots import (
36
+ plot_target_distribution,
37
+ plot_scatter_2d,
38
+ plot_scatter_3d,
39
+ plot_prediction_error,
40
+ )
41
+ from .metrics import utility_f1_score, sera_score
42
+ from .validation import evaluate_folds, export_experiment_summaries, evaluate_predictions_from_files
43
+
44
+ __version__ = "0.1.0"
45
+
46
# Public names exported by `from imbreg import *`.
# Every entry must be bound by one of this module's imports: a name listed
# here but not defined in the module makes the star-import raise
# AttributeError. The previously listed "train_extra_trees", "train_xgboost"
# and "predict_model" were never imported by this module, so they are not
# exported.
__all__ = [
    # Phi relevance
    "phi_control",
    "phi",
    # Resampling
    "dibs_regress",
    "safe_dibs_regress",
    # CV partitioning
    "cv_partitions",
    "make_folds",
    # Data I/O
    "read_dataset",
    "write_dataset",
    "get_percentages",
    "split_features_target",
    "encode_categoricals",
    # Imputation
    "impute_train",
    "impute_test",
    # Visualization
    "plot_target_distribution",
    "plot_scatter_2d",
    "plot_scatter_3d",
    "plot_prediction_error",
    # Metrics / Validation
    "utility_f1_score",
    "sera_score",
    "evaluate_folds",
    "export_experiment_summaries",
    "evaluate_predictions_from_files",
]
imbreg/data_loader.py ADDED
@@ -0,0 +1,427 @@
1
+ """
2
+ data_loader.py - KEEL dataset I/O, imputation helpers, and dataset utilities.
3
+ """
4
+ import re
5
+ from io import StringIO
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+
10
+ from .utils import (
11
+ align_categories,
12
+ apply_decimal_map,
13
+ apply_range_cap,
14
+ decimal_map_from_reference,
15
+ range_map_from_reference,
16
+ )
17
+
18
+
19
+ # ── KEEL reader ───────────────────────────────────────────────────────────────
20
+
21
def read_dataset(name, location="", infer_categoricals=True, max_levels=5, max_decimals=6):
    """
    Reads a dataset (.dat KEEL, .arff Weka, or .csv) into a DataFrame.

    Conventions:
    - For .dat/.arff: blank lines and lines starting with '@' or '%' are skipped;
      the remaining lines are parsed as header-less, comma-separated rows and the
      predictors are named 'V1'..'V(n-1)'.
    - For .csv: The first row is assumed to be the header.
    - Missing values are '?' or '<null>' (the default token emitted by
      `write_dataset`, so written files can be read back). For .csv, 'NA' is
      accepted as well.
    - The LAST column is treated as the regression target and renamed to 'y'.
    - Low-cardinality numeric columns are cast to categorical if
      `infer_categoricals` is True.

    Parameters
    ----------
    name : str
        The name of the dataset file (e.g., 'abalone.dat', 'data.csv', 'data.arff').
    location : str or Path, default=''
        The sub-path or directory containing the file.
    infer_categoricals : bool, default=True
        If True, numeric predictors with more than one and at most `max_levels`
        distinct non-missing values are converted to categoricals.
    max_levels : int, default=5
        Largest number of distinct values for which a numeric predictor is still
        converted to categorical.
    max_decimals : int or None, default=6
        If set, non-categorical numeric columns (including 'y') are rounded to
        this many decimals.

    Returns
    -------
    pandas.DataFrame
        The loaded data with the target in the last column, named 'y'.

    Raises
    ------
    ValueError
        If a .dat/.arff file contains no data lines.
    """
    path = Path(".") / location / name

    # '<null>' is what write_dataset emits for missing values by default; without
    # it a round-tripped numeric column would be parsed as text.
    missing_tokens = ["?", "<null>"]

    if path.suffix.lower() == ".csv":
        raw = pd.read_csv(path, na_values=missing_tokens + ["NA"], skipinitialspace=True)
        # Rename target column (assume last column is target)
        cols = list(raw.columns)
        cols[-1] = "y"
        raw.columns = cols
    else:
        with open(path, "r", encoding="utf-8", errors="replace") as fh:
            lines = fh.readlines()

        # Keep only data rows: drop blank lines and '@'/'%' header or comment lines.
        data_lines = [
            ln for ln in lines if not ln.strip().startswith(("@", "%")) and ln.strip()
        ]
        if not data_lines:
            raise ValueError(f"No data lines found in {path}")

        raw = pd.read_csv(
            StringIO("".join(data_lines)),
            header=None,
            na_values=missing_tokens,
            skipinitialspace=True,
        )

        # Rename columns: predictors V1..V(n-1), target 'y'
        n_cols = raw.shape[1]
        raw.columns = [f"V{i + 1}" for i in range(n_cols - 1)] + ["y"]

    # 1) String predictors -> Categorical
    # NOTE(review): astype(str) stringifies every entry first, so on object
    # columns a missing entry presumably becomes a literal 'nan' level rather
    # than staying missing - confirm this is intended.
    for i in range(raw.shape[1] - 1):
        column = raw.iloc[:, i]
        if (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
            or column.dtype == str
        ):
            col_name = raw.columns[i]
            raw[col_name] = column.astype(str).str.strip().astype("category")

    # 2) Low-cardinality numeric predictors -> Categorical (mirrors R factor)
    if infer_categoricals:
        for col in raw.columns[:-1]:
            if pd.api.types.is_numeric_dtype(raw[col]) and not isinstance(raw[col].dtype, pd.CategoricalDtype):
                n_uniq = raw[col].dropna().nunique()
                if 1 < n_uniq <= max_levels:
                    raw[col] = raw[col].astype("category")

    # 3) Cap decimals on remaining numeric columns (including y)
    if max_decimals is not None:
        for col in raw.columns:
            if pd.api.types.is_numeric_dtype(raw[col]) and not isinstance(raw[col].dtype, pd.CategoricalDtype):
                raw[col] = raw[col].round(max_decimals)

    return raw
107
+
108
+
109
+ # ── KEEL writer ───────────────────────────────────────────────────────────────
110
+
111
def write_dataset(df, file_path, template_path=None, na_symbol="<null>", sep=",", dec=".", out_format="dat"):
    """
    Writes a DataFrame as a dataset file (.dat, .arff, or .csv).

    For .dat and .arff, the header is copied from the template file up to and
    including its '@data' line before the row values are appended. If no
    template is given, the template does not exist, or it has no '@data' line,
    a basic header declaring every column as 'real' is generated instead.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to serialize.
    file_path : str or pathlib.Path
        The path where the resulting file will be written. Its suffix is
        replaced by `.{out_format}` when it differs.
    template_path : str or pathlib.Path, optional
        A file whose meta-header (everything up to '@data') is reused.
    na_symbol : str, default='<null>'
        The token written for missing values.
    sep : str, default=','
        The field separator. Honoured for every output format, CSV included
        (for CSV it is forwarded to `DataFrame.to_csv`).
    dec : str, default='.'
        The decimal separator for floating point values. Honoured for every
        output format, CSV included.
    out_format : str, default='dat'
        The format of the output ('dat', 'arff', 'csv').

    Returns
    -------
    None
    """
    # Local import: only needed to expand exponent notation below.
    from decimal import Decimal

    file_path = Path(file_path)

    # Make sure we enforce the requested extension
    if file_path.suffix != f".{out_format}":
        file_path = file_path.with_suffix(f".{out_format}")

    # CSV goes through the pandas exporter; forward the separators so the
    # documented `sep` / `dec` parameters are not silently ignored.
    if out_format.lower() == "csv":
        df.to_csv(file_path, index=False, na_rep=na_symbol, sep=sep, decimal=dec)
        return

    # For KEEL/ARFF formats, extract the header block from the template
    header = None
    if template_path and Path(template_path).exists():
        with open(template_path, "r", encoding="utf-8", errors="replace") as fh:
            tpl_lines = fh.readlines()

        for i, line in enumerate(tpl_lines):
            if re.match(r"^\s*@data\s*$", line, re.IGNORECASE):
                header = tpl_lines[: i + 1]
                break

    if header is None:
        # Fallback: generate a basic KEEL/ARFF header dynamically
        header = [f"@relation {file_path.stem}\n"]
        header.extend(f"@attribute {col} real\n" for col in df.columns)
        header.append("@data\n")
    elif not header[-1].endswith("\n"):
        # '@data' was the template's last line without a newline; terminate it
        # so the first data row does not end up glued to the marker.
        header[-1] += "\n"

    # Build output rows
    rows = []
    for _, row in df.iterrows():
        vals = []
        for v in row:
            if isinstance(v, str):
                # Stringified missing markers count as missing too.
                missing = v in ("nan", "None", "<NA>")
            else:
                missing = pd.isna(v)

            if missing:
                vals.append(na_symbol)
            elif isinstance(v, float):
                # 10 significant digits; '.10g' switches to exponent notation
                # for very small/large magnitudes, so expand it back to plain
                # positional digits ('inf' contains no 'e' and is left as is).
                s = f"{v:.10g}"
                if "e" in s:
                    s = format(Decimal(s), "f")
                if dec != ".":
                    s = s.replace(".", dec)
                vals.append(s)
            else:
                vals.append(str(v))
        rows.append(sep.join(vals))

    with open(file_path, "w", encoding="utf-8", newline="") as fh:
        fh.writelines(header)
        fh.write("\n".join(rows) + "\n")
194
+
195
+
196
+ # ── Scikit-learn Iterative Imputation ──────────────────────────────────────────
197
+
198
def impute_train(train_df, target_col="y", max_decimals_cap=6, random_state=None):
    """
    Fits an iterative imputer on the training predictors and imputes them.

    The target column is removed before fitting, so the imputer only ever sees
    predictor columns; the original target values are re-attached afterwards.

    Parameters
    ----------
    train_df : pandas.DataFrame
        The full training dataset, including `target_col`.
    target_col : str, default='y'
        The column excluded from imputation.
    max_decimals_cap : int or None, default=6
        Forwarded to `decimal_map_from_reference` when building the precision map.
    random_state : int or None, default=None
        Seed passed to both the imputer and its ExtraTrees estimator.

    Returns
    -------
    dict
        A mapping containing:
        - 'kernel': the fitted `sklearn.impute.IterativeImputer`.
        - 'train_imp': the imputed DataFrame, same column order as `train_df`.
        - 'dec_map': result of `decimal_map_from_reference` on the un-imputed predictors.
        - 'range_map': result of `range_map_from_reference` on the un-imputed predictors.

    Raises
    ------
    ImportError
        If the scikit-learn imports fail.
    """
    try:
        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
        from sklearn.impute import IterativeImputer  # noqa: F401
        from sklearn.ensemble import ExtraTreesRegressor
        import pandas as pd
    except ImportError:
        raise ImportError(
            "scikit-learn is required for imputation. Install with: pip install scikit-learn"
        )

    predictors = train_df.drop(columns=[target_col])

    # Both maps are derived from the raw (pre-imputation) predictors.
    precision_map = decimal_map_from_reference(predictors, max_decimals_cap=max_decimals_cap)
    bounds_map = range_map_from_reference(predictors)

    tree_estimator = ExtraTreesRegressor(n_estimators=10, random_state=random_state)
    imputer = IterativeImputer(
        estimator=tree_estimator,
        max_iter=3,
        random_state=random_state,
    )

    filled_values = imputer.fit_transform(predictors)
    filled = pd.DataFrame(filled_values, columns=predictors.columns, index=predictors.index)

    # Post-process imputed values: range cap first, then decimal map.
    filled = apply_range_cap(filled, bounds_map)
    filled = apply_decimal_map(filled, precision_map)

    # Re-attach the untouched target and restore the original column order.
    imputed_train = filled.copy()
    imputed_train[target_col] = train_df[target_col].values
    imputed_train = imputed_train[list(train_df.columns)]

    result = {
        "kernel": imputer,
        "train_imp": imputed_train,
        "dec_map": precision_map,
        "range_map": bounds_map,
    }
    return result
269
+
270
+
271
def impute_test(train_imp_df, test_df, dec_map, range_map, kernel, target_col="y"):
    """
    Imputes the test predictors with an imputer that was fitted on the train set.

    The target column is dropped before imputation and re-attached afterwards.
    Test predictors are first passed through `align_categories` together with
    the imputed train predictors. The fitted `kernel` is only invoked when the
    test predictors actually contain missing values; otherwise they are copied
    unchanged. In both cases the range cap and decimal map are applied.

    Parameters
    ----------
    train_imp_df : pandas.DataFrame
        The imputed train set (including `target_col`).
    test_df : pandas.DataFrame
        The test set to impute (including `target_col`).
    dec_map : dict
        Precision map passed to `apply_decimal_map`.
    range_map : dict
        Range map passed to `apply_range_cap`.
    kernel : sklearn.impute.IterativeImputer
        The already-fitted imputer; only its `transform` is called.
    target_col : str, default='y'
        The column excluded from imputation.

    Returns
    -------
    pandas.DataFrame
        The processed test set with the same column order as `test_df`.

    Raises
    ------
    ImportError
        If the scikit-learn imports fail.
    """
    try:
        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
        from sklearn.impute import IterativeImputer  # noqa: F401
        import pandas as pd
    except ImportError:
        raise ImportError(
            "scikit-learn is required for imputation. Install with: pip install scikit-learn"
        )

    reference_x = train_imp_df.drop(columns=[target_col])
    candidates = test_df.drop(columns=[target_col])
    candidates = align_categories(reference_x, candidates)

    # Skip the imputer entirely when no test predictor has a missing value.
    if candidates.isna().any().any():
        filled = pd.DataFrame(
            kernel.transform(candidates),
            columns=candidates.columns,
            index=candidates.index,
        )
    else:
        filled = candidates.copy()

    filled = apply_range_cap(filled, range_map)
    filled = apply_decimal_map(filled, dec_map)

    # Re-attach the untouched target and restore the original column order.
    imputed_test = filled.copy()
    imputed_test[target_col] = test_df[target_col].values
    return imputed_test[list(test_df.columns)]
336
+
337
+
338
+ # ── Utilities ─────────────────────────────────────────────────────────────────
339
+
340
def split_features_target(df, target_col="y"):
    """
    Splits a DataFrame into predictors (X) and the target column (y).

    Parameters
    ----------
    df : pandas.DataFrame
        The full dataset containing predictors and target.
    target_col : str, default='y'
        The name of the regression target column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        `(X, y)`: `df` without `target_col`, and the `target_col` column itself.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    return features, target
359
+
360
+
361
def encode_categoricals(df, target_col="y", drop_first=True):
    """
    One-hot encodes every object/category predictor column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to process.
    target_col : str, default='y'
        The target column; it is never encoded, even if it is categorical.
    drop_first : bool, default=True
        Forwarded to `pd.get_dummies`: drop the first level of each encoded column.

    Returns
    -------
    pandas.DataFrame
        The encoded DataFrame (dummy columns as int), or `df` itself when there
        is nothing to encode.
    """
    import pandas as pd

    to_encode = list(df.select_dtypes(include=['object', 'category']).columns)
    try:
        to_encode.remove(target_col)
    except ValueError:
        # Target is not among the categorical columns; nothing to exclude.
        pass

    if not to_encode:
        return df

    print(f"[Encoding] Categorical variables detected. Applying pd.get_dummies to: {to_encode}")
    return pd.get_dummies(df, columns=to_encode, drop_first=drop_first, dtype=int)
389
+
390
+
391
def get_percentages(ds_names, ds_location, phi_thr=0.8):
    """
    Computes, per dataset, the percentage of rows whose relevance reaches a threshold.

    Each dataset is loaded with `read_dataset`, a relevance control is built from
    its 'y' column via `phi_control(..., method="extremes")`, and `phi` is
    evaluated on the same values. The reported figure is the share of rows with
    relevance >= `phi_thr`, expressed in percent.

    Parameters
    ----------
    ds_names : list of str
        Dataset file names (e.g., ['abalone.dat', 'bos.dat']).
    ds_location : str or Path
        The directory containing the dataset files.
    phi_thr : float, default=0.8
        Relevance threshold at or above which a row counts as 'rare'.

    Returns
    -------
    pd.DataFrame
        One row per dataset with columns 'dataset' (the file name) and
        'relevant_pct' (float percentage).
    """
    from .stratification import phi_control, phi  # local import avoids circular dep

    summary = []
    for dataset_name in ds_names:
        frame = read_dataset(dataset_name, ds_location)
        target_values = frame["y"].values
        control = phi_control(target_values, method="extremes")
        relevance = phi(frame["y"].values, control)
        n_relevant = float((relevance >= phi_thr).sum())
        share = n_relevant / len(frame) * 100
        summary.append({"dataset": dataset_name, "relevant_pct": share})
    return pd.DataFrame(summary)