balancr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- balancr/__init__.py +13 -0
- balancr/base.py +14 -0
- balancr/classifier_registry.py +300 -0
- balancr/cli/__init__.py +0 -0
- balancr/cli/commands.py +1838 -0
- balancr/cli/config.py +165 -0
- balancr/cli/main.py +778 -0
- balancr/cli/utils.py +101 -0
- balancr/data/__init__.py +5 -0
- balancr/data/loader.py +59 -0
- balancr/data/preprocessor.py +556 -0
- balancr/evaluation/__init__.py +19 -0
- balancr/evaluation/metrics.py +442 -0
- balancr/evaluation/visualisation.py +660 -0
- balancr/imbalance_analyser.py +677 -0
- balancr/technique_registry.py +284 -0
- balancr/techniques/__init__.py +4 -0
- balancr/techniques/custom/__init__.py +0 -0
- balancr/techniques/custom/example_custom_technique.py +27 -0
- balancr-0.1.0.dist-info/LICENSE +21 -0
- balancr-0.1.0.dist-info/METADATA +536 -0
- balancr-0.1.0.dist-info/RECORD +25 -0
- balancr-0.1.0.dist-info/WHEEL +5 -0
- balancr-0.1.0.dist-info/entry_points.txt +2 -0
- balancr-0.1.0.dist-info/top_level.txt +1 -0
balancr/data/preprocessor.py
@@ -0,0 +1,556 @@
import logging
from typing import List, Tuple, Dict, Any, Optional
import numpy as np
import pandas as pd
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    LabelEncoder,
)
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import FeatureHasher


class DataPreprocessor:
    """Handles data preprocessing operations"""

    def __init__(self):
        self.scaler = None
        self.imputer = None
        self.label_encoder = None
        self.feature_names = None
        self.categorical_features = None
        self.numerical_features = None

    def inspect_class_distribution(self, y: np.ndarray) -> Dict[Any, int]:
        """
        Inspect the distribution of classes in the target variable

        Args:
            y: Target vector

        Returns:
            Dictionary mapping class labels to their counts
        """
        unique, counts = np.unique(y, return_counts=True)
        return dict(zip(unique, counts))

    def check_data_quality(
        self,
        X: np.ndarray,
        feature_names: Optional[list] = None,
        correlation_threshold: float = 0.95,
    ) -> Dict[str, list]:
        """
        Check data quality issues

        Args:
            X: Feature matrix (numpy array or pandas DataFrame)
            feature_names: Optional list of feature names
            correlation_threshold: Absolute correlation above which a pair of
                numeric features is flagged (must be between 0 and 1)

        Returns:
            Dictionary containing quality metrics with more descriptive information
        """
        # Validate correlation threshold
        if not (0 <= correlation_threshold <= 1):
            raise ValueError("correlation_threshold must be between 0 and 1")

        # Handle input that is already a DataFrame or convert numpy array to DataFrame
        if not isinstance(X, pd.DataFrame):
            if feature_names is None:
                feature_names = [f"Feature_{i}" for i in range(X.shape[1])]
            X_df = pd.DataFrame(X, columns=feature_names)
        else:
            X_df = X
            if feature_names is None:
                feature_names = X_df.columns.tolist()

        # Check for missing values and create a more descriptive result
        missing_values_counts = X_df.isna().sum()
        missing_values = []
        for feature_name, count in missing_values_counts.items():
            if count > 0:
                missing_values.append((feature_name, int(count)))

        quality_report = {
            "missing_values": missing_values,
            "constant_features": [],
            "feature_correlations": [],
        }

        # Check for constant features
        constant_features = []
        for i, col in enumerate(X_df.columns):
            if X_df[col].nunique(dropna=True) <= 1:
                constant_features.append((col, i))

        quality_report["constant_features"] = constant_features

        # Calculate correlations only on numeric columns
        numeric_cols = X_df.select_dtypes(include=np.number).columns

        if len(numeric_cols) > 1 and X_df.shape[0] > 1:
            try:
                correlations = X_df[numeric_cols].corr()
                high_corr_pairs = []

                for i in range(len(correlations.columns)):
                    for j in range(i + 1, len(correlations.columns)):
                        if abs(correlations.iloc[i, j]) > correlation_threshold:
                            # Map back to original column names
                            col_i = correlations.columns[i]
                            col_j = correlations.columns[j]
                            high_corr_pairs.append(
                                (col_i, col_j, correlations.iloc[i, j])
                            )
                quality_report["feature_correlations"] = high_corr_pairs
            except Exception:
                # In case of correlation calculation errors
                quality_report["feature_correlations"] = []
        else:
            quality_report["feature_correlations"] = []

        return quality_report

    def preprocess(
        self,
        X: np.ndarray,
        y: np.ndarray,
        handle_missing: str = "mean",
        scale: str = "standard",
        handle_constant_features: str = "none",
        handle_correlations: str = "none",
        constant_features: Optional[List[str]] = None,
        correlated_features: Optional[List[Tuple[str, str, float]]] = None,
        categorical_features: Optional[Dict[str, str]] = None,
        all_features: Optional[List[str]] = None,
        encode_target: bool = True,
        hash_components_dict: Optional[Dict[str, int]] = None,
    ) -> Tuple[pd.DataFrame, np.ndarray]:
        """
        Preprocess the data with enhanced options for categorical features.

        Args:
            X: Feature matrix
            y: Target vector
            handle_missing: Strategy to handle missing values
                ("drop", "mean", "median", "mode", "none")
            scale: Scaling method
                ("standard", "minmax", "robust", "none")
            handle_constant_features: "drop" to remove the features listed in
                constant_features, "none" to keep them
            handle_correlations: Strategy for the pairs in correlated_features
                ("drop_first", "drop_lowest", "pca", "none")
            constant_features: Names of constant features (e.g. from check_data_quality)
            correlated_features: List of (feature1, feature2, correlation) tuples
            categorical_features: Dictionary mapping categorical column names to encoding methods
                Each column will be encoded according to its specified method:
                "onehot", "hash", "label", "ordinal", or "none"
            all_features: List of all feature column names. If provided, these will be used
                as column names for the DataFrame.
            encode_target: Whether to label-encode a non-numeric target vector
            hash_components_dict: Optional mapping of column names to hash component
                counts (accepted but not used directly in this method)

        Returns:
            Preprocessed X (as DataFrame with proper column names) and y
        """
        # Initialise categorical_features dictionary if None
        if categorical_features is None:
            categorical_features = {}
        if hash_components_dict is None:
            hash_components_dict = {}

        # Convert to DataFrame for more flexible processing if not already
        if not isinstance(X, pd.DataFrame):
            if all_features and len(all_features) == X.shape[1]:
                # If all feature names are provided, use them as column names
                X = pd.DataFrame(X, columns=all_features)
            else:
                # Use generic column names if not provided
                X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])

        # Convert y to Series if it's not already
        if not isinstance(y, pd.Series):
            y = pd.Series(y)

        # Store the initial feature names for reference
        self.feature_names = X.columns.tolist()

        # Handle constant features
        if (
            handle_constant_features == "drop"
            and constant_features
            and len(constant_features) > 0
        ):
            # Log the constant features being dropped
            logging.debug(
                f"Dropping {len(constant_features)} constant features: {constant_features}"
            )

            # Drop the constant features
            X = X.drop(columns=constant_features)

            # Update feature names
            self.feature_names = X.columns.tolist()

        # Handle highly correlated features
        if (
            handle_correlations != "none"
            and correlated_features
            and len(correlated_features) > 0
        ):
            # Create a list to track features to drop
            features_to_drop = []

            # Mapping for PCA components if needed
            pca_mapping = {}

            # Process each correlated pair
            for feature1, feature2, corr_value in correlated_features:
                # Skip if either feature has already been dropped
                if feature1 in features_to_drop or feature2 in features_to_drop:
                    continue

                if handle_correlations == "drop_first":
                    # Drop the first feature in the pair
                    features_to_drop.append(feature1)
                    logging.debug(
                        f"Dropping {feature1} (first feature in correlation pair "
                        f"with {feature2}, corr={corr_value:.2f})"
                    )

                elif handle_correlations == "drop_lowest":
                    # Calculate variance of both features
                    var1 = X[feature1].var()
                    var2 = X[feature2].var()

                    # Drop the feature with lower variance
                    if var1 < var2:
                        features_to_drop.append(feature1)
                        logging.debug(
                            f"Dropping {feature1} (lower variance in correlation "
                            f"pair with {feature2}, corr={corr_value:.2f})"
                        )
                    else:
                        features_to_drop.append(feature2)
                        logging.debug(
                            f"Dropping {feature2} (lower variance in correlation "
                            f"pair with {feature1}, corr={corr_value:.2f})"
                        )

                elif handle_correlations == "pca":
                    # Only process each pair once
                    pair_key = tuple(sorted([feature1, feature2]))
                    if pair_key in pca_mapping:
                        continue

                    # Get the data for these two features
                    pair_data = X[[feature1, feature2]]

                    try:
                        # Apply PCA to reduce to one component
                        from sklearn.decomposition import PCA

                        pca = PCA(n_components=1)
                        pca_result = pca.fit_transform(pair_data)

                        # Create a new feature name for the PCA component
                        pca_feature = f"pca_{feature1}_{feature2}"

                        # Add the PCA component as a new column
                        X[pca_feature] = pca_result

                        # Mark both original features for dropping
                        features_to_drop.extend([feature1, feature2])

                        # Store the mapping for reference
                        pca_mapping[pair_key] = pca_feature

                        logging.debug(
                            f"Applied PCA to correlated features {feature1} and {feature2} (corr={corr_value:.2f})"
                        )
                    except Exception as e:
                        logging.warning(
                            f"Failed to apply PCA to {feature1} and {feature2}: {str(e)}"
                        )

            # Drop all marked features at once
            if features_to_drop:
                X = X.drop(columns=features_to_drop)
                logging.debug(
                    f"Dropped {len(features_to_drop)} features due to high correlation"
                )

            # Update feature names
            self.feature_names = X.columns.tolist()

        # Handle missing values
        if handle_missing != "none" and X.isna().any().any():
            if handle_missing == "drop":
                # Remove rows with any missing values
                mask = ~X.isna().any(axis=1)
                X = X[mask].copy()
                y = y[mask]
            else:
                # Use SimpleImputer for other strategies
                strategy = (
                    handle_missing
                    if handle_missing in ["mean", "median", "most_frequent"]
                    else "mean"
                )
                if handle_missing == "mode":
                    strategy = "most_frequent"

                # Identify numerical columns (those not in categorical_features)
                numerical_cols = [
                    col for col in X.columns if col not in categorical_features
                ]

                # Apply imputation to numerical columns
                if numerical_cols:
                    imputer = SimpleImputer(strategy=strategy)
                    X[numerical_cols] = pd.DataFrame(
                        imputer.fit_transform(X[numerical_cols]),
                        columns=numerical_cols,
                        index=X.index,
                    )

                # For categorical columns with missing values, fill with mode
                for col in categorical_features:
                    if col in X.columns and X[col].isna().any():
                        X[col] = X[col].fillna(X[col].mode().iloc[0])

        # Apply scaling only to numerical features (those not in categorical_features)
        if scale != "none":
            numerical_cols = [
                col for col in X.columns if col not in categorical_features
            ]
            if numerical_cols:
                if scale == "standard":
                    scaler = StandardScaler()
                elif scale == "minmax":
                    scaler = MinMaxScaler()
                elif scale == "robust":
                    scaler = RobustScaler()
                else:
                    scaler = StandardScaler()  # Default

                X[numerical_cols] = pd.DataFrame(
                    scaler.fit_transform(X[numerical_cols]),
                    columns=numerical_cols,
                    index=X.index,
                )

        # Process categorical columns based on their specified encoding types
        for col, encoding_info in categorical_features.items():
            # Skip if column doesn't exist in the DataFrame
            if col not in X.columns:
                continue

            # Normalise encoding_info into (encoding_type, extra_info)
            if isinstance(encoding_info, list):
                encoding_type = encoding_info[0]
                extra_info = encoding_info[1:]
            else:
                encoding_type = encoding_info
                extra_info = []

            # Apply encoding based on type
            if encoding_type == "onehot":
                unique_count = X[col].nunique()
                if unique_count > 50:  # Threshold for "high cardinality"
                    logging.warning(
                        f"Column '{col}' has {unique_count} unique values. "
                        "One-hot encoding may create too many features."
                    )
                    if unique_count > 100:  # Very high cardinality threshold
                        logging.warning(
                            f"Falling back to hash encoding for column '{col}' "
                            "due to very high cardinality."
                        )
                        # Use 32 hash columns as a default fallback
                        n_components = 32

                        # Use feature hashing for high cardinality
                        hasher = FeatureHasher(
                            n_features=n_components, input_type="string"
                        )

                        # Convert column values to a format FeatureHasher can handle
                        feature_values = X[col].astype(str).tolist()
                        hashed_features = hasher.transform(
                            [[value] for value in feature_values]
                        )

                        # Create a DataFrame with descriptive column names
                        hashed_df = pd.DataFrame(
                            hashed_features.toarray(),
                            index=X.index,
                            columns=[f"{col}_hash_{i}" for i in range(n_components)],
                        )
                        # Drop original column and add hashed features
                        X = pd.concat([X.drop(col, axis=1), hashed_df], axis=1)
                    else:
                        # Proceed with one-hot encoding but warn the user
                        # Use descriptive column names: original_col_value
                        dummies = pd.get_dummies(
                            X[col], prefix=col, prefix_sep="_", drop_first=False
                        )
                        X = pd.concat([X.drop(col, axis=1), dummies], axis=1)
                else:
                    # Normal one-hot encoding for manageable cardinality
                    # Use descriptive column names: original_col_value
                    dummies = pd.get_dummies(
                        X[col], prefix=col, prefix_sep="_", drop_first=False
                    )
                    X = pd.concat([X.drop(col, axis=1), dummies], axis=1)

            elif encoding_type == "hash":
                # Get the number of components for this feature, if specified
                n_components = (
                    extra_info[0] if extra_info else 32
                )  # Default to 32 if not specified

                hasher = FeatureHasher(n_features=n_components, input_type="string")

                # Convert column values to a format FeatureHasher can handle
                feature_values = X[col].astype(str).tolist()
                hashed_features = hasher.transform(
                    [[value] for value in feature_values]
                )

                # Create a DataFrame with descriptive column names
                hashed_df = pd.DataFrame(
                    hashed_features.toarray(),
                    index=X.index,
                    columns=[f"{col}_hash_{i}" for i in range(n_components)],
                )
                # Drop original column and add hashed features
                X = pd.concat([X.drop(col, axis=1), hashed_df], axis=1)

            elif encoding_type == "label":
                # Label encode this column
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col])

            elif encoding_type == "ordinal":
                # Ordinal encode this column
                categories = X[col].unique()
                mapping = {cat: i for i, cat in enumerate(categories)}
                X[col] = X[col].map(mapping)

            # Skip if encoding type is "none"

        # Encode labels if necessary (for the target variable)
        if encode_target and not np.issubdtype(y.dtype, np.number):
            # Use label encoding for the target
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(y)

        # Store the column names for future reference
        self.feature_names = list(X.columns)

        # Return the DataFrame with column names preserved and the numpy array for y
        return X, y

    def assign_encoding_types(
        self,
        df,
        categorical_columns,
        encoding_type="auto",
        hash_components=32,
        ordinal_columns=None,
    ):
        """
        Assigns encoding types to categorical features based on user preference or automatic recommendation.

        Args:
            df: DataFrame containing the data
            categorical_columns: List of categorical feature column names
            encoding_type: Global encoding strategy: "auto", "onehot", "label", "ordinal", "hash", or "none"
            hash_components: Number of hash columns to use when "hash" encoding is assigned
            ordinal_columns: List of categorical columns that have a natural order

        Returns:
            Dictionary mapping column names to their assigned encoding types

        Raises:
            ValueError: If an invalid encoding type is provided
        """
        if not categorical_columns:
            return {}

        # Initialise encoding types dictionary
        encoding_types = {}

        # Initialise ordinal columns if None
        if ordinal_columns is None:
            ordinal_columns = []

        # Validate and filter categorical columns
        valid_categorical_columns = []
        for col in categorical_columns:
            if col in df.columns:
                valid_categorical_columns.append(col)
            else:
                logging.warning(
                    f"Column '{col}' not found in the dataset and will be ignored"
                )

        # If no valid columns remain, return empty dictionary
        if not valid_categorical_columns:
            logging.warning("No valid categorical columns found in the dataset")
            return {}

        # Assign specific encoding type to all columns
        if encoding_type in ["onehot", "label", "ordinal"]:
            for col in valid_categorical_columns:
                encoding_types[col] = encoding_type

        # Assign hash encoding with components to all columns
        elif encoding_type == "hash":
            for col in valid_categorical_columns:
                encoding_types[col] = ["hash", hash_components]

        # Assign "none" to all columns
        elif encoding_type == "none":
            for col in valid_categorical_columns:
                encoding_types[col] = "none"

        # Recommend encoding types based on data characteristics
        elif encoding_type == "auto":
            for col in valid_categorical_columns:
                # If column is marked as ordinal, assign ordinal encoding
                if col in ordinal_columns:
                    encoding_types[col] = "ordinal"
                    continue

                # Get cardinality (number of unique values)
                unique_count = df[col].nunique()

                # Check for highly skewed distribution
                value_counts = df[col].value_counts(normalize=True)
                is_highly_skewed = (
                    value_counts.iloc[0] >= 0.8
                )  # If the dominant category covers at least 80%

                # Check for many rare categories
                rare_categories = (value_counts < 0.05).sum()
                has_many_rare_cats = (
                    rare_categories > unique_count * 0.5
                )  # If >50% of categories are rare

                # Recommend based on cardinality and distribution characteristics
                if unique_count <= 20:
                    # For low cardinality, use one-hot
                    encoding_types[col] = "onehot"
                elif is_highly_skewed:
                    # For skewed distributions, use label encoding
                    encoding_types[col] = "label"
                elif has_many_rare_cats and unique_count <= 29:
                    # For many rare categories and cardinality below 30, use label encoding
                    encoding_types[col] = "label"
                else:
                    # For high cardinality with balanced distribution, use hash encoding
                    # Adjust number of hash columns based on cardinality
                    n_components = min(32, max(16, int(unique_count / 4)))
                    encoding_types[col] = ["hash", n_components]

        else:
            raise ValueError(
                f"Invalid encoding type: {encoding_type}. Must be one of: auto, onehot, label, ordinal, hash, none"
            )

        return encoding_types
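
Taken together, the methods above form a small pipeline: check_data_quality flags missing values, constant features and highly correlated pairs; assign_encoding_types recommends a per-column encoding; and preprocess applies imputation, scaling and encoding. The sketch below is a minimal illustration of that flow based only on the signatures in this diff; the toy DataFrame, its column names and the import path are assumptions made for the example, not taken from the package's own documentation.

import numpy as np
import pandas as pd
from balancr.data.preprocessor import DataPreprocessor

# Hypothetical toy data: one numeric column (with a missing value) and one categorical column
X = pd.DataFrame({
    "age": [25, 32, np.nan, 41, 38, 29],
    "city": ["leeds", "york", "leeds", "hull", "york", "leeds"],
})
y = np.array(["no", "yes", "no", "yes", "no", "yes"])

pre = DataPreprocessor()

# 1. Inspect class balance and basic data quality
print(pre.inspect_class_distribution(y))   # class label -> count mapping
report = pre.check_data_quality(X)
print(report["missing_values"])            # [('age', 1)]

# 2. Let the preprocessor recommend encodings for the categorical columns
encodings = pre.assign_encoding_types(
    X, categorical_columns=["city"], encoding_type="auto"
)                                          # low cardinality -> {'city': 'onehot'}

# 3. Impute, scale and encode in one call
X_processed, y_processed = pre.preprocess(
    X,
    y,
    handle_missing="mean",
    scale="standard",
    categorical_features=encodings,
    all_features=list(X.columns),
)
print(X_processed.head())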
balancr/evaluation/__init__.py
@@ -0,0 +1,19 @@
# src/balancr/evaluation/__init__.py
# flake8: noqa

from .metrics import (
    format_time,
    get_metrics,
    get_cv_scores,
    get_learning_curve_data,
    get_learning_curve_data_multiple_techniques,
)

from .visualisation import (
    plot_class_distribution,
    plot_class_distributions_comparison,
    plot_comparison_results,
    plot_learning_curves,
    plot_radar_chart,
    plot_3d_scatter,
)
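
Because this __init__ re-exports the metrics and visualisation helpers at the subpackage root, callers can import them from balancr.evaluation directly. The snippet below only illustrates that import surface; the functions' arguments are defined in metrics.py and visualisation.py, which are not shown in this excerpt.

# Import the re-exported helpers from the subpackage root...
from balancr.evaluation import get_metrics, plot_class_distribution

# ...which is equivalent to the longer module paths:
# from balancr.evaluation.metrics import get_metrics
# from balancr.evaluation.visualisation import plot_class_distribution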