balancr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,556 @@
+ import logging
+ from typing import List, Tuple, Dict, Any, Optional
+ import numpy as np
+ import pandas as pd
+ from sklearn.preprocessing import (
+     StandardScaler,
+     MinMaxScaler,
+     RobustScaler,
+     LabelEncoder,
+ )
+ from sklearn.impute import SimpleImputer
+ from sklearn.feature_extraction import FeatureHasher
+
+
+ class DataPreprocessor:
+     """Handles data preprocessing operations"""
+
+     def __init__(self):
+         self.scaler = None
+         self.imputer = None
+         self.label_encoder = None
+         self.feature_names = None
+         self.categorical_features = None
+         self.numerical_features = None
+
+     def inspect_class_distribution(self, y: np.ndarray) -> Dict[Any, int]:
+         """
+         Inspect the distribution of classes in the target variable
+
+         Args:
+             y: Target vector
+
+         Returns:
+             Dictionary mapping class labels to their counts
+         """
+         unique, counts = np.unique(y, return_counts=True)
+         return dict(zip(unique, counts))
+
+     def check_data_quality(
+         self,
+         X: np.ndarray,
+         feature_names: Optional[list] = None,
+         correlation_threshold: float = 0.95,
+     ) -> Dict[str, list]:
+         """
+         Check data quality issues
+
+         Args:
+             X: Feature matrix (numpy array or pandas DataFrame)
+             feature_names: Optional list of feature names
+             correlation_threshold: Absolute correlation above which a pair of
+                 numeric features is reported (must be between 0 and 1)
+
+         Returns:
+             Dictionary containing quality metrics with more descriptive information
+         """
+         # Validate correlation threshold
+         if not (0 <= correlation_threshold <= 1):
+             raise ValueError("correlation_threshold must be between 0 and 1")
+
+         # Handle input that is already a DataFrame or convert numpy array to DataFrame
+         if not isinstance(X, pd.DataFrame):
+             if feature_names is None:
+                 feature_names = [f"Feature_{i}" for i in range(X.shape[1])]
+             X_df = pd.DataFrame(X, columns=feature_names)
+         else:
+             X_df = X
+             if feature_names is None:
+                 feature_names = X_df.columns.tolist()
+
+         # Check for missing values and create a more descriptive result
+         missing_values_counts = X_df.isna().sum()
+         missing_values = []
+         for feature_name, count in missing_values_counts.items():
+             if count > 0:
+                 missing_values.append((feature_name, int(count)))
+
+         quality_report = {
+             "missing_values": missing_values,
+             "constant_features": [],
+             "feature_correlations": [],
+         }
+
+         # Check for constant features
+         constant_features = []
+         for i, col in enumerate(X_df.columns):
+             if X_df[col].nunique(dropna=True) <= 1:
+                 constant_features.append((col, i))
+
+         quality_report["constant_features"] = constant_features
+
+         # Calculate correlations only on numeric columns
+         numeric_cols = X_df.select_dtypes(include=np.number).columns
+
+         if len(numeric_cols) > 1 and X_df.shape[0] > 1:
+             try:
+                 correlations = X_df[numeric_cols].corr()
+                 high_corr_pairs = []
+
+                 for i in range(len(correlations.columns)):
+                     for j in range(i + 1, len(correlations.columns)):
+                         if abs(correlations.iloc[i, j]) > correlation_threshold:
+                             # Map back to original column names
+                             col_i = correlations.columns[i]
+                             col_j = correlations.columns[j]
+                             high_corr_pairs.append(
+                                 (col_i, col_j, correlations.iloc[i, j])
+                             )
+                 quality_report["feature_correlations"] = high_corr_pairs
+             except Exception:
+                 # In case of correlation calculation errors
+                 quality_report["feature_correlations"] = []
+         else:
+             quality_report["feature_correlations"] = []
+
+         return quality_report
+
+     def preprocess(
+         self,
+         X: np.ndarray,
+         y: np.ndarray,
+         handle_missing: str = "mean",
+         scale: str = "standard",
+         handle_constant_features: str = "none",
+         handle_correlations: str = "none",
+         constant_features: Optional[List[str]] = None,
+         correlated_features: Optional[List[Tuple[str, str, float]]] = None,
+         categorical_features: Dict[str, str] = None,
+         all_features: List[str] = None,
+         encode_target: bool = True,
+         hash_components_dict: Dict[str, int] = None,
+     ) -> Tuple[pd.DataFrame, np.ndarray]:
+         """
+         Preprocess the data with enhanced options for categorical features.
+
+         Args:
+             X: Feature matrix
+             y: Target vector
+             handle_missing: Strategy to handle missing values
+                 ("drop", "mean", "median", "mode", "none")
+             scale: Scaling method
+                 ("standard", "minmax", "robust", "none")
+             handle_constant_features: How to handle known constant features
+                 ("drop", "none")
+             handle_correlations: How to handle highly correlated feature pairs
+                 ("drop_first", "drop_lowest", "pca", "none")
+             constant_features: Optional list of constant feature names to drop
+             correlated_features: Optional list of (feature1, feature2, correlation)
+                 tuples, such as those reported by check_data_quality
+             categorical_features: Dictionary mapping categorical column names to encoding methods
+                 Each column will be encoded according to its specified method:
+                 "onehot", "hash", "label", "ordinal", or "none"
+             all_features: List of all feature column names. If provided, these will be used
+                 as column names for the DataFrame.
+             encode_target: Whether to label-encode a non-numeric target vector
+             hash_components_dict: Optional mapping of categorical column names to the
+                 number of hash features to use
+
+         Returns:
+             Preprocessed X (as DataFrame with proper column names) and y
+         """
+         # Initialise categorical_features dictionary if None
+         if categorical_features is None:
+             categorical_features = {}
+         if hash_components_dict is None:
+             hash_components_dict = {}
+
+         # Convert to DataFrame for more flexible processing if not already
+         if not isinstance(X, pd.DataFrame):
+             if all_features and len(all_features) == X.shape[1]:
+                 # If all feature names are provided, use them as column names
+                 X = pd.DataFrame(X, columns=all_features)
+             else:
+                 # Use generic column names if not provided
+                 X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
+
+         # Convert y to Series if it's not already
+         if not isinstance(y, pd.Series):
+             y = pd.Series(y)
+
+         # Store the initial feature names for reference
+         self.feature_names = X.columns.tolist()
+
+         # Handle constant features
+         if (
+             handle_constant_features == "drop"
+             and constant_features
+             and len(constant_features) > 0
+         ):
+             # Log the constant features being dropped
+             logging.debug(
+                 f"Dropping {len(constant_features)} constant features: {constant_features}"
+             )
+
+             # Drop the constant features
+             X = X.drop(columns=constant_features)
+
+             # Update feature names
+             self.feature_names = X.columns.tolist()
+
+         # Handle highly correlated features
+         if (
+             handle_correlations != "none"
+             and correlated_features
+             and len(correlated_features) > 0
+         ):
+             # Create a list to track features to drop
+             features_to_drop = []
+
+             # Mapping for PCA components if needed
+             pca_mapping = {}
+
+             # Process each correlated pair
+             for feature1, feature2, corr_value in correlated_features:
+                 # Skip if either feature has already been dropped
+                 if feature1 in features_to_drop or feature2 in features_to_drop:
+                     continue
+
+                 if handle_correlations == "drop_first":
+                     # Drop the first feature in the pair
+                     features_to_drop.append(feature1)
+                     logging.debug(
+                         f"Dropping {feature1} (first feature in correlation pair "
+                         f"with {feature2}, corr={corr_value:.2f})"
+                     )
+
+                 elif handle_correlations == "drop_lowest":
+                     # Calculate variance of both features
+                     var1 = X[feature1].var()
+                     var2 = X[feature2].var()
+
+                     # Drop the feature with lower variance
+                     if var1 < var2:
+                         features_to_drop.append(feature1)
+                         logging.debug(
+                             f"Dropping {feature1} (lower variance in correlation "
+                             f"pair with {feature2}, corr={corr_value:.2f})"
+                         )
+                     else:
+                         features_to_drop.append(feature2)
+                         logging.debug(
+                             f"Dropping {feature2} (lower variance in correlation "
+                             f"pair with {feature1}, corr={corr_value:.2f})"
+                         )
+
+                 elif handle_correlations == "pca":
+                     # Only process each pair once
+                     pair_key = tuple(sorted([feature1, feature2]))
+                     if pair_key in pca_mapping:
+                         continue
+
+                     # Get the data for these two features
+                     pair_data = X[[feature1, feature2]]
+
+                     try:
+                         # Apply PCA to reduce to one component
+                         from sklearn.decomposition import PCA
+
+                         pca = PCA(n_components=1)
+                         pca_result = pca.fit_transform(pair_data)
+
+                         # Create a new feature name for the PCA component
+                         pca_feature = f"pca_{feature1}_{feature2}"
+
+                         # Add the PCA component as a new column
+                         X[pca_feature] = pca_result
+
+                         # Mark both original features for dropping
+                         features_to_drop.extend([feature1, feature2])
+
+                         # Store the mapping for reference
+                         pca_mapping[pair_key] = pca_feature
+
+                         logging.debug(
+                             f"Applied PCA to correlated features {feature1} and {feature2} (corr={corr_value:.2f})"
+                         )
+                     except Exception as e:
+                         logging.warning(
+                             f"Failed to apply PCA to {feature1} and {feature2}: {str(e)}"
+                         )
+
+             # Drop all marked features at once
+             if features_to_drop:
+                 X = X.drop(columns=features_to_drop)
+                 logging.debug(
+                     f"Dropped {len(features_to_drop)} features due to high correlation"
+                 )
+
+                 # Update feature names
+                 self.feature_names = X.columns.tolist()
+
+         # Handle missing values
+         if handle_missing != "none" and X.isna().any().any():
+             if handle_missing == "drop":
+                 # Remove rows with any missing values
+                 mask = ~X.isna().any(axis=1)
+                 X = X[mask].copy()
+                 y = y[mask]
+             else:
+                 # Use SimpleImputer for other strategies
+                 strategy = (
+                     handle_missing
+                     if handle_missing in ["mean", "median", "most_frequent"]
+                     else "mean"
+                 )
+                 if handle_missing == "mode":
+                     strategy = "most_frequent"
+
+                 # Identify numerical columns (those not in categorical_features)
+                 numerical_cols = [
+                     col for col in X.columns if col not in categorical_features
+                 ]
+
+                 # Apply imputation to numerical columns
+                 if numerical_cols:
+                     imputer = SimpleImputer(strategy=strategy)
+                     X[numerical_cols] = pd.DataFrame(
+                         imputer.fit_transform(X[numerical_cols]),
+                         columns=numerical_cols,
+                         index=X.index,
+                     )
+
+                 # For categorical columns with missing values, fill with mode
+                 for col in categorical_features:
+                     if col in X.columns and X[col].isna().any():
+                         X[col] = X[col].fillna(X[col].mode().iloc[0])
+
+         # Apply scaling only to numerical features (those not in categorical_features)
+         if scale != "none":
+             numerical_cols = [
+                 col for col in X.columns if col not in categorical_features
+             ]
+             if numerical_cols:
+                 if scale == "standard":
+                     scaler = StandardScaler()
+                 elif scale == "minmax":
+                     scaler = MinMaxScaler()
+                 elif scale == "robust":
+                     scaler = RobustScaler()
+                 else:
+                     scaler = StandardScaler()  # Default
+
+                 X[numerical_cols] = pd.DataFrame(
+                     scaler.fit_transform(X[numerical_cols]),
+                     columns=numerical_cols,
+                     index=X.index,
+                 )
+
+         # Process categorical columns based on their specified encoding types
+         for col, encoding_info in categorical_features.items():
+             # Skip if column doesn't exist in the DataFrame
+             if col not in X.columns:
+                 continue
+
+             # Normalise encoding_info into (encoding_type, extra_info)
+             if isinstance(encoding_info, list):
+                 encoding_type = encoding_info[0]
+                 extra_info = encoding_info[1:]
+             else:
+                 encoding_type = encoding_info
+                 extra_info = []
+
+             # Apply encoding based on type
+             if encoding_type == "onehot":
+                 unique_count = X[col].nunique()
+                 if unique_count > 50:  # Threshold for "high cardinality"
+                     logging.warning(
+                         f"Column '{col}' has {unique_count} unique values. "
+                         "One-hot encoding may create too many features."
+                     )
+                     if unique_count > 100:  # Very high cardinality threshold
+                         logging.warning(
+                             f"Falling back to hash encoding for column '{col}' "
+                             "due to very high cardinality."
+                         )
+                         # Use 32 hash columns as a default fallback
+                         n_components = 32
+
+                         # Use feature hashing for high cardinality
+                         hasher = FeatureHasher(
+                             n_features=n_components, input_type="string"
+                         )
+
+                         # Convert column values to a format FeatureHasher can handle
+                         feature_values = X[col].astype(str).tolist()
+                         hashed_features = hasher.transform(
+                             [[value] for value in feature_values]
+                         )
+
+                         # Create a DataFrame with descriptive column names
+                         hashed_df = pd.DataFrame(
+                             hashed_features.toarray(),
+                             index=X.index,
+                             columns=[f"{col}_hash_{i}" for i in range(n_components)],
+                         )
+                         # Drop original column and add hashed features
+                         X = pd.concat([X.drop(col, axis=1), hashed_df], axis=1)
+                     else:
+                         # Proceed with one-hot encoding but warn the user
+                         # Use descriptive column names: original_col_value
+                         dummies = pd.get_dummies(
+                             X[col], prefix=col, prefix_sep="_", drop_first=False
+                         )
+                         X = pd.concat([X.drop(col, axis=1), dummies], axis=1)
+                 else:
+                     # Normal one-hot encoding for manageable cardinality
+                     # Use descriptive column names: original_col_value
+                     dummies = pd.get_dummies(
+                         X[col], prefix=col, prefix_sep="_", drop_first=False
+                     )
+                     X = pd.concat([X.drop(col, axis=1), dummies], axis=1)
+
+ elif encoding_type == "hash":
402
+ # Get the number of components for this feature, if specified
403
+ n_components = (
404
+ extra_info[0] if extra_info else 32
405
+ ) # Default to 32 if not specified
406
+
407
+ hasher = FeatureHasher(n_features=n_components, input_type="string")
408
+
409
+ # Convert column values to a format FeatureHasher can handle
410
+ feature_values = X[col].astype(str).tolist()
411
+ hashed_features = hasher.transform(
412
+ [[value] for value in feature_values]
413
+ )
414
+
415
+ # Create a DataFrame with descriptive column names
416
+ hashed_df = pd.DataFrame(
417
+ hashed_features.toarray(),
418
+ index=X.index,
419
+ columns=[f"{col}_hash_{i}" for i in range(n_components)],
420
+ )
421
+ # Drop original column and add hashed features
422
+ X = pd.concat([X.drop(col, axis=1), hashed_df], axis=1)
423
+
424
+ elif encoding_type == "label":
425
+ # Label encode this column
426
+ le = LabelEncoder()
427
+ X[col] = le.fit_transform(X[col])
428
+
429
+ elif encoding_type == "ordinal":
430
+ # Ordinal encode this column
431
+ categories = X[col].unique()
432
+ mapping = {cat: i for i, cat in enumerate(categories)}
433
+ X[col] = X[col].map(mapping)
434
+
435
+ # Skip if encoding type is "none"
436
+
437
+         # Encode labels if necessary (for the target variable)
+         if encode_target and not np.issubdtype(y.dtype, np.number):
+             # Use label encoding for the target
+             self.label_encoder = LabelEncoder()
+             y = self.label_encoder.fit_transform(y)
+
+         # Store the column names for future reference
+         self.feature_names = list(X.columns)
+
+         # Return the DataFrame with column names preserved and the (possibly encoded) target
+         return X, y
+
+     def assign_encoding_types(
+         self,
+         df,
+         categorical_columns,
+         encoding_type="auto",
+         hash_components=32,
+         ordinal_columns=None,
+     ):
+         """
+         Assigns encoding types to categorical features based on user preference or automatic recommendation.
+
+         Args:
+             df: DataFrame containing the data
+             categorical_columns: List of categorical feature column names
+             encoding_type: Global encoding strategy: "auto", "onehot", "label",
+                 "ordinal", "hash", or "none"
+             hash_components: Number of hash features to use when encoding_type is "hash"
+             ordinal_columns: List of categorical columns that have a natural order
+
+         Returns:
+             Dictionary mapping column names to their assigned encoding types
+
+         Raises:
+             ValueError: If an invalid encoding type is provided
+         """
+         if not categorical_columns:
+             return {}
+
+         # Initialise encoding types dictionary
+         encoding_types = {}
+
+         # Initialise ordinal columns if None
+         if ordinal_columns is None:
+             ordinal_columns = []
+
+         # Validate and filter categorical columns
+         valid_categorical_columns = []
+         for col in categorical_columns:
+             if col in df.columns:
+                 valid_categorical_columns.append(col)
+             else:
+                 logging.warning(
+                     f"Column '{col}' not found in the dataset and will be ignored"
+                 )
+
+         # If no valid columns remain, return empty dictionary
+         if not valid_categorical_columns:
+             logging.warning("No valid categorical columns found in the dataset")
+             return {}
+
+         # Assign specific encoding type to all columns
+         if encoding_type in ["onehot", "label", "ordinal"]:
+             for col in valid_categorical_columns:
+                 encoding_types[col] = encoding_type
+
+         # Assign hash encoding with components to all columns
+         elif encoding_type == "hash":
+             for col in valid_categorical_columns:
+                 encoding_types[col] = ["hash", hash_components]
+
+         # Assign "none" to all columns
+         elif encoding_type == "none":
+             for col in valid_categorical_columns:
+                 encoding_types[col] = "none"
+
+         # Recommend encoding types based on data characteristics
+         elif encoding_type == "auto":
+             for col in valid_categorical_columns:
+                 # If column is marked as ordinal, assign ordinal encoding
+                 if col in ordinal_columns:
+                     encoding_types[col] = "ordinal"
+                     continue
+
+                 # Get cardinality (number of unique values)
+                 unique_count = df[col].nunique()
+
+                 # Check for highly skewed distribution
+                 value_counts = df[col].value_counts(normalize=True)
+                 is_highly_skewed = (
+                     value_counts.iloc[0] >= 0.8
+                 )  # If dominant category >= 80%
+
+                 # Check for many rare categories
+                 rare_categories = (value_counts < 0.05).sum()
+                 has_many_rare_cats = (
+                     rare_categories > unique_count * 0.5
+                 )  # If >50% of categories are rare
+
+                 # Recommend based on cardinality and distribution characteristics
+                 if unique_count <= 20:
+                     # For low cardinality, use one-hot
+                     encoding_types[col] = "onehot"
+                 elif is_highly_skewed:
+                     # For skewed distributions, use label encoding
+                     encoding_types[col] = "label"
+                 elif has_many_rare_cats and unique_count <= 29:
+                     # For many rare categories at moderate cardinality (fewer than 30 unique values), use label encoding
+                     encoding_types[col] = "label"
+                 else:
+                     # For high cardinality with balanced distribution, use hash encoding
+                     # Adjust number of hash columns based on cardinality
+                     n_components = min(32, max(16, int(unique_count / 4)))
+                     encoding_types[col] = ["hash", n_components]
+
+         else:
+             raise ValueError(
+                 f"Invalid encoding type: {encoding_type}. "
+                 "Must be one of: auto, onehot, label, ordinal, hash, none"
+             )
+
+         return encoding_types
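
For orientation, the sketch below shows how the DataPreprocessor methods added above fit together. It is illustrative only and not part of the package diff: the dataset, column names, expected output, and the import path are assumptions, while the method names and parameters come from the code shown in this hunk.

    import numpy as np
    import pandas as pd

    # Assumed import path; the module that defines DataPreprocessor is not named in this hunk
    from balancr import DataPreprocessor

    # Tiny, made-up dataset for illustration
    df = pd.DataFrame(
        {
            "age": [25, 32, 47, 51, np.nan, 38],
            "income": [40000, 52000, 88000, 91000, 61000, 58000],
            "city": ["Leeds", "York", "Leeds", "Hull", "York", "Leeds"],
        }
    )
    y = np.array(["no", "yes", "yes", "no", "yes", "no"])

    preprocessor = DataPreprocessor()

    # Class balance and basic quality checks
    print(preprocessor.inspect_class_distribution(y))  # e.g. {'no': 3, 'yes': 3}
    report = preprocessor.check_data_quality(df, correlation_threshold=0.9)

    # Choose an encoding strategy per categorical column ("auto" recommends one)
    encodings = preprocessor.assign_encoding_types(df, ["city"], encoding_type="auto")

    # Full preprocessing pass: imputation, scaling, categorical and target encoding
    X_clean, y_clean = preprocessor.preprocess(
        df,
        y,
        handle_missing="median",
        scale="standard",
        handle_constant_features="drop",
        constant_features=[name for name, _ in report["constant_features"]],
        categorical_features=encodings,
    )

Note that check_data_quality reports constant features as (name, index) pairs, while preprocess expects a plain list of names, hence the small comprehension above.
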
@@ -0,0 +1,19 @@
+ # src/balancr/evaluation/__init__.py
+ # flake8: noqa
+
+ from .metrics import (
+     format_time,
+     get_metrics,
+     get_cv_scores,
+     get_learning_curve_data,
+     get_learning_curve_data_multiple_techniques,
+ )
+
+ from .visualisation import (
+     plot_class_distribution,
+     plot_class_distributions_comparison,
+     plot_comparison_results,
+     plot_learning_curves,
+     plot_radar_chart,
+     plot_3d_scatter,
+ )