balancr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- balancr/__init__.py +13 -0
- balancr/base.py +14 -0
- balancr/classifier_registry.py +300 -0
- balancr/cli/__init__.py +0 -0
- balancr/cli/commands.py +1838 -0
- balancr/cli/config.py +165 -0
- balancr/cli/main.py +778 -0
- balancr/cli/utils.py +101 -0
- balancr/data/__init__.py +5 -0
- balancr/data/loader.py +59 -0
- balancr/data/preprocessor.py +556 -0
- balancr/evaluation/__init__.py +19 -0
- balancr/evaluation/metrics.py +442 -0
- balancr/evaluation/visualisation.py +660 -0
- balancr/imbalance_analyser.py +677 -0
- balancr/technique_registry.py +284 -0
- balancr/techniques/__init__.py +4 -0
- balancr/techniques/custom/__init__.py +0 -0
- balancr/techniques/custom/example_custom_technique.py +27 -0
- balancr-0.1.0.dist-info/LICENSE +21 -0
- balancr-0.1.0.dist-info/METADATA +536 -0
- balancr-0.1.0.dist-info/RECORD +25 -0
- balancr-0.1.0.dist-info/WHEEL +5 -0
- balancr-0.1.0.dist-info/entry_points.txt +2 -0
- balancr-0.1.0.dist-info/top_level.txt +1 -0
balancr/__init__.py
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# src/balancr/__init__.py
|
2
|
+
# flake8: noqa
|
3
|
+
|
4
|
+
from .base import BaseBalancer
|
5
|
+
|
6
|
+
from .technique_registry import TechniqueRegistry
|
7
|
+
|
8
|
+
from .classifier_registry import ClassifierRegistry
|
9
|
+
|
10
|
+
from .imbalance_analyser import (
|
11
|
+
BalancingFramework,
|
12
|
+
format_time,
|
13
|
+
)
|
balancr/base.py
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
import numpy as np
|
3
|
+
|
4
|
+
|
5
|
+
class BaseBalancer(ABC):
    """Abstract parent class shared by every balancing technique.

    Concrete techniques subclass this and implement :meth:`balance`.
    """

    def __init__(self):
        # The display name is the concrete subclass's own class name.
        self.name = type(self).__name__

    @abstractmethod
    def balance(self, X: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Balance the dataset.

        Args:
            X: Feature data (presumably one row per sample; confirm
                against the concrete techniques).
            y: Target labels (presumably aligned with the rows of ``X``).

        Returns:
            A ``(X, y)`` pair holding the balanced data.
        """
|
balancr/classifier_registry.py
ADDED
@@ -0,0 +1,300 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Dict, Type, Optional, List
|
3
|
+
import importlib
|
4
|
+
import inspect
|
5
|
+
import logging
|
6
|
+
import json
|
7
|
+
from sklearn.base import BaseEstimator
|
8
|
+
|
9
|
+
|
10
|
+
class ClassifierRegistry:
    """Registry for managing classification algorithms from various sources.

    Two sources are tracked:

    * scikit-learn estimators found by scanning ``SKLEARN_MODULES``;
    * custom classifiers, either registered at runtime through
      ``register_custom_classifier`` or loaded from the metadata file in
      ``~/.balancr/custom_classifiers``.
    """

    # List of scikit-learn modules where we'll look for classifiers
    SKLEARN_MODULES = [
        "sklearn.ensemble",
        "sklearn.linear_model",
        "sklearn.tree",
        "sklearn.svm",
        "sklearn.neighbors",
        "sklearn.naive_bayes",
        "sklearn.neural_network",
        "sklearn.discriminant_analysis",
    ]

    def __init__(self):
        """Discover scikit-learn estimators and load persisted custom ones."""
        # Storage for custom classifiers: registered name -> class
        self.custom_classifiers: Dict[str, Type[BaseEstimator]] = {}

        # Cache of sklearn classifiers, organised by module:
        # module short name -> class name -> (full module path, class)
        self._cached_sklearn_classifiers: Dict[str, Dict[str, tuple]] = {}

        # Find all available classifiers when initialised
        self._discover_sklearn_classifiers()
        self._load_custom_classifiers()

    def _discover_sklearn_classifiers(self) -> None:
        """Scan ``SKLEARN_MODULES`` and cache every usable estimator class.

        A class is cached when it inherits from ``BaseEstimator``, exposes
        both ``fit`` and ``predict``, and its name does not start with
        ``Base`` or ``_``. A module that cannot be imported is skipped with
        a warning.
        """
        for module_path in self.SKLEARN_MODULES:
            try:
                module = importlib.import_module(module_path)

                # Short name, e.g. 'ensemble' from 'sklearn.ensemble'
                module_name = module_path.split(".")[-1]
                module_cache = self._cached_sklearn_classifiers.setdefault(
                    module_name, {}
                )

                for name, obj in inspect.getmembers(module, inspect.isclass):
                    # Skip abstract base classes and internal classes
                    if name.startswith(("Base", "_")):
                        continue

                    # NOTE(review): this test does not require ClassifierMixin,
                    # so other estimators exposing fit/predict are presumably
                    # cached too -- confirm whether that is intended.
                    if (
                        hasattr(obj, "fit")
                        and hasattr(obj, "predict")
                        and issubclass(obj, BaseEstimator)
                    ):
                        module_cache[name] = (module_path, obj)

            except ImportError as e:
                logging.warning(f"Couldn't import {module_path}: {str(e)}")

    @staticmethod
    def _extract_base_name(classifier_name: str) -> Optional[str]:
        """Return the part of the name before its first ``_`` or ``-`` suffix.

        ``_`` is tried before ``-``; a delimiter only counts when the text in
        front of it is non-empty. Returns None when no base name can be
        extracted.
        """
        for delimiter in ("_", "-"):
            head, separator, _ = classifier_name.partition(delimiter)
            if separator and head:
                return head
        return None

    def _lookup_classifier(
        self, name: str, module_name: Optional[str]
    ) -> Optional[Type[BaseEstimator]]:
        """Look up one exact name: custom classifiers first, then the sklearn cache.

        When ``module_name`` is given, only that module's cache is consulted
        for scikit-learn classes; otherwise every cached module is searched.
        """
        if name in self.custom_classifiers:
            return self.custom_classifiers[name]

        if module_name is not None:
            module_dict = self._cached_sklearn_classifiers.get(module_name, {})
            if name in module_dict:
                _, classifier_class = module_dict[name]
                return classifier_class
            return None

        for module_dict in self._cached_sklearn_classifiers.values():
            if name in module_dict:
                _, classifier_class = module_dict[name]
                return classifier_class
        return None

    def get_classifier_class(
        self, classifier_name: str, module_name: Optional[str] = None
    ) -> Optional[Type[BaseEstimator]]:
        """
        Find a classifier class by its name, handling suffixed variations.

        The exact name is tried first. If that fails and the name carries a
        ``_`` or ``-`` suffix (e.g. 'RandomForestClassifier_v2'), the part
        before the delimiter is tried. If both fail, the scikit-learn cache
        is refreshed once and both lookups are repeated.

        Args:
            classifier_name: Name of the classifier (e.g., 'RandomForestClassifier')
            module_name: Optional module to look in (e.g., 'ensemble', 'linear_model')

        Returns:
            The classifier class if found, None otherwise
        """
        candidates = [classifier_name]
        base_name = self._extract_base_name(classifier_name)
        if base_name:
            candidates.append(base_name)

        for attempt in range(2):
            for candidate in candidates:
                classifier_class = self._lookup_classifier(candidate, module_name)
                if classifier_class is not None:
                    return classifier_class

            if attempt == 0:
                # Not found: rebuild the sklearn cache once before retrying
                self._discover_sklearn_classifiers()

        return None

    def list_available_classifiers(self) -> Dict[str, Dict[str, List[str]]]:
        """
        Get a hierarchical list of all available classifiers.

        Returns:
            Dictionary organised by source -> module -> classifier names.
            Custom classifiers, when present, are listed under the single
            pseudo-module 'general'.
        """
        # Refresh cache in case new classifiers were installed
        self._discover_sklearn_classifiers()

        result = {"custom": {}, "sklearn": self._get_sklearn_classifiers_by_module()}

        # Add custom classifiers if there are any
        if self.custom_classifiers:
            result["custom"] = {"general": list(self.custom_classifiers.keys())}

        return result

    def _get_sklearn_classifiers_by_module(self) -> Dict[str, List[str]]:
        """Organise scikit-learn classifiers by their module for a cleaner display"""
        # Only include modules that actually have classifiers
        return {
            module_name: list(classifiers)
            for module_name, classifiers in self._cached_sklearn_classifiers.items()
            if classifiers
        }

    def register_custom_classifier(
        self, name: str, classifier_class: Type[BaseEstimator]
    ) -> None:
        """
        Register a custom classifier for use in the framework.

        Args:
            name: Name to register the classifier under
            classifier_class: The classifier class itself

        Raises:
            TypeError: If the classifier doesn't meet requirements
            ValueError: If the name is invalid
        """
        if not isinstance(name, str) or not name.strip():
            raise ValueError("Classifier name must be a non-empty string")

        if classifier_class is None:
            raise TypeError("Classifier class cannot be None")

        if not isinstance(classifier_class, type) or not issubclass(
            classifier_class, BaseEstimator
        ):
            raise TypeError(
                "Classifier class must inherit from sklearn.base.BaseEstimator"
            )

        # Make sure it has the required methods
        if not hasattr(classifier_class, "fit") or not hasattr(
            classifier_class, "predict"
        ):
            raise TypeError(
                "Classifier class must implement 'fit' and 'predict' methods"
            )

        self.custom_classifiers[name] = classifier_class

    def _load_custom_classifiers(self) -> None:
        """Load registered custom classifiers from the custom classifiers directory.

        Reads ``~/.balancr/custom_classifiers/classifiers_metadata.json``,
        which maps a registered name to a dict with ``file`` and
        ``class_name`` keys, imports each file and stores the named class.
        Loading is best effort: every failure is logged as a warning, and a
        bad entry never prevents the remaining entries from loading.
        """
        # ``import importlib`` alone does not guarantee that the ``util``
        # submodule is bound, so import the helpers explicitly.
        from importlib.util import module_from_spec, spec_from_file_location

        metadata_file = (
            Path.home() / ".balancr" / "custom_classifiers" / "classifiers_metadata.json"
        )
        if not metadata_file.exists():
            return

        try:
            with open(metadata_file, "r") as f:
                metadata = json.load(f)
            entries = list(metadata.items())
        except Exception as e:
            logging.warning(f"Error loading custom classifiers metadata: {e}")
            return

        for classifier_name, info in entries:
            try:
                file_path = Path(info["file"])
                class_name = info["class_name"]

                if not file_path.exists():
                    logging.warning(f"Custom classifier file not found: {file_path}")
                    continue

                # Import the module dynamically.
                # NOTE: this executes the code in the referenced file; the
                # metadata file must only point at trusted sources.
                spec = spec_from_file_location(file_path.stem, file_path)
                if spec is None or spec.loader is None:
                    logging.warning(f"Could not load module from {file_path}")
                    continue

                module = module_from_spec(spec)
                spec.loader.exec_module(module)

                # Find the specific class
                candidate = getattr(module, class_name, None)
                if (
                    inspect.isclass(candidate)
                    and hasattr(candidate, "fit")
                    and hasattr(candidate, "predict")
                ):
                    self.custom_classifiers[classifier_name] = candidate
                    logging.debug(f"Loaded custom classifier: {classifier_name}")
                else:
                    logging.warning(f"Class {class_name} not found in {file_path}")

            except Exception as e:
                logging.warning(
                    f"Error loading custom classifier {classifier_name}: {e}"
                )
|
balancr/cli/__init__.py
ADDED
File without changes
|