dragon-ml-toolbox 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-1.4.3.dist-info/RECORD +19 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE-THIRD-PARTY.md +6 -1
- ml_tools/MICE_imputation.py +22 -14
- ml_tools/data_exploration.py +41 -8
- ml_tools/ensemble_learning.py +446 -187
- ml_tools/particle_swarm_optimization.py +43 -52
- ml_tools/utilities.py +44 -8
- dragon_ml_toolbox-1.4.1.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/top_level.txt +0 -0
ml_tools/particle_swarm_optimization.py
CHANGED

```diff
@@ -5,11 +5,10 @@ import xgboost as xgb
 import lightgbm as lgb
 from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
 from sklearn.base import ClassifierMixin
-from
-from typing import Literal, Union, Tuple, Dict
+from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info
+from .utilities import sanitize_filename, _script_info, threshold_binary_values


 __all__ = [
```
```diff
@@ -20,14 +19,14 @@ __all__ = [
 
 class ObjectiveFunction():
     """
-    Callable objective function designed for optimizing continuous outputs from regression models.
+    Callable objective function designed for optimizing continuous outputs from tree-based regression models.
 
-    The target serialized file (joblib) must include a
+    The target serialized file (joblib) must include a trained tree-based 'model'. Additionally 'feature_names' and 'target_name' will be parsed if present.
 
     Parameters
     ----------
     trained_model_path : str
-        Path to a serialized model
+        Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
     binary_features : int, default=0
```
```diff
@@ -35,15 +34,14 @@ class ObjectiveFunction():
     task : Literal, default 'maximization'
         Whether to maximize or minimize the target.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
         self._artifact = joblib.load(trained_model_path)
         self.model = self._get_from_artifact('model')
-        self.
-        self.
-        self.target_name: str = self._get_from_artifact('target_name') # type: ignore
+        self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
+        self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
         self.task = task
         self.check_model() # check for classification models and None values
 
```
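With the new `__init__` signature above, an objective can be constructed directly from a joblib artifact. A minimal usage sketch; the artifact path and its contents are assumptions for illustration, not part of this diff:

```python
from ml_tools.particle_swarm_optimization import ObjectiveFunction

# "regressor_artifact.joblib" is hypothetical: a dict holding a trained
# tree-based 'model', plus optional 'feature_names' and 'target_name'.
objective = ObjectiveFunction(
    trained_model_path="regressor_artifact.joblib",
    add_noise=True,        # multiplicative noise on continuous features
    task="maximization",   # predictions are negated so the PSO minimizer maximizes
    binary_features=2,     # the last 2 features are thresholded to 0/1
)
```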
```diff
@@ -51,16 +49,15 @@ class ObjectiveFunction():
         if self.use_noise:
             features_array = self.add_noise(features_array)
         if self.is_hybrid:
-            features_array = self.
+            features_array = threshold_binary_values(input_array=features_array, binary_features=self.binary_features)
 
         if features_array.ndim == 1:
             features_array = features_array.reshape(1, -1)
 
-        # scale features as the model expects
-        features_array = self.scaler.transform(features_array) # type: ignore
-
         result = self.model.predict(features_array) # type: ignore
         scalar = result.item()
+        # print(f"[DEBUG] Model predicted: {scalar}")
+
         # pso minimizes by default, so we return the negative value to maximize
         if self.task == "maximization":
             return -scalar
```
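Given the `__call__` body above, evaluating a candidate is a plain function call; under `task="maximization"` the returned score is the negated model prediction, since the PSO backend minimizes. A hypothetical evaluation:

```python
import numpy as np

# Hypothetical candidate: two continuous features followed by two binary
# ones, matching binary_features=2 from the construction sketch above.
candidate = np.array([1.2, 3.4, 0.7, 0.2])
score = objective(candidate)  # == -model.predict(...) when task="maximization"
```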
```diff
@@ -68,33 +65,22 @@ class ObjectiveFunction():
             return scalar
 
     def add_noise(self, features_array):
-
-
-
-
-
-
-
-
-
-
-        if self.binary_features == total_features:
-            feat_binary = (features_array > 0.5).astype(int)
-            return feat_binary
-
-        # Normal case: split into continuous and binary parts
-        feat_continuous = features_array[:-self.binary_features]
-        feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
-        new_feature_values = np.concatenate([feat_continuous, feat_binary])
-        return new_feature_values
+        if self.binary_features > 0:
+            split_idx = -self.binary_features
+            cont_part = features_array[:split_idx]
+            bin_part = features_array[split_idx:]
+            noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
+            cont_noised = cont_part * noise
+            return np.concatenate([cont_noised, bin_part])
+        else:
+            noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
+            return features_array * noise
 
     def check_model(self):
         if isinstance(self.model, ClassifierMixin) or isinstance(self.model, xgb.XGBClassifier) or isinstance(self.model, lgb.LGBMClassifier):
             raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
         if self.model is None:
             raise ValueError("Loaded model is None")
-        if self.scaler is None:
-            raise ValueError("Loaded scaler is None")
 
     def _get_from_artifact(self, key: str):
         val = self._artifact.get(key)
```
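The rewritten `add_noise` perturbs only the continuous block with multiplicative noise drawn from [0.95, 1.05) and passes the trailing binary block through untouched. A self-contained sketch of that split, restated as a free function for clarity (not the method itself):

```python
import numpy as np

def multiplicative_noise(features: np.ndarray, binary_features: int) -> np.ndarray:
    """Jitter continuous features by about ±5%; leave the trailing binary block intact."""
    if binary_features > 0:
        cont = features[:-binary_features]
        binary = features[-binary_features:]
        noised = cont * np.random.uniform(0.95, 1.05, size=cont.shape)
        return np.concatenate([noised, binary])
    return features * np.random.uniform(0.95, 1.05, size=features.shape)
```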
```diff
@@ -105,7 +91,7 @@ class ObjectiveFunction():
         return result
 
     def __repr__(self):
-        return (f"<ObjectiveFunction(model={type(self.model).__name__},
+        return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
```
```diff
@@ -142,11 +128,11 @@ def run_pso(lower_boundaries: list[float],
             auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
-            swarm_size: int=
-            max_iterations: int=
+            swarm_size: int=200,
+            max_iterations: int=1000,
             inequality_constrain_function=None,
-            post_hoc_analysis:
-            workers: int=
+            post_hoc_analysis: Optional[int]=3,
+            workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
```
```diff
@@ -157,7 +143,7 @@ def run_pso(lower_boundaries: list[float],
     upper_boundaries : list[float]
         Upper bounds for each feature in the search space (as many as features expected by the model).
     objective_function : ObjectiveFunction
-        A callable object encapsulating a regression model
+        A callable object encapsulating a tree-based regression model.
     save_results_dir : str
         Directory path to save the results CSV file.
     auto_binary_boundaries : bool
```
```diff
@@ -172,7 +158,7 @@ def run_pso(lower_boundaries: list[float],
         Maximum number of iterations for the optimization algorithm.
     inequality_constrain_function : callable or None, optional
         Optional function defining inequality constraints to be respected by the optimization.
-    post_hoc_analysis : int or None
+    post_hoc_analysis : int or None
         If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
     workers : int
         Number of parallel processes to use.
```
```diff
@@ -191,7 +177,6 @@ def run_pso(lower_boundaries: list[float],
     Notes
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
-    - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
     """
     # Append binary boundaries
     binary_number = objective_function.binary_features
```
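Putting the signature and docstring together, a call under the new defaults might look like the sketch below; the boundary values, target name, and output directory are invented for illustration, and the two-dict unpacking follows the new return annotation:

```python
from ml_tools.particle_swarm_optimization import run_pso

# Invented bounds for the two continuous features; the "# Append binary
# boundaries" step shown above suggests bounds for the objective's binary
# features are added automatically when auto_binary_boundaries=True.
best_features, best_target = run_pso(
    lower_boundaries=[0.0, 10.0],
    upper_boundaries=[5.0, 200.0],
    objective_function=objective,      # ObjectiveFunction from the sketch above
    save_results_dir="./pso_results",
    target_name="yield",
    post_hoc_analysis=None,            # single run; an int runs that many repetitions
)
```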
```diff
@@ -229,12 +214,15 @@ def run_pso(lower_boundaries: list[float],
         best_features, best_target, *_ = _pso(**arguments)
         # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
 
-        #
-
-
+        # flip best_target if maximization was used
+        if objective_function.task == "maximization":
+            best_target = -best_target
+
+        # threshold binary features
+        best_features_threshold = threshold_binary_values(best_features, binary_number)
 
         # name features
-        best_features_named = {name: value for name, value in zip(names,
+        best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
         best_target_named = {target_name: best_target}
 
         # save results
```
```diff
@@ -248,11 +236,14 @@ def run_pso(lower_boundaries: list[float],
             best_features, best_target, *_ = _pso(**arguments)
             # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
 
-            #
-
-
+            # flip best_target if maximization was used
+            if objective_function.task == "maximization":
+                best_target = -best_target
+
+            # threshold binary features
+            best_features_threshold = threshold_binary_values(best_features, binary_number)
 
-            for i, best_feature in enumerate(
+            for i, best_feature in enumerate(best_features_threshold):
                 all_best_features[i].append(best_feature)
             all_best_targets.append(best_target)
 
```
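Both branches now return name-keyed dictionaries, per the new return annotation `Dict[str, float | list[float]]`; in post hoc mode each value is a list with one entry per repetition. Illustrative, invented values:

```python
# Single run (post_hoc_analysis=None): scalar values.
best_features = {"temp": 1.7, "is_coated": 1.0}
best_target = {"yield": 42.3}

# Post hoc mode (post_hoc_analysis=3): one entry per repetition.
all_best_features = {"temp": [1.7, 1.5, 1.6], "is_coated": [1.0, 1.0, 0.0]}
all_best_targets = {"yield": [42.3, 41.8, 42.0]}
```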
```diff
@@ -270,7 +261,7 @@ def info():
     _script_info(__all__)
 
 
-### SOURCE CODE FOR PSO ###
+### SOURCE CODE FOR PSO FROM PYSWARM ###
 def _obj_wrapper(func, args, kwargs, x):
     return func(x, *args, **kwargs)
 
```
ml_tools/utilities.py
CHANGED
```diff
@@ -4,7 +4,7 @@ import pandas as pd
 import os
 from pathlib import Path
 import re
-from typing import Literal
+from typing import Literal, Union, Sequence
 
 
 # Keep track of available tools
```
```diff
@@ -15,7 +15,8 @@ __all__ = [
     "merge_dataframes",
     "save_dataframe",
     "normalize_mixed_list",
-    "sanitize_filename"
+    "sanitize_filename",
+    "threshold_binary_values"
 ]
 
 
```
```diff
@@ -94,7 +95,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
 def merge_dataframes(
     *dfs: pd.DataFrame,
     reset_index: bool = False,
-    direction: Literal["horizontal", "vertical"] = "horizontal"
+    direction: Literal["horizontal", "vertical"] = "horizontal",
+    verbose: bool=True
 ) -> pd.DataFrame:
     """
     Merges multiple DataFrames either horizontally or vertically.
```
```diff
@@ -118,8 +120,9 @@ def merge_dataframes(
     if len(dfs) < 2:
         raise ValueError("At least 2 DataFrames must be provided.")
 
-
-
+    if verbose:
+        for i, df in enumerate(dfs, start=1):
+            print(f"DataFrame {i} shape: {df.shape}")
 
 
     if direction == "horizontal":
```
```diff
@@ -141,8 +144,9 @@ def merge_dataframes(
 
     if reset_index:
         merged_df = merged_df.reset_index(drop=True)
-
-
+
+    if verbose:
+        print(f"Merged DataFrame shape: {merged_df.shape}")
 
     return merged_df
 
```
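The new `verbose` flag gates both shape printouts shown in the two hunks above. A quick usage sketch with invented frames:

```python
import pandas as pd
from ml_tools.utilities import merge_dataframes

df_a = pd.DataFrame({"x": [1, 2]})
df_b = pd.DataFrame({"y": [3, 4]})

# verbose=False suppresses the per-frame and merged-shape prints.
merged = merge_dataframes(df_a, df_b, direction="horizontal", verbose=False)
```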
```diff
@@ -170,7 +174,7 @@ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
     output_path = os.path.join(save_dir, filename)
 
     df.to_csv(output_path, index=False, encoding='utf-8')
-    print(f"✅ Saved
+    print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
 
 
 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
```
```diff
@@ -263,6 +267,38 @@ def sanitize_filename(filename: str) -> str:
     return sanitized
 
 
+def threshold_binary_values(
+    input_array: Union[Sequence[float], np.ndarray],
+    binary_features: int
+) -> np.ndarray:
+    """
+    Thresholds binary features in a 1D numeric sequence. Binary features must be located at the end of the sequence.
+
+    Converts binary elements to values (0 or 1) using a threshold of 0.5. The rest of the array (assumed to be continuous features) is returned unchanged.
+
+    Parameters:
+        input_array (Union[Sequence[float], np.ndarray]) : A one-dimensional collection of numeric values. The binary features must be located at the end of the array.
+
+        binary_features (int) : Number of binary features to threshold from the end of the array. Must be between 0 and the total number of elements.
+
+    Returns:
+        np.ndarray : A 1D NumPy array where the final `binary_features` values have been binarized.
+    """
+    array = np.asarray(input_array).flatten()
+    total = array.shape[0]
+
+    if binary_features < 0 or binary_features > total:
+        raise ValueError("Binary features must be between 0 and the total number of features.")
+
+    if binary_features == 0:
+        return array
+
+    cont_part = array[:-binary_features]
+    bin_part = (array[-binary_features:] > 0.5).astype(int)
+
+    return np.concatenate([cont_part, bin_part])
+
+
 def _script_info(all_data: list[str]):
     """
     List available names.
```
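`threshold_binary_values` centralizes the 0.5-threshold logic that `ObjectiveFunction` and `run_pso` previously inlined. Expected behavior on a small, invented input:

```python
from ml_tools.utilities import threshold_binary_values

mixed = [0.43, 2.71, 0.61, 0.49]  # the last 2 entries are binary features

print(threshold_binary_values(mixed, binary_features=2))
# -> [0.43 2.71 1.   0.  ]   (0.61 > 0.5 -> 1; 0.49 <= 0.5 -> 0)

print(threshold_binary_values(mixed, binary_features=0))
# -> [0.43 2.71 0.61 0.49]  (returned unchanged, as a flattened array)
```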
dragon_ml_toolbox-1.4.1.dist-info/RECORD
DELETED

```diff
@@ -1,19 +0,0 @@
-dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
-ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
-ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=khXXRiR7boWwI4CAvb2bxzS3fhLADNETMOiRe3ihZ4Y,28821
-ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
-ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
-ml_tools/particle_swarm_optimization.py,sha256=714kZo6lvUvRaPTtj6kJGecZwHcehcSkLysokXAf3No,20706
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=z2JPy4GM2YBLUC0sPq7aNLuesPFAQu5KNcsgmuOywdU,8738
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-1.4.1.dist-info/METADATA,sha256=0XdPwNWe81rCvJLJfSS5XvB2ZdJKpBLLoqMU5uxYLMc,2516
-dragon_ml_toolbox-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.4.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.4.1.dist-info/RECORD,,
```
{dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/WHEEL
File without changes

{dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/licenses/LICENSE
File without changes

{dragon_ml_toolbox-1.4.1.dist-info → dragon_ml_toolbox-1.4.3.dist-info}/top_level.txt
File without changes