dragon-ml-toolbox 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/METADATA +18 -2
- dragon_ml_toolbox-1.4.2.dist-info/RECORD +19 -0
- ml_tools/MICE_imputation.py +17 -2
- ml_tools/VIF_factor.py +29 -14
- ml_tools/data_exploration.py +68 -140
- ml_tools/datasetmaster.py +13 -1
- ml_tools/ensemble_learning.py +83 -82
- ml_tools/handle_excel.py +32 -9
- ml_tools/logger.py +10 -1
- ml_tools/particle_swarm_optimization.py +92 -64
- ml_tools/pytorch_models.py +13 -1
- ml_tools/trainer.py +10 -30
- ml_tools/utilities.py +133 -18
- ml_tools/vision_helpers.py +14 -1
- dragon_ml_toolbox-1.4.0.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/top_level.txt +0 -0
ml_tools/particle_swarm_optimization.py
CHANGED

@@ -5,18 +5,23 @@ import xgboost as xgb
 import lightgbm as lgb
 from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
 from sklearn.base import ClassifierMixin
-from
-from typing import Literal, Union, Tuple, Dict
-from collections.abc import Sequence
+from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
+from .utilities import sanitize_filename, _script_info, threshold_binary_values
+
+
+__all__ = [
+    "ObjectiveFunction",
+    "run_pso"
+]
 
 
 class ObjectiveFunction():
     """
-    Callable objective function designed for optimizing continuous outputs from regression models.
+    Callable objective function designed for optimizing continuous outputs from tree-based regression models.
 
-    The
+    The target serialized file (joblib) must include a trained tree-based 'model'. Additionally 'feature_names' and 'target_name' will be parsed if present.
 
     Parameters
     ----------
@@ -29,15 +34,14 @@ class ObjectiveFunction():
     task : Literal, default 'maximization'
         Whether to maximize or minimize the target.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
         self._artifact = joblib.load(trained_model_path)
         self.model = self._get_from_artifact('model')
-        self.
-        self.
-        self.target_name: str = self._get_from_artifact('target_name') # type: ignore
+        self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
+        self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
         self.task = task
         self.check_model() # check for classification models and None values
 
@@ -45,16 +49,15 @@ class ObjectiveFunction():
         if self.use_noise:
             features_array = self.add_noise(features_array)
         if self.is_hybrid:
-            features_array = self.
+            features_array = threshold_binary_values(input_array=features_array, binary_features=self.binary_features)
 
         if features_array.ndim == 1:
             features_array = features_array.reshape(1, -1)
 
-        # scale features as the model expects
-        features_array = self.scaler.transform(features_array) # type: ignore
-
         result = self.model.predict(features_array) # type: ignore
         scalar = result.item()
+        # print(f"[DEBUG] Model predicted: {scalar}")
+
         # pso minimizes by default, so we return the negative value to maximize
         if self.task == "maximization":
             return -scalar
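
Note: taken together, the new `__init__` and `__call__` imply the following usage. A minimal sketch, assuming a joblib artifact dict with a required 'model' key and optional 'feature_names'/'target_name' keys; the file path and toy model here are hypothetical, not from the package.

    import joblib
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from ml_tools.particle_swarm_optimization import ObjectiveFunction

    # Hypothetical artifact containing the keys the class parses
    X = np.random.rand(50, 5)
    model = RandomForestRegressor(n_estimators=10).fit(X, X.sum(axis=1))
    joblib.dump({"model": model,
                 "feature_names": [f"f{i}" for i in range(5)],
                 "target_name": "yield"},
                "artifact.joblib")

    objective = ObjectiveFunction(trained_model_path="artifact.joblib",
                                  add_noise=True,
                                  task="maximization")
    score = objective(np.random.rand(5))  # negative prediction: PSO minimizes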
@@ -62,23 +65,22 @@ class ObjectiveFunction():
             return scalar
 
     def add_noise(self, features_array):
-
-
-
-
-
-
-
-
-
+        if self.binary_features > 0:
+            split_idx = -self.binary_features
+            cont_part = features_array[:split_idx]
+            bin_part = features_array[split_idx:]
+            noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
+            cont_noised = cont_part * noise
+            return np.concatenate([cont_noised, bin_part])
+        else:
+            noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
+            return features_array * noise
 
     def check_model(self):
         if isinstance(self.model, ClassifierMixin) or isinstance(self.model, xgb.XGBClassifier) or isinstance(self.model, lgb.LGBMClassifier):
             raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
         if self.model is None:
             raise ValueError("Loaded model is None")
-        if self.scaler is None:
-            raise ValueError("Loaded scaler is None")
 
     def _get_from_artifact(self, key: str):
         val = self._artifact.get(key)
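
The new `add_noise` body above applies a ±5% multiplicative jitter to the continuous head of the feature array and leaves the trailing binary block untouched. A standalone rendition of that logic, for illustration only:

    import numpy as np

    def add_noise(features_array: np.ndarray, binary_features: int) -> np.ndarray:
        # Continuous features get +/-5% multiplicative jitter;
        # trailing binary features pass through unchanged.
        if binary_features > 0:
            split_idx = -binary_features
            cont_part = features_array[:split_idx]
            bin_part = features_array[split_idx:]
            noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
            return np.concatenate([cont_part * noise, bin_part])
        noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
        return features_array * noise

    print(add_noise(np.array([10.0, 20.0, 1.0, 0.0]), binary_features=2))
    # e.g. [ 9.87 20.61  1.    0.  ] -- binary tail preserved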
@@ -89,10 +91,10 @@ class ObjectiveFunction():
             return result
 
     def __repr__(self):
-        return (f"<ObjectiveFunction(model={type(self.model).__name__},
+        return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
-def _set_boundaries(lower_boundaries:
+def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
     lower = np.array(lower_boundaries)
@@ -112,31 +114,40 @@ def _save_results(*dicts, save_dir: str, target_name: str):
     combined_dict = dict()
     for single_dict in dicts:
         combined_dict.update(single_dict)
-
-
+
+    sanitized_target_name = sanitize_filename(target_name)
+
+    full_path = os.path.join(save_dir, f"Optimization_{sanitized_target_name}.csv")
     pl.DataFrame(combined_dict).write_csv(full_path)
 
 
-def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float],
-
+def run_pso(lower_boundaries: list[float],
+            upper_boundaries: list[float],
+            objective_function: ObjectiveFunction,
+            save_results_dir: str,
+            auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
-            swarm_size: int=
+            swarm_size: int=200,
+            max_iterations: int=400,
             inequality_constrain_function=None,
-            post_hoc_analysis:
+            post_hoc_analysis: Optional[int]=3,
+            workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
-    Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results.
+    Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
     Parameters
     ----------
-    lower_boundaries :
-        Lower bounds for each feature in the search space.
-    upper_boundaries :
-        Upper bounds for each feature in the search space.
+    lower_boundaries : list[float]
+        Lower bounds for each feature in the search space (as many as features expected by the model).
+    upper_boundaries : list[float]
+        Upper bounds for each feature in the search space (as many as features expected by the model).
     objective_function : ObjectiveFunction
-        A callable object encapsulating a regression model
+        A callable object encapsulating a tree-based regression model.
     save_results_dir : str
         Directory path to save the results CSV file.
+    auto_binary_boundaries : bool
+        Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
     target_name : str or None, optional
        Name of the target variable. If None, attempts to retrieve from the ObjectiveFunction object.
     feature_names : list[str] or None, optional
@@ -147,32 +158,39 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
         Maximum number of iterations for the optimization algorithm.
     inequality_constrain_function : callable or None, optional
         Optional function defining inequality constraints to be respected by the optimization.
-    post_hoc_analysis : int or None
+    post_hoc_analysis : int or None
         If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
+    workers : int
+        Number of parallel processes to use.
 
     Returns
     -------
     Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
         If `post_hoc_analysis` is None, returns two dictionaries:
-        -
-        -
+        - feature_names: Feature values (after inverse scaling) that yield the best result.
+        - target_name: Best result obtained for the target variable.
 
         If `post_hoc_analysis` is an integer, returns two dictionaries:
-        -
-        -
+        - feature_names: Lists of best feature values (after inverse scaling) for each repetition.
+        - target_name: List of best target values across repetitions.
 
     Notes
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
-    - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
     """
+    # Append binary boundaries
+    binary_number = objective_function.binary_features
+    if auto_binary_boundaries and binary_number > 0:
+        lower_boundaries.extend([0] * binary_number)
+        upper_boundaries.extend([1] * binary_number)
+
     lower, upper = _set_boundaries(lower_boundaries, upper_boundaries)
-
+
     # feature names
     if feature_names is None and objective_function.feature_names is not None:
         feature_names = objective_function.feature_names
     names = _set_feature_names(size=len(lower_boundaries), names=feature_names)
-
+
     # target name
     if target_name is None and objective_function.target_name is not None:
         target_name = objective_function.target_name
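
The new `auto_binary_boundaries` block above extends the caller's boundary lists in place before `_set_boundaries` runs; note that `list.extend` mutates the lists the caller passed in. An illustration of the arithmetic, with hypothetical numbers:

    # 2 continuous features, 3 binary features
    lower_boundaries = [0.1, 5.0]
    upper_boundaries = [0.9, 50.0]
    binary_number = 3  # from ObjectiveFunction.binary_features

    lower_boundaries.extend([0] * binary_number)  # -> [0.1, 5.0, 0, 0, 0]
    upper_boundaries.extend([1] * binary_number)  # -> [0.9, 50.0, 1, 1, 1]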
@@ -186,20 +204,25 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
         "f_ieqcons": inequality_constrain_function,
         "swarmsize": swarm_size,
         "maxiter": max_iterations,
-        "processes":
-        "particle_output":
+        "processes": workers,
+        "particle_output": False
     }
 
-
-
-
+    os.makedirs(save_results_dir, exist_ok=True)
+
+    if post_hoc_analysis is None or post_hoc_analysis == 1:
+        best_features, best_target, *_ = _pso(**arguments)
+        # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
 
-        #
-
-
+        # flip best_target if maximization was used
+        if objective_function.task == "maximization":
+            best_target = -best_target
+
+        # threshold binary features
+        best_features_threshold = threshold_binary_values(best_features, binary_number)
 
         # name features
-        best_features_named = {name: value for name, value in zip(names,
+        best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
         best_target_named = {target_name: best_target}
 
         # save results
@@ -209,15 +232,18 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
     else:
         all_best_targets = list()
         all_best_features = [[] for _ in range(len(lower_boundaries))]
-        for
-
-            best_features, best_target, _particle_positions, _target_values_per_position =
+        for _ in range(post_hoc_analysis):
+            best_features, best_target, *_ = _pso(**arguments)
+            # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
+
+            # flip best_target if maximization was used
+            if objective_function.task == "maximization":
+                best_target = -best_target
 
-            #
-
-            best_features_real = objective_function.scaler.inverse_transform(best_features).flatten() # type: ignore
+            # threshold binary features
+            best_features_threshold = threshold_binary_values(best_features, binary_number)
 
-            for i, best_feature in enumerate(
+            for i, best_feature in enumerate(best_features_threshold):
                 all_best_features[i].append(best_feature)
             all_best_targets.append(best_target)
@@ -231,6 +257,8 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
     return all_best_features_named, all_best_targets_named # type: ignore
 
 
+def info():
+    _script_info(__all__)
 
 
 ### SOURCE CODE FOR PSO ###
@@ -249,7 +277,7 @@ def _cons_ieqcons_wrapper(ieqcons, args, kwargs, x):
 def _cons_f_ieqcons_wrapper(f_ieqcons, args, kwargs, x):
     return np.array(f_ieqcons(x, *args, **kwargs))
 
-def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
+def _pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
         swarmsize=100, omega=0.5, phip=0.5, phig=0.5, maxiter=100,
         minstep=1e-8, minfunc=1e-8, debug=False, processes=1,
         particle_output=False):
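
`_pso` (renamed from `pso`) keeps the pyswarm-style interface. Judging from the call sites above (`best_features, best_target, *_ = _pso(**arguments)`), with `particle_output=False` it returns at least the best position and the best objective value. A minimal sketch under that assumption:

    import numpy as np

    def sphere(x):
        # simple convex test function; minimum at the origin
        return float(np.sum(np.asarray(x) ** 2))

    best_x, best_f, *_ = _pso(sphere, lb=[-5.0, -5.0], ub=[5.0, 5.0],
                              swarmsize=50, maxiter=100)
    print(best_x, best_f)  # expected near [0. 0.] and ~0.0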
@@ -377,7 +405,7 @@ def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
     for i in range(S):
         fx[i] = obj(x[i, :])
         fs[i] = is_feasible(x[i, :])
-
+
     # Store particle's best position (if constraints are satisfied)
     i_update = np.logical_and((fx < fp), fs)
     p[i_update, :] = x[i_update, :].copy()
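
End to end, the new `run_pso` signature can be exercised as below; a sketch with hypothetical paths, reusing the `ObjectiveFunction` construction shown earlier. With `post_hoc_analysis=5`, the returned dictionaries map each feature name to a list of five best values and the target name to the five best results.

    from ml_tools.particle_swarm_optimization import ObjectiveFunction, run_pso

    objective = ObjectiveFunction(trained_model_path="artifact.joblib",  # hypothetical
                                  add_noise=True,
                                  task="maximization",
                                  binary_features=3)

    # Two continuous bounds; three binary [0, 1] pairs are appended
    # automatically because auto_binary_boundaries defaults to True.
    best_features, best_targets = run_pso(lower_boundaries=[0.1, 5.0],
                                          upper_boundaries=[0.9, 50.0],
                                          objective_function=objective,
                                          save_results_dir="pso_results",
                                          post_hoc_analysis=5,
                                          workers=3)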
ml_tools/pytorch_models.py
CHANGED

@@ -1,5 +1,12 @@
 import torch
 from torch import nn
+from .utilities import _script_info
+
+
+__all__ = [
+    "MyNeuralNetwork",
+    "MyLSTMNetwork"
+]
 
 
 class MyNeuralNetwork(nn.Module):

@@ -73,9 +80,11 @@ class MyNeuralNetwork(nn.Module):
         return X
 
 
-class
+class _MyConvolutionalNetwork(nn.Module):
     def __init__(self, outputs: int, color_channels: int=3, img_size: int=256, drop_out: float=0.2):
         """
+        - EDUCATIONAL PURPOSES ONLY, not optimized and requires lots of memory.
+
         Create a basic Convolutional Neural Network with two convolution layers with a pooling layer after each convolution.
 
         Args:

@@ -225,3 +234,6 @@ class MyLSTMNetwork(nn.Module):
         else:
             return output
 
+
+def info():
+    _script_info(__all__)
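
Every module touched in 1.4.2 gains the same `__all__` plus `info()` pattern, backed by the new `utilities._script_info` helper (shown further below). For example:

    from ml_tools import pytorch_models

    pytorch_models.info()
    # Available functions and objects:
    # 1 - MyNeuralNetwork
    # 2 - MyLSTMNetwork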
ml_tools/trainer.py
CHANGED

@@ -6,6 +6,12 @@ import matplotlib.pyplot as plt
 import torch
 from torch import nn
 from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error
+from .utilities import _script_info
+
+
+__all__ = [
+    "MyTrainer"
+]
 
 
 class MyTrainer():

@@ -288,36 +294,6 @@ class MyTrainer():
             print(f"Area under the curve score: {area_under_curve:4.2f}")
         else:
             print("Error encountered while retrieving 'model.kind' attribute.")
-
-
-    def forecast(self, samples_list: list[torch.Tensor], view_as: tuple[int,int]=(1,-1)):
-        """
-        DEPRECATED - Use `helpers.model_predict()` instead
-
-        Returns a list containing lists of predicted values, one for each sample.
-
-        Each sample must be a tensor and have the same shape and normalization expected by the model
-        (this method will add the batch dimension automatically).
-
-        Args:
-            `samples_list`: list of tensors.
-
-            `view_as`: reshape each output, default is (1,-1).
-
-        Returns: List of lists.
-        """
-        self.model.eval()
-        results = list()
-        with torch.no_grad():
-            for data_point in samples_list:
-                data_point = data_point.unsqueeze(0).to(self.device)
-                output = self.model(data_point)
-                if self.kind == "classification":
-                    results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
-                else: #regression
-                    results.append(output.view(view_as).cpu().tolist())
-
-        return results
 
 
     def rnn_forecast(self, sequence: torch.Tensor, steps: int):

@@ -364,3 +340,7 @@ class MyTrainer():
     # Cast to array and return
     predictions = numpy.array(predictions)
     return predictions
+
+
+def info():
+    _script_info(__all__)
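
The removed `forecast` method was already marked deprecated in favor of `model_predict`, which `vision_helpers` exports (see below). A hypothetical migration sketch; the parameters beyond `model` and `kind` are assumed to mirror the old `forecast` signature, since `model_predict`'s full signature is truncated in this diff:

    import torch
    from ml_tools.vision_helpers import model_predict

    my_model = torch.nn.Sequential(torch.nn.Flatten(),
                                   torch.nn.Linear(3 * 8 * 8, 1))  # toy stand-in
    samples = [torch.rand(3, 8, 8) for _ in range(4)]

    # assumed parameters: samples_list and view_as, mirroring forecast()
    results = model_predict(model=my_model, kind="regression",
                            samples_list=samples, view_as=(1, -1))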
ml_tools/utilities.py
CHANGED

@@ -4,6 +4,20 @@ import pandas as pd
 import os
 from pathlib import Path
 import re
+from typing import Literal, Union, Sequence
+
+
+# Keep track of available tools
+__all__ = [
+    "list_csv_paths",
+    "load_dataframe",
+    "yield_dataframes_from_dir",
+    "merge_dataframes",
+    "save_dataframe",
+    "normalize_mixed_list",
+    "sanitize_filename",
+    "threshold_binary_values"
+]
 
 
 def list_csv_paths(directory: str) -> dict[str, str]:

@@ -76,11 +90,93 @@ def yield_dataframes_from_dir(datasets_dir: str):
     for df_name, df_path in list_csv_paths(datasets_dir).items():
         df, _ = load_dataframe(df_path)
         yield df, df_name
+
+
+def merge_dataframes(
+    *dfs: pd.DataFrame,
+    reset_index: bool = False,
+    direction: Literal["horizontal", "vertical"] = "horizontal"
+) -> pd.DataFrame:
+    """
+    Merges multiple DataFrames either horizontally or vertically.
+
+    Parameters:
+        *dfs (pd.DataFrame): Variable number of DataFrames to merge.
+        reset_index (bool): Whether to reset index in the final merged DataFrame.
+        direction (["horizontal" | "vertical"]):
+            - "horizontal": Merge on index, adding columns.
+            - "vertical": Append rows; all DataFrames must have identical columns.
+
+    Returns:
+        pd.DataFrame: A single merged DataFrame.
+
+    Raises:
+        ValueError:
+            - If fewer than 2 DataFrames are provided.
+            - If indexes do not match for horizontal merge.
+            - If column names or order differ for vertical merge.
+    """
+    if len(dfs) < 2:
+        raise ValueError("At least 2 DataFrames must be provided.")
+
+    for i, df in enumerate(dfs, start=1):
+        print(f"DataFrame {i} shape: {df.shape}")
+
+    if direction == "horizontal":
+        reference_index = dfs[0].index
+        for i, df in enumerate(dfs, start=1):
+            if not df.index.equals(reference_index):
+                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+        merged_df = pd.concat(dfs, axis=1)
+
+    elif direction == "vertical":
+        reference_columns = dfs[0].columns
+        for i, df in enumerate(dfs, start=1):
+            if not df.columns.equals(reference_columns):
+                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+        merged_df = pd.concat(dfs, axis=0)
+
+    else:
+        raise ValueError(f"Invalid merge direction: {direction}")
+
+    if reset_index:
+        merged_df = merged_df.reset_index(drop=True)
+
+    print(f"Merged DataFrame shape: {merged_df.shape}")
+
+    return merged_df
+
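
Usage of the new `merge_dataframes` helper, per the implementation above:

    import pandas as pd
    from ml_tools.utilities import merge_dataframes

    df_a = pd.DataFrame({"x": [1, 2, 3]})
    df_b = pd.DataFrame({"y": [4.0, 5.0, 6.0]})

    merged = merge_dataframes(df_a, df_b, direction="horizontal")
    # prints: DataFrame 1 shape: (3, 1)
    #         DataFrame 2 shape: (3, 1)
    #         Merged DataFrame shape: (3, 2)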
+def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+    """
+    Save a pandas DataFrame to a CSV file.
+
+    Parameters:
+        df: pandas.DataFrame to save
+        save_dir: str, directory where the CSV file will be saved.
+        filename: str, CSV filename, extension will be added if missing.
+    """
+    if df.empty:
+        print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
+        return
+
+    os.makedirs(save_dir, exist_ok=True)
+
+    filename = sanitize_filename(filename)
+
+    if not filename.endswith('.csv'):
+        filename += '.csv'
 
+    output_path = os.path.join(save_dir, filename)
 
+    df.to_csv(output_path, index=False, encoding='utf-8')
+    print(f"✅ Saved file: '{filename}'")
+
+
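
And the new `save_dataframe` helper, which sanitizes the filename, appends the `.csv` extension when missing, and skips empty frames:

    import pandas as pd
    from ml_tools.utilities import save_dataframe

    save_dataframe(pd.DataFrame({"a": [1, 2]}), save_dir="outputs", filename="results")
    # ✅ Saved file: 'results.csv'

    save_dataframe(pd.DataFrame(), save_dir="outputs", filename="empty")
    # ⚠️ Attempting to save an empty DataFrame: 'empty'. Process Skipped.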
 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
     """
-    Normalize a mixed list of numeric values and strings so that the sum of the values equals 1.0,
+    Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
     applying heuristic adjustments to correct for potential data entry scale mismatches.
 
     Parameters:
@@ -168,27 +264,46 @@ def sanitize_filename(filename: str) -> str:
     return sanitized
 
 
-def
+def threshold_binary_values(
+    input_array: Union[Sequence[float], np.ndarray],
+    binary_features: int
+) -> np.ndarray:
     """
-
+    Thresholds binary features in a 1D numeric sequence. Binary features must be located at the end of the sequence.
+
+    Converts binary elements to values (0 or 1) using a threshold of 0.5. The rest of the array (assumed to be continuous features) is returned unchanged.
 
     Parameters:
-
-
-
+        input_array (Union[Sequence[float], np.ndarray]) : A one-dimensional collection of numeric values. The binary features must be located at the end of the array.
+
+        binary_features (int) : Number of binary features to threshold from the end of the array. Must be between 0 and the total number of elements.
+
+    Returns:
+        np.ndarray : A 1D NumPy array where the final `binary_features` values have been binarized.
     """
-
-
-    return
+    array = np.asarray(input_array).flatten()
+    total = array.shape[0]
 
+    if binary_features < 0 or binary_features > total:
+        raise ValueError("Binary features must be between 0 and the total number of features.")
 
-
+    if binary_features == 0:
+        return array
+
+    cont_part = array[:-binary_features]
+    bin_part = (array[-binary_features:] > 0.5).astype(int)
 
-
-
-
-
-
-
-
+    return np.concatenate([cont_part, bin_part])
+
+
+def _script_info(all_data: list[str]):
+    """
+    List available names.
+    """
+    print("Available functions and objects:")
+    for i, name in enumerate(all_data, start=1):
+        print(f"{i} - {name}")
+
+
+def info():
+    _script_info(__all__)
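
`threshold_binary_values` is the piece that makes the hybrid continuous/binary optimization above work: the trailing `binary_features` entries are snapped to 0 or 1 at a 0.5 cutoff while the continuous head passes through unchanged. For example:

    import numpy as np
    from ml_tools.utilities import threshold_binary_values

    threshold_binary_values(np.array([3.7, 0.42, 0.81, 0.3]), binary_features=2)
    # -> array([3.7 , 0.42, 1.  , 0.  ])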
ml_tools/vision_helpers.py
CHANGED

@@ -4,9 +4,18 @@ from PIL import Image, ImageOps
 from typing import Literal
 from torchvision import transforms
 import torch
+from .utilities import _script_info
+
+
+__all__ = [
+    "inspect_images",
+    "image_augmentation",
+    "ResizeAspectFill",
+    "is_image",
+    "model_predict"
+]
 
 
-# --- Helper Functions ---
 def inspect_images(path: str):
     """
     Prints out the types, sizes and channels of image files found in the directory and its subdirectories.

@@ -216,3 +225,7 @@ def model_predict(model: torch.nn.Module, kind: Literal["regression", "classific
             results.append(output.view(view_as).cpu().tolist())
 
     return results
+
+
+def info():
+    _script_info(__all__)
dragon_ml_toolbox-1.4.0.dist-info/RECORD
DELETED

@@ -1,19 +0,0 @@
-dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
-ml_tools/MICE_imputation.py,sha256=4kqZiesk8vyh4MBLnNE9grflG4fDusqzuYBElsbk4LY,9484
-ml_tools/VIF_factor.py,sha256=rHSAxQcXLrG8dIjCXBAvETsSkCBfYus9NqimOnm2Bvk,9559
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=qtkGumckC2PmTpj3brVFi072ewX0OI6dwUF4Or7Yikg,21341
-ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
-ml_tools/ensemble_learning.py,sha256=wK6mtOE4v9AWlxkcWhJj5XZjREChxb46kE0i2IxS-OE,28372
-ml_tools/handle_excel.py,sha256=IR0VQc3hYdmjwC31E5YxDnRcWig4jSIx7Y_7to-KZz4,11969
-ml_tools/logger.py,sha256=XwSpCUzw2Le24fJHyljBxNLgw63SwjZ0pMjTJqf0ylI,4622
-ml_tools/particle_swarm_optimization.py,sha256=jpkje4OETC9fyISxxUTx4XGrImSU6gDEcwz46ZDs2bQ,19250
-ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,9908
-ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
-ml_tools/utilities.py,sha256=gr1cyRUfZcRo9fjWpCaQkrvWY0-xJnDJdrE8JEsOi8o,6309
-ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
-dragon_ml_toolbox-1.4.0.dist-info/METADATA,sha256=V7Y96iAbgX6Xl6RWzEt4nGfKMZe4cuLs0BrFQghXxX8,2335
-dragon_ml_toolbox-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.4.0.dist-info/RECORD,,

{dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/WHEEL: file without changes
{dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE: file without changes
{dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/licenses/LICENSE-THIRD-PARTY.md: file without changes
{dragon_ml_toolbox-1.4.0.dist-info → dragon_ml_toolbox-1.4.2.dist-info}/top_level.txt: file without changes