dragon-ml-toolbox 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/METADATA +19 -2
- dragon_ml_toolbox-1.4.1.dist-info/RECORD +19 -0
- ml_tools/MICE_imputation.py +24 -6
- ml_tools/VIF_factor.py +224 -0
- ml_tools/data_exploration.py +74 -286
- ml_tools/datasetmaster.py +13 -1
- ml_tools/ensemble_learning.py +128 -129
- ml_tools/handle_excel.py +32 -9
- ml_tools/logger.py +10 -1
- ml_tools/particle_swarm_optimization.py +71 -34
- ml_tools/pytorch_models.py +13 -1
- ml_tools/trainer.py +10 -30
- ml_tools/utilities.py +122 -14
- ml_tools/vision_helpers.py +14 -1
- dragon_ml_toolbox-1.3.2.dist-info/RECORD +0 -18
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/top_level.txt +0 -0
ml_tools/particle_swarm_optimization.py
CHANGED

@@ -5,23 +5,29 @@ import xgboost as xgb
 import lightgbm as lgb
 from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
 from sklearn.base import ClassifierMixin
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
 from typing import Literal, Union, Tuple, Dict
-from collections.abc import Sequence
 import polars as pl
 from functools import partial
+from .utilities import sanitize_filename, _script_info
+
+
+__all__ = [
+    "ObjectiveFunction",
+    "run_pso"
+]
 
 
 class ObjectiveFunction():
     """
     Callable objective function designed for optimizing continuous outputs from regression models.
 
-    The
+    The target serialized file (joblib) must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.
 
     Parameters
     ----------
     trained_model_path : str
-        Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
+        Path to a serialized model and its scaler (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
     binary_features : int, default=0
@@ -67,8 +73,18 @@ class ObjectiveFunction():
         return new_feature_values
 
     def _handle_hybrid(self, features_array):
-
-
+        total_features = features_array.shape[0]
+        if self.binary_features > total_features:
+            raise ValueError("self.binary_features exceeds total number of features.")
+
+        # Handle corner case where all features are binary
+        if self.binary_features == total_features:
+            feat_binary = (features_array > 0.5).astype(int)
+            return feat_binary
+
+        # Normal case: split into continuous and binary parts
+        feat_continuous = features_array[:-self.binary_features]
+        feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
         new_feature_values = np.concatenate([feat_continuous, feat_binary])
         return new_feature_values
 
@@ -92,7 +108,7 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, scaler={type(self.scaler).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
-def _set_boundaries(lower_boundaries:
+def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
     lower = np.array(lower_boundaries)
@@ -112,31 +128,40 @@ def _save_results(*dicts, save_dir: str, target_name: str):
     combined_dict = dict()
     for single_dict in dicts:
         combined_dict.update(single_dict)
-
-
+
+    sanitized_target_name = sanitize_filename(target_name)
+
+    full_path = os.path.join(save_dir, f"Optimization_{sanitized_target_name}.csv")
     pl.DataFrame(combined_dict).write_csv(full_path)
 
 
-def run_pso(lower_boundaries:
-
+def run_pso(lower_boundaries: list[float],
+            upper_boundaries: list[float],
+            objective_function: ObjectiveFunction,
+            save_results_dir: str,
+            auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
-            swarm_size: int=100,
+            swarm_size: int=100,
+            max_iterations: int=100,
            inequality_constrain_function=None,
-            post_hoc_analysis: Union[int, None]=None
+            post_hoc_analysis: Union[int, None]=None,
+            workers: int=5) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
-    Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results.
+    Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
     Parameters
     ----------
-    lower_boundaries :
-        Lower bounds for each feature in the search space.
-    upper_boundaries :
-        Upper bounds for each feature in the search space.
+    lower_boundaries : list[float]
+        Lower bounds for each feature in the search space (as many as features expected by the model).
+    upper_boundaries : list[float]
+        Upper bounds for each feature in the search space (as many as features expected by the model).
     objective_function : ObjectiveFunction
         A callable object encapsulating a regression model and its scaler.
     save_results_dir : str
        Directory path to save the results CSV file.
+    auto_binary_boundaries : bool
+        Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
     target_name : str or None, optional
        Name of the target variable. If None, attempts to retrieve from the ObjectiveFunction object.
     feature_names : list[str] or None, optional
@@ -149,30 +174,38 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
        Optional function defining inequality constraints to be respected by the optimization.
     post_hoc_analysis : int or None, optional
        If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
+    workers : int
+        Number of parallel processes to use.
 
     Returns
     -------
     Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
        If `post_hoc_analysis` is None, returns two dictionaries:
-        -
-        -
+        - feature_names: Feature values (after inverse scaling) that yield the best result.
+        - target_name: Best result obtained for the target variable.
 
        If `post_hoc_analysis` is an integer, returns two dictionaries:
-        -
-        -
+        - feature_names: Lists of best feature values (after inverse scaling) for each repetition.
+        - target_name: List of best target values across repetitions.
 
     Notes
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
     - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
     """
+    # Append binary boundaries
+    binary_number = objective_function.binary_features
+    if auto_binary_boundaries and binary_number > 0:
+        lower_boundaries.extend([0] * binary_number)
+        upper_boundaries.extend([1] * binary_number)
+
     lower, upper = _set_boundaries(lower_boundaries, upper_boundaries)
-
+
     # feature names
     if feature_names is None and objective_function.feature_names is not None:
         feature_names = objective_function.feature_names
     names = _set_feature_names(size=len(lower_boundaries), names=feature_names)
-
+
     # target name
     if target_name is None and objective_function.target_name is not None:
         target_name = objective_function.target_name
@@ -186,13 +219,15 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
         "f_ieqcons": inequality_constrain_function,
         "swarmsize": swarm_size,
         "maxiter": max_iterations,
-        "processes":
-        "particle_output":
+        "processes": workers,
+        "particle_output": False
     }
 
-
-
-
+    os.makedirs(save_results_dir, exist_ok=True)
+
+    if post_hoc_analysis is None or post_hoc_analysis == 1:
+        best_features, best_target, *_ = _pso(**arguments)
+        # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
 
         # inverse transformation
         best_features = np.array(best_features).reshape(1, -1)
@@ -209,9 +244,9 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
     else:
         all_best_targets = list()
         all_best_features = [[] for _ in range(len(lower_boundaries))]
-        for
-
-            best_features, best_target, _particle_positions, _target_values_per_position =
+        for _ in range(post_hoc_analysis):
+            best_features, best_target, *_ = _pso(**arguments)
+            # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
 
             # inverse transformation
             best_features = np.array(best_features).reshape(1, -1)
@@ -231,6 +266,8 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
     return all_best_features_named, all_best_targets_named # type: ignore
 
 
+def info():
+    _script_info(__all__)
 
 
 ### SOURCE CODE FOR PSO ###
@@ -249,7 +286,7 @@ def _cons_ieqcons_wrapper(ieqcons, args, kwargs, x):
 def _cons_f_ieqcons_wrapper(f_ieqcons, args, kwargs, x):
     return np.array(f_ieqcons(x, *args, **kwargs))
 
-def
+def _pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
         swarmsize=100, omega=0.5, phip=0.5, phig=0.5, maxiter=100,
         minstep=1e-8, minfunc=1e-8, debug=False, processes=1,
         particle_output=False):
@@ -377,7 +414,7 @@ def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
         for i in range(S):
            fx[i] = obj(x[i, :])
            fs[i] = is_feasible(x[i, :])
-
+
         # Store particle's best position (if constraints are satisfied)
         i_update = np.logical_and((fx < fp), fs)
         p[i_update, :] = x[i_update, :].copy()
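Note: given the new `run_pso` signature and the `ObjectiveFunction` docstring above, a minimal usage sketch could look like the following. The joblib path, boundary values, and the exact `ObjectiveFunction` constructor call are illustrative assumptions, not taken from this diff.

from ml_tools.particle_swarm_optimization import ObjectiveFunction, run_pso

# Hypothetical objective: the joblib file must bundle a 'model' and a 'scaler';
# 'feature_names' and 'target_name' are picked up if present.
objective = ObjectiveFunction(
    trained_model_path="artifacts/regressor.joblib",  # assumed path
    add_noise=False,
    binary_features=2,  # the last two features are thresholded at 0.5
)

# Boundaries cover the continuous features only; with auto_binary_boundaries=True,
# [0, 1] bounds for the 2 binary features are appended automatically.
best_features_dict, best_target_dict = run_pso(
    lower_boundaries=[0.0, 10.0, 0.5],
    upper_boundaries=[1.0, 300.0, 5.0],
    objective_function=objective,
    save_results_dir="pso_results",
    max_iterations=200,
    workers=5,
)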
ml_tools/pytorch_models.py
CHANGED
@@ -1,5 +1,12 @@
 import torch
 from torch import nn
+from .utilities import _script_info
+
+
+__all__ = [
+    "MyNeuralNetwork",
+    "MyLSTMNetwork"
+]
 
 
 class MyNeuralNetwork(nn.Module):
@@ -73,9 +80,11 @@ class MyNeuralNetwork(nn.Module):
         return X
 
 
-class
+class _MyConvolutionalNetwork(nn.Module):
     def __init__(self, outputs: int, color_channels: int=3, img_size: int=256, drop_out: float=0.2):
         """
+        - EDUCATIONAL PURPOSES ONLY, not optimized and requires lots of memory.
+
         Create a basic Convolutional Neural Network with two convolution layers with a pooling layer after each convolution.
 
         Args:
@@ -225,3 +234,6 @@ class MyLSTMNetwork(nn.Module):
         else:
             return output
 
+
+def info():
+    _script_info(__all__)
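Note: every refactored module in 1.4.1 now exposes an `__all__` list plus an `info()` helper that delegates to `_script_info` (added in ml_tools/utilities.py, shown further below). A short sketch of the expected behavior:

from ml_tools import pytorch_models

pytorch_models.info()
# Per _script_info(), this prints:
# Available functions and objects:
# 1 - MyNeuralNetwork
# 2 - MyLSTMNetwork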
ml_tools/trainer.py
CHANGED
@@ -6,6 +6,12 @@ import matplotlib.pyplot as plt
 import torch
 from torch import nn
 from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error
+from .utilities import _script_info
+
+
+__all__ = [
+    "MyTrainer"
+]
 
 
 class MyTrainer():
@@ -288,36 +294,6 @@ class MyTrainer():
             print(f"Area under the curve score: {area_under_curve:4.2f}")
         else:
             print("Error encountered while retrieving 'model.kind' attribute.")
-
-
-    def forecast(self, samples_list: list[torch.Tensor], view_as: tuple[int,int]=(1,-1)):
-        """
-        DEPRECATED - Use `helpers.model_predict()` instead
-
-        Returns a list containing lists of predicted values, one for each sample.
-
-        Each sample must be a tensor and have the same shape and normalization expected by the model
-        (this method will add the batch dimension automatically).
-
-        Args:
-            `samples_list`: list of tensors.
-
-            `view_as`: reshape each output, default is (1,-1).
-
-        Returns: List of lists.
-        """
-        self.model.eval()
-        results = list()
-        with torch.no_grad():
-            for data_point in samples_list:
-                data_point = data_point.unsqueeze(0).to(self.device)
-                output = self.model(data_point)
-                if self.kind == "classification":
-                    results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
-                else: #regression
-                    results.append(output.view(view_as).cpu().tolist())
-
-        return results
 
 
     def rnn_forecast(self, sequence: torch.Tensor, steps: int):
@@ -364,3 +340,7 @@ class MyTrainer():
         # Cast to array and return
         predictions = numpy.array(predictions)
         return predictions
+
+
+def info():
+    _script_info(__all__)
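Note: the removed `MyTrainer.forecast()` was already flagged as deprecated in favor of `model_predict()` (exported from ml_tools/vision_helpers.py, see below). A hedged migration sketch; the parameters after `model` and `kind` are assumed to mirror the deleted method and are not confirmed by this diff:

from ml_tools.vision_helpers import model_predict

predictions = model_predict(
    model=trained_module,          # a torch.nn.Module
    kind="regression",             # or "classification"
    samples_list=[sample_tensor],  # assumed: tensors shaped/normalized as the model expects
    view_as=(1, -1),               # assumed: reshape applied to each output
)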
ml_tools/utilities.py
CHANGED
@@ -4,19 +4,30 @@ import pandas as pd
 import os
 from pathlib import Path
 import re
+from typing import Literal
 
 
-
+# Keep track of available tools
+__all__ = [
+    "list_csv_paths",
+    "load_dataframe",
+    "yield_dataframes_from_dir",
+    "merge_dataframes",
+    "save_dataframe",
+    "normalize_mixed_list",
+    "sanitize_filename"
+]
+
+
+def list_csv_paths(directory: str) -> dict[str, str]:
     """
-    Lists all
+    Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.
 
     Parameters:
         directory (str): Path to the directory containing `.csv` files.
 
     Returns:
-
-        - List of absolute paths to `.csv` files.
-        - List of corresponding base names (without extensions).
+        (dict[str, str]): Mapping {name, path}.
     """
     dir_path = Path(directory).expanduser().resolve()
 
@@ -26,11 +37,15 @@ def list_csv_paths(directory: str) -> tuple[list[str], list[str]]:
     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
         raise IOError(f"No CSV files found in directory: {dir_path}")
+
+    # make a dictionary of paths and names
+    name_path_dict = {p.stem: str(p) for p in csv_paths}
+
+    print("🗂️ CSV files found:")
+    for name in name_path_dict.keys():
+        print(f"\t{name}")
 
-
-    names = [p.stem for p in csv_paths]
-
-    return paths, names
+    return name_path_dict
 
 
 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
@@ -49,7 +64,7 @@ def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     df_name = path.stem
     if df.empty:
         raise ValueError(f"DataFrame '{df_name}' is empty.")
-    print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
+    print(f"\n💿 Loaded dataset: '{df_name}' with shape: {df.shape}")
     return df, df_name
 
 
@@ -71,15 +86,96 @@ def yield_dataframes_from_dir(datasets_dir: str):
     - CSV files are read using UTF-8 encoding.
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
-    for
-        df =
-        print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
+    for df_name, df_path in list_csv_paths(datasets_dir).items():
+        df, _ = load_dataframe(df_path)
         yield df, df_name
+
+
+def merge_dataframes(
+    *dfs: pd.DataFrame,
+    reset_index: bool = False,
+    direction: Literal["horizontal", "vertical"] = "horizontal"
+) -> pd.DataFrame:
+    """
+    Merges multiple DataFrames either horizontally or vertically.
+
+    Parameters:
+        *dfs (pd.DataFrame): Variable number of DataFrames to merge.
+        reset_index (bool): Whether to reset index in the final merged DataFrame.
+        direction (["horizontal" | "vertical"]):
+            - "horizontal": Merge on index, adding columns.
+            - "vertical": Append rows; all DataFrames must have identical columns.
+
+    Returns:
+        pd.DataFrame: A single merged DataFrame.
+
+    Raises:
+        ValueError:
+            - If fewer than 2 DataFrames are provided.
+            - If indexes do not match for horizontal merge.
+            - If column names or order differ for vertical merge.
+    """
+    if len(dfs) < 2:
+        raise ValueError("At least 2 DataFrames must be provided.")
+
+    for i, df in enumerate(dfs, start=1):
+        print(f"DataFrame {i} shape: {df.shape}")
+
+
+    if direction == "horizontal":
+        reference_index = dfs[0].index
+        for i, df in enumerate(dfs, start=1):
+            if not df.index.equals(reference_index):
+                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+        merged_df = pd.concat(dfs, axis=1)
+
+    elif direction == "vertical":
+        reference_columns = dfs[0].columns
+        for i, df in enumerate(dfs, start=1):
+            if not df.columns.equals(reference_columns):
+                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+        merged_df = pd.concat(dfs, axis=0)
+
+    else:
+        raise ValueError(f"Invalid merge direction: {direction}")
+
+    if reset_index:
+        merged_df = merged_df.reset_index(drop=True)
+
+    print(f"Merged DataFrame shape: {merged_df.shape}")
+
+    return merged_df
+
+
+def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+    """
+    Save a pandas DataFrame to a CSV file.
+
+    Parameters:
+        df: pandas.DataFrame to save
+        save_dir: str, directory where the CSV file will be saved.
+        filename: str, CSV filename, extension will be added if missing.
+    """
+    if df.empty:
+        print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
+        return
+
+    os.makedirs(save_dir, exist_ok=True)
+
+    filename = sanitize_filename(filename)
+
+    if not filename.endswith('.csv'):
+        filename += '.csv'
 
+    output_path = os.path.join(save_dir, filename)
 
+    df.to_csv(output_path, index=False, encoding='utf-8')
+    print(f"✅ Saved file: '{filename}'")
+
+
 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
     """
-    Normalize a mixed list of numeric values and strings so that the sum of the values equals 1.0,
+    Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
     applying heuristic adjustments to correct for potential data entry scale mismatches.
 
     Parameters:
@@ -166,3 +262,15 @@ def sanitize_filename(filename: str) -> str:
 
     return sanitized
 
+
+def _script_info(all_data: list[str]):
+    """
+    List available names.
+    """
+    print("Available functions and objects:")
+    for i, name in enumerate(all_data, start=1):
+        print(f"{i} - {name}")
+
+
+def info():
+    _script_info(__all__)
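Note: `list_csv_paths` now returns a `{name: path}` dictionary instead of two parallel lists, so callers iterate over `.items()`. A short usage sketch of the updated and newly added helpers (directory and file names are placeholders):

import pandas as pd
from ml_tools.utilities import list_csv_paths, load_dataframe, merge_dataframes, save_dataframe

# New return type: {filename_without_extension: absolute_path}
for name, path in list_csv_paths("datasets").items():
    df, df_name = load_dataframe(path)

# Horizontal merges require identical indexes; vertical merges require identical columns.
left = pd.DataFrame({"a": [1, 2]})
right = pd.DataFrame({"b": [3, 4]})
merged = merge_dataframes(left, right, direction="horizontal")

# The filename is sanitized and '.csv' is appended if missing.
save_dataframe(merged, save_dir="output", filename="merged_data")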
ml_tools/vision_helpers.py
CHANGED
@@ -4,9 +4,18 @@ from PIL import Image, ImageOps
 from typing import Literal
 from torchvision import transforms
 import torch
+from .utilities import _script_info
+
+
+__all__ = [
+    "inspect_images",
+    "image_augmentation",
+    "ResizeAspectFill",
+    "is_image",
+    "model_predict"
+]
 
 
-# --- Helper Functions ---
 def inspect_images(path: str):
     """
     Prints out the types, sizes and channels of image files found in the directory and its subdirectories.
@@ -216,3 +225,7 @@ def model_predict(model: torch.nn.Module, kind: Literal["regression", "classific
             results.append(output.view(view_as).cpu().tolist())
 
     return results
+
+
+def info():
+    _script_info(__all__)
dragon_ml_toolbox-1.3.2.dist-info/RECORD
DELETED

@@ -1,18 +0,0 @@
-dragon_ml_toolbox-1.3.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.3.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
-ml_tools/MICE_imputation.py,sha256=71Kdi5rhPePIT5rJKIyRCM7ORPSjeujQCzKcLIwXs90,9428
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=laTNbN5_xlhqWiKfF-cJ9yMZ8zAM2a-AryqgiIQBBLg,26649
-ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
-ml_tools/ensemble_learning.py,sha256=5UmlXI3Orm5zL0P07Ub_Y0gwjruH-REHY-cFWQpJWb0,29085
-ml_tools/handle_excel.py,sha256=IR0VQc3hYdmjwC31E5YxDnRcWig4jSIx7Y_7to-KZz4,11969
-ml_tools/logger.py,sha256=XwSpCUzw2Le24fJHyljBxNLgw63SwjZ0pMjTJqf0ylI,4622
-ml_tools/particle_swarm_optimization.py,sha256=jpkje4OETC9fyISxxUTx4XGrImSU6gDEcwz46ZDs2bQ,19250
-ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,9908
-ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
-ml_tools/utilities.py,sha256=mG_--EFplfI9H7OhrWI8VkdNJtTbs4Wbz32xvcFWps8,5518
-ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
-dragon_ml_toolbox-1.3.2.dist-info/METADATA,sha256=NgNKZD1v97kBBdE96OJELolvlAXviJ-DgJvZAjjy5Ik,2309
-dragon_ml_toolbox-1.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.3.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.3.2.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes