dragon-ml-toolbox 1.4.5__py3-none-any.whl → 1.4.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their public registry.
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/RECORD +9 -9
- ml_tools/data_exploration.py +36 -5
- ml_tools/particle_swarm_optimization.py +50 -12
- ml_tools/utilities.py +39 -4
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/RECORD
CHANGED

@@ -1,19 +1,19 @@
-dragon_ml_toolbox-1.4.
-dragon_ml_toolbox-1.4.
+dragon_ml_toolbox-1.4.7.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.7.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
 ml_tools/MICE_imputation.py,sha256=JMe9hyidJadFTHW7AHkNQ_fduTxH6CEh7_Ouy2LhCOQ,11096
 ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=
+ml_tools/data_exploration.py,sha256=X9mYZdynRGghT06GeOdVsfGBTFa342Ko-MkMImDll-M,20123
 ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
 ml_tools/ensemble_learning.py,sha256=xJyEbkFObm5YX6DmDW10FOUjSeYeBRhHLvncWZv_uTo,37319
 ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
 ml_tools/logger.py,sha256=ZTtUB9HTkNs5zHTdYRKNbKADjUkuObsF7s8U5pNnVRA,4716
-ml_tools/particle_swarm_optimization.py,sha256=
+ml_tools/particle_swarm_optimization.py,sha256=wRk5ni6pPnh-tqS5t9M5TAjg8GUSGFxp-u09FSIviOM,22213
 ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
 ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=
+ml_tools/utilities.py,sha256=Ir3Yw4SuWMLKnbnl4Qzudn5U8CgcQ7zMtNqcllZMHeM,15682
 ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-1.4.
-dragon_ml_toolbox-1.4.
-dragon_ml_toolbox-1.4.
-dragon_ml_toolbox-1.4.
+dragon_ml_toolbox-1.4.7.dist-info/METADATA,sha256=Z_ai6XNFd8a_sf9CD73kh3mgVJlgVhcajXiJhkEivsM,2516
+dragon_ml_toolbox-1.4.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.7.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.7.dist-info/RECORD,,
ml_tools/data_exploration.py
CHANGED
@@ -5,9 +5,10 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple,
+from typing import Union, Literal, Dict, Tuple, List
 import os
 from ml_tools.utilities import sanitize_filename, _script_info
+import re


 # Keep track of all available tools, show using `info()`
@@ -22,7 +23,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "match_and_filter_columns_by_regex"
 ]

@@ -245,9 +247,6 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         cbar_kws={"shrink": 0.8}
     )

-    # sanitize the plot title
-    plot_title = sanitize_filename(plot_title)
-
     plt.title(plot_title)
     plt.xticks(rotation=45, ha='right')
     plt.yticks(rotation=0)
@@ -255,6 +254,8 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()

     if save_dir:
+        # sanitize the plot title to save the file
+        plot_title = sanitize_filename(plot_title)
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
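The effect of moving the `sanitize_filename` call: the on-screen title now keeps its original characters, and sanitization runs only when composing the saved file name. A minimal sketch, assuming `plot_title` is a keyword parameter (the signature is truncated in this diff) and using an invented DataFrame:

import pandas as pd
from ml_tools.data_exploration import plot_correlation_heatmap

# Small invented numeric frame, just enough to draw a heatmap.
df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [1, 0, 1, 0]})

# The displayed title keeps characters like ":" and "/";
# sanitize_filename() now runs only when building the .svg file name.
plot_correlation_heatmap(df, save_dir="plots", plot_title="Correlation: X / Y")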
@@ -518,6 +519,36 @@ def clip_outliers_multi(
     return new_df


+def match_and_filter_columns_by_regex(
+    df: pd.DataFrame,
+    pattern: str,
+    case_sensitive: bool = False,
+    escape_pattern: bool = False
+) -> Tuple[pd.DataFrame, List[str]]:
+    """
+    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
+
+    Parameters:
+        df (pd.DataFrame): The DataFrame to search.
+        pattern (str): The regex pattern to match column names (use a raw string).
+        case_sensitive (bool): Whether matching is case-sensitive.
+        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
+
+    Returns:
+        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
+    """
+    if escape_pattern:
+        pattern = re.escape(pattern)
+
+    mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
+    matched_columns = df.columns[mask].to_list()
+    filtered_df = df.loc[:, mask]
+
+    print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+
+    return filtered_df, matched_columns
+
+
 def _is_notebook():
     return get_ipython() is not None

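A quick usage sketch for the new `match_and_filter_columns_by_regex` helper; the DataFrame and column names below are invented for illustration:

import pandas as pd
from ml_tools.data_exploration import match_and_filter_columns_by_regex

df = pd.DataFrame({
    "temp_1": [20.1, 21.3],
    "temp_2": [19.8, 22.0],
    "pressure": [1.01, 0.99],
})

# Keep only columns starting with "temp"; matching is case-insensitive by default.
filtered_df, matched = match_and_filter_columns_by_regex(df, r"^temp")
# prints: 2 column(s) match the regex pattern '^temp'.
print(matched)                    # ['temp_1', 'temp_2']
print(list(filtered_df.columns))  # same two columns, original rows preserved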
ml_tools/particle_swarm_optimization.py
CHANGED

@@ -8,11 +8,13 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from
+from copy import deepcopy
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object, list_files_by_extension


 __all__ = [
     "ObjectiveFunction",
+    "multiple_objective_functions_from_dir",
     "run_pso"
 ]

@@ -29,12 +31,12 @@ class ObjectiveFunction():
         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
-
-        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
-    task : Literal, default 'maximization'
+    task : (Literal["maximization", "minimization"])
         Whether to maximize or minimize the target.
+    binary_features : int
+        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
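The corrected docstring and completed `__init__` signature correspond to a call like the following sketch; the model path is hypothetical and must point to an existing `.joblib` file exposing a scikit-learn-like `.predict`:

from ml_tools.particle_swarm_optimization import ObjectiveFunction

objective = ObjectiveFunction(
    trained_model_path="models/regressor.joblib",  # hypothetical path
    add_noise=True,
    task="maximization",
    binary_features=2,  # binary features must sit at the END of the feature vector
)
print(objective)  # repr reports model type, noise flag, hybrid flag, and task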
@@ -96,6 +98,35 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


+def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+    """
+    Loads multiple objective functions from serialized models in the given directory.
+
+    Each `.joblib` file is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
+
+    Parameters:
+        directory (str) : Path to the directory containing `.joblib` files (serialized models).
+        add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
+        task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
+        binary_features (int) : Number of binary features expected by each objective function.
+
+    Returns:
+        (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
+            - list of `ObjectiveFunction` instances.
+            - list of corresponding filenames.
+    """
+    objective_functions = list()
+    objective_function_names = list()
+    for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
+        current_objective = ObjectiveFunction(trained_model_path=file_path,
+                                              add_noise=add_noise,
+                                              task=task,
+                                              binary_features=binary_features)
+        objective_functions.append(current_objective)
+        objective_function_names.append(file_name)
+    return objective_functions, objective_function_names
+
+
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
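The new loader pairs with `list_files_by_extension` from ml_tools/utilities.py, added in this same release (see below). A usage sketch with a hypothetical models directory:

from ml_tools.particle_swarm_optimization import multiple_objective_functions_from_dir

objectives, names = multiple_objective_functions_from_dir(
    directory="models/",  # hypothetical folder of .joblib models
    add_noise=False,
    task="maximization",
    binary_features=2,
)
for name, objective in zip(names, objectives):
    print(name, objective)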
@@ -131,9 +162,9 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=1000,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=
+            post_hoc_analysis: Optional[int]=3,
             workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
@@ -180,18 +211,25 @@ def run_pso(lower_boundaries: list[float],
     -----
     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
     """
+    # set local deep copies to prevent in place list modification
+    local_lower_boundaries = deepcopy(lower_boundaries)
+    local_upper_boundaries = deepcopy(upper_boundaries)
+
     # Append binary boundaries
     binary_number = objective_function.binary_features
     if auto_binary_boundaries and binary_number > 0:
-
-
+        local_lower_boundaries.extend([0] * binary_number)
+        local_upper_boundaries.extend([1] * binary_number)
+
+    # Set the total length of features
+    size_of_features = len(local_lower_boundaries)

-    lower, upper = _set_boundaries(
+    lower, upper = _set_boundaries(local_lower_boundaries, local_upper_boundaries)

     # feature names
     if feature_names is None and objective_function.feature_names is not None:
         feature_names = objective_function.feature_names
-    names = _set_feature_names(size=
+    names = _set_feature_names(size=size_of_features, names=feature_names)

     # target name
     if target_name is None and objective_function.target_name is not None:
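The deep copies fix a real pitfall: before this change, `run_pso` extended the caller's boundary lists in place, so reusing the same lists across calls silently grew them. A standalone sketch of the before/after behavior (helper names are invented for illustration):

from copy import deepcopy

def extend_inplace(bounds: list[float], n_binary: int) -> list[float]:
    # pre-1.4.7 behavior: the caller's list is mutated
    bounds.extend([0] * n_binary)
    return bounds

def extend_on_copy(bounds: list[float], n_binary: int) -> list[float]:
    # 1.4.7 behavior: work on a deep copy, the caller's list is untouched
    local = deepcopy(bounds)
    local.extend([0] * n_binary)
    return local

bounds = [0.0, 0.0, 10.0]
extend_inplace(bounds, 2)
print(bounds)    # [0.0, 0.0, 10.0, 0, 0] -- grows again on every call

bounds = [0.0, 0.0, 10.0]
extended = extend_on_copy(bounds, 2)
print(bounds)    # [0.0, 0.0, 10.0] -- unchanged
print(extended)  # [0.0, 0.0, 10.0, 0, 0]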
@@ -233,7 +271,7 @@ def run_pso(lower_boundaries: list[float],
         return best_features_named, best_target_named
     else:
         all_best_targets = list()
-        all_best_features = [[] for _ in range(
+        all_best_features = [[] for _ in range(size_of_features)]
         for _ in range(post_hoc_analysis):
             best_features, best_target, *_ = _pso(**arguments)
             # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
ml_tools/utilities.py
CHANGED
@@ -13,6 +13,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 # Keep track of available tools
 __all__ = [
     "list_csv_paths",
+    "list_files_by_extension",
     "load_dataframe",
     "yield_dataframes_from_dir",
     "merge_dataframes",
@@ -34,7 +35,7 @@ def list_csv_paths(directory: str) -> dict[str, str]:
         directory (str): Path to the directory containing `.csv` files.

     Returns:
-        (dict[str, str]):
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
     """
     dir_path = Path(directory).expanduser().resolve()

@@ -48,13 +49,47 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     # make a dictionary of paths and names
     name_path_dict = {p.stem: str(p) for p in csv_paths}

-    print("🗂️ CSV files found:")
+    print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
         print(f"\t{name}")

     return name_path_dict


+def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+    """
+    Lists all files with the specified extension in the given directory and returns a mapping:
+    filenames (without extensions) to their absolute paths.
+
+    Parameters:
+        directory (str): Path to the directory to search in.
+        extension (str): File extension to search for (e.g., 'json', 'txt').
+
+    Returns:
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = Path(directory).expanduser().resolve()
+
+    if not dir_path.is_dir():
+        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+    # Normalize the extension (remove leading dot if present)
+    normalized_ext = extension.lstrip(".").lower()
+    pattern = f"*.{normalized_ext}"
+
+    matched_paths = list(dir_path.glob(pattern))
+    if not matched_paths:
+        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+
+    name_path_dict = {p.stem: str(p) for p in matched_paths}
+
+    print(f"\n📂 '{normalized_ext.upper()}' files found:")
+    for name in name_path_dict:
+        print(f"\t{name}")
+
+    return name_path_dict
+
+
 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
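A usage sketch for the new `list_files_by_extension`; the directory is hypothetical, and the extension may be passed with or without the leading dot:

from ml_tools.utilities import list_files_by_extension

json_files = list_files_by_extension("data/", ".json")  # hypothetical folder
for name, path in json_files.items():
    print(f"{name} -> {path}")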
@@ -404,8 +439,8 @@ def distribute_datasets_by_target(
     Yields
     ------
     Tuple[str, pd.DataFrame]
-        *
-        *
+        * Target name.
+        * Pandas DataFrame.
     """
     # Validate path
     if isinstance(df_or_path, str):
Files without changes:
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/WHEEL
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/licenses/LICENSE
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/licenses/LICENSE-THIRD-PARTY.md
- {dragon_ml_toolbox-1.4.5.dist-info → dragon_ml_toolbox-1.4.7.dist-info}/top_level.txt