dragon-ml-toolbox 1.4.4__py3-none-any.whl → 1.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-1.4.6.dist-info/RECORD +19 -0
- ml_tools/MICE_imputation.py +22 -6
- ml_tools/data_exploration.py +33 -38
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/particle_swarm_optimization.py +41 -9
- ml_tools/utilities.py +185 -26
- dragon_ml_toolbox-1.4.4.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/top_level.txt +0 -0
dragon_ml_toolbox-1.4.6.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
+dragon_ml_toolbox-1.4.6.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.6.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
+ml_tools/MICE_imputation.py,sha256=JMe9hyidJadFTHW7AHkNQ_fduTxH6CEh7_Ouy2LhCOQ,11096
+ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
+ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/data_exploration.py,sha256=X9mYZdynRGghT06GeOdVsfGBTFa342Ko-MkMImDll-M,20123
+ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ml_tools/ensemble_learning.py,sha256=xJyEbkFObm5YX6DmDW10FOUjSeYeBRhHLvncWZv_uTo,37319
+ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ml_tools/logger.py,sha256=ZTtUB9HTkNs5zHTdYRKNbKADjUkuObsF7s8U5pNnVRA,4716
+ml_tools/particle_swarm_optimization.py,sha256=ByCYFV8PWP9CYGZ0wblphtmDLRbSezY9a0_fGqGWQV4,21891
+ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ml_tools/utilities.py,sha256=Ir3Yw4SuWMLKnbnl4Qzudn5U8CgcQ7zMtNqcllZMHeM,15682
+ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+dragon_ml_toolbox-1.4.6.dist-info/METADATA,sha256=SDqa8Cz72fH669cfuMIcVX02SC0DeK0UmKU-fAPx4AU,2516
+dragon_ml_toolbox-1.4.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.6.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.6.dist-info/RECORD,,
ml_tools/MICE_imputation.py
CHANGED
@@ -3,8 +3,9 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
+from typing import Optional
 
 
 __all__ = [
@@ -17,7 +18,7 @@ __all__ = [
 ]
 
 
-def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str]]=None, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
 
     # Initialize kernel with number of imputed datasets to generate
     kernel = mf.ImputationKernel(
@@ -35,6 +36,16 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     if imputed_datasets is None or len(imputed_datasets) == 0:
         raise ValueError("No imputed datasets were generated. Check the MICE process.")
 
+    # threshold binary columns
+    if binary_columns is not None:
+        invalid_binary_columns = set(binary_columns) - set(df.columns)
+        if invalid_binary_columns:
+            print(f"⚠️ These 'binary columns' are not in the dataset: {invalid_binary_columns}")
+        valid_binary_columns = [col for col in binary_columns if col not in invalid_binary_columns]
+        for imputed_df in imputed_datasets:
+            for binary_column_name in valid_binary_columns:
+                imputed_df[binary_column_name] = threshold_binary_values(imputed_df[binary_column_name]) # type: ignore
+
     if resulting_datasets == 1:
         imputed_dataset_names = [f"{df_name}_MICE"]
     else:
@@ -106,7 +117,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     # Adjust plot display for the X axis
     _ticks = np.arange(iterations_cap)
     _labels = np.arange(1, iterations_cap + 1)
-    plt.xticks(ticks=_ticks, labels=_labels)
+    plt.xticks(ticks=_ticks, labels=_labels) # type: ignore
     plt.grid(True)
 
     feature_save_name = sanitize_filename(feature_name)
@@ -202,7 +213,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     print(f"{local_dir_name} completed.")
 
 
-def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+                      save_datasets_dir: str, save_metrics_dir: str,
+                      binary_columns: Optional[list[str]]=None,
+                      resulting_datasets: int=1,
+                      iterations: int=20,
+                      random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -211,7 +227,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas
     4. Save convergence metrics
     5. Save distribution metrics
 
-    Target columns must be skipped from the imputation.
+    Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
    """
    # Check paths
    os.makedirs(save_datasets_dir, exist_ok=True)
@@ -229,7 +245,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas
 
     df, df_targets = _skip_targets(df, target_columns)
 
-    kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
+    kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
 
     save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
 
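Editor's note: the MICE changes above let callers pass `binary_columns` through `run_mice_pipeline` so that imputed binary features are snapped back to 0/1 after imputation. A minimal sketch of how the updated signature might be called is below; the paths and column names are placeholders, only the keyword arguments come from the diff above.

```python
from ml_tools.MICE_imputation import run_mice_pipeline

# Hypothetical paths and column names; the keyword arguments mirror the new signature.
run_mice_pipeline(
    df_path_or_dir="data/raw_csvs",              # single CSV or a directory of CSVs
    target_columns=["target"],                   # targets are skipped from imputation
    save_datasets_dir="data/imputed",
    save_metrics_dir="reports/mice_metrics",
    binary_columns=["is_smoker", "has_disease"], # thresholded to 0/1 after imputation
    resulting_datasets=1,
    iterations=20,
    random_state=101,
)
```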
ml_tools/data_exploration.py
CHANGED
@@ -5,9 +5,10 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple,
+from typing import Union, Literal, Dict, Tuple, List
 import os
 from ml_tools.utilities import sanitize_filename, _script_info
+import re
 
 
 # Keep track of all available tools, show using `info()`
@@ -23,7 +24,7 @@ __all__ = [
     "plot_value_distributions",
     "clip_outliers_single",
     "clip_outliers_multi",
-    "
+    "match_and_filter_columns_by_regex"
 ]
 
 
@@ -90,18 +91,18 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
 
     Returns:
         tuple: A tuple containing:
-            - pd.DataFrame: Targets dataframe.
             - pd.DataFrame: Features dataframe.
+            - pd.DataFrame: Targets dataframe.
 
     Prints:
         - Shape of the original dataframe.
-        - Shape of the targets dataframe.
         - Shape of the features dataframe.
+        - Shape of the targets dataframe.
     """
     df_targets = df[targets]
     df_features = df.drop(columns=targets)
-    print(f"Original shape: {df.shape}\
-    return
+    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+    return df_features, df_targets
 
 
 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
@@ -246,9 +247,6 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         cbar_kws={"shrink": 0.8}
     )
 
-    # sanitize the plot title
-    plot_title = sanitize_filename(plot_title)
-
     plt.title(plot_title)
     plt.xticks(rotation=45, ha='right')
     plt.yticks(rotation=0)
@@ -256,6 +254,8 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()
 
     if save_dir:
+        # sanitize the plot title to save the file
+        plot_title = sanitize_filename(plot_title)
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
@@ -519,38 +519,34 @@ def clip_outliers_multi(
     return new_df
 
 
-def
+def match_and_filter_columns_by_regex(
     df: pd.DataFrame,
-
-
-
+    pattern: str,
+    case_sensitive: bool = False,
+    escape_pattern: bool = False
+) -> Tuple[pd.DataFrame, List[str]]:
     """
-
-
-
-
-
-
-
-
-
-
-        Whether to print info for each yielded dataset.
-
-    Yields
-    ------
-    Tuple[str, pd.DataFrame]
-        * First element is the target column name.
-        * Second element is the corresponding cleaned DataFrame.
+    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
+
+    Parameters:
+        df (pd.DataFrame): The DataFrame to search.
+        pattern (str): The regex pattern to match column names (use a raw string).
+        case_sensitive (bool): Whether matching is case-sensitive.
+        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
+
+    Returns:
+        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
     """
-
-
+    if escape_pattern:
+        pattern = re.escape(pattern)
 
-
-
-
-
-
+    mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
+    matched_columns = df.columns[mask].to_list()
+    filtered_df = df.loc[:, mask]
+
+    print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+
+    return filtered_df, matched_columns
 
 
 def _is_notebook():
@@ -559,4 +555,3 @@ def _is_notebook():
 
 def info():
     _script_info(__all__)
-
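Editor's note: the new `match_and_filter_columns_by_regex` helper returns both the filtered DataFrame and the matched column names. A short usage sketch follows; the DataFrame and pattern are illustrative, not taken from the package.

```python
import pandas as pd
from ml_tools.data_exploration import match_and_filter_columns_by_regex

df = pd.DataFrame({"temp_min": [1.0, 2.0], "temp_max": [3.0, 4.0], "label": ["a", "b"]})

# Matching is case-insensitive by default; pass escape_pattern=True to treat the pattern literally.
filtered_df, matched = match_and_filter_columns_by_regex(df, pattern=r"^temp_")
print(matched)            # ['temp_min', 'temp_max']
print(filtered_df.shape)  # (2, 2)
```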
ml_tools/ensemble_learning.py
CHANGED
@@ -20,7 +20,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap
 
-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object
 
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -485,8 +485,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
 def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
-
-
+    filename = f"{model_name}_{sanitized_target_name}"
+    to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+    serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
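Editor's note: `_save_model` now bundles the estimator with its feature names and target into a single dictionary and hands it to `serialize_object` (defined in ml_tools/utilities.py, see below). The sketch mimics that artifact shape with a stand-in scikit-learn model; the names and paths are illustrative.

```python
from sklearn.linear_model import LinearRegression
from ml_tools.utilities import serialize_object, deserialize_object

# Same keys that _save_model writes: 'model', 'feature_names', 'target_name'.
artifact = {
    "model": LinearRegression(),
    "feature_names": ["feat_a", "feat_b"],
    "target_name": "strength",
}
path = serialize_object(obj=artifact, save_dir="models", filename="LinearRegression_strength")

if path is not None:
    loaded = deserialize_object(path, verbose=False)  # returns the same dict
    print(loaded["feature_names"])
```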
ml_tools/particle_swarm_optimization.py
CHANGED
@@ -8,11 +8,12 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info, threshold_binary_values
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object, list_files_by_extension
 
 
 __all__ = [
     "ObjectiveFunction",
+    "multiple_objective_functions_from_dir",
     "run_pso"
 ]
 
@@ -29,16 +30,16 @@ class ObjectiveFunction():
         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
     add_noise : bool
         Whether to apply multiplicative noise to the input features during evaluation.
-
-        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
-    task : Literal, default 'maximization'
+    task : (Literal["maximization", "minimization"])
         Whether to maximize or minimize the target.
+    binary_features : int
+        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int
+    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
-        self._artifact =
+        self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
         self.model = self._get_from_artifact('model')
         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
@@ -49,7 +50,7 @@ class ObjectiveFunction():
         if self.use_noise:
             features_array = self.add_noise(features_array)
         if self.is_hybrid:
-            features_array = threshold_binary_values(input_array=features_array,
+            features_array = threshold_binary_values(input_array=features_array, binary_values=self.binary_features) # type: ignore
 
         if features_array.ndim == 1:
             features_array = features_array.reshape(1, -1)
@@ -83,6 +84,8 @@ class ObjectiveFunction():
             raise ValueError("Loaded model is None")
 
     def _get_from_artifact(self, key: str):
+        if self._artifact is None:
+            raise TypeError("Load model error")
         val = self._artifact.get(key)
         if key == "feature_names":
             result = val if isinstance(val, list) and val else None
@@ -94,6 +97,35 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
+def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+    """
+    Loads multiple objective functions from serialized models in the given directory.
+
+    Each `.joblib` file which is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
+
+    Parameters:
+        directory (str) : Path to the directory containing `.joblib` files (serialized models).
+        add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
+        task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
+        binary_features (int) : Number of binary features expected by each objective function.
+
+    Returns:
+        (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
+            - list of `ObjectiveFunction` instances.
+            - list of corresponding filenames.
+    """
+    objective_functions = list()
+    objective_function_names = list()
+    for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
+        current_objective = ObjectiveFunction(trained_model_path=file_path,
+                                              add_noise=add_noise,
+                                              task=task,
+                                              binary_features=binary_features)
+        objective_functions.append(current_objective)
+        objective_function_names.append(file_name)
+    return objective_functions, objective_function_names
+
+
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
@@ -129,9 +161,9 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=1000,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=
+            post_hoc_analysis: Optional[int]=3,
             workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
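Editor's note: with the new loader, a directory of serialized model artifacts can be turned into objective functions in one call. A hedged sketch, assuming a placeholder directory `"models"` containing `.joblib` files written by the ensemble-learning step:

```python
from ml_tools.particle_swarm_optimization import multiple_objective_functions_from_dir

objectives, names = multiple_objective_functions_from_dir(
    directory="models",   # placeholder path with .joblib artifacts
    add_noise=False,
    task="maximization",
    binary_features=2,    # last two features of each model are binary
)
for name, objective in zip(names, objectives):
    print(name, objective)  # repr shows the wrapped model, noise/hybrid flags and task
```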
ml_tools/utilities.py
CHANGED
@@ -1,22 +1,29 @@
 import math
 import numpy as np
 import pandas as pd
+import polars as pl
 import os
 from pathlib import Path
 import re
-from typing import Literal, Union, Sequence
+from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+import joblib
+from joblib.externals.loky.process_executor import TerminatedWorkerError
 
 
 # Keep track of available tools
 __all__ = [
     "list_csv_paths",
+    "list_files_by_extension",
     "load_dataframe",
     "yield_dataframes_from_dir",
     "merge_dataframes",
     "save_dataframe",
     "normalize_mixed_list",
     "sanitize_filename",
-    "threshold_binary_values"
+    "threshold_binary_values",
+    "serialize_object",
+    "deserialize_object",
+    "distribute_datasets_by_target"
 ]
 
 
@@ -28,7 +35,7 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     directory (str): Path to the directory containing `.csv` files.
 
     Returns:
-        (dict[str, str]):
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
     """
     dir_path = Path(directory).expanduser().resolve()
 
@@ -42,13 +49,47 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     # make a dictionary of paths and names
     name_path_dict = {p.stem: str(p) for p in csv_paths}
 
-    print("🗂️ CSV files found:")
+    print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
         print(f"\t{name}")
 
     return name_path_dict
 
 
+def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+    """
+    Lists all files with the specified extension in the given directory and returns a mapping:
+    filenames (without extensions) to their absolute paths.
+
+    Parameters:
+        directory (str): Path to the directory to search in.
+        extension (str): File extension to search for (e.g., 'json', 'txt').
+
+    Returns:
+        (dict[str, str]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = Path(directory).expanduser().resolve()
+
+    if not dir_path.is_dir():
+        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+    # Normalize the extension (remove leading dot if present)
+    normalized_ext = extension.lstrip(".").lower()
+    pattern = f"*.{normalized_ext}"
+
+    matched_paths = list(dir_path.glob(pattern))
+    if not matched_paths:
+        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+
+    name_path_dict = {p.stem: str(p) for p in matched_paths}
+
+    print(f"\n📂 '{normalized_ext.upper()}' files found:")
+    for name in name_path_dict:
+        print(f"\t{name}")
+
+    return name_path_dict
+
+
 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
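Editor's note: `list_files_by_extension` generalizes `list_csv_paths` to any extension. A short usage sketch with a hypothetical directory:

```python
from ml_tools.utilities import list_files_by_extension

# The leading dot is optional; 'joblib' and '.joblib' behave the same.
models = list_files_by_extension(directory="models", extension=".joblib")
for name, path in models.items():
    print(name, "->", path)
```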
@@ -194,12 +235,9 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
 
     Returns:
         List[float]: A list of normalized float values summing to 1.0.
-            Values significantly smaller than the median scale are scaled up
-            before normalization to correct likely input errors.
 
     Notes:
         - Zeros and None values remain zero.
-        - If all input values are zero or None, the function returns a list of zeros.
         - Input strings are automatically cast to floats if possible.
 
     Example:
@@ -268,35 +306,156 @@ def sanitize_filename(filename: str) -> str:
 
 
 def threshold_binary_values(
-    input_array: Union[Sequence[float], np.ndarray],
-
-) -> np.ndarray:
+    input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
+    binary_values: Optional[int] = None
+) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
     """
-    Thresholds binary features in a 1D
+    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
 
-
+    Parameters:
+        input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
+        binary_values (Optional[int]) :
+            - If `None`, all values are treated as binary.
+            - If `int`, only this many last `binary_values` are thresholded.
+
+    Returns:
+        Same type as input, with binary elements binarized to 0 or 1 using a 0.5 threshold.
+    """
+    original_type = type(input_array)
+
+    if isinstance(input_array, pl.Series):
+        array = input_array.to_numpy()
+    elif isinstance(input_array, (pd.Series, np.ndarray)):
+        array = np.asarray(input_array)
+    elif isinstance(input_array, (list, tuple)):
+        array = np.array(input_array)
+    else:
+        raise TypeError("Unsupported input type")
+
+    array = array.flatten()
+    total = array.shape[0]
+
+    bin_count = total if binary_values is None else binary_values
+    if not (0 <= bin_count <= total):
+        raise ValueError("binary_values must be between 0 and the total number of elements")
+
+    if bin_count == 0:
+        result = array
+    else:
+        cont_part = array[:-bin_count] if bin_count < total else np.array([])
+        bin_part = (array[-bin_count:] > 0.5).astype(int)
+        result = np.concatenate([cont_part, bin_part])
+
+    if original_type is pd.Series:
+        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
+    elif original_type is pl.Series:
+        return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
+    elif original_type is list:
+        return result.tolist()
+    elif original_type is tuple:
+        return tuple(result)
+    else:
+        return result
+
+
+def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+    """
+    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
 
     Parameters:
-
+        obj (Any) : The Python object to serialize.
+        save_dir (str) : Directory path where the serialized object will be saved.
+        filename (str) : Name for the output file, extension will be appended if needed.
+
+    Returns:
+        (str | None) : The full file path where the object was saved if successful; otherwise, None.
+    """
+    try:
+        os.makedirs(save_dir, exist_ok=True)
+        sanitized_name = sanitize_filename(filename)
+        if not sanitized_name.endswith('.joblib'):
+            sanitized_name = sanitized_name + ".joblib"
+        full_path = os.path.join(save_dir, sanitized_name)
+        joblib.dump(obj, full_path)
+    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
+        message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+        return None
+    else:
+        if verbose:
+            print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+        return full_path
 
-
+
+def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+    """
+    Loads a serialized object from a .joblib file.
+
+    Parameters:
+        filepath (str): Full path to the serialized .joblib file.
 
     Returns:
-
+        (Any | None): The deserialized Python object, or None if loading fails.
     """
-
-
-
-
-
-
-
-
+    if not os.path.exists(filepath):
+        print(f"❌ File does not exist: {filepath}")
+        return None
+    try:
+        obj = joblib.load(filepath)
+    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
+        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+            return None
+    else:
+        if verbose:
+            print(f"✅ Loaded object of type '{type(obj)}'")
+        return obj
 
 
-
-
+def distribute_datasets_by_target(
+    df_or_path: Union[pd.DataFrame, str],
+    target_columns: list[str],
+    verbose: bool = False
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df_or_path : [pd.DataFrame | str]
+        Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+    verbose: bool
+        Whether to print info for each yielded dataset.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * Target name.
+        * Pandas DataFrame.
+    """
+    # Validate path
+    if isinstance(df_or_path, str):
+        df, _ = load_dataframe(df_or_path)
+    else:
+        df = df_or_path
 
-
+    valid_targets = [col for col in df.columns if col in target_columns]
+    feature_columns = [col for col in df.columns if col not in valid_targets]
+
+    for target in valid_targets:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        if verbose:
+            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
+        yield target, subset
 
 
 def _script_info(all_data: list[str]):
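Editor's note: the reworked `threshold_binary_values` and the new `distribute_datasets_by_target` generator can be exercised as in the sketch below; the array values and column names are made up for illustration.

```python
import numpy as np
import pandas as pd
from ml_tools.utilities import threshold_binary_values, distribute_datasets_by_target

# Continuous features first, two binary features at the end.
particle = np.array([0.37, 1.82, 0.61, 0.20])
thresholded = threshold_binary_values(particle, binary_values=2)
print(thresholded)  # continuous part unchanged, last two snapped to 1 and 0

# One wide table, one cleaned DataFrame per target; rows with a missing target are dropped.
df = pd.DataFrame({
    "feat_a": [1.0, 2.0, 3.0],
    "feat_b": [0.5, 0.1, 0.9],
    "target_1": [10.0, None, 12.0],
    "target_2": [None, 5.0, 6.0],
})
for target, subset in distribute_datasets_by_target(df, target_columns=["target_1", "target_2"], verbose=True):
    print(target, subset.shape)  # target_1 (2, 3), then target_2 (2, 3)
```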
dragon_ml_toolbox-1.4.4.dist-info/RECORD
DELETED
@@ -1,19 +0,0 @@
-dragon_ml_toolbox-1.4.4.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.4.4.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
-ml_tools/MICE_imputation.py,sha256=o1wPu3sYt6AxmUgtyDcfqbwL-82Hyep0upvC4tIWKyw,10136
-ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=nBhDuaUCgEVG0Xi8MQ0X_xWQadr-7j_wUqwt-t-w5uE,20172
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=q8qOTbSAwdsgKhjojVm7wn_UApGOCsvFa0mvhLYUuyM,37239
-ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
-ml_tools/logger.py,sha256=ZTtUB9HTkNs5zHTdYRKNbKADjUkuObsF7s8U5pNnVRA,4716
-ml_tools/particle_swarm_optimization.py,sha256=zaDQTKz2fAixgAfZdkRttGpN2vgraT18ZlpllJlQesk,20034
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=r7uEmo38Imly56BP3-Jv6dFJvLsbGipeBlkiZx2fcNQ,10189
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-1.4.4.dist-info/METADATA,sha256=8hdr2v8UE5uxieOUSRgjTa5OP1C2BzgjawgxT8ynbQ0,2516
-dragon_ml_toolbox-1.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.4.4.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.4.4.dist-info/RECORD,,
Files without changes:
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/WHEEL
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/licenses/LICENSE
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/licenses/LICENSE-THIRD-PARTY.md
- {dragon_ml_toolbox-1.4.4.dist-info → dragon_ml_toolbox-1.4.6.dist-info}/top_level.txt