dragon-ml-toolbox 1.4.3__tar.gz → 1.4.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.4.3/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.5}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/MICE_imputation.py +24 -8
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/VIF_factor.py +9 -6
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/data_exploration.py +7 -37
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/ensemble_learning.py +8 -11
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/logger.py +2 -2
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/particle_swarm_optimization.py +8 -6
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/utilities.py +148 -24
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/pyproject.toml +1 -1
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/README.md +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/pytorch_models.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/trainer.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/vision_helpers.py +0 -0
- {dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/setup.cfg +0 -0
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/MICE_imputation.py

@@ -3,8 +3,9 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
 from plotnine import ggplot, labs, theme, element_blank  # type: ignore
+from typing import Optional


 __all__ = [
@@ -17,7 +18,7 @@ __all__ = [
 ]


-def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str]]=None, resulting_datasets: int=1, iterations: int=20, random_state: int=101):

     # Initialize kernel with number of imputed datasets to generate
     kernel = mf.ImputationKernel(
@@ -35,10 +36,20 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     if imputed_datasets is None or len(imputed_datasets) == 0:
         raise ValueError("No imputed datasets were generated. Check the MICE process.")

+    # threshold binary columns
+    if binary_columns is not None:
+        invalid_binary_columns = set(binary_columns) - set(df.columns)
+        if invalid_binary_columns:
+            print(f"⚠️ These 'binary columns' are not in the dataset: {invalid_binary_columns}")
+        valid_binary_columns = [col for col in binary_columns if col not in invalid_binary_columns]
+        for imputed_df in imputed_datasets:
+            for binary_column_name in valid_binary_columns:
+                imputed_df[binary_column_name] = threshold_binary_values(imputed_df[binary_column_name])  # type: ignore
+
     if resulting_datasets == 1:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE"]
     else:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(resulting_datasets)]

     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
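Usage sketch for the new binary_columns parameter of apply_mice: a minimal, illustrative toy example (the column names and data are placeholders, not from the package docs, and the tiny dataframe is not meant to be statistically meaningful).

import pandas as pd
from ml_tools.MICE_imputation import apply_mice

# "smoker" is a 0/1 flag that MICE may impute as a fraction between 0 and 1.
df = pd.DataFrame({"age": [34.0, None, 51.0, 46.0], "smoker": [1.0, None, 0.0, 1.0]})

# Passing binary_columns makes apply_mice snap those columns back to 0/1
# (threshold 0.5) in every imputed dataset after imputation finishes.
kernel, imputed_datasets, names = apply_mice(df, df_name="demo", binary_columns=["smoker"])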
@@ -106,7 +117,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     # Adjust plot display for the X axis
     _ticks = np.arange(iterations_cap)
     _labels = np.arange(1, iterations_cap + 1)
-    plt.xticks(ticks=_ticks, labels=_labels)
+    plt.xticks(ticks=_ticks, labels=_labels)  # type: ignore
     plt.grid(True)

     feature_save_name = sanitize_filename(feature_name)
@@ -202,7 +213,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     print(f"{local_dir_name} completed.")


-def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
+                      save_datasets_dir: str, save_metrics_dir: str,
+                      binary_columns: Optional[list[str]]=None,
+                      resulting_datasets: int=1,
+                      iterations: int=20,
+                      random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -211,7 +227,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas
     4. Save convergence metrics
     5. Save distribution metrics

-    Target columns must be skipped from the imputation.
+    Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)
@@ -229,7 +245,7 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datas

     df, df_targets = _skip_targets(df, target_columns)

-    kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
+    kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

     save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
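A minimal call sketch for the updated run_mice_pipeline signature; all paths and column names below are placeholders, not values from the package.

from ml_tools.MICE_imputation import run_mice_pipeline

# Imputes every CSV found in the input directory, skips the target columns,
# thresholds the listed binary columns, and saves datasets plus metrics.
run_mice_pipeline(
    df_path_or_dir="data/raw",
    target_columns=["target"],
    save_datasets_dir="data/imputed",
    save_metrics_dir="reports/mice",
    binary_columns=["smoker"],
    resulting_datasets=1,
    iterations=20,
    random_state=101,
)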
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/VIF_factor.py

@@ -26,6 +26,7 @@ def compute_vif(
     filename: Optional[str] = None,
     fontsize: int = 14,
     show_plot: bool = True,
+    verbose: bool = True
 ) -> pd.DataFrame:
     """
     Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
@@ -52,19 +53,20 @@ def compute_vif(
     if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
-        if missing_features:
+        if missing_features and verbose:
             print(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-
+                if verbose:
+                    print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)

     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
-        if missing_ignore:
+        if missing_ignore and verbose:
             print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

@@ -182,7 +184,7 @@ def compute_vif_multi(input_directory: str,
     max_features_to_plot: int = 20,
     fontsize: int = 14):
     """
-    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

     Args:
@@ -210,10 +212,11 @@ def compute_vif_multi(input_directory: str,
             fontsize=fontsize,
             save_dir=output_plot_directory,
             filename=df_name,
-            show_plot=False
+            show_plot=False,
+            verbose=False)

         if output_dataset_directory is not None:
-            new_filename =
+            new_filename = df_name + '_VIF'
             result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)

             if len(dropped_cols) > 0:
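A small sketch of the new verbose flag, assuming the DataFrame is the first positional argument of compute_vif (the dataframe and column names are illustrative).

import pandas as pd
from ml_tools.VIF_factor import compute_vif

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0], "x2": [2.1, 3.9, 6.2, 8.1], "label": list("abcd")})

# verbose=False silences the "not numeric / not in DataFrame" warnings,
# which is what compute_vif_multi now passes when batching over a directory.
vif_df = compute_vif(df, show_plot=False, verbose=False)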
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/data_exploration.py

@@ -22,8 +22,7 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
-    "distribute_datasets_by_target"
+    "clip_outliers_multi"
 ]
@@ -90,18 +89,18 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):

     Returns:
         tuple: A tuple containing:
-            - pd.DataFrame: Targets dataframe.
             - pd.DataFrame: Features dataframe.
+            - pd.DataFrame: Targets dataframe.

     Prints:
         - Shape of the original dataframe.
-        - Shape of the targets dataframe.
         - Shape of the features dataframe.
+        - Shape of the targets dataframe.
     """
     df_targets = df[targets]
     df_features = df.drop(columns=targets)
-    print(f"Original shape: {df.shape}\
-    return
+    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+    return df_features, df_targets


 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
@@ -153,7 +152,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho

         result_df = df.drop(columns=cols_to_drop)
         if show_nulls_after:
-            show_null_columns(df=result_df)
+            print(show_null_columns(df=result_df))

         return result_df
     else:
@@ -259,7 +258,7 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
-        print(f"Saved correlation heatmap
+        print(f"Saved correlation heatmap: '{plot_title}.svg'")

     plt.show()
     plt.close()
@@ -519,38 +518,9 @@ def clip_outliers_multi(
     return new_df


-def distribute_datasets_by_target(
-    df: pd.DataFrame,
-    target_columns: list[str]
-) -> Iterator[Tuple[str, pd.DataFrame]]:
-    """
-    Yields cleaned DataFrames for each target column, where rows with missing
-    target values are removed. The target column is placed at the end.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Preprocessed dataframe with all feature and target columns ready to train.
-    target_columns : List[str]
-        List of target column names to generate per-target DataFrames.
-
-    Yields
-    ------
-    Tuple[str, pd.DataFrame]
-        * First element is the target column name.
-        * Second element is the corresponding cleaned DataFrame.
-    """
-    feature_columns = [col for col in df.columns if col not in target_columns]
-
-    for target in target_columns:
-        subset = df[feature_columns + [target]].dropna(subset=[target])
-        yield target, subset
-
-
 def _is_notebook():
     return get_ipython() is not None


 def info():
     _script_info(__all__)
-
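The docstring fix above clarifies the return order of split_features_targets: features first, then targets. A tiny sketch with illustrative column names, matching the corrected return statement. Note that distribute_datasets_by_target is removed here and now lives in ml_tools.utilities (see the utilities diff below).

import pandas as pd
from ml_tools.data_exploration import split_features_targets

df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "y": [0, 1]})

# Features come back first, targets second.
df_features, df_targets = split_features_targets(df, targets=["y"])
# df_features -> columns ["x1", "x2"]; df_targets -> column ["y"]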
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/ensemble_learning.py

@@ -20,7 +20,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap

-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object

 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -157,9 +157,7 @@ class RegressionTreeModels:
         self.gamma = gamma

         # LightGBM specific
-
-        num_leaves = (2**max_depth) - 1
-        print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
         self.num_leaves = num_leaves
         self.min_data_in_leaf = min_data_in_leaf
@@ -202,7 +200,7 @@ class RegressionTreeModels:
             verbose=-1,
             reg_alpha=self.L1,
             reg_lambda=self.L2,
-            boosting_type='
+            boosting_type='gbdt',
             num_leaves=self.num_leaves,
             min_data_in_leaf=self.min_data_in_leaf
         )
@@ -321,9 +319,7 @@ class ClassificationTreeModels:
         self.gamma = gamma

         # LightGBM specific
-
-        num_leaves = (2**max_depth) - 1
-        print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
         self.num_leaves = num_leaves
         self.min_data_in_leaf = min_data_in_leaf
@@ -370,7 +366,7 @@ class ClassificationTreeModels:
             verbose=-1,
             reg_alpha=self.L1,
             reg_lambda=self.L2,
-            boosting_type='
+            boosting_type='gbdt' if self.use_model_balance else 'goss',
             num_leaves=self.num_leaves,
             min_data_in_leaf=self.min_data_in_leaf,
             class_weight='balanced' if self.use_model_balance else None
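Instead of overriding the user's num_leaves with (2**max_depth) - 1 and printing a warning, both model classes now cap the supplied value at 2**(max_depth - 1), keeping it well below LightGBM's theoretical maximum for that depth. The arithmetic, for illustration:

# Illustration of the new cap: a user-supplied num_leaves is clamped to 2**(max_depth - 1).
max_depth = 6
requested_num_leaves = 100
effective_num_leaves = min(requested_num_leaves, 2 ** (max_depth - 1))
print(effective_num_leaves)  # 32, since 2**(6 - 1) = 32 < 100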
@@ -489,8 +485,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
 def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
-
-
+    filename = f"{model_name}_{sanitized_target_name}"
+    to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+    serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)

 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
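The saved artifact is now a plain dict serialized with joblib, which ObjectiveFunction later reloads via deserialize_object. A loading sketch, where the file path is a hypothetical example of the "{model_name}_{target}.joblib" pattern produced by _save_model:

from ml_tools.utilities import deserialize_object

artifact = deserialize_object("models/LightGBM_target.joblib", verbose=False, raise_on_error=True)
model = artifact["model"]                  # the fitted estimator
feature_names = artifact["feature_names"]  # list[str] used at training time
target_name = artifact["target_name"]      # name of the predicted target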
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/logger.py

@@ -55,7 +55,7 @@ def custom_logger(
     """
     try:
         os.makedirs(save_directory, exist_ok=True)
-        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M")
+        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
         log_name = sanitize_filename(log_name)
         base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")
@@ -80,7 +80,7 @@ def custom_logger(
         else:
             raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")

-        print(f"Log saved to: {base_path}")
+        print(f"Log saved to: '{base_path}'")

     except Exception as e:
         print(f"Error in custom_logger: {e}")
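The log timestamp now carries seconds, so two logs written within the same minute no longer collide on the same filename. A quick illustration of the two format strings:

from datetime import datetime

# 1.4.3 format: "%Y%m%d_%H%M"   -> e.g. 20240131_0945
# 1.4.5 format: "%Y%m%d_%H%M%S" -> e.g. 20240131_094512
print(datetime(2024, 1, 31, 9, 45, 12).strftime(r"%Y%m%d_%H%M%S"))  # 20240131_094512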
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/particle_swarm_optimization.py

@@ -8,7 +8,7 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import polars as pl
 from functools import partial
-from .utilities import sanitize_filename, _script_info, threshold_binary_values
+from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object


 __all__ = [
@@ -38,7 +38,7 @@ class ObjectiveFunction():
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
-        self._artifact =
+        self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
         self.model = self._get_from_artifact('model')
         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
@@ -49,7 +49,7 @@ class ObjectiveFunction():
         if self.use_noise:
             features_array = self.add_noise(features_array)
         if self.is_hybrid:
-            features_array = threshold_binary_values(input_array=features_array,
+            features_array = threshold_binary_values(input_array=features_array, binary_values=self.binary_features) # type: ignore

         if features_array.ndim == 1:
             features_array = features_array.reshape(1, -1)
@@ -83,6 +83,8 @@ class ObjectiveFunction():
             raise ValueError("Loaded model is None")

     def _get_from_artifact(self, key: str):
+        if self._artifact is None:
+            raise TypeError("Load model error")
         val = self._artifact.get(key)
         if key == "feature_names":
             result = val if isinstance(val, list) and val else None
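For hybrid search spaces, ObjectiveFunction now passes binary_values so that only the trailing binary features of a candidate vector are snapped to 0/1. A minimal sketch of that behavior with made-up numbers, based on the threshold_binary_values signature introduced in utilities.py below:

import numpy as np
from ml_tools.utilities import threshold_binary_values

# A hybrid PSO candidate: two continuous features followed by two binary flags.
candidate = np.array([0.73, 12.4, 0.2, 0.9])

# With binary_values=2 only the last two entries are thresholded at 0.5;
# the continuous part is passed through unchanged -> [0.73, 12.4, 0.0, 1.0]
print(threshold_binary_values(input_array=candidate, binary_values=2))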
@@ -129,10 +131,10 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=1500,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=
-            workers: int=
+            post_hoc_analysis: Optional[int]=5,
+            workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
{dragon_ml_toolbox-1.4.3 → dragon_ml_toolbox-1.4.5}/ml_tools/utilities.py

@@ -1,10 +1,13 @@
 import math
 import numpy as np
 import pandas as pd
+import polars as pl
 import os
 from pathlib import Path
 import re
-from typing import Literal, Union, Sequence
+from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
+import joblib
+from joblib.externals.loky.process_executor import TerminatedWorkerError


 # Keep track of available tools
@@ -16,7 +19,10 @@ __all__ = [
     "save_dataframe",
     "normalize_mixed_list",
     "sanitize_filename",
-    "threshold_binary_values"
+    "threshold_binary_values",
+    "serialize_object",
+    "deserialize_object",
+    "distribute_datasets_by_target"
 ]
@@ -194,12 +200,9 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:

     Returns:
         List[float]: A list of normalized float values summing to 1.0.
-            Values significantly smaller than the median scale are scaled up
-            before normalization to correct likely input errors.

     Notes:
         - Zeros and None values remain zero.
-        - If all input values are zero or None, the function returns a list of zeros.
         - Input strings are automatically cast to floats if possible.

     Example:
@@ -268,35 +271,156 @@ def sanitize_filename(filename: str) -> str:


 def threshold_binary_values(
-    input_array: Union[Sequence[float], np.ndarray],
-
-) -> np.ndarray:
+    input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
+    binary_values: Optional[int] = None
+) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
     """
-    Thresholds binary features in a 1D
+    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.

+    Parameters:
+        input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
+        binary_values (Optional[int]) :
+            - If `None`, all values are treated as binary.
+            - If `int`, only this many last `binary_values` are thresholded.
+
+    Returns:
+        Same type as input, with binary elements binarized to 0 or 1 using a 0.5 threshold.
+    """
+    original_type = type(input_array)
+
+    if isinstance(input_array, pl.Series):
+        array = input_array.to_numpy()
+    elif isinstance(input_array, (pd.Series, np.ndarray)):
+        array = np.asarray(input_array)
+    elif isinstance(input_array, (list, tuple)):
+        array = np.array(input_array)
+    else:
+        raise TypeError("Unsupported input type")
+
+    array = array.flatten()
+    total = array.shape[0]
+
+    bin_count = total if binary_values is None else binary_values
+    if not (0 <= bin_count <= total):
+        raise ValueError("binary_values must be between 0 and the total number of elements")
+
+    if bin_count == 0:
+        result = array
+    else:
+        cont_part = array[:-bin_count] if bin_count < total else np.array([])
+        bin_part = (array[-bin_count:] > 0.5).astype(int)
+        result = np.concatenate([cont_part, bin_part])
+
+    if original_type is pd.Series:
+        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
+    elif original_type is pl.Series:
+        return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
+    elif original_type is list:
+        return result.tolist()
+    elif original_type is tuple:
+        return tuple(result)
+    else:
+        return result
+
+
+def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+    """
+    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.

     Parameters:
-
+        obj (Any) : The Python object to serialize.
+        save_dir (str) : Directory path where the serialized object will be saved.
+        filename (str) : Name for the output file, extension will be appended if needed.
+
+    Returns:
+        (str | None) : The full file path where the object was saved if successful; otherwise, None.
+    """
+    try:
+        os.makedirs(save_dir, exist_ok=True)
+        sanitized_name = sanitize_filename(filename)
+        if not sanitized_name.endswith('.joblib'):
+            sanitized_name = sanitized_name + ".joblib"
+        full_path = os.path.join(save_dir, sanitized_name)
+        joblib.dump(obj, full_path)
+    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
+        message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+        return None
+    else:
+        if verbose:
+            print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+        return full_path


-
+def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+    """
+    Loads a serialized object from a .joblib file.
+
+    Parameters:
+        filepath (str): Full path to the serialized .joblib file.

     Returns:
-
+        (Any | None): The deserialized Python object, or None if loading fails.
     """
-
-
-
-
-
-
-
+    if not os.path.exists(filepath):
+        print(f"❌ File does not exist: {filepath}")
+        return None
+    try:
+        obj = joblib.load(filepath)
+    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
+        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        if raise_on_error:
+            raise Exception(message)
+        else:
+            print(message)
+        return None
+    else:
+        if verbose:
+            print(f"✅ Loaded object of type '{type(obj)}'")
+        return obj

-
-
+
+def distribute_datasets_by_target(
+    df_or_path: Union[pd.DataFrame, str],
+    target_columns: list[str],
+    verbose: bool = False
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df_or_path : [pd.DataFrame | str]
+        Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+    verbose: bool
+        Whether to print info for each yielded dataset.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * First element is the target column name.
+        * Second element is the corresponding cleaned DataFrame.
+    """
+    # Validate path
+    if isinstance(df_or_path, str):
+        df, _ = load_dataframe(df_or_path)
+    else:
+        df = df_or_path

-
+    valid_targets = [col for col in df.columns if col in target_columns]
+    feature_columns = [col for col in df.columns if col not in valid_targets]
+
+    for target in valid_targets:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        if verbose:
+            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
+        yield target, subset


 def _script_info(all_data: list[str]):
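A short end-to-end sketch of the new utilities added above; the dataframe, directory, and filenames are placeholders for illustration only.

import pandas as pd
from ml_tools.utilities import serialize_object, deserialize_object, distribute_datasets_by_target

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y1": [0, None, 1], "y2": [5.0, 6.0, None]})

# One cleaned dataframe per target: rows with a missing target are dropped
# and the target column is moved to the end.
for target, subset in distribute_datasets_by_target(df, target_columns=["y1", "y2"], verbose=True):
    # Persist each subset as a .joblib file and load it back.
    path = serialize_object(subset, save_dir="artifacts", filename=f"data_{target}")
    if path is not None:
        restored = deserialize_object(path, verbose=False)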
All remaining files are renamed from the dragon_ml_toolbox-1.4.3 prefix to dragon_ml_toolbox-1.4.5 without content changes (the +0 -0 entries in the file list above).