dragon-ml-toolbox 1.4.2__tar.gz → 1.4.3__tar.gz
This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/LICENSE-THIRD-PARTY.md +6 -1
- {dragon_ml_toolbox-1.4.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.3}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/MICE_imputation.py +22 -14
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/data_exploration.py +41 -8
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/ensemble_learning.py +381 -115
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/particle_swarm_optimization.py +2 -2
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/utilities.py +9 -6
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/pyproject.toml +1 -1
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/README.md +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/pytorch_models.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/trainer.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/vision_helpers.py +0 -0
- {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/setup.cfg +0 -0
{dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/LICENSE-THIRD-PARTY.md

@@ -8,7 +8,12 @@ This project depends on the following third-party packages. Each is governed by
 - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
 - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
 - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
+- [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
+- [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
+- [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
+- [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
 - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
+- [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
 - [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 - [imblearn](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/main/LICENSE)
 - [Pillow](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
@@ -19,5 +24,5 @@ This project depends on the following third-party packages. Each is governed by
 - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
 - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
 - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
-- [
+- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
 - [pyswarm](https://pythonhosted.org/pyswarm/#license)
{dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/MICE_imputation.py

@@ -3,7 +3,7 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
 from plotnine import ggplot, labs, theme, element_blank # type: ignore


@@ -49,15 +49,11 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     return kernel, imputed_datasets, imputed_dataset_names


-def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
-    # Check path
-    os.makedirs(save_dir, exist_ok=True)
-
+def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-
-
-
-
+        merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
+        save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
+

 #Get names of features that had missing values before imputation
 def get_na_column_names(df: pd.DataFrame):
@@ -119,7 +115,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     plt.savefig(save_path, bbox_inches='tight', format="svg")
     plt.close()

-    print(f"
+    print(f"{dataset_file_dir} completed.")


 # Imputed distributions
@@ -131,7 +127,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     '''
     # Check path
     os.makedirs(root_dir, exist_ok=True)
-
+    local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
+    local_save_dir = os.path.join(root_dir, local_dir_name)
     if not os.path.isdir(local_save_dir):
         os.makedirs(local_save_dir)

@@ -202,10 +199,10 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         fig = kernel.plot_imputed_distributions(variables=[feature])
         _process_figure(fig, feature)

-    print("
+    print(f"{local_dir_name} completed.")


-def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -213,6 +210,8 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     3. Save imputed dataset(s)
     4. Save convergence metrics
     5. Save distribution metrics
+
+    Target columns must be skipped from the imputation.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)
@@ -228,9 +227,11 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)

+        df, df_targets = _skip_targets(df, target_columns)
+
         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

-        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
+        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

         imputed_column_names = get_na_column_names(df=df)

@@ -239,5 +240,12 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
         get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)


+def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
+    valid_targets = [col for col in target_cols if col in df.columns]
+    df_targets = df[valid_targets]
+    df_feats = df.drop(columns=valid_targets)
+    return df_feats, df_targets
+
+
 def info():
     _script_info(__all__)
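Taken together, these hunks change the MICE pipeline so that target columns are split off before imputation (via the new `_skip_targets` helper) and merged back onto the imputed features when each dataset is saved. A minimal usage sketch against the 1.4.3 signature; the paths and column names are hypothetical:

```python
from ml_tools.MICE_imputation import run_mice_pipeline

# Hypothetical paths and target names, for illustration only.
run_mice_pipeline(
    df_path_or_dir="data/raw",                 # single CSV or a directory of CSVs
    target_columns=["target_a", "target_b"],   # excluded from imputation, re-attached on save
    save_datasets_dir="data/imputed",
    save_metrics_dir="reports/mice_metrics",
    resulting_datasets=1,
    iterations=20,
    random_state=101,
)
```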
{dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/data_exploration.py

@@ -5,10 +5,8 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple
+from typing import Union, Literal, Dict, Tuple, Iterator
 import os
-import sys
-import textwrap
 from ml_tools.utilities import sanitize_filename, _script_info


@@ -24,7 +22,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "distribute_datasets_by_target"
 ]


@@ -113,7 +112,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         round_digits (int): Number of decimal places for the percentage.
-
+
     Returns:
         pd.DataFrame: A DataFrame summarizing missing values in each column.
     """
@@ -133,13 +132,14 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary


-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.

     Parameters:
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
+        show_nulls_after (bool): Prints `show_null_columns` after dropping columns.

     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.
@@ -150,10 +150,15 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) ->
     if len(cols_to_drop) > 0:
         print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
         print(list(cols_to_drop))
+
+        result_df = df.drop(columns=cols_to_drop)
+        if show_nulls_after:
+            show_null_columns(df=result_df).head(20)
+
+        return result_df
     else:
         print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
-    return df.drop(columns=cols_to_drop)
+        return df


 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
@@ -514,6 +519,34 @@ def clip_outliers_multi(
     return new_df


+def distribute_datasets_by_target(
+    df: pd.DataFrame,
+    target_columns: list[str]
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Preprocessed dataframe with all feature and target columns ready to train.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * First element is the target column name.
+        * Second element is the corresponding cleaned DataFrame.
+    """
+    feature_columns = [col for col in df.columns if col not in target_columns]
+
+    for target in target_columns:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        yield target, subset
+
+
 def _is_notebook():
     return get_ipython() is not None

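The new `distribute_datasets_by_target` generator is the user-facing side of this change: one cleaned frame per target, with NaN-target rows dropped. A short sketch with a toy frame (the values are illustrative only):

```python
import pandas as pd
from ml_tools.data_exploration import distribute_datasets_by_target

df = pd.DataFrame({
    "feat_1": [1.0, 2.0, 3.0, 4.0],
    "target_a": [10.0, None, 30.0, 40.0],
    "target_b": [0.0, 1.0, None, 1.0],
})

for target_name, target_df in distribute_datasets_by_target(df, target_columns=["target_a", "target_b"]):
    # Each yielded frame keeps all features, drops rows where its own
    # target is missing, and places that target as the last column.
    print(target_name, target_df.shape)  # ("target_a", (3, 2)) then ("target_b", (3, 2))
```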
{dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/ensemble_learning.py

@@ -6,7 +6,7 @@ from matplotlib.colors import Colormap
 from matplotlib import rcdefaults

 import os
-from typing import Literal, Union, Optional
+from typing import Literal, Union, Optional, Iterator, Tuple
 import joblib

 from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -29,7 +29,9 @@ warnings.filterwarnings('ignore', category=UserWarning)


 __all__ = [
-    "
+    "dataset_yielder",
+    "RegressionTreeModels",
+    "ClassificationTreeModels",
     "dataset_pipeline",
     "evaluate_model_classification",
     "plot_roc_curve",
@@ -39,114 +41,364 @@ __all__ = [
     "run_ensemble_pipeline"
 ]

+## Type aliases
+HandleImbalanceStrategy = Literal[
+    "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
+]
+
+TaskType = Literal[
+    "classification", "regression"
+]

 ###### 1. Dataset Loader ######
-
-
-
-
-
-
+def dataset_yielder(
+    df: pd.DataFrame,
+    target_cols: list[str]
+) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
+    """
+    Yields one tuple at a time:
+        (features_dataframe, target_series, feature_names, target_name)
+
+    Skips any target columns not found in the DataFrame.
+    """
+    # Determine which target columns actually exist in the DataFrame
+    valid_targets = [col for col in target_cols if col in df.columns]
+
+    # Features = all columns excluding valid target columns
+    df_features = df.drop(columns=valid_targets)
     feature_names = df_features.columns.to_list()
-
-    for target_col in
+
+    for target_col in valid_targets:
         df_target = df[target_col]
         yield (df_features, df_target, feature_names, target_col)

+
 ###### 2. Initialize Models ######
-
-
-
-
-    Valid tasks: "classification" or "regression".
+class RegressionTreeModels:
+    """
+    A factory class for creating and configuring multiple gradient boosting regression models
+    with unified hyperparameters. This includes XGBoost, LightGBM, and HistGradientBoostingRegressor.

-
-
-
-
-
-
-
-
-
-    For classification only: Set `is_balanced=False` for imbalanced datasets.
+    Use the `__call__`, `()` method.
+
+    Parameters
+    ----------
+    random_state : int
+        Seed used by the random number generator.
+
+    learning_rate : float [0.001 - 0.300]
+        Boosting learning rate (shrinkage).

-
-
+    L1_regularization : float [0.0 - 10.0]
+        L1 regularization term (alpha). Might drive to sparsity.
+
+    L2_regularization : float [0.0 - 10.0]
+        L2 regularization term (lambda).
+
+    n_estimators : int [100 - 3000]
+        Number of boosting iterations for XGBoost and LightGBM.
+
+    max_depth : int [3 - 15]
+        Maximum depth of individual trees. Controls model complexity; high values may overfit.
+
+    subsample : float [0.5 - 1.0]
+        Fraction of rows per tree; used to prevent overfitting.
+
+    colsample_bytree : float [0.3 - 1.0]
+        Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+    min_samples_leaf : int [10 - 100]
+        Minimum samples per leaf; higher = less overfitting (used in HistGB).
+
+    max_iter : int [100 - 2000]
+        Maximum number of iterations (used in HistGB).
+
+    min_child_weight : float [0.1 - 10.0]
+        Minimum sum of instance weight (hessian) needed in a child; larger values make the algorithm more conservative (used in XGBoost).
+
+    gamma : float [0.0 - 5.0]
+        Minimum loss reduction required to make a further partition on a leaf node; higher = more regularization (used in XGBoost).
+
+    num_leaves : int [20 - 200]
+        Maximum number of leaves in one tree; should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+    min_data_in_leaf : int [10 - 100]
+        Minimum number of data points in a leaf; increasing may prevent overfitting (used in LightGBM).
+    """
+    def __init__(self,
+                 random_state: int = 101,
+                 learning_rate: float = 0.005,
+                 L1_regularization: float = 1.0,
+                 L2_regularization: float = 1.0,
+                 n_estimators: int = 1000,
+                 max_depth: int = 8,
+                 subsample: float = 0.8,
+                 colsample_bytree: float = 0.8,
+                 min_samples_leaf: int = 50,
+                 max_iter: int = 1000,
+                 min_child_weight: float = 3.0,
+                 gamma: float = 1.0,
+                 num_leaves: int = 31,
+                 min_data_in_leaf: int = 40):
+        # General config
+        self.random_state = random_state
+        self.lr = learning_rate
+        self.L1 = L1_regularization
+        self.L2 = L2_regularization
+
+        # Shared tree structure
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        # XGBoost specific
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+
+        # LightGBM specific
+        if num_leaves >= (2**max_depth):
+            num_leaves = (2**max_depth) - 1
+            print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        self.num_leaves = num_leaves
+        self.min_data_in_leaf = min_data_in_leaf
+
+        # HistGB specific
+        self.max_iter = max_iter
+        self.min_samples_leaf = min_samples_leaf
+
+    def __call__(self) -> dict[str, object]:
+        """
+        Returns a dictionary with new instances of:
+        - "XGBoost": XGBRegressor
+        - "LightGBM": LGBMRegressor
+        - "HistGB": HistGradientBoostingRegressor
+        """
+        # XGBoost Regressor
+        xgb_model = xgb.XGBRegressor(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.lr,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            eval_metric='rmse',
+            min_child_weight=self.min_child_weight,
+            gamma=self.gamma,
+            tree_method='hist',
+            grow_policy='lossguide'
+        )
+
+        # LightGBM Regressor
+        lgb_model = lgb.LGBMRegressor(
+            n_estimators=self.n_estimators,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            verbose=-1,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            boosting_type='dart',
+            num_leaves=self.num_leaves,
+            min_data_in_leaf=self.min_data_in_leaf
+        )
+
+        # HistGradientBoosting Regressor
+        hist_model = HistGradientBoostingRegressor(
+            max_iter=self.max_iter,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            min_samples_leaf=self.min_samples_leaf,
+            random_state=self.random_state,
+            l2_regularization=self.L2,
+            scoring='neg_mean_squared_error',
+            early_stopping=True,
+            validation_fraction=0.1
+        )
+
+        return {
+            "XGBoost": xgb_model,
+            "LightGBM": lgb_model,
+            "HistGB": hist_model
+        }

-
-
-        raise ValueError(f"Invalid task: {task}. Must be 'classification' or 'regression'.")
-
-    models = {}
-
-    # Common parameters
-    xgb_params = {
-        'n_estimators': 200,
-        'max_depth': 5,
-        'learning_rate': learning_rate,
-        'subsample': 0.8,
-        'colsample_bytree': 0.8,
-        'random_state': random_state,
-        'reg_alpha': L1_regularization,
-        'reg_lambda': L2_regularization,
-    }
-
-    lgbm_params = {
-        'n_estimators': 200,
-        'learning_rate': learning_rate,
-        'max_depth': 5,
-        'subsample': 0.8,
-        'colsample_bytree': 0.8,
-        'random_state': random_state,
-        'verbose': -1,
-        'reg_alpha': L1_regularization,
-        'reg_lambda': L2_regularization,
-    }
-
-    hist_params = {
-        'max_iter': 200,
-        'learning_rate': learning_rate,
-        'max_depth': 5,
-        'min_samples_leaf': 30,
-        'random_state': random_state,
-        'l2_regularization': L2_regularization,
-    }
-
-    # XGB Model
-    if task == "classification":
-        xgb_params.update({
-            'scale_pos_weight': 1 if is_balanced else 8,
-            'eval_metric': 'aucpr'
-        })
-        models["XGBoost"] = xgb.XGBClassifier(**xgb_params)
-    else:
-        xgb_params.update({'eval_metric': 'rmse'})
-        models["XGBoost"] = xgb.XGBRegressor(**xgb_params)
+    def __str__(self):
+        return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

-    # LGBM Model
-    if task == "classification":
-        lgbm_params.update({
-            'class_weight': None if is_balanced else 'balanced',
-            'boosting_type': 'goss' if is_balanced else 'dart',
-        })
-        models["LightGBM"] = lgb.LGBMClassifier(**lgbm_params)
-    else:
-        lgbm_params['boosting_type'] = 'dart'
-        models["LightGBM"] = lgb.LGBMRegressor(**lgbm_params)

-
-
-
-
-
-
-
-
-
-
+class ClassificationTreeModels:
+    """
+    A factory class for creating and configuring multiple gradient boosting classification models
+    with unified hyperparameters. This includes: XGBoost, LightGBM, and HistGradientBoostingClassifier.
+
+    Use the `__call__`, `()` method.
+
+    Parameters
+    ----------
+    random_state : int
+        Seed used by the random number generator to ensure reproducibility.
+
+    learning_rate : float [0.001 - 0.300]
+        Boosting learning rate (shrinkage factor).
+
+    L1_regularization : float [0.0 - 10.0]
+        L1 regularization term (alpha), might drive to sparsity.
+
+    L2_regularization : float [0.0 - 10.0]
+        L2 regularization term (lambda).
+
+    n_estimators : int [100 - 3000]
+        Number of boosting rounds for XGBoost and LightGBM.
+
+    max_depth : int [3 - 15]
+        Maximum depth of individual trees in the ensemble. Controls model complexity; high values may overfit.
+
+    subsample : float [0.5 - 1.0]
+        Fraction of samples to use when fitting base learners; used to prevent overfitting.
+
+    colsample_bytree : float [0.3 - 1.0]
+        Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+    min_samples_leaf : int [10 - 100]
+        Minimum number of samples required to be at a leaf node; higher = less overfitting (used in HistGB).
+
+    max_iter : int [100 - 2000]
+        Maximum number of boosting iteration (used in HistGB).
+
+    min_child_weight : float [0.1 - 10.0]
+        Minimum sum of instance weight (Hessian) in a child node; larger values make the algorithm more conservative (used in XGBoost).
+
+    gamma : float [0.0 - 5.0]
+        Minimum loss reduction required to make a further partition; higher = more regularization (used in XGBoost).
+
+    num_leaves : int [20 - 200]
+        Maximum number of leaves in one tree. Should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+    min_data_in_leaf : int [10 -100]
+        Minimum number of samples required in a leaf; increasing may prevent overfitting (used in LightGBM).
+
+    Attributes
+    ----------
+    use_model_balance : bool
+        Indicates whether to apply class balancing strategies internally. Can be overridden at runtime via the `__call__` method.
+    """
+    def __init__(self,
+                 random_state: int = 101,
+                 learning_rate: float = 0.005,
+                 L1_regularization: float = 1.0,
+                 L2_regularization: float = 1.0,
+                 n_estimators: int = 1000,
+                 max_depth: int = 8,
+                 subsample: float = 0.8,
+                 colsample_bytree: float = 0.8,
+                 min_samples_leaf: int = 50,
+                 max_iter: int = 1000,
+                 min_child_weight: float = 3.0,
+                 gamma: float = 1.0,
+                 num_leaves: int = 31,
+                 min_data_in_leaf: int = 40):
+        # General config
+        self.random_state = random_state
+        self.lr = learning_rate
+        self.L1 = L1_regularization
+        self.L2 = L2_regularization
+
+        # To be set by the pipeline
+        self.use_model_balance: bool = True
+
+        # Shared tree structure
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        # XGBoost specific
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+
+        # LightGBM specific
+        if num_leaves >= (2**max_depth):
+            num_leaves = (2**max_depth) - 1
+            print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+        self.num_leaves = num_leaves
+        self.min_data_in_leaf = min_data_in_leaf
+
+        # HistGB specific
+        self.max_iter = max_iter
+        self.min_samples_leaf = min_samples_leaf
+
+    def __call__(self, use_model_balance: Optional[bool]=None) -> dict[str, object]:
+        """
+        Returns a dictionary with new instances of:
+        - "XGBoost": XGBClassifier
+        - "LightGBM": LGBMClassifier
+        - "HistGB": HistGradientBoostingClassifier
+        """
+        if use_model_balance is not None:
+            self.use_model_balance = use_model_balance
+
+        # XGBoost Classifier
+        xgb_model = xgb.XGBClassifier(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.lr,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            eval_metric='aucpr',
+            min_child_weight=self.min_child_weight,
+            gamma=self.gamma,
+            tree_method='hist',
+            grow_policy='lossguide',
+            scale_pos_weight=8.0 if self.use_model_balance else 1.0
+        )
+
+        # LightGBM Classifier
+        lgb_model = lgb.LGBMClassifier(
+            n_estimators=self.n_estimators,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            verbose=-1,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            boosting_type='dart' if self.use_model_balance else 'goss',
+            num_leaves=self.num_leaves,
+            min_data_in_leaf=self.min_data_in_leaf,
+            class_weight='balanced' if self.use_model_balance else None
+        )
+
+        # HistGradientBoosting Classifier
+        hist_model = HistGradientBoostingClassifier(
+            max_iter=self.max_iter,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            min_samples_leaf=self.min_samples_leaf,
+            random_state=self.random_state,
+            l2_regularization=self.L2,
+            early_stopping=True,
+            validation_fraction=0.1,
+            class_weight='balanced' if self.use_model_balance else None,
+            scoring='balanced_accuracy' if self.use_model_balance else 'loss'
+        )
+
+        return {
+            "XGBoost": xgb_model,
+            "LightGBM": lgb_model,
+            "HistGB": hist_model
+        }
+
+    def __str__(self):
+        return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

-    return models

 ###### 3. Process Dataset ######
 # function to split data into train and test
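Both factory classes follow the same pattern: configure hyperparameters once in `__init__`, then call the instance to get fresh model objects. A quick sketch of the regression factory, including the `num_leaves` clamp shown above (the parameter values are arbitrary):

```python
from ml_tools.ensemble_learning import RegressionTreeModels

factory = RegressionTreeModels(n_estimators=500, max_depth=6, num_leaves=100)
# num_leaves (100) >= 2**max_depth (64), so the constructor clamps it
# to 63 and prints the warning from the diff above.

models = factory()  # {"XGBoost": ..., "LightGBM": ..., "HistGB": ...}
for name, model in models.items():
    print(name, type(model).__name__)
```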
@@ -157,7 +409,7 @@ def _split_data(features, target, test_size, random_state, task):

 # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
 def _resample(X_train: np.ndarray, y_train: pd.Series,
-              strategy:
+              strategy: HandleImbalanceStrategy, random_state):
     '''
     Oversample minority class or undersample majority class.

@@ -165,9 +417,9 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
     '''
     if strategy == 'SMOTE':
         resample_algorithm = SMOTE(random_state=random_state, k_neighbors=3)
-    elif strategy == '
+    elif strategy == 'RAND_OVERSAMPLE':
         resample_algorithm = RandomOverSampler(random_state=random_state)
-    elif strategy == '
+    elif strategy == 'RAND_UNDERSAMPLE':
         resample_algorithm = RandomUnderSampler(random_state=random_state)
     elif strategy == 'ADASYN':
         resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
@@ -178,8 +430,8 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
     return X_res, y_res

 # DATASET PIPELINE
-def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task:
-                     resample_strategy:
+def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: TaskType,
+                     resample_strategy: HandleImbalanceStrategy,
                      test_size: float=0.2, debug: bool=False, random_state: int=101):
     '''
     1. Make Train/Test splits
@@ -204,7 +456,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite


     # Resample
-    if resample_strategy is None or task == "regression":
+    if resample_strategy is None or resample_strategy == "by_model" or task == "regression":
         X_train_oversampled, y_train_oversampled = X_train, y_train
     else:
         X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
@@ -431,7 +683,7 @@ def evaluate_model_regression(model, model_name: str,
     sanitized_target_name = sanitize_filename(target_name)
     report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
     with open(report_path, "w") as f:
-        f.write(f"{model_name} - {target_name}
+        f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
         f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
         f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
@@ -596,7 +848,7 @@ def get_shap_values(


 # TRAIN TEST PIPELINE
-def train_test_pipeline(model, model_name: str, dataset_id: str, task:
+def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
                         train_features: np.ndarray, train_target: np.ndarray,
                         test_features: np.ndarray, test_target: np.ndarray,
                         feature_names: list[str], target_name: str,
@@ -609,7 +861,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

     Returns: Tuple(Trained model, Test-set Predictions)
     '''
-    print(f"\
+    print(f"\tTraining model: {model_name} for Target: {target_name}...")
     trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
     if debug:
         print(f"Trained model object: {type(trained_model)}")
@@ -637,26 +889,40 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

     get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
                     features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
-    print("\t...done.")
+    # print("\t...done.")
     return trained_model, y_pred

 ###### 5. Execution ######
-def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str],
-
-                          test_size: float=0.2, debug:bool=False
+def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
+                          test_size: float=0.2, debug:bool=False):
+    #Check models
+    if isinstance(model_object, RegressionTreeModels):
+        task = "regression"
+    elif isinstance(model_object, ClassificationTreeModels):
+        task = "classification"
+        if handle_classification_imbalance is None:
+            print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
+        elif handle_classification_imbalance == "by_model":
+            model_object.use_model_balance = True
+        else:
+            model_object.use_model_balance = False
+    else:
+        raise TypeError(f"Unrecognized model {type(model_object)}")
+
     #Check paths
     _check_paths(datasets_dir, save_dir)
+
     #Yield imputed dataset
     for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
         #Yield features dataframe and target dataframe
-        for df_features, df_target, feature_names, target_name in
+        for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
             #Dataset pipeline
             X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
-                                                                 resample_strategy=
-                                                                 test_size=test_size, debug=debug, random_state=random_state)
+                                                                 resample_strategy=handle_classification_imbalance,
+                                                                 test_size=test_size, debug=debug, random_state=model_object.random_state)
             #Get models
-            models_dict =
-                          L1_regularization=L1_regularization, L2_regularization=L2_regularization, learning_rate=learning_rate)
+            models_dict = model_object()
             #Train models
             for model_name, model in models_dict.items():
                 train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
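The reworked entry point infers the task from the type of `model_object` and routes class balancing either to a resampler or to the models themselves. A hedged sketch of a classification run; the directory names and targets are hypothetical:

```python
from ml_tools.ensemble_learning import ClassificationTreeModels, run_ensemble_pipeline

models = ClassificationTreeModels(learning_rate=0.01, max_depth=6)

run_ensemble_pipeline(
    datasets_dir="data/imputed",            # e.g. the CSVs produced by the MICE pipeline
    save_dir="reports/ensemble",
    target_columns=["target_a", "target_b"],
    model_object=models,                    # task="classification" is inferred from this type
    handle_classification_imbalance="by_model",  # sets models.use_model_balance = True
    test_size=0.2,
)
```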
{dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/particle_swarm_optimization.py

@@ -129,7 +129,7 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=1000,
             inequality_constrain_function=None,
             post_hoc_analysis: Optional[int]=3,
             workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
@@ -261,7 +261,7 @@ def info():
     _script_info(__all__)


-### SOURCE CODE FOR PSO ###
+### SOURCE CODE FOR PSO FROM PYSWARM ###
 def _obj_wrapper(func, args, kwargs, x):
     return func(x, *args, **kwargs)

{dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.3}/ml_tools/utilities.py

@@ -95,7 +95,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
 def merge_dataframes(
     *dfs: pd.DataFrame,
     reset_index: bool = False,
-    direction: Literal["horizontal", "vertical"] = "horizontal"
+    direction: Literal["horizontal", "vertical"] = "horizontal",
+    verbose: bool=True
 ) -> pd.DataFrame:
     """
     Merges multiple DataFrames either horizontally or vertically.
@@ -119,8 +120,9 @@ def merge_dataframes(
     if len(dfs) < 2:
         raise ValueError("At least 2 DataFrames must be provided.")

-
-
+    if verbose:
+        for i, df in enumerate(dfs, start=1):
+            print(f"DataFrame {i} shape: {df.shape}")


     if direction == "horizontal":
@@ -142,8 +144,9 @@ def merge_dataframes(

     if reset_index:
         merged_df = merged_df.reset_index(drop=True)
-
-
+
+    if verbose:
+        print(f"Merged DataFrame shape: {merged_df.shape}")

     return merged_df

@@ -171,7 +174,7 @@ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
     output_path = os.path.join(save_dir, filename)

     df.to_csv(output_path, index=False, encoding='utf-8')
-    print(f"✅ Saved
+    print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")


 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
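The new `verbose` flag exists so internal callers (such as `save_imputed_datasets` above) can merge quietly, while interactive use keeps the shape printouts. A minimal sketch:

```python
import pandas as pd
from ml_tools.utilities import merge_dataframes

df_feats = pd.DataFrame({"a": [1, 2]})
df_targets = pd.DataFrame({"y": [0, 1]})

# verbose=True (the default) prints each input shape and the merged shape.
merged = merge_dataframes(df_feats, df_targets, direction="horizontal", verbose=True)
```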