dragon-ml-toolbox 1.4.2__py3-none-any.whl → 1.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-1.4.4.dist-info/RECORD +19 -0
- {dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/licenses/LICENSE-THIRD-PARTY.md +6 -1
- ml_tools/MICE_imputation.py +24 -16
- ml_tools/VIF_factor.py +9 -6
- ml_tools/data_exploration.py +48 -9
- ml_tools/ensemble_learning.py +377 -115
- ml_tools/logger.py +2 -2
- ml_tools/particle_swarm_optimization.py +4 -4
- ml_tools/utilities.py +9 -6
- dragon_ml_toolbox-1.4.2.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/top_level.txt +0 -0
dragon_ml_toolbox-1.4.4.dist-info/RECORD
ADDED

@@ -0,0 +1,19 @@
+dragon_ml_toolbox-1.4.4.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.4.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
+ml_tools/MICE_imputation.py,sha256=o1wPu3sYt6AxmUgtyDcfqbwL-82Hyep0upvC4tIWKyw,10136
+ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
+ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/data_exploration.py,sha256=nBhDuaUCgEVG0Xi8MQ0X_xWQadr-7j_wUqwt-t-w5uE,20172
+ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ml_tools/ensemble_learning.py,sha256=q8qOTbSAwdsgKhjojVm7wn_UApGOCsvFa0mvhLYUuyM,37239
+ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ml_tools/logger.py,sha256=ZTtUB9HTkNs5zHTdYRKNbKADjUkuObsF7s8U5pNnVRA,4716
+ml_tools/particle_swarm_optimization.py,sha256=zaDQTKz2fAixgAfZdkRttGpN2vgraT18ZlpllJlQesk,20034
+ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ml_tools/utilities.py,sha256=r7uEmo38Imly56BP3-Jv6dFJvLsbGipeBlkiZx2fcNQ,10189
+ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+dragon_ml_toolbox-1.4.4.dist-info/METADATA,sha256=8hdr2v8UE5uxieOUSRgjTa5OP1C2BzgjawgxT8ynbQ0,2516
+dragon_ml_toolbox-1.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.4.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.4.dist-info/RECORD,,
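For readers unfamiliar with wheel RECORD files: each entry is `path,sha256=<hash>,size`, where the hash is the urlsafe base64 encoding of the file's SHA-256 digest with the trailing padding stripped. A minimal standard-library sketch of how an entry above could be re-checked (it assumes the wheel has been extracted into the current directory; the path is just the one from the listing):

```python
import base64
import hashlib
from pathlib import Path

def record_hash(path: Path) -> str:
    """Wheel-RECORD style hash: urlsafe base64 of the SHA-256 digest, '=' padding removed."""
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Compare against the RECORD entry shown above for ml_tools/logger.py
print(record_hash(Path("ml_tools/logger.py")) == "sha256=ZTtUB9HTkNs5zHTdYRKNbKADjUkuObsF7s8U5pNnVRA")
```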
{dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/licenses/LICENSE-THIRD-PARTY.md
CHANGED

@@ -8,7 +8,12 @@ This project depends on the following third-party packages. Each is governed by
 - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
 - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
 - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
+- [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
+- [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
+- [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
+- [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
 - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
+- [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
 - [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 - [imblearn](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/main/LICENSE)
 - [Pillow](https://github.com/python-pillow/Pillow/blob/main/LICENSE)

@@ -19,5 +24,5 @@ This project depends on the following third-party packages. Each is governed by
 - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
 - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
 - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
-- [
+- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
 - [pyswarm](https://pythonhosted.org/pyswarm/#license)
ml_tools/MICE_imputation.py
CHANGED
@@ -3,7 +3,7 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
 from plotnine import ggplot, labs, theme, element_blank  # type: ignore

@@ -36,9 +36,9 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
         raise ValueError("No imputed datasets were generated. Check the MICE process.")

     if resulting_datasets == 1:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE"]
     else:
-        imputed_dataset_names = [f"{df_name}
+        imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(resulting_datasets)]

     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):

@@ -49,15 +49,11 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     return kernel, imputed_datasets, imputed_dataset_names


-def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
-    # Check path
-    os.makedirs(save_dir, exist_ok=True)
-
+def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-
-
-
-
+        merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
+        save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
+

 #Get names of features that had missing values before imputation
 def get_na_column_names(df: pd.DataFrame):

@@ -119,7 +115,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     plt.savefig(save_path, bbox_inches='tight', format="svg")
     plt.close()

-    print(f"
+    print(f"{dataset_file_dir} completed.")


 # Imputed distributions

@@ -131,7 +127,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     '''
     # Check path
     os.makedirs(root_dir, exist_ok=True)
-
+    local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
+    local_save_dir = os.path.join(root_dir, local_dir_name)
     if not os.path.isdir(local_save_dir):
         os.makedirs(local_save_dir)

@@ -202,10 +199,10 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         fig = kernel.plot_imputed_distributions(variables=[feature])
         _process_figure(fig, feature)

-    print("
+    print(f"{local_dir_name} completed.")


-def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe

@@ -213,6 +210,8 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     3. Save imputed dataset(s)
     4. Save convergence metrics
     5. Save distribution metrics
+
+    Target columns must be skipped from the imputation.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)

@@ -228,9 +227,11 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)

+        df, df_targets = _skip_targets(df, target_columns)
+
         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

-        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
+        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

         imputed_column_names = get_na_column_names(df=df)

@@ -239,5 +240,12 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
         get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)


+def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
+    valid_targets = [col for col in target_cols if col in df.columns]
+    df_targets = df[valid_targets]
+    df_feats = df.drop(columns=valid_targets)
+    return df_feats, df_targets
+
+
 def info():
     _script_info(__all__)
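The practical effect of these changes is that target columns are set aside before imputation and merged back into each imputed dataset on save. A minimal usage sketch, assuming the module layout shown above; the CSV directory, output paths and target names below are hypothetical, not taken from the package:

```python
from ml_tools.MICE_imputation import run_mice_pipeline

# Hypothetical paths and target names; features are imputed with MICE while the
# untouched target columns are re-attached before each dataset is saved.
run_mice_pipeline(
    df_path_or_dir="data/raw_csvs",
    target_columns=["target_a", "target_b"],
    save_datasets_dir="data/imputed",
    save_metrics_dir="reports/mice_metrics",
    resulting_datasets=2,
    iterations=20,
    random_state=101,
)
```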
ml_tools/VIF_factor.py
CHANGED
@@ -26,6 +26,7 @@ def compute_vif(
     filename: Optional[str] = None,
     fontsize: int = 14,
     show_plot: bool = True,
+    verbose: bool = True
 ) -> pd.DataFrame:
     """
     Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.

@@ -52,19 +53,20 @@ def compute_vif(
     if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
-        if missing_features:
+        if missing_features and verbose:
             print(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-
+                if verbose:
+                    print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)

     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
-        if missing_ignore:
+        if missing_ignore and verbose:
             print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

@@ -182,7 +184,7 @@ def compute_vif_multi(input_directory: str,
                       max_features_to_plot: int = 20,
                       fontsize: int = 14):
     """
-    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

     Args:

@@ -210,10 +212,11 @@ def compute_vif_multi(input_directory: str,
                                     fontsize=fontsize,
                                     save_dir=output_plot_directory,
                                     filename=df_name,
-                                    show_plot=False
+                                    show_plot=False,
+                                    verbose=False)

         if output_dataset_directory is not None:
-            new_filename =
+            new_filename = df_name + '_VIF'
             result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)

             if len(dropped_cols) > 0:
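A short sketch of how the new `verbose` switch can silence the column warnings when `compute_vif` is called programmatically. The DataFrame below is made up for illustration and all other parameters keep their defaults:

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif

# Toy data; "label" is intentionally non-numeric so the warning path would normally fire.
df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0],
    "x2": [2.1, 3.9, 6.2, 8.1],
    "label": ["a", "b", "a", "b"],
})

# verbose=False suppresses the ⚠️ messages; the returned VIF table is unchanged.
vif_table = compute_vif(df=df, show_plot=False, verbose=False)
print(vif_table)
```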
ml_tools/data_exploration.py
CHANGED
@@ -5,10 +5,8 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple
+from typing import Union, Literal, Dict, Tuple, Iterator
 import os
-import sys
-import textwrap
 from ml_tools.utilities import sanitize_filename, _script_info

@@ -24,7 +22,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "distribute_datasets_by_target"
 ]

@@ -113,7 +112,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         round_digits (int): Number of decimal places for the percentage.
-
+
     Returns:
         pd.DataFrame: A DataFrame summarizing missing values in each column.
     """

@@ -133,13 +132,14 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary


-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.

     Parameters:
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
+        show_nulls_after (bool): Prints `show_null_columns` after dropping columns.

     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.

@@ -150,10 +150,15 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) ->
     if len(cols_to_drop) > 0:
         print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
         print(list(cols_to_drop))
+
+        result_df = df.drop(columns=cols_to_drop)
+        if show_nulls_after:
+            print(show_null_columns(df=result_df))
+
+        return result_df
     else:
         print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
-        return df.drop(columns=cols_to_drop)
+        return df

@@ -254,7 +259,7 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
         os.makedirs(save_dir, exist_ok=True)
         full_path = os.path.join(save_dir, plot_title + ".svg")
         plt.savefig(full_path, bbox_inches="tight", format='svg')
-        print(f"Saved correlation heatmap
+        print(f"Saved correlation heatmap: '{plot_title}.svg'")

     plt.show()
     plt.close()

@@ -514,6 +519,40 @@ def clip_outliers_multi(
     return new_df


+def distribute_datasets_by_target(
+    df: pd.DataFrame,
+    target_columns: list[str],
+    verbose: bool = False
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Preprocessed dataframe with all feature and target columns ready to train.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+    verbose: bool
+        Whether to print info for each yielded dataset.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * First element is the target column name.
+        * Second element is the corresponding cleaned DataFrame.
+    """
+    valid_targets = [col for col in df.columns if col in target_columns]
+    feature_columns = [col for col in df.columns if col not in valid_targets]
+
+    for target in valid_targets:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        if verbose:
+            print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
+        yield target, subset
+
+
 def _is_notebook():
     return get_ipython() is not None
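Since `distribute_datasets_by_target` is a generator, each per-target DataFrame can be consumed lazily. A small illustration; the toy DataFrame and column names are invented for the example:

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import distribute_datasets_by_target

df = pd.DataFrame({
    "feat_1": [1, 2, 3, 4],
    "feat_2": [0.5, 0.1, 0.7, 0.9],
    "target_a": [10.0, np.nan, 12.0, 13.0],   # the row with a NaN target is dropped
    "target_b": [0, 1, 1, 0],
})

for target_name, subset in distribute_datasets_by_target(df, ["target_a", "target_b"], verbose=True):
    # Each subset contains all features plus exactly one target column, placed last.
    print(target_name, subset.shape)
```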
ml_tools/ensemble_learning.py
CHANGED
@@ -6,7 +6,7 @@ from matplotlib.colors import Colormap
 from matplotlib import rcdefaults

 import os
-from typing import Literal, Union, Optional
+from typing import Literal, Union, Optional, Iterator, Tuple
 import joblib

 from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

@@ -29,7 +29,9 @@ warnings.filterwarnings('ignore', category=UserWarning)


 __all__ = [
-    "
+    "dataset_yielder",
+    "RegressionTreeModels",
+    "ClassificationTreeModels",
     "dataset_pipeline",
     "evaluate_model_classification",
     "plot_roc_curve",

@@ -39,114 +41,360 @@ __all__ = [
     "run_ensemble_pipeline"
 ]

+## Type aliases
+HandleImbalanceStrategy = Literal[
+    "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
+]
+
+TaskType = Literal[
+    "classification", "regression"
+]

 ###### 1. Dataset Loader ######
-
+def dataset_yielder(
+    df: pd.DataFrame,
+    target_cols: list[str]
+) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
+    """
+    Yields one tuple at a time:
+    (features_dataframe, target_series, feature_names, target_name)
+
+    Skips any target columns not found in the DataFrame.
+    """
+    # Determine which target columns actually exist in the DataFrame
+    valid_targets = [col for col in target_cols if col in df.columns]
+
+    # Features = all columns excluding valid target columns
+    df_features = df.drop(columns=valid_targets)
     feature_names = df_features.columns.to_list()
-
-    for target_col in
+
+    for target_col in valid_targets:
         df_target = df[target_col]
         yield (df_features, df_target, feature_names, target_col)

+
 ###### 2. Initialize Models ######
-
-    Valid tasks: "classification" or "regression".
-
-    For classification only: Set `is_balanced=False` for imbalanced datasets.
+class RegressionTreeModels:
+    """
+    A factory class for creating and configuring multiple gradient boosting regression models
+    with unified hyperparameters. This includes XGBoost, LightGBM, and HistGradientBoostingRegressor.
+
+    Use the `__call__`, `()` method.
+
+    Parameters
+    ----------
+    random_state : int
+        Seed used by the random number generator.
+
+    learning_rate : float [0.001 - 0.300]
+        Boosting learning rate (shrinkage).
+
+    L1_regularization : float [0.0 - 10.0]
+        L1 regularization term (alpha). Might drive to sparsity.
+
+    L2_regularization : float [0.0 - 10.0]
+        L2 regularization term (lambda).
+
+    n_estimators : int [100 - 3000]
+        Number of boosting iterations for XGBoost and LightGBM.
+
+    max_depth : int [3 - 15]
+        Maximum depth of individual trees. Controls model complexity; high values may overfit.
+
+    subsample : float [0.5 - 1.0]
+        Fraction of rows per tree; used to prevent overfitting.
+
+    colsample_bytree : float [0.3 - 1.0]
+        Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+    min_samples_leaf : int [10 - 100]
+        Minimum samples per leaf; higher = less overfitting (used in HistGB).
+
+    max_iter : int [100 - 2000]
+        Maximum number of iterations (used in HistGB).
+
+    min_child_weight : float [0.1 - 10.0]
+        Minimum sum of instance weight (hessian) needed in a child; larger values make the algorithm more conservative (used in XGBoost).
+
+    gamma : float [0.0 - 5.0]
+        Minimum loss reduction required to make a further partition on a leaf node; higher = more regularization (used in XGBoost).
+
+    num_leaves : int [20 - 200]
+        Maximum number of leaves in one tree; should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+    min_data_in_leaf : int [10 - 100]
+        Minimum number of data points in a leaf; increasing may prevent overfitting (used in LightGBM).
+    """
+    def __init__(self,
+                 random_state: int = 101,
+                 learning_rate: float = 0.005,
+                 L1_regularization: float = 1.0,
+                 L2_regularization: float = 1.0,
+                 n_estimators: int = 1000,
+                 max_depth: int = 8,
+                 subsample: float = 0.8,
+                 colsample_bytree: float = 0.8,
+                 min_samples_leaf: int = 50,
+                 max_iter: int = 1000,
+                 min_child_weight: float = 3.0,
+                 gamma: float = 1.0,
+                 num_leaves: int = 31,
+                 min_data_in_leaf: int = 40):
+        # General config
+        self.random_state = random_state
+        self.lr = learning_rate
+        self.L1 = L1_regularization
+        self.L2 = L2_regularization
+
+        # Shared tree structure
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        # XGBoost specific
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+
+        # LightGBM specific
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
+        self.num_leaves = num_leaves
+        self.min_data_in_leaf = min_data_in_leaf
+
+        # HistGB specific
+        self.max_iter = max_iter
+        self.min_samples_leaf = min_samples_leaf
+
+    def __call__(self) -> dict[str, object]:
+        """
+        Returns a dictionary with new instances of:
+        - "XGBoost": XGBRegressor
+        - "LightGBM": LGBMRegressor
+        - "HistGB": HistGradientBoostingRegressor
+        """
+        # XGBoost Regressor
+        xgb_model = xgb.XGBRegressor(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.lr,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            eval_metric='rmse',
+            min_child_weight=self.min_child_weight,
+            gamma=self.gamma,
+            tree_method='hist',
+            grow_policy='lossguide'
+        )
+
+        # LightGBM Regressor
+        lgb_model = lgb.LGBMRegressor(
+            n_estimators=self.n_estimators,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            verbose=-1,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            boosting_type='gbdt',
+            num_leaves=self.num_leaves,
+            min_data_in_leaf=self.min_data_in_leaf
+        )
+
+        # HistGradientBoosting Regressor
+        hist_model = HistGradientBoostingRegressor(
+            max_iter=self.max_iter,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            min_samples_leaf=self.min_samples_leaf,
+            random_state=self.random_state,
+            l2_regularization=self.L2,
+            scoring='neg_mean_squared_error',
+            early_stopping=True,
+            validation_fraction=0.1
+        )
+
+        return {
+            "XGBoost": xgb_model,
+            "LightGBM": lgb_model,
+            "HistGB": hist_model
+        }

-    raise ValueError(f"Invalid task: {task}. Must be 'classification' or 'regression'.")
-
-    models = {}
-
-    # Common parameters
-    xgb_params = {
-        'n_estimators': 200,
-        'max_depth': 5,
-        'learning_rate': learning_rate,
-        'subsample': 0.8,
-        'colsample_bytree': 0.8,
-        'random_state': random_state,
-        'reg_alpha': L1_regularization,
-        'reg_lambda': L2_regularization,
-    }
-
-    lgbm_params = {
-        'n_estimators': 200,
-        'learning_rate': learning_rate,
-        'max_depth': 5,
-        'subsample': 0.8,
-        'colsample_bytree': 0.8,
-        'random_state': random_state,
-        'verbose': -1,
-        'reg_alpha': L1_regularization,
-        'reg_lambda': L2_regularization,
-    }
-
-    hist_params = {
-        'max_iter': 200,
-        'learning_rate': learning_rate,
-        'max_depth': 5,
-        'min_samples_leaf': 30,
-        'random_state': random_state,
-        'l2_regularization': L2_regularization,
-    }
-
-    # XGB Model
-    if task == "classification":
-        xgb_params.update({
-            'scale_pos_weight': 1 if is_balanced else 8,
-            'eval_metric': 'aucpr'
-        })
-        models["XGBoost"] = xgb.XGBClassifier(**xgb_params)
-    else:
-        xgb_params.update({'eval_metric': 'rmse'})
-        models["XGBoost"] = xgb.XGBRegressor(**xgb_params)
+    def __str__(self):
+        return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

-    # LGBM Model
-    if task == "classification":
-        lgbm_params.update({
-            'class_weight': None if is_balanced else 'balanced',
-            'boosting_type': 'goss' if is_balanced else 'dart',
-        })
-        models["LightGBM"] = lgb.LGBMClassifier(**lgbm_params)
-    else:
-        lgbm_params['boosting_type'] = 'dart'
-        models["LightGBM"] = lgb.LGBMRegressor(**lgbm_params)

-
+class ClassificationTreeModels:
+    """
+    A factory class for creating and configuring multiple gradient boosting classification models
+    with unified hyperparameters. This includes: XGBoost, LightGBM, and HistGradientBoostingClassifier.
+
+    Use the `__call__`, `()` method.
+
+    Parameters
+    ----------
+    random_state : int
+        Seed used by the random number generator to ensure reproducibility.
+
+    learning_rate : float [0.001 - 0.300]
+        Boosting learning rate (shrinkage factor).
+
+    L1_regularization : float [0.0 - 10.0]
+        L1 regularization term (alpha), might drive to sparsity.
+
+    L2_regularization : float [0.0 - 10.0]
+        L2 regularization term (lambda).
+
+    n_estimators : int [100 - 3000]
+        Number of boosting rounds for XGBoost and LightGBM.
+
+    max_depth : int [3 - 15]
+        Maximum depth of individual trees in the ensemble. Controls model complexity; high values may overfit.
+
+    subsample : float [0.5 - 1.0]
+        Fraction of samples to use when fitting base learners; used to prevent overfitting.
+
+    colsample_bytree : float [0.3 - 1.0]
+        Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+    min_samples_leaf : int [10 - 100]
+        Minimum number of samples required to be at a leaf node; higher = less overfitting (used in HistGB).
+
+    max_iter : int [100 - 2000]
+        Maximum number of boosting iteration (used in HistGB).
+
+    min_child_weight : float [0.1 - 10.0]
+        Minimum sum of instance weight (Hessian) in a child node; larger values make the algorithm more conservative (used in XGBoost).
+
+    gamma : float [0.0 - 5.0]
+        Minimum loss reduction required to make a further partition; higher = more regularization (used in XGBoost).
+
+    num_leaves : int [20 - 200]
+        Maximum number of leaves in one tree. Should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+    min_data_in_leaf : int [10 -100]
+        Minimum number of samples required in a leaf; increasing may prevent overfitting (used in LightGBM).
+
+    Attributes
+    ----------
+    use_model_balance : bool
+        Indicates whether to apply class balancing strategies internally. Can be overridden at runtime via the `__call__` method.
+    """
+    def __init__(self,
+                 random_state: int = 101,
+                 learning_rate: float = 0.005,
+                 L1_regularization: float = 1.0,
+                 L2_regularization: float = 1.0,
+                 n_estimators: int = 1000,
+                 max_depth: int = 8,
+                 subsample: float = 0.8,
+                 colsample_bytree: float = 0.8,
+                 min_samples_leaf: int = 50,
+                 max_iter: int = 1000,
+                 min_child_weight: float = 3.0,
+                 gamma: float = 1.0,
+                 num_leaves: int = 31,
+                 min_data_in_leaf: int = 40):
+        # General config
+        self.random_state = random_state
+        self.lr = learning_rate
+        self.L1 = L1_regularization
+        self.L2 = L2_regularization
+
+        # To be set by the pipeline
+        self.use_model_balance: bool = True
+
+        # Shared tree structure
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        # XGBoost specific
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+
+        # LightGBM specific
+        num_leaves = min(num_leaves, 2 ** (max_depth - 1))
+        self.num_leaves = num_leaves
+        self.min_data_in_leaf = min_data_in_leaf
+
+        # HistGB specific
+        self.max_iter = max_iter
+        self.min_samples_leaf = min_samples_leaf
+
+    def __call__(self, use_model_balance: Optional[bool]=None) -> dict[str, object]:
+        """
+        Returns a dictionary with new instances of:
+        - "XGBoost": XGBClassifier
+        - "LightGBM": LGBMClassifier
+        - "HistGB": HistGradientBoostingClassifier
+        """
+        if use_model_balance is not None:
+            self.use_model_balance = use_model_balance
+
+        # XGBoost Classifier
+        xgb_model = xgb.XGBClassifier(
+            n_estimators=self.n_estimators,
+            max_depth=self.max_depth,
+            learning_rate=self.lr,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            eval_metric='aucpr',
+            min_child_weight=self.min_child_weight,
+            gamma=self.gamma,
+            tree_method='hist',
+            grow_policy='lossguide',
+            scale_pos_weight=8.0 if self.use_model_balance else 1.0
+        )
+
+        # LightGBM Classifier
+        lgb_model = lgb.LGBMClassifier(
+            n_estimators=self.n_estimators,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            subsample=self.subsample,
+            colsample_bytree=self.colsample_bytree,
+            random_state=self.random_state,
+            verbose=-1,
+            reg_alpha=self.L1,
+            reg_lambda=self.L2,
+            boosting_type='gbdt' if self.use_model_balance else 'goss',
+            num_leaves=self.num_leaves,
+            min_data_in_leaf=self.min_data_in_leaf,
+            class_weight='balanced' if self.use_model_balance else None
+        )
+
+        # HistGradientBoosting Classifier
+        hist_model = HistGradientBoostingClassifier(
+            max_iter=self.max_iter,
+            learning_rate=self.lr,
+            max_depth=self.max_depth,
+            min_samples_leaf=self.min_samples_leaf,
+            random_state=self.random_state,
+            l2_regularization=self.L2,
+            early_stopping=True,
+            validation_fraction=0.1,
+            class_weight='balanced' if self.use_model_balance else None,
+            scoring='balanced_accuracy' if self.use_model_balance else 'loss'
+        )
+
+        return {
+            "XGBoost": xgb_model,
+            "LightGBM": lgb_model,
+            "HistGB": hist_model
+        }
+
+    def __str__(self):
+        return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

-    return models

 ###### 3. Process Dataset ######
 # function to split data into train and test

@@ -157,7 +405,7 @@ def _split_data(features, target, test_size, random_state, task):

 # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
 def _resample(X_train: np.ndarray, y_train: pd.Series,
-              strategy:
+              strategy: HandleImbalanceStrategy, random_state):
     '''
     Oversample minority class or undersample majority class.

@@ -165,9 +413,9 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
     '''
     if strategy == 'SMOTE':
         resample_algorithm = SMOTE(random_state=random_state, k_neighbors=3)
-    elif strategy == '
+    elif strategy == 'RAND_OVERSAMPLE':
         resample_algorithm = RandomOverSampler(random_state=random_state)
-    elif strategy == '
+    elif strategy == 'RAND_UNDERSAMPLE':
         resample_algorithm = RandomUnderSampler(random_state=random_state)
     elif strategy == 'ADASYN':
         resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)

@@ -178,8 +426,8 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
     return X_res, y_res

 # DATASET PIPELINE
-def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task:
-                     resample_strategy:
+def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: TaskType,
+                     resample_strategy: HandleImbalanceStrategy,
                      test_size: float=0.2, debug: bool=False, random_state: int=101):
     '''
     1. Make Train/Test splits

@@ -204,7 +452,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite


     # Resample
-    if resample_strategy is None or task == "regression":
+    if resample_strategy is None or resample_strategy == "by_model" or task == "regression":
         X_train_oversampled, y_train_oversampled = X_train, y_train
     else:
         X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)

@@ -431,7 +679,7 @@ def evaluate_model_regression(model, model_name: str,
     sanitized_target_name = sanitize_filename(target_name)
     report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
     with open(report_path, "w") as f:
-        f.write(f"{model_name} - {target_name}
+        f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
         f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
         f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")

@@ -596,7 +844,7 @@ def get_shap_values(


 # TRAIN TEST PIPELINE
-def train_test_pipeline(model, model_name: str, dataset_id: str, task:
+def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
                         train_features: np.ndarray, train_target: np.ndarray,
                         test_features: np.ndarray, test_target: np.ndarray,
                         feature_names: list[str], target_name: str,

@@ -609,7 +857,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

     Returns: Tuple(Trained model, Test-set Predictions)
     '''
-    print(f"\
+    print(f"\tTraining model: {model_name} for Target: {target_name}...")
     trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
     if debug:
         print(f"Trained model object: {type(trained_model)}")

@@ -637,26 +885,40 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

     get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
                     features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
-    print("\t...done.")
+    # print("\t...done.")
     return trained_model, y_pred

 ###### 5. Execution ######
-def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str],
-
-                          test_size: float=0.2, debug:bool=False
+def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
+                          test_size: float=0.2, debug:bool=False):
+    #Check models
+    if isinstance(model_object, RegressionTreeModels):
+        task = "regression"
+    elif isinstance(model_object, ClassificationTreeModels):
+        task = "classification"
+        if handle_classification_imbalance is None:
+            print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
+        elif handle_classification_imbalance == "by_model":
+            model_object.use_model_balance = True
+        else:
+            model_object.use_model_balance = False
+    else:
+        raise TypeError(f"Unrecognized model {type(model_object)}")
+
     #Check paths
     _check_paths(datasets_dir, save_dir)
+
     #Yield imputed dataset
     for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
        #Yield features dataframe and target dataframe
-        for df_features, df_target, feature_names, target_name in
+        for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
            #Dataset pipeline
             X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
-                                                                resample_strategy=
-                                                                test_size=test_size, debug=debug, random_state=random_state)
+                                                                resample_strategy=handle_classification_imbalance,
+                                                                test_size=test_size, debug=debug, random_state=model_object.random_state)
            #Get models
-            models_dict =
-                L1_regularization=L1_regularization, L2_regularization=L2_regularization, learning_rate=learning_rate)
+            models_dict = model_object()
            #Train models
             for model_name, model in models_dict.items():
                 train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
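Taken together, the two factory classes replace the removed model-construction helper: hyperparameters are configured once, and `run_ensemble_pipeline` infers the task from the object it receives. A minimal regression sketch, assuming the module layout shown in the diff; the directory paths and target names are hypothetical:

```python
from ml_tools.ensemble_learning import RegressionTreeModels, run_ensemble_pipeline

# Configure shared hyperparameters once; calling the object returns fresh
# XGBoost / LightGBM / HistGB regressors built with those settings.
models = RegressionTreeModels(learning_rate=0.01, n_estimators=500, max_depth=6)

run_ensemble_pipeline(
    datasets_dir="data/imputed",        # hypothetical directory with one CSV per dataset
    save_dir="results/ensembles",       # hypothetical output directory
    target_columns=["target_a", "target_b"],
    model_object=models,                # task is inferred as "regression" from the object type
    test_size=0.2,
)
```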
ml_tools/logger.py
CHANGED
@@ -55,7 +55,7 @@ def custom_logger(
     """
     try:
         os.makedirs(save_directory, exist_ok=True)
-        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M")
+        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
         log_name = sanitize_filename(log_name)
         base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")

@@ -80,7 +80,7 @@ def custom_logger(
         else:
             raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")

-        print(f"Log saved to: {base_path}")
+        print(f"Log saved to: '{base_path}'")

     except Exception as e:
         print(f"Error in custom_logger: {e}")
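The timestamp change simply appends seconds to the log-file suffix, which avoids name collisions when several logs are written within the same minute. A standard-library illustration of the two formats:

```python
from datetime import datetime

now = datetime(2024, 5, 17, 14, 30, 59)
print(now.strftime(r"%Y%m%d_%H%M"))    # 1.4.2 style: 20240517_1430
print(now.strftime(r"%Y%m%d_%H%M%S"))  # 1.4.4 style: 20240517_143059
```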
ml_tools/particle_swarm_optimization.py
CHANGED

@@ -129,10 +129,10 @@ def run_pso(lower_boundaries: list[float],
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=1500,
             inequality_constrain_function=None,
-            post_hoc_analysis: Optional[int]=
-            workers: int=
+            post_hoc_analysis: Optional[int]=5,
+            workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

@@ -261,7 +261,7 @@ def info():
     _script_info(__all__)


-### SOURCE CODE FOR PSO ###
+### SOURCE CODE FOR PSO FROM PYSWARM ###
 def _obj_wrapper(func, args, kwargs, x):
     return func(x, *args, **kwargs)
ml_tools/utilities.py
CHANGED
@@ -95,7 +95,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
 def merge_dataframes(
     *dfs: pd.DataFrame,
     reset_index: bool = False,
-    direction: Literal["horizontal", "vertical"] = "horizontal"
+    direction: Literal["horizontal", "vertical"] = "horizontal",
+    verbose: bool=True
 ) -> pd.DataFrame:
     """
     Merges multiple DataFrames either horizontally or vertically.

@@ -119,8 +120,9 @@ def merge_dataframes(
     if len(dfs) < 2:
         raise ValueError("At least 2 DataFrames must be provided.")

-
-
+    if verbose:
+        for i, df in enumerate(dfs, start=1):
+            print(f"DataFrame {i} shape: {df.shape}")


     if direction == "horizontal":

@@ -142,8 +144,9 @@ def merge_dataframes(

     if reset_index:
         merged_df = merged_df.reset_index(drop=True)
-
-
+
+    if verbose:
+        print(f"Merged DataFrame shape: {merged_df.shape}")

     return merged_df

@@ -171,7 +174,7 @@ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
     output_path = os.path.join(save_dir, filename)

     df.to_csv(output_path, index=False, encoding='utf-8')
-    print(f"✅ Saved
+    print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")


 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
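The new `verbose` flag on `merge_dataframes` is what lets `save_imputed_datasets` merge silently; with the default `verbose=True` the shape reporting is unchanged. A small illustration with made-up frames:

```python
import pandas as pd
from ml_tools.utilities import merge_dataframes

features = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6]})
targets = pd.DataFrame({"y": [0, 1, 0]})

# verbose=True (default) prints each input shape and the merged shape;
# verbose=False performs the same horizontal merge without any output.
merged = merge_dataframes(features, targets, direction="horizontal", verbose=False)
print(merged.columns.tolist())  # ['x1', 'x2', 'y']
```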
dragon_ml_toolbox-1.4.2.dist-info/RECORD
DELETED

@@ -1,19 +0,0 @@
-dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
-ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
-ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=p8t5PI63N3G0ZgvOKmvFOvwJ24qqPdZCvyiDAx4ggXY,27670
-ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
-ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
-ml_tools/particle_swarm_optimization.py,sha256=3xsc6sg-5o3cPbG_dWUyF3HdRVxgL4k_kRuPMU11NnM,20020
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=Pou-8IZsZj9NiZ_shhLt552yaKNvbnQ1Ztoj6VMHIeE,10091
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-1.4.2.dist-info/METADATA,sha256=c95w_AETVdAwMYWrowJKxkC0wYCsgRrTmxyekPz7WBE,2516
-dragon_ml_toolbox-1.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.4.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.4.2.dist-info/RECORD,,
{dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/WHEEL: File without changes
{dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/licenses/LICENSE: File without changes
{dragon_ml_toolbox-1.4.2.dist-info → dragon_ml_toolbox-1.4.4.dist-info}/top_level.txt: File without changes