dragon-ml-toolbox 1.4.2__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 1.4.2
+ Version: 1.4.3
  Summary: A collection of tools for data science and machine learning projects
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -0,0 +1,19 @@
+ dragon_ml_toolbox-1.4.3.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-1.4.3.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
+ ml_tools/MICE_imputation.py,sha256=3CN_Z5NnQnr9BQOBcccIV13BcV-zRSvWUpYXoMZpPt8,10142
+ ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/data_exploration.py,sha256=iRMyn-H0ffjhLkL-B5zKSb1tlyT4bKm0H4vE_GMaXP0,19903
+ ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ ml_tools/ensemble_learning.py,sha256=5CCd8w0j-uDkf7ToN2ENT_KdZbB8ZQUFYlrKN-OHUxA,37533
+ ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
+ ml_tools/particle_swarm_optimization.py,sha256=g_KwPQL77HuVwceABP17RsF__qLmNAp2YVsXOxmFEOM,20034
+ ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ ml_tools/utilities.py,sha256=r7uEmo38Imly56BP3-Jv6dFJvLsbGipeBlkiZx2fcNQ,10189
+ ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+ dragon_ml_toolbox-1.4.3.dist-info/METADATA,sha256=l0uOaYlimIH_YCT89C2mOkHaiKDEtq9XxhHpbcMCppU,2516
+ dragon_ml_toolbox-1.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-1.4.3.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-1.4.3.dist-info/RECORD,,
@@ -8,7 +8,12 @@ This project depends on the following third-party packages. Each is governed by
  - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
  - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
  - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
+ - [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
+ - [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
+ - [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
+ - [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
  - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
+ - [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
  - [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
  - [imblearn](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/main/LICENSE)
  - [Pillow](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
@@ -19,5 +24,5 @@ This project depends on the following third-party packages. Each is governed by
  - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
  - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
- - [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
+ - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
  - [pyswarm](https://pythonhosted.org/pyswarm/#license)
ml_tools/MICE_imputation.py CHANGED
@@ -3,7 +3,7 @@ import miceforest as mf
  import os
  import matplotlib.pyplot as plt
  import numpy as np
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
  from plotnine import ggplot, labs, theme, element_blank # type: ignore


@@ -49,15 +49,11 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
  return kernel, imputed_datasets, imputed_dataset_names


- def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
- # Check path
- os.makedirs(save_dir, exist_ok=True)
-
+ def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
  for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
- output_path = os.path.join(save_dir, subname + ".csv")
- imputed_df.to_csv(output_path, index=False, encoding='utf-8')
- print(f"\tSaved {subname} with shape {imputed_df.shape}")
-
+ merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
+ save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
+

  #Get names of features that had missing values before imputation
  def get_na_column_names(df: pd.DataFrame):
@@ -119,7 +115,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
  plt.savefig(save_path, bbox_inches='tight', format="svg")
  plt.close()

- print(f"\t{dataset_file_dir} completed.")
+ print(f"{dataset_file_dir} completed.")


  # Imputed distributions
@@ -131,7 +127,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  '''
  # Check path
  os.makedirs(root_dir, exist_ok=True)
- local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
+ local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
+ local_save_dir = os.path.join(root_dir, local_dir_name)
  if not os.path.isdir(local_save_dir):
  os.makedirs(local_save_dir)

@@ -202,10 +199,10 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  fig = kernel.plot_imputed_distributions(variables=[feature])
  _process_figure(fig, feature)

- print("\tImputed distributions saved successfully.")
+ print(f"{local_dir_name} completed.")


- def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
  """
  Call functions in sequence for each dataset in the provided path or directory:
  1. Load dataframe
@@ -213,6 +210,8 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  3. Save imputed dataset(s)
  4. Save convergence metrics
  5. Save distribution metrics
+
+ Target columns must be skipped from the imputation.
  """
  # Check paths
  os.makedirs(save_datasets_dir, exist_ok=True)
@@ -228,9 +227,11 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  for df_path in all_file_paths:
  df, df_name = load_dataframe(df_path=df_path)

+ df, df_targets = _skip_targets(df, target_columns)
+
  kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

- save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
+ save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

  imputed_column_names = get_na_column_names(df=df)

@@ -239,5 +240,12 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)


+ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
+ valid_targets = [col for col in target_cols if col in df.columns]
+ df_targets = df[valid_targets]
+ df_feats = df.drop(columns=valid_targets)
+ return df_feats, df_targets
+
+
  def info():
  _script_info(__all__)
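`run_mice_pipeline` now expects the target columns up front, sets them aside before imputation, and re-attaches them when each imputed dataset is saved. A minimal usage sketch of the new signature; the paths and column names below are hypothetical:

```python
from ml_tools.MICE_imputation import run_mice_pipeline

# Hypothetical paths and target names, shown only to illustrate the new signature.
run_mice_pipeline(
    df_path_or_dir="data/raw",                 # a CSV file or a directory of CSVs
    target_columns=["target_a", "target_b"],   # excluded from imputation, merged back on save
    save_datasets_dir="data/imputed",
    save_metrics_dir="reports/mice_metrics",
    resulting_datasets=1,
    iterations=20,
    random_state=101,
)
```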
ml_tools/data_exploration.py CHANGED
@@ -5,10 +5,8 @@ import seaborn as sns
  from IPython import get_ipython
  from IPython.display import clear_output
  import time
- from typing import Union, Literal, Dict, Tuple
+ from typing import Union, Literal, Dict, Tuple, Iterator
  import os
- import sys
- import textwrap
  from ml_tools.utilities import sanitize_filename, _script_info


@@ -24,7 +22,8 @@ __all__ = [
  "check_value_distributions",
  "plot_value_distributions",
  "clip_outliers_single",
- "clip_outliers_multi"
+ "clip_outliers_multi",
+ "distribute_datasets_by_target"
  ]


@@ -113,7 +112,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
  Parameters:
  df (pd.DataFrame): The input DataFrame.
  round_digits (int): Number of decimal places for the percentage.
-
+
  Returns:
  pd.DataFrame: A DataFrame summarizing missing values in each column.
  """
@@ -133,13 +132,14 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
  return null_summary


- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
  """
  Drops columns with more than `threshold` fraction of missing values.

  Parameters:
  df (pd.DataFrame): The input DataFrame.
  threshold (float): Fraction of missing values above which columns are dropped.
+ show_nulls_after (bool): Prints `show_null_columns` after dropping columns.

  Returns:
  pd.DataFrame: A new DataFrame without the dropped columns.
@@ -150,10 +150,15 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) ->
  if len(cols_to_drop) > 0:
  print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
  print(list(cols_to_drop))
+
+ result_df = df.drop(columns=cols_to_drop)
+ if show_nulls_after:
+ show_null_columns(df=result_df).head(20)
+
+ return result_df
  else:
  print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
- return df.drop(columns=cols_to_drop)
+ return df


  def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
@@ -514,6 +519,34 @@ def clip_outliers_multi(
  return new_df


+ def distribute_datasets_by_target(
+ df: pd.DataFrame,
+ target_columns: list[str]
+ ) -> Iterator[Tuple[str, pd.DataFrame]]:
+ """
+ Yields cleaned DataFrames for each target column, where rows with missing
+ target values are removed. The target column is placed at the end.
+
+ Parameters
+ ----------
+ df : pd.DataFrame
+ Preprocessed dataframe with all feature and target columns ready to train.
+ target_columns : List[str]
+ List of target column names to generate per-target DataFrames.
+
+ Yields
+ ------
+ Tuple[str, pd.DataFrame]
+ * First element is the target column name.
+ * Second element is the corresponding cleaned DataFrame.
+ """
+ feature_columns = [col for col in df.columns if col not in target_columns]
+
+ for target in target_columns:
+ subset = df[feature_columns + [target]].dropna(subset=[target])
+ yield target, subset
+
+
  def _is_notebook():
  return get_ipython() is not None
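The new `distribute_datasets_by_target` generator yields one cleaned frame per target, dropping rows where that target is missing. A small sketch with toy data:

```python
import pandas as pd
from ml_tools.data_exploration import distribute_datasets_by_target

# Toy frame with two targets; 'target_b' has one missing value.
df = pd.DataFrame({
    "feat_1": [1.0, 2.0, 3.0],
    "feat_2": [0.1, 0.2, 0.3],
    "target_a": [10, 20, 30],
    "target_b": [1.5, None, 3.5],
})

for target_name, subset in distribute_datasets_by_target(df, target_columns=["target_a", "target_b"]):
    # Each subset keeps every feature plus a single target column placed last.
    print(target_name, subset.shape)   # target_a (3, 3), then target_b (2, 3)
```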
ml_tools/ensemble_learning.py CHANGED
@@ -6,7 +6,7 @@ from matplotlib.colors import Colormap
  from matplotlib import rcdefaults

  import os
- from typing import Literal, Union, Optional
+ from typing import Literal, Union, Optional, Iterator, Tuple
  import joblib

  from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -29,7 +29,9 @@ warnings.filterwarnings('ignore', category=UserWarning)


  __all__ = [
- "get_models",
+ "dataset_yielder",
+ "RegressionTreeModels",
+ "ClassificationTreeModels",
  "dataset_pipeline",
  "evaluate_model_classification",
  "plot_roc_curve",
@@ -39,114 +41,364 @@ __all__ = [
  "run_ensemble_pipeline"
  ]

+ ## Type aliases
+ HandleImbalanceStrategy = Literal[
+ "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
+ ]
+
+ TaskType = Literal[
+ "classification", "regression"
+ ]

  ###### 1. Dataset Loader ######
- #Split a dataset into features and targets datasets
- def _dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
- '''
- Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
- '''
- df_features = df.drop(columns=target_cols)
+ def dataset_yielder(
+ df: pd.DataFrame,
+ target_cols: list[str]
+ ) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
+ """
+ Yields one tuple at a time:
+ (features_dataframe, target_series, feature_names, target_name)
+
+ Skips any target columns not found in the DataFrame.
+ """
+ # Determine which target columns actually exist in the DataFrame
+ valid_targets = [col for col in target_cols if col in df.columns]
+
+ # Features = all columns excluding valid target columns
+ df_features = df.drop(columns=valid_targets)
  feature_names = df_features.columns.to_list()
-
- for target_col in target_cols:
+
+ for target_col in valid_targets:
  df_target = df[target_col]
  yield (df_features, df_target, feature_names, target_col)

+
  ###### 2. Initialize Models ######
- def get_models(task: Literal["classification", "regression"], random_state: int=101, is_balanced: bool = True,
- L1_regularization: float = 1.0, L2_regularization: float = 1.0, learning_rate: float=0.005) -> dict:
- '''
- Returns a dictionary `{Model_Name: Model}` with new instances of models.
- Valid tasks: "classification" or "regression".
+ class RegressionTreeModels:
+ """
+ A factory class for creating and configuring multiple gradient boosting regression models
+ with unified hyperparameters. This includes XGBoost, LightGBM, and HistGradientBoostingRegressor.

- Classification Models:
- - "XGBoost" - XGBClassifier
- - "LightGBM" - LGBMClassifier
- - "HistGB" - HistGradientBoostingClassifier
- Regression Models:
- - "XGBoost" - XGBRegressor
- - "LightGBM" - LGBMRegressor
- - "HistGB" - HistGradientBoostingRegressor
-
- For classification only: Set `is_balanced=False` for imbalanced datasets.
+ Use the `__call__`, `()` method.
+
+ Parameters
+ ----------
+ random_state : int
+ Seed used by the random number generator.
+
+ learning_rate : float [0.001 - 0.300]
+ Boosting learning rate (shrinkage).

- Increase L1 and L2 if model is overfitting
- '''
+ L1_regularization : float [0.0 - 10.0]
+ L1 regularization term (alpha). Might drive to sparsity.
+
+ L2_regularization : float [0.0 - 10.0]
+ L2 regularization term (lambda).
+
+ n_estimators : int [100 - 3000]
+ Number of boosting iterations for XGBoost and LightGBM.
+
+ max_depth : int [3 - 15]
+ Maximum depth of individual trees. Controls model complexity; high values may overfit.
+
+ subsample : float [0.5 - 1.0]
+ Fraction of rows per tree; used to prevent overfitting.
+
+ colsample_bytree : float [0.3 - 1.0]
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+ min_samples_leaf : int [10 - 100]
+ Minimum samples per leaf; higher = less overfitting (used in HistGB).
+
+ max_iter : int [100 - 2000]
+ Maximum number of iterations (used in HistGB).
+
+ min_child_weight : float [0.1 - 10.0]
+ Minimum sum of instance weight (hessian) needed in a child; larger values make the algorithm more conservative (used in XGBoost).
+
+ gamma : float [0.0 - 5.0]
+ Minimum loss reduction required to make a further partition on a leaf node; higher = more regularization (used in XGBoost).
+
+ num_leaves : int [20 - 200]
+ Maximum number of leaves in one tree; should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+ min_data_in_leaf : int [10 - 100]
+ Minimum number of data points in a leaf; increasing may prevent overfitting (used in LightGBM).
+ """
+ def __init__(self,
+ random_state: int = 101,
+ learning_rate: float = 0.005,
+ L1_regularization: float = 1.0,
+ L2_regularization: float = 1.0,
+ n_estimators: int = 1000,
+ max_depth: int = 8,
+ subsample: float = 0.8,
+ colsample_bytree: float = 0.8,
+ min_samples_leaf: int = 50,
+ max_iter: int = 1000,
+ min_child_weight: float = 3.0,
+ gamma: float = 1.0,
+ num_leaves: int = 31,
+ min_data_in_leaf: int = 40):
+ # General config
+ self.random_state = random_state
+ self.lr = learning_rate
+ self.L1 = L1_regularization
+ self.L2 = L2_regularization
+
+ # Shared tree structure
+ self.n_estimators = n_estimators
+ self.max_depth = max_depth
+ self.subsample = subsample
+ self.colsample_bytree = colsample_bytree
+
+ # XGBoost specific
+ self.min_child_weight = min_child_weight
+ self.gamma = gamma
+
+ # LightGBM specific
+ if num_leaves >= (2**max_depth):
+ num_leaves = (2**max_depth) - 1
+ print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+ self.num_leaves = num_leaves
+ self.min_data_in_leaf = min_data_in_leaf
+
+ # HistGB specific
+ self.max_iter = max_iter
+ self.min_samples_leaf = min_samples_leaf
+
+ def __call__(self) -> dict[str, object]:
+ """
+ Returns a dictionary with new instances of:
+ - "XGBoost": XGBRegressor
+ - "LightGBM": LGBMRegressor
+ - "HistGB": HistGradientBoostingRegressor
+ """
+ # XGBoost Regressor
+ xgb_model = xgb.XGBRegressor(
+ n_estimators=self.n_estimators,
+ max_depth=self.max_depth,
+ learning_rate=self.lr,
+ subsample=self.subsample,
+ colsample_bytree=self.colsample_bytree,
+ random_state=self.random_state,
+ reg_alpha=self.L1,
+ reg_lambda=self.L2,
+ eval_metric='rmse',
+ min_child_weight=self.min_child_weight,
+ gamma=self.gamma,
+ tree_method='hist',
+ grow_policy='lossguide'
+ )
+
+ # LightGBM Regressor
+ lgb_model = lgb.LGBMRegressor(
+ n_estimators=self.n_estimators,
+ learning_rate=self.lr,
+ max_depth=self.max_depth,
+ subsample=self.subsample,
+ colsample_bytree=self.colsample_bytree,
+ random_state=self.random_state,
+ verbose=-1,
+ reg_alpha=self.L1,
+ reg_lambda=self.L2,
+ boosting_type='dart',
+ num_leaves=self.num_leaves,
+ min_data_in_leaf=self.min_data_in_leaf
+ )
+
+ # HistGradientBoosting Regressor
+ hist_model = HistGradientBoostingRegressor(
+ max_iter=self.max_iter,
+ learning_rate=self.lr,
+ max_depth=self.max_depth,
+ min_samples_leaf=self.min_samples_leaf,
+ random_state=self.random_state,
+ l2_regularization=self.L2,
+ scoring='neg_mean_squared_error',
+ early_stopping=True,
+ validation_fraction=0.1
+ )
+
+ return {
+ "XGBoost": xgb_model,
+ "LightGBM": lgb_model,
+ "HistGB": hist_model
+ }

- # Model initialization logic
- if task not in ["classification", "regression"]:
- raise ValueError(f"Invalid task: {task}. Must be 'classification' or 'regression'.")
-
- models = {}
-
- # Common parameters
- xgb_params = {
- 'n_estimators': 200,
- 'max_depth': 5,
- 'learning_rate': learning_rate,
- 'subsample': 0.8,
- 'colsample_bytree': 0.8,
- 'random_state': random_state,
- 'reg_alpha': L1_regularization,
- 'reg_lambda': L2_regularization,
- }
-
- lgbm_params = {
- 'n_estimators': 200,
- 'learning_rate': learning_rate,
- 'max_depth': 5,
- 'subsample': 0.8,
- 'colsample_bytree': 0.8,
- 'random_state': random_state,
- 'verbose': -1,
- 'reg_alpha': L1_regularization,
- 'reg_lambda': L2_regularization,
- }
-
- hist_params = {
- 'max_iter': 200,
- 'learning_rate': learning_rate,
- 'max_depth': 5,
- 'min_samples_leaf': 30,
- 'random_state': random_state,
- 'l2_regularization': L2_regularization,
- }
-
- # XGB Model
- if task == "classification":
- xgb_params.update({
- 'scale_pos_weight': 1 if is_balanced else 8,
- 'eval_metric': 'aucpr'
- })
- models["XGBoost"] = xgb.XGBClassifier(**xgb_params)
- else:
- xgb_params.update({'eval_metric': 'rmse'})
- models["XGBoost"] = xgb.XGBRegressor(**xgb_params)
+ def __str__(self):
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

- # LGBM Model
- if task == "classification":
- lgbm_params.update({
- 'class_weight': None if is_balanced else 'balanced',
- 'boosting_type': 'goss' if is_balanced else 'dart',
- })
- models["LightGBM"] = lgb.LGBMClassifier(**lgbm_params)
- else:
- lgbm_params['boosting_type'] = 'dart'
- models["LightGBM"] = lgb.LGBMRegressor(**lgbm_params)

- # HistGB Model
- if task == "classification":
- hist_params.update({
- 'class_weight': None if is_balanced else 'balanced',
- 'scoring': 'loss' if is_balanced else 'balanced_accuracy',
- })
- models["HistGB"] = HistGradientBoostingClassifier(**hist_params)
- else:
- hist_params['scoring'] = 'neg_mean_squared_error'
- models["HistGB"] = HistGradientBoostingRegressor(**hist_params)
+ class ClassificationTreeModels:
+ """
+ A factory class for creating and configuring multiple gradient boosting classification models
+ with unified hyperparameters. This includes: XGBoost, LightGBM, and HistGradientBoostingClassifier.
+
+ Use the `__call__`, `()` method.
+
+ Parameters
+ ----------
+ random_state : int
+ Seed used by the random number generator to ensure reproducibility.
+
+ learning_rate : float [0.001 - 0.300]
+ Boosting learning rate (shrinkage factor).
+
+ L1_regularization : float [0.0 - 10.0]
+ L1 regularization term (alpha), might drive to sparsity.
+
+ L2_regularization : float [0.0 - 10.0]
+ L2 regularization term (lambda).
+
+ n_estimators : int [100 - 3000]
+ Number of boosting rounds for XGBoost and LightGBM.
+
+ max_depth : int [3 - 15]
+ Maximum depth of individual trees in the ensemble. Controls model complexity; high values may overfit.
+
+ subsample : float [0.5 - 1.0]
+ Fraction of samples to use when fitting base learners; used to prevent overfitting.
+
+ colsample_bytree : float [0.3 - 1.0]
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
+
+ min_samples_leaf : int [10 - 100]
+ Minimum number of samples required to be at a leaf node; higher = less overfitting (used in HistGB).
+
+ max_iter : int [100 - 2000]
+ Maximum number of boosting iteration (used in HistGB).
+
+ min_child_weight : float [0.1 - 10.0]
+ Minimum sum of instance weight (Hessian) in a child node; larger values make the algorithm more conservative (used in XGBoost).
+
+ gamma : float [0.0 - 5.0]
+ Minimum loss reduction required to make a further partition; higher = more regularization (used in XGBoost).
+
+ num_leaves : int [20 - 200]
+ Maximum number of leaves in one tree. Should be less than 2^(max_depth); larger = more complex (used in LightGBM).
+
+ min_data_in_leaf : int [10 -100]
+ Minimum number of samples required in a leaf; increasing may prevent overfitting (used in LightGBM).
+
+ Attributes
+ ----------
+ use_model_balance : bool
+ Indicates whether to apply class balancing strategies internally. Can be overridden at runtime via the `__call__` method.
+ """
+ def __init__(self,
+ random_state: int = 101,
+ learning_rate: float = 0.005,
+ L1_regularization: float = 1.0,
+ L2_regularization: float = 1.0,
+ n_estimators: int = 1000,
+ max_depth: int = 8,
+ subsample: float = 0.8,
+ colsample_bytree: float = 0.8,
+ min_samples_leaf: int = 50,
+ max_iter: int = 1000,
+ min_child_weight: float = 3.0,
+ gamma: float = 1.0,
+ num_leaves: int = 31,
+ min_data_in_leaf: int = 40):
+ # General config
+ self.random_state = random_state
+ self.lr = learning_rate
+ self.L1 = L1_regularization
+ self.L2 = L2_regularization
+
+ # To be set by the pipeline
+ self.use_model_balance: bool = True
+
+ # Shared tree structure
+ self.n_estimators = n_estimators
+ self.max_depth = max_depth
+ self.subsample = subsample
+ self.colsample_bytree = colsample_bytree
+
+ # XGBoost specific
+ self.min_child_weight = min_child_weight
+ self.gamma = gamma
+
+ # LightGBM specific
+ if num_leaves >= (2**max_depth):
+ num_leaves = (2**max_depth) - 1
+ print(f"⚠️ Warning: 'num_leaves' should be set proportional to 'max_depth'. Value set as {num_leaves}.")
+ self.num_leaves = num_leaves
+ self.min_data_in_leaf = min_data_in_leaf
+
+ # HistGB specific
+ self.max_iter = max_iter
+ self.min_samples_leaf = min_samples_leaf
+
+ def __call__(self, use_model_balance: Optional[bool]=None) -> dict[str, object]:
+ """
+ Returns a dictionary with new instances of:
+ - "XGBoost": XGBClassifier
+ - "LightGBM": LGBMClassifier
+ - "HistGB": HistGradientBoostingClassifier
+ """
+ if use_model_balance is not None:
+ self.use_model_balance = use_model_balance
+
+ # XGBoost Classifier
+ xgb_model = xgb.XGBClassifier(
+ n_estimators=self.n_estimators,
+ max_depth=self.max_depth,
+ learning_rate=self.lr,
+ subsample=self.subsample,
+ colsample_bytree=self.colsample_bytree,
+ random_state=self.random_state,
+ reg_alpha=self.L1,
+ reg_lambda=self.L2,
+ eval_metric='aucpr',
+ min_child_weight=self.min_child_weight,
+ gamma=self.gamma,
+ tree_method='hist',
+ grow_policy='lossguide',
+ scale_pos_weight=8.0 if self.use_model_balance else 1.0
+ )
+
+ # LightGBM Classifier
+ lgb_model = lgb.LGBMClassifier(
+ n_estimators=self.n_estimators,
+ learning_rate=self.lr,
+ max_depth=self.max_depth,
+ subsample=self.subsample,
+ colsample_bytree=self.colsample_bytree,
+ random_state=self.random_state,
+ verbose=-1,
+ reg_alpha=self.L1,
+ reg_lambda=self.L2,
+ boosting_type='dart' if self.use_model_balance else 'goss',
+ num_leaves=self.num_leaves,
+ min_data_in_leaf=self.min_data_in_leaf,
+ class_weight='balanced' if self.use_model_balance else None
+ )
+
+ # HistGradientBoosting Classifier
+ hist_model = HistGradientBoostingClassifier(
+ max_iter=self.max_iter,
+ learning_rate=self.lr,
+ max_depth=self.max_depth,
+ min_samples_leaf=self.min_samples_leaf,
+ random_state=self.random_state,
+ l2_regularization=self.L2,
+ early_stopping=True,
+ validation_fraction=0.1,
+ class_weight='balanced' if self.use_model_balance else None,
+ scoring='balanced_accuracy' if self.use_model_balance else 'loss'
+ )
+
+ return {
+ "XGBoost": xgb_model,
+ "LightGBM": lgb_model,
+ "HistGB": hist_model
+ }
+
+ def __str__(self):
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"

- return models

  ###### 3. Process Dataset ######
  # function to split data into train and test
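The two factory classes above replace `get_models`; calling an instance returns the same `{model_name: estimator}` dictionary the rest of the pipeline consumes. A hedged sketch of standalone use (the hyperparameter values are illustrative only, not recommendations from this release):

```python
from ml_tools.ensemble_learning import RegressionTreeModels, ClassificationTreeModels

# Configure once; each call builds fresh estimator instances.
reg_factory = RegressionTreeModels(learning_rate=0.01, n_estimators=500, max_depth=6)
reg_models = reg_factory()   # {"XGBoost": ..., "LightGBM": ..., "HistGB": ...}

# For classification, class balancing can be toggled per call.
clf_factory = ClassificationTreeModels(L1_regularization=2.0, L2_regularization=2.0)
clf_models = clf_factory(use_model_balance=True)

for name, model in reg_models.items():
    # XGBoost XGBRegressor, LightGBM LGBMRegressor, HistGB HistGradientBoostingRegressor
    print(name, type(model).__name__)
```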
@@ -157,7 +409,7 @@ def _split_data(features, target, test_size, random_state, task):

  # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
  def _resample(X_train: np.ndarray, y_train: pd.Series,
- strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
+ strategy: HandleImbalanceStrategy, random_state):
  '''
  Oversample minority class or undersample majority class.

@@ -165,9 +417,9 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
  '''
  if strategy == 'SMOTE':
  resample_algorithm = SMOTE(random_state=random_state, k_neighbors=3)
- elif strategy == 'RANDOM':
+ elif strategy == 'RAND_OVERSAMPLE':
  resample_algorithm = RandomOverSampler(random_state=random_state)
- elif strategy == 'UNDERSAMPLE':
+ elif strategy == 'RAND_UNDERSAMPLE':
  resample_algorithm = RandomUnderSampler(random_state=random_state)
  elif strategy == 'ADASYN':
  resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
@@ -178,8 +430,8 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
  return X_res, y_res

  # DATASET PIPELINE
- def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
- resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
+ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: TaskType,
+ resample_strategy: HandleImbalanceStrategy,
  test_size: float=0.2, debug: bool=False, random_state: int=101):
  '''
  1. Make Train/Test splits
@@ -204,7 +456,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite


  # Resample
- if resample_strategy is None or task == "regression":
+ if resample_strategy is None or resample_strategy == "by_model" or task == "regression":
  X_train_oversampled, y_train_oversampled = X_train, y_train
  else:
  X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
@@ -431,7 +683,7 @@ def evaluate_model_regression(model, model_name: str,
  sanitized_target_name = sanitize_filename(target_name)
  report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
  with open(report_path, "w") as f:
- f.write(f"{model_name} - {target_name} Regression Performance\n")
+ f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
  f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
  f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
  f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
@@ -596,7 +848,7 @@ def get_shap_values(


  # TRAIN TEST PIPELINE
- def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
+ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
  train_features: np.ndarray, train_target: np.ndarray,
  test_features: np.ndarray, test_target: np.ndarray,
  feature_names: list[str], target_name: str,
@@ -609,7 +861,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

  Returns: Tuple(Trained model, Test-set Predictions)
  '''
- print(f"\tModel: {model_name} for Target: {target_name}...")
+ print(f"\tTraining model: {model_name} for Target: {target_name}...")
  trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
  if debug:
  print(f"Trained model object: {type(trained_model)}")
@@ -637,26 +889,40 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

  get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
  features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
- print("\t...done.")
+ # print("\t...done.")
  return trained_model, y_pred

  ###### 5. Execution ######
- def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
- resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, save_model: bool=False,
- test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
+ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+ handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
+ test_size: float=0.2, debug:bool=False):
+ #Check models
+ if isinstance(model_object, RegressionTreeModels):
+ task = "regression"
+ elif isinstance(model_object, ClassificationTreeModels):
+ task = "classification"
+ if handle_classification_imbalance is None:
+ print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
+ elif handle_classification_imbalance == "by_model":
+ model_object.use_model_balance = True
+ else:
+ model_object.use_model_balance = False
+ else:
+ raise TypeError(f"Unrecognized model {type(model_object)}")
+
  #Check paths
  _check_paths(datasets_dir, save_dir)
+
  #Yield imputed dataset
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
  #Yield features dataframe and target dataframe
- for df_features, df_target, feature_names, target_name in _dataset_yielder(df=dataframe, target_cols=target_columns):
+ for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
  #Dataset pipeline
  X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
- resample_strategy=resample_strategy,
- test_size=test_size, debug=debug, random_state=random_state)
+ resample_strategy=handle_classification_imbalance,
+ test_size=test_size, debug=debug, random_state=model_object.random_state)
  #Get models
- models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
- L1_regularization=L1_regularization, L2_regularization=L2_regularization, learning_rate=learning_rate)
+ models_dict = model_object()
  #Train models
  for model_name, model in models_dict.items():
  train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
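With the reworked `run_ensemble_pipeline`, the task is inferred from the factory object that is passed in, and imbalance handling moves to `handle_classification_imbalance`. A minimal sketch; directory paths and target names are placeholders:

```python
from ml_tools.ensemble_learning import ClassificationTreeModels, run_ensemble_pipeline

model_object = ClassificationTreeModels(random_state=101)

run_ensemble_pipeline(
    datasets_dir="data/imputed",                  # placeholder path holding CSV datasets
    save_dir="results/ensemble",                  # placeholder output path
    target_columns=["target_a"],                  # placeholder target names
    model_object=model_object,                    # task is inferred as "classification"
    handle_classification_imbalance="by_model",   # or "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", None
    save_model=False,
    test_size=0.2,
)
```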
ml_tools/particle_swarm_optimization.py CHANGED
@@ -129,7 +129,7 @@ def run_pso(lower_boundaries: list[float],
  target_name: Union[str, None]=None,
  feature_names: Union[list[str], None]=None,
  swarm_size: int=200,
- max_iterations: int=400,
+ max_iterations: int=1000,
  inequality_constrain_function=None,
  post_hoc_analysis: Optional[int]=3,
  workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
@@ -261,7 +261,7 @@ def info():
  _script_info(__all__)


- ### SOURCE CODE FOR PSO ###
+ ### SOURCE CODE FOR PSO FROM PYSWARM ###
  def _obj_wrapper(func, args, kwargs, x):
  return func(x, *args, **kwargs)

ml_tools/utilities.py CHANGED
@@ -95,7 +95,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
  def merge_dataframes(
  *dfs: pd.DataFrame,
  reset_index: bool = False,
- direction: Literal["horizontal", "vertical"] = "horizontal"
+ direction: Literal["horizontal", "vertical"] = "horizontal",
+ verbose: bool=True
  ) -> pd.DataFrame:
  """
  Merges multiple DataFrames either horizontally or vertically.
@@ -119,8 +120,9 @@ def merge_dataframes(
  if len(dfs) < 2:
  raise ValueError("At least 2 DataFrames must be provided.")

- for i, df in enumerate(dfs, start=1):
- print(f"DataFrame {i} shape: {df.shape}")
+ if verbose:
+ for i, df in enumerate(dfs, start=1):
+ print(f"DataFrame {i} shape: {df.shape}")


  if direction == "horizontal":
@@ -142,8 +144,9 @@ def merge_dataframes(

  if reset_index:
  merged_df = merged_df.reset_index(drop=True)
-
- print(f"Merged DataFrame shape: {merged_df.shape}")
+
+ if verbose:
+ print(f"Merged DataFrame shape: {merged_df.shape}")

  return merged_df

@@ -171,7 +174,7 @@ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
  output_path = os.path.join(save_dir, filename)

  df.to_csv(output_path, index=False, encoding='utf-8')
- print(f"✅ Saved file: '{filename}'")
+ print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")


  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
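The `verbose` switch added here is what `save_imputed_datasets` uses to silence per-frame shape logging. A small sketch with toy frames; the output directory is hypothetical and is created up front so the save call can run:

```python
import os
import pandas as pd
from ml_tools.utilities import merge_dataframes, save_dataframe

features = pd.DataFrame({"feat_1": [1, 2], "feat_2": [3, 4]})
targets = pd.DataFrame({"target_a": [0, 1]})

# verbose=False suppresses the per-DataFrame and merged-shape prints.
merged = merge_dataframes(features, targets, direction="horizontal", verbose=False)

os.makedirs("output", exist_ok=True)   # hypothetical directory
save_dataframe(df=merged, save_dir="output", filename="train_data")
# The save message now reports the DataFrame shape in addition to the file name.
```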
@@ -1,19 +0,0 @@
- dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
- ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
- ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
- ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
- ml_tools/ensemble_learning.py,sha256=p8t5PI63N3G0ZgvOKmvFOvwJ24qqPdZCvyiDAx4ggXY,27670
- ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
- ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
- ml_tools/particle_swarm_optimization.py,sha256=3xsc6sg-5o3cPbG_dWUyF3HdRVxgL4k_kRuPMU11NnM,20020
- ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
- ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
- ml_tools/utilities.py,sha256=Pou-8IZsZj9NiZ_shhLt552yaKNvbnQ1Ztoj6VMHIeE,10091
- ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
- dragon_ml_toolbox-1.4.2.dist-info/METADATA,sha256=c95w_AETVdAwMYWrowJKxkC0wYCsgRrTmxyekPz7WBE,2516
- dragon_ml_toolbox-1.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-1.4.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-1.4.2.dist-info/RECORD,,