dragon-ml-toolbox 1.4.2__tar.gz → 1.4.4__tar.gz

This diff shows the changes between publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (24)
  1. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/LICENSE-THIRD-PARTY.md +6 -1
  2. {dragon_ml_toolbox-1.4.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.4}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  4. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/MICE_imputation.py +24 -16
  5. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/VIF_factor.py +9 -6
  6. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/data_exploration.py +48 -9
  7. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/ensemble_learning.py +377 -115
  8. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/logger.py +2 -2
  9. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/particle_swarm_optimization.py +4 -4
  10. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/utilities.py +9 -6
  11. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/pyproject.toml +1 -1
  12. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/LICENSE +0 -0
  13. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/README.md +0 -0
  14. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  15. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  16. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  17. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  18. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/__init__.py +0 -0
  19. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/datasetmaster.py +0 -0
  20. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/handle_excel.py +0 -0
  21. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/pytorch_models.py +0 -0
  22. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/trainer.py +0 -0
  23. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/ml_tools/vision_helpers.py +0 -0
  24. {dragon_ml_toolbox-1.4.2 → dragon_ml_toolbox-1.4.4}/setup.cfg +0 -0
@@ -8,7 +8,12 @@ This project depends on the following third-party packages. Each is governed by
8
8
  - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
9
9
  - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
10
10
  - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
11
+ - [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
12
+ - [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
13
+ - [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
14
+ - [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
11
15
  - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
16
+ - [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
12
17
  - [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
13
18
  - [imblearn](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/main/LICENSE)
14
19
  - [Pillow](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
@@ -19,5 +24,5 @@ This project depends on the following third-party packages. Each is governed by
19
24
  - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
20
25
  - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
21
26
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
22
- - [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
27
+ - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
23
28
  - [pyswarm](https://pythonhosted.org/pyswarm/#license)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.4.2
3
+ Version: 1.4.4
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.4.2
3
+ Version: 1.4.4
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -3,7 +3,7 @@ import miceforest as mf
3
3
  import os
4
4
  import matplotlib.pyplot as plt
5
5
  import numpy as np
6
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
6
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
7
7
  from plotnine import ggplot, labs, theme, element_blank # type: ignore
8
8
 
9
9
 
@@ -36,9 +36,9 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
36
36
  raise ValueError("No imputed datasets were generated. Check the MICE process.")
37
37
 
38
38
  if resulting_datasets == 1:
39
- imputed_dataset_names = [f"{df_name}_imputed"]
39
+ imputed_dataset_names = [f"{df_name}_MICE"]
40
40
  else:
41
- imputed_dataset_names = [f"{df_name}_imputed_{i+1}" for i in range(resulting_datasets)]
41
+ imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(resulting_datasets)]
42
42
 
43
43
  # Ensure indexes match
44
44
  for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
@@ -49,15 +49,11 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
49
49
  return kernel, imputed_datasets, imputed_dataset_names
50
50
 
51
51
 
52
- def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
53
- # Check path
54
- os.makedirs(save_dir, exist_ok=True)
55
-
52
+ def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
56
53
  for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
57
- output_path = os.path.join(save_dir, subname + ".csv")
58
- imputed_df.to_csv(output_path, index=False, encoding='utf-8')
59
- print(f"\tSaved {subname} with shape {imputed_df.shape}")
60
-
54
+ merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
55
+ save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
56
+
61
57
 
62
58
  #Get names of features that had missing values before imputation
63
59
  def get_na_column_names(df: pd.DataFrame):
@@ -119,7 +115,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
119
115
  plt.savefig(save_path, bbox_inches='tight', format="svg")
120
116
  plt.close()
121
117
 
122
- print(f"\t{dataset_file_dir} completed.")
118
+ print(f"{dataset_file_dir} completed.")
123
119
 
124
120
 
125
121
  # Imputed distributions
@@ -131,7 +127,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
131
127
  '''
132
128
  # Check path
133
129
  os.makedirs(root_dir, exist_ok=True)
134
- local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
130
+ local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
131
+ local_save_dir = os.path.join(root_dir, local_dir_name)
135
132
  if not os.path.isdir(local_save_dir):
136
133
  os.makedirs(local_save_dir)
137
134
 
@@ -202,10 +199,10 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
202
199
  fig = kernel.plot_imputed_distributions(variables=[feature])
203
200
  _process_figure(fig, feature)
204
201
 
205
- print("\tImputed distributions saved successfully.")
202
+ print(f"{local_dir_name} completed.")
206
203
 
207
204
 
208
- def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
205
+ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
209
206
  """
210
207
  Call functions in sequence for each dataset in the provided path or directory:
211
208
  1. Load dataframe
@@ -213,6 +210,8 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
213
210
  3. Save imputed dataset(s)
214
211
  4. Save convergence metrics
215
212
  5. Save distribution metrics
213
+
214
+ Target columns must be skipped from the imputation.
216
215
  """
217
216
  # Check paths
218
217
  os.makedirs(save_datasets_dir, exist_ok=True)
@@ -228,9 +227,11 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
228
227
  for df_path in all_file_paths:
229
228
  df, df_name = load_dataframe(df_path=df_path)
230
229
 
230
+ df, df_targets = _skip_targets(df, target_columns)
231
+
231
232
  kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
232
233
 
233
- save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
234
+ save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
234
235
 
235
236
  imputed_column_names = get_na_column_names(df=df)
236
237
 
@@ -239,5 +240,12 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
239
240
  get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
240
241
 
241
242
 
243
+ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
244
+ valid_targets = [col for col in target_cols if col in df.columns]
245
+ df_targets = df[valid_targets]
246
+ df_feats = df.drop(columns=valid_targets)
247
+ return df_feats, df_targets
248
+
249
+
242
250
  def info():
243
251
  _script_info(__all__)
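
A minimal usage sketch of the new run_mice_pipeline signature: target columns are excluded from imputation by the _skip_targets helper and merged back into each saved dataset. Paths, column names, and the ml_tools.MICE_imputation import path are illustrative assumptions, not part of the diff.

    from ml_tools.MICE_imputation import run_mice_pipeline

    # Hypothetical paths and target names, for illustration only
    run_mice_pipeline(
        df_path_or_dir="data/raw",                 # a CSV file or a directory of CSVs
        target_columns=["target_A", "target_B"],   # skipped during imputation, re-attached on save
        save_datasets_dir="data/imputed",
        save_metrics_dir="reports/mice_metrics",
        resulting_datasets=1,
        iterations=20,
        random_state=101,
    )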
@@ -26,6 +26,7 @@ def compute_vif(
26
26
  filename: Optional[str] = None,
27
27
  fontsize: int = 14,
28
28
  show_plot: bool = True,
29
+ verbose: bool = True
29
30
  ) -> pd.DataFrame:
30
31
  """
31
32
  Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
@@ -52,19 +53,20 @@ def compute_vif(
52
53
  if use_columns is None:
53
54
  sanitized_columns = df.select_dtypes(include='number').columns.tolist()
54
55
  missing_features = set(ground_truth_cols) - set(sanitized_columns)
55
- if missing_features:
56
+ if missing_features and verbose:
56
57
  print(f"⚠️ These columns are not Numeric:\n{missing_features}")
57
58
  else:
58
59
  sanitized_columns = list()
59
60
  for feature in use_columns:
60
61
  if feature not in ground_truth_cols:
61
- print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
62
+ if verbose:
63
+ print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
62
64
  else:
63
65
  sanitized_columns.append(feature)
64
66
 
65
67
  if ignore_columns is not None and use_columns is None:
66
68
  missing_ignore = set(ignore_columns) - set(ground_truth_cols)
67
- if missing_ignore:
69
+ if missing_ignore and verbose:
68
70
  print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
69
71
  sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
70
72
 
@@ -182,7 +184,7 @@ def compute_vif_multi(input_directory: str,
182
184
  max_features_to_plot: int = 20,
183
185
  fontsize: int = 14):
184
186
  """
185
- Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
187
+ Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
186
188
  Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.
187
189
 
188
190
  Args:
@@ -210,10 +212,11 @@ def compute_vif_multi(input_directory: str,
210
212
  fontsize=fontsize,
211
213
  save_dir=output_plot_directory,
212
214
  filename=df_name,
213
- show_plot=False)
215
+ show_plot=False,
216
+ verbose=False)
214
217
 
215
218
  if output_dataset_directory is not None:
216
- new_filename = 'VIF_' + df_name
219
+ new_filename = df_name + '_VIF'
217
220
  result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
218
221
 
219
222
  if len(dropped_cols) > 0:
@@ -5,10 +5,8 @@ import seaborn as sns
5
5
  from IPython import get_ipython
6
6
  from IPython.display import clear_output
7
7
  import time
8
- from typing import Union, Literal, Dict, Tuple
8
+ from typing import Union, Literal, Dict, Tuple, Iterator
9
9
  import os
10
- import sys
11
- import textwrap
12
10
  from ml_tools.utilities import sanitize_filename, _script_info
13
11
 
14
12
 
@@ -24,7 +22,8 @@ __all__ = [
24
22
  "check_value_distributions",
25
23
  "plot_value_distributions",
26
24
  "clip_outliers_single",
27
- "clip_outliers_multi"
25
+ "clip_outliers_multi",
26
+ "distribute_datasets_by_target"
28
27
  ]
29
28
 
30
29
 
@@ -113,7 +112,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
113
112
  Parameters:
114
113
  df (pd.DataFrame): The input DataFrame.
115
114
  round_digits (int): Number of decimal places for the percentage.
116
-
115
+
117
116
  Returns:
118
117
  pd.DataFrame: A DataFrame summarizing missing values in each column.
119
118
  """
@@ -133,13 +132,14 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
133
132
  return null_summary
134
133
 
135
134
 
136
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
135
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
137
136
  """
138
137
  Drops columns with more than `threshold` fraction of missing values.
139
138
 
140
139
  Parameters:
141
140
  df (pd.DataFrame): The input DataFrame.
142
141
  threshold (float): Fraction of missing values above which columns are dropped.
142
+ show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
143
143
 
144
144
  Returns:
145
145
  pd.DataFrame: A new DataFrame without the dropped columns.
@@ -150,10 +150,15 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) ->
150
150
  if len(cols_to_drop) > 0:
151
151
  print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
152
152
  print(list(cols_to_drop))
153
+
154
+ result_df = df.drop(columns=cols_to_drop)
155
+ if show_nulls_after:
156
+ print(show_null_columns(df=result_df))
157
+
158
+ return result_df
153
159
  else:
154
160
  print(f"No columns have more than {threshold*100:.0f}% missing data.")
155
-
156
- return df.drop(columns=cols_to_drop)
161
+ return df
157
162
 
158
163
 
159
164
  def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
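
drop_columns_with_missing_data now returns the pruned DataFrame from inside the branch and, via the new show_nulls_after flag, prints the null summary of what remains. A minimal sketch, assuming the module is importable as ml_tools.data_exploration:

    import numpy as np
    import pandas as pd
    from ml_tools.data_exploration import drop_columns_with_missing_data

    # "mostly_nan" is 75% missing, above the 0.7 threshold, so it is dropped
    df = pd.DataFrame({"a": [1, 2, 3, 4], "mostly_nan": [np.nan, np.nan, np.nan, 4.0]})
    df_clean = drop_columns_with_missing_data(df, threshold=0.7, show_nulls_after=True)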
@@ -254,7 +259,7 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
254
259
  os.makedirs(save_dir, exist_ok=True)
255
260
  full_path = os.path.join(save_dir, plot_title + ".svg")
256
261
  plt.savefig(full_path, bbox_inches="tight", format='svg')
257
- print(f"Saved correlation heatmap to: {full_path}")
262
+ print(f"Saved correlation heatmap: '{plot_title}.svg'")
258
263
 
259
264
  plt.show()
260
265
  plt.close()
@@ -514,6 +519,40 @@ def clip_outliers_multi(
514
519
  return new_df
515
520
 
516
521
 
522
+ def distribute_datasets_by_target(
523
+ df: pd.DataFrame,
524
+ target_columns: list[str],
525
+ verbose: bool = False
526
+ ) -> Iterator[Tuple[str, pd.DataFrame]]:
527
+ """
528
+ Yields cleaned DataFrames for each target column, where rows with missing
529
+ target values are removed. The target column is placed at the end.
530
+
531
+ Parameters
532
+ ----------
533
+ df : pd.DataFrame
534
+ Preprocessed dataframe with all feature and target columns ready to train.
535
+ target_columns : List[str]
536
+ List of target column names to generate per-target DataFrames.
537
+ verbose: bool
538
+ Whether to print info for each yielded dataset.
539
+
540
+ Yields
541
+ ------
542
+ Tuple[str, pd.DataFrame]
543
+ * First element is the target column name.
544
+ * Second element is the corresponding cleaned DataFrame.
545
+ """
546
+ valid_targets = [col for col in df.columns if col in target_columns]
547
+ feature_columns = [col for col in df.columns if col not in valid_targets]
548
+
549
+ for target in valid_targets:
550
+ subset = df[feature_columns + [target]].dropna(subset=[target])
551
+ if verbose:
552
+ print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
553
+ yield target, subset
554
+
555
+
517
556
  def _is_notebook():
518
557
  return get_ipython() is not None
519
558
 
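
The new distribute_datasets_by_target generator yields one cleaned DataFrame per target, dropping rows where that target is missing and placing the target as the last column. A minimal sketch, assuming the module is importable as ml_tools.data_exploration:

    import numpy as np
    import pandas as pd
    from ml_tools.data_exploration import distribute_datasets_by_target

    df = pd.DataFrame({
        "feat": [1.0, 2.0, 3.0],
        "y1": [0.5, np.nan, 1.5],   # the NaN row is dropped for y1 only
        "y2": [10.0, 20.0, 30.0],
    })
    for target_name, target_df in distribute_datasets_by_target(df, target_columns=["y1", "y2"], verbose=True):
        print(target_name, target_df.shape)   # y1 (2, 2), then y2 (3, 2)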
@@ -6,7 +6,7 @@ from matplotlib.colors import Colormap
6
6
  from matplotlib import rcdefaults
7
7
 
8
8
  import os
9
- from typing import Literal, Union, Optional
9
+ from typing import Literal, Union, Optional, Iterator, Tuple
10
10
  import joblib
11
11
 
12
12
  from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -29,7 +29,9 @@ warnings.filterwarnings('ignore', category=UserWarning)
29
29
 
30
30
 
31
31
  __all__ = [
32
- "get_models",
32
+ "dataset_yielder",
33
+ "RegressionTreeModels",
34
+ "ClassificationTreeModels",
33
35
  "dataset_pipeline",
34
36
  "evaluate_model_classification",
35
37
  "plot_roc_curve",
@@ -39,114 +41,360 @@ __all__ = [
39
41
  "run_ensemble_pipeline"
40
42
  ]
41
43
 
44
+ ## Type aliases
45
+ HandleImbalanceStrategy = Literal[
46
+ "ADASYN", "SMOTE", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
47
+ ]
48
+
49
+ TaskType = Literal[
50
+ "classification", "regression"
51
+ ]
42
52
 
43
53
  ###### 1. Dataset Loader ######
44
- #Split a dataset into features and targets datasets
45
- def _dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
46
- '''
47
- Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
48
- '''
49
- df_features = df.drop(columns=target_cols)
54
+ def dataset_yielder(
55
+ df: pd.DataFrame,
56
+ target_cols: list[str]
57
+ ) -> Iterator[Tuple[pd.DataFrame, pd.Series, list[str], str]]:
58
+ """
59
+ Yields one tuple at a time:
60
+ (features_dataframe, target_series, feature_names, target_name)
61
+
62
+ Skips any target columns not found in the DataFrame.
63
+ """
64
+ # Determine which target columns actually exist in the DataFrame
65
+ valid_targets = [col for col in target_cols if col in df.columns]
66
+
67
+ # Features = all columns excluding valid target columns
68
+ df_features = df.drop(columns=valid_targets)
50
69
  feature_names = df_features.columns.to_list()
51
-
52
- for target_col in target_cols:
70
+
71
+ for target_col in valid_targets:
53
72
  df_target = df[target_col]
54
73
  yield (df_features, df_target, feature_names, target_col)
55
74
 
75
+
56
76
  ###### 2. Initialize Models ######
57
- def get_models(task: Literal["classification", "regression"], random_state: int=101, is_balanced: bool = True,
58
- L1_regularization: float = 1.0, L2_regularization: float = 1.0, learning_rate: float=0.005) -> dict:
59
- '''
60
- Returns a dictionary `{Model_Name: Model}` with new instances of models.
61
- Valid tasks: "classification" or "regression".
77
+ class RegressionTreeModels:
78
+ """
79
+ A factory class for creating and configuring multiple gradient boosting regression models
80
+ with unified hyperparameters. This includes XGBoost, LightGBM, and HistGradientBoostingRegressor.
62
81
 
63
- Classification Models:
64
- - "XGBoost" - XGBClassifier
65
- - "LightGBM" - LGBMClassifier
66
- - "HistGB" - HistGradientBoostingClassifier
67
- Regression Models:
68
- - "XGBoost" - XGBRegressor
69
- - "LightGBM" - LGBMRegressor
70
- - "HistGB" - HistGradientBoostingRegressor
71
-
72
- For classification only: Set `is_balanced=False` for imbalanced datasets.
82
+ Use the `__call__`, `()` method.
83
+
84
+ Parameters
85
+ ----------
86
+ random_state : int
87
+ Seed used by the random number generator.
88
+
89
+ learning_rate : float [0.001 - 0.300]
90
+ Boosting learning rate (shrinkage).
73
91
 
74
- Increase L1 and L2 if model is overfitting
75
- '''
92
+ L1_regularization : float [0.0 - 10.0]
93
+ L1 regularization term (alpha). Might drive to sparsity.
94
+
95
+ L2_regularization : float [0.0 - 10.0]
96
+ L2 regularization term (lambda).
97
+
98
+ n_estimators : int [100 - 3000]
99
+ Number of boosting iterations for XGBoost and LightGBM.
100
+
101
+ max_depth : int [3 - 15]
102
+ Maximum depth of individual trees. Controls model complexity; high values may overfit.
103
+
104
+ subsample : float [0.5 - 1.0]
105
+ Fraction of rows per tree; used to prevent overfitting.
106
+
107
+ colsample_bytree : float [0.3 - 1.0]
108
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
109
+
110
+ min_samples_leaf : int [10 - 100]
111
+ Minimum samples per leaf; higher = less overfitting (used in HistGB).
112
+
113
+ max_iter : int [100 - 2000]
114
+ Maximum number of iterations (used in HistGB).
115
+
116
+ min_child_weight : float [0.1 - 10.0]
117
+ Minimum sum of instance weight (hessian) needed in a child; larger values make the algorithm more conservative (used in XGBoost).
118
+
119
+ gamma : float [0.0 - 5.0]
120
+ Minimum loss reduction required to make a further partition on a leaf node; higher = more regularization (used in XGBoost).
121
+
122
+ num_leaves : int [20 - 200]
123
+ Maximum number of leaves in one tree; should be less than 2^(max_depth); larger = more complex (used in LightGBM).
124
+
125
+ min_data_in_leaf : int [10 - 100]
126
+ Minimum number of data points in a leaf; increasing may prevent overfitting (used in LightGBM).
127
+ """
128
+ def __init__(self,
129
+ random_state: int = 101,
130
+ learning_rate: float = 0.005,
131
+ L1_regularization: float = 1.0,
132
+ L2_regularization: float = 1.0,
133
+ n_estimators: int = 1000,
134
+ max_depth: int = 8,
135
+ subsample: float = 0.8,
136
+ colsample_bytree: float = 0.8,
137
+ min_samples_leaf: int = 50,
138
+ max_iter: int = 1000,
139
+ min_child_weight: float = 3.0,
140
+ gamma: float = 1.0,
141
+ num_leaves: int = 31,
142
+ min_data_in_leaf: int = 40):
143
+ # General config
144
+ self.random_state = random_state
145
+ self.lr = learning_rate
146
+ self.L1 = L1_regularization
147
+ self.L2 = L2_regularization
148
+
149
+ # Shared tree structure
150
+ self.n_estimators = n_estimators
151
+ self.max_depth = max_depth
152
+ self.subsample = subsample
153
+ self.colsample_bytree = colsample_bytree
154
+
155
+ # XGBoost specific
156
+ self.min_child_weight = min_child_weight
157
+ self.gamma = gamma
158
+
159
+ # LightGBM specific
160
+ num_leaves = min(num_leaves, 2 ** (max_depth - 1))
161
+ self.num_leaves = num_leaves
162
+ self.min_data_in_leaf = min_data_in_leaf
163
+
164
+ # HistGB specific
165
+ self.max_iter = max_iter
166
+ self.min_samples_leaf = min_samples_leaf
167
+
168
+ def __call__(self) -> dict[str, object]:
169
+ """
170
+ Returns a dictionary with new instances of:
171
+ - "XGBoost": XGBRegressor
172
+ - "LightGBM": LGBMRegressor
173
+ - "HistGB": HistGradientBoostingRegressor
174
+ """
175
+ # XGBoost Regressor
176
+ xgb_model = xgb.XGBRegressor(
177
+ n_estimators=self.n_estimators,
178
+ max_depth=self.max_depth,
179
+ learning_rate=self.lr,
180
+ subsample=self.subsample,
181
+ colsample_bytree=self.colsample_bytree,
182
+ random_state=self.random_state,
183
+ reg_alpha=self.L1,
184
+ reg_lambda=self.L2,
185
+ eval_metric='rmse',
186
+ min_child_weight=self.min_child_weight,
187
+ gamma=self.gamma,
188
+ tree_method='hist',
189
+ grow_policy='lossguide'
190
+ )
191
+
192
+ # LightGBM Regressor
193
+ lgb_model = lgb.LGBMRegressor(
194
+ n_estimators=self.n_estimators,
195
+ learning_rate=self.lr,
196
+ max_depth=self.max_depth,
197
+ subsample=self.subsample,
198
+ colsample_bytree=self.colsample_bytree,
199
+ random_state=self.random_state,
200
+ verbose=-1,
201
+ reg_alpha=self.L1,
202
+ reg_lambda=self.L2,
203
+ boosting_type='gbdt',
204
+ num_leaves=self.num_leaves,
205
+ min_data_in_leaf=self.min_data_in_leaf
206
+ )
207
+
208
+ # HistGradientBoosting Regressor
209
+ hist_model = HistGradientBoostingRegressor(
210
+ max_iter=self.max_iter,
211
+ learning_rate=self.lr,
212
+ max_depth=self.max_depth,
213
+ min_samples_leaf=self.min_samples_leaf,
214
+ random_state=self.random_state,
215
+ l2_regularization=self.L2,
216
+ scoring='neg_mean_squared_error',
217
+ early_stopping=True,
218
+ validation_fraction=0.1
219
+ )
220
+
221
+ return {
222
+ "XGBoost": xgb_model,
223
+ "LightGBM": lgb_model,
224
+ "HistGB": hist_model
225
+ }
76
226
 
77
- # Model initialization logic
78
- if task not in ["classification", "regression"]:
79
- raise ValueError(f"Invalid task: {task}. Must be 'classification' or 'regression'.")
80
-
81
- models = {}
82
-
83
- # Common parameters
84
- xgb_params = {
85
- 'n_estimators': 200,
86
- 'max_depth': 5,
87
- 'learning_rate': learning_rate,
88
- 'subsample': 0.8,
89
- 'colsample_bytree': 0.8,
90
- 'random_state': random_state,
91
- 'reg_alpha': L1_regularization,
92
- 'reg_lambda': L2_regularization,
93
- }
94
-
95
- lgbm_params = {
96
- 'n_estimators': 200,
97
- 'learning_rate': learning_rate,
98
- 'max_depth': 5,
99
- 'subsample': 0.8,
100
- 'colsample_bytree': 0.8,
101
- 'random_state': random_state,
102
- 'verbose': -1,
103
- 'reg_alpha': L1_regularization,
104
- 'reg_lambda': L2_regularization,
105
- }
106
-
107
- hist_params = {
108
- 'max_iter': 200,
109
- 'learning_rate': learning_rate,
110
- 'max_depth': 5,
111
- 'min_samples_leaf': 30,
112
- 'random_state': random_state,
113
- 'l2_regularization': L2_regularization,
114
- }
115
-
116
- # XGB Model
117
- if task == "classification":
118
- xgb_params.update({
119
- 'scale_pos_weight': 1 if is_balanced else 8,
120
- 'eval_metric': 'aucpr'
121
- })
122
- models["XGBoost"] = xgb.XGBClassifier(**xgb_params)
123
- else:
124
- xgb_params.update({'eval_metric': 'rmse'})
125
- models["XGBoost"] = xgb.XGBRegressor(**xgb_params)
227
+ def __str__(self):
228
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"
126
229
 
127
- # LGBM Model
128
- if task == "classification":
129
- lgbm_params.update({
130
- 'class_weight': None if is_balanced else 'balanced',
131
- 'boosting_type': 'goss' if is_balanced else 'dart',
132
- })
133
- models["LightGBM"] = lgb.LGBMClassifier(**lgbm_params)
134
- else:
135
- lgbm_params['boosting_type'] = 'dart'
136
- models["LightGBM"] = lgb.LGBMRegressor(**lgbm_params)
137
230
 
138
- # HistGB Model
139
- if task == "classification":
140
- hist_params.update({
141
- 'class_weight': None if is_balanced else 'balanced',
142
- 'scoring': 'loss' if is_balanced else 'balanced_accuracy',
143
- })
144
- models["HistGB"] = HistGradientBoostingClassifier(**hist_params)
145
- else:
146
- hist_params['scoring'] = 'neg_mean_squared_error'
147
- models["HistGB"] = HistGradientBoostingRegressor(**hist_params)
231
+ class ClassificationTreeModels:
232
+ """
233
+ A factory class for creating and configuring multiple gradient boosting classification models
234
+ with unified hyperparameters. This includes: XGBoost, LightGBM, and HistGradientBoostingClassifier.
235
+
236
+ Use the `__call__`, `()` method.
237
+
238
+ Parameters
239
+ ----------
240
+ random_state : int
241
+ Seed used by the random number generator to ensure reproducibility.
242
+
243
+ learning_rate : float [0.001 - 0.300]
244
+ Boosting learning rate (shrinkage factor).
245
+
246
+ L1_regularization : float [0.0 - 10.0]
247
+ L1 regularization term (alpha), might drive to sparsity.
248
+
249
+ L2_regularization : float [0.0 - 10.0]
250
+ L2 regularization term (lambda).
251
+
252
+ n_estimators : int [100 - 3000]
253
+ Number of boosting rounds for XGBoost and LightGBM.
254
+
255
+ max_depth : int [3 - 15]
256
+ Maximum depth of individual trees in the ensemble. Controls model complexity; high values may overfit.
257
+
258
+ subsample : float [0.5 - 1.0]
259
+ Fraction of samples to use when fitting base learners; used to prevent overfitting.
260
+
261
+ colsample_bytree : float [0.3 - 1.0]
262
+ Fraction of features per tree; useful for regularization (used by XGBoost and LightGBM).
263
+
264
+ min_samples_leaf : int [10 - 100]
265
+ Minimum number of samples required to be at a leaf node; higher = less overfitting (used in HistGB).
266
+
267
+ max_iter : int [100 - 2000]
268
+ Maximum number of boosting iteration (used in HistGB).
269
+
270
+ min_child_weight : float [0.1 - 10.0]
271
+ Minimum sum of instance weight (Hessian) in a child node; larger values make the algorithm more conservative (used in XGBoost).
272
+
273
+ gamma : float [0.0 - 5.0]
274
+ Minimum loss reduction required to make a further partition; higher = more regularization (used in XGBoost).
275
+
276
+ num_leaves : int [20 - 200]
277
+ Maximum number of leaves in one tree. Should be less than 2^(max_depth); larger = more complex (used in LightGBM).
278
+
279
+ min_data_in_leaf : int [10 -100]
280
+ Minimum number of samples required in a leaf; increasing may prevent overfitting (used in LightGBM).
281
+
282
+ Attributes
283
+ ----------
284
+ use_model_balance : bool
285
+ Indicates whether to apply class balancing strategies internally. Can be overridden at runtime via the `__call__` method.
286
+ """
287
+ def __init__(self,
288
+ random_state: int = 101,
289
+ learning_rate: float = 0.005,
290
+ L1_regularization: float = 1.0,
291
+ L2_regularization: float = 1.0,
292
+ n_estimators: int = 1000,
293
+ max_depth: int = 8,
294
+ subsample: float = 0.8,
295
+ colsample_bytree: float = 0.8,
296
+ min_samples_leaf: int = 50,
297
+ max_iter: int = 1000,
298
+ min_child_weight: float = 3.0,
299
+ gamma: float = 1.0,
300
+ num_leaves: int = 31,
301
+ min_data_in_leaf: int = 40):
302
+ # General config
303
+ self.random_state = random_state
304
+ self.lr = learning_rate
305
+ self.L1 = L1_regularization
306
+ self.L2 = L2_regularization
307
+
308
+ # To be set by the pipeline
309
+ self.use_model_balance: bool = True
310
+
311
+ # Shared tree structure
312
+ self.n_estimators = n_estimators
313
+ self.max_depth = max_depth
314
+ self.subsample = subsample
315
+ self.colsample_bytree = colsample_bytree
316
+
317
+ # XGBoost specific
318
+ self.min_child_weight = min_child_weight
319
+ self.gamma = gamma
320
+
321
+ # LightGBM specific
322
+ num_leaves = min(num_leaves, 2 ** (max_depth - 1))
323
+ self.num_leaves = num_leaves
324
+ self.min_data_in_leaf = min_data_in_leaf
325
+
326
+ # HistGB specific
327
+ self.max_iter = max_iter
328
+ self.min_samples_leaf = min_samples_leaf
329
+
330
+ def __call__(self, use_model_balance: Optional[bool]=None) -> dict[str, object]:
331
+ """
332
+ Returns a dictionary with new instances of:
333
+ - "XGBoost": XGBClassifier
334
+ - "LightGBM": LGBMClassifier
335
+ - "HistGB": HistGradientBoostingClassifier
336
+ """
337
+ if use_model_balance is not None:
338
+ self.use_model_balance = use_model_balance
339
+
340
+ # XGBoost Classifier
341
+ xgb_model = xgb.XGBClassifier(
342
+ n_estimators=self.n_estimators,
343
+ max_depth=self.max_depth,
344
+ learning_rate=self.lr,
345
+ subsample=self.subsample,
346
+ colsample_bytree=self.colsample_bytree,
347
+ random_state=self.random_state,
348
+ reg_alpha=self.L1,
349
+ reg_lambda=self.L2,
350
+ eval_metric='aucpr',
351
+ min_child_weight=self.min_child_weight,
352
+ gamma=self.gamma,
353
+ tree_method='hist',
354
+ grow_policy='lossguide',
355
+ scale_pos_weight=8.0 if self.use_model_balance else 1.0
356
+ )
357
+
358
+ # LightGBM Classifier
359
+ lgb_model = lgb.LGBMClassifier(
360
+ n_estimators=self.n_estimators,
361
+ learning_rate=self.lr,
362
+ max_depth=self.max_depth,
363
+ subsample=self.subsample,
364
+ colsample_bytree=self.colsample_bytree,
365
+ random_state=self.random_state,
366
+ verbose=-1,
367
+ reg_alpha=self.L1,
368
+ reg_lambda=self.L2,
369
+ boosting_type='gbdt' if self.use_model_balance else 'goss',
370
+ num_leaves=self.num_leaves,
371
+ min_data_in_leaf=self.min_data_in_leaf,
372
+ class_weight='balanced' if self.use_model_balance else None
373
+ )
374
+
375
+ # HistGradientBoosting Classifier
376
+ hist_model = HistGradientBoostingClassifier(
377
+ max_iter=self.max_iter,
378
+ learning_rate=self.lr,
379
+ max_depth=self.max_depth,
380
+ min_samples_leaf=self.min_samples_leaf,
381
+ random_state=self.random_state,
382
+ l2_regularization=self.L2,
383
+ early_stopping=True,
384
+ validation_fraction=0.1,
385
+ class_weight='balanced' if self.use_model_balance else None,
386
+ scoring='balanced_accuracy' if self.use_model_balance else 'loss'
387
+ )
388
+
389
+ return {
390
+ "XGBoost": xgb_model,
391
+ "LightGBM": lgb_model,
392
+ "HistGB": hist_model
393
+ }
394
+
395
+ def __str__(self):
396
+ return f"{self.__class__.__name__}(n_estimators={self.n_estimators}, max_depth={self.max_depth}, lr={self.lr}, L1={self.L1}, L2={self.L2}"
148
397
 
149
- return models
150
398
 
151
399
  ###### 3. Process Dataset ######
152
400
  # function to split data into train and test
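
get_models is replaced by two factory classes, RegressionTreeModels and ClassificationTreeModels, which hold unified hyperparameters and return fresh model instances when called. A minimal sketch; the hyperparameter values and the ml_tools.ensemble_learning import path are illustrative assumptions:

    from ml_tools.ensemble_learning import ClassificationTreeModels, RegressionTreeModels

    reg_factory = RegressionTreeModels(learning_rate=0.01, max_depth=6, L2_regularization=2.0)
    reg_models = reg_factory()                        # {"XGBoost": ..., "LightGBM": ..., "HistGB": ...}

    clf_factory = ClassificationTreeModels(n_estimators=500)
    clf_models = clf_factory(use_model_balance=True)  # class_weight / scale_pos_weight handled by the models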
@@ -157,7 +405,7 @@ def _split_data(features, target, test_size, random_state, task):
157
405
 
158
406
  # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
159
407
  def _resample(X_train: np.ndarray, y_train: pd.Series,
160
- strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
408
+ strategy: HandleImbalanceStrategy, random_state):
161
409
  '''
162
410
  Oversample minority class or undersample majority class.
163
411
 
@@ -165,9 +413,9 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
165
413
  '''
166
414
  if strategy == 'SMOTE':
167
415
  resample_algorithm = SMOTE(random_state=random_state, k_neighbors=3)
168
- elif strategy == 'RANDOM':
416
+ elif strategy == 'RAND_OVERSAMPLE':
169
417
  resample_algorithm = RandomOverSampler(random_state=random_state)
170
- elif strategy == 'UNDERSAMPLE':
418
+ elif strategy == 'RAND_UNDERSAMPLE':
171
419
  resample_algorithm = RandomUnderSampler(random_state=random_state)
172
420
  elif strategy == 'ADASYN':
173
421
  resample_algorithm = ADASYN(random_state=random_state, n_neighbors=3)
@@ -178,8 +426,8 @@ def _resample(X_train: np.ndarray, y_train: pd.Series,
178
426
  return X_res, y_res
179
427
 
180
428
  # DATASET PIPELINE
181
- def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
182
- resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
429
+ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: TaskType,
430
+ resample_strategy: HandleImbalanceStrategy,
183
431
  test_size: float=0.2, debug: bool=False, random_state: int=101):
184
432
  '''
185
433
  1. Make Train/Test splits
@@ -204,7 +452,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
204
452
 
205
453
 
206
454
  # Resample
207
- if resample_strategy is None or task == "regression":
455
+ if resample_strategy is None or resample_strategy == "by_model" or task == "regression":
208
456
  X_train_oversampled, y_train_oversampled = X_train, y_train
209
457
  else:
210
458
  X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
@@ -431,7 +679,7 @@ def evaluate_model_regression(model, model_name: str,
431
679
  sanitized_target_name = sanitize_filename(target_name)
432
680
  report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
433
681
  with open(report_path, "w") as f:
434
- f.write(f"{model_name} - {target_name} Regression Performance\n")
682
+ f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
435
683
  f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
436
684
  f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
437
685
  f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
@@ -596,7 +844,7 @@ def get_shap_values(
596
844
 
597
845
 
598
846
  # TRAIN TEST PIPELINE
599
- def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
847
+ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
600
848
  train_features: np.ndarray, train_target: np.ndarray,
601
849
  test_features: np.ndarray, test_target: np.ndarray,
602
850
  feature_names: list[str], target_name: str,
@@ -609,7 +857,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
609
857
 
610
858
  Returns: Tuple(Trained model, Test-set Predictions)
611
859
  '''
612
- print(f"\tModel: {model_name} for Target: {target_name}...")
860
+ print(f"\tTraining model: {model_name} for Target: {target_name}...")
613
861
  trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
614
862
  if debug:
615
863
  print(f"Trained model object: {type(trained_model)}")
@@ -637,26 +885,40 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
637
885
 
638
886
  get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
639
887
  features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
640
- print("\t...done.")
888
+ # print("\t...done.")
641
889
  return trained_model, y_pred
642
890
 
643
891
  ###### 5. Execution ######
644
- def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
645
- resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, save_model: bool=False,
646
- test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
892
+ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
893
+ handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
894
+ test_size: float=0.2, debug:bool=False):
895
+ #Check models
896
+ if isinstance(model_object, RegressionTreeModels):
897
+ task = "regression"
898
+ elif isinstance(model_object, ClassificationTreeModels):
899
+ task = "classification"
900
+ if handle_classification_imbalance is None:
901
+ print("⚠️ No method to handle classification class imbalance has been selected. Datasets are assumed to be balanced.")
902
+ elif handle_classification_imbalance == "by_model":
903
+ model_object.use_model_balance = True
904
+ else:
905
+ model_object.use_model_balance = False
906
+ else:
907
+ raise TypeError(f"Unrecognized model {type(model_object)}")
908
+
647
909
  #Check paths
648
910
  _check_paths(datasets_dir, save_dir)
911
+
649
912
  #Yield imputed dataset
650
913
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
651
914
  #Yield features dataframe and target dataframe
652
- for df_features, df_target, feature_names, target_name in _dataset_yielder(df=dataframe, target_cols=target_columns):
915
+ for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
653
916
  #Dataset pipeline
654
917
  X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
655
- resample_strategy=resample_strategy,
656
- test_size=test_size, debug=debug, random_state=random_state)
918
+ resample_strategy=handle_classification_imbalance,
919
+ test_size=test_size, debug=debug, random_state=model_object.random_state)
657
920
  #Get models
658
- models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
659
- L1_regularization=L1_regularization, L2_regularization=L2_regularization, learning_rate=learning_rate)
921
+ models_dict = model_object()
660
922
  #Train models
661
923
  for model_name, model in models_dict.items():
662
924
  train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
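
run_ensemble_pipeline now takes a configured factory as model_object instead of task and regularization arguments, and the resampling choice moves to handle_classification_imbalance, which also accepts the new "by_model" option. A minimal sketch; directories, target names, and the import path are illustrative assumptions:

    from ml_tools.ensemble_learning import ClassificationTreeModels, run_ensemble_pipeline

    models = ClassificationTreeModels(learning_rate=0.01)
    run_ensemble_pipeline(
        datasets_dir="data/imputed",
        save_dir="results/ensemble",
        target_columns=["target_A"],
        model_object=models,
        handle_classification_imbalance="SMOTE",   # or "ADASYN", "RAND_OVERSAMPLE", "RAND_UNDERSAMPLE", "by_model", None
        save_model=False,
        test_size=0.2,
    )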
@@ -55,7 +55,7 @@ def custom_logger(
55
55
  """
56
56
  try:
57
57
  os.makedirs(save_directory, exist_ok=True)
58
- timestamp = datetime.now().strftime(r"%Y%m%d_%H%M")
58
+ timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
59
59
  log_name = sanitize_filename(log_name)
60
60
  base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")
61
61
 
@@ -80,7 +80,7 @@ def custom_logger(
80
80
  else:
81
81
  raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
82
82
 
83
- print(f"Log saved to: {base_path}")
83
+ print(f"Log saved to: '{base_path}'")
84
84
 
85
85
  except Exception as e:
86
86
  print(f"Error in custom_logger: {e}")
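
The log filename timestamp now includes seconds (%Y%m%d_%H%M%S), presumably so that logs written within the same minute no longer overwrite each other. For illustration:

    from datetime import datetime

    stamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
    print(f"experiment_log_{stamp}")   # e.g. experiment_log_20250131_154205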
@@ -129,10 +129,10 @@ def run_pso(lower_boundaries: list[float],
129
129
  target_name: Union[str, None]=None,
130
130
  feature_names: Union[list[str], None]=None,
131
131
  swarm_size: int=200,
132
- max_iterations: int=400,
132
+ max_iterations: int=1500,
133
133
  inequality_constrain_function=None,
134
- post_hoc_analysis: Optional[int]=3,
135
- workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
134
+ post_hoc_analysis: Optional[int]=5,
135
+ workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
136
136
  """
137
137
  Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
138
138
 
@@ -261,7 +261,7 @@ def info():
261
261
  _script_info(__all__)
262
262
 
263
263
 
264
- ### SOURCE CODE FOR PSO ###
264
+ ### SOURCE CODE FOR PSO FROM PYSWARM ###
265
265
  def _obj_wrapper(func, args, kwargs, x):
266
266
  return func(x, *args, **kwargs)
267
267
 
@@ -95,7 +95,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
95
95
  def merge_dataframes(
96
96
  *dfs: pd.DataFrame,
97
97
  reset_index: bool = False,
98
- direction: Literal["horizontal", "vertical"] = "horizontal"
98
+ direction: Literal["horizontal", "vertical"] = "horizontal",
99
+ verbose: bool=True
99
100
  ) -> pd.DataFrame:
100
101
  """
101
102
  Merges multiple DataFrames either horizontally or vertically.
@@ -119,8 +120,9 @@ def merge_dataframes(
119
120
  if len(dfs) < 2:
120
121
  raise ValueError("At least 2 DataFrames must be provided.")
121
122
 
122
- for i, df in enumerate(dfs, start=1):
123
- print(f"DataFrame {i} shape: {df.shape}")
123
+ if verbose:
124
+ for i, df in enumerate(dfs, start=1):
125
+ print(f"DataFrame {i} shape: {df.shape}")
124
126
 
125
127
 
126
128
  if direction == "horizontal":
@@ -142,8 +144,9 @@ def merge_dataframes(
142
144
 
143
145
  if reset_index:
144
146
  merged_df = merged_df.reset_index(drop=True)
145
-
146
- print(f"Merged DataFrame shape: {merged_df.shape}")
147
+
148
+ if verbose:
149
+ print(f"Merged DataFrame shape: {merged_df.shape}")
147
150
 
148
151
  return merged_df
149
152
 
@@ -171,7 +174,7 @@ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
171
174
  output_path = os.path.join(save_dir, filename)
172
175
 
173
176
  df.to_csv(output_path, index=False, encoding='utf-8')
174
- print(f"✅ Saved file: '{filename}'")
177
+ print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
175
178
 
176
179
 
177
180
  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "1.4.2"
3
+ version = "1.4.4"
4
4
  description = "A collection of tools for data science and machine learning projects"
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }