dragon-ml-toolbox 1.4.1__py3-none-any.whl → 1.4.2__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 1.4.1
+ Version: 1.4.2
  Summary: A collection of tools for data science and machine learning projects
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,19 +1,19 @@
- dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
+ dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-1.4.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
  ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
  ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
  ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
- ml_tools/ensemble_learning.py,sha256=khXXRiR7boWwI4CAvb2bxzS3fhLADNETMOiRe3ihZ4Y,28821
+ ml_tools/ensemble_learning.py,sha256=p8t5PI63N3G0ZgvOKmvFOvwJ24qqPdZCvyiDAx4ggXY,27670
  ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
  ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
- ml_tools/particle_swarm_optimization.py,sha256=714kZo6lvUvRaPTtj6kJGecZwHcehcSkLysokXAf3No,20706
+ ml_tools/particle_swarm_optimization.py,sha256=3xsc6sg-5o3cPbG_dWUyF3HdRVxgL4k_kRuPMU11NnM,20020
  ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
  ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
- ml_tools/utilities.py,sha256=z2JPy4GM2YBLUC0sPq7aNLuesPFAQu5KNcsgmuOywdU,8738
+ ml_tools/utilities.py,sha256=Pou-8IZsZj9NiZ_shhLt552yaKNvbnQ1Ztoj6VMHIeE,10091
  ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
- dragon_ml_toolbox-1.4.1.dist-info/METADATA,sha256=0XdPwNWe81rCvJLJfSS5XvB2ZdJKpBLLoqMU5uxYLMc,2516
- dragon_ml_toolbox-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-1.4.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-1.4.1.dist-info/RECORD,,
+ dragon_ml_toolbox-1.4.2.dist-info/METADATA,sha256=c95w_AETVdAwMYWrowJKxkC0wYCsgRrTmxyekPz7WBE,2516
+ dragon_ml_toolbox-1.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-1.4.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-1.4.2.dist-info/RECORD,,
ml_tools/ensemble_learning.py CHANGED

@@ -17,11 +17,10 @@ import xgboost as xgb
  import lightgbm as lgb

  from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
  from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
  import shap

- from .utilities import yield_dataframes_from_dir, sanitize_filename
+ from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info

  import warnings # Ignore warnings
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -29,9 +28,21 @@ warnings.filterwarnings('ignore', category=FutureWarning)
  warnings.filterwarnings('ignore', category=UserWarning)


+ __all__ = [
+     "get_models",
+     "dataset_pipeline",
+     "evaluate_model_classification",
+     "plot_roc_curve",
+     "evaluate_model_regression",
+     "get_shap_values",
+     "train_test_pipeline",
+     "run_ensemble_pipeline"
+ ]
+
+
  ###### 1. Dataset Loader ######
  #Split a dataset into features and targets datasets
- def dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
+ def _dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
      '''
      Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
      '''
@@ -144,22 +155,8 @@ def _split_data(features, target, test_size, random_state, task):
                                stratify=target if task=="classification" else None)
      return X_train, X_test, y_train, y_test

- # function to standardize the data
- def _standardize_data(train_features, test_features, scaler_code):
-     if scaler_code == "standard":
-         scaler = StandardScaler()
-     elif scaler_code == "minmax":
-         scaler = MinMaxScaler()
-     elif scaler_code == "maxabs":
-         scaler = MaxAbsScaler()
-     else:
-         raise ValueError(f"Unrecognized scaler {scaler_code}")
-     train_scaled = scaler.fit_transform(train_features)
-     test_scaled = scaler.transform(test_features)
-     return train_scaled, test_scaled, scaler
-
  # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
- def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
+ def _resample(X_train: np.ndarray, y_train: pd.Series,
                strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
      '''
      Oversample minority class or undersample majority class.
@@ -177,21 +174,20 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
      else:
          raise ValueError(f"Invalid resampling strategy: {strategy}")

-     X_res, y_res, *_ = resample_algorithm.fit_resample(X_train_scaled, y_train)
+     X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
      return X_res, y_res

  # DATASET PIPELINE
  def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
-                      resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None], scaler: Literal["standard", "minmax", "maxabs"],
+                      resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
                       test_size: float=0.2, debug: bool=False, random_state: int=101):
      '''
      1. Make Train/Test splits
-     2. Standardize Train and Test Features
-     3. Oversample imbalanced classes (classification)
+     2. Oversample imbalanced classes (classification)

-     Return a processed Tuple: (X_train, y_train, X_test, y_test, Scaler)
+     Return a processed Tuple: (X_train, y_train, X_test, y_test)

-     `(nD-array, 1D-array, nD-array, Series, Scaler)`
+     `(nD-array, 1D-array, nD-array, Series)`
      '''
      #DEBUG
      if debug:
@@ -206,24 +202,18 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
      if debug:
          print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

-     # Standardize
-     X_train_scaled, X_test_scaled, scaler_object = _standardize_data(train_features=X_train, test_features=X_test, scaler_code=scaler)
-
-     #DEBUG
-     if debug:
-         print(f"Shapes after scaling features - X_train: {X_train_scaled.shape}, y_train: {y_train.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")

-     # Scale
+     # Resample
      if resample_strategy is None or task == "regression":
-         X_train_oversampled, y_train_oversampled = X_train_scaled, y_train
+         X_train_oversampled, y_train_oversampled = X_train, y_train
      else:
-         X_train_oversampled, y_train_oversampled = _resample(X_train_scaled=X_train_scaled, y_train=y_train, strategy=resample_strategy, random_state=random_state)
+         X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)

      #DEBUG
      if debug:
-         print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
+         print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

-     return X_train_oversampled, y_train_oversampled, X_test_scaled, y_test, scaler_object
+     return X_train_oversampled, y_train_oversampled, X_test, y_test

  ###### 4. Train and Evaluation ######
  # Trainer function
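
Note: in 1.4.2 `dataset_pipeline` no longer standardizes features and returns a four-element tuple instead of five. A minimal usage sketch of the new signature; the toy DataFrame and column names below are hypothetical:

    import pandas as pd
    from ml_tools.ensemble_learning import dataset_pipeline

    # Hypothetical toy data: two continuous features, one binary target.
    df = pd.DataFrame({
        "feat_a": [0.1, 0.5, 0.9, 0.3, 0.7, 0.2],
        "feat_b": [10, 20, 30, 40, 50, 60],
        "label":  [0, 1, 0, 1, 0, 1],
    })

    # 1.4.2: no `scaler` argument and no Scaler in the return value.
    X_train, y_train, X_test, y_test = dataset_pipeline(
        df_features=df.drop(columns="label"),
        df_target=df["label"],
        task="classification",
        resample_strategy=None,   # skip over/undersampling
        test_size=0.2,
    )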
@@ -244,11 +234,11 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
      return model_dir

  # save model
- def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
+ def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
      #Sanitize filenames to save
      sanitized_target_name = sanitize_filename(target_name)
      full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
-     joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)
+     joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)

  # function to evaluate the model and save metrics (Classification)
  def evaluate_model_classification(
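
Note: with the `scaler_object` parameter gone, artifacts written by `_save_model` in 1.4.2 hold three keys instead of four. A sketch of inspecting one; the file path is hypothetical:

    import joblib

    artifact = joblib.load("results/LightGBM_my_target.joblib")  # hypothetical path
    # 1.4.2 artifacts contain exactly these keys; 1.4.1 also had 'scaler'.
    print(sorted(artifact.keys()))  # ['feature_names', 'model', 'target_name']

    # Inputs are now expected in the original, unscaled feature space:
    model = artifact["model"]
    # y_pred = model.predict(X_new)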
@@ -257,10 +247,9 @@ def evaluate_model_classification(
      save_dir: str,
      x_test_scaled: np.ndarray,
      single_y_test: np.ndarray,
-     target_id: str,
+     target_name: str,
      figsize: tuple = (10, 8),
-     title_fontsize: int = 24,
-     label_fontsize: int = 24,
+     base_fontsize: int = 24,
      cmap: Colormap = plt.cm.Blues # type: ignore
  ) -> np.ndarray:
      """
@@ -271,8 +260,8 @@ def evaluate_model_classification(
          model_name: Identifier for the model
          save_dir: Directory where results are saved
          x_test_scaled: Feature matrix for test set
-         single_y_test: True binary labels
-         target_id: Suffix for naming output files
+         single_y_test: True targets
+         target_name: Target name
          figsize: Size of the confusion matrix figure (width, height)
          fontsize: Font size used for title, axis labels and ticks
          cmap: Color map for the confusion matrix. Examples include:
@@ -300,10 +289,10 @@ def evaluate_model_classification(
      )

      # Save text report
-     sanitized_target_id = sanitize_filename(target_id)
-     report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_id}.txt")
+     sanitized_target_name = sanitize_filename(target_name)
+     report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
      with open(report_path, "w") as f:
-         f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
+         f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
          f.write("Classification Report:\n")
          f.write(report) # type: ignore
@@ -318,20 +307,20 @@ def evaluate_model_classification(
          ax=ax
      )

-     ax.set_title(f"{model_name} - {target_id}", fontsize=title_fontsize)
-     ax.tick_params(axis='both', labelsize=label_fontsize)
-     ax.set_xlabel("Predicted label", fontsize=label_fontsize)
-     ax.set_ylabel("True label", fontsize=label_fontsize)
+     ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
+     ax.tick_params(axis='both', labelsize=base_fontsize)
+     ax.set_xlabel("Predicted label", fontsize=base_fontsize)
+     ax.set_ylabel("True label", fontsize=base_fontsize)

      # Turn off gridlines
      ax.grid(False)

      # Manually update font size of cell texts
      for text in ax.texts:
-         text.set_fontsize(title_fontsize+4)
+         text.set_fontsize(base_fontsize+4)

      fig.tight_layout()
-     fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_id}.svg")
+     fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
      fig.savefig(fig_path, format="svg", bbox_inches="tight")
      plt.close(fig)
@@ -356,7 +345,7 @@ def plot_roc_curve(
      Parameters:
          true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
          probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
-         target_name: str, used for figure title and filename.
+         target_name: str, Target name.
          save_directory: str, path to directory where figure is saved.
          color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
             - Named colors: "darkorange", "blue", "red", "green", "black"
@@ -425,7 +414,7 @@ def plot_roc_curve(
  def evaluate_model_regression(model, model_name: str,
                                save_dir: str,
                                x_test_scaled: np.ndarray, single_y_test: np.ndarray,
-                               target_id: str,
+                               target_name: str,
                                figure_size: tuple = (12, 8),
                                alpha_transparency: float = 0.5,
                                base_fontsize: int = 24):
@@ -439,10 +428,10 @@ def evaluate_model_regression(model, model_name: str,
      r2 = r2_score(single_y_test, y_pred)

      # Create formatted report
-     sanitized_target_id = sanitize_filename(target_id)
-     report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_id}.txt")
+     sanitized_target_name = sanitize_filename(target_name)
+     report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
      with open(report_path, "w") as f:
-         f.write(f"{model_name} - {target_id} Regression Performance\n")
+         f.write(f"{model_name} - {target_name} Regression Performance\n")
          f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
          f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
          f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
@@ -455,10 +444,10 @@ def evaluate_model_regression(model, model_name: str,
      plt.axhline(0, color='red', linestyle='--')
      plt.xlabel("Predicted Values", fontsize=base_fontsize)
      plt.ylabel("Residuals", fontsize=base_fontsize)
-     plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
+     plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
      plt.grid(True)
      plt.tight_layout()
-     plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_id}.svg"), bbox_inches='tight', format="svg")
+     plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
      plt.close()

      # Create true vs predicted values plot
  # Create true vs predicted values plot
@@ -469,9 +458,9 @@ def evaluate_model_regression(model, model_name: str,
469
458
  'k--', lw=2)
470
459
  plt.xlabel('True Values', fontsize=base_fontsize)
471
460
  plt.ylabel('Predictions', fontsize=base_fontsize)
472
- plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
461
+ plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
473
462
  plt.grid(True)
474
- plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_id}.svg")
463
+ plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
475
464
  plt.savefig(plot_path, bbox_inches='tight', format="svg")
476
465
  plt.close()
477
466
 
@@ -485,7 +474,7 @@ def get_shap_values(
      save_dir: str,
      features_to_explain: np.ndarray,
      feature_names: list[str],
-     target_id: str,
+     target_name: str,
      task: Literal["classification", "regression"],
      max_display_features: int = 10,
      figsize: tuple = (16, 20),
@@ -504,7 +493,7 @@ def get_shap_values(
          features_to_explain: Should match the model's training data format, including scaling.
          save_dir: Directory to save visualizations
      """
-     sanitized_target_id = sanitize_filename(target_id)
+     sanitized_target_name = sanitize_filename(target_name)

      def _apply_plot_style():
          styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -567,9 +556,9 @@ def get_shap_values(
              _create_shap_plot(
                  shap_values=class_shap,
                  features=features_to_explain,
-                 save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_Class{class_name}_{plot_type}.svg"),
+                 save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
                  plot_type=plot_type,
-                 title=f"{model_name} - {target_id} (Class {class_name})"
+                 title=f"{model_name} - {target_name} (Class {class_name})"
              )
      else:
          values = shap_values[1] if isinstance(shap_values, list) else shap_values
@@ -577,9 +566,9 @@ def get_shap_values(
          _create_shap_plot(
              shap_values=values,
              features=features_to_explain,
-             save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
+             save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
              plot_type=plot_type,
-             title=f"{model_name} - {target_id}"
+             title=f"{model_name} - {target_name}"
          )

  def _plot_for_regression(shap_values):
  def _plot_for_regression(shap_values):
@@ -587,9 +576,9 @@ def get_shap_values(
587
576
  _create_shap_plot(
588
577
  shap_values=shap_values,
589
578
  features=features_to_explain,
590
- save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
579
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
591
580
  plot_type=plot_type,
592
- title=f"{model_name} - {target_id}"
581
+ title=f"{model_name} - {target_name}"
593
582
  )
594
583
  #START_O
595
584
 
@@ -610,7 +599,7 @@ def get_shap_values(
  def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
                          train_features: np.ndarray, train_target: np.ndarray,
                          test_features: np.ndarray, test_target: np.ndarray,
-                         feature_names: list[str], target_id: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler],
+                         feature_names: list[str], target_name: str,
                          save_dir: str,
                          debug: bool=False, save_model: bool=False):
      '''
@@ -620,7 +609,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

      Returns: Tuple(Trained model, Test-set Predictions)
      '''
-     print(f"\tModel: {model_name} for Target: {target_id}...")
+     print(f"\tModel: {model_name} for Target: {target_name}...")
      trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
      if debug:
          print(f"Trained model object: {type(trained_model)}")
@@ -628,42 +617,42 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

      if save_model:
          _save_model(trained_model=trained_model, model_name=model_name,
-                     target_name=target_id, feature_names=feature_names,
-                     save_directory=local_save_directory, scaler_object=scaler_object)
+                     target_name=target_name, feature_names=feature_names,
+                     save_directory=local_save_directory)

      if task == "classification":
          y_pred = evaluate_model_classification(model=trained_model, model_name=model_name, save_dir=local_save_directory,
-                                                x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
+                                                x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
          plot_roc_curve(true_labels=test_target,
                         probabilities_or_model=trained_model, model_name=model_name,
-                        target_name=target_id, save_directory=local_save_directory,
+                        target_name=target_name, save_directory=local_save_directory,
                         input_features=test_features)
      elif task == "regression":
          y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
-                                            x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
+                                            x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
      else:
          raise ValueError(f"Unrecognized task '{task}' for model training,")
      if debug:
          print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")

      get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
-                     features_to_explain=train_features, feature_names=feature_names, target_id=target_id, task=task)
+                     features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
      print("\t...done.")
      return trained_model, y_pred

  ###### 5. Execution ######
  def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
-                           resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, scaler: Literal["standard", "minmax", "maxabs"]="minmax", save_model: bool=False,
+                           resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, save_model: bool=False,
                            test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
      #Check paths
      _check_paths(datasets_dir, save_dir)
      #Yield imputed dataset
      for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
          #Yield features dataframe and target dataframe
-         for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
+         for df_features, df_target, feature_names, target_name in _dataset_yielder(df=dataframe, target_cols=target_columns):
              #Dataset pipeline
-             X_train, y_train, X_test, y_test, scaler_object = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
-                                                                                resample_strategy=resample_strategy, scaler=scaler,
+             X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
+                                                                 resample_strategy=resample_strategy,
                                                                  test_size=test_size, debug=debug, random_state=random_state)
              #Get models
              models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
@@ -673,7 +662,7 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
              train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
                                  train_features=X_train, train_target=y_train, # type: ignore
                                  test_features=X_test, test_target=y_test,
-                                 feature_names=feature_names, target_id=target_name, scaler_object=scaler_object,
+                                 feature_names=feature_names, target_name=target_name,
                                  debug=debug, save_dir=save_dir, save_model=save_model)
      print("\n✅ Training and evaluation complete.")
@@ -683,3 +672,7 @@ def _check_paths(datasets_dir: str, save_dir:str):
          os.makedirs(save_dir)
      if not os.path.isdir(datasets_dir):
          raise IOError(f"Datasets directory '{datasets_dir}' not found.")
+
+
+ def info():
+     _script_info(__all__)
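
Note: callers migrating to 1.4.2 simply drop the `scaler` keyword from `run_ensemble_pipeline`. A sketch of the new call; the paths and target column are hypothetical:

    from ml_tools.ensemble_learning import run_ensemble_pipeline

    run_ensemble_pipeline(
        datasets_dir="data/imputed",   # hypothetical directory of datasets
        save_dir="results",
        target_columns=["my_target"],
        task="regression",
        resample_strategy=None,
        save_model=True,               # artifacts now store model + feature/target names only
    )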
ml_tools/particle_swarm_optimization.py CHANGED

@@ -5,11 +5,10 @@ import xgboost as xgb
  import lightgbm as lgb
  from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
  from sklearn.base import ClassifierMixin
- from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
- from typing import Literal, Union, Tuple, Dict
+ from typing import Literal, Union, Tuple, Dict, Optional
  import polars as pl
  from functools import partial
- from .utilities import sanitize_filename, _script_info
+ from .utilities import sanitize_filename, _script_info, threshold_binary_values


  __all__ = [
@@ -20,14 +19,14 @@ __all__ = [

  class ObjectiveFunction():
      """
-     Callable objective function designed for optimizing continuous outputs from regression models.
+     Callable objective function designed for optimizing continuous outputs from tree-based regression models.

-     The target serialized file (joblib) must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.
+     The target serialized file (joblib) must include a trained tree-based 'model'. Additionally 'feature_names' and 'target_name' will be parsed if present.

      Parameters
      ----------
      trained_model_path : str
-         Path to a serialized model and its scaler (joblib) compatible with scikit-learn-like `.predict`.
+         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
      add_noise : bool
          Whether to apply multiplicative noise to the input features during evaluation.
      binary_features : int, default=0
@@ -35,15 +34,14 @@ class ObjectiveFunction():
      task : Literal, default 'maximization'
          Whether to maximize or minimize the target.
      """
-     def __init__(self, trained_model_path: str, add_noise: bool=True, task: Literal["maximization", "minimization"]="maximization", binary_features: int=0) -> None:
+     def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int=0) -> None:
          self.binary_features = binary_features
          self.is_hybrid = False if binary_features <= 0 else True
          self.use_noise = add_noise
          self._artifact = joblib.load(trained_model_path)
          self.model = self._get_from_artifact('model')
-         self.scaler = self._get_from_artifact('scaler')
-         self.feature_names: list[str] = self._get_from_artifact('feature_names') # type: ignore
-         self.target_name: str = self._get_from_artifact('target_name') # type: ignore
+         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
+         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
          self.task = task
          self.check_model() # check for classification models and None values
@@ -51,16 +49,15 @@ class ObjectiveFunction():
          if self.use_noise:
              features_array = self.add_noise(features_array)
          if self.is_hybrid:
-             features_array = self._handle_hybrid(features_array)
+             features_array = threshold_binary_values(input_array=features_array, binary_features=self.binary_features)

          if features_array.ndim == 1:
              features_array = features_array.reshape(1, -1)

-         # scale features as the model expects
-         features_array = self.scaler.transform(features_array) # type: ignore
-
          result = self.model.predict(features_array) # type: ignore
          scalar = result.item()
+         # print(f"[DEBUG] Model predicted: {scalar}")
+
          # pso minimizes by default, so we return the negative value to maximize
          if self.task == "maximization":
              return -scalar
@@ -68,33 +65,22 @@ class ObjectiveFunction():
              return scalar

      def add_noise(self, features_array):
-         noise_range = np.random.uniform(0.95, 1.05, size=features_array.shape)
-         new_feature_values = features_array * noise_range
-         return new_feature_values
-
-     def _handle_hybrid(self, features_array):
-         total_features = features_array.shape[0]
-         if self.binary_features > total_features:
-             raise ValueError("self.binary_features exceeds total number of features.")
-
-         # Handle corner case where all features are binary
-         if self.binary_features == total_features:
-             feat_binary = (features_array > 0.5).astype(int)
-             return feat_binary
-
-         # Normal case: split into continuous and binary parts
-         feat_continuous = features_array[:-self.binary_features]
-         feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
-         new_feature_values = np.concatenate([feat_continuous, feat_binary])
-         return new_feature_values
+         if self.binary_features > 0:
+             split_idx = -self.binary_features
+             cont_part = features_array[:split_idx]
+             bin_part = features_array[split_idx:]
+             noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
+             cont_noised = cont_part * noise
+             return np.concatenate([cont_noised, bin_part])
+         else:
+             noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
+             return features_array * noise

      def check_model(self):
          if isinstance(self.model, ClassifierMixin) or isinstance(self.model, xgb.XGBClassifier) or isinstance(self.model, lgb.LGBMClassifier):
              raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
          if self.model is None:
              raise ValueError("Loaded model is None")
-         if self.scaler is None:
-             raise ValueError("Loaded scaler is None")

      def _get_from_artifact(self, key: str):
          val = self._artifact.get(key)
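
Note: the rewritten `add_noise` jitters only the continuous block and passes the trailing binary features through untouched. A self-contained sketch of the same logic; the sample values are made up:

    import numpy as np

    def add_noise(features_array: np.ndarray, binary_features: int) -> np.ndarray:
        # Multiply continuous features by ~±5% noise; leave the binary tail as-is.
        if binary_features > 0:
            split_idx = -binary_features
            cont_part, bin_part = features_array[:split_idx], features_array[split_idx:]
            noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
            return np.concatenate([cont_part * noise, bin_part])
        noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
        return features_array * noise

    x = np.array([10.0, 20.0, 1.0, 0.0])    # two continuous, two binary
    print(add_noise(x, binary_features=2))  # binary tail [1., 0.] is unchanged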
@@ -105,7 +91,7 @@ class ObjectiveFunction():
          return result

      def __repr__(self):
-         return (f"<ObjectiveFunction(model={type(self.model).__name__}, scaler={type(self.scaler).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
+         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


  def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
@@ -142,11 +128,11 @@ def run_pso(lower_boundaries: list[float],
             auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
-            swarm_size: int=100,
-            max_iterations: int=100,
+            swarm_size: int=200,
+            max_iterations: int=400,
             inequality_constrain_function=None,
-            post_hoc_analysis: Union[int, None]=None,
-            workers: int=5) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
+            post_hoc_analysis: Optional[int]=3,
+            workers: int=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
      """
      Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
@@ -157,7 +143,7 @@ def run_pso(lower_boundaries: list[float],
      upper_boundaries : list[float]
          Upper bounds for each feature in the search space (as many as features expected by the model).
      objective_function : ObjectiveFunction
-         A callable object encapsulating a regression model and its scaler.
+         A callable object encapsulating a tree-based regression model.
      save_results_dir : str
          Directory path to save the results CSV file.
      auto_binary_boundaries : bool
@@ -172,7 +158,7 @@ def run_pso(lower_boundaries: list[float],
          Maximum number of iterations for the optimization algorithm.
      inequality_constrain_function : callable or None, optional
          Optional function defining inequality constraints to be respected by the optimization.
-     post_hoc_analysis : int or None, optional
+     post_hoc_analysis : int or None
          If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
      workers : int
          Number of parallel processes to use.
@@ -191,7 +177,6 @@ def run_pso(lower_boundaries: list[float],
      Notes
      -----
      - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
-     - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
      """
      # Append binary boundaries
      binary_number = objective_function.binary_features
@@ -229,12 +214,15 @@ def run_pso(lower_boundaries: list[float],
          best_features, best_target, *_ = _pso(**arguments)
          # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)

-         # inverse transformation
-         best_features = np.array(best_features).reshape(1, -1)
-         best_features_real = objective_function.scaler.inverse_transform(best_features).flatten() # type: ignore
+         # flip best_target if maximization was used
+         if objective_function.task == "maximization":
+             best_target = -best_target
+
+         # threshold binary features
+         best_features_threshold = threshold_binary_values(best_features, binary_number)

          # name features
-         best_features_named = {name: value for name, value in zip(names, best_features_real)}
+         best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
          best_target_named = {target_name: best_target}

          # save results
@@ -248,11 +236,14 @@ def run_pso(lower_boundaries: list[float],
              best_features, best_target, *_ = _pso(**arguments)
              # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)

-             # inverse transformation
-             best_features = np.array(best_features).reshape(1, -1)
-             best_features_real = objective_function.scaler.inverse_transform(best_features).flatten() # type: ignore
+             # flip best_target if maximization was used
+             if objective_function.task == "maximization":
+                 best_target = -best_target
+
+             # threshold binary features
+             best_features_threshold = threshold_binary_values(best_features, binary_number)

-             for i, best_feature in enumerate(best_features_real):
+             for i, best_feature in enumerate(best_features_threshold):
                  all_best_features[i].append(best_feature)
              all_best_targets.append(best_target)

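Note: `_pso` only minimizes, and `ObjectiveFunction` returns the negated prediction when maximizing; 1.4.2 therefore flips the sign back before saving `best_target`, where 1.4.1 wrote out the negated value. A toy illustration of that convention, with `toy_minimize` as a hypothetical stand-in for the PSO core:

    import numpy as np

    def toy_minimize(func, grid):
        # Tiny stand-in for an optimizer that can only minimize.
        values = [func(x) for x in grid]
        i = int(np.argmin(values))
        return grid[i], values[i]

    f = lambda x: -(x - 2.0) ** 2 + 5.0   # maximum of 5.0 at x = 2.0
    grid = np.linspace(0.0, 4.0, 401)

    # Maximize f by minimizing -f, then negate the result before reporting,
    # mirroring the 1.4.2 fix in run_pso.
    best_x, best_neg = toy_minimize(lambda x: -f(x), grid)
    print(best_x, -best_neg)              # -> 2.0 5.0
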
ml_tools/utilities.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
  import os
  from pathlib import Path
  import re
- from typing import Literal
+ from typing import Literal, Union, Sequence


  # Keep track of available tools
@@ -15,7 +15,8 @@ __all__ = [
      "merge_dataframes",
      "save_dataframe",
      "normalize_mixed_list",
-     "sanitize_filename"
+     "sanitize_filename",
+     "threshold_binary_values"
  ]

@@ -263,6 +264,38 @@ def sanitize_filename(filename: str) -> str:
      return sanitized


+ def threshold_binary_values(
+         input_array: Union[Sequence[float], np.ndarray],
+         binary_features: int
+     ) -> np.ndarray:
+     """
+     Thresholds binary features in a 1D numeric sequence. Binary features must be located at the end of the sequence.
+
+     Converts binary elements to values (0 or 1) using a threshold of 0.5. The rest of the array (assumed to be continuous features) is returned unchanged.
+
+     Parameters:
+         input_array (Union[Sequence[float], np.ndarray]) : A one-dimensional collection of numeric values. The binary features must be located at the end of the array.
+
+         binary_features (int) : Number of binary features to threshold from the end of the array. Must be between 0 and the total number of elements.
+
+     Returns:
+         np.ndarray : A 1D NumPy array where the final `binary_features` values have been binarized.
+     """
+     array = np.asarray(input_array).flatten()
+     total = array.shape[0]
+
+     if binary_features < 0 or binary_features > total:
+         raise ValueError("Binary features must be between 0 and the total number of features.")
+
+     if binary_features == 0:
+         return array
+
+     cont_part = array[:-binary_features]
+     bin_part = (array[-binary_features:] > 0.5).astype(int)
+
+     return np.concatenate([cont_part, bin_part])
+
+
  def _script_info(all_data: list[str]):
      """
      List available names.
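
Note: a quick usage sketch of the new `threshold_binary_values` utility; the sample values are made up:

    import numpy as np
    from ml_tools.utilities import threshold_binary_values

    x = np.array([3.7, 0.42, 0.81, 0.33])  # two continuous values, then two binary scores
    print(threshold_binary_values(x, binary_features=2))
    # -> [3.7  0.42 1.   0.  ]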