dragon-ml-toolbox 3.9.1__tar.gz → 3.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (32)
  1. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/LICENSE-THIRD-PARTY.md +0 -1
  2. {dragon_ml_toolbox-3.9.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.10.1}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  4. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -1
  5. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/GUI_tools.py +62 -11
  6. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/ML_callbacks.py +2 -1
  7. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/ML_trainer.py +2 -1
  8. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/ML_tutorial.py +1 -1
  9. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/PSO_optimization.py +5 -4
  10. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/ensemble_learning.py +87 -3
  11. dragon_ml_toolbox-3.10.1/ml_tools/keys.py +25 -0
  12. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/utilities.py +0 -16
  13. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/pyproject.toml +1 -1
  14. dragon_ml_toolbox-3.9.1/ml_tools/_particle_swarm_optimization.py +0 -539
  15. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/LICENSE +0 -0
  16. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/README.md +0 -0
  17. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  18. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  19. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  20. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/ETL_engineering.py +0 -0
  21. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/MICE_imputation.py +0 -0
  22. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/ML_evaluation.py +0 -0
  23. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/RNN_forecast.py +0 -0
  24. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/VIF_factor.py +0 -0
  25. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/__init__.py +0 -0
  26. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/_pytorch_models.py +0 -0
  27. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/data_exploration.py +0 -0
  28. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/datasetmaster.py +0 -0
  29. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/handle_excel.py +0 -0
  30. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/logger.py +0 -0
  31. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/ml_tools/path_manager.py +0 -0
  32. {dragon_ml_toolbox-3.9.1 → dragon_ml_toolbox-3.10.1}/setup.cfg +0 -0
@@ -25,5 +25,4 @@ This project depends on the following third-party packages. Each is governed by
  - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
  - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
- - [pyswarm](https://pythonhosted.org/pyswarm/#license)
  - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 3.9.1
+ Version: 3.10.1
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 3.9.1
+ Version: 3.10.1
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -18,12 +18,12 @@ ml_tools/PSO_optimization.py
  ml_tools/RNN_forecast.py
  ml_tools/VIF_factor.py
  ml_tools/__init__.py
- ml_tools/_particle_swarm_optimization.py
  ml_tools/_pytorch_models.py
  ml_tools/data_exploration.py
  ml_tools/datasetmaster.py
  ml_tools/ensemble_learning.py
  ml_tools/handle_excel.py
+ ml_tools/keys.py
  ml_tools/logger.py
  ml_tools/path_manager.py
  ml_tools/utilities.py
@@ -389,23 +389,73 @@ class BaseFeatureHandler(ABC):
 
          Should return a dictionary mapping each GUI input name to its type ('continuous' or 'categorical').
 
+         _Example:_
          ```python
-         #Example:
-         {'temperature': 'continuous', 'material_type': 'categorical'}
+         {
+             'Temperature': 'continuous',
+             'Material Type': 'categorical'
+         }
+         ```
+         """
+         pass
+ 
+     @property
+     @abstractmethod
+     def map_gui_to_real(self) -> Dict[str,str]:
+         """
+         Must be implemented by the subclass.
+ 
+         Should return a dictionary mapping each GUI continuous feature name to its expected model feature name.
+ 
+         _Example:_
+         ```python
+         {
+             'Temperature (K)': 'temperature_k',
+             'Pressure (Pa)': 'pressure_pa'
+         }
          ```
          """
          pass
 
      @abstractmethod
-     def process_categorical(self, feature_name: str, chosen_value: Any) -> Dict[str, float]:
+     def process_categorical(self, gui_feature_name: str, chosen_value: Any) -> Dict[str, float]:
          """
          Must be implemented by the subclass.
 
-         Should take a GUI categorical feature name and its chosen value, and return a dictionary mapping the one-hot-encoded feature names to their
+         Should take a GUI categorical feature name and its chosen value, and return a dictionary mapping the one-hot-encoded/binary real feature names to their
          float values (as expected by the inference model).
+ 
+         _Example:_
+         ```python
+         # GUI input: "Material Type"
+         # GUI values: "Steel", "Aluminum", "Titanium"
+         {
+             "is_steel": 0,
+             "is_aluminum": 1,
+             "is_titanium": 0,
+         }
+         ```
          """
          pass
- 
+ 
+     def _process_continuous(self, gui_feature_name: str, chosen_value: Any) -> Tuple[str, float]:
+         """
+         Maps GUI names to model expected names and casts the value to float.
+ 
+         Should not be overridden by subclasses.
+         """
+         try:
+             real_name = self.map_gui_to_real[gui_feature_name]
+             float_value = float(chosen_value)
+         except KeyError as e:
+             _LOGGER.error(f"No matching name for '{gui_feature_name}'. Check the 'map_gui_to_real' implementation.")
+             raise e
+         except (ValueError, TypeError) as e2:
+             _LOGGER.error(f"Invalid number conversion for '{chosen_value}' of '{gui_feature_name}'.")
+             raise e2
+         else:
+             return real_name, float_value
+ 
      def __call__(self, window_values: Dict[str, Any]) -> np.ndarray:
          """
          Performs the full vector preparation, returning a 1D numpy array.
@@ -416,16 +466,17 @@ class BaseFeatureHandler(ABC):
          processed_features: Dict[str, float] = {}
          for gui_name, feature_type in self.gui_input_map.items():
              chosen_value = window_values.get(gui_name)
- 
+ 
+             # value validation
              if chosen_value is None or str(chosen_value) == '':
                  raise ValueError(f"GUI input '{gui_name}' is missing a value.")
 
+             # process continuous
              if feature_type == 'continuous':
-                 try:
-                     processed_features[gui_name] = float(chosen_value)
-                 except (ValueError, TypeError):
-                     raise ValueError(f"Invalid number '{chosen_value}' for '{gui_name}'.")
- 
+                 mapped_name, float_value = self._process_continuous(gui_name, chosen_value)
+                 processed_features[mapped_name] = float_value
+ 
+             # process categorical
              elif feature_type == 'categorical':
                  feature_dict = self.process_categorical(gui_name, chosen_value)
                  processed_features.update(feature_dict)
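
Taken together, the 3.10.1 contract asks a subclass for three things: a `gui_input_map`, a `map_gui_to_real` table for continuous inputs, and a `process_categorical` expansion. A minimal sketch of a conforming subclass follows; the class name, features, and categories are hypothetical, and any constructor arguments or abstract members outside this hunk are not shown in the diff:

```python
from typing import Any, Dict
from ml_tools.GUI_tools import BaseFeatureHandler

class AlloyFeatureHandler(BaseFeatureHandler):
    @property
    def gui_input_map(self) -> Dict[str, str]:
        # Every GUI input, tagged 'continuous' or 'categorical'
        return {"Temperature (K)": "continuous", "Material Type": "categorical"}

    @property
    def map_gui_to_real(self) -> Dict[str, str]:
        # Continuous GUI names -> model feature names (used by _process_continuous)
        return {"Temperature (K)": "temperature_k"}

    def process_categorical(self, gui_feature_name: str, chosen_value: Any) -> Dict[str, float]:
        # Expand one GUI choice into the one-hot/binary columns the model expects
        one_hot = {"Steel": "is_steel", "Aluminum": "is_aluminum", "Titanium": "is_titanium"}
        return {col: float(col == one_hot[chosen_value]) for col in one_hot.values()}
```

With such a handler, calling `handler(window_values)` runs the `__call__` pipeline above: validate each GUI value, route continuous inputs through `_process_continuous`, and merge the dictionary returned by `process_categorical`.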
@@ -1,7 +1,8 @@
  import numpy as np
  import torch
  from tqdm.auto import tqdm
- from .utilities import make_fullpath, LogKeys
+ from .utilities import make_fullpath
+ from .keys import LogKeys
  from .logger import _LOGGER
  from typing import Optional
 
@@ -7,7 +7,8 @@ import numpy as np
 
  from .ML_callbacks import Callback, History, TqdmProgressBar
  from .ML_evaluation import classification_metrics, regression_metrics, plot_losses, shap_summary_plot
- from .utilities import _script_info, LogKeys
+ from .utilities import _script_info
+ from .keys import LogKeys
  from .logger import _LOGGER
 
 
@@ -25,7 +25,7 @@ def _get_notebook_content(kind: str):
          "# Import from dragon_ml_toolbox\n",
          "from ml_tools.ML_trainer import MyTrainer\n",
          "from ml_tools.ML_callbacks import EarlyStopping, ModelCheckpoint"
-         "from ml_tools.utilities import LogKeys"
+         "from ml_tools.keys import LogKeys"
      ]
  }
 
@@ -23,6 +23,7 @@ from tqdm import trange
  import matplotlib.pyplot as plt
  import seaborn as sns
  from .logger import _LOGGER
+ from .keys import ModelSaveKeys
 
 
  __all__ = [
@@ -55,9 +56,9 @@ class ObjectiveFunction():
          self.is_hybrid = False if binary_features <= 0 else True
          self.use_noise = add_noise
          self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
-         self.model = self._get_from_artifact('model')
-         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
-         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
+         self.model = self._get_from_artifact(ModelSaveKeys.MODEL)
+         self.feature_names: Optional[list[str]] = self._get_from_artifact(ModelSaveKeys.FEATURES) # type: ignore
+         self.target_name: Optional[str] = self._get_from_artifact(ModelSaveKeys.TARGET) # type: ignore
          self.task = task
          self.check_model() # check for classification models and None values
 
@@ -133,7 +134,7 @@ class ObjectiveFunction():
          if self._artifact is None:
              raise TypeError("Load model error")
          val = self._artifact.get(key)
-         if key == "feature_names":
+         if key == ModelSaveKeys.FEATURES:
              result = val if isinstance(val, list) and val else None
          else:
              result = val if val else None
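
The refactor is a drop-in rename at the call sites: the `ModelSaveKeys` constants (defined in the new `ml_tools/keys.py`, shown below) carry the exact string values the module used as literals in 3.9.1, so artifacts serialized by the previous release still load. For illustration:

```python
from ml_tools.keys import ModelSaveKeys

# The constants preserve the 3.9.1 literal keys, so old joblib artifacts
# ({'model': ..., 'feature_names': ..., 'target_name': ...}) still resolve.
assert ModelSaveKeys.MODEL == "model"
assert ModelSaveKeys.FEATURES == "feature_names"
assert ModelSaveKeys.TARGET == "target_name"
```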
@@ -1,5 +1,6 @@
  import pandas as pd
  import numpy as np
+ import json
  import seaborn as sns
  import matplotlib.pyplot as plt
  from matplotlib.colors import Colormap
@@ -20,6 +21,7 @@ from sklearn.metrics import accuracy_score, classification_report, ConfusionMatr
  import shap
 
  from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object, make_fullpath, list_files_by_extension, deserialize_object
+ from .keys import ModelSaveKeys
  from .logger import _LOGGER
 
  import warnings # Ignore warnings
@@ -39,7 +41,8 @@ __all__ = [
      "get_shap_values",
      "train_test_pipeline",
      "run_ensemble_pipeline",
-     "InferenceHandler"
+     "InferenceHandler",
+     "model_report"
  ]
 
  ## Type aliases
@@ -487,8 +490,10 @@ def _save_model(trained_model, model_name: str, target_name:str, feature_names:
      #Sanitize filenames to save
      sanitized_target_name = sanitize_filename(target_name)
      filename = f"{model_name}_{sanitized_target_name}"
-     to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
- 
+     to_save = {ModelSaveKeys.MODEL: trained_model,
+                ModelSaveKeys.FEATURES: feature_names,
+                ModelSaveKeys.TARGET: target_name}
+ 
      serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
  # function to evaluate the model and save metrics (Classification)
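
For orientation, this is the artifact shape `_save_model` now writes and that downstream consumers (`ObjectiveFunction`, `model_report`) read back. A sketch of the round trip, assuming `serialize_object` appends a `.joblib` suffix (as the `.joblib` loaders elsewhere in this diff suggest) and that the target directory already exists:

```python
from ml_tools.keys import ModelSaveKeys
from ml_tools.utilities import serialize_object, deserialize_object

to_save = {
    ModelSaveKeys.MODEL: object(),  # stand-in for a fitted estimator
    ModelSaveKeys.FEATURES: ["temperature_k", "pressure_pa"],
    ModelSaveKeys.TARGET: "strength",
}
serialize_object(obj=to_save, save_dir=".", filename="demo_strength", verbose=False, raise_on_error=True)
artifact: dict = deserialize_object("demo_strength.joblib")  # type: ignore
assert artifact[ModelSaveKeys.TARGET] == "strength"
```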
@@ -1057,5 +1062,84 @@ class InferenceHandler:
          return results
 
 
+ ###### 7. Save Model info report ######
+ def model_report(
+     model_path: Union[str,Path],
+     output_dir: Optional[Union[str,Path]] = None,
+     verbose: bool = True
+ ) -> Dict[str, Any]:
+     """
+     Deserializes a model and generates a summary report.
+ 
+     This function loads a serialized model object (joblib), prints a summary to the
+     console (if verbose), and saves a detailed JSON report.
+ 
+     Args:
+         model_path (str): The path to the serialized model file.
+         output_dir (str, optional): Directory to save the JSON report.
+             If None, it defaults to the same directory as the model file.
+         verbose (bool, optional): If True, prints summary information
+             to the console. Defaults to True.
+ 
+     Returns:
+         (Dict[str, Any]): A dictionary containing the model metadata.
+ 
+     Raises:
+         FileNotFoundError: If the model_path does not exist.
+         KeyError: If the deserialized object is missing required keys from `ModelSaveKeys`.
+     """
+     # 1. Convert to Path object
+     model_p = make_fullpath(model_path)
+ 
+     # --- 2. Deserialize and Extract Info ---
+     try:
+         full_object: dict = deserialize_object(model_p) # type: ignore
+         model = full_object[ModelSaveKeys.MODEL]
+         target = full_object[ModelSaveKeys.TARGET]
+         features = full_object[ModelSaveKeys.FEATURES]
+     except FileNotFoundError:
+         _LOGGER.error(f"❌ Model file not found at '{model_p}'")
+         raise
+     except (KeyError, TypeError) as e:
+         _LOGGER.error(
+             f"❌ The serialized object is missing required keys '{ModelSaveKeys.MODEL}', '{ModelSaveKeys.TARGET}', '{ModelSaveKeys.FEATURES}'"
+         )
+         raise e
+ 
+     # --- 3. Print Summary to Console (if verbose) ---
+     if verbose:
+         print("\n--- 📝 Model Summary ---")
+         print(f"Source File: {model_p.name}")
+         print(f"Model Type: {type(model).__name__}")
+         print(f"Target: {target}")
+         print(f"Feature Count: {len(features)}")
+         print("-----------------------")
+ 
+     # --- 4. Generate JSON Report ---
+     report_data = {
+         "source_file": model_p.name,
+         "model_type": str(type(model)),
+         "target_name": target,
+         "feature_count": len(features),
+         "feature_names": features
+     }
+ 
+     # Determine output path
+     output_p = make_fullpath(output_dir, make=True) if output_dir else model_p.parent
+     json_filename = model_p.stem + "_info.json"
+     json_filepath = output_p / json_filename
+ 
+     try:
+         with open(json_filepath, 'w') as f:
+             json.dump(report_data, f, indent=4)
+         if verbose:
+             _LOGGER.info(f"✅ JSON report saved to: '{json_filepath}'")
+     except PermissionError:
+         _LOGGER.error(f"❌ Permission denied to write JSON report at '{json_filepath}'")
+ 
+     # --- 5. Return the extracted data ---
+     return report_data
+ 
+ 
  def info():
      _script_info(__all__)
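
Usage is a single call; the paths here are hypothetical:

```python
from ml_tools.ensemble_learning import model_report

# Prints the console summary, writes '<model stem>_info.json' next to the
# model (or into output_dir when given), and returns the metadata dict.
report = model_report(
    model_path="outputs/XGBRegressor_strength.joblib",
    output_dir="reports",
)
print(report["feature_count"], report["feature_names"][:3])
```

The returned dict mirrors the JSON report: `source_file`, `model_type`, `target_name`, `feature_count`, and `feature_names`.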
@@ -0,0 +1,25 @@
+ class LogKeys:
+     """
+     Used internally for ML scripts module.
+ 
+     Centralized keys for logging and history.
+     """
+     # --- Epoch Level ---
+     TRAIN_LOSS = 'train_loss'
+     VAL_LOSS = 'val_loss'
+ 
+     # --- Batch Level ---
+     BATCH_LOSS = 'loss'
+     BATCH_INDEX = 'batch'
+     BATCH_SIZE = 'size'
+ 
+ 
+ class ModelSaveKeys:
+     """
+     Used internally for ensemble_learning module.
+ 
+     Keys used for serializing a trained model metadata.
+     """
+     MODEL = "model"
+     FEATURES = "feature_names"
+     TARGET = "target_name"
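
Both classes are plain namespaces of string constants, so call sites compare and index with attributes instead of raw literals. A sketch of reading loss curves out of a history dict shaped the way these keys imply (the exact `History` payload is not part of this diff):

```python
from ml_tools.keys import LogKeys

history = {
    LogKeys.TRAIN_LOSS: [0.92, 0.61, 0.44],  # per-epoch values
    LogKeys.VAL_LOSS: [1.01, 0.73, 0.55],
}
# Index of the epoch with the lowest validation loss
best_epoch = min(range(len(history[LogKeys.VAL_LOSS])), key=history[LogKeys.VAL_LOSS].__getitem__)
print(f"best val_loss {history[LogKeys.VAL_LOSS][best_epoch]:.2f} at epoch {best_epoch}")
```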
@@ -643,22 +643,6 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
      print(f"\n✅ {total_saved} single-target datasets were created.")
 
 
- class LogKeys:
-     """
-     Used internally for ML scripts.
- 
-     Centralized keys for logging and history.
-     """
-     # --- Epoch Level ---
-     TRAIN_LOSS = 'train_loss'
-     VAL_LOSS = 'val_loss'
- 
-     # --- Batch Level ---
-     BATCH_LOSS = 'loss'
-     BATCH_INDEX = 'batch'
-     BATCH_SIZE = 'size'
- 
- 
  def _script_info(all_data: list[str]):
      """
      List available names.
@@ -1,6 +1,6 @@
  [project]
  name = "dragon-ml-toolbox"
- version = "3.9.1"
+ version = "3.10.1"
  description = "A collection of tools for data science and machine learning projects."
  authors = [
      { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -1,539 +0,0 @@
- """
- DEPRECATED
- """
- 
- 
- import numpy as np
- import os
- import xgboost as xgb
- import lightgbm as lgb
- from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
- from sklearn.base import ClassifierMixin
- from typing import Literal, Union, Tuple, Dict, Optional
- import polars as pl
- from functools import partial
- from copy import deepcopy
- from .utilities import sanitize_filename, _script_info, threshold_binary_values, deserialize_object, list_files_by_extension
- 
- 
- __all__ = [
-     "ObjectiveFunction",
-     "multiple_objective_functions_from_dir",
-     "run_pso"
- ]
- 
- 
- class ObjectiveFunction():
-     """
-     Callable objective function designed for optimizing continuous outputs from tree-based regression models.
- 
-     The target serialized file (joblib) must include a trained tree-based 'model'. Additionally 'feature_names' and 'target_name' will be parsed if present.
- 
-     Parameters
-     ----------
-     trained_model_path : str
-         Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
-     add_noise : bool
-         Whether to apply multiplicative noise to the input features during evaluation.
-     task : (Literal["maximization", "minimization"])
-         Whether to maximize or minimize the target.
-     binary_features : int
-         Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
-     """
-     def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
-         self.binary_features = binary_features
-         self.is_hybrid = False if binary_features <= 0 else True
-         self.use_noise = add_noise
-         self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
-         self.model = self._get_from_artifact('model')
-         self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names') # type: ignore
-         self.target_name: Optional[str] = self._get_from_artifact('target_name') # type: ignore
-         self.task = task
-         self.check_model() # check for classification models and None values
- 
-     def __call__(self, features_array: np.ndarray) -> float:
-         if self.use_noise:
-             features_array = self.add_noise(features_array)
-         if self.is_hybrid:
-             features_array = threshold_binary_values(input_array=features_array, binary_values=self.binary_features) # type: ignore
- 
-         if features_array.ndim == 1:
-             features_array = features_array.reshape(1, -1)
- 
-         result = self.model.predict(features_array) # type: ignore
-         scalar = result.item()
-         # print(f"[DEBUG] Model predicted: {scalar}")
- 
-         # pso minimizes by default, so we return the negative value to maximize
-         if self.task == "maximization":
-             return -scalar
-         else:
-             return scalar
- 
-     def add_noise(self, features_array):
-         if self.binary_features > 0:
-             split_idx = -self.binary_features
-             cont_part = features_array[:split_idx]
-             bin_part = features_array[split_idx:]
-             noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
-             cont_noised = cont_part * noise
-             return np.concatenate([cont_noised, bin_part])
-         else:
-             noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
-             return features_array * noise
- 
-     def check_model(self):
-         if isinstance(self.model, ClassifierMixin) or isinstance(self.model, xgb.XGBClassifier) or isinstance(self.model, lgb.LGBMClassifier):
-             raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")
-         if self.model is None:
-             raise ValueError("Loaded model is None")
- 
-     def _get_from_artifact(self, key: str):
-         if self._artifact is None:
-             raise TypeError("Load model error")
-         val = self._artifact.get(key)
-         if key == "feature_names":
-             result = val if isinstance(val, list) and val else None
-         else:
-             result = val if val else None
-         return result
- 
-     def __repr__(self):
-         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
- 
- 
- def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
-     """
-     Loads multiple objective functions from serialized models in the given directory.
- 
-     Each `.joblib` file which is loaded and wrapped as an `ObjectiveFunction` instance. Returns a list of such instances along with their corresponding names.
- 
-     Parameters:
-         directory (str) : Path to the directory containing `.joblib` files (serialized models).
-         add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
-         task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
-         binary_features (int) : Number of binary features expected by each objective function.
- 
-     Returns:
-         (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
-             - list of `ObjectiveFunction` instances.
-             - list of corresponding filenames.
-     """
-     objective_functions = list()
-     objective_function_names = list()
-     for file_name, file_path in list_files_by_extension(directory=directory, extension='joblib').items():
-         current_objective = ObjectiveFunction(trained_model_path=file_path,
-                                               add_noise=add_noise,
-                                               task=task,
-                                               binary_features=binary_features)
-         objective_functions.append(current_objective)
-         objective_function_names.append(file_name)
-     return objective_functions, objective_function_names
- 
- 
- def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
-     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
-     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
-     lower = np.array(lower_boundaries)
-     upper = np.array(upper_boundaries)
-     return lower, upper
- 
- 
- def _set_feature_names(size: int, names: Union[list[str], None]):
-     if names is None:
-         return [str(i) for i in range(1, size+1)]
-     else:
-         assert len(names) == size, "List with feature names do not match the number of features"
-         return names
- 
- 
- def _save_results(*dicts, save_dir: str, target_name: str):
-     combined_dict = dict()
-     for single_dict in dicts:
-         combined_dict.update(single_dict)
- 
-     sanitized_target_name = sanitize_filename(target_name)
- 
-     full_path = os.path.join(save_dir, f"Optimization_{sanitized_target_name}.csv")
-     pl.DataFrame(combined_dict).write_csv(full_path)
- 
- 
- def run_pso(lower_boundaries: list[float],
-             upper_boundaries: list[float],
-             objective_function: ObjectiveFunction,
-             save_results_dir: str,
-             auto_binary_boundaries: bool=True,
-             target_name: Union[str, None]=None,
-             feature_names: Union[list[str], None]=None,
-             swarm_size: int=200,
-             max_iterations: int=1000,
-             inequality_constrain_function=None,
-             post_hoc_analysis: Optional[int]=3,
-             workers: int=1) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
-     """
-     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
- 
-     Parameters
-     ----------
-     lower_boundaries : list[float]
-         Lower bounds for each feature in the search space (as many as features expected by the model).
-     upper_boundaries : list[float]
-         Upper bounds for each feature in the search space (as many as features expected by the model).
-     objective_function : ObjectiveFunction
-         A callable object encapsulating a tree-based regression model.
-     save_results_dir : str
-         Directory path to save the results CSV file.
-     auto_binary_boundaries : bool
-         Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
-     target_name : str or None, optional
-         Name of the target variable. If None, attempts to retrieve from the ObjectiveFunction object.
-     feature_names : list[str] or None, optional
-         List of feature names. If None, attempts to retrieve from the ObjectiveFunction or generate generic names.
-     swarm_size : int, default=100
-         Number of particles in the swarm.
-     max_iterations : int, default=100
-         Maximum number of iterations for the optimization algorithm.
-     inequality_constrain_function : callable or None, optional
-         Optional function defining inequality constraints to be respected by the optimization.
-     post_hoc_analysis : int or None
-         If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
-     workers : int
-         Number of parallel processes to use.
- 
-     Returns
-     -------
-     Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
-         If `post_hoc_analysis` is None, returns two dictionaries:
-             - feature_names: Feature values (after inverse scaling) that yield the best result.
-             - target_name: Best result obtained for the target variable.
- 
-         If `post_hoc_analysis` is an integer, returns two dictionaries:
-             - feature_names: Lists of best feature values (after inverse scaling) for each repetition.
-             - target_name: List of best target values across repetitions.
- 
-     Notes
-     -----
-     - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
-     """
-     # set local deep copies to prevent in place list modification
-     local_lower_boundaries = deepcopy(lower_boundaries)
-     local_upper_boundaries = deepcopy(upper_boundaries)
- 
-     # Append binary boundaries
-     binary_number = objective_function.binary_features
-     if auto_binary_boundaries and binary_number > 0:
-         local_lower_boundaries.extend([0] * binary_number)
-         local_upper_boundaries.extend([1] * binary_number)
- 
-     # Set the total length of features
-     size_of_features = len(local_lower_boundaries)
- 
-     lower, upper = _set_boundaries(local_lower_boundaries, local_upper_boundaries)
- 
-     # feature names
-     if feature_names is None and objective_function.feature_names is not None:
-         feature_names = objective_function.feature_names
-     names = _set_feature_names(size=size_of_features, names=feature_names)
- 
-     # target name
-     if target_name is None and objective_function.target_name is not None:
-         target_name = objective_function.target_name
-     if target_name is None:
-         target_name = "Target"
- 
-     arguments = {
-         "func":objective_function,
-         "lb": lower,
-         "ub": upper,
-         "f_ieqcons": inequality_constrain_function,
-         "swarmsize": swarm_size,
-         "maxiter": max_iterations,
-         "processes": workers,
-         "particle_output": False
-     }
- 
-     os.makedirs(save_results_dir, exist_ok=True)
- 
-     if post_hoc_analysis is None or post_hoc_analysis == 1:
-         best_features, best_target, *_ = _pso(**arguments)
-         # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
- 
-         # flip best_target if maximization was used
-         if objective_function.task == "maximization":
-             best_target = -best_target
- 
-         # threshold binary features
-         best_features_threshold = threshold_binary_values(best_features, binary_number)
- 
-         # name features
-         best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
-         best_target_named = {target_name: best_target}
- 
-         # save results
-         _save_results(best_features_named, best_target_named, save_dir=save_results_dir, target_name=target_name)
- 
-         return best_features_named, best_target_named
-     else:
-         all_best_targets = list()
-         all_best_features = [[] for _ in range(size_of_features)]
-         for _ in range(post_hoc_analysis):
-             best_features, best_target, *_ = _pso(**arguments)
-             # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)
- 
-             # flip best_target if maximization was used
-             if objective_function.task == "maximization":
-                 best_target = -best_target
- 
-             # threshold binary features
-             best_features_threshold = threshold_binary_values(best_features, binary_number)
- 
-             for i, best_feature in enumerate(best_features_threshold):
-                 all_best_features[i].append(best_feature)
-             all_best_targets.append(best_target)
- 
-         # name features
-         all_best_features_named = {name: list_values for name, list_values in zip(names, all_best_features)}
-         all_best_targets_named = {target_name: all_best_targets}
- 
-         # save results
-         _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_dir, target_name=target_name)
- 
-         return all_best_features_named, all_best_targets_named # type: ignore
- 
- 
- def info():
-     _script_info(__all__)
- 
- 
- ### SOURCE CODE FOR PSO FROM PYSWARM ###
- def _obj_wrapper(func, args, kwargs, x):
-     return func(x, *args, **kwargs)
- 
- def _is_feasible_wrapper(func, x):
-     return np.all(func(x)>=0)
- 
- def _cons_none_wrapper(x):
-     return np.array([0])
- 
- def _cons_ieqcons_wrapper(ieqcons, args, kwargs, x):
-     return np.array([y(x, *args, **kwargs) for y in ieqcons])
- 
- def _cons_f_ieqcons_wrapper(f_ieqcons, args, kwargs, x):
-     return np.array(f_ieqcons(x, *args, **kwargs))
- 
- def _pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
-          swarmsize=100, omega=0.5, phip=0.5, phig=0.5, maxiter=100,
-          minstep=1e-8, minfunc=1e-8, debug=False, processes=1,
-          particle_output=False):
-     """
-     Perform a particle swarm optimization (PSO)
- 
-     Parameters
-     ==========
-     func : function
-         The function to be minimized
-     lb : array
-         The lower bounds of the design variable(s)
-     ub : array
-         The upper bounds of the design variable(s)
- 
-     Optional
-     ========
-     ieqcons : list
-         A list of functions of length n such that ieqcons[j](x,*args) >= 0.0 in
-         a successfully optimized problem (Default: [])
-     f_ieqcons : function
-         Returns a 1-D array in which each element must be greater or equal
-         to 0.0 in a successfully optimized problem. If f_ieqcons is specified,
-         ieqcons is ignored (Default: None)
-     args : tuple
-         Additional arguments passed to objective and constraint functions
-         (Default: empty tuple)
-     kwargs : dict
-         Additional keyword arguments passed to objective and constraint
-         functions (Default: empty dict)
-     swarmsize : int
-         The number of particles in the swarm (Default: 100)
-     omega : scalar
-         Particle velocity scaling factor (Default: 0.5)
-     phip : scalar
-         Scaling factor to search away from the particle's best known position
-         (Default: 0.5)
-     phig : scalar
-         Scaling factor to search away from the swarm's best known position
-         (Default: 0.5)
-     maxiter : int
-         The maximum number of iterations for the swarm to search (Default: 100)
-     minstep : scalar
-         The minimum stepsize of swarm's best position before the search
-         terminates (Default: 1e-8)
-     minfunc : scalar
-         The minimum change of swarm's best objective value before the search
-         terminates (Default: 1e-8)
-     debug : boolean
-         If True, progress statements will be displayed every iteration
-         (Default: False)
-     processes : int
-         The number of processes to use to evaluate objective function and
-         constraints (default: 1)
-     particle_output : boolean
-         Whether to include the best per-particle position and the objective
-         values at those.
- 
-     Returns
-     =======
-     g : array
-         The swarm's best known position (optimal design)
-     f : scalar
-         The objective value at ``g``
-     p : array
-         The best known position per particle
-     pf: arrray
-         The objective values at each position in p
- 
-     """
- 
-     assert len(lb)==len(ub), 'Lower- and upper-bounds must be the same length'
-     assert hasattr(func, '__call__'), 'Invalid function handle'
-     lb = np.array(lb)
-     ub = np.array(ub)
-     assert np.all(ub>lb), 'All upper-bound values must be greater than lower-bound values'
- 
-     vhigh = np.abs(ub - lb)
-     vlow = -vhigh
- 
-     # Initialize objective function
-     obj = partial(_obj_wrapper, func, args, kwargs)
- 
-     # Check for constraint function(s) #########################################
-     if f_ieqcons is None:
-         if not len(ieqcons):
-             if debug:
-                 print('No constraints given.')
-             cons = _cons_none_wrapper
-         else:
-             if debug:
-                 print('Converting ieqcons to a single constraint function')
-             cons = partial(_cons_ieqcons_wrapper, ieqcons, args, kwargs)
-     else:
-         if debug:
-             print('Single constraint function given in f_ieqcons')
-         cons = partial(_cons_f_ieqcons_wrapper, f_ieqcons, args, kwargs)
-     is_feasible = partial(_is_feasible_wrapper, cons)
- 
-     # Initialize the multiprocessing module if necessary
-     if processes > 1:
-         import multiprocessing
-         mp_pool = multiprocessing.Pool(processes)
- 
-     # Initialize the particle swarm ############################################
-     S = swarmsize
-     D = len(lb)  # the number of dimensions each particle has
-     x = np.random.rand(S, D)  # particle positions
-     v = np.zeros_like(x)  # particle velocities
-     p = np.zeros_like(x)  # best particle positions
-     fx = np.zeros(S)  # current particle function values
-     fs = np.zeros(S, dtype=bool)  # feasibility of each particle
-     fp = np.ones(S)*np.inf  # best particle function values
-     g = []  # best swarm position
-     fg = np.inf  # best swarm position starting value
- 
-     # Initialize the particle's position
-     x = lb + x*(ub - lb)
- 
-     # Calculate objective and constraints for each particle
-     if processes > 1:
-         fx = np.array(mp_pool.map(obj, x))
-         fs = np.array(mp_pool.map(is_feasible, x))
-     else:
-         for i in range(S):
-             fx[i] = obj(x[i, :])
-             fs[i] = is_feasible(x[i, :])
- 
-     # Store particle's best position (if constraints are satisfied)
-     i_update = np.logical_and((fx < fp), fs)
-     p[i_update, :] = x[i_update, :].copy()
-     fp[i_update] = fx[i_update]
- 
-     # Update swarm's best position
-     i_min = np.argmin(fp)
-     if fp[i_min] < fg:
-         fg = fp[i_min]
-         g = p[i_min, :].copy()
-     else:
-         # At the start, there may not be any feasible starting point, so just
-         # give it a temporary "best" point since it's likely to change
-         g = x[0, :].copy()
- 
-     # Initialize the particle's velocity
-     v = vlow + np.random.rand(S, D)*(vhigh - vlow)
- 
-     # Iterate until termination criterion met ##################################
-     it = 1
-     while it <= maxiter:
-         rp = np.random.uniform(size=(S, D))
-         rg = np.random.uniform(size=(S, D))
- 
-         # Update the particles velocities
-         v = omega*v + phip*rp*(p - x) + phig*rg*(g - x)
-         # Update the particles' positions
-         x = x + v
-         # Correct for bound violations
-         maskl = x < lb
-         masku = x > ub
-         x = x*(~np.logical_or(maskl, masku)) + lb*maskl + ub*masku
- 
-         # Update objectives and constraints
-         if processes > 1:
-             fx = np.array(mp_pool.map(obj, x))
-             fs = np.array(mp_pool.map(is_feasible, x))
-         else:
-             for i in range(S):
-                 fx[i] = obj(x[i, :])
-                 fs[i] = is_feasible(x[i, :])
- 
-         # Store particle's best position (if constraints are satisfied)
-         i_update = np.logical_and((fx < fp), fs)
-         p[i_update, :] = x[i_update, :].copy()
-         fp[i_update] = fx[i_update]
- 
-         # Compare swarm's best position with global best position
-         i_min = np.argmin(fp)
-         if fp[i_min] < fg:
-             if debug:
-                 print('New best for swarm at iteration {:}: {:} {:}'\
-                     .format(it, p[i_min, :], fp[i_min]))
- 
-             p_min = p[i_min, :].copy()
-             stepsize = np.sqrt(np.sum((g - p_min)**2))
- 
-             if np.abs(fg - fp[i_min]) <= minfunc:
-                 print('Stopping search: Swarm best objective change less than {:}'\
-                     .format(minfunc))
-                 if particle_output:
-                     return p_min, fp[i_min], p, fp
-                 else:
-                     return p_min, fp[i_min]
-             elif stepsize <= minstep:
-                 print('Stopping search: Swarm best position change less than {:}'\
-                     .format(minstep))
-                 if particle_output:
-                     return p_min, fp[i_min], p, fp
-                 else:
-                     return p_min, fp[i_min]
-             else:
-                 g = p_min.copy()
-                 fg = fp[i_min]
- 
-         if debug:
-             print('Best after iteration {:}: {:} {:}'.format(it, g, fg))
-         it += 1
- 
-     print('Stopping search: maximum iterations reached --> {:}'.format(maxiter))
- 
-     if not is_feasible(g):
-         print("However, the optimization couldn't find a feasible design. Sorry")
-     if particle_output:
-         return g, fg, p, fp
-     else:
-         return g, fg