dragon-ml-toolbox 1.4.7__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/LICENSE-THIRD-PARTY.md +5 -4
  2. {dragon_ml_toolbox-1.4.7/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.0.0}/PKG-INFO +24 -14
  3. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/README.md +20 -11
  4. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0/dragon_ml_toolbox.egg-info}/PKG-INFO +24 -14
  5. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +2 -1
  6. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/dragon_ml_toolbox.egg-info/requires.txt +2 -1
  7. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/MICE_imputation.py +1 -1
  8. dragon_ml_toolbox-2.0.0/ml_tools/PSO_optimization.py +490 -0
  9. dragon_ml_toolbox-1.4.7/ml_tools/particle_swarm_optimization.py → dragon_ml_toolbox-2.0.0/ml_tools/_particle_swarm_optimization.py +5 -1
  10. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/data_exploration.py +1 -1
  11. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/ensemble_learning.py +0 -1
  12. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/handle_excel.py +1 -1
  13. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/logger.py +1 -1
  14. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/utilities.py +34 -0
  15. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/pyproject.toml +5 -4
  16. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/LICENSE +0 -0
  17. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  18. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  19. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/VIF_factor.py +0 -0
  20. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/__init__.py +0 -0
  21. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/datasetmaster.py +0 -0
  22. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/pytorch_models.py +0 -0
  23. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/trainer.py +0 -0
  24. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/ml_tools/vision_helpers.py +0 -0
  25. {dragon_ml_toolbox-1.4.7 → dragon_ml_toolbox-2.0.0}/setup.cfg +0 -0
@@ -5,10 +5,10 @@ This project depends on the following third-party packages. Each is governed by
5
5
  - [pandas](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
6
6
  - [numpy](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
7
7
  - [matplotlib](https://github.com/matplotlib/matplotlib/blob/main/LICENSE/LICENSE)
8
- - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
8
+ - [seaborn](https://github.com/mwaskom/seaborn/blob/master/LICENSE.md)
9
9
  - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
10
- - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
11
- - [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
10
+ - [ipython](https://github.com/ipython/ipython/blob/main/LICENSE)
11
+ - [ipykernel](https://github.com/ipython/ipykernel/blob/main/LICENSE)
12
12
  - [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
13
13
  - [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
14
14
  - [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
@@ -24,5 +24,6 @@ This project depends on the following third-party packages. Each is governed by
24
24
  - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
25
25
  - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
26
26
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
27
- - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
27
+ - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
28
28
  - [pyswarm](https://pythonhosted.org/pyswarm/#license)
29
+ - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.4.7
3
+ Version: 2.0.0
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -8,7 +8,7 @@ Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
8
8
  Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.9
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE-THIRD-PARTY.md
@@ -32,9 +32,10 @@ Requires-Dist: joblib
32
32
  Requires-Dist: xgboost
33
33
  Requires-Dist: lightgbm<=4.5.0
34
34
  Requires-Dist: shap
35
+ Requires-Dist: tqdm>=4.0
36
+ Requires-Dist: Pillow
35
37
  Provides-Extra: pytorch
36
38
  Requires-Dist: torch; extra == "pytorch"
37
- Requires-Dist: Pillow; extra == "pytorch"
38
39
  Requires-Dist: torchvision; extra == "pytorch"
39
40
  Dynamic: license-file
40
41
 
@@ -49,7 +50,7 @@ A collection of Python utilities for data science and machine learning, structur
49
50
 
50
51
  ## Installation
51
52
 
52
- **Python 3.9+ recommended.**
53
+ **Python 3.10+ recommended.**
53
54
 
54
55
  ### Via PyPI
55
56
 
@@ -59,6 +60,16 @@ Install the latest stable release from PyPI:
59
60
  pip install dragon-ml-tools
60
61
  ```
61
62
 
63
+ ### Via GitHub (Editable)
64
+
65
+ Clone the repository and install in editable mode with optional dependencies:
66
+
67
+ ```bash
68
+ git clone https://github.com/DrAg0n-BoRn/ML_tools.git
69
+ cd ML_tools
70
+ pip install -e .
71
+ ```
72
+
62
73
  ### Via conda-forge
63
74
 
64
75
  Install from the conda-forge channel:
@@ -66,22 +77,21 @@ Install from the conda-forge channel:
66
77
  ```bash
67
78
  conda install -c conda-forge dragon-ml-toolbox
68
79
  ```
80
+ **Note:** This version is outdated or broken due to dependency incompatibilities.
69
81
 
70
- #### Optional dependencies
82
+ ## Optional dependencies
83
+
84
+ **PyTorch** is an optional dependency. It provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
85
+
86
+ Install the default CPU-only version with
71
87
 
72
88
  ```bash
73
89
  pip install dragon-ml-tools[pytorch]
74
90
  ```
75
91
 
76
- ### Via GitHub (Editable)
77
-
78
- Clone the repository and install in editable mode with optional dependencies:
92
+ To make use of GPU acceleration use the official PyTorch installation instructions:
79
93
 
80
- ```bash
81
- git clone https://github.com/DrAg0n-BoRn/ML_tools.git
82
- cd ML_tools
83
- pip install -e .
84
- ```
94
+ [PyTorch Instructions](https://pytorch.org/get-started/locally/)
85
95
 
86
96
  ## Usage
87
97
 
@@ -101,7 +111,7 @@ ensemble_learning
101
111
  handle_excel
102
112
  logger
103
113
  MICE_imputation
104
- particle_swarm_optimization
114
+ PSO_optimization
105
115
  trainer
106
116
  utilities
107
117
  VIF_factor
@@ -9,7 +9,7 @@ A collection of Python utilities for data science and machine learning, structur
9
9
 
10
10
  ## Installation
11
11
 
12
- **Python 3.9+ recommended.**
12
+ **Python 3.10+ recommended.**
13
13
 
14
14
  ### Via PyPI
15
15
 
@@ -19,6 +19,16 @@ Install the latest stable release from PyPI:
19
19
  pip install dragon-ml-tools
20
20
  ```
21
21
 
22
+ ### Via GitHub (Editable)
23
+
24
+ Clone the repository and install in editable mode with optional dependencies:
25
+
26
+ ```bash
27
+ git clone https://github.com/DrAg0n-BoRn/ML_tools.git
28
+ cd ML_tools
29
+ pip install -e .
30
+ ```
31
+
22
32
  ### Via conda-forge
23
33
 
24
34
  Install from the conda-forge channel:
@@ -26,22 +36,21 @@ Install from the conda-forge channel:
26
36
  ```bash
27
37
  conda install -c conda-forge dragon-ml-toolbox
28
38
  ```
39
+ **Note:** This version is outdated or broken due to dependency incompatibilities.
29
40
 
30
- #### Optional dependencies
41
+ ## Optional dependencies
42
+
43
+ **PyTorch** is an optional dependency. It provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
44
+
45
+ Install the default CPU-only version with
31
46
 
32
47
  ```bash
33
48
  pip install dragon-ml-tools[pytorch]
34
49
  ```
35
50
 
36
- ### Via GitHub (Editable)
37
-
38
- Clone the repository and install in editable mode with optional dependencies:
51
+ To make use of GPU acceleration use the official PyTorch installation instructions:
39
52
 
40
- ```bash
41
- git clone https://github.com/DrAg0n-BoRn/ML_tools.git
42
- cd ML_tools
43
- pip install -e .
44
- ```
53
+ [PyTorch Instructions](https://pytorch.org/get-started/locally/)
45
54
 
46
55
  ## Usage
47
56
 
@@ -61,7 +70,7 @@ ensemble_learning
61
70
  handle_excel
62
71
  logger
63
72
  MICE_imputation
64
- particle_swarm_optimization
73
+ PSO_optimization
65
74
  trainer
66
75
  utilities
67
76
  VIF_factor
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 1.4.7
3
+ Version: 2.0.0
4
4
  Summary: A collection of tools for data science and machine learning projects
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -8,7 +8,7 @@ Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
8
8
  Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.9
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE-THIRD-PARTY.md
@@ -32,9 +32,10 @@ Requires-Dist: joblib
32
32
  Requires-Dist: xgboost
33
33
  Requires-Dist: lightgbm<=4.5.0
34
34
  Requires-Dist: shap
35
+ Requires-Dist: tqdm>=4.0
36
+ Requires-Dist: Pillow
35
37
  Provides-Extra: pytorch
36
38
  Requires-Dist: torch; extra == "pytorch"
37
- Requires-Dist: Pillow; extra == "pytorch"
38
39
  Requires-Dist: torchvision; extra == "pytorch"
39
40
  Dynamic: license-file
40
41
 
@@ -49,7 +50,7 @@ A collection of Python utilities for data science and machine learning, structur
49
50
 
50
51
  ## Installation
51
52
 
52
- **Python 3.9+ recommended.**
53
+ **Python 3.10+ recommended.**
53
54
 
54
55
  ### Via PyPI
55
56
 
@@ -59,6 +60,16 @@ Install the latest stable release from PyPI:
59
60
  pip install dragon-ml-tools
60
61
  ```
61
62
 
63
+ ### Via GitHub (Editable)
64
+
65
+ Clone the repository and install in editable mode with optional dependencies:
66
+
67
+ ```bash
68
+ git clone https://github.com/DrAg0n-BoRn/ML_tools.git
69
+ cd ML_tools
70
+ pip install -e .
71
+ ```
72
+
62
73
  ### Via conda-forge
63
74
 
64
75
  Install from the conda-forge channel:
@@ -66,22 +77,21 @@ Install from the conda-forge channel:
66
77
  ```bash
67
78
  conda install -c conda-forge dragon-ml-toolbox
68
79
  ```
80
+ **Note:** This version is outdated or broken due to dependency incompatibilities.
69
81
 
70
- #### Optional dependencies
82
+ ## Optional dependencies
83
+
84
+ **PyTorch** is an optional dependency. It provides different builds depending on the **platform** and **hardware acceleration** (e.g., CUDA for NVIDIA GPUs on Linux/Windows, or MPS for Apple Silicon on macOS).
85
+
86
+ Install the default CPU-only version with
71
87
 
72
88
  ```bash
73
89
  pip install dragon-ml-tools[pytorch]
74
90
  ```
75
91
 
76
- ### Via GitHub (Editable)
77
-
78
- Clone the repository and install in editable mode with optional dependencies:
92
+ To make use of GPU acceleration use the official PyTorch installation instructions:
79
93
 
80
- ```bash
81
- git clone https://github.com/DrAg0n-BoRn/ML_tools.git
82
- cd ML_tools
83
- pip install -e .
84
- ```
94
+ [PyTorch Instructions](https://pytorch.org/get-started/locally/)
85
95
 
86
96
  ## Usage
87
97
 
@@ -101,7 +111,7 @@ ensemble_learning
101
111
  handle_excel
102
112
  logger
103
113
  MICE_imputation
104
- particle_swarm_optimization
114
+ PSO_optimization
105
115
  trainer
106
116
  utilities
107
117
  VIF_factor
@@ -8,14 +8,15 @@ dragon_ml_toolbox.egg-info/dependency_links.txt
8
8
  dragon_ml_toolbox.egg-info/requires.txt
9
9
  dragon_ml_toolbox.egg-info/top_level.txt
10
10
  ml_tools/MICE_imputation.py
11
+ ml_tools/PSO_optimization.py
11
12
  ml_tools/VIF_factor.py
12
13
  ml_tools/__init__.py
14
+ ml_tools/_particle_swarm_optimization.py
13
15
  ml_tools/data_exploration.py
14
16
  ml_tools/datasetmaster.py
15
17
  ml_tools/ensemble_learning.py
16
18
  ml_tools/handle_excel.py
17
19
  ml_tools/logger.py
18
- ml_tools/particle_swarm_optimization.py
19
20
  ml_tools/pytorch_models.py
20
21
  ml_tools/trainer.py
21
22
  ml_tools/utilities.py
@@ -18,8 +18,9 @@ joblib
18
18
  xgboost
19
19
  lightgbm<=4.5.0
20
20
  shap
21
+ tqdm>=4.0
22
+ Pillow
21
23
 
22
24
  [pytorch]
23
25
  torch
24
- Pillow
25
26
  torchvision
@@ -3,7 +3,7 @@ import miceforest as mf
3
3
  import os
4
4
  import matplotlib.pyplot as plt
5
5
  import numpy as np
6
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
6
+ from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
7
7
  from plotnine import ggplot, labs, theme, element_blank # type: ignore
8
8
  from typing import Optional
9
9
 
@@ -0,0 +1,490 @@
1
+ import numpy as np
2
+ import os
3
+ import xgboost as xgb
4
+ import lightgbm as lgb
5
+ from sklearn.ensemble import HistGradientBoostingRegressor
6
+ from sklearn.base import ClassifierMixin
7
+ from typing import Literal, Union, Tuple, Dict, Optional
8
+ import pandas as pd
9
+ from copy import deepcopy
10
+ from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe
11
+ import torch
12
+ from tqdm import trange
13
+
14
+
15
+ __all__ = [
16
+ "ObjectiveFunction",
17
+ "multiple_objective_functions_from_dir",
18
+ "run_pso"
19
+ ]
20
+
21
+
22
class ObjectiveFunction():
    """
    Callable objective function designed for optimizing continuous outputs from tree-based regression models.

    The target serialized file (joblib) must include a trained tree-based 'model'. Additionally 'feature_names' and 'target_name' will be parsed if present.

    Parameters
    ----------
    trained_model_path : str
        Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
    add_noise : bool
        Whether to apply multiplicative noise to the input features during evaluation.
    task : (Literal["maximization", "minimization"])
        Whether to maximize or minimize the target.
    binary_features : int
        Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.

    Raises
    ------
    ValueError
        If the artifact holds no model, or the model is a classifier.
    TypeError
        If the artifact could not be loaded.
    """
    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
        self.binary_features = binary_features
        self.is_hybrid = binary_features > 0
        self.use_noise = add_noise
        self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
        self.model = self._get_from_artifact('model')
        self.feature_names: Optional[list[str]] = self._get_from_artifact('feature_names')  # type: ignore
        self.target_name: Optional[str] = self._get_from_artifact('target_name')  # type: ignore
        self.task = task
        self.check_model()  # rejects missing models and classifiers

    def __call__(self, features_array: np.ndarray) -> np.ndarray:
        """
        Batched evaluation for PSO. Accepts 2D array (n_samples, n_features).

        Applies optional noise and hybrid binary thresholding.

        Returns
        -------
        np.ndarray
            1D array with length n_samples containing predicted target values
            (negated when the task is "maximization", because PSO minimizes).

        Raises
        ------
        ValueError
            If `features_array` is not 2-dimensional.
        """
        # Explicit raise instead of `assert`: asserts vanish under `python -O`.
        if features_array.ndim != 2:
            raise ValueError(f"Expected 2D array, got shape {features_array.shape}")

        # Apply noise if enabled
        if self.use_noise:
            features_array = self.add_noise(features_array)

        # Apply binary thresholding if enabled
        if self.is_hybrid:
            features_array = threshold_binary_values_batch(features_array, self.binary_features)

        # Ensure correct type
        features_array = features_array.astype(np.float32)

        # Evaluate
        result = self.model.predict(features_array)  # type: ignore

        # Flip sign if maximizing
        if self.task == "maximization":
            return -result
        return result

    def add_noise(self, features_array: np.ndarray) -> np.ndarray:
        """
        Apply multiplicative noise to input feature batch (2D).
        Binary features (if present) are excluded from noise injection.

        Parameters
        ----------
        features_array : np.ndarray
            Input array of shape (batch_size, n_features)

        Returns
        -------
        np.ndarray
            Noised array of same shape

        Raises
        ------
        ValueError
            If `features_array` is not 2-dimensional.
        """
        if features_array.ndim != 2:
            raise ValueError("Expected 2D array for batch noise injection")

        if self.binary_features > 0:
            # The trailing `binary_features` columns pass through untouched.
            split_idx = -self.binary_features
            cont_part = features_array[:, :split_idx]
            bin_part = features_array[:, split_idx:]

            noise = np.random.uniform(0.95, 1.05, size=cont_part.shape)
            return np.hstack([cont_part * noise, bin_part])

        noise = np.random.uniform(0.95, 1.05, size=features_array.shape)
        return features_array * noise

    def check_model(self):
        """Raise ValueError when no model was loaded or the model is a classifier."""
        # Test for None first so the failure reason is the most specific one.
        if self.model is None:
            raise ValueError("Loaded model is None")
        if isinstance(self.model, (ClassifierMixin, xgb.XGBClassifier, lgb.LGBMClassifier)):
            raise ValueError(f"[Model Check Failed] ❌\nThe loaded model ({type(self.model).__name__}) is a Classifier.\nOptimization is not suitable for standard classification tasks.")

    def _get_from_artifact(self, key: str):
        """
        Fetch `key` from the loaded artifact, normalising "missing" to None.

        'feature_names' is only accepted as a non-empty list; an empty string
        (e.g. an empty 'target_name') is treated as missing. Any other object
        is returned unchanged unless it is None.
        """
        if self._artifact is None:
            raise TypeError("Load model error")
        val = self._artifact.get(key)
        if key == "feature_names":
            return val if isinstance(val, list) and val else None
        if val is None:
            return None
        if isinstance(val, str):
            return val or None
        # Deliberately no truthiness test here: estimators may define
        # `__len__`/`__bool__`, which would wrongly discard (or fail on) a valid model.
        return val

    def __repr__(self):
        return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
130
+
131
+
132
def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
    """
    Build one `ObjectiveFunction` per serialized model found in a directory.

    Every `.joblib` file in `directory` is loaded and wrapped as an `ObjectiveFunction` instance, using the same settings for all of them.

    Parameters:
        directory (str) : Path to the directory containing `.joblib` files (serialized models).
        add_noise (bool) : Whether to apply multiplicative noise to the input features during evaluation.
        task (Literal["maximization", "minimization"]) : Defines the nature of the optimization task.
        binary_features (int) : Number of binary features expected by each objective function.

    Returns:
        (tuple[list[ObjectiveFunction], list[str]]) : A tuple containing:
            - list of `ObjectiveFunction` instances.
            - list of corresponding filenames.
    """
    model_files = list_files_by_extension(directory=directory, extension='joblib')

    wrapped_models = [
        ObjectiveFunction(
            trained_model_path=model_path,
            add_noise=add_noise,
            task=task,
            binary_features=binary_features,
        )
        for model_path in model_files.values()
    ]
    # Keys are iterated in the same order as the values above.
    model_names = list(model_files.keys())

    return wrapped_models, model_names
159
+
160
+
161
def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
    """
    Validate the search-space bounds and return them as NumPy arrays.

    Parameters
    ----------
    lower_boundaries : list[float]
        Lower bound for each feature.
    upper_boundaries : list[float]
        Upper bound for each feature (same length as `lower_boundaries`).

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        `(lower, upper)` as 1D arrays.

    Raises
    ------
    ValueError
        If the lists differ in length, are empty, or any lower bound exceeds
        its upper bound.
    """
    # Explicit raises instead of `assert`: asserts vanish under `python -O`.
    if len(lower_boundaries) != len(upper_boundaries):
        raise ValueError("Lower and upper boundaries must have the same length.")
    if len(lower_boundaries) < 1:
        raise ValueError("At least one boundary pair is required.")

    lower = np.array(lower_boundaries)
    upper = np.array(upper_boundaries)

    # Inverted bounds would make clamping collapse every particle onto one point.
    if np.any(lower > upper):
        raise ValueError("Each lower boundary must be less than or equal to its upper boundary.")

    return lower, upper
167
+
168
+
169
def _set_feature_names(size: int, names: Union[list[str], None]):
    """
    Return the feature names to use for the results.

    Parameters
    ----------
    size : int
        Number of features in the search space.
    names : list[str] or None
        User-supplied names. If None, generic 1-based names ("1", "2", ...) are generated.

    Returns
    -------
    list[str]
        `names` itself when given, otherwise the generated list.

    Raises
    ------
    ValueError
        If `names` is given but its length differs from `size`.
    """
    if names is None:
        return [str(i) for i in range(1, size + 1)]

    # Explicit raise instead of `assert`: asserts vanish under `python -O`.
    if len(names) != size:
        raise ValueError("List with feature names do not match the number of features")
    return names
175
+
176
+
177
def _save_results(*dicts, save_dir: str, target_name: str):
    """
    Merge result dictionaries into one DataFrame and save it.

    Parameters
    ----------
    *dicts : dict
        Column-name -> value(s) mappings. Later dictionaries override earlier
        ones on duplicate keys. Values may be scalars (single run) or
        sequences (one entry per repetition).
    save_dir : str
        Directory passed on to `save_dataframe`.
    target_name : str
        Used to build the output filename `Optimization_<target_name>`.
    """
    combined_dict = dict()
    for single_dict in dicts:
        combined_dict.update(single_dict)

    # A single PSO run yields only scalar values; `pd.DataFrame` rejects an
    # all-scalar dict ("you must pass an index"), so promote scalars to
    # one-element columns. Sequences are left untouched.
    columns = {
        name: [value] if np.ndim(value) == 0 else value
        for name, value in combined_dict.items()
    }

    df = pd.DataFrame(columns)

    save_dataframe(df=df, save_dir=save_dir, filename=f"Optimization_{target_name}")
185
+
186
+
187
def _select_pso_device() -> torch.device:
    """Return the preferred torch device: CUDA, then MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    # `torch.backends.mps` does not exist on older PyTorch builds.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def _run_single_optimization(arguments: dict, objective_function, binary_number: int, seed: Optional[int]):
    """Run `_pso` once and return (thresholded best features, best target in original sign)."""
    best_features, best_target, *_ = _pso(**arguments, seed=seed)

    # The objective negates predictions for maximization; undo that here.
    if objective_function.task == "maximization":
        best_target = -best_target

    return threshold_binary_values(best_features, binary_number), best_target


def run_pso(lower_boundaries: list[float],
            upper_boundaries: list[float],
            objective_function: ObjectiveFunction,
            save_results_dir: str,
            auto_binary_boundaries: bool=True,
            target_name: Union[str, None]=None,
            feature_names: Union[list[str], None]=None,
            swarm_size: int=200,
            max_iterations: int=1000,
            random_state: int=101,
            post_hoc_analysis: Optional[int]=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
    """
    Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

    Parameters
    ----------
    lower_boundaries : list[float]
        Lower bounds for each feature in the search space (as many as features expected by the model).
    upper_boundaries : list[float]
        Upper bounds for each feature in the search space (as many as features expected by the model).
    objective_function : ObjectiveFunction
        A callable object encapsulating a tree-based regression model.
    save_results_dir : str
        Directory path to save the results CSV file. Created if it does not exist.
    auto_binary_boundaries : bool
        Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
    target_name : str or None, optional
        Name of the target variable. If None, attempts to retrieve from the ObjectiveFunction object, falling back to "Target".
    feature_names : list[str] or None, optional
        List of feature names. If None, attempts to retrieve from the ObjectiveFunction or generate generic names.
    swarm_size : int
        Number of particles in the swarm.
    max_iterations : int
        Maximum number of iterations for the optimization algorithm.
    random_state : int
        Seed for the swarm's random number generation. A single run uses it directly;
        repetition `k` (0-based) of a post hoc analysis uses `random_state + k`, so
        repetitions differ from each other but the whole analysis is reproducible.
    post_hoc_analysis : int or None
        If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
        None, or any value of 1 or less, performs a single run.

    Returns
    -------
    Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
        For a single run, returns two dictionaries:
        - feature_names: Feature values that yield the best result.
        - target_name: Best result obtained for the target variable.

        For a post hoc analysis, returns two dictionaries:
        - feature_names: Lists of best feature values for each repetition.
        - target_name: List of best target values across repetitions.

    Notes
    -----
    - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
    - The caller's boundary lists are never modified.
    """
    device = _select_pso_device()
    print(f"[PSO] Using device: '{device}'")

    # Work on copies so the caller's lists are not extended in place.
    local_lower_boundaries = list(lower_boundaries)
    local_upper_boundaries = list(upper_boundaries)

    # Append binary boundaries
    binary_number = objective_function.binary_features
    if auto_binary_boundaries and binary_number > 0:
        local_lower_boundaries.extend([0] * binary_number)
        local_upper_boundaries.extend([1] * binary_number)

    # Set the total length of features
    size_of_features = len(local_lower_boundaries)

    lower, upper = _set_boundaries(local_lower_boundaries, local_upper_boundaries)

    # feature names
    if feature_names is None and objective_function.feature_names is not None:
        feature_names = objective_function.feature_names
    names = _set_feature_names(size=size_of_features, names=feature_names)

    # target name
    if target_name is None and objective_function.target_name is not None:
        target_name = objective_function.target_name
    if target_name is None:
        target_name = "Target"

    arguments = {
        "func": objective_function,
        "lb": lower,
        "ub": upper,
        "device": device,
        "swarmsize": swarm_size,
        "maxiter": max_iterations,
        "particle_output": False,
    }

    os.makedirs(save_results_dir, exist_ok=True)

    # Zero or negative repetition counts are meaningless; treat them as one run.
    if post_hoc_analysis is None or post_hoc_analysis <= 1:
        best_features_threshold, best_target = _run_single_optimization(
            arguments, objective_function, binary_number, seed=random_state)

        # name features
        best_features_named = {name: value for name, value in zip(names, best_features_threshold)}
        best_target_named = {target_name: best_target}

        # save results
        _save_results(best_features_named, best_target_named, save_dir=save_results_dir, target_name=target_name)

        return best_features_named, best_target_named

    all_best_targets = list()
    all_best_features = [[] for _ in range(size_of_features)]
    for repetition in range(post_hoc_analysis):
        # A distinct, derived seed per repetition: runs differ, yet the analysis is reproducible.
        seed = None if random_state is None else random_state + repetition
        best_features_threshold, best_target = _run_single_optimization(
            arguments, objective_function, binary_number, seed=seed)

        for i, best_feature in enumerate(best_features_threshold):
            all_best_features[i].append(best_feature)
        all_best_targets.append(best_target)

    # name features
    all_best_features_named = {name: list_values for name, list_values in zip(names, all_best_features)}
    all_best_targets_named = {target_name: all_best_targets}

    # save results
    _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_dir, target_name=target_name)

    return all_best_features_named, all_best_targets_named  # type: ignore
333
+
334
+
335
def info():
    """Report this module's public API by passing `__all__` to `_script_info`."""
    public_names = __all__
    _script_info(public_names)
337
+
338
+
339
def _pso(func: ObjectiveFunction,
         lb: np.ndarray,
         ub: np.ndarray,
         device: torch.device,
         swarmsize=100,
         maxiter=100,
         omega = 0.729,     # Clerc and Kennedy’s constriction coefficient
         phip = 1.49445,    # Clerc and Kennedy’s constriction coefficient
         phig = 1.49445,    # Clerc and Kennedy’s constriction coefficient
         tolerance = 1e-8,
         particle_output=False,
         seed: Optional[int] = None,
         patience: int = 10):
    """
    Internal PSO implementation using PyTorch tensors for acceleration on CUDA or MPS devices.

    Parameters
    ----------
    func : callable
        Callable objective function with batched evaluation support. Must accept a 2D NumPy array
        of shape (n_particles, n_features) and return a 1D NumPy array of shape (n_particles,).

    lb : np.ndarray
        Lower bounds for each feature (1D array of length n_features).

    ub : np.ndarray
        Upper bounds for each feature (1D array of length n_features).

    device : torch.device
        Device on which the swarm tensors are allocated. The objective itself is
        always evaluated on NumPy arrays copied back to the CPU.

    swarmsize : int
        Number of particles in the swarm (i.e., batch size per iteration).

    maxiter : int
        Maximum number of iterations to perform (i.e., optimization steps).

    omega : float
        Inertia weight controlling velocity retention across iterations.
        - Typical range: [0.4, 0.9]
        - Lower values encourage convergence, higher values promote exploration.
        - The default value (0.729) comes from Clerc & Kennedy's constriction method.

    phip : float
        Cognitive acceleration coefficient.
        - Controls how strongly particles are pulled toward their own best-known positions.
        - Typical range: [0.5, 2.5]
        - Default from Clerc & Kennedy's recommended setting.

    phig : float
        Social acceleration coefficient.
        - Controls how strongly particles are pulled toward the swarm's global best.
        - Typical range: [0.5, 2.5]
        - Default from Clerc & Kennedy's recommended setting.

    tolerance : float
        An iteration counts as stalled when the global best score changes by less than this amount.

    particle_output : bool, default=False
        If True, returns the full history of particle positions and objective scores at each iteration.

    seed : int or None, default=None
        Random seed passed to `torch.manual_seed` for reproducibility. If None, the
        generator is left untouched (no seeding is performed).

    patience : int, default=10
        Number of consecutive stalled iterations required before stopping early.
        Values below 1 are treated as 1, which stops at the first stalled iteration.

    Returns
    -------
    best_position : np.ndarray
        1D array of shape (n_features,) representing the best solution found.

    best_score : float
        Objective value at `best_position`.

    history_positions : list[np.ndarray], optional
        Only returned if `particle_output=True`. List of particle positions per iteration.
        Each element has shape (swarmsize, n_features).

    history_scores : list[np.ndarray], optional
        Only returned if `particle_output=True`. List of objective scores per iteration.
        Each element has shape (swarmsize,).
    """
    if seed is not None:
        torch.manual_seed(seed)

    patience = max(1, int(patience))

    ndim = len(lb)
    lb_t = torch.tensor(lb, dtype=torch.float32, device=device, requires_grad=False)
    ub_t = torch.tensor(ub, dtype=torch.float32, device=device, requires_grad=False)

    # Initialize positions uniformly inside the bounds; particles start at rest
    r = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
    positions = lb_t + r * (ub_t - lb_t)  # shape: (swarmsize, ndim)
    velocities = torch.zeros_like(positions, requires_grad=False)

    # Initialize best positions and scores
    personal_best_positions = positions.clone()
    personal_best_scores = torch.full((swarmsize,), float('inf'), device=device, requires_grad=False)

    global_best_score = float('inf')
    global_best_position = torch.zeros(ndim, device=device, requires_grad=False)

    # History (only filled when requested)
    history_positions = []
    history_scores = []

    # Main loop
    stalled_iterations = 0
    progress = trange(maxiter, desc="PSO", unit="iter", leave=True)  # tqdm bar
    with torch.no_grad():
        for i in progress:
            # Evaluate objective for all particles
            positions_np = positions.detach().cpu().numpy()  # shape: (swarmsize, n_features)
            scores_np = func(positions_np)                   # shape: (swarmsize,)
            # Flatten so a (swarmsize, 1) prediction cannot broadcast incorrectly below
            scores = torch.tensor(scores_np, device=device, dtype=torch.float32).reshape(-1)

            # Record every evaluated iteration, including the one that triggers an early stop
            if particle_output:
                history_positions.append(positions_np)
                history_scores.append(scores_np)

            # Update personal bests
            improved = scores < personal_best_scores
            personal_best_scores = torch.where(improved, scores, personal_best_scores)
            personal_best_positions = torch.where(improved[:, None], positions, personal_best_positions)

            # Update global best
            previous_best_score = global_best_score
            min_score, min_idx = torch.min(personal_best_scores, dim=0)
            if min_score < global_best_score:
                global_best_score = min_score.item()
                global_best_position = personal_best_positions[min_idx].clone()

            # Early stopping: require `patience` consecutive stalled iterations.
            # A single non-improving step is normal swarm behaviour (especially on
            # piecewise-constant tree models) and must not end the search.
            if abs(previous_best_score - global_best_score) < tolerance:
                stalled_iterations += 1
                if stalled_iterations >= patience:
                    progress.set_description(f"PSO (early stop at iteration {i+1})")
                    break
            else:
                stalled_iterations = 0

            # Velocity update
            rp = torch.rand((swarmsize, ndim), device=device, requires_grad=False)
            rg = torch.rand((swarmsize, ndim), device=device, requires_grad=False)

            cognitive = phip * rp * (personal_best_positions - positions)
            social = phig * rg * (global_best_position - positions)
            velocities = omega * velocities + cognitive + social

            # Position update
            positions = positions + velocities

            # Clamp to search space bounds
            positions = torch.max(positions, lb_t)
            positions = torch.min(positions, ub_t)

    # Move to CPU and convert to NumPy
    best_position = global_best_position.detach().cpu().numpy()
    best_score = global_best_score

    if particle_output:
        return best_position, best_score, history_positions, history_scores
    else:
        return best_position, best_score
@@ -1,6 +1,10 @@
1
+ """
2
+ DEPRECATED
3
+ """
4
+
5
+
1
6
  import numpy as np
2
7
  import os
3
- import joblib
4
8
  import xgboost as xgb
5
9
  import lightgbm as lgb
6
10
  from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
@@ -7,7 +7,7 @@ from IPython.display import clear_output
7
7
  import time
8
8
  from typing import Union, Literal, Dict, Tuple, List
9
9
  import os
10
- from ml_tools.utilities import sanitize_filename, _script_info
10
+ from .utilities import sanitize_filename, _script_info
11
11
  import re
12
12
 
13
13
 
@@ -7,7 +7,6 @@ from matplotlib import rcdefaults
7
7
 
8
8
  import os
9
9
  from typing import Literal, Union, Optional, Iterator, Tuple
10
- import joblib
11
10
 
12
11
  from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
13
12
  from imblearn.under_sampling import RandomUnderSampler
@@ -2,7 +2,7 @@ import os
2
2
  from openpyxl import load_workbook, Workbook
3
3
  import pandas as pd
4
4
  from typing import List, Optional
5
- from utilities import _script_info, sanitize_filename
5
+ from .utilities import _script_info, sanitize_filename
6
6
 
7
7
 
8
8
  __all__ = [
@@ -5,7 +5,7 @@ import pandas as pd
5
5
  from openpyxl.styles import Font, PatternFill
6
6
  import traceback
7
7
  import json
8
- from ml_tools.utilities import sanitize_filename, _script_info
8
+ from .utilities import sanitize_filename, _script_info
9
9
 
10
10
 
11
11
  __all__ = [
@@ -21,6 +21,7 @@ __all__ = [
21
21
  "normalize_mixed_list",
22
22
  "sanitize_filename",
23
23
  "threshold_binary_values",
24
+ "threshold_binary_values_batch",
24
25
  "serialize_object",
25
26
  "deserialize_object",
26
27
  "distribute_datasets_by_target"
@@ -356,6 +357,39 @@ def threshold_binary_values(
356
357
  return tuple(result)
357
358
  else:
358
359
  return result
360
+
361
+
362
def threshold_binary_values_batch(
    input_array: np.ndarray,
    binary_values: int
) -> np.ndarray:
    """
    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.

    Values strictly greater than 0.5 become 1; everything else (including
    exactly 0.5) becomes 0. The leading columns are passed through unchanged
    and the input array is never modified.

    Parameters
    ----------
    input_array : np.ndarray
        2D array with shape (batch_size, n_features).
    binary_values : int
        Number of binary features located at the END of each row.
        Must satisfy ``0 <= binary_values <= n_features``.

    Returns
    -------
    np.ndarray
        New array with the same shape as the input. When ``binary_values`` is 0
        this is a plain copy of the input; otherwise the dtype is the NumPy
        promotion of the input dtype with ``int32`` (the dtype of the
        thresholded block).

    Raises
    ------
    ValueError
        If `input_array` is not 2D or `binary_values` is outside the valid range.
    """
    # Explicit raises instead of `assert`: assertions are stripped under
    # `python -O`, which would let an invalid column count silently threshold
    # the wrong part of the array.
    if input_array.ndim != 2:
        raise ValueError(f"Expected 2D array, got {input_array.ndim}D")

    total_features = input_array.shape[1]
    if not 0 <= binary_values <= total_features:
        raise ValueError("binary_values out of valid range")

    # `[:, :-0]` would select nothing, so the no-binary-columns case is
    # handled up front and simply returns a copy.
    if binary_values == 0:
        return input_array.copy()

    # When binary_values == total_features this slice is already an empty
    # (batch_size, 0) array carrying the input dtype, so no special-casing
    # (and no accidental float64 placeholder) is needed.
    cont_part = input_array[:, :-binary_values]
    bin_part = (input_array[:, -binary_values:] > 0.5).astype(np.int32)

    return np.hstack([cont_part, bin_part])
359
393
 
360
394
 
361
395
  def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
@@ -1,12 +1,12 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "1.4.7"
3
+ version = "2.0.0"
4
4
  description = "A collection of tools for data science and machine learning projects"
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }
7
7
  ]
8
8
  readme = "README.md"
9
- requires-python = ">=3.9"
9
+ requires-python = ">=3.10"
10
10
  license = "MIT"
11
11
  classifiers = [
12
12
  "Programming Language :: Python :: 3",
@@ -32,7 +32,9 @@ dependencies = [
32
32
  "joblib",
33
33
  "xgboost",
34
34
  "lightgbm<=4.5.0",
35
- "shap"
35
+ "shap",
36
+ "tqdm>=4.0",
37
+ "Pillow"
36
38
  ]
37
39
 
38
40
  [project.urls]
@@ -42,7 +44,6 @@ Changelog = "https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md"
42
44
  [project.optional-dependencies]
43
45
  pytorch = [
44
46
  "torch",
45
- "Pillow",
46
47
  "torchvision"
47
48
  ]
48
49