dragon-ml-toolbox 4.5.0__tar.gz → 5.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (37)
  1. {dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-5.1.0}/PKG-INFO +5 -2
  2. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/README.md +3 -1
  3. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +5 -2
  4. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +3 -1
  5. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/requires.txt +1 -0
  6. dragon_ml_toolbox-4.5.0/ml_tools/datasetmaster.py → dragon_ml_toolbox-5.1.0/ml_tools/ML_datasetmaster.py +91 -1
  7. dragon_ml_toolbox-5.1.0/ml_tools/ML_optimization.py +236 -0
  8. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/PSO_optimization.py +8 -141
  9. dragon_ml_toolbox-5.1.0/ml_tools/optimization_tools.py +137 -0
  10. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/pyproject.toml +4 -3
  11. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/LICENSE +0 -0
  12. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/LICENSE-THIRD-PARTY.md +0 -0
  13. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  14. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  15. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ETL_engineering.py +0 -0
  16. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/GUI_tools.py +0 -0
  17. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/MICE_imputation.py +0 -0
  18. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_callbacks.py +0 -0
  19. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_evaluation.py +0 -0
  20. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ML_trainer.py +0 -0
  22. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/RNN_forecast.py +0 -0
  23. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/SQL.py +0 -0
  24. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/VIF_factor.py +0 -0
  25. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/__init__.py +0 -0
  26. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/_logger.py +0 -0
  27. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/_pytorch_models.py +0 -0
  28. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/_script_info.py +0 -0
  29. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/custom_logger.py +0 -0
  30. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/data_exploration.py +0 -0
  31. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ensemble_inference.py +0 -0
  32. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/ensemble_learning.py +0 -0
  33. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/handle_excel.py +0 -0
  34. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/keys.py +0 -0
  35. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/path_manager.py +0 -0
  36. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/ml_tools/utilities.py +0 -0
  37. {dragon_ml_toolbox-4.5.0 → dragon_ml_toolbox-5.1.0}/setup.cfg +0 -0
--- dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info/PKG-INFO
+++ dragon_ml_toolbox-5.1.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 4.5.0
+Version: 5.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -36,6 +36,7 @@ Requires-Dist: lightgbm; extra == "ml"
 Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
+Requires-Dist: evotorch; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
@@ -204,6 +205,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ensemble_inference
 path_manager
@@ -224,6 +226,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ML_inference
 path_manager
@@ -265,5 +268,5 @@ After installation, import modules like this:
 
 ```python
 from ml_tools.utilities import serialize_object, deserialize_object
-from ml_tools.custom_logger import custom_logger
+from ml_tools import custom_logger
 ```
--- dragon_ml_toolbox-4.5.0/README.md
+++ dragon_ml_toolbox-5.1.0/README.md
@@ -124,6 +124,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ensemble_inference
 path_manager
@@ -144,6 +145,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ML_inference
 path_manager
@@ -185,5 +187,5 @@ After installation, import modules like this:
 
 ```python
 from ml_tools.utilities import serialize_object, deserialize_object
-from ml_tools.custom_logger import custom_logger
+from ml_tools import custom_logger
 ```
--- dragon_ml_toolbox-4.5.0/PKG-INFO
+++ dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 4.5.0
+Version: 5.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -36,6 +36,7 @@ Requires-Dist: lightgbm; extra == "ml"
 Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
+Requires-Dist: evotorch; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
@@ -204,6 +205,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ensemble_inference
 path_manager
@@ -224,6 +226,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
 #### Modules:
 
 ```Bash
+custom_logger
 GUI_tools
 ML_inference
 path_manager
@@ -265,5 +268,5 @@ After installation, import modules like this:
 
 ```python
 from ml_tools.utilities import serialize_object, deserialize_object
-from ml_tools.custom_logger import custom_logger
+from ml_tools import custom_logger
 ```
--- dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info/SOURCES.txt
+++ dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info/SOURCES.txt
@@ -11,8 +11,10 @@ ml_tools/ETL_engineering.py
 ml_tools/GUI_tools.py
 ml_tools/MICE_imputation.py
 ml_tools/ML_callbacks.py
+ml_tools/ML_datasetmaster.py
 ml_tools/ML_evaluation.py
 ml_tools/ML_inference.py
+ml_tools/ML_optimization.py
 ml_tools/ML_trainer.py
 ml_tools/PSO_optimization.py
 ml_tools/RNN_forecast.py
@@ -24,10 +26,10 @@ ml_tools/_pytorch_models.py
 ml_tools/_script_info.py
 ml_tools/custom_logger.py
 ml_tools/data_exploration.py
-ml_tools/datasetmaster.py
 ml_tools/ensemble_inference.py
 ml_tools/ensemble_learning.py
 ml_tools/handle_excel.py
 ml_tools/keys.py
+ml_tools/optimization_tools.py
 ml_tools/path_manager.py
 ml_tools/utilities.py
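
The SOURCES manifest captures the module rename (`datasetmaster.py` → `ML_datasetmaster.py`) and the two new modules. Downstream imports need the new module path; a minimal sketch of the migration, using the class names listed in the module's `__all__` further down in this diff:

```python
# Before (4.5.0):
# from ml_tools.datasetmaster import DatasetMaker

# After (5.1.0) -- the module now carries the ML_ prefix:
from ml_tools.ML_datasetmaster import DatasetMaker, SimpleDatasetMaker
```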
--- dragon_ml_toolbox-4.5.0/dragon_ml_toolbox.egg-info/requires.txt
+++ dragon_ml_toolbox-5.1.0/dragon_ml_toolbox.egg-info/requires.txt
@@ -18,6 +18,7 @@ lightgbm
 shap
 tqdm
 Pillow
+evotorch
 
 [base]
 pandas
--- dragon_ml_toolbox-4.5.0/ml_tools/datasetmaster.py
+++ dragon_ml_toolbox-5.1.0/ml_tools/ML_datasetmaster.py
@@ -21,6 +21,7 @@ from ._script_info import _script_info
 # --- public-facing API ---
 __all__ = [
     "DatasetMaker",
+    "SimpleDatasetMaker",
     "VisionDatasetMaker",
     "SequenceMaker",
     "ResizeAspectFill",
@@ -328,7 +329,7 @@ class DatasetMaker(_BaseMaker):
 
         return self.scaler.inverse_transform(data_np)
 
-    def get_datasets(self) -> Tuple[_PytorchDataset, _PytorchDataset]:
+    def get_datasets(self) -> Tuple[Dataset, Dataset]:
         """Primary method to get the final PyTorch Datasets."""
         if not self._is_split:
             raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
@@ -370,6 +371,95 @@ class DatasetMaker(_BaseMaker):
         return pandas.DataFrame(full_tensor.numpy(), columns=new_columns, index=cat_df.index)
 
 
+# Streamlined DatasetMaker version
+class SimpleDatasetMaker:
+    """
+    A simplified dataset maker for pre-processed, numerical pandas DataFrames.
+
+    This class takes a DataFrame, automatically splits it into training and
+    testing sets, and converts them into PyTorch Datasets. It assumes the
+    target variable is the last column.
+
+    Args:
+        pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+        test_size (float): The proportion of the dataset to allocate to the
+            test split.
+        random_state (int): The seed for the random number generator for
+            reproducibility.
+        id (str | None): An optional object identifier.
+    """
+    def __init__(self, pandas_df: pandas.DataFrame, test_size: float = 0.2, random_state: int = 42, id: Optional[str]=None):
+        """
+        Attributes:
+            `train_dataset` -> PyTorch Dataset
+            `test_dataset` -> PyTorch Dataset
+            `feature_names` -> list[str]
+            `target_name` -> str
+            `id` -> str | None
+        """
+
+        if not isinstance(pandas_df, pandas.DataFrame):
+            raise TypeError("Input must be a pandas.DataFrame.")
+
+        # set id
+        self._id = id
+
+        # 1. Identify features and target
+        features = pandas_df.iloc[:, :-1]
+        target = pandas_df.iloc[:, -1]
+
+        self._feature_names = features.columns.tolist()
+        self._target_name = target.name
+
+        # 2. Split the data
+        X_train, X_test, y_train, y_test = train_test_split(
+            features, target, test_size=test_size, random_state=random_state
+        )
+
+        self._X_train_shape = X_train.shape
+        self._X_test_shape = X_test.shape
+        self._y_train_shape = y_train.shape
+        self._y_test_shape = y_test.shape
+
+        # 3. Convert to PyTorch Datasets
+        self._train_ds = _PytorchDataset(X_train.values, y_train.values)
+        self._test_ds = _PytorchDataset(X_test.values, y_test.values)
+
+    @property
+    def train_dataset(self) -> Dataset:
+        """Returns the training PyTorch dataset."""
+        return self._train_ds
+
+    @property
+    def test_dataset(self) -> Dataset:
+        """Returns the testing PyTorch dataset."""
+        return self._test_ds
+
+    @property
+    def feature_names(self) -> list[str]:
+        """Returns the list of feature column names."""
+        return self._feature_names
+
+    @property
+    def target_name(self) -> str:
+        """Returns the name of the target column."""
+        return str(self._target_name)
+
+    @property
+    def id(self) -> Optional[str]:
+        """Returns the object identifier, if any."""
+        return self._id
+
+    def dataframes_info(self) -> None:
+        """Prints the shape information of the split pandas DataFrames."""
+        print("--- Original DataFrame Shapes After Split ---")
+        print(f"  X_train shape: {self._X_train_shape}")
+        print(f"  y_train shape: {self._y_train_shape}\n")
+        print(f"  X_test shape: {self._X_test_shape}")
+        print(f"  y_test shape: {self._y_test_shape}")
+        print("-------------------------------------------")
+
+
 # --- VisionDatasetMaker ---
 class VisionDatasetMaker(_BaseMaker):
     """
--- /dev/null
+++ dragon_ml_toolbox-5.1.0/ml_tools/ML_optimization.py
@@ -0,0 +1,236 @@
+import torch
+import numpy
+import evotorch
+from evotorch.algorithms import CMAES, SteadyStateGA
+from evotorch.logging import StdOutLogger
+from typing import Literal, Union, Tuple, List, Optional
+from pathlib import Path
+from tqdm.auto import trange
+from contextlib import nullcontext
+
+from .path_manager import make_fullpath, sanitize_filename
+from ._logger import _LOGGER
+from ._script_info import _script_info
+from .ML_inference import PyTorchInferenceHandler
+from .keys import PyTorchInferenceKeys
+from .SQL import DatabaseManager
+from .optimization_tools import _save_result
+from .utilities import threshold_binary_values
+
+
+__all__ = [
+    "create_pytorch_problem",
+    "run_optimization"
+]
+
+
+def create_pytorch_problem(
+    handler: PyTorchInferenceHandler,
+    bounds: Tuple[List[float], List[float]],
+    binary_features: int,
+    task: Literal["minimize", "maximize"],
+    algorithm: Literal["CMAES", "GA"] = "CMAES",
+    verbose: bool = False,
+    **searcher_kwargs
+) -> Tuple[evotorch.Problem, evotorch.Searcher]:
+    """
+    Creates and configures an EvoTorch Problem and Searcher for a PyTorch model.
+
+    Args:
+        handler (PyTorchInferenceHandler): An initialized inference handler
+            containing the model and weights.
+        bounds (tuple[list[float], list[float]]): A tuple containing the lower
+            and upper bounds for the solution features.
+        binary_features (int): Number of binary features located at the END of
+            the feature vector. They will be automatically added to the bounds.
+        task (str): The optimization goal, either "minimize" or "maximize".
+        algorithm (str): The search algorithm to use, "CMAES" or "GA" (SteadyStateGA).
+        verbose (bool): Add an EvoTorch logger for real-time console updates.
+        **searcher_kwargs: Additional keyword arguments to pass to the
+            selected search algorithm's constructor (e.g., stdev_init=0.5 for CMAES).
+
+    Returns:
+        A tuple containing the configured evotorch.Problem and evotorch.Searcher.
+    """
+    lower_bounds, upper_bounds = bounds
+
+    # add binary bounds
+    if binary_features > 0:
+        lower_bounds.extend([0.45] * binary_features)
+        upper_bounds.extend([0.55] * binary_features)
+
+    solution_length = len(lower_bounds)
+    device = handler.device
+
+    # Define the fitness function that EvoTorch will call.
+    @evotorch.decorators.to_tensor
+    @evotorch.decorators.on_aux_device(device)
+    def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
+        # Make a mutable copy of the solutions from the optimizer
+        processed_tensor = solution_tensor.clone()
+
+        # Apply thresholding if binary features are present
+        if binary_features > 0:
+            # Isolate the binary part of the tensor (the last n columns)
+            binary_part = processed_tensor[:, -binary_features:]
+
+            # Apply rounding to snap values to 0.0 or 1.0
+            processed_tensor[:, -binary_features:] = torch.round(binary_part)
+
+        # Use the processed tensor (with thresholded values) for prediction
+        predictions = handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
+        return predictions.flatten()
+
+    # Create the Problem instance.
+    problem = evotorch.Problem(
+        objective_sense=task,
+        objective_func=fitness_func,
+        solution_length=solution_length,
+        initial_bounds=(lower_bounds, upper_bounds),
+        device=device,
+    )
+
+    # Create the selected searcher instance.
+    if algorithm == "CMAES":
+        searcher = CMAES(problem, **searcher_kwargs)
+    elif algorithm == "GA":
+        searcher = SteadyStateGA(problem, **searcher_kwargs)
+    else:
+        raise ValueError(f"Unknown algorithm '{algorithm}'. Choose 'CMAES' or 'GA'.")
+
+    # Add a logger for real-time console updates.
+    # This gives the user immediate feedback on the optimization progress.
+    if verbose:
+        _ = StdOutLogger(searcher)
+
+    return problem, searcher
+
+
+def run_optimization(
+    problem: evotorch.Problem,
+    searcher: evotorch.Searcher,
+    num_generations: int,
+    target_name: str,
+    binary_features: int,
+    save_dir: Union[str, Path],
+    save_format: Literal['csv', 'sqlite', 'both'],
+    feature_names: Optional[List[str]],
+    repetitions: int = 1
+) -> Optional[dict]:
+    """
+    Runs the evolutionary optimization process, with support for multiple repetitions.
+
+    This function serves as the main engine for the optimization task. It takes a
+    configured Problem and a Searcher from EvoTorch and executes the optimization
+    for a specified number of generations.
+
+    It has two modes of operation:
+
+    1. **Single Run (repetitions=1):** Executes the optimization once, saves the
+       single best result to a CSV file, and returns it as a dictionary.
+    2. **Iterative Analysis (repetitions > 1):** Executes the optimization
+       multiple times. Results from each run are streamed incrementally to the
+       specified file formats (CSV and/or SQLite database). In this mode,
+       the function returns None.
+
+    Args:
+        problem (evotorch.Problem): The configured problem instance, which defines
+            the objective function, solution space, and optimization sense.
+        searcher (evotorch.Searcher): The configured searcher instance, which
+            contains the evolutionary algorithm (e.g., CMAES, GA).
+        num_generations (int): The total number of generations to run the
+            search algorithm for in each repetition.
+        target_name (str): Target name that is also used for the CSV filename
+            and the SQL table.
+        binary_features (int): Number of binary features located at the END of
+            the feature vector.
+        save_dir (str | Path): The directory where the result file(s) will be saved.
+        save_format (Literal['csv', 'sqlite', 'both']): The format for saving
+            results during iterative analysis.
+        feature_names (List[str] | None): Names of the solution features for
+            labeling the output files. If None, generic names like 'feature_0',
+            'feature_1', etc., will be created.
+        repetitions (int, optional): The number of independent times to run the
+            entire optimization process. Defaults to 1.
+
+    Returns:
+        Optional[dict]: A dictionary containing the best feature values and the
+            fitness score if `repetitions` is 1. Returns `None` if `repetitions`
+            is greater than 1, as results are streamed to files instead.
+    """
+    # preprocess paths
+    save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+    sanitized_target_name = sanitize_filename(target_name)
+    if not sanitized_target_name.endswith(".csv"):
+        sanitized_target_name = sanitized_target_name + ".csv"
+
+    csv_path = save_path / sanitized_target_name
+
+    db_path = save_path / "Optimization.db"
+    db_table_name = target_name
+
+    # preprocess feature names
+    if feature_names is None:
+        feature_names = [f"feature_{i}" for i in range(problem.solution_length)]
+
+    # --- SINGLE RUN LOGIC ---
+    if repetitions <= 1:
+        _LOGGER.info(f"🤖 Starting optimization with {searcher.__class__.__name__} for {num_generations} generations...")
+        for _ in trange(num_generations, desc="Optimizing"):
+            searcher.step()
+
+        best_solution_tensor, best_fitness = searcher.best
+        best_solution_np = best_solution_tensor.cpu().numpy()
+
+        # threshold binary features
+        if binary_features > 0:
+            best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
+        else:
+            best_solution_thresholded = best_solution_np
+
+        result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
+        result_dict[target_name] = best_fitness.item()
+
+        _save_result(result_dict, 'csv', csv_path)  # Single run defaults to CSV
+        _LOGGER.info(f"✅ Optimization complete. Best solution saved to '{csv_path.name}'")
+        return result_dict
+
+    # --- MULTIPLE REPETITIONS LOGIC ---
+    else:
+        _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")
+
+        db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
+
+        with db_context as db_manager:
+            if db_manager:
+                schema = {name: "REAL" for name in feature_names}
+                schema[target_name] = "REAL"
+                db_manager.create_table(db_table_name, schema)
+
+            for i in trange(repetitions, desc="Repetitions"):
+                _LOGGER.info(f"--- Starting Repetition {i+1}/{repetitions} ---")
+
+                # CRITICAL: Re-initialize the searcher to ensure each run is independent
+                searcher.reset()
+
+                for _ in range(num_generations):  # Inner loop does not need a progress bar
+                    searcher.step()
+
+                best_solution_tensor, best_fitness = searcher.best
+                best_solution_np = best_solution_tensor.cpu().numpy()
+
+                # threshold binary features
+                if binary_features > 0:
+                    best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
+                else:
+                    best_solution_thresholded = best_solution_np
+
+                result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
+                result_dict[target_name] = best_fitness.item()
+
+                # Save each result incrementally
+                _save_result(result_dict, save_format, csv_path, db_manager, db_table_name)
+
+        _LOGGER.info(f"✅ Optimal solution space analysis complete. Results saved to '{save_path}'")
+        return None
+
+
+def info():
+    _script_info(__all__)
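
Taken together, the two public functions above suggest the following workflow. This is a sketch, not documented usage: the `PyTorchInferenceHandler` construction is assumed (only its `device` attribute and `predict_batch` method appear in this diff), and the bounds, names, and hyperparameters are placeholders:

```python
from ml_tools.ML_optimization import create_pytorch_problem, run_optimization

# `handler` is assumed to be an already-initialized PyTorchInferenceHandler
# wrapping a trained regression model (construction not shown in this diff).
lower = [0.0, 0.0, 10.0]   # bounds for the continuous features
upper = [1.0, 5.0, 100.0]

problem, searcher = create_pytorch_problem(
    handler=handler,
    bounds=(lower, upper),
    binary_features=2,      # two 0/1 flags appended at the END of the vector
    task="maximize",
    algorithm="CMAES",
    stdev_init=0.5,         # forwarded to the CMAES constructor
)

# Single run: returns the best solution as a dict and writes one CSV row.
best = run_optimization(
    problem, searcher,
    num_generations=100,
    target_name="yield",
    binary_features=2,
    save_dir="./opt_results",
    save_format="csv",
    feature_names=None,     # auto-generates feature_0, feature_1, ...
    repetitions=1,
)
```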
--- dragon_ml_toolbox-4.5.0/ml_tools/PSO_optimization.py
+++ dragon_ml_toolbox-5.1.0/ml_tools/PSO_optimization.py
@@ -2,32 +2,27 @@ import numpy as np
 from pathlib import Path
 import xgboost as xgb
 import lightgbm as lgb
-from typing import Literal, Union, Tuple, Dict, Optional, Any
-import pandas as pd
+from typing import Literal, Union, Tuple, Dict, Optional
 from copy import deepcopy
 from .utilities import (
     threshold_binary_values,
     threshold_binary_values_batch,
-    deserialize_object,
-    yield_dataframes_from_dir)
-from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension, list_csv_paths
+    deserialize_object)
+from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension
 import torch
 from tqdm import trange
-import matplotlib.pyplot as plt
-import seaborn as sns
 from ._logger import _LOGGER
 from .keys import ModelSaveKeys
 from ._script_info import _script_info
 from .SQL import DatabaseManager
 from contextlib import nullcontext
+from .optimization_tools import _save_result
 
 
 __all__ = [
     "ObjectiveFunction",
     "multiple_objective_functions_from_dir",
-    "parse_lower_upper_bounds",
-    "run_pso",
-    "plot_optimal_feature_distributions"
+    "run_pso"
 ]
 
 
@@ -170,18 +165,6 @@ def multiple_objective_functions_from_dir(directory: Union[str,Path], add_noise:
     return objective_functions, objective_function_names
 
 
-def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
-    """
-    Parse lower and upper boundaries, returning 2 lists:
-
-    `lower_bounds`, `upper_bounds`
-    """
-    lower = [low[0] for low in source.values()]
-    upper = [up[1] for up in source.values()]
-
-    return lower, upper
-
-
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
@@ -198,45 +181,6 @@ def _set_feature_names(size: int, names: Union[list[str], None]):
     return names
 
 
-def _save_result(result_dict: dict,
-                 save_format: Literal['csv', 'sqlite', 'both'],
-                 csv_path: Path,
-                 db_manager: Optional[DatabaseManager] = None,
-                 db_table_name: Optional[str] = None):
-    """
-    Handles saving a single result to CSV, SQLite, or both.
-    """
-    # Save to CSV
-    if save_format in ['csv', 'both']:
-        _save_or_append_to_csv(result_dict, csv_path)
-
-    # Save to SQLite
-    if save_format in ['sqlite', 'both']:
-        if db_manager and db_table_name:
-            db_manager.insert_row(db_table_name, result_dict)
-        else:
-            _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
-
-
-def _save_or_append_to_csv(data_dict: dict, save_path: Path):
-    """
-    Saves or appends a dictionary of data as a single row to a CSV file.
-
-    If the file doesn't exist, it creates it and writes the header.
-    If the file exists, it appends the new data without the header.
-    """
-    df_row = pd.DataFrame([data_dict])
-
-    file_exists = save_path.exists()
-
-    df_row.to_csv(
-        save_path,
-        mode='a',                # 'a' for append mode
-        index=False,             # Don't write the DataFrame index
-        header=not file_exists   # Write header only if file does NOT exist
-    )
-
-
 def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int, save_format: Literal['csv', 'sqlite', 'both'], csv_path: Path, db_manager: Optional[DatabaseManager], db_table_name: str):
     """Helper for a single PSO run that also handles saving."""
     pso_args.update({"seed": random_state})
@@ -282,14 +226,14 @@ def run_pso(lower_boundaries: list[float],
             upper_boundaries: list[float],
             objective_function: ObjectiveFunction,
             save_results_dir: Union[str,Path],
-            save_format: Literal['csv', 'sqlite', 'both'] = 'csv',
+            save_format: Literal['csv', 'sqlite', 'both'],
             auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
            feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
             max_iterations: int=3000,
             random_state: int=101,
-            post_hoc_analysis: Optional[int]=10) -> Optional[Tuple[Dict[str, float], Dict[str, float]]]:
+            post_hoc_analysis: Optional[int]=20) -> Optional[Tuple[Dict[str, float], Dict[str, float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
@@ -303,7 +247,7 @@ def run_pso(lower_boundaries: list[float],
         A callable object encapsulating a tree-based regression model.
     save_results_dir : str | Path
         Directory path to save the results CSV file.
-    save_format : {'csv', 'sqlite', 'both'}, default 'csv'
+    save_format : {'csv', 'sqlite', 'both'}
         The format for saving optimization results.
         - 'csv': Saves results to a CSV file.
         - 'sqlite': Saves results to an SQLite database file. ⚠️ If a database exists, new tables will be created using the target name.
@@ -578,83 +522,6 @@ def _pso(func: ObjectiveFunction,
     return best_position, best_score
 
 
-def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
-    """
-    Analyzes optimization results and plots the distribution of optimal values for each feature.
-
-    For features with more than two unique values, this function generates a color-coded
-    Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
-    showing relative frequency.
-
-    Parameters
-    ----------
-    results_dir : str or Path
-        The path to the directory containing the optimization result CSV files.
-    save_dir : str or Path
-        The directory where the output plots will be saved.
-    """
-    # Check results_dir and create output path
-    results_path = make_fullpath(results_dir)
-    output_path = make_fullpath(save_dir, make=True)
-
-    # Check that the directory contains csv files
-    list_csv_paths(results_path, verbose=False)
-
-    # --- Data Loading and Preparation ---
-    _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
-    data_to_plot = []
-    for df, df_name in yield_dataframes_from_dir(results_path):
-        melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
-        melted_df['target'] = df_name.replace("Optimization_", "")
-        data_to_plot.append(melted_df)
-
-    long_df = pd.concat(data_to_plot, ignore_index=True)
-    features = long_df['feature'].unique()
-    _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
-
-    # --- Plotting Loop ---
-    for feature_name in features:
-        plt.figure(figsize=(12, 7))
-        feature_df = long_df[long_df['feature'] == feature_name]
-
-        # Check if the feature is binary or constant
-        if feature_df['value'].nunique() <= 2:
-            # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
-            # This ensures the X-axis is clean (e.g., just 0 and 1).
-            norm_df = (feature_df.groupby('target')['value']
-                       .value_counts(normalize=True)
-                       .mul(100)
-                       .rename('percent')
-                       .reset_index())
-
-            ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
-
-            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-            plt.ylabel("Frequency (%)", fontsize=12)
-            ax.set_ylim(0, 100)  # Set Y-axis from 0 to 100
-
-        else:
-            # PLOT 2: KDE plot for continuous values.
-            ax = sns.kdeplot(data=feature_df, x='value', hue='target',
-                             fill=True, alpha=0.1, warn_singular=False)
-
-            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-            plt.ylabel("Density", fontsize=12)  # Y-axis is "Density" for KDE plots
-
-        # --- Common settings for both plot types ---
-        plt.xlabel("Feature Value", fontsize=12)
-        plt.grid(axis='y', alpha=0.5, linestyle='--')
-
-        legend = ax.get_legend()
-        if legend:
-            legend.set_title('Target')
-
-        sanitized_feature_name = sanitize_filename(feature_name)
-        plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
-        plt.savefig(plot_filename, bbox_inches='tight')
-        plt.close()
-
-    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
 
 
 def info():
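
For existing `run_pso` callers this diff carries two breaking changes: `save_format` lost its `'csv'` default and must now be passed explicitly, and `post_hoc_analysis` now defaults to 20 repetitions instead of 10. The parsing and plotting helpers moved to `optimization_tools` (next file). An updated call site might look like this sketch, where `objective` stands in for a previously constructed `ObjectiveFunction` and the boundaries are placeholders:

```python
from ml_tools.PSO_optimization import run_pso

# `objective` is assumed to be an ObjectiveFunction built beforehand.
run_pso(
    lower_boundaries=[0.0, 0.0],
    upper_boundaries=[1.0, 10.0],
    objective_function=objective,
    save_results_dir="./pso_results",
    save_format="csv",  # now required: 'csv', 'sqlite', or 'both'
)  # post_hoc_analysis now defaults to 20 repetitions instead of 10
```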
--- /dev/null
+++ dragon_ml_toolbox-5.1.0/ml_tools/optimization_tools.py
@@ -0,0 +1,137 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+from typing import Union, Any, Literal, Optional
+from pathlib import Path
+import pandas as pd
+
+from .path_manager import make_fullpath, list_csv_paths, sanitize_filename
+from .utilities import yield_dataframes_from_dir
+from ._logger import _LOGGER
+from ._script_info import _script_info
+from .SQL import DatabaseManager
+
+
+__all__ = [
+    "parse_lower_upper_bounds",
+    "plot_optimal_feature_distributions"
+]
+
+
+def parse_lower_upper_bounds(source: dict[str, tuple[Any, Any]]):
+    """
+    Parse lower and upper boundaries, returning 2 lists:
+
+    `lower_bounds`, `upper_bounds`
+    """
+    lower = [low[0] for low in source.values()]
+    upper = [up[1] for up in source.values()]
+
+    return lower, upper
+
+
+def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
+    """
+    Analyzes optimization results and plots the distribution of optimal values for each feature.
+
+    For features with more than two unique values, this function generates a color-coded
+    Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
+    showing relative frequency.
+
+    Parameters
+    ----------
+    results_dir : str or Path
+        The path to the directory containing the optimization result CSV files.
+    save_dir : str or Path
+        The directory where the output plots will be saved.
+    """
+    # Check results_dir and create output path
+    results_path = make_fullpath(results_dir)
+    output_path = make_fullpath(save_dir, make=True)
+
+    # Check that the directory contains csv files
+    list_csv_paths(results_path, verbose=False)
+
+    # --- Data Loading and Preparation ---
+    _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
+    data_to_plot = []
+    for df, df_name in yield_dataframes_from_dir(results_path):
+        melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+        melted_df['target'] = df_name.replace("Optimization_", "")
+        data_to_plot.append(melted_df)
+
+    long_df = pd.concat(data_to_plot, ignore_index=True)
+    features = long_df['feature'].unique()
+    _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+    # --- Plotting Loop ---
+    for feature_name in features:
+        plt.figure(figsize=(12, 7))
+        feature_df = long_df[long_df['feature'] == feature_name]
+
+        # Check if the feature is binary or constant
+        if feature_df['value'].nunique() <= 2:
+            # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
+            # This ensures the X-axis is clean (e.g., just 0 and 1).
+            norm_df = (feature_df.groupby('target')['value']
+                       .value_counts(normalize=True)
+                       .mul(100)
+                       .rename('percent')
+                       .reset_index())
+
+            ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
+
+            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+            plt.ylabel("Frequency (%)", fontsize=12)
+            ax.set_ylim(0, 100)  # Set Y-axis from 0 to 100
+
+        else:
+            # PLOT 2: KDE plot for continuous values.
+            ax = sns.kdeplot(data=feature_df, x='value', hue='target',
+                             fill=True, alpha=0.1, warn_singular=False)
+
+            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+            plt.ylabel("Density", fontsize=12)  # Y-axis is "Density" for KDE plots
+
+        # --- Common settings for both plot types ---
+        plt.xlabel("Feature Value", fontsize=12)
+        plt.grid(axis='y', alpha=0.5, linestyle='--')
+
+        legend = ax.get_legend()
+        if legend:
+            legend.set_title('Target')
+
+        sanitized_feature_name = sanitize_filename(feature_name)
+        plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
+        plt.savefig(plot_filename, bbox_inches='tight')
+        plt.close()
+
+    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
+
+
+def _save_result(
+    result_dict: dict,
+    save_format: Literal['csv', 'sqlite', 'both'],
+    csv_path: Path,
+    db_manager: Optional[DatabaseManager] = None,
+    db_table_name: Optional[str] = None
+):
+    """
+    Private helper to handle saving a single result to CSV, SQLite, or both.
+    """
+    # Save to CSV
+    if save_format in ['csv', 'both']:
+        df_row = pd.DataFrame([result_dict])
+        file_exists = csv_path.exists()
+        df_row.to_csv(csv_path, mode='a', index=False, header=not file_exists)
+
+    # Save to SQLite
+    if save_format in ['sqlite', 'both']:
+        if db_manager and db_table_name:
+            db_manager.insert_row(db_table_name, result_dict)
+        else:
+            _LOGGER.warning("⚠️ SQLite saving requested but db_manager or table_name not provided.")
+
+
+def info():
+    _script_info(__all__)
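
The two relocated helpers are self-contained, so only the import path changes for existing users. `parse_lower_upper_bounds` is pure and easy to verify; `plot_optimal_feature_distributions` consumes the CSVs written by either optimizer. A minimal sketch, assuming result CSVs already exist under `./pso_results`:

```python
from ml_tools.optimization_tools import (
    parse_lower_upper_bounds,
    plot_optimal_feature_distributions,
)

# Dict of feature -> (lower, upper) pairs, unpacked into two parallel lists.
lower, upper = parse_lower_upper_bounds({
    "temperature": (20.0, 80.0),
    "pressure": (1.0, 5.0),
})
assert lower == [20.0, 1.0]
assert upper == [80.0, 5.0]

# Reads every CSV in the results directory and writes one SVG per feature.
plot_optimal_feature_distributions(results_dir="./pso_results", save_dir="./plots")
```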
--- dragon_ml_toolbox-4.5.0/pyproject.toml
+++ dragon_ml_toolbox-5.1.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "4.5.0"
+version = "5.1.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -27,7 +27,7 @@ base = [
     "joblib"
 ]
 
-# Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support if pytorch models are used
+# Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support
 ML = [
     "numpy",
     "pandas",
@@ -46,7 +46,8 @@ ML = [
     "lightgbm",
     "shap",
     "tqdm",
-    "Pillow"
+    "Pillow",
+    "evotorch"
 ]
 
 # MICE and VIF - Requires a new virtual-env due to dependency version conflicts
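
With `evotorch` added to the `ML` extra, the new `ML_optimization` module installs through the same extras syntax the README already uses, e.g. `pip install "dragon-ml-toolbox[ML]"` (assuming the extra keeps the `ML` name defined in this `pyproject.toml`).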