dragon-ml-toolbox 4.4.0__tar.gz → 5.0.0__tar.gz

This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release.

Files changed (38)
  1. {dragon_ml_toolbox-4.4.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-5.0.0}/PKG-INFO +4 -1
  2. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/README.md +2 -0
  3. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0/dragon_ml_toolbox.egg-info}/PKG-INFO +4 -1
  4. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +3 -1
  5. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/dragon_ml_toolbox.egg-info/requires.txt +1 -0
  6. dragon_ml_toolbox-5.0.0/ml_tools/ML_optimization.py +236 -0
  7. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/PSO_optimization.py +7 -127
  8. dragon_ml_toolbox-5.0.0/ml_tools/__init__.py +1 -0
  9. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/data_exploration.py +1 -1
  10. dragon_ml_toolbox-5.0.0/ml_tools/optimization_tools.py +137 -0
  11. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/pyproject.toml +4 -3
  12. dragon_ml_toolbox-4.4.0/ml_tools/__init__.py +0 -0
  13. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/LICENSE +0 -0
  14. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/LICENSE-THIRD-PARTY.md +0 -0
  15. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  16. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  17. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/ETL_engineering.py +0 -0
  18. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/GUI_tools.py +0 -0
  19. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/MICE_imputation.py +0 -0
  20. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/ML_callbacks.py +0 -0
  21. /dragon_ml_toolbox-4.4.0/ml_tools/datasetmaster.py → /dragon_ml_toolbox-5.0.0/ml_tools/ML_datasetmaster.py +0 -0
  22. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/ML_evaluation.py +0 -0
  23. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/ML_inference.py +0 -0
  24. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/ML_trainer.py +0 -0
  25. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/RNN_forecast.py +0 -0
  26. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/SQL.py +0 -0
  27. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/VIF_factor.py +0 -0
  28. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/_logger.py +0 -0
  29. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/_pytorch_models.py +0 -0
  30. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/_script_info.py +0 -0
  31. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/custom_logger.py +0 -0
  32. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/ensemble_inference.py +0 -0
  33. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/ensemble_learning.py +0 -0
  34. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/handle_excel.py +0 -0
  35. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/keys.py +0 -0
  36. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/path_manager.py +0 -0
  37. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/utilities.py +0 -0
  38. {dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/setup.cfg +0 -0
{dragon_ml_toolbox-4.4.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-5.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 4.4.0
+ Version: 5.0.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -36,6 +36,7 @@ Requires-Dist: lightgbm; extra == "ml"
  Requires-Dist: shap; extra == "ml"
  Requires-Dist: tqdm; extra == "ml"
  Requires-Dist: Pillow; extra == "ml"
+ Requires-Dist: evotorch; extra == "ml"
  Provides-Extra: mice
  Requires-Dist: numpy<2.0; extra == "mice"
  Requires-Dist: pandas; extra == "mice"
@@ -204,6 +205,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
  #### Modules:

  ```Bash
+ custom_logger
  GUI_tools
  ensemble_inference
  path_manager
@@ -224,6 +226,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
  #### Modules:

  ```Bash
+ custom_logger
  GUI_tools
  ML_inference
  path_manager
{dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/README.md
@@ -124,6 +124,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
  #### Modules:

  ```Bash
+ custom_logger
  GUI_tools
  ensemble_inference
  path_manager
@@ -144,6 +145,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
  #### Modules:

  ```Bash
+ custom_logger
  GUI_tools
  ML_inference
  path_manager
{dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0/dragon_ml_toolbox.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 4.4.0
+ Version: 5.0.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -36,6 +36,7 @@ Requires-Dist: lightgbm; extra == "ml"
  Requires-Dist: shap; extra == "ml"
  Requires-Dist: tqdm; extra == "ml"
  Requires-Dist: Pillow; extra == "ml"
+ Requires-Dist: evotorch; extra == "ml"
  Provides-Extra: mice
  Requires-Dist: numpy<2.0; extra == "mice"
  Requires-Dist: pandas; extra == "mice"
@@ -204,6 +205,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
  #### Modules:

  ```Bash
+ custom_logger
  GUI_tools
  ensemble_inference
  path_manager
@@ -224,6 +226,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
  #### Modules:

  ```Bash
+ custom_logger
  GUI_tools
  ML_inference
  path_manager
{dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
@@ -11,8 +11,10 @@ ml_tools/ETL_engineering.py
  ml_tools/GUI_tools.py
  ml_tools/MICE_imputation.py
  ml_tools/ML_callbacks.py
+ ml_tools/ML_datasetmaster.py
  ml_tools/ML_evaluation.py
  ml_tools/ML_inference.py
+ ml_tools/ML_optimization.py
  ml_tools/ML_trainer.py
  ml_tools/PSO_optimization.py
  ml_tools/RNN_forecast.py
@@ -24,10 +26,10 @@ ml_tools/_pytorch_models.py
  ml_tools/_script_info.py
  ml_tools/custom_logger.py
  ml_tools/data_exploration.py
- ml_tools/datasetmaster.py
  ml_tools/ensemble_inference.py
  ml_tools/ensemble_learning.py
  ml_tools/handle_excel.py
  ml_tools/keys.py
+ ml_tools/optimization_tools.py
  ml_tools/path_manager.py
  ml_tools/utilities.py
{dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/dragon_ml_toolbox.egg-info/requires.txt
@@ -18,6 +18,7 @@ lightgbm
  shap
  tqdm
  Pillow
+ evotorch

  [base]
  pandas
dragon_ml_toolbox-5.0.0/ml_tools/ML_optimization.py
@@ -0,0 +1,236 @@
+ import torch
+ import numpy
+ import evotorch
+ from evotorch.algorithms import CMAES, SteadyStateGA
+ from evotorch.logging import StdOutLogger
+ from typing import Literal, Union, Tuple, List, Optional
+ from pathlib import Path
+ from tqdm.auto import trange
+ from contextlib import nullcontext
+
+ from .path_manager import make_fullpath, sanitize_filename
+ from ._logger import _LOGGER
+ from ._script_info import _script_info
+ from .ML_inference import PyTorchInferenceHandler
+ from .keys import PyTorchInferenceKeys
+ from .SQL import DatabaseManager
+ from .optimization_tools import _save_result
+ from .utilities import threshold_binary_values
+
+
+ __all__ = [
+     "create_pytorch_problem",
+     "run_optimization"
+ ]
+
+
+ def create_pytorch_problem(
+         handler: PyTorchInferenceHandler,
+         bounds: Tuple[List[float], List[float]],
+         binary_features: int,
+         task: Literal["minimize", "maximize"],
+         algorithm: Literal["CMAES", "GA"] = "CMAES",
+         verbose: bool = False,
+         **searcher_kwargs
+ ) -> Tuple[evotorch.Problem, evotorch.Searcher]:
+     """
+     Creates and configures an EvoTorch Problem and Searcher for a PyTorch model.
+
+     Args:
+         handler (PyTorchInferenceHandler): An initialized inference handler
+             containing the model and weights.
+         bounds (tuple[list[float], list[float]]): A tuple containing the lower
+             and upper bounds for the solution features.
+         binary_features (int): Number of binary features located at the END of the feature vector. Will be automatically added to the bounds.
+         task (str): The optimization goal, either "minimize" or "maximize".
+         algorithm (str): The search algorithm to use, "CMAES" or "GA" (SteadyStateGA).
+         verbose (bool): Add an Evotorch logger for real-time console updates.
+         **searcher_kwargs: Additional keyword arguments to pass to the
+             selected search algorithm's constructor (e.g., stdev_init=0.5 for CMAES).
+
+     Returns:
+         A tuple containing the configured evotorch.Problem and evotorch.Searcher.
+     """
+     lower_bounds, upper_bounds = bounds
+
+     # add binary bounds
+     if binary_features > 0:
+         lower_bounds.extend([0.45] * binary_features)
+         upper_bounds.extend([0.55] * binary_features)
+
+     solution_length = len(lower_bounds)
+     device = handler.device
+
+     # Define the fitness function that EvoTorch will call.
+     @evotorch.decorators.to_tensor
+     @evotorch.decorators.on_aux_device(device)
+     def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
+         # Make a mutable copy of the solutions from the optimizer
+         processed_tensor = solution_tensor.clone()
+
+         # Apply thresholding if binary features are present
+         if binary_features > 0:
+             # Isolate the binary part of the tensor (the last n columns)
+             binary_part = processed_tensor[:, -binary_features:]
+
+             # Apply rounding to snap values to 0.0 or 1.0
+             processed_tensor[:, -binary_features:] = torch.round(binary_part)
+
+         # Use the processed tensor (with thresholded values) for prediction
+         predictions = handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
+         return predictions.flatten()
+
+     # Create the Problem instance.
+     problem = evotorch.Problem(
+         objective_sense=task,
+         objective_func=fitness_func,
+         solution_length=solution_length,
+         initial_bounds=(lower_bounds, upper_bounds),
+         device=device,
+     )
+
+     # Create the selected searcher instance.
+     if algorithm == "CMAES":
+         searcher = CMAES(problem, **searcher_kwargs)
+     elif algorithm == "GA":
+         searcher = SteadyStateGA(problem, **searcher_kwargs)
+     else:
+         raise ValueError(f"Unknown algorithm '{algorithm}'. Choose 'CMAES' or 'GA'.")
+
+     # Add a logger for real-time console updates.
+     # This gives the user immediate feedback on the optimization progress.
+     if verbose:
+         _ = StdOutLogger(searcher)
+
+     return problem, searcher
+
+
+ def run_optimization(
+         problem: evotorch.Problem,
+         searcher: evotorch.Searcher,
+         num_generations: int,
+         target_name: str,
+         binary_features: int,
+         save_dir: Union[str, Path],
+         save_format: Literal['csv', 'sqlite', 'both'],
+         feature_names: Optional[List[str]],
+         repetitions: int = 1
+ ) -> Optional[dict]:
+     """
+     Runs the evolutionary optimization process, with support for multiple repetitions.
+
+     This function serves as the main engine for the optimization task. It takes a
+     configured Problem and a Searcher from EvoTorch and executes the optimization
+     for a specified number of generations.
+
+     It has two modes of operation:
+     1. **Single Run (repetitions=1):** Executes the optimization once, saves the
+        single best result to a CSV file, and returns it as a dictionary.
+     2. **Iterative Analysis (repetitions > 1):** Executes the optimization
+        multiple times. Results from each run are streamed incrementally to the
+        specified file formats (CSV and/or SQLite database). In this mode,
+        the function returns None.
+
+     Args:
+         problem (evotorch.Problem): The configured problem instance, which defines
+             the objective function, solution space, and optimization sense.
+         searcher (evotorch.Searcher): The configured searcher instance, which
+             contains the evolutionary algorithm (e.g., CMAES, GA).
+         num_generations (int): The total number of generations to run the
+             search algorithm for in each repetition.
+         target_name (str): Target name that will also be used for the CSV filename and SQL table.
+         binary_features (int): Number of binary features located at the END of the feature vector.
+         save_dir (str | Path): The directory where the result file(s) will be saved.
+         save_format (Literal['csv', 'sqlite', 'both'], optional): The format for
+             saving results during iterative analysis. Defaults to 'both'.
+         feature_names (List[str], optional): Names of the solution features for
+             labeling the output files. If None, generic names like 'feature_0',
+             'feature_1', etc., will be created. Defaults to None.
+         repetitions (int, optional): The number of independent times to run the
+             entire optimization process. Defaults to 1.
+
+     Returns:
+         Optional[dict]: A dictionary containing the best feature values and the
+             fitness score if `repetitions` is 1. Returns `None` if `repetitions`
+             is greater than 1, as results are streamed to files instead.
+     """
+     # preprocess paths
+     save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+     sanitized_target_name = sanitize_filename(target_name)
+     if not sanitized_target_name.endswith(".csv"):
+         sanitized_target_name = sanitized_target_name + ".csv"
+
+     csv_path = save_path / sanitized_target_name
+
+     db_path = save_path / "Optimization.db"
+     db_table_name = target_name
+
+     # preprocess feature names
+     if feature_names is None:
+         feature_names = [f"feature_{i}" for i in range(problem.solution_length)]
+
+     # --- SINGLE RUN LOGIC ---
+     if repetitions <= 1:
+         _LOGGER.info(f"🤖 Starting optimization with {searcher.__class__.__name__} for {num_generations} generations...")
+         for _ in trange(num_generations, desc="Optimizing"):
+             searcher.step()
+
+         best_solution_tensor, best_fitness = searcher.best
+         best_solution_np = best_solution_tensor.cpu().numpy()
+
+         # threshold binary features
+         if binary_features > 0:
+             best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
+         else:
+             best_solution_thresholded = best_solution_np
+
+         result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
+         result_dict[target_name] = best_fitness.item()
+
+         _save_result(result_dict, 'csv', csv_path)  # Single run defaults to CSV
+         _LOGGER.info(f"✅ Optimization complete. Best solution saved to '{csv_path.name}'")
+         return result_dict
+
+     # --- MULTIPLE REPETITIONS LOGIC ---
+     else:
+         _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")
+
+         db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
+
+         with db_context as db_manager:
+             if db_manager:
+                 schema = {name: "REAL" for name in feature_names}
+                 schema[target_name] = "REAL"
+                 db_manager.create_table(db_table_name, schema)
+
+             for i in trange(repetitions, desc="Repetitions"):
+                 _LOGGER.info(f"--- Starting Repetition {i+1}/{repetitions} ---")
+
+                 # CRITICAL: Re-initialize the searcher to ensure each run is independent
+                 searcher.reset()
+
+                 for _ in range(num_generations):  # Inner loop does not need a progress bar
+                     searcher.step()
+
+                 best_solution_tensor, best_fitness = searcher.best
+                 best_solution_np = best_solution_tensor.cpu().numpy()
+
+                 # threshold binary features
+                 if binary_features > 0:
+                     best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
+                 else:
+                     best_solution_thresholded = best_solution_np
+
+                 result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
+                 result_dict[target_name] = best_fitness.item()
+
+                 # Save each result incrementally
+                 _save_result(result_dict, save_format, csv_path, db_manager, db_table_name)
+
+         _LOGGER.info(f"✅ Optimal solution space complete. Results saved to '{save_path}'")
+         return None
+
+
+ def info():
+     _script_info(__all__)
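
The new `ML_optimization` module pairs an EvoTorch searcher with a `PyTorchInferenceHandler` as the fitness oracle. A minimal usage sketch, pieced together from the signatures above; the handler construction, bounds, feature names, and numeric values are illustrative assumptions, not part of the release:

```python
# Hypothetical sketch based on the signatures in this diff; the handler setup,
# bounds, and names are assumed for illustration only.
from ml_tools.ML_inference import PyTorchInferenceHandler
from ml_tools.ML_optimization import create_pytorch_problem, run_optimization

handler = PyTorchInferenceHandler(...)  # assumed: an already-initialized handler

# Continuous bounds; the 2 binary flags at the END of the feature vector are
# appended to the bounds automatically by create_pytorch_problem.
lower = [0.0, 0.0, 10.0]
upper = [1.0, 5.0, 100.0]

problem, searcher = create_pytorch_problem(
    handler=handler,
    bounds=(lower, upper),
    binary_features=2,
    task="maximize",
    algorithm="CMAES",
    stdev_init=0.5,  # forwarded to the CMAES constructor via **searcher_kwargs
)

# With repetitions > 1, each run's best solution streams to CSV and/or SQLite
# and the function returns None.
run_optimization(
    problem=problem,
    searcher=searcher,
    num_generations=300,
    target_name="target",
    binary_features=2,
    save_dir="optimization_results",
    save_format="both",
    feature_names=["x1", "x2", "x3", "flag_a", "flag_b"],
    repetitions=10,
)
```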
{dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/PSO_optimization.py
@@ -3,30 +3,26 @@ from pathlib import Path
  import xgboost as xgb
  import lightgbm as lgb
  from typing import Literal, Union, Tuple, Dict, Optional
- import pandas as pd
  from copy import deepcopy
  from .utilities import (
      threshold_binary_values,
      threshold_binary_values_batch,
-     deserialize_object,
-     yield_dataframes_from_dir)
- from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension, list_csv_paths
+     deserialize_object)
+ from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension
  import torch
  from tqdm import trange
- import matplotlib.pyplot as plt
- import seaborn as sns
  from ._logger import _LOGGER
  from .keys import ModelSaveKeys
  from ._script_info import _script_info
  from .SQL import DatabaseManager
  from contextlib import nullcontext
+ from .optimization_tools import _save_result


  __all__ = [
      "ObjectiveFunction",
      "multiple_objective_functions_from_dir",
-     "run_pso",
-     "plot_optimal_feature_distributions"
+     "run_pso"
  ]

@@ -185,45 +181,6 @@ def _set_feature_names(size: int, names: Union[list[str], None]):
      return names


- def _save_result(result_dict: dict,
-                  save_format: Literal['csv', 'sqlite', 'both'],
-                  csv_path: Path,
-                  db_manager: Optional[DatabaseManager] = None,
-                  db_table_name: Optional[str] = None):
-     """
-     Handles saving a single result to CSV, SQLite, or both.
-     """
-     # Save to CSV
-     if save_format in ['csv', 'both']:
-         _save_or_append_to_csv(result_dict, csv_path)
-
-     # Save to SQLite
-     if save_format in ['sqlite', 'both']:
-         if db_manager and db_table_name:
-             db_manager.insert_row(db_table_name, result_dict)
-         else:
-             _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
-
-
- def _save_or_append_to_csv(data_dict: dict, save_path: Path):
-     """
-     Saves or appends a dictionary of data as a single row to a CSV file.
-
-     If the file doesn't exist, it creates it and writes the header.
-     If the file exists, it appends the new data without the header.
-     """
-     df_row = pd.DataFrame([data_dict])
-
-     file_exists = save_path.exists()
-
-     df_row.to_csv(
-         save_path,
-         mode='a',  # 'a' for append mode
-         index=False,  # Don't write the DataFrame index
-         header=not file_exists  # Write header only if file does NOT exist
-     )
-
-
  def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int, save_format: Literal['csv', 'sqlite', 'both'], csv_path: Path, db_manager: Optional[DatabaseManager], db_table_name: str):
      """Helper for a single PSO run that also handles saving."""
      pso_args.update({"seed": random_state})
@@ -269,14 +226,14 @@ def run_pso(lower_boundaries: list[float],
              upper_boundaries: list[float],
              objective_function: ObjectiveFunction,
              save_results_dir: Union[str,Path],
-             save_format: Literal['csv', 'sqlite', 'both'] = 'csv',
+             save_format: Literal['csv', 'sqlite', 'both'],
              auto_binary_boundaries: bool=True,
              target_name: Union[str, None]=None,
              feature_names: Union[list[str], None]=None,
              swarm_size: int=200,
              max_iterations: int=3000,
              random_state: int=101,
-             post_hoc_analysis: Optional[int]=10) -> Optional[Tuple[Dict[str, float], Dict[str, float]]]:
+             post_hoc_analysis: Optional[int]=20) -> Optional[Tuple[Dict[str, float], Dict[str, float]]]:
      """
      Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

@@ -290,7 +247,7 @@ def run_pso(lower_boundaries: list[float],
          A callable object encapsulating a tree-based regression model.
      save_results_dir : str | Path
          Directory path to save the results CSV file.
-     save_format : {'csv', 'sqlite', 'both'}, default 'csv'
+     save_format : {'csv', 'sqlite', 'both'}
          The format for saving optimization results.
          - 'csv': Saves results to a CSV file.
          - 'sqlite': Saves results to an SQLite database file. ⚠️ If a database exists, new tables will be created using the target name.
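
Note the breaking change above: `save_format` lost its `'csv'` default, so existing `run_pso` calls must now pass it explicitly, and `post_hoc_analysis` now defaults to 20 repetitions. A hedged sketch of an updated call; the `ObjectiveFunction` construction and all values are illustrative:

```python
# Illustrative call after the 5.0.0 signature change; the ObjectiveFunction
# setup is assumed and depends on your serialized tree-based model.
from ml_tools.PSO_optimization import ObjectiveFunction, run_pso

objective = ObjectiveFunction(...)  # assumed setup

run_pso(
    lower_boundaries=[0.0, 0.0],
    upper_boundaries=[1.0, 10.0],
    objective_function=objective,
    save_results_dir="pso_results",
    save_format="csv",     # now required (previously defaulted to 'csv')
    target_name="target",
    post_hoc_analysis=20,  # new default is 20 repetitions (was 10)
)
```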
@@ -565,83 +522,6 @@ def _pso(func: ObjectiveFunction,
      return best_position, best_score


- def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
-     """
-     Analyzes optimization results and plots the distribution of optimal values for each feature.
-
-     For features with more than two unique values, this function generates a color-coded
-     Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
-     showing relative frequency.
-
-     Parameters
-     ----------
-     results_dir : str or Path
-         The path to the directory containing the optimization result CSV files.
-     save_dir : str or Path
-         The directory where the output plots will be saved.
-     """
-     # Check results_dir and create output path
-     results_path = make_fullpath(results_dir)
-     output_path = make_fullpath(save_dir, make=True)
-
-     # Check that the directory contains csv files
-     list_csv_paths(results_path, verbose=False)
-
-     # --- Data Loading and Preparation ---
-     _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
-     data_to_plot = []
-     for df, df_name in yield_dataframes_from_dir(results_path):
-         melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
-         melted_df['target'] = df_name.replace("Optimization_", "")
-         data_to_plot.append(melted_df)
-
-     long_df = pd.concat(data_to_plot, ignore_index=True)
-     features = long_df['feature'].unique()
-     _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
-
-     # --- Plotting Loop ---
-     for feature_name in features:
-         plt.figure(figsize=(12, 7))
-         feature_df = long_df[long_df['feature'] == feature_name]
-
-         # Check if the feature is binary or constant
-         if feature_df['value'].nunique() <= 2:
-             # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
-             # This ensures the X-axis is clean (e.g., just 0 and 1).
-             norm_df = (feature_df.groupby('target')['value']
-                        .value_counts(normalize=True)
-                        .mul(100)
-                        .rename('percent')
-                        .reset_index())
-
-             ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
-
-             plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-             plt.ylabel("Frequency (%)", fontsize=12)
-             ax.set_ylim(0, 100)  # Set Y-axis from 0 to 100
-
-         else:
-             # PLOT 2: KDE plot for continuous values.
-             ax = sns.kdeplot(data=feature_df, x='value', hue='target',
-                              fill=True, alpha=0.1, warn_singular=False)
-
-             plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-             plt.ylabel("Density", fontsize=12)  # Y-axis is "Density" for KDE plots
-
-         # --- Common settings for both plot types ---
-         plt.xlabel("Feature Value", fontsize=12)
-         plt.grid(axis='y', alpha=0.5, linestyle='--')
-
-         legend = ax.get_legend()
-         if legend:
-             legend.set_title('Target')
-
-         sanitized_feature_name = sanitize_filename(feature_name)
-         plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
-         plt.savefig(plot_filename, bbox_inches='tight')
-         plt.close()
-
-     _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")


  def info():
dragon_ml_toolbox-5.0.0/ml_tools/__init__.py
@@ -0,0 +1 @@
+ from .custom_logger import custom_logger
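
The 4.4.0 package shipped an empty `ml_tools/__init__.py`; with this one-line re-export, `custom_logger` should be importable from the package root (a hedged sketch, assuming the package is installed):

```python
# Re-exported by the new ml_tools/__init__.py shown above.
from ml_tools import custom_logger
```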
{dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/ml_tools/data_exploration.py
@@ -348,7 +348,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
      full_path = save_path / plot_title

      plt.savefig(full_path, bbox_inches="tight", format='svg')
-     print(f"Saved correlation heatmap: '{plot_title}.svg'")
+     print(f"Saved correlation heatmap: '{plot_title}'")

      plt.show()
      plt.close()
dragon_ml_toolbox-5.0.0/ml_tools/optimization_tools.py
@@ -0,0 +1,137 @@
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from typing import Union, Any, Literal, Optional
+ from pathlib import Path
+ import pandas as pd
+
+ from .path_manager import make_fullpath, list_csv_paths, sanitize_filename
+ from .utilities import yield_dataframes_from_dir
+ from ._logger import _LOGGER
+ from ._script_info import _script_info
+ from .SQL import DatabaseManager
+
+
+ __all__ = [
+     "parse_lower_upper_bounds",
+     "plot_optimal_feature_distributions"
+ ]
+
+
+ def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
+     """
+     Parse lower and upper boundaries, returning 2 lists:
+
+     `lower_bounds`, `upper_bounds`
+     """
+     lower = [low[0] for low in source.values()]
+     upper = [up[1] for up in source.values()]
+
+     return lower, upper
+
+
+ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
+     """
+     Analyzes optimization results and plots the distribution of optimal values for each feature.
+
+     For features with more than two unique values, this function generates a color-coded
+     Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
+     showing relative frequency.
+
+     Parameters
+     ----------
+     results_dir : str or Path
+         The path to the directory containing the optimization result CSV files.
+     save_dir : str or Path
+         The directory where the output plots will be saved.
+     """
+     # Check results_dir and create output path
+     results_path = make_fullpath(results_dir)
+     output_path = make_fullpath(save_dir, make=True)
+
+     # Check that the directory contains csv files
+     list_csv_paths(results_path, verbose=False)
+
+     # --- Data Loading and Preparation ---
+     _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
+     data_to_plot = []
+     for df, df_name in yield_dataframes_from_dir(results_path):
+         melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
+         melted_df['target'] = df_name.replace("Optimization_", "")
+         data_to_plot.append(melted_df)
+
+     long_df = pd.concat(data_to_plot, ignore_index=True)
+     features = long_df['feature'].unique()
+     _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+
+     # --- Plotting Loop ---
+     for feature_name in features:
+         plt.figure(figsize=(12, 7))
+         feature_df = long_df[long_df['feature'] == feature_name]
+
+         # Check if the feature is binary or constant
+         if feature_df['value'].nunique() <= 2:
+             # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
+             # This ensures the X-axis is clean (e.g., just 0 and 1).
+             norm_df = (feature_df.groupby('target')['value']
+                        .value_counts(normalize=True)
+                        .mul(100)
+                        .rename('percent')
+                        .reset_index())
+
+             ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
+
+             plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+             plt.ylabel("Frequency (%)", fontsize=12)
+             ax.set_ylim(0, 100)  # Set Y-axis from 0 to 100
+
+         else:
+             # PLOT 2: KDE plot for continuous values.
+             ax = sns.kdeplot(data=feature_df, x='value', hue='target',
+                              fill=True, alpha=0.1, warn_singular=False)
+
+             plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
+             plt.ylabel("Density", fontsize=12)  # Y-axis is "Density" for KDE plots
+
+         # --- Common settings for both plot types ---
+         plt.xlabel("Feature Value", fontsize=12)
+         plt.grid(axis='y', alpha=0.5, linestyle='--')
+
+         legend = ax.get_legend()
+         if legend:
+             legend.set_title('Target')
+
+         sanitized_feature_name = sanitize_filename(feature_name)
+         plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
+         plt.savefig(plot_filename, bbox_inches='tight')
+         plt.close()
+
+     _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
+
+
+ def _save_result(
+         result_dict: dict,
+         save_format: Literal['csv', 'sqlite', 'both'],
+         csv_path: Path,
+         db_manager: Optional[DatabaseManager] = None,
+         db_table_name: Optional[str] = None
+ ):
+     """
+     Private helper to handle saving a single result to CSV, SQLite, or both.
+     """
+     # Save to CSV
+     if save_format in ['csv', 'both']:
+         df_row = pd.DataFrame([result_dict])
+         file_exists = csv_path.exists()
+         df_row.to_csv(csv_path, mode='a', index=False, header=not file_exists)
+
+     # Save to SQLite
+     if save_format in ['sqlite', 'both']:
+         if db_manager and db_table_name:
+             db_manager.insert_row(db_table_name, result_dict)
+         else:
+             _LOGGER.warning("⚠️ SQLite saving requested but db_manager or table_name not provided.")
+
+
+
+ def info():
+     _script_info(__all__)
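
`optimization_tools` now owns the bounds parsing and the distribution plotting shared by the PSO and EvoTorch paths. A short sketch of how the two public helpers might be combined; the feature names and directories are hypothetical:

```python
# Illustrative sketch of the new public helpers; names and paths are assumed.
from ml_tools.optimization_tools import (
    parse_lower_upper_bounds,
    plot_optimal_feature_distributions,
)

# Each feature maps to its (lower, upper) bound; insertion order is preserved.
lower, upper = parse_lower_upper_bounds({
    "temperature": (20.0, 100.0),
    "pressure": (1.0, 5.0),
})
# lower == [20.0, 1.0]; upper == [100.0, 5.0]

# After optimization runs have written their result CSVs:
plot_optimal_feature_distributions(
    results_dir="optimization_results",  # hypothetical directory of result CSVs
    save_dir="distribution_plots",
)
```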
{dragon_ml_toolbox-4.4.0 → dragon_ml_toolbox-5.0.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "dragon-ml-toolbox"
- version = "4.4.0"
+ version = "5.0.0"
  description = "A collection of tools for data science and machine learning projects."
  authors = [
      { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -27,7 +27,7 @@ base = [
      "joblib"
  ]

- # Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support if pytorch models are used
+ # Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support
  ML = [
      "numpy",
      "pandas",
@@ -46,7 +46,8 @@ ML = [
      "lightgbm",
      "shap",
      "tqdm",
-     "Pillow"
+     "Pillow",
+     "evotorch"
  ]

  # MICE and VIF - Requires a new virtual-env due to dependency version conflicts