dragon-ml-toolbox 4.5.0__py3-none-any.whl → 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-4.5.0.dist-info → dragon_ml_toolbox-5.0.0.dist-info}/METADATA +4 -1
- {dragon_ml_toolbox-4.5.0.dist-info → dragon_ml_toolbox-5.0.0.dist-info}/RECORD +10 -8
- ml_tools/ML_optimization.py +236 -0
- ml_tools/PSO_optimization.py +8 -141
- ml_tools/optimization_tools.py +137 -0
- {dragon_ml_toolbox-4.5.0.dist-info → dragon_ml_toolbox-5.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-4.5.0.dist-info → dragon_ml_toolbox-5.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-4.5.0.dist-info → dragon_ml_toolbox-5.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-4.5.0.dist-info → dragon_ml_toolbox-5.0.0.dist-info}/top_level.txt +0 -0
- /ml_tools/{datasetmaster.py → ML_datasetmaster.py} +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 4.5.0
|
|
3
|
+
Version: 5.0.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -36,6 +36,7 @@ Requires-Dist: lightgbm; extra == "ml"
|
|
|
36
36
|
Requires-Dist: shap; extra == "ml"
|
|
37
37
|
Requires-Dist: tqdm; extra == "ml"
|
|
38
38
|
Requires-Dist: Pillow; extra == "ml"
|
|
39
|
+
Requires-Dist: evotorch; extra == "ml"
|
|
39
40
|
Provides-Extra: mice
|
|
40
41
|
Requires-Dist: numpy<2.0; extra == "mice"
|
|
41
42
|
Requires-Dist: pandas; extra == "mice"
|
|
@@ -204,6 +205,7 @@ pip install "dragon-ml-toolbox[gui-boost,plot]"
|
|
|
204
205
|
#### Modules:
|
|
205
206
|
|
|
206
207
|
```Bash
|
|
208
|
+
custom_logger
|
|
207
209
|
GUI_tools
|
|
208
210
|
ensemble_inference
|
|
209
211
|
path_manager
|
|
@@ -224,6 +226,7 @@ pip install "dragon-ml-toolbox[gui-torch,plot]"
|
|
|
224
226
|
#### Modules:
|
|
225
227
|
|
|
226
228
|
```Bash
|
|
229
|
+
custom_logger
|
|
227
230
|
GUI_tools
|
|
228
231
|
ML_inference
|
|
229
232
|
path_manager
|
|
@@ -1,13 +1,15 @@
|
|
|
1
|
-
dragon_ml_toolbox-4.5.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
-
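dragon_ml_toolbox-4.5.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837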
|
|
1
|
+
dragon_ml_toolbox-5.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-5.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
|
|
3
3
|
ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
|
|
4
4
|
ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
|
|
5
5
|
ml_tools/MICE_imputation.py,sha256=b6ZTs8RedXFifOpuMCzr68xM16mCBVh1Ua6kcGfiVtg,11462
|
|
6
6
|
ml_tools/ML_callbacks.py,sha256=0a-Rbr0Xp_B1FNopOKBBmuJ4MqazS5JgDiT7wx1dHvE,13161
|
|
7
|
+
ml_tools/ML_datasetmaster.py,sha256=_tNC2v98eCQGr3nMW_EFs83TRgRme8Uc7ttg1vosmQU,30106
|
|
7
8
|
ml_tools/ML_evaluation.py,sha256=4dVqe6JF1Ukmk1sAcY8E5EG1oB1_oy2HXE5OT-pZwCs,10273
|
|
8
9
|
ml_tools/ML_inference.py,sha256=Fh-X2UQn3AznWBjf-7iPSxwE-EzkGQm1VEIRUAkURmE,5336
|
|
10
|
+
ml_tools/ML_optimization.py,sha256=u3H-TYGycKDdog-njkMfiAxd8TBtmGeLLFplBPRmmxk,10057
|
|
9
11
|
ml_tools/ML_trainer.py,sha256=dJjMfCEEM07Txy9KEH-2srZ3CZUa4lFWTJhpNWQ4Ndk,14974
|
|
10
|
-
ml_tools/PSO_optimization.py,sha256=
|
|
12
|
+
ml_tools/PSO_optimization.py,sha256=stH2Ux1sftQgX5EwLc85kHcoT4Rmz6zv7sH2yzf4Zrw,22710
|
|
11
13
|
ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
|
|
12
14
|
ml_tools/SQL.py,sha256=9zzS6AFEJM9aj6nE31hDe8S9TqLonk-J1amwZoiHNbk,10468
|
|
13
15
|
ml_tools/VIF_factor.py,sha256=2nUMupfUoogf8o6ghoFZk_OwWhFXU0R3C9Gj0HOlI14,10415
|
|
@@ -17,14 +19,14 @@ ml_tools/_pytorch_models.py,sha256=ewPPsTHgmRPzMMWwObZOdH1vxm2Ij2VWZP38NC6zSH4,1
|
|
|
17
19
|
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
18
20
|
ml_tools/custom_logger.py,sha256=njM_0XPbQ1S-x5LeSQAaTo2if-XVOR_pQSGg4EDeiTU,4603
|
|
19
21
|
ml_tools/data_exploration.py,sha256=qc_Oolxco2x9IhlYu5zPIuVBGiBw65HnypuGm8cQOOM,23677
|
|
20
|
-
ml_tools/datasetmaster.py,sha256=_tNC2v98eCQGr3nMW_EFs83TRgRme8Uc7ttg1vosmQU,30106
|
|
21
22
|
ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
|
|
22
23
|
ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
|
|
23
24
|
ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,13007
|
|
24
25
|
ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
|
|
26
|
+
ml_tools/optimization_tools.py,sha256=MuT4OG7_r1QqLUti-yYix7QeCpglezD0oe9BDCq0QXk,5086
|
|
25
27
|
ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
|
|
26
28
|
ml_tools/utilities.py,sha256=mz-M351DzxWxnYVcLX-7ZQ6c-RGoCV9g4VTS9Qif2Es,18348
|
|
27
|
-
dragon_ml_toolbox-
|
|
28
|
-
dragon_ml_toolbox-
|
|
29
|
-
dragon_ml_toolbox-
|
|
30
|
-
dragon_ml_toolbox-
|
|
29
|
+
dragon_ml_toolbox-5.0.0.dist-info/METADATA,sha256=N9-274zMIAQmEfaNoET6Ydj96huYS9_twKLBnl37bic,6639
|
|
30
|
+
dragon_ml_toolbox-5.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
31
|
+
dragon_ml_toolbox-5.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
32
|
+
dragon_ml_toolbox-5.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import numpy
|
|
3
|
+
import evotorch
|
|
4
|
+
from evotorch.algorithms import CMAES, SteadyStateGA
|
|
5
|
+
from evotorch.logging import StdOutLogger
|
|
6
|
+
from typing import Literal, Union, Tuple, List, Optional
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from tqdm.auto import trange
|
|
9
|
+
from contextlib import nullcontext
|
|
10
|
+
|
|
11
|
+
from .path_manager import make_fullpath, sanitize_filename
|
|
12
|
+
from ._logger import _LOGGER
|
|
13
|
+
from ._script_info import _script_info
|
|
14
|
+
from .ML_inference import PyTorchInferenceHandler
|
|
15
|
+
from .keys import PyTorchInferenceKeys
|
|
16
|
+
from .SQL import DatabaseManager
|
|
17
|
+
from .optimization_tools import _save_result
|
|
18
|
+
from .utilities import threshold_binary_values
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"create_pytorch_problem",
|
|
23
|
+
"run_optimization"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def create_pytorch_problem(
    handler: PyTorchInferenceHandler,
    bounds: Tuple[List[float], List[float]],
    binary_features: int,
    task: Literal["minimize", "maximize"],
    algorithm: Literal["CMAES", "GA"] = "CMAES",
    verbose: bool = False,
    **searcher_kwargs
) -> Tuple[evotorch.Problem, evotorch.Searcher]:
    """
    Creates and configures an EvoTorch Problem and Searcher for a PyTorch model.

    The objective function rounds the trailing `binary_features` columns of each
    candidate solution with `torch.round` and then scores the batch with
    `handler.predict_batch`.

    Args:
        handler (PyTorchInferenceHandler): An initialized inference handler
            containing the model and weights. Its `device` is used for the problem.
        bounds (tuple[list[float], list[float]]): A tuple containing the lower
            and upper bounds for the solution features. The lists are copied and
            are NOT modified by this function.
        binary_features (int): Number of binary features located at the END of the
            feature vector. For each one, an initial bound pair of (0.45, 0.55) is
            appended to the copied bounds.
        task (str): The optimization goal, either "minimize" or "maximize".
        algorithm (str): The search algorithm to use, "CMAES" or "GA" (SteadyStateGA).
        verbose (bool): Add an Evotorch logger for real-time console updates.
        **searcher_kwargs: Additional keyword arguments to pass to the
            selected search algorithm's constructor (e.g., stdev_init=0.5 for CMAES).

    Returns:
        A tuple containing the configured evotorch.Problem and evotorch.Searcher.

    Raises:
        ValueError: If the lower and upper bounds have different lengths, or if
            `algorithm` is neither "CMAES" nor "GA".
    """
    lower_source, upper_source = bounds

    # Work on copies: extending the caller's lists in place would make the binary
    # bounds accumulate every time the same `bounds` object is reused.
    lower_bounds = list(lower_source)
    upper_bounds = list(upper_source)

    if len(lower_bounds) != len(upper_bounds):
        raise ValueError(
            f"Lower and upper bounds must have the same length, "
            f"got {len(lower_bounds)} and {len(upper_bounds)}."
        )

    # add binary bounds
    if binary_features > 0:
        lower_bounds.extend([0.45] * binary_features)
        upper_bounds.extend([0.55] * binary_features)

    solution_length = len(lower_bounds)
    device = handler.device

    # Define the fitness function that EvoTorch will call.
    @evotorch.decorators.to_tensor
    @evotorch.decorators.on_aux_device(device)
    def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
        # Make a mutable copy of the solutions from the optimizer
        processed_tensor = solution_tensor.clone()

        # Apply thresholding if binary features are present
        if binary_features > 0:
            # Isolate the binary part of the tensor (the last n columns)
            binary_part = processed_tensor[:, -binary_features:]

            # Apply rounding to snap values to 0.0 or 1.0
            processed_tensor[:, -binary_features:] = torch.round(binary_part)

        # Use the processed tensor (with thresholded values) for prediction
        predictions = handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
        return predictions.flatten()

    # Create the Problem instance.
    problem = evotorch.Problem(
        objective_sense=task,
        objective_func=fitness_func,
        solution_length=solution_length,
        initial_bounds=(lower_bounds, upper_bounds),
        device=device,
    )

    # Create the selected searcher instance.
    if algorithm == "CMAES":
        searcher = CMAES(problem, **searcher_kwargs)
    elif algorithm == "GA":
        searcher = SteadyStateGA(problem, **searcher_kwargs)
    else:
        raise ValueError(f"Unknown algorithm '{algorithm}'. Choose 'CMAES' or 'GA'.")

    # Add a logger for real-time console updates.
    # This gives the user immediate feedback on the optimization progress.
    if verbose:
        _ = StdOutLogger(searcher)

    return problem, searcher
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _best_result_as_dict(searcher, feature_names, target_name, binary_features):
    """Private helper: turn `searcher.best` into a {feature name: value, target name: fitness} dict."""
    best_solution_tensor, best_fitness = searcher.best
    best_solution_np = best_solution_tensor.cpu().numpy()

    # threshold binary features
    if binary_features > 0:
        best_solution_np = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)

    result_dict = dict(zip(feature_names, best_solution_np))
    result_dict[target_name] = best_fitness.item()
    return result_dict


def run_optimization(
    problem: evotorch.Problem,
    searcher: evotorch.Searcher,
    num_generations: int,
    target_name: str,
    binary_features: int,
    save_dir: Union[str, Path],
    save_format: Literal['csv', 'sqlite', 'both'],
    feature_names: Optional[List[str]],
    repetitions: int = 1
) -> Optional[dict]:
    """
    Runs the evolutionary optimization process, with support for multiple repetitions.

    This function serves as the main engine for the optimization task. It takes a
    configured Problem and a Searcher from EvoTorch and executes the optimization
    for a specified number of generations.

    It has two modes of operation:
    1. **Single Run (repetitions <= 1):** Executes the optimization once, saves the
       single best result to a CSV file (regardless of `save_format`), and returns
       it as a dictionary.
    2. **Iterative Analysis (repetitions > 1):** Executes the optimization
       multiple times, calling `searcher.reset()` before each repetition. The
       result of each run is saved incrementally in the requested format(s)
       (CSV and/or SQLite database). In this mode, the function returns None.

    Args:
        problem (evotorch.Problem): The configured problem instance, which defines
            the objective function, solution space, and optimization sense.
        searcher (evotorch.Searcher): The configured searcher instance, which
            contains the evolutionary algorithm (e.g., CMAES, GA).
        num_generations (int): The total number of generations to run the
            search algorithm for in each repetition.
        target_name (str): Target name that will also be used for the CSV filename and SQL table.
        binary_features (int): Number of binary features located at the END of the feature vector.
        save_dir (str | Path): The directory where the result file(s) will be saved.
        save_format (Literal['csv', 'sqlite', 'both']): The format for saving results
            during iterative analysis. Required (no default).
        feature_names (List[str] | None): Names of the solution features for
            labeling the output files. Required argument; pass None to get generic
            names like 'feature_0', 'feature_1', etc.
        repetitions (int, optional): The number of independent times to run the
            entire optimization process. Defaults to 1.

    Returns:
        Optional[dict]: A dictionary containing the best feature values and the
        fitness score for a single run. Returns `None` if `repetitions`
        is greater than 1, as results are streamed to files instead.

    Raises:
        ValueError: If `feature_names` is given and its length differs from
            `problem.solution_length`.
    """
    # preprocess feature names (validated first so a bad call fails before any work is done)
    solution_length = problem.solution_length
    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(solution_length)]
    elif solution_length is not None and len(feature_names) != solution_length:
        # zip() would silently drop the surplus names/values and mislabel the output.
        raise ValueError(
            f"Expected {solution_length} feature names (binary features included), "
            f"got {len(feature_names)}."
        )

    # preprocess paths
    save_path = make_fullpath(save_dir, make=True, enforce="directory")

    sanitized_target_name = sanitize_filename(target_name)
    if not sanitized_target_name.endswith(".csv"):
        sanitized_target_name = sanitized_target_name + ".csv"

    csv_path = save_path / sanitized_target_name

    db_path = save_path / "Optimization.db"
    db_table_name = target_name

    # --- SINGLE RUN LOGIC ---
    if repetitions <= 1:
        _LOGGER.info(f"🤖 Starting optimization with {searcher.__class__.__name__} for {num_generations} generations...")
        for _ in trange(num_generations, desc="Optimizing"):
            searcher.step()

        result_dict = _best_result_as_dict(searcher, feature_names, target_name, binary_features)

        _save_result(result_dict, 'csv', csv_path)  # Single run defaults to CSV
        _LOGGER.info(f"✅ Optimization complete. Best solution saved to '{csv_path.name}'")
        return result_dict

    # --- MULTIPLE REPETITIONS LOGIC ---
    _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")

    # Only open the database when an SQLite output was requested.
    db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()

    with db_context as db_manager:
        if db_manager:
            schema = {name: "REAL" for name in feature_names}
            schema[target_name] = "REAL"
            db_manager.create_table(db_table_name, schema)

        for i in trange(repetitions, desc="Repetitions"):
            _LOGGER.info(f"--- Starting Repetition {i+1}/{repetitions} ---")

            # CRITICAL: Re-initialize the searcher to ensure each run is independent
            searcher.reset()

            for _ in range(num_generations):  # Inner loop does not need a progress bar
                searcher.step()

            result_dict = _best_result_as_dict(searcher, feature_names, target_name, binary_features)

            # Save each result incrementally
            _save_result(result_dict, save_format, csv_path, db_manager, db_table_name)

    _LOGGER.info(f"✅ Optimal solution space complete. Results saved to '{save_path}'")
    return None
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def info():
    """Pass this module's public names (`__all__`) to `_script_info`."""
    public_names = __all__
    _script_info(public_names)
|
ml_tools/PSO_optimization.py
CHANGED
|
@@ -2,32 +2,27 @@ import numpy as np
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
import xgboost as xgb
|
|
4
4
|
import lightgbm as lgb
|
|
5
|
-
from typing import Literal, Union, Tuple, Dict, Optional
|
|
6
|
-
import pandas as pd
|
|
5
|
+
from typing import Literal, Union, Tuple, Dict, Optional
|
|
7
6
|
from copy import deepcopy
|
|
8
7
|
from .utilities import (
|
|
9
8
|
threshold_binary_values,
|
|
10
9
|
threshold_binary_values_batch,
|
|
11
|
-
deserialize_object
|
|
12
|
-
|
|
13
|
-
from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension, list_csv_paths
|
|
10
|
+
deserialize_object)
|
|
11
|
+
from .path_manager import sanitize_filename, make_fullpath, list_files_by_extension
|
|
14
12
|
import torch
|
|
15
13
|
from tqdm import trange
|
|
16
|
-
import matplotlib.pyplot as plt
|
|
17
|
-
import seaborn as sns
|
|
18
14
|
from ._logger import _LOGGER
|
|
19
15
|
from .keys import ModelSaveKeys
|
|
20
16
|
from ._script_info import _script_info
|
|
21
17
|
from .SQL import DatabaseManager
|
|
22
18
|
from contextlib import nullcontext
|
|
19
|
+
from .optimization_tools import _save_result
|
|
23
20
|
|
|
24
21
|
|
|
25
22
|
__all__ = [
|
|
26
23
|
"ObjectiveFunction",
|
|
27
24
|
"multiple_objective_functions_from_dir",
|
|
28
|
-
"
|
|
29
|
-
"run_pso",
|
|
30
|
-
"plot_optimal_feature_distributions"
|
|
25
|
+
"run_pso"
|
|
31
26
|
]
|
|
32
27
|
|
|
33
28
|
|
|
@@ -170,18 +165,6 @@ def multiple_objective_functions_from_dir(directory: Union[str,Path], add_noise:
|
|
|
170
165
|
return objective_functions, objective_function_names
|
|
171
166
|
|
|
172
167
|
|
|
173
|
-
def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
|
|
174
|
-
"""
|
|
175
|
-
Parse lower and upper boundaries, returning 2 lists:
|
|
176
|
-
|
|
177
|
-
`lower_bounds`, `upper_bounds`
|
|
178
|
-
"""
|
|
179
|
-
lower = [low[0] for low in source.values()]
|
|
180
|
-
upper = [up[1] for up in source.values()]
|
|
181
|
-
|
|
182
|
-
return lower, upper
|
|
183
|
-
|
|
184
|
-
|
|
185
168
|
def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
|
|
186
169
|
assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
|
|
187
170
|
assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
|
|
@@ -198,45 +181,6 @@ def _set_feature_names(size: int, names: Union[list[str], None]):
|
|
|
198
181
|
return names
|
|
199
182
|
|
|
200
183
|
|
|
201
|
-
def _save_result(result_dict: dict,
|
|
202
|
-
save_format: Literal['csv', 'sqlite', 'both'],
|
|
203
|
-
csv_path: Path,
|
|
204
|
-
db_manager: Optional[DatabaseManager] = None,
|
|
205
|
-
db_table_name: Optional[str] = None):
|
|
206
|
-
"""
|
|
207
|
-
Handles saving a single result to CSV, SQLite, or both.
|
|
208
|
-
"""
|
|
209
|
-
# Save to CSV
|
|
210
|
-
if save_format in ['csv', 'both']:
|
|
211
|
-
_save_or_append_to_csv(result_dict, csv_path)
|
|
212
|
-
|
|
213
|
-
# Save to SQLite
|
|
214
|
-
if save_format in ['sqlite', 'both']:
|
|
215
|
-
if db_manager and db_table_name:
|
|
216
|
-
db_manager.insert_row(db_table_name, result_dict)
|
|
217
|
-
else:
|
|
218
|
-
_LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
def _save_or_append_to_csv(data_dict: dict, save_path: Path):
|
|
222
|
-
"""
|
|
223
|
-
Saves or appends a dictionary of data as a single row to a CSV file.
|
|
224
|
-
|
|
225
|
-
If the file doesn't exist, it creates it and writes the header.
|
|
226
|
-
If the file exists, it appends the new data without the header.
|
|
227
|
-
"""
|
|
228
|
-
df_row = pd.DataFrame([data_dict])
|
|
229
|
-
|
|
230
|
-
file_exists = save_path.exists()
|
|
231
|
-
|
|
232
|
-
df_row.to_csv(
|
|
233
|
-
save_path,
|
|
234
|
-
mode='a', # 'a' for append mode
|
|
235
|
-
index=False, # Don't write the DataFrame index
|
|
236
|
-
header=not file_exists # Write header only if file does NOT exist
|
|
237
|
-
)
|
|
238
|
-
|
|
239
|
-
|
|
240
184
|
def _run_single_pso(objective_function: ObjectiveFunction, pso_args: dict, feature_names: list[str], target_name: str, random_state: int, save_format: Literal['csv', 'sqlite', 'both'], csv_path: Path, db_manager: Optional[DatabaseManager], db_table_name: str):
|
|
241
185
|
"""Helper for a single PSO run that also handles saving."""
|
|
242
186
|
pso_args.update({"seed": random_state})
|
|
@@ -282,14 +226,14 @@ def run_pso(lower_boundaries: list[float],
|
|
|
282
226
|
upper_boundaries: list[float],
|
|
283
227
|
objective_function: ObjectiveFunction,
|
|
284
228
|
save_results_dir: Union[str,Path],
|
|
285
|
-
save_format: Literal['csv', 'sqlite', 'both']
|
|
229
|
+
save_format: Literal['csv', 'sqlite', 'both'],
|
|
286
230
|
auto_binary_boundaries: bool=True,
|
|
287
231
|
target_name: Union[str, None]=None,
|
|
288
232
|
feature_names: Union[list[str], None]=None,
|
|
289
233
|
swarm_size: int=200,
|
|
290
234
|
max_iterations: int=3000,
|
|
291
235
|
random_state: int=101,
|
|
292
|
-
post_hoc_analysis: Optional[int]=
|
|
236
|
+
post_hoc_analysis: Optional[int]=20) -> Optional[Tuple[Dict[str, float], Dict[str, float]]]:
|
|
293
237
|
"""
|
|
294
238
|
Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
|
|
295
239
|
|
|
@@ -303,7 +247,7 @@ def run_pso(lower_boundaries: list[float],
|
|
|
303
247
|
A callable object encapsulating a tree-based regression model.
|
|
304
248
|
save_results_dir : str | Path
|
|
305
249
|
Directory path to save the results CSV file.
|
|
306
|
-
save_format : {'csv', 'sqlite', 'both'}
|
|
250
|
+
save_format : {'csv', 'sqlite', 'both'}
|
|
307
251
|
The format for saving optimization results.
|
|
308
252
|
- 'csv': Saves results to a CSV file.
|
|
309
253
|
- 'sqlite': Saves results to an SQLite database file. ⚠️ If a database exists, new tables will be created using the target name.
|
|
@@ -578,83 +522,6 @@ def _pso(func: ObjectiveFunction,
|
|
|
578
522
|
return best_position, best_score
|
|
579
523
|
|
|
580
524
|
|
|
581
|
-
def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
|
|
582
|
-
"""
|
|
583
|
-
Analyzes optimization results and plots the distribution of optimal values for each feature.
|
|
584
|
-
|
|
585
|
-
For features with more than two unique values, this function generates a color-coded
|
|
586
|
-
Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
|
|
587
|
-
showing relative frequency.
|
|
588
|
-
|
|
589
|
-
Parameters
|
|
590
|
-
----------
|
|
591
|
-
results_dir : str or Path
|
|
592
|
-
The path to the directory containing the optimization result CSV files.
|
|
593
|
-
save_dir : str or Path
|
|
594
|
-
The directory where the output plots will be saved.
|
|
595
|
-
"""
|
|
596
|
-
# Check results_dir and create output path
|
|
597
|
-
results_path = make_fullpath(results_dir)
|
|
598
|
-
output_path = make_fullpath(save_dir, make=True)
|
|
599
|
-
|
|
600
|
-
# Check that the directory contains csv files
|
|
601
|
-
list_csv_paths(results_path, verbose=False)
|
|
602
|
-
|
|
603
|
-
# --- Data Loading and Preparation ---
|
|
604
|
-
_LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
|
|
605
|
-
data_to_plot = []
|
|
606
|
-
for df, df_name in yield_dataframes_from_dir(results_path):
|
|
607
|
-
melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
|
|
608
|
-
melted_df['target'] = df_name.replace("Optimization_", "")
|
|
609
|
-
data_to_plot.append(melted_df)
|
|
610
|
-
|
|
611
|
-
long_df = pd.concat(data_to_plot, ignore_index=True)
|
|
612
|
-
features = long_df['feature'].unique()
|
|
613
|
-
_LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
|
|
614
|
-
|
|
615
|
-
# --- Plotting Loop ---
|
|
616
|
-
for feature_name in features:
|
|
617
|
-
plt.figure(figsize=(12, 7))
|
|
618
|
-
feature_df = long_df[long_df['feature'] == feature_name]
|
|
619
|
-
|
|
620
|
-
# Check if the feature is binary or constant
|
|
621
|
-
if feature_df['value'].nunique() <= 2:
|
|
622
|
-
# PLOT 1: For discrete values, calculate percentages and use a true bar plot.
|
|
623
|
-
# This ensures the X-axis is clean (e.g., just 0 and 1).
|
|
624
|
-
norm_df = (feature_df.groupby('target')['value']
|
|
625
|
-
.value_counts(normalize=True)
|
|
626
|
-
.mul(100)
|
|
627
|
-
.rename('percent')
|
|
628
|
-
.reset_index())
|
|
629
|
-
|
|
630
|
-
ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
|
|
631
|
-
|
|
632
|
-
plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
|
|
633
|
-
plt.ylabel("Frequency (%)", fontsize=12)
|
|
634
|
-
ax.set_ylim(0, 100) # Set Y-axis from 0 to 100
|
|
635
|
-
|
|
636
|
-
else:
|
|
637
|
-
# PLOT 2: KDE plot for continuous values.
|
|
638
|
-
ax = sns.kdeplot(data=feature_df, x='value', hue='target',
|
|
639
|
-
fill=True, alpha=0.1, warn_singular=False)
|
|
640
|
-
|
|
641
|
-
plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
|
|
642
|
-
plt.ylabel("Density", fontsize=12) # Y-axis is "Density" for KDE plots
|
|
643
|
-
|
|
644
|
-
# --- Common settings for both plot types ---
|
|
645
|
-
plt.xlabel("Feature Value", fontsize=12)
|
|
646
|
-
plt.grid(axis='y', alpha=0.5, linestyle='--')
|
|
647
|
-
|
|
648
|
-
legend = ax.get_legend()
|
|
649
|
-
if legend:
|
|
650
|
-
legend.set_title('Target')
|
|
651
|
-
|
|
652
|
-
sanitized_feature_name = sanitize_filename(feature_name)
|
|
653
|
-
plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
|
|
654
|
-
plt.savefig(plot_filename, bbox_inches='tight')
|
|
655
|
-
plt.close()
|
|
656
|
-
|
|
657
|
-
_LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
|
|
658
525
|
|
|
659
526
|
|
|
660
527
|
def info():
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import seaborn as sns
|
|
3
|
+
from typing import Union, Any, Literal, Optional
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from .path_manager import make_fullpath, list_csv_paths, sanitize_filename
|
|
8
|
+
from .utilities import yield_dataframes_from_dir
|
|
9
|
+
from ._logger import _LOGGER
|
|
10
|
+
from ._script_info import _script_info
|
|
11
|
+
from .SQL import DatabaseManager
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"parse_lower_upper_bounds",
|
|
16
|
+
"plot_optimal_feature_distributions"
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
    """
    Split a mapping of `(lower, upper)` pairs into two parallel lists.

    The lists follow the iteration order of `source.values()`; the keys are ignored.

    Returns:
        `lower_bounds`, `upper_bounds`
    """
    lower_bounds = []
    upper_bounds = []
    for bound_pair in source.values():
        lower_bounds.append(bound_pair[0])
        upper_bounds.append(bound_pair[1])

    return lower_bounds, upper_bounds
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir: Union[str, Path]):
    """
    Plot, per feature, how the optimal values found by the optimization runs are distributed.

    Every dataframe yielded from `results_dir` contributes all of its columns except the
    last one; the dataframe name (with "Optimization_" removed) is used as the target label.
    A feature with at most two distinct values gets a bar plot of relative frequency (%)
    per target; any other feature gets a filled KDE plot per target.
    One SVG file named "Distribution_<feature>.svg" is written per feature.

    Parameters
    ----------
    results_dir : str or Path
        The path to the directory containing the optimization result CSV files.
    save_dir : str or Path
        The directory where the output plots will be saved.
    """
    # Resolve the input directory and create the output directory
    source_dir = make_fullpath(results_dir)
    plots_dir = make_fullpath(save_dir, make=True)

    # Check that the directory contains csv files
    list_csv_paths(source_dir, verbose=False)

    # --- Data Loading and Preparation ---
    _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
    melted_frames = []
    for frame, frame_name in yield_dataframes_from_dir(source_dir):
        # Drop the last column before reshaping to long format
        long_part = frame.iloc[:, :-1].melt(var_name='feature', value_name='value')
        long_part['target'] = frame_name.replace("Optimization_", "")
        melted_frames.append(long_part)

    long_df = pd.concat(melted_frames, ignore_index=True)
    features = long_df['feature'].unique()
    _LOGGER.info(f"📂 Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")

    # --- Plotting Loop ---
    for feature_name in features:
        plt.figure(figsize=(12, 7))
        feature_rows = long_df[long_df['feature'] == feature_name]
        is_continuous = feature_rows['value'].nunique() > 2

        if is_continuous:
            # KDE plot for continuous values.
            ax = sns.kdeplot(data=feature_rows, x='value', hue='target',
                             fill=True, alpha=0.1, warn_singular=False)

            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
            plt.ylabel("Density", fontsize=12)
        else:
            # Binary or constant feature: percentage of each value per target, as a bar plot,
            # so the X-axis only shows the discrete values.
            value_shares = feature_rows.groupby('target')['value'].value_counts(normalize=True)
            percent_table = value_shares.mul(100).rename('percent').reset_index()

            ax = sns.barplot(data=percent_table, x='value', y='percent', hue='target')

            plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
            plt.ylabel("Frequency (%)", fontsize=12)
            ax.set_ylim(0, 100)

        # --- Common settings for both plot types ---
        plt.xlabel("Feature Value", fontsize=12)
        plt.grid(axis='y', alpha=0.5, linestyle='--')

        legend = ax.get_legend()
        if legend:
            legend.set_title('Target')

        safe_name = sanitize_filename(feature_name)
        target_file = plots_dir / f"Distribution_{safe_name}.svg"
        plt.savefig(target_file, bbox_inches='tight')
        plt.close()

    _LOGGER.info(f"✅ All plots saved successfully to: '{plots_dir}'")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _save_result(
    result_dict: dict,
    save_format: Literal['csv', 'sqlite', 'both'],
    csv_path: Path,
    db_manager: Optional["DatabaseManager"] = None,
    db_table_name: Optional[str] = None
):
    """
    Private helper to handle saving a single result to CSV, SQLite, or both.

    Args:
        result_dict (dict): One result row, mapping column names to values.
        save_format ('csv' | 'sqlite' | 'both'): Which output(s) to write. Any other
            value writes nothing.
        csv_path (Path): CSV file to append the row to. The header is written when the
            file does not exist yet or exists but is empty.
        db_manager (DatabaseManager, optional): Manager used for the SQLite insert.
        db_table_name (str, optional): Table that receives the row.

    If an SQLite output is requested without both `db_manager` and `db_table_name`,
    a warning is logged and the SQLite step is skipped.
    """
    # Save to CSV
    if save_format in ('csv', 'both'):
        # An existing zero-byte file still needs a header; checking only for existence
        # would produce a headerless CSV in that case.
        write_header = (not csv_path.exists()) or csv_path.stat().st_size == 0
        df_row = pd.DataFrame([result_dict])
        df_row.to_csv(csv_path, mode='a', index=False, header=write_header)

    # Save to SQLite
    if save_format in ('sqlite', 'both'):
        if db_manager and db_table_name:
            db_manager.insert_row(db_table_name, result_dict)
        else:
            _LOGGER.warning("⚠️ SQLite saving requested but db_manager or table_name not provided.")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def info():
    """Pass this module's public names (`__all__`) to `_script_info`."""
    public_names = __all__
    _script_info(public_names)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|