dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +72 -34
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_optimization.py
CHANGED
|
@@ -5,7 +5,7 @@ import evotorch
|
|
|
5
5
|
from evotorch.algorithms import SNES, CEM, GeneticAlgorithm
|
|
6
6
|
from evotorch.logging import PandasLogger
|
|
7
7
|
from evotorch.operators import SimulatedBinaryCrossOver, GaussianMutation
|
|
8
|
-
from typing import Literal, Union, Tuple, List, Optional, Any, Callable
|
|
8
|
+
from typing import Literal, Union, Tuple, List, Optional, Any, Callable, Dict
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from tqdm.auto import trange
|
|
11
11
|
from contextlib import nullcontext
|
|
@@ -17,19 +17,216 @@ from ._script_info import _script_info
|
|
|
17
17
|
from .ML_inference import PyTorchInferenceHandler
|
|
18
18
|
from .keys import PyTorchInferenceKeys
|
|
19
19
|
from .SQL import DatabaseManager
|
|
20
|
-
from .optimization_tools import _save_result
|
|
21
|
-
from .utilities import
|
|
20
|
+
from .optimization_tools import _save_result, create_optimization_bounds
|
|
21
|
+
from .utilities import save_dataframe_filename
|
|
22
|
+
from .math_utilities import discretize_categorical_values
|
|
23
|
+
from ._schema import FeatureSchema
|
|
24
|
+
|
|
22
25
|
|
|
23
26
|
__all__ = [
|
|
27
|
+
"MLOptimizer",
|
|
28
|
+
"FitnessEvaluator",
|
|
24
29
|
"create_pytorch_problem",
|
|
25
30
|
"run_optimization"
|
|
26
31
|
]
|
|
27
32
|
|
|
28
33
|
|
|
34
|
+
class MLOptimizer:
|
|
35
|
+
"""
|
|
36
|
+
A wrapper class for setting up and running EvoTorch optimization tasks.
|
|
37
|
+
|
|
38
|
+
This class combines the functionality of `FitnessEvaluator`, `create_pytorch_problem`, and
|
|
39
|
+
`run_optimization` into a single, streamlined workflow.
|
|
40
|
+
|
|
41
|
+
SNES and CEM algorithms do not accept bounds; for them, the given bounds are used only as an initial starting point.
|
|
42
|
+
|
|
43
|
+
Example:
|
|
44
|
+
>>> # 1. Get the final schema from data exploration
|
|
45
|
+
>>> schema = data_exploration.finalize_feature_schema(...)
|
|
46
|
+
>>> # 2. Define bounds for continuous features
|
|
47
|
+
>>> cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}
|
|
48
|
+
>>>
|
|
49
|
+
>>> # 3. Initialize the optimizer
|
|
50
|
+
>>> optimizer = MLOptimizer(
|
|
51
|
+
... inference_handler=my_handler,
|
|
52
|
+
... schema=schema,
|
|
53
|
+
... continuous_bounds_map=cont_bounds,
|
|
54
|
+
... task="max",
|
|
55
|
+
... algorithm="Genetic",
|
|
56
|
+
... )
|
|
57
|
+
>>> # 4. Run the optimization
|
|
58
|
+
>>> best_result = optimizer.run(
|
|
59
|
+
... num_generations=100,
|
|
60
|
+
... target_name="my_target",
|
|
61
|
+
... save_dir="/path/to/results",
|
|
62
|
+
... save_format="csv"
|
|
63
|
+
... )
|
|
64
|
+
"""
|
|
65
|
+
def __init__(self,
|
|
66
|
+
inference_handler: PyTorchInferenceHandler,
|
|
67
|
+
schema: FeatureSchema,
|
|
68
|
+
continuous_bounds_map: Dict[str, Tuple[float, float]],
|
|
69
|
+
task: Literal["min", "max"],
|
|
70
|
+
algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
|
|
71
|
+
population_size: int = 200,
|
|
72
|
+
discretize_start_at_zero: bool = True,
|
|
73
|
+
**searcher_kwargs):
|
|
74
|
+
"""
|
|
75
|
+
Initializes the optimizer by creating the EvoTorch problem and searcher.
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
inference_handler (PyTorchInferenceHandler):
|
|
79
|
+
An initialized inference handler containing the model.
|
|
80
|
+
schema (FeatureSchema):
|
|
81
|
+
The definitive schema object from data_exploration.
|
|
82
|
+
continuous_bounds_map (Dict[str, Tuple[float, float]]):
|
|
83
|
+
A dictionary mapping the *name* of each **continuous** feature
|
|
84
|
+
to its (min_bound, max_bound) tuple.
|
|
85
|
+
task (str): The optimization goal, either "min" or "max".
|
|
86
|
+
algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
|
|
87
|
+
population_size (int): Population size for CEM and GeneticAlgorithm.
|
|
88
|
+
discretize_start_at_zero (bool):
|
|
89
|
+
True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
|
|
90
|
+
False if it starts at 1 (e.g., [1, 2, 3]).
|
|
91
|
+
**searcher_kwargs: Additional keyword arguments for the selected
|
|
92
|
+
search algorithm's constructor.
|
|
93
|
+
"""
|
|
94
|
+
# --- Store schema ---
|
|
95
|
+
self.schema = schema
|
|
96
|
+
|
|
97
|
+
# --- 1. Create bounds from schema ---
|
|
98
|
+
# This is the new, robust way to get bounds
|
|
99
|
+
bounds = create_optimization_bounds(
|
|
100
|
+
schema=schema,
|
|
101
|
+
continuous_bounds_map=continuous_bounds_map,
|
|
102
|
+
start_at_zero=discretize_start_at_zero
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# --- 2. Make a fitness function ---
|
|
106
|
+
self.evaluator = FitnessEvaluator(
|
|
107
|
+
inference_handler=inference_handler,
|
|
108
|
+
# Get categorical info from the schema
|
|
109
|
+
categorical_index_map=schema.categorical_index_map,
|
|
110
|
+
discretize_start_at_zero=discretize_start_at_zero
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
# --- 3. Create the problem and searcher factory ---
|
|
114
|
+
self.problem, self.searcher_factory = create_pytorch_problem(
|
|
115
|
+
evaluator=self.evaluator,
|
|
116
|
+
bounds=bounds,
|
|
117
|
+
task=task,
|
|
118
|
+
algorithm=algorithm,
|
|
119
|
+
population_size=population_size,
|
|
120
|
+
**searcher_kwargs
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# --- 4. Store other info needed by run() ---
|
|
124
|
+
self.discretize_start_at_zero = discretize_start_at_zero
|
|
125
|
+
|
|
126
|
+
def run(self,
|
|
127
|
+
num_generations: int,
|
|
128
|
+
target_name: str,
|
|
129
|
+
save_dir: Union[str, Path],
|
|
130
|
+
save_format: Literal['csv', 'sqlite', 'both'],
|
|
131
|
+
repetitions: int = 1,
|
|
132
|
+
verbose: bool = True) -> Optional[dict]:
|
|
133
|
+
"""
|
|
134
|
+
Runs the evolutionary optimization process using the pre-configured settings.
|
|
135
|
+
|
|
136
|
+
The `feature_names` are automatically pulled from the `FeatureSchema`
|
|
137
|
+
provided during initialization.
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
num_generations (int): The total number of generations for each repetition.
|
|
141
|
+
target_name (str): Target name used for the CSV filename and/or SQL table.
|
|
142
|
+
save_dir (str | Path): The directory where result files will be saved.
|
|
143
|
+
save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
|
|
144
|
+
repetitions (int): The number of independent times to run the optimization.
|
|
145
|
+
verbose (bool): If True, enables detailed logging.
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
Optional[dict]: A dictionary with the best result if repetitions is 1,
|
|
149
|
+
otherwise None.
|
|
150
|
+
"""
|
|
151
|
+
# Call the existing run function, passing info from the schema
|
|
152
|
+
return run_optimization(
|
|
153
|
+
problem=self.problem,
|
|
154
|
+
searcher_factory=self.searcher_factory,
|
|
155
|
+
num_generations=num_generations,
|
|
156
|
+
target_name=target_name,
|
|
157
|
+
save_dir=save_dir,
|
|
158
|
+
save_format=save_format,
|
|
159
|
+
# Get the definitive feature names (as a list) from the schema
|
|
160
|
+
feature_names=list(self.schema.feature_names),
|
|
161
|
+
# Get categorical info from the schema
|
|
162
|
+
categorical_map=self.schema.categorical_index_map,
|
|
163
|
+
categorical_mappings=self.schema.categorical_mappings,
|
|
164
|
+
repetitions=repetitions,
|
|
165
|
+
verbose=verbose,
|
|
166
|
+
discretize_start_at_zero=self.discretize_start_at_zero
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class FitnessEvaluator:
|
|
171
|
+
"""
|
|
172
|
+
A callable class that wraps the PyTorch model inference handler and performs
|
|
173
|
+
on-the-fly discretization for the EvoTorch fitness function.
|
|
174
|
+
|
|
175
|
+
This class is automatically instantiated by MLOptimizer and passed to
|
|
176
|
+
create_pytorch_problem, encapsulating the evaluation logic.
|
|
177
|
+
"""
|
|
178
|
+
def __init__(self,
|
|
179
|
+
inference_handler: PyTorchInferenceHandler,
|
|
180
|
+
categorical_index_map: Optional[Dict[int, int]] = None,
|
|
181
|
+
discretize_start_at_zero: bool = True):
|
|
182
|
+
"""
|
|
183
|
+
Initializes the fitness evaluator.
|
|
184
|
+
|
|
185
|
+
Args:
|
|
186
|
+
inference_handler (PyTorchInferenceHandler):
|
|
187
|
+
An initialized inference handler containing the model.
|
|
188
|
+
categorical_index_map (Dict[int, int] | None):
|
|
189
|
+
Maps {column_index: cardinality} for discretization.
|
|
190
|
+
discretize_start_at_zero (bool):
|
|
191
|
+
True if discrete encoding starts at 0.
|
|
192
|
+
"""
|
|
193
|
+
self.inference_handler = inference_handler
|
|
194
|
+
self.categorical_index_map = categorical_index_map
|
|
195
|
+
self.discretize_start_at_zero = discretize_start_at_zero
|
|
196
|
+
|
|
197
|
+
# Expose the device
|
|
198
|
+
self.device = self.inference_handler.device
|
|
199
|
+
|
|
200
|
+
def __call__(self, solution_tensor: torch.Tensor) -> torch.Tensor:
|
|
201
|
+
"""
|
|
202
|
+
This is the fitness function EvoTorch will call.
|
|
203
|
+
|
|
204
|
+
It receives a batch of continuous solutions, discretizes the
|
|
205
|
+
categorical ones, and returns the model's predictions.
|
|
206
|
+
"""
|
|
207
|
+
# Clone to avoid modifying the optimizer's internal state (SNES, CEM, GA)
|
|
208
|
+
processed_tensor = solution_tensor.clone()
|
|
209
|
+
|
|
210
|
+
if self.categorical_index_map:
|
|
211
|
+
for col_idx, cardinality in self.categorical_index_map.items():
|
|
212
|
+
# 1. Round (using torch.floor(x + 0.5) for "round half up" behavior)
|
|
213
|
+
rounded_col = torch.floor(processed_tensor[:, col_idx] + 0.5)
|
|
214
|
+
|
|
215
|
+
# 2. Determine clamping bounds
|
|
216
|
+
min_bound = 0 if self.discretize_start_at_zero else 1
|
|
217
|
+
max_bound = cardinality - 1 if self.discretize_start_at_zero else cardinality
|
|
218
|
+
|
|
219
|
+
# 3. Clamp the values and update the processed tensor
|
|
220
|
+
processed_tensor[:, col_idx] = torch.clamp(rounded_col, min_bound, max_bound)
|
|
221
|
+
|
|
222
|
+
# Use the *processed_tensor* for prediction
|
|
223
|
+
predictions = self.inference_handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
|
|
224
|
+
return predictions.flatten()
|
|
225
|
+
|
|
226
|
+
|
|
29
227
|
def create_pytorch_problem(
|
|
30
|
-
|
|
228
|
+
evaluator: FitnessEvaluator,
|
|
31
229
|
bounds: Tuple[List[float], List[float]],
|
|
32
|
-
binary_features: int,
|
|
33
230
|
task: Literal["min", "max"],
|
|
34
231
|
algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
|
|
35
232
|
population_size: int = 200,
|
|
@@ -38,14 +235,14 @@ def create_pytorch_problem(
|
|
|
38
235
|
"""
|
|
39
236
|
Creates and configures an EvoTorch Problem and a Searcher factory class for a PyTorch model.
|
|
40
237
|
|
|
41
|
-
SNES and CEM do not accept bounds, the given bounds will be used as initial
|
|
238
|
+
SNES and CEM do not accept bounds; for them, the given bounds are used only as an initial starting point.
|
|
42
239
|
|
|
43
240
|
The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
|
|
44
241
|
|
|
45
242
|
Args:
|
|
46
|
-
|
|
243
|
+
evaluator (FitnessEvaluator): A callable class that wraps the model inference and handles on-the-fly discretization.
|
|
47
244
|
bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
|
|
48
|
-
|
|
245
|
+
Use the `optimization_tools.create_optimization_bounds()` helper to easily generate this and ensure unbiased categorical bounds.
|
|
49
246
|
task (str): The optimization goal, either "min" or "max".
|
|
50
247
|
algorithm (str): The search algorithm to use.
|
|
51
248
|
population_size (int): Used for CEM and GeneticAlgorithm.
|
|
@@ -60,26 +257,14 @@ def create_pytorch_problem(
|
|
|
60
257
|
lower_bounds = list(bounds[0])
|
|
61
258
|
upper_bounds = list(bounds[1])
|
|
62
259
|
|
|
63
|
-
# add binary bounds
|
|
64
|
-
if binary_features > 0:
|
|
65
|
-
lower_bounds.extend([0.45] * binary_features)
|
|
66
|
-
upper_bounds.extend([0.55] * binary_features)
|
|
67
|
-
|
|
68
260
|
solution_length = len(lower_bounds)
|
|
69
|
-
device =
|
|
261
|
+
device = evaluator.device
|
|
70
262
|
|
|
71
|
-
# Define the fitness function that EvoTorch will call.
|
|
72
|
-
def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
|
|
73
|
-
# Directly use the continuous-valued tensor from the optimizer for prediction
|
|
74
|
-
predictions = inference_handler.predict_batch(solution_tensor)[PyTorchInferenceKeys.PREDICTIONS]
|
|
75
|
-
return predictions.flatten()
|
|
76
|
-
|
|
77
|
-
|
|
78
263
|
# Create the Problem instance.
|
|
79
264
|
if algorithm == "CEM" or algorithm == "SNES":
|
|
80
265
|
problem = evotorch.Problem(
|
|
81
266
|
objective_sense=task,
|
|
82
|
-
objective_func=
|
|
267
|
+
objective_func=evaluator,
|
|
83
268
|
solution_length=solution_length,
|
|
84
269
|
initial_bounds=(lower_bounds, upper_bounds),
|
|
85
270
|
device=device,
|
|
@@ -105,7 +290,7 @@ def create_pytorch_problem(
|
|
|
105
290
|
elif algorithm == "Genetic":
|
|
106
291
|
problem = evotorch.Problem(
|
|
107
292
|
objective_sense=task,
|
|
108
|
-
objective_func=
|
|
293
|
+
objective_func=evaluator,
|
|
109
294
|
solution_length=solution_length,
|
|
110
295
|
bounds=(lower_bounds, upper_bounds),
|
|
111
296
|
device=device,
|
|
@@ -141,12 +326,14 @@ def run_optimization(
|
|
|
141
326
|
searcher_factory: Callable[[],Any],
|
|
142
327
|
num_generations: int,
|
|
143
328
|
target_name: str,
|
|
144
|
-
binary_features: int,
|
|
145
329
|
save_dir: Union[str, Path],
|
|
146
330
|
save_format: Literal['csv', 'sqlite', 'both'],
|
|
147
331
|
feature_names: Optional[List[str]],
|
|
148
332
|
repetitions: int = 1,
|
|
149
|
-
verbose: bool = True
|
|
333
|
+
verbose: bool = True,
|
|
334
|
+
categorical_map: Optional[Dict[int, int]] = None,
|
|
335
|
+
categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None,
|
|
336
|
+
discretize_start_at_zero: bool = True
|
|
150
337
|
) -> Optional[dict]:
|
|
151
338
|
"""
|
|
152
339
|
Runs the evolutionary optimization process, with support for multiple repetitions.
|
|
@@ -169,7 +356,6 @@ def run_optimization(
|
|
|
169
356
|
searcher_factory (Callable): The searcher factory to generate fresh evolutionary algorithms.
|
|
170
357
|
num_generations (int): The total number of generations to run the search algorithm for in each repetition.
|
|
171
358
|
target_name (str): Target name that will also be used for the CSV filename and SQL table.
|
|
172
|
-
binary_features (int): Number of binary features located at the END of the feature vector.
|
|
173
359
|
save_dir (str | Path): The directory where the result file(s) will be saved.
|
|
174
360
|
save_format (Literal['csv', 'sqlite', 'both'], optional): The format for
|
|
175
361
|
saving results during iterative analysis.
|
|
@@ -179,13 +365,18 @@ def run_optimization(
|
|
|
179
365
|
repetitions (int, optional): The number of independent times to run the
|
|
180
366
|
entire optimization process.
|
|
181
367
|
verbose (bool): If True, attach an EvoTorch PandasLogger and save its log as a CSV file. Only the first repetition is logged.
|
|
368
|
+
categorical_map (Dict[int, int] | None): Used to discretize values after optimization. Maps {column_index: cardinality}.
|
|
369
|
+
categorical_mappings (Dict[str, Dict[str, int]] | None): Maps each categorical feature name to its {category_string: integer} encoding. Used to map discrete integer values back to their category strings (e.g., 0 -> 'Category_A') before saving.
|
|
370
|
+
discretize_start_at_zero (bool):
|
|
371
|
+
True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
|
|
372
|
+
False if it starts at 1 (e.g., [1, 2, 3]).
|
|
182
373
|
|
|
183
374
|
Returns:
|
|
184
375
|
Optional[dict]: A dictionary containing the best feature values and the
|
|
185
376
|
fitness score if `repetitions` is 1. Returns `None` if `repetitions`
|
|
186
377
|
is greater than 1, as results are streamed to files instead.
|
|
187
378
|
"""
|
|
188
|
-
#
|
|
379
|
+
# --- 1. Setup Paths and Feature Names ---
|
|
189
380
|
save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
190
381
|
|
|
191
382
|
sanitized_target_name = sanitize_filename(target_name)
|
|
@@ -193,54 +384,38 @@ def run_optimization(
|
|
|
193
384
|
sanitized_target_name = sanitized_target_name + ".csv"
|
|
194
385
|
|
|
195
386
|
csv_path = save_path / sanitized_target_name
|
|
196
|
-
|
|
197
387
|
db_path = save_path / "Optimization.db"
|
|
198
388
|
db_table_name = target_name
|
|
199
389
|
|
|
200
|
-
#
|
|
390
|
+
# Use problem's solution_length to create default names if none provided
|
|
201
391
|
if feature_names is None:
|
|
202
|
-
|
|
392
|
+
feat_len = problem.solution_length
|
|
393
|
+
feature_names = [f"feature_{i}" for i in range(feat_len)] # type: ignore
|
|
203
394
|
|
|
395
|
+
# --- 2. Run Optimization ---
|
|
204
396
|
# --- SINGLE RUN LOGIC ---
|
|
205
397
|
if repetitions <= 1:
|
|
206
|
-
|
|
207
|
-
_LOGGER.info(f"🤖 Starting optimization with {searcher.__class__.__name__} Algorithm for {num_generations} generations...")
|
|
208
|
-
# for _ in trange(num_generations, desc="Optimizing"):
|
|
209
|
-
# searcher.step()
|
|
398
|
+
_LOGGER.info(f"🤖 Starting optimization for {num_generations} generations...")
|
|
210
399
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
# print(searcher.status[status_key])
|
|
221
|
-
# print()
|
|
222
|
-
|
|
223
|
-
# Get results from the .status dictionary
|
|
224
|
-
# SNES and CEM use the key 'center' to get mean values if needed best_solution_tensor = searcher.status["center"]
|
|
225
|
-
best_solution_container = searcher.status["pop_best"]
|
|
226
|
-
best_solution_tensor = best_solution_container.values
|
|
227
|
-
best_fitness = best_solution_container.evals
|
|
228
|
-
|
|
229
|
-
best_solution_np = best_solution_tensor.cpu().numpy()
|
|
230
|
-
|
|
231
|
-
# threshold binary features
|
|
232
|
-
if binary_features > 0:
|
|
233
|
-
best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
|
|
234
|
-
else:
|
|
235
|
-
best_solution_thresholded = best_solution_np
|
|
236
|
-
|
|
237
|
-
result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
|
|
238
|
-
result_dict[target_name] = best_fitness.item()
|
|
400
|
+
result_dict, pandas_logger = _run_single_optimization_rep(
|
|
401
|
+
searcher_factory=searcher_factory,
|
|
402
|
+
num_generations=num_generations,
|
|
403
|
+
feature_names=feature_names,
|
|
404
|
+
target_name=target_name,
|
|
405
|
+
categorical_map=categorical_map,
|
|
406
|
+
discretize_start_at_zero=discretize_start_at_zero,
|
|
407
|
+
attach_logger=verbose
|
|
408
|
+
)
|
|
239
409
|
|
|
240
|
-
|
|
410
|
+
# A single run is always saved as CSV (the `save_format` argument is not consulted here); pass the mappings so categorical values can be reverse-mapped.
|
|
411
|
+
_save_result(
|
|
412
|
+
result_dict=result_dict,
|
|
413
|
+
save_format='csv',
|
|
414
|
+
csv_path=csv_path,
|
|
415
|
+
categorical_mappings=categorical_mappings
|
|
416
|
+
)
|
|
241
417
|
|
|
242
|
-
|
|
243
|
-
if verbose:
|
|
418
|
+
if pandas_logger:
|
|
244
419
|
_handle_pandas_log(pandas_logger, save_path=save_path, target_name=target_name)
|
|
245
420
|
|
|
246
421
|
_LOGGER.info(f"Optimization complete. Best solution saved to '{csv_path.name}'")
|
|
@@ -249,60 +424,109 @@ def run_optimization(
|
|
|
249
424
|
# --- MULTIPLE REPETITIONS LOGIC ---
|
|
250
425
|
else:
|
|
251
426
|
_LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")
|
|
252
|
-
|
|
427
|
+
|
|
428
|
+
first_run_logger = None # To store the logger from the first rep
|
|
253
429
|
db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
|
|
254
430
|
|
|
255
431
|
with db_context as db_manager:
|
|
432
|
+
# --- Setup Database Schema (if applicable) ---
|
|
256
433
|
if db_manager:
|
|
257
|
-
schema = {
|
|
434
|
+
schema = {}
|
|
435
|
+
categorical_cols = set(categorical_mappings.keys()) if categorical_mappings else set()
|
|
436
|
+
|
|
437
|
+
for name in feature_names:
|
|
438
|
+
schema[name] = "TEXT" if name in categorical_cols else "REAL"
|
|
258
439
|
schema[target_name] = "REAL"
|
|
440
|
+
|
|
259
441
|
db_manager.create_table(db_table_name, schema)
|
|
260
442
|
|
|
443
|
+
# --- Repetitions Loop ---
|
|
261
444
|
print("")
|
|
262
|
-
# Repetitions loop
|
|
263
|
-
pandas_logger = None
|
|
264
445
|
for i in trange(repetitions, desc="Repetitions"):
|
|
265
|
-
# CRITICAL: Create a fresh searcher for each run using the factory
|
|
266
|
-
searcher = searcher_factory()
|
|
267
|
-
|
|
268
|
-
# Attach logger if requested
|
|
269
|
-
if verbose and i==0:
|
|
270
|
-
pandas_logger = PandasLogger(searcher)
|
|
271
446
|
|
|
272
|
-
|
|
447
|
+
# Only attach a logger for the first repetition if verbose
|
|
448
|
+
attach_logger = verbose and (i == 0)
|
|
273
449
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
450
|
+
result_dict, pandas_logger = _run_single_optimization_rep(
|
|
451
|
+
searcher_factory=searcher_factory,
|
|
452
|
+
num_generations=num_generations,
|
|
453
|
+
feature_names=feature_names,
|
|
454
|
+
target_name=target_name,
|
|
455
|
+
categorical_map=categorical_map,
|
|
456
|
+
discretize_start_at_zero=discretize_start_at_zero,
|
|
457
|
+
attach_logger=attach_logger
|
|
458
|
+
)
|
|
281
459
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
|
|
285
|
-
else:
|
|
286
|
-
best_solution_thresholded = best_solution_np
|
|
287
|
-
|
|
288
|
-
# make results dictionary
|
|
289
|
-
result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
|
|
290
|
-
result_dict[target_name] = best_fitness.item()
|
|
460
|
+
if pandas_logger:
|
|
461
|
+
first_run_logger = pandas_logger
|
|
291
462
|
|
|
292
463
|
# Save each result incrementally
|
|
293
|
-
_save_result(
|
|
464
|
+
_save_result(
|
|
465
|
+
result_dict=result_dict,
|
|
466
|
+
save_format=save_format,
|
|
467
|
+
csv_path=csv_path,
|
|
468
|
+
db_manager=db_manager,
|
|
469
|
+
db_table_name=db_table_name,
|
|
470
|
+
categorical_mappings=categorical_mappings
|
|
471
|
+
)
|
|
294
472
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
_handle_pandas_log(pandas_logger, save_path=save_path, target_name=target_name)
|
|
473
|
+
if first_run_logger:
|
|
474
|
+
_handle_pandas_log(first_run_logger, save_path=save_path, target_name=target_name)
|
|
298
475
|
|
|
299
476
|
_LOGGER.info(f"Optimal solution space complete. Results saved to '{save_path}'")
|
|
300
477
|
return None
|
|
301
478
|
|
|
302
479
|
|
|
480
|
+
def _run_single_optimization_rep(
|
|
481
|
+
searcher_factory: Callable[[],Any],
|
|
482
|
+
num_generations: int,
|
|
483
|
+
feature_names: List[str],
|
|
484
|
+
target_name: str,
|
|
485
|
+
categorical_map: Optional[Dict[int, int]],
|
|
486
|
+
discretize_start_at_zero: bool,
|
|
487
|
+
attach_logger: bool
|
|
488
|
+
) -> Tuple[dict, Optional[PandasLogger]]:
|
|
489
|
+
"""
|
|
490
|
+
Internal helper to run one full optimization repetition.
|
|
491
|
+
|
|
492
|
+
Handles searcher creation, logging, running, and result post-processing.
|
|
493
|
+
"""
|
|
494
|
+
# CRITICAL: Create a fresh searcher for each run using the factory
|
|
495
|
+
searcher = searcher_factory()
|
|
496
|
+
|
|
497
|
+
# Attach logger if requested
|
|
498
|
+
pandas_logger = PandasLogger(searcher) if attach_logger else None
|
|
499
|
+
|
|
500
|
+
# Run the optimization
|
|
501
|
+
searcher.run(num_generations)
|
|
502
|
+
|
|
503
|
+
# Get the best result
|
|
504
|
+
best_solution_container = searcher.status["pop_best"]
|
|
505
|
+
best_solution_tensor = best_solution_container.values
|
|
506
|
+
best_fitness = best_solution_container.evals
|
|
507
|
+
|
|
508
|
+
best_solution_np = best_solution_tensor.cpu().numpy()
|
|
509
|
+
|
|
510
|
+
# Discretize categorical/binary features
|
|
511
|
+
if categorical_map:
|
|
512
|
+
best_solution_thresholded = discretize_categorical_values(
|
|
513
|
+
input_array=best_solution_np,
|
|
514
|
+
categorical_info=categorical_map,
|
|
515
|
+
start_at_zero=discretize_start_at_zero
|
|
516
|
+
)
|
|
517
|
+
else:
|
|
518
|
+
best_solution_thresholded = best_solution_np
|
|
519
|
+
|
|
520
|
+
# Format results into a dictionary
|
|
521
|
+
result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
|
|
522
|
+
result_dict[target_name] = best_fitness.item()
|
|
523
|
+
|
|
524
|
+
return result_dict, pandas_logger
|
|
525
|
+
|
|
526
|
+
|
|
303
527
|
def _handle_pandas_log(logger: PandasLogger, save_path: Path, target_name: str):
|
|
304
528
|
log_dataframe = logger.to_dataframe()
|
|
305
|
-
|
|
529
|
+
save_dataframe_filename(df=log_dataframe, save_dir=save_path / "EvolutionLogs", filename=target_name)
|
|
306
530
|
|
|
307
531
|
|
|
308
532
|
def info():
|
ml_tools/ML_scaler.py
CHANGED
|
@@ -2,14 +2,17 @@ import torch
|
|
|
2
2
|
from torch.utils.data import Dataset, DataLoader
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Union, List, Optional
|
|
5
|
+
|
|
5
6
|
from ._logger import _LOGGER
|
|
6
7
|
from ._script_info import _script_info
|
|
7
8
|
from .path_manager import make_fullpath
|
|
8
9
|
|
|
10
|
+
|
|
9
11
|
__all__ = [
|
|
10
12
|
"PytorchScaler"
|
|
11
13
|
]
|
|
12
14
|
|
|
15
|
+
|
|
13
16
|
class PytorchScaler:
|
|
14
17
|
"""
|
|
15
18
|
Standardizes continuous features in a PyTorch dataset by subtracting the
|
|
@@ -149,24 +152,25 @@ class PytorchScaler:
|
|
|
149
152
|
|
|
150
153
|
return data_clone
|
|
151
154
|
|
|
152
|
-
def save(self, filepath: Union[str, Path]):
|
|
155
|
+
def save(self, filepath: Union[str, Path], verbose: bool=True):
|
|
153
156
|
"""
|
|
154
157
|
Saves the scaler's state (mean, std, indices) to a .pth file.
|
|
155
158
|
|
|
156
159
|
Args:
|
|
157
160
|
filepath (str | Path): The path to save the file.
|
|
158
161
|
"""
|
|
159
|
-
path_obj = make_fullpath(filepath)
|
|
162
|
+
path_obj = make_fullpath(filepath, make=True, enforce="file")
|
|
160
163
|
state = {
|
|
161
164
|
'mean': self.mean_,
|
|
162
165
|
'std': self.std_,
|
|
163
166
|
'continuous_feature_indices': self.continuous_feature_indices
|
|
164
167
|
}
|
|
165
168
|
torch.save(state, path_obj)
|
|
166
|
-
|
|
169
|
+
if verbose:
|
|
170
|
+
_LOGGER.info(f"PytorchScaler state saved as '{path_obj.name}'.")
|
|
167
171
|
|
|
168
172
|
@staticmethod
|
|
169
|
-
def load(filepath: Union[str, Path]) -> 'PytorchScaler':
|
|
173
|
+
def load(filepath: Union[str, Path], verbose: bool=True) -> 'PytorchScaler':
|
|
170
174
|
"""
|
|
171
175
|
Loads a scaler's state from a .pth file.
|
|
172
176
|
|
|
@@ -178,7 +182,8 @@ class PytorchScaler:
|
|
|
178
182
|
"""
|
|
179
183
|
path_obj = make_fullpath(filepath, enforce="file")
|
|
180
184
|
state = torch.load(path_obj)
|
|
181
|
-
|
|
185
|
+
if verbose:
|
|
186
|
+
_LOGGER.info(f"PytorchScaler state loaded from '{path_obj.name}'.")
|
|
182
187
|
return PytorchScaler(
|
|
183
188
|
mean=state['mean'],
|
|
184
189
|
std=state['std'],
|