dragon-ml-toolbox: dragon_ml_toolbox-10.2.0-py3-none-any.whl → dragon_ml_toolbox-14.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (48)
  1. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
  2. dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
  3. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
  4. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
  5. ml_tools/ETL_cleaning.py +72 -34
  6. ml_tools/ETL_engineering.py +506 -70
  7. ml_tools/GUI_tools.py +2 -1
  8. ml_tools/MICE_imputation.py +212 -7
  9. ml_tools/ML_callbacks.py +73 -40
  10. ml_tools/ML_datasetmaster.py +267 -284
  11. ml_tools/ML_evaluation.py +119 -58
  12. ml_tools/ML_evaluation_multi.py +107 -32
  13. ml_tools/ML_inference.py +15 -5
  14. ml_tools/ML_models.py +234 -170
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +321 -97
  17. ml_tools/ML_scaler.py +10 -5
  18. ml_tools/ML_trainer.py +585 -40
  19. ml_tools/ML_utilities.py +528 -0
  20. ml_tools/ML_vision_datasetmaster.py +1315 -0
  21. ml_tools/ML_vision_evaluation.py +260 -0
  22. ml_tools/ML_vision_inference.py +428 -0
  23. ml_tools/ML_vision_models.py +627 -0
  24. ml_tools/ML_vision_transformers.py +58 -0
  25. ml_tools/PSO_optimization.py +10 -7
  26. ml_tools/RNN_forecast.py +2 -0
  27. ml_tools/SQL.py +22 -9
  28. ml_tools/VIF_factor.py +4 -3
  29. ml_tools/_ML_vision_recipe.py +88 -0
  30. ml_tools/__init__.py +1 -0
  31. ml_tools/_logger.py +0 -2
  32. ml_tools/_schema.py +96 -0
  33. ml_tools/constants.py +79 -0
  34. ml_tools/custom_logger.py +164 -16
  35. ml_tools/data_exploration.py +1092 -109
  36. ml_tools/ensemble_evaluation.py +48 -1
  37. ml_tools/ensemble_inference.py +6 -7
  38. ml_tools/ensemble_learning.py +4 -3
  39. ml_tools/handle_excel.py +1 -0
  40. ml_tools/keys.py +80 -0
  41. ml_tools/math_utilities.py +259 -0
  42. ml_tools/optimization_tools.py +198 -24
  43. ml_tools/path_manager.py +144 -45
  44. ml_tools/serde.py +192 -0
  45. ml_tools/utilities.py +287 -227
  46. dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
  47. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
  48. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_optimization.py CHANGED
@@ -5,7 +5,7 @@ import evotorch
5
5
  from evotorch.algorithms import SNES, CEM, GeneticAlgorithm
6
6
  from evotorch.logging import PandasLogger
7
7
  from evotorch.operators import SimulatedBinaryCrossOver, GaussianMutation
8
- from typing import Literal, Union, Tuple, List, Optional, Any, Callable
8
+ from typing import Literal, Union, Tuple, List, Optional, Any, Callable, Dict
9
9
  from pathlib import Path
10
10
  from tqdm.auto import trange
11
11
  from contextlib import nullcontext
@@ -17,19 +17,216 @@ from ._script_info import _script_info
17
17
  from .ML_inference import PyTorchInferenceHandler
18
18
  from .keys import PyTorchInferenceKeys
19
19
  from .SQL import DatabaseManager
20
- from .optimization_tools import _save_result
21
- from .utilities import threshold_binary_values, save_dataframe
20
+ from .optimization_tools import _save_result, create_optimization_bounds
21
+ from .utilities import save_dataframe_filename
22
+ from .math_utilities import discretize_categorical_values
23
+ from ._schema import FeatureSchema
24
+
22
25
 
23
26
  __all__ = [
27
+ "MLOptimizer",
28
+ "FitnessEvaluator",
24
29
  "create_pytorch_problem",
25
30
  "run_optimization"
26
31
  ]
27
32
 
28
33
 
34
+ class MLOptimizer:
35
+ """
36
+ A wrapper class for setting up and running EvoTorch optimization tasks.
37
+
38
+ This class combines the functionality of `FitnessEvaluator`, `create_pytorch_problem`, and
39
+ `run_optimization` into a single, streamlined workflow.
40
+
41
+ SNES and CEM algorithms do not accept bounds, the given bounds will be used as an initial starting point.
42
+
43
+ Example:
44
+ >>> # 1. Get the final schema from data exploration
45
+ >>> schema = data_exploration.finalize_feature_schema(...)
46
+ >>> # 2. Define bounds for continuous features
47
+ >>> cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}
48
+ >>>
49
+ >>> # 3. Initialize the optimizer
50
+ >>> optimizer = MLOptimizer(
51
+ ... inference_handler=my_handler,
52
+ ... schema=schema,
53
+ ... continuous_bounds_map=cont_bounds,
54
+ ... task="max",
55
+ ... algorithm="Genetic",
56
+ ... )
57
+ >>> # 4. Run the optimization
58
+ >>> best_result = optimizer.run(
59
+ ... num_generations=100,
60
+ ... target_name="my_target",
61
+ ... save_dir="/path/to/results",
62
+ ... save_format="csv"
63
+ ... )
64
+ """
65
+ def __init__(self,
66
+ inference_handler: PyTorchInferenceHandler,
67
+ schema: FeatureSchema,
68
+ continuous_bounds_map: Dict[str, Tuple[float, float]],
69
+ task: Literal["min", "max"],
70
+ algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
71
+ population_size: int = 200,
72
+ discretize_start_at_zero: bool = True,
73
+ **searcher_kwargs):
74
+ """
75
+ Initializes the optimizer by creating the EvoTorch problem and searcher.
76
+
77
+ Args:
78
+ inference_handler (PyTorchInferenceHandler):
79
+ An initialized inference handler containing the model.
80
+ schema (FeatureSchema):
81
+ The definitive schema object from data_exploration.
82
+ continuous_bounds_map (Dict[str, Tuple[float, float]]):
83
+ A dictionary mapping the *name* of each **continuous** feature
84
+ to its (min_bound, max_bound) tuple.
85
+ task (str): The optimization goal, either "min" or "max".
86
+ algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
87
+ population_size (int): Population size for CEM and GeneticAlgorithm.
88
+ discretize_start_at_zero (bool):
89
+ True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
90
+ False if it starts at 1 (e.g., [1, 2, 3]).
91
+ **searcher_kwargs: Additional keyword arguments for the selected
92
+ search algorithm's constructor.
93
+ """
94
+ # --- Store schema ---
95
+ self.schema = schema
96
+
97
+ # --- 1. Create bounds from schema ---
98
+ # This is the new, robust way to get bounds
99
+ bounds = create_optimization_bounds(
100
+ schema=schema,
101
+ continuous_bounds_map=continuous_bounds_map,
102
+ start_at_zero=discretize_start_at_zero
103
+ )
104
+
105
+ # --- 2. Make a fitness function ---
106
+ self.evaluator = FitnessEvaluator(
107
+ inference_handler=inference_handler,
108
+ # Get categorical info from the schema
109
+ categorical_index_map=schema.categorical_index_map,
110
+ discretize_start_at_zero=discretize_start_at_zero
111
+ )
112
+
113
+ # --- 3. Create the problem and searcher factory ---
114
+ self.problem, self.searcher_factory = create_pytorch_problem(
115
+ evaluator=self.evaluator,
116
+ bounds=bounds,
117
+ task=task,
118
+ algorithm=algorithm,
119
+ population_size=population_size,
120
+ **searcher_kwargs
121
+ )
122
+
123
+ # --- 4. Store other info needed by run() ---
124
+ self.discretize_start_at_zero = discretize_start_at_zero
125
+
126
+ def run(self,
127
+ num_generations: int,
128
+ target_name: str,
129
+ save_dir: Union[str, Path],
130
+ save_format: Literal['csv', 'sqlite', 'both'],
131
+ repetitions: int = 1,
132
+ verbose: bool = True) -> Optional[dict]:
133
+ """
134
+ Runs the evolutionary optimization process using the pre-configured settings.
135
+
136
+ The `feature_names` are automatically pulled from the `FeatureSchema`
137
+ provided during initialization.
138
+
139
+ Args:
140
+ num_generations (int): The total number of generations for each repetition.
141
+ target_name (str): Target name used for the CSV filename and/or SQL table.
142
+ save_dir (str | Path): The directory where result files will be saved.
143
+ save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
144
+ repetitions (int): The number of independent times to run the optimization.
145
+ verbose (bool): If True, enables detailed logging.
146
+
147
+ Returns:
148
+ Optional[dict]: A dictionary with the best result if repetitions is 1,
149
+ otherwise None.
150
+ """
151
+ # Call the existing run function, passing info from the schema
152
+ return run_optimization(
153
+ problem=self.problem,
154
+ searcher_factory=self.searcher_factory,
155
+ num_generations=num_generations,
156
+ target_name=target_name,
157
+ save_dir=save_dir,
158
+ save_format=save_format,
159
+ # Get the definitive feature names (as a list) from the schema
160
+ feature_names=list(self.schema.feature_names),
161
+ # Get categorical info from the schema
162
+ categorical_map=self.schema.categorical_index_map,
163
+ categorical_mappings=self.schema.categorical_mappings,
164
+ repetitions=repetitions,
165
+ verbose=verbose,
166
+ discretize_start_at_zero=self.discretize_start_at_zero
167
+ )
168
+
169
+
170
+ class FitnessEvaluator:
171
+ """
172
+ A callable class that wraps the PyTorch model inference handler and performs
173
+ on-the-fly discretization for the EvoTorch fitness function.
174
+
175
+ This class is automatically instantiated by MLOptimizer and passed to
176
+ create_pytorch_problem, encapsulating the evaluation logic.
177
+ """
178
+ def __init__(self,
179
+ inference_handler: PyTorchInferenceHandler,
180
+ categorical_index_map: Optional[Dict[int, int]] = None,
181
+ discretize_start_at_zero: bool = True):
182
+ """
183
+ Initializes the fitness evaluator.
184
+
185
+ Args:
186
+ inference_handler (PyTorchInferenceHandler):
187
+ An initialized inference handler containing the model.
188
+ categorical_index_map (Dict[int, int] | None):
189
+ Maps {column_index: cardinality} for discretization.
190
+ discretize_start_at_zero (bool):
191
+ True if discrete encoding starts at 0.
192
+ """
193
+ self.inference_handler = inference_handler
194
+ self.categorical_index_map = categorical_index_map
195
+ self.discretize_start_at_zero = discretize_start_at_zero
196
+
197
+ # Expose the device
198
+ self.device = self.inference_handler.device
199
+
200
+ def __call__(self, solution_tensor: torch.Tensor) -> torch.Tensor:
201
+ """
202
+ This is the fitness function EvoTorch will call.
203
+
204
+ It receives a batch of continuous solutions, discretizes the
205
+ categorical ones, and returns the model's predictions.
206
+ """
207
+ # Clone to avoid modifying the optimizer's internal state (SNES, CEM, GA)
208
+ processed_tensor = solution_tensor.clone()
209
+
210
+ if self.categorical_index_map:
211
+ for col_idx, cardinality in self.categorical_index_map.items():
212
+ # 1. Round (using torch.floor(x + 0.5) for "round half up" behavior)
213
+ rounded_col = torch.floor(processed_tensor[:, col_idx] + 0.5)
214
+
215
+ # 2. Determine clamping bounds
216
+ min_bound = 0 if self.discretize_start_at_zero else 1
217
+ max_bound = cardinality - 1 if self.discretize_start_at_zero else cardinality
218
+
219
+ # 3. Clamp the values and update the processed tensor
220
+ processed_tensor[:, col_idx] = torch.clamp(rounded_col, min_bound, max_bound)
221
+
222
+ # Use the *processed_tensor* for prediction
223
+ predictions = self.inference_handler.predict_batch(processed_tensor)[PyTorchInferenceKeys.PREDICTIONS]
224
+ return predictions.flatten()
225
+
226
+
29
227
  def create_pytorch_problem(
30
- inference_handler: PyTorchInferenceHandler,
228
+ evaluator: FitnessEvaluator,
31
229
  bounds: Tuple[List[float], List[float]],
32
- binary_features: int,
33
230
  task: Literal["min", "max"],
34
231
  algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
35
232
  population_size: int = 200,
@@ -38,14 +235,14 @@ def create_pytorch_problem(
38
235
  """
39
236
  Creates and configures an EvoTorch Problem and a Searcher factory class for a PyTorch model.
40
237
 
41
- SNES and CEM do not accept bounds, the given bounds will be used as initial bounds only.
238
+ SNES and CEM do not accept bounds, the given bounds will be used as an initial starting point.
42
239
 
43
240
  The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
44
241
 
45
242
  Args:
46
- inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
243
+ evaluator (FitnessEvaluator): A callable class that wraps the model inference and handles on-the-fly discretization.
47
244
  bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
48
- binary_features (int): Number of binary features located at the END of the feature vector. Will be automatically added to the bounds.
245
+ Use the `optimization_tools.create_optimization_bounds()` helper to easily generate this and ensure unbiased categorical bounds.
49
246
  task (str): The optimization goal, either "minimize" or "maximize".
50
247
  algorithm (str): The search algorithm to use.
51
248
  population_size (int): Used for CEM and GeneticAlgorithm.
@@ -60,26 +257,14 @@ def create_pytorch_problem(
60
257
  lower_bounds = list(bounds[0])
61
258
  upper_bounds = list(bounds[1])
62
259
 
63
- # add binary bounds
64
- if binary_features > 0:
65
- lower_bounds.extend([0.45] * binary_features)
66
- upper_bounds.extend([0.55] * binary_features)
67
-
68
260
  solution_length = len(lower_bounds)
69
- device = inference_handler.device
261
+ device = evaluator.device
70
262
 
71
- # Define the fitness function that EvoTorch will call.
72
- def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
73
- # Directly use the continuous-valued tensor from the optimizer for prediction
74
- predictions = inference_handler.predict_batch(solution_tensor)[PyTorchInferenceKeys.PREDICTIONS]
75
- return predictions.flatten()
76
-
77
-
78
263
  # Create the Problem instance.
79
264
  if algorithm == "CEM" or algorithm == "SNES":
80
265
  problem = evotorch.Problem(
81
266
  objective_sense=task,
82
- objective_func=fitness_func,
267
+ objective_func=evaluator,
83
268
  solution_length=solution_length,
84
269
  initial_bounds=(lower_bounds, upper_bounds),
85
270
  device=device,
@@ -105,7 +290,7 @@ def create_pytorch_problem(
105
290
  elif algorithm == "Genetic":
106
291
  problem = evotorch.Problem(
107
292
  objective_sense=task,
108
- objective_func=fitness_func,
293
+ objective_func=evaluator,
109
294
  solution_length=solution_length,
110
295
  bounds=(lower_bounds, upper_bounds),
111
296
  device=device,
@@ -141,12 +326,14 @@ def run_optimization(
141
326
  searcher_factory: Callable[[],Any],
142
327
  num_generations: int,
143
328
  target_name: str,
144
- binary_features: int,
145
329
  save_dir: Union[str, Path],
146
330
  save_format: Literal['csv', 'sqlite', 'both'],
147
331
  feature_names: Optional[List[str]],
148
332
  repetitions: int = 1,
149
- verbose: bool = True
333
+ verbose: bool = True,
334
+ categorical_map: Optional[Dict[int, int]] = None,
335
+ categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None,
336
+ discretize_start_at_zero: bool = True
150
337
  ) -> Optional[dict]:
151
338
  """
152
339
  Runs the evolutionary optimization process, with support for multiple repetitions.
@@ -169,7 +356,6 @@ def run_optimization(
169
356
  searcher_factory (Callable): The searcher factory to generate fresh evolutionary algorithms.
170
357
  num_generations (int): The total number of generations to run the search algorithm for in each repetition.
171
358
  target_name (str): Target name that will also be used for the CSV filename and SQL table.
172
- binary_features (int): Number of binary features located at the END of the feature vector.
173
359
  save_dir (str | Path): The directory where the result file(s) will be saved.
174
360
  save_format (Literal['csv', 'sqlite', 'both'], optional): The format for
175
361
  saving results during iterative analysis.
@@ -179,13 +365,18 @@ def run_optimization(
179
365
  repetitions (int, optional): The number of independent times to run the
180
366
  entire optimization process.
181
367
  verbose (bool): Add an Evotorch Pandas logger saved as a csv. Only for the first repetition.
368
+ categorical_index_map (Dict[int, int] | None): Used to discretize values after optimization. Maps {column_index: cardinality}.
369
+ categorical_mappings (Dict[str, Dict[str, int]] | None): Used to map discrete integer values back to strings (e.g., {0: 'Category_A'}) before saving.
370
+ discretize_start_at_zero (bool):
371
+ True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
372
+ False if it starts at 1 (e.g., [1, 2, 3]).
182
373
 
183
374
  Returns:
184
375
  Optional[dict]: A dictionary containing the best feature values and the
185
376
  fitness score if `repetitions` is 1. Returns `None` if `repetitions`
186
377
  is greater than 1, as results are streamed to files instead.
187
378
  """
188
- # preprocess paths
379
+ # --- 1. Setup Paths and Feature Names ---
189
380
  save_path = make_fullpath(save_dir, make=True, enforce="directory")
190
381
 
191
382
  sanitized_target_name = sanitize_filename(target_name)
@@ -193,54 +384,38 @@ def run_optimization(
193
384
  sanitized_target_name = sanitized_target_name + ".csv"
194
385
 
195
386
  csv_path = save_path / sanitized_target_name
196
-
197
387
  db_path = save_path / "Optimization.db"
198
388
  db_table_name = target_name
199
389
 
200
- # preprocess feature names
390
+ # Use problem's solution_length to create default names if none provided
201
391
  if feature_names is None:
202
- feature_names = [f"feature_{i}" for i in range(problem.solution_length)] # type: ignore
392
+ feat_len = problem.solution_length
393
+ feature_names = [f"feature_{i}" for i in range(feat_len)] # type: ignore
203
394
 
395
+ # --- 2. Run Optimization ---
204
396
  # --- SINGLE RUN LOGIC ---
205
397
  if repetitions <= 1:
206
- searcher = searcher_factory()
207
- _LOGGER.info(f"🤖 Starting optimization with {searcher.__class__.__name__} Algorithm for {num_generations} generations...")
208
- # for _ in trange(num_generations, desc="Optimizing"):
209
- # searcher.step()
398
+ _LOGGER.info(f"🤖 Starting optimization for {num_generations} generations...")
210
399
 
211
- # Attach logger if requested
212
- if verbose:
213
- pandas_logger = PandasLogger(searcher)
214
-
215
- searcher.run(num_generations) # Use the built-in run method for simplicity
216
-
217
- # # DEBUG new searcher objects
218
- # for status_key in searcher.iter_status_keys():
219
- # print("===", status_key, "===")
220
- # print(searcher.status[status_key])
221
- # print()
222
-
223
- # Get results from the .status dictionary
224
- # SNES and CEM use the key 'center' to get mean values if needed best_solution_tensor = searcher.status["center"]
225
- best_solution_container = searcher.status["pop_best"]
226
- best_solution_tensor = best_solution_container.values
227
- best_fitness = best_solution_container.evals
228
-
229
- best_solution_np = best_solution_tensor.cpu().numpy()
230
-
231
- # threshold binary features
232
- if binary_features > 0:
233
- best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
234
- else:
235
- best_solution_thresholded = best_solution_np
236
-
237
- result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
238
- result_dict[target_name] = best_fitness.item()
400
+ result_dict, pandas_logger = _run_single_optimization_rep(
401
+ searcher_factory=searcher_factory,
402
+ num_generations=num_generations,
403
+ feature_names=feature_names,
404
+ target_name=target_name,
405
+ categorical_map=categorical_map,
406
+ discretize_start_at_zero=discretize_start_at_zero,
407
+ attach_logger=verbose
408
+ )
239
409
 
240
- _save_result(result_dict, 'csv', csv_path) # Single run defaults to CSV
410
+ # Single run defaults to CSV, pass mappings for reverse mapping
411
+ _save_result(
412
+ result_dict=result_dict,
413
+ save_format='csv',
414
+ csv_path=csv_path,
415
+ categorical_mappings=categorical_mappings
416
+ )
241
417
 
242
- # Process logger
243
- if verbose:
418
+ if pandas_logger:
244
419
  _handle_pandas_log(pandas_logger, save_path=save_path, target_name=target_name)
245
420
 
246
421
  _LOGGER.info(f"Optimization complete. Best solution saved to '{csv_path.name}'")
@@ -249,60 +424,109 @@ def run_optimization(
249
424
  # --- MULTIPLE REPETITIONS LOGIC ---
250
425
  else:
251
426
  _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")
252
-
427
+
428
+ first_run_logger = None # To store the logger from the first rep
253
429
  db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
254
430
 
255
431
  with db_context as db_manager:
432
+ # --- Setup Database Schema (if applicable) ---
256
433
  if db_manager:
257
- schema = {name: "REAL" for name in feature_names}
434
+ schema = {}
435
+ categorical_cols = set(categorical_mappings.keys()) if categorical_mappings else set()
436
+
437
+ for name in feature_names:
438
+ schema[name] = "TEXT" if name in categorical_cols else "REAL"
258
439
  schema[target_name] = "REAL"
440
+
259
441
  db_manager.create_table(db_table_name, schema)
260
442
 
443
+ # --- Repetitions Loop ---
261
444
  print("")
262
- # Repetitions loop
263
- pandas_logger = None
264
445
  for i in trange(repetitions, desc="Repetitions"):
265
- # CRITICAL: Create a fresh searcher for each run using the factory
266
- searcher = searcher_factory()
267
-
268
- # Attach logger if requested
269
- if verbose and i==0:
270
- pandas_logger = PandasLogger(searcher)
271
446
 
272
- searcher.run(num_generations) # Use the built-in run method for simplicity
447
+ # Only attach a logger for the first repetition if verbose
448
+ attach_logger = verbose and (i == 0)
273
449
 
274
- # Get results from the .status dictionary
275
- # SNES and CEM use the key 'center' to get mean values if needed best_solution_tensor = searcher.status["center"]
276
- best_solution_container = searcher.status["pop_best"]
277
- best_solution_tensor = best_solution_container.values
278
- best_fitness = best_solution_container.evals
279
-
280
- best_solution_np = best_solution_tensor.cpu().numpy()
450
+ result_dict, pandas_logger = _run_single_optimization_rep(
451
+ searcher_factory=searcher_factory,
452
+ num_generations=num_generations,
453
+ feature_names=feature_names,
454
+ target_name=target_name,
455
+ categorical_map=categorical_map,
456
+ discretize_start_at_zero=discretize_start_at_zero,
457
+ attach_logger=attach_logger
458
+ )
281
459
 
282
- # threshold binary features
283
- if binary_features > 0:
284
- best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
285
- else:
286
- best_solution_thresholded = best_solution_np
287
-
288
- # make results dictionary
289
- result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
290
- result_dict[target_name] = best_fitness.item()
460
+ if pandas_logger:
461
+ first_run_logger = pandas_logger
291
462
 
292
463
  # Save each result incrementally
293
- _save_result(result_dict, save_format, csv_path, db_manager, db_table_name)
464
+ _save_result(
465
+ result_dict=result_dict,
466
+ save_format=save_format,
467
+ csv_path=csv_path,
468
+ db_manager=db_manager,
469
+ db_table_name=db_table_name,
470
+ categorical_mappings=categorical_mappings
471
+ )
294
472
 
295
- # Process logger
296
- if pandas_logger is not None:
297
- _handle_pandas_log(pandas_logger, save_path=save_path, target_name=target_name)
473
+ if first_run_logger:
474
+ _handle_pandas_log(first_run_logger, save_path=save_path, target_name=target_name)
298
475
 
299
476
  _LOGGER.info(f"Optimal solution space complete. Results saved to '{save_path}'")
300
477
  return None
301
478
 
302
479
 
480
+ def _run_single_optimization_rep(
481
+ searcher_factory: Callable[[],Any],
482
+ num_generations: int,
483
+ feature_names: List[str],
484
+ target_name: str,
485
+ categorical_map: Optional[Dict[int, int]],
486
+ discretize_start_at_zero: bool,
487
+ attach_logger: bool
488
+ ) -> Tuple[dict, Optional[PandasLogger]]:
489
+ """
490
+ Internal helper to run one full optimization repetition.
491
+
492
+ Handles searcher creation, logging, running, and result post-processing.
493
+ """
494
+ # CRITICAL: Create a fresh searcher for each run using the factory
495
+ searcher = searcher_factory()
496
+
497
+ # Attach logger if requested
498
+ pandas_logger = PandasLogger(searcher) if attach_logger else None
499
+
500
+ # Run the optimization
501
+ searcher.run(num_generations)
502
+
503
+ # Get the best result
504
+ best_solution_container = searcher.status["pop_best"]
505
+ best_solution_tensor = best_solution_container.values
506
+ best_fitness = best_solution_container.evals
507
+
508
+ best_solution_np = best_solution_tensor.cpu().numpy()
509
+
510
+ # Discretize categorical/binary features
511
+ if categorical_map:
512
+ best_solution_thresholded = discretize_categorical_values(
513
+ input_array=best_solution_np,
514
+ categorical_info=categorical_map,
515
+ start_at_zero=discretize_start_at_zero
516
+ )
517
+ else:
518
+ best_solution_thresholded = best_solution_np
519
+
520
+ # Format results into a dictionary
521
+ result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
522
+ result_dict[target_name] = best_fitness.item()
523
+
524
+ return result_dict, pandas_logger
525
+
526
+
303
527
  def _handle_pandas_log(logger: PandasLogger, save_path: Path, target_name: str):
304
528
  log_dataframe = logger.to_dataframe()
305
- save_dataframe(df=log_dataframe, save_dir=save_path / "EvolutionLogs", filename=target_name)
529
+ save_dataframe_filename(df=log_dataframe, save_dir=save_path / "EvolutionLogs", filename=target_name)
306
530
 
307
531
 
308
532
  def info():
ml_tools/ML_scaler.py CHANGED
@@ -2,14 +2,17 @@ import torch
2
2
  from torch.utils.data import Dataset, DataLoader
3
3
  from pathlib import Path
4
4
  from typing import Union, List, Optional
5
+
5
6
  from ._logger import _LOGGER
6
7
  from ._script_info import _script_info
7
8
  from .path_manager import make_fullpath
8
9
 
10
+
9
11
  __all__ = [
10
12
  "PytorchScaler"
11
13
  ]
12
14
 
15
+
13
16
  class PytorchScaler:
14
17
  """
15
18
  Standardizes continuous features in a PyTorch dataset by subtracting the
@@ -149,24 +152,25 @@ class PytorchScaler:
149
152
 
150
153
  return data_clone
151
154
 
152
- def save(self, filepath: Union[str, Path]):
155
+ def save(self, filepath: Union[str, Path], verbose: bool=True):
153
156
  """
154
157
  Saves the scaler's state (mean, std, indices) to a .pth file.
155
158
 
156
159
  Args:
157
160
  filepath (str | Path): The path to save the file.
158
161
  """
159
- path_obj = make_fullpath(filepath)
162
+ path_obj = make_fullpath(filepath, make=True, enforce="file")
160
163
  state = {
161
164
  'mean': self.mean_,
162
165
  'std': self.std_,
163
166
  'continuous_feature_indices': self.continuous_feature_indices
164
167
  }
165
168
  torch.save(state, path_obj)
166
- _LOGGER.info(f"PytorchScaler state saved to '{path_obj.name}'.")
169
+ if verbose:
170
+ _LOGGER.info(f"PytorchScaler state saved as '{path_obj.name}'.")
167
171
 
168
172
  @staticmethod
169
- def load(filepath: Union[str, Path]) -> 'PytorchScaler':
173
+ def load(filepath: Union[str, Path], verbose: bool=True) -> 'PytorchScaler':
170
174
  """
171
175
  Loads a scaler's state from a .pth file.
172
176
 
@@ -178,7 +182,8 @@ class PytorchScaler:
178
182
  """
179
183
  path_obj = make_fullpath(filepath, enforce="file")
180
184
  state = torch.load(path_obj)
181
- _LOGGER.info(f"PytorchScaler state loaded from '{path_obj.name}'.")
185
+ if verbose:
186
+ _LOGGER.info(f"PytorchScaler state loaded from '{path_obj.name}'.")
182
187
  return PytorchScaler(
183
188
  mean=state['mean'],
184
189
  std=state['std'],