dragon-ml-toolbox 13.0.0__py3-none-any.whl → 13.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

ml_tools/serde.py CHANGED
@@ -6,12 +6,15 @@ from pathlib import Path
6
6
  from .path_manager import make_fullpath, sanitize_filename
7
7
  from ._script_info import _script_info
8
8
  from ._logger import _LOGGER
9
+ from ._schema import FeatureSchema
9
10
 
10
11
 
11
12
  __all__ = [
12
13
  "serialize_object_filename",
13
14
  "serialize_object",
14
15
  "deserialize_object",
16
+ "serialize_schema",
17
+ "deserialize_schema"
15
18
  ]
16
19
 
17
20
 
@@ -25,21 +28,20 @@ def serialize_object_filename(obj: Any, save_dir: Union[str,Path], filename: str
25
28
  filename (str) : Name for the output file, extension will be appended if needed.
26
29
  """
27
30
  try:
28
- save_path = make_fullpath(save_dir, make=True)
31
+ save_path = make_fullpath(save_dir, make=True, enforce="directory")
29
32
  sanitized_name = sanitize_filename(filename)
30
- if not sanitized_name.endswith('.joblib'):
31
- sanitized_name = sanitized_name + ".joblib"
32
33
  full_path = save_path / sanitized_name
33
- joblib.dump(obj, full_path)
34
- except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
35
- _LOGGER.error(f"Failed to serialize object of type '{type(obj)}'.")
34
+ except (IOError, OSError, TypeError) as e:
35
+ _LOGGER.error(f"Failed to construct save path from dir='{save_dir}' and filename='{filename}'. Error: {e}")
36
36
  if raise_on_error:
37
37
  raise e
38
38
  return None
39
- else:
40
- if verbose:
41
- _LOGGER.info(f"Object of type '{type(obj)}' saved to '{full_path}'")
42
- return None
39
+
40
+ # call serialize_object with the fully constructed path.
41
+ serialize_object(obj=obj,
42
+ file_path=full_path,
43
+ verbose=verbose,
44
+ raise_on_error=raise_on_error)
43
45
 
44
46
 
45
47
  def serialize_object(obj: Any, file_path: Path, verbose: bool = True, raise_on_error: bool = False) -> None:
@@ -56,8 +58,7 @@ def serialize_object(obj: Any, file_path: Path, verbose: bool = True, raise_on_e
56
58
  """
57
59
  try:
58
60
  # Ensure the extension is correct
59
- if file_path.suffix != '.joblib':
60
- file_path = file_path.with_suffix(file_path.suffix + '.joblib')
61
+ file_path = file_path.with_suffix('.joblib')
61
62
 
62
63
  # Ensure the parent directory exists
63
64
  _save_dir = make_fullpath(file_path.parent, make=True, enforce="directory")
@@ -116,8 +117,7 @@ def deserialize_object(
116
117
  # Can't do an isinstance check on 'Any', skip it.
117
118
  if type_to_check is not Any and not isinstance(obj, type_to_check):
118
119
  error_msg = (
119
- f"Type mismatch: Expected an instance of '{expected_type}', "
120
- f"but found '{type(obj)}' in '{true_filepath}'."
120
+ f"Type mismatch: Expected an instance of '{expected_type}', but found '{type(obj)}' in '{true_filepath}'."
121
121
  )
122
122
  _LOGGER.error(error_msg)
123
123
  raise TypeError()
@@ -127,5 +127,46 @@ def deserialize_object(
127
127
 
128
128
  return obj
129
129
 
130
+
131
+ def serialize_schema(schema: FeatureSchema, file_path: Path):
132
+ """
133
+ Serializes a FeatureSchema object to a .joblib file.
134
+
135
+ This is a high-level wrapper around `serialize_object` that
136
+ specifically handles `FeatureSchema` instances and ensures
137
+ errors are raised on failure.
138
+
139
+ Args:
140
+ schema (FeatureSchema): The schema object to serialize.
141
+ file_path (Path): The full file path to save the schema to.
142
+ """
143
+ serialize_object(obj=schema,
144
+ file_path=file_path,
145
+ verbose=True,
146
+ raise_on_error=True)
147
+
148
+
149
+ def deserialize_schema(file_path: Path):
150
+ """
151
+ Deserializes a FeatureSchema object from a .joblib file.
152
+
153
+ This is a high-level wrapper around `deserialize_object` that
154
+ validates the loaded object is an instance of `FeatureSchema`.
155
+
156
+ Args:
157
+ file_path (Path): The full file path of the serialized schema.
158
+
159
+ Returns:
160
+ FeatureSchema: The deserialized schema object.
161
+
162
+ Raises:
163
+ TypeError: If the deserialized object is not an instance of `FeatureSchema`.
164
+ """
165
+ schema = deserialize_object(filepath=file_path,
166
+ expected_type=FeatureSchema,
167
+ verbose=True)
168
+ return schema
169
+
170
+
130
171
  def info():
131
172
  _script_info(__all__)
@@ -1,413 +0,0 @@
1
- import pandas # logger
2
- import torch
3
- import numpy #handling torch to numpy
4
- import evotorch
5
- from evotorch.algorithms import SNES, CEM, GeneticAlgorithm
6
- from evotorch.logging import PandasLogger
7
- from evotorch.operators import SimulatedBinaryCrossOver, GaussianMutation
8
- from typing import Literal, Union, Tuple, List, Optional, Any, Callable
9
- from pathlib import Path
10
- from tqdm.auto import trange
11
- from contextlib import nullcontext
12
- from functools import partial
13
-
14
- from .path_manager import make_fullpath, sanitize_filename
15
- from ._logger import _LOGGER
16
- from ._script_info import _script_info
17
- from .ML_inference import PyTorchInferenceHandler
18
- from .keys import PyTorchInferenceKeys
19
- from .SQL import DatabaseManager
20
- from .optimization_tools import _save_result
21
- from .utilities import save_dataframe_filename
22
- from .math_utilities import threshold_binary_values
23
-
24
- """
25
- DEPRECATED
26
- """
27
-
28
- __all__ = [
29
- "s_MLOptimizer",
30
- "s_create_pytorch_problem",
31
- "s_run_optimization"
32
- ]
33
-
34
-
35
- class s_MLOptimizer:
36
- """
37
- A wrapper class for setting up and running EvoTorch optimization tasks.
38
-
39
- This class combines the functionality of `create_pytorch_problem` and
40
- `run_optimization` functions into a single, streamlined workflow.
41
-
42
- SNES and CEM algorithms do not accept bounds, the given bounds will be used as an initial starting point.
43
-
44
- Example:
45
- >>> # 1. Initialize the optimizer with model and search parameters
46
- >>> optimizer = MLOptimizer(
47
- ... inference_handler=my_handler,
48
- ... bounds=(lower_bounds, upper_bounds),
49
- ... number_binary_features=2,
50
- ... task="max",
51
- ... algorithm="Genetic"
52
- ... )
53
- >>> # 2. Run the optimization and save the results
54
- >>> best_result = optimizer.run(
55
- ... num_generations=100,
56
- ... target_name="my_target",
57
- ... feature_names=my_feature_names,
58
- ... save_dir="/path/to/results",
59
- ... save_format="csv"
60
- ... )
61
- """
62
- def __init__(self,
63
- inference_handler: PyTorchInferenceHandler,
64
- bounds: Tuple[List[float], List[float]],
65
- number_binary_features: int,
66
- task: Literal["min", "max"],
67
- algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
68
- population_size: int = 200,
69
- **searcher_kwargs):
70
- """
71
- Initializes the optimizer by creating the EvoTorch problem and searcher.
72
-
73
- Args:
74
- inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
75
- bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
76
- number_binary_features (int): Number of binary features located at the END of the feature vector.
77
- task (str): The optimization goal, either "min" or "max".
78
- algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
79
- population_size (int): Population size for CEM and GeneticAlgorithm.
80
- **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
81
- """
82
- # Call the existing factory function to get the problem and searcher factory
83
- self.problem, self.searcher_factory = s_create_pytorch_problem(
84
- inference_handler=inference_handler,
85
- bounds=bounds,
86
- binary_features=number_binary_features,
87
- task=task,
88
- algorithm=algorithm,
89
- population_size=population_size,
90
- **searcher_kwargs
91
- )
92
- # Store binary_features count to pass it to the run function later
93
- self._binary_features = number_binary_features
94
-
95
- def run(self,
96
- num_generations: int,
97
- target_name: str,
98
- save_dir: Union[str, Path],
99
- feature_names: Optional[List[str]],
100
- save_format: Literal['csv', 'sqlite', 'both'],
101
- repetitions: int = 1,
102
- verbose: bool = True) -> Optional[dict]:
103
- """
104
- Runs the evolutionary optimization process using the pre-configured settings.
105
-
106
- Args:
107
- num_generations (int): The total number of generations for each repetition.
108
- target_name (str): Target name used for the CSV filename and/or SQL table.
109
- save_dir (str | Path): The directory where result files will be saved.
110
- feature_names (List[str] | None): Names of the solution features for labeling output. If None, generic names like 'feature_0', 'feature_1', ... , will be created.
111
- save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
112
- repetitions (int): The number of independent times to run the optimization.
113
- verbose (bool): If True, enables detailed logging.
114
-
115
- Returns:
116
- Optional[dict]: A dictionary with the best result if repetitions is 1, otherwise None.
117
- """
118
- # Call the existing run function with the stored problem, searcher, and binary feature count
119
- return s_run_optimization(
120
- problem=self.problem,
121
- searcher_factory=self.searcher_factory,
122
- num_generations=num_generations,
123
- target_name=target_name,
124
- binary_features=self._binary_features,
125
- save_dir=save_dir,
126
- save_format=save_format,
127
- feature_names=feature_names,
128
- repetitions=repetitions,
129
- verbose=verbose
130
- )
131
-
132
-
133
- def s_create_pytorch_problem(
134
- inference_handler: PyTorchInferenceHandler,
135
- bounds: Tuple[List[float], List[float]],
136
- binary_features: int,
137
- task: Literal["min", "max"],
138
- algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
139
- population_size: int = 200,
140
- **searcher_kwargs
141
- ) -> Tuple[evotorch.Problem, Callable[[], Any]]:
142
- """
143
- Creates and configures an EvoTorch Problem and a Searcher factory class for a PyTorch model.
144
-
145
- SNES and CEM do not accept bounds, the given bounds will be used as an initial starting point.
146
-
147
- The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
148
-
149
- Args:
150
- inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
151
- bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
152
- binary_features (int): Number of binary features located at the END of the feature vector. Will be automatically added to the bounds.
153
- task (str): The optimization goal, either "minimize" or "maximize".
154
- algorithm (str): The search algorithm to use.
155
- population_size (int): Used for CEM and GeneticAlgorithm.
156
- **searcher_kwargs: Additional keyword arguments to pass to the
157
- selected search algorithm's constructor (e.g., stdev_init=0.5 for CMAES).
158
-
159
- Returns:
160
- Tuple:
161
- A tuple containing the configured Problem and Searcher.
162
- """
163
- # Create copies to avoid modifying the original lists passed in the `bounds` tuple
164
- lower_bounds = list(bounds[0])
165
- upper_bounds = list(bounds[1])
166
-
167
- # add binary bounds
168
- if binary_features > 0:
169
- lower_bounds.extend([0.48] * binary_features)
170
- upper_bounds.extend([0.52] * binary_features)
171
-
172
- solution_length = len(lower_bounds)
173
- device = inference_handler.device
174
-
175
- # Define the fitness function that EvoTorch will call.
176
- def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
177
- # Directly use the continuous-valued tensor from the optimizer for prediction
178
- predictions = inference_handler.predict_batch(solution_tensor)[PyTorchInferenceKeys.PREDICTIONS]
179
- return predictions.flatten()
180
-
181
-
182
- # Create the Problem instance.
183
- if algorithm == "CEM" or algorithm == "SNES":
184
- problem = evotorch.Problem(
185
- objective_sense=task,
186
- objective_func=fitness_func,
187
- solution_length=solution_length,
188
- initial_bounds=(lower_bounds, upper_bounds),
189
- device=device,
190
- vectorized=True #Use batches
191
- )
192
-
193
- # If stdev_init is not provided, calculate it based on the bounds (used for SNES and CEM)
194
- if 'stdev_init' not in searcher_kwargs:
195
- # Calculate stdev for each parameter as 25% of its search range
196
- stdevs = [abs(up - low) * 0.25 for low, up in zip(lower_bounds, upper_bounds)]
197
- searcher_kwargs['stdev_init'] = torch.tensor(stdevs, dtype=torch.float32, requires_grad=False)
198
-
199
- if algorithm == "SNES":
200
- SearcherClass = SNES
201
- elif algorithm == "CEM":
202
- SearcherClass = CEM
203
- # Set a defaults for CEM if not provided
204
- if 'popsize' not in searcher_kwargs:
205
- searcher_kwargs['popsize'] = population_size
206
- if 'parenthood_ratio' not in searcher_kwargs:
207
- searcher_kwargs['parenthood_ratio'] = 0.2 #float 0.0 - 1.0
208
-
209
- elif algorithm == "Genetic":
210
- problem = evotorch.Problem(
211
- objective_sense=task,
212
- objective_func=fitness_func,
213
- solution_length=solution_length,
214
- bounds=(lower_bounds, upper_bounds),
215
- device=device,
216
- vectorized=True #Use batches
217
- )
218
-
219
- operators = [
220
- SimulatedBinaryCrossOver(problem,
221
- tournament_size=3,
222
- eta=0.6),
223
- GaussianMutation(problem,
224
- stdev=0.4)
225
- ]
226
-
227
- searcher_kwargs["operators"] = operators
228
- if 'popsize' not in searcher_kwargs:
229
- searcher_kwargs['popsize'] = population_size
230
-
231
- SearcherClass = GeneticAlgorithm
232
-
233
- else:
234
- _LOGGER.error(f"Unknown algorithm '{algorithm}'.")
235
- raise ValueError()
236
-
237
- # Create a factory function with all arguments pre-filled
238
- searcher_factory = partial(SearcherClass, problem, **searcher_kwargs)
239
-
240
- return problem, searcher_factory
241
-
242
-
243
- def s_run_optimization(
244
- problem: evotorch.Problem,
245
- searcher_factory: Callable[[],Any],
246
- num_generations: int,
247
- target_name: str,
248
- binary_features: int,
249
- save_dir: Union[str, Path],
250
- save_format: Literal['csv', 'sqlite', 'both'],
251
- feature_names: Optional[List[str]],
252
- repetitions: int = 1,
253
- verbose: bool = True
254
- ) -> Optional[dict]:
255
- """
256
- Runs the evolutionary optimization process, with support for multiple repetitions.
257
-
258
- This function serves as the main engine for the optimization task. It takes a
259
- configured Problem and a Searcher from EvoTorch and executes the optimization
260
- for a specified number of generations.
261
-
262
- It has two modes of operation:
263
- 1. **Single Run (repetitions=1):** Executes the optimization once, saves the
264
- single best result to a CSV file, and returns it as a dictionary.
265
- 2. **Iterative Analysis (repetitions > 1):** Executes the optimization
266
- multiple times. Results from each run are streamed incrementally to the
267
- specified file formats (CSV and/or SQLite database). In this mode,
268
- the function returns None.
269
-
270
- Args:
271
- problem (evotorch.Problem): The configured problem instance, which defines
272
- the objective function, solution space, and optimization sense.
273
- searcher_factory (Callable): The searcher factory to generate fresh evolutionary algorithms.
274
- num_generations (int): The total number of generations to run the search algorithm for in each repetition.
275
- target_name (str): Target name that will also be used for the CSV filename and SQL table.
276
- binary_features (int): Number of binary features located at the END of the feature vector.
277
- save_dir (str | Path): The directory where the result file(s) will be saved.
278
- save_format (Literal['csv', 'sqlite', 'both'], optional): The format for
279
- saving results during iterative analysis.
280
- feature_names (List[str], optional): Names of the solution features for
281
- labeling the output files. If None, generic names like 'feature_0',
282
- 'feature_1', etc., will be created.
283
- repetitions (int, optional): The number of independent times to run the
284
- entire optimization process.
285
- verbose (bool): Add an Evotorch Pandas logger saved as a csv. Only for the first repetition.
286
-
287
- Returns:
288
- Optional[dict]: A dictionary containing the best feature values and the
289
- fitness score if `repetitions` is 1. Returns `None` if `repetitions`
290
- is greater than 1, as results are streamed to files instead.
291
- """
292
- # preprocess paths
293
- save_path = make_fullpath(save_dir, make=True, enforce="directory")
294
-
295
- sanitized_target_name = sanitize_filename(target_name)
296
- if not sanitized_target_name.endswith(".csv"):
297
- sanitized_target_name = sanitized_target_name + ".csv"
298
-
299
- csv_path = save_path / sanitized_target_name
300
-
301
- db_path = save_path / "Optimization.db"
302
- db_table_name = target_name
303
-
304
- # preprocess feature names
305
- if feature_names is None:
306
- feature_names = [f"feature_{i}" for i in range(problem.solution_length)] # type: ignore
307
-
308
- # --- SINGLE RUN LOGIC ---
309
- if repetitions <= 1:
310
- searcher = searcher_factory()
311
- _LOGGER.info(f"🤖 Starting optimization with {searcher.__class__.__name__} Algorithm for {num_generations} generations...")
312
- # for _ in trange(num_generations, desc="Optimizing"):
313
- # searcher.step()
314
-
315
- # Attach logger if requested
316
- if verbose:
317
- pandas_logger = PandasLogger(searcher)
318
-
319
- searcher.run(num_generations) # Use the built-in run method for simplicity
320
-
321
- # # DEBUG new searcher objects
322
- # for status_key in searcher.iter_status_keys():
323
- # print("===", status_key, "===")
324
- # print(searcher.status[status_key])
325
- # print()
326
-
327
- # Get results from the .status dictionary
328
- # SNES and CEM use the key 'center' to get mean values if needed best_solution_tensor = searcher.status["center"]
329
- best_solution_container = searcher.status["pop_best"]
330
- best_solution_tensor = best_solution_container.values
331
- best_fitness = best_solution_container.evals
332
-
333
- best_solution_np = best_solution_tensor.cpu().numpy()
334
-
335
- # threshold binary features
336
- if binary_features > 0:
337
- best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
338
- else:
339
- best_solution_thresholded = best_solution_np
340
-
341
- result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
342
- result_dict[target_name] = best_fitness.item()
343
-
344
- _save_result(result_dict, 'csv', csv_path) # Single run defaults to CSV
345
-
346
- # Process logger
347
- if verbose:
348
- _handle_pandas_log(pandas_logger, save_path=save_path, target_name=target_name)
349
-
350
- _LOGGER.info(f"Optimization complete. Best solution saved to '{csv_path.name}'")
351
- return result_dict
352
-
353
- # --- MULTIPLE REPETITIONS LOGIC ---
354
- else:
355
- _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")
356
-
357
- db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
358
-
359
- with db_context as db_manager:
360
- if db_manager:
361
- schema = {name: "REAL" for name in feature_names}
362
- schema[target_name] = "REAL"
363
- db_manager.create_table(db_table_name, schema)
364
-
365
- print("")
366
- # Repetitions loop
367
- pandas_logger = None
368
- for i in trange(repetitions, desc="Repetitions"):
369
- # CRITICAL: Create a fresh searcher for each run using the factory
370
- searcher = searcher_factory()
371
-
372
- # Attach logger if requested
373
- if verbose and i==0:
374
- pandas_logger = PandasLogger(searcher)
375
-
376
- searcher.run(num_generations) # Use the built-in run method for simplicity
377
-
378
- # Get results from the .status dictionary
379
- # SNES and CEM use the key 'center' to get mean values if needed best_solution_tensor = searcher.status["center"]
380
- best_solution_container = searcher.status["pop_best"]
381
- best_solution_tensor = best_solution_container.values
382
- best_fitness = best_solution_container.evals
383
-
384
- best_solution_np = best_solution_tensor.cpu().numpy()
385
-
386
- # threshold binary features
387
- if binary_features > 0:
388
- best_solution_thresholded = threshold_binary_values(input_array=best_solution_np, binary_values=binary_features)
389
- else:
390
- best_solution_thresholded = best_solution_np
391
-
392
- # make results dictionary
393
- result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
394
- result_dict[target_name] = best_fitness.item()
395
-
396
- # Save each result incrementally
397
- _save_result(result_dict, save_format, csv_path, db_manager, db_table_name)
398
-
399
- # Process logger
400
- if pandas_logger is not None:
401
- _handle_pandas_log(pandas_logger, save_path=save_path, target_name=target_name)
402
-
403
- _LOGGER.info(f"Optimal solution space complete. Results saved to '{save_path}'")
404
- return None
405
-
406
-
407
- def _handle_pandas_log(logger: PandasLogger, save_path: Path, target_name: str):
408
- log_dataframe = logger.to_dataframe()
409
- save_dataframe_filename(df=log_dataframe, save_dir=save_path / "EvolutionLogs", filename=target_name)
410
-
411
-
412
- def info():
413
- _script_info(__all__)