dragon-ml-toolbox 12.0.0__tar.gz → 12.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (47) hide show
  1. {dragon_ml_toolbox-12.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-12.1.0}/PKG-INFO +2 -2
  2. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +2 -2
  3. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -0
  4. dragon_ml_toolbox-12.1.0/ml_tools/ML_optimization.py +462 -0
  5. dragon_ml_toolbox-12.0.0/ml_tools/ML_optimization.py → dragon_ml_toolbox-12.1.0/ml_tools/ML_simple_optimization.py +11 -8
  6. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/data_exploration.py +96 -3
  7. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/math_utilities.py +30 -6
  8. dragon_ml_toolbox-12.1.0/ml_tools/optimization_tools.py +331 -0
  9. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/pyproject.toml +2 -2
  10. dragon_ml_toolbox-12.0.0/ml_tools/optimization_tools.py +0 -136
  11. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/LICENSE +0 -0
  12. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/LICENSE-THIRD-PARTY.md +0 -0
  13. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/README.md +0 -0
  14. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  15. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  16. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  17. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ETL_cleaning.py +0 -0
  18. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ETL_engineering.py +0 -0
  19. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/GUI_tools.py +0 -0
  20. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/MICE_imputation.py +0 -0
  21. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_callbacks.py +0 -0
  22. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_datasetmaster.py +0 -0
  23. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_evaluation.py +0 -0
  24. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_evaluation_multi.py +0 -0
  25. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_inference.py +0 -0
  26. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_models.py +0 -0
  27. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_scaler.py +0 -0
  28. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_trainer.py +0 -0
  29. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ML_utilities.py +0 -0
  30. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/PSO_optimization.py +0 -0
  31. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/RNN_forecast.py +0 -0
  32. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/SQL.py +0 -0
  33. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/VIF_factor.py +0 -0
  34. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/__init__.py +0 -0
  35. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/_logger.py +0 -0
  36. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/_script_info.py +0 -0
  37. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/constants.py +0 -0
  38. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/custom_logger.py +0 -0
  39. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ensemble_evaluation.py +0 -0
  40. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ensemble_inference.py +0 -0
  41. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/ensemble_learning.py +0 -0
  42. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/handle_excel.py +0 -0
  43. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/keys.py +0 -0
  44. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/path_manager.py +0 -0
  45. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/serde.py +0 -0
  46. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/ml_tools/utilities.py +0 -0
  47. {dragon_ml_toolbox-12.0.0 → dragon_ml_toolbox-12.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 12.0.0
3
+ Version: 12.1.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -8,7 +8,7 @@ Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
8
8
  Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: ==3.12
11
+ Requires-Python: >=3.12
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE-THIRD-PARTY.md
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 12.0.0
3
+ Version: 12.1.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -8,7 +8,7 @@ Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
8
8
  Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: ==3.12
11
+ Requires-Python: >=3.12
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE-THIRD-PARTY.md
@@ -19,6 +19,7 @@ ml_tools/ML_inference.py
19
19
  ml_tools/ML_models.py
20
20
  ml_tools/ML_optimization.py
21
21
  ml_tools/ML_scaler.py
22
+ ml_tools/ML_simple_optimization.py
22
23
  ml_tools/ML_trainer.py
23
24
  ml_tools/ML_utilities.py
24
25
  ml_tools/PSO_optimization.py
@@ -0,0 +1,462 @@
1
+ import pandas # logger
2
+ import torch
3
+ import numpy #handling torch to numpy
4
+ import evotorch
5
+ from evotorch.algorithms import SNES, CEM, GeneticAlgorithm
6
+ from evotorch.logging import PandasLogger
7
+ from evotorch.operators import SimulatedBinaryCrossOver, GaussianMutation
8
+ from typing import Literal, Union, Tuple, List, Optional, Any, Callable, Dict
9
+ from pathlib import Path
10
+ from tqdm.auto import trange
11
+ from contextlib import nullcontext
12
+ from functools import partial
13
+
14
+ from .path_manager import make_fullpath, sanitize_filename
15
+ from ._logger import _LOGGER
16
+ from ._script_info import _script_info
17
+ from .ML_inference import PyTorchInferenceHandler
18
+ from .keys import PyTorchInferenceKeys
19
+ from .SQL import DatabaseManager
20
+ from .optimization_tools import _save_result
21
+ from .utilities import save_dataframe
22
+ from .math_utilities import discretize_categorical_values
23
+
24
+
25
+ __all__ = [
26
+ "MLOptimizer",
27
+ "create_pytorch_problem",
28
+ "run_optimization"
29
+ ]
30
+
31
+
32
+ class MLOptimizer:
33
+ """
34
+ A wrapper class for setting up and running EvoTorch optimization tasks.
35
+
36
+ This class combines the functionality of `create_pytorch_problem` and
37
+ `run_optimization` functions into a single, streamlined workflow.
38
+
39
+ SNES and CEM algorithms do not accept hard bounds; for them, the given bounds are only used as the initial bounds (starting region) of the search.
40
+
41
+ Example:
42
+ >>> # 1. Get categorical info from preprocessing steps
43
+ >>> # e.g., from data_exploration.encode_categorical_features
44
+ >>> cat_mappings = {'feature_C': {'A': 0, 'B': 1}, 'feature_D': {'X': 0, 'Y': 1}}
45
+ >>> # e.g., from data_exploration.create_transformer_categorical_map
46
+ >>> # Assumes feature_C is at index 2 (cardinality 2) and feature_D is at index 3 (cardinality 2)
47
+ >>> cat_index_map = {2: 2, 3: 2}
48
+ >>>
49
+ >>> # 2. Initialize the optimizer
50
+ >>> optimizer = MLOptimizer(
51
+ ... inference_handler=my_handler,
52
+ ... bounds=(lower_bounds, upper_bounds), # Bounds for ALL features
53
+ ... task="max",
54
+ ... algorithm="Genetic",
55
+ ... categorical_index_map=cat_index_map,
56
+ ... categorical_mappings=cat_mappings,
57
+ ... )
58
+ >>> # 3. Run the optimization
59
+ >>> best_result = optimizer.run(
60
+ ... num_generations=100,
61
+ ... target_name="my_target",
62
+ ... feature_names=my_feature_names,
63
+ ... save_dir="/path/to/results",
64
+ ... save_format="csv"
65
+ ... )
66
+ """
67
+ def __init__(self,
68
+ inference_handler: PyTorchInferenceHandler,
69
+ bounds: Tuple[List[float], List[float]],
70
+ task: Literal["min", "max"],
71
+ algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
72
+ population_size: int = 200,
73
+ categorical_index_map: Optional[Dict[int, int]] = None,
74
+ categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None,
75
+ discretize_start_at_zero: bool = True,
76
+ **searcher_kwargs):
77
+ """
78
+ Initializes the optimizer by creating the EvoTorch problem and searcher.
79
+
80
+ Args:
81
+ inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
82
+ bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for ALL solution features.
83
+ Use the `optimization_tools.create_optimization_bounds()` helper to easily generate this and ensure unbiased categorical bounds.
84
+ task (str): The optimization goal, either "min" or "max".
85
+ algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
86
+ population_size (int): Population size for CEM and GeneticAlgorithm.
87
+ categorical_index_map (Dict[int, int] | None): Used to discretize values after optimization. Maps {column_index: cardinality}.
88
+ categorical_mappings (Dict[str, Dict[str, int]] | None): Maps {feature_name: {category_label: integer_code}} (e.g., {'feature_C': {'A': 0, 'B': 1}}). Used to map discrete integer values back to their category strings before saving.
89
+ discretize_start_at_zero (bool):
90
+ True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
91
+ False if it starts at 1 (e.g., [1, 2, 3]).
92
+ **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
93
+ """
94
+ # Call the existing factory function to get the problem and searcher factory
95
+ self.problem, self.searcher_factory = create_pytorch_problem(
96
+ inference_handler=inference_handler,
97
+ bounds=bounds,
98
+ task=task,
99
+ algorithm=algorithm,
100
+ population_size=population_size,
101
+ **searcher_kwargs
102
+ )
103
+ # Store categorical info to pass to the run function
104
+ self.categorical_map = categorical_index_map
105
+ self.categorical_mappings = categorical_mappings
106
+ self.discretize_start_at_zero = discretize_start_at_zero
107
+
108
+ def run(self,
109
+ num_generations: int,
110
+ target_name: str,
111
+ save_dir: Union[str, Path],
112
+ feature_names: Optional[List[str]],
113
+ save_format: Literal['csv', 'sqlite', 'both'],
114
+ repetitions: int = 1,
115
+ verbose: bool = True) -> Optional[dict]:
116
+ """
117
+ Runs the evolutionary optimization process using the pre-configured settings.
118
+
119
+ Args:
120
+ num_generations (int): The total number of generations for each repetition.
121
+ target_name (str): Target name used for the CSV filename and/or SQL table.
122
+ save_dir (str | Path): The directory where result files will be saved.
123
+ feature_names (List[str] | None): Names of the solution features for labeling output.
124
+ If None, generic names like 'feature_0', 'feature_1', ... , will be created.
125
+ save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
126
+ repetitions (int): The number of independent times to run the optimization.
127
+ verbose (bool): If True, enables detailed logging.
128
+
129
+ Returns:
130
+ Optional[dict]: A dictionary with the best result if repetitions is 1, otherwise None.
131
+ """
132
+ # Call the existing run function with the stored problem, searcher, and categorical info
133
+ return run_optimization(
134
+ problem=self.problem,
135
+ searcher_factory=self.searcher_factory,
136
+ num_generations=num_generations,
137
+ target_name=target_name,
138
+ save_dir=save_dir,
139
+ save_format=save_format,
140
+ feature_names=feature_names,
141
+ repetitions=repetitions,
142
+ verbose=verbose,
143
+ categorical_map=self.categorical_map,
144
+ categorical_mappings=self.categorical_mappings,
145
+ discretize_start_at_zero=self.discretize_start_at_zero
146
+ )
147
+
148
+
149
+ def create_pytorch_problem(
150
+ inference_handler: PyTorchInferenceHandler,
151
+ bounds: Tuple[List[float], List[float]],
152
+ task: Literal["min", "max"],
153
+ algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
154
+ population_size: int = 200,
155
+ **searcher_kwargs
156
+ ) -> Tuple[evotorch.Problem, Callable[[], Any]]:
157
+ """
158
+ Creates and configures an EvoTorch Problem and a Searcher factory class for a PyTorch model.
159
+
160
+ SNES and CEM do not accept hard bounds; for them, the given bounds are only used as the initial bounds (starting region) of the search.
161
+
162
+ The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
163
+
164
+ Args:
165
+ inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
166
+ bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
167
+ Use the `optimization_tools.create_optimization_bounds()` helper to easily generate this and ensure unbiased categorical bounds.
168
+ task (str): The optimization goal, either "min" or "max".
169
+ algorithm (str): The search algorithm to use.
170
+ population_size (int): Used for CEM and GeneticAlgorithm.
171
+ **searcher_kwargs: Additional keyword arguments to pass to the
172
+ selected search algorithm's constructor (e.g., stdev_init=0.5 for SNES or CEM).
173
+
174
+ Returns:
175
+ Tuple:
176
+ A tuple containing the configured Problem and a searcher factory (a zero-argument callable that builds a fresh searcher for that Problem).
177
+ """
178
+ # Create copies to avoid modifying the original lists passed in the `bounds` tuple
179
+ lower_bounds = list(bounds[0])
180
+ upper_bounds = list(bounds[1])
181
+
182
+ solution_length = len(lower_bounds)
183
+ device = inference_handler.device
184
+
185
+ # Define the fitness function that EvoTorch will call.
186
+ def fitness_func(solution_tensor: torch.Tensor) -> torch.Tensor:
187
+ # Directly use the continuous-valued tensor from the optimizer for prediction
188
+ predictions = inference_handler.predict_batch(solution_tensor)[PyTorchInferenceKeys.PREDICTIONS]
189
+ return predictions.flatten()
190
+
191
+
192
+ # Create the Problem instance.
193
+ if algorithm == "CEM" or algorithm == "SNES":
194
+ problem = evotorch.Problem(
195
+ objective_sense=task,
196
+ objective_func=fitness_func,
197
+ solution_length=solution_length,
198
+ initial_bounds=(lower_bounds, upper_bounds),
199
+ device=device,
200
+ vectorized=True #Use batches
201
+ )
202
+
203
+ # If stdev_init is not provided, calculate it based on the bounds (used for SNES and CEM)
204
+ if 'stdev_init' not in searcher_kwargs:
205
+ # Calculate stdev for each parameter as 25% of its search range
206
+ stdevs = [abs(up - low) * 0.25 for low, up in zip(lower_bounds, upper_bounds)]
207
+ searcher_kwargs['stdev_init'] = torch.tensor(stdevs, dtype=torch.float32, requires_grad=False)
208
+
209
+ if algorithm == "SNES":
210
+ SearcherClass = SNES
211
+ elif algorithm == "CEM":
212
+ SearcherClass = CEM
213
+ # Set defaults for CEM if not provided
214
+ if 'popsize' not in searcher_kwargs:
215
+ searcher_kwargs['popsize'] = population_size
216
+ if 'parenthood_ratio' not in searcher_kwargs:
217
+ searcher_kwargs['parenthood_ratio'] = 0.2 #float 0.0 - 1.0
218
+
219
+ elif algorithm == "Genetic":
220
+ problem = evotorch.Problem(
221
+ objective_sense=task,
222
+ objective_func=fitness_func,
223
+ solution_length=solution_length,
224
+ bounds=(lower_bounds, upper_bounds),
225
+ device=device,
226
+ vectorized=True #Use batches
227
+ )
228
+
229
+ operators = [
230
+ SimulatedBinaryCrossOver(problem,
231
+ tournament_size=3,
232
+ eta=0.6),
233
+ GaussianMutation(problem,
234
+ stdev=0.4)
235
+ ]
236
+
237
+ searcher_kwargs["operators"] = operators
238
+ if 'popsize' not in searcher_kwargs:
239
+ searcher_kwargs['popsize'] = population_size
240
+
241
+ SearcherClass = GeneticAlgorithm
242
+
243
+ else:
244
+ _LOGGER.error(f"Unknown algorithm '{algorithm}'.")
245
+ raise ValueError()
246
+
247
+ # Create a factory function with all arguments pre-filled
248
+ searcher_factory = partial(SearcherClass, problem, **searcher_kwargs)
249
+
250
+ return problem, searcher_factory
251
+
252
+
253
+ def run_optimization(
254
+ problem: evotorch.Problem,
255
+ searcher_factory: Callable[[],Any],
256
+ num_generations: int,
257
+ target_name: str,
258
+ save_dir: Union[str, Path],
259
+ save_format: Literal['csv', 'sqlite', 'both'],
260
+ feature_names: Optional[List[str]],
261
+ repetitions: int = 1,
262
+ verbose: bool = True,
263
+ categorical_map: Optional[Dict[int, int]] = None,
264
+ categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None,
265
+ discretize_start_at_zero: bool = True
266
+ ) -> Optional[dict]:
267
+ """
268
+ Runs the evolutionary optimization process, with support for multiple repetitions.
269
+
270
+ This function serves as the main engine for the optimization task. It takes a
271
+ configured Problem and a Searcher from EvoTorch and executes the optimization
272
+ for a specified number of generations.
273
+
274
+ It has two modes of operation:
275
+ 1. **Single Run (repetitions=1):** Executes the optimization once, saves the
276
+ single best result to a CSV file, and returns it as a dictionary.
277
+ 2. **Iterative Analysis (repetitions > 1):** Executes the optimization
278
+ multiple times. Results from each run are streamed incrementally to the
279
+ specified file formats (CSV and/or SQLite database). In this mode,
280
+ the function returns None.
281
+
282
+ Args:
283
+ problem (evotorch.Problem): The configured problem instance, which defines
284
+ the objective function, solution space, and optimization sense.
285
+ searcher_factory (Callable): The searcher factory to generate fresh evolutionary algorithms.
286
+ num_generations (int): The total number of generations to run the search algorithm for in each repetition.
287
+ target_name (str): Target name that will also be used for the CSV filename and SQL table.
288
+ save_dir (str | Path): The directory where the result file(s) will be saved.
289
+ save_format (Literal['csv', 'sqlite', 'both']): The format for
290
+ saving results during iterative analysis.
291
+ feature_names (List[str], optional): Names of the solution features for
292
+ labeling the output files. If None, generic names like 'feature_0',
293
+ 'feature_1', etc., will be created.
294
+ repetitions (int, optional): The number of independent times to run the
295
+ entire optimization process.
296
+ verbose (bool): Add an Evotorch Pandas logger saved as a csv. Only for the first repetition.
297
+ categorical_map (Dict[int, int] | None): Used to discretize values after optimization. Maps {column_index: cardinality}.
298
+ categorical_mappings (Dict[str, Dict[str, int]] | None): Maps {feature_name: {category_label: integer_code}} (e.g., {'feature_C': {'A': 0, 'B': 1}}). Used to map discrete integer values back to their category strings before saving.
299
+ discretize_start_at_zero (bool):
300
+ True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
301
+ False if it starts at 1 (e.g., [1, 2, 3]).
302
+
303
+ Returns:
304
+ Optional[dict]: A dictionary containing the best feature values and the
305
+ fitness score if `repetitions` is 1. Returns `None` if `repetitions`
306
+ is greater than 1, as results are streamed to files instead.
307
+ """
308
+ # --- 1. Setup Paths and Feature Names ---
309
+ save_path = make_fullpath(save_dir, make=True, enforce="directory")
310
+
311
+ sanitized_target_name = sanitize_filename(target_name)
312
+ if not sanitized_target_name.endswith(".csv"):
313
+ sanitized_target_name = sanitized_target_name + ".csv"
314
+
315
+ csv_path = save_path / sanitized_target_name
316
+ db_path = save_path / "Optimization.db"
317
+ db_table_name = target_name
318
+
319
+ # Use problem's solution_length to create default names if none provided
320
+ if feature_names is None:
321
+ feat_len = problem.solution_length
322
+ feature_names = [f"feature_{i}" for i in range(feat_len)] # type: ignore
323
+
324
+ # --- 2. Run Optimization ---
325
+ # --- SINGLE RUN LOGIC ---
326
+ if repetitions <= 1:
327
+ _LOGGER.info(f"🤖 Starting optimization for {num_generations} generations...")
328
+
329
+ result_dict, pandas_logger = _run_single_optimization_rep(
330
+ searcher_factory=searcher_factory,
331
+ num_generations=num_generations,
332
+ feature_names=feature_names,
333
+ target_name=target_name,
334
+ categorical_map=categorical_map,
335
+ discretize_start_at_zero=discretize_start_at_zero,
336
+ attach_logger=verbose
337
+ )
338
+
339
+ # Single run defaults to CSV, pass mappings for reverse mapping
340
+ _save_result(
341
+ result_dict=result_dict,
342
+ save_format='csv',
343
+ csv_path=csv_path,
344
+ categorical_mappings=categorical_mappings
345
+ )
346
+
347
+ if pandas_logger:
348
+ _handle_pandas_log(pandas_logger, save_path=save_path, target_name=target_name)
349
+
350
+ _LOGGER.info(f"Optimization complete. Best solution saved to '{csv_path.name}'")
351
+ return result_dict
352
+
353
+ # --- MULTIPLE REPETITIONS LOGIC ---
354
+ else:
355
+ _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")
356
+
357
+ first_run_logger = None # To store the logger from the first rep
358
+ db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
359
+
360
+ with db_context as db_manager:
361
+ # --- Setup Database Schema (if applicable) ---
362
+ if db_manager:
363
+ schema = {}
364
+ categorical_cols = set(categorical_mappings.keys()) if categorical_mappings else set()
365
+
366
+ for name in feature_names:
367
+ schema[name] = "TEXT" if name in categorical_cols else "REAL"
368
+ schema[target_name] = "REAL"
369
+
370
+ db_manager.create_table(db_table_name, schema)
371
+
372
+ # --- Repetitions Loop ---
373
+ print("")
374
+ for i in trange(repetitions, desc="Repetitions"):
375
+
376
+ # Only attach a logger for the first repetition if verbose
377
+ attach_logger = verbose and (i == 0)
378
+
379
+ result_dict, pandas_logger = _run_single_optimization_rep(
380
+ searcher_factory=searcher_factory,
381
+ num_generations=num_generations,
382
+ feature_names=feature_names,
383
+ target_name=target_name,
384
+ categorical_map=categorical_map,
385
+ discretize_start_at_zero=discretize_start_at_zero,
386
+ attach_logger=attach_logger
387
+ )
388
+
389
+ if pandas_logger:
390
+ first_run_logger = pandas_logger
391
+
392
+ # Save each result incrementally
393
+ _save_result(
394
+ result_dict=result_dict,
395
+ save_format=save_format,
396
+ csv_path=csv_path,
397
+ db_manager=db_manager,
398
+ db_table_name=db_table_name,
399
+ categorical_mappings=categorical_mappings
400
+ )
401
+
402
+ if first_run_logger:
403
+ _handle_pandas_log(first_run_logger, save_path=save_path, target_name=target_name)
404
+
405
+ _LOGGER.info(f"Optimal solution space complete. Results saved to '{save_path}'")
406
+ return None
407
+
408
+
409
+ def _run_single_optimization_rep(
410
+ searcher_factory: Callable[[],Any],
411
+ num_generations: int,
412
+ feature_names: List[str],
413
+ target_name: str,
414
+ categorical_map: Optional[Dict[int, int]],
415
+ discretize_start_at_zero: bool,
416
+ attach_logger: bool
417
+ ) -> Tuple[dict, Optional[PandasLogger]]:
418
+ """
419
+ Internal helper to run one full optimization repetition.
420
+
421
+ Handles searcher creation, logging, running, and result post-processing.
422
+ """
423
+ # CRITICAL: Create a fresh searcher for each run using the factory
424
+ searcher = searcher_factory()
425
+
426
+ # Attach logger if requested
427
+ pandas_logger = PandasLogger(searcher) if attach_logger else None
428
+
429
+ # Run the optimization
430
+ searcher.run(num_generations)
431
+
432
+ # Get the best result
433
+ best_solution_container = searcher.status["pop_best"]
434
+ best_solution_tensor = best_solution_container.values
435
+ best_fitness = best_solution_container.evals
436
+
437
+ best_solution_np = best_solution_tensor.cpu().numpy()
438
+
439
+ # Discretize categorical/binary features
440
+ if categorical_map:
441
+ best_solution_thresholded = discretize_categorical_values(
442
+ input_array=best_solution_np,
443
+ categorical_info=categorical_map,
444
+ start_at_zero=discretize_start_at_zero
445
+ )
446
+ else:
447
+ best_solution_thresholded = best_solution_np
448
+
449
+ # Format results into a dictionary
450
+ result_dict = {name: value for name, value in zip(feature_names, best_solution_thresholded)}
451
+ result_dict[target_name] = best_fitness.item()
452
+
453
+ return result_dict, pandas_logger
454
+
455
+
456
+ def _handle_pandas_log(logger: PandasLogger, save_path: Path, target_name: str):
457
+ log_dataframe = logger.to_dataframe()
458
+ save_dataframe(df=log_dataframe, save_dir=save_path / "EvolutionLogs", filename=target_name)
459
+
460
+
461
+ def info():
462
+ _script_info(__all__)
@@ -21,15 +21,18 @@ from .optimization_tools import _save_result
21
21
  from .utilities import save_dataframe
22
22
  from .math_utilities import threshold_binary_values
23
23
 
24
+ """
25
+ DEPRECATED
26
+ """
24
27
 
25
28
  __all__ = [
26
- "MLOptimizer",
27
- "create_pytorch_problem",
28
- "run_optimization"
29
+ "s_MLOptimizer",
30
+ "s_create_pytorch_problem",
31
+ "s_run_optimization"
29
32
  ]
30
33
 
31
34
 
32
- class MLOptimizer:
35
+ class s_MLOptimizer:
33
36
  """
34
37
  A wrapper class for setting up and running EvoTorch optimization tasks.
35
38
 
@@ -77,7 +80,7 @@ class MLOptimizer:
77
80
  **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
78
81
  """
79
82
  # Call the existing factory function to get the problem and searcher factory
80
- self.problem, self.searcher_factory = create_pytorch_problem(
83
+ self.problem, self.searcher_factory = s_create_pytorch_problem(
81
84
  inference_handler=inference_handler,
82
85
  bounds=bounds,
83
86
  binary_features=number_binary_features,
@@ -113,7 +116,7 @@ class MLOptimizer:
113
116
  Optional[dict]: A dictionary with the best result if repetitions is 1, otherwise None.
114
117
  """
115
118
  # Call the existing run function with the stored problem, searcher, and binary feature count
116
- return run_optimization(
119
+ return s_run_optimization(
117
120
  problem=self.problem,
118
121
  searcher_factory=self.searcher_factory,
119
122
  num_generations=num_generations,
@@ -127,7 +130,7 @@ class MLOptimizer:
127
130
  )
128
131
 
129
132
 
130
- def create_pytorch_problem(
133
+ def s_create_pytorch_problem(
131
134
  inference_handler: PyTorchInferenceHandler,
132
135
  bounds: Tuple[List[float], List[float]],
133
136
  binary_features: int,
@@ -237,7 +240,7 @@ def create_pytorch_problem(
237
240
  return problem, searcher_factory
238
241
 
239
242
 
240
- def run_optimization(
243
+ def s_run_optimization(
241
244
  problem: evotorch.Problem,
242
245
  searcher_factory: Callable[[],Any],
243
246
  num_generations: int,
@@ -29,6 +29,7 @@ __all__ = [
29
29
  "plot_value_distributions",
30
30
  "clip_outliers_single",
31
31
  "clip_outliers_multi",
32
+ "drop_outlier_samples",
32
33
  "match_and_filter_columns_by_regex",
33
34
  "standardize_percentages",
34
35
  "create_transformer_categorical_map",
@@ -358,8 +359,8 @@ def encode_categorical_features(
358
359
  df (pd.DataFrame): The input DataFrame.
359
360
  columns_to_encode (List[str]): A list of column names to be encoded.
360
361
  encode_nulls (bool): If True, encodes Null values as a distinct category
361
- "Other" with a value of 0. Other categories start from 1.
362
- If False, Nulls are ignored.
362
+ "Other" with a value of 0. Other categories start from 1.
363
+ If False, Nulls are ignored and categories start from 0.
363
364
  split_resulting_dataset (bool): If True, returns two separate DataFrames:
364
365
  one with non-categorical columns and one with the encoded columns.
365
366
  If False, returns a single DataFrame with all columns.
@@ -758,7 +759,99 @@ def clip_outliers_multi(
758
759
  if skipped_columns:
759
760
  _LOGGER.warning("Skipped columns:")
760
761
  for col, msg in skipped_columns:
761
- print(f" - {col}: {msg}")
762
+ print(f" - {col}")
763
+
764
+ return new_df
765
+
766
+
767
+ def drop_outlier_samples(
768
+ df: pd.DataFrame,
769
+ bounds_dict: Dict[str, Tuple[Union[int, float], Union[int, float]]],
770
+ drop_on_nulls: bool = False,
771
+ verbose: bool = True
772
+ ) -> pd.DataFrame:
773
+ """
774
+ Drops entire rows where values in specified numeric columns fall outside
775
+ a given [min, max] range.
776
+
777
+ This function processes a copy of the DataFrame, ensuring the original is
778
+ not modified. It skips columns with invalid specifications.
779
+
780
+ Args:
781
+ df (pd.DataFrame): The input DataFrame.
782
+ bounds_dict (dict): A dictionary where keys are column names and values
783
+ are (min_val, max_val) tuples defining the valid range.
784
+ drop_on_nulls (bool): If True, rows with NaN/None in a checked column
785
+ will also be dropped. If False, NaN/None are ignored.
786
+ verbose (bool): If True, prints the number of rows dropped for each column.
787
+
788
+ Returns:
789
+ pd.DataFrame: A new DataFrame with the outlier rows removed.
790
+
791
+ Notes:
792
+ - Invalid specifications (e.g., missing column, non-numeric type,
793
+ incorrectly formatted bounds) will be reported and skipped.
794
+ """
795
+ new_df = df.copy()
796
+ skipped_columns: List[Tuple[str, str]] = []
797
+ initial_rows = len(new_df)
798
+
799
+ for col, bounds in bounds_dict.items():
800
+ try:
801
+ # --- Validation Checks ---
802
+ if col not in df.columns:
803
+ _LOGGER.error(f"Column '{col}' not found in DataFrame.")
804
+ raise ValueError()
805
+
806
+ if not pd.api.types.is_numeric_dtype(df[col]):
807
+ _LOGGER.error(f"Column '{col}' is not of a numeric data type.")
808
+ raise TypeError()
809
+
810
+ if not (isinstance(bounds, tuple) and len(bounds) == 2):
811
+ _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
812
+ raise ValueError()
813
+
814
+ # --- Filtering Logic ---
815
+ min_val, max_val = bounds
816
+ rows_before_drop = len(new_df)
817
+
818
+ # Create the base mask for values within the specified range
819
+ # .between() is inclusive and evaluates to False for NaN
820
+ mask_in_bounds = new_df[col].between(min_val, max_val)
821
+
822
+ if drop_on_nulls:
823
+ # Keep only rows that are within bounds.
824
+ # Since mask_in_bounds is False for NaN, nulls are dropped.
825
+ final_mask = mask_in_bounds
826
+ else:
827
+ # Keep rows that are within bounds OR are null.
828
+ mask_is_null = new_df[col].isnull()
829
+ final_mask = mask_in_bounds | mask_is_null
830
+
831
+ # Apply the final mask
832
+ new_df = new_df[final_mask]
833
+
834
+ rows_after_drop = len(new_df)
835
+
836
+ if verbose:
837
+ dropped_count = rows_before_drop - rows_after_drop
838
+ if dropped_count > 0:
839
+ print(
840
+ f" - Column '{col}': Dropped {dropped_count} rows with values outside range [{min_val}, {max_val}]."
841
+ )
842
+
843
+ except (ValueError, TypeError) as e:
844
+ skipped_columns.append((col, str(e)))
845
+ continue
846
+
847
+ total_dropped = initial_rows - len(new_df)
848
+ _LOGGER.info(f"Finished processing. Total rows dropped: {total_dropped}.")
849
+
850
+ if skipped_columns:
851
+ _LOGGER.warning("Skipped the following columns due to errors:")
852
+ for col, msg in skipped_columns:
853
+ # Only print the column name for cleaner output as the error was already logged
854
+ print(f" - {col}")
762
855
 
763
856
  return new_df
764
857