dragon-ml-toolbox 12.13.0__py3-none-any.whl → 14.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/METADATA +11 -2
- dragon_ml_toolbox-14.3.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
- ml_tools/MICE_imputation.py +207 -5
- ml_tools/ML_callbacks.py +40 -8
- ml_tools/ML_datasetmaster.py +200 -261
- ml_tools/ML_evaluation.py +29 -17
- ml_tools/ML_evaluation_multi.py +13 -10
- ml_tools/ML_inference.py +14 -5
- ml_tools/ML_models.py +135 -55
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +49 -36
- ml_tools/ML_trainer.py +560 -30
- ml_tools/ML_utilities.py +302 -4
- ml_tools/ML_vision_datasetmaster.py +1352 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_schema.py +96 -0
- ml_tools/custom_logger.py +37 -14
- ml_tools/data_exploration.py +576 -138
- ml_tools/keys.py +51 -1
- ml_tools/math_utilities.py +1 -1
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +78 -17
- ml_tools/utilities.py +192 -3
- dragon_ml_toolbox-12.13.0.dist-info/RECORD +0 -41
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_models_advanced.py
ADDED
@@ -0,0 +1,323 @@
import torch
from torch import nn
from typing import Union, Dict, Any
from pathlib import Path
import json

from ._logger import _LOGGER
from .path_manager import make_fullpath
from .keys import PytorchModelArchitectureKeys
from ._schema import FeatureSchema
from ._script_info import _script_info
from .ML_models import _ArchitectureHandlerMixin

# Imports from pytorch_tabular
try:
    from omegaconf import DictConfig
    from pytorch_tabular.models import GatedAdditiveTreeEnsembleModel, NODEModel
except ImportError:
    _LOGGER.error("GATE and NODE require 'pip install pytorch_tabular omegaconf' dependencies.")
    raise ImportError()


__all__ = [
    "DragonGateModel",
    "DragonNodeModel",
]


class _BasePytabWrapper(nn.Module, _ArchitectureHandlerMixin):
    """
    Internal Base Class: Do not use directly.

    This is an adapter to make pytorch_tabular models compatible with the
    dragon-ml-toolbox pipeline.

    It handles:
    1. Schema-based initialization.
    2. Single-tensor forward pass, which is then split into the
       dict {'continuous': ..., 'categorical': ...} that pytorch_tabular expects.
    3. Saving/Loading architecture using the pipeline's _ArchitectureHandlerMixin.
    """
    def __init__(self, schema: FeatureSchema):
        super().__init__()

        self.schema = schema
        self.model_name = "Base"  # To be overridden by child
        self.internal_model: nn.Module = None  # type: ignore  # To be set by child
        self.model_hparams: Dict = dict()  # To be set by child

        # --- Derive indices from schema ---
        categorical_map = schema.categorical_index_map

        if categorical_map:
            # The order of keys/values is implicitly linked and must be preserved
            self.categorical_indices = list(categorical_map.keys())
            self.cardinalities = list(categorical_map.values())
        else:
            self.categorical_indices = []
            self.cardinalities = []

        # Derive numerical indices by finding what's not categorical
        all_indices = set(range(len(schema.feature_names)))
        categorical_indices_set = set(self.categorical_indices)
        self.numerical_indices = sorted(list(all_indices - categorical_indices_set))

    def _build_pt_config(self, out_targets: int, **kwargs) -> DictConfig:
        """Helper to create the minimal config dict for a pytorch_tabular model."""
        # 'regression' is the most neutral for model architecture. The final output_dim is what truly matters.
        task = "regression"

        config_dict = {
            # --- Data / Schema Params ---
            'task': task,
            'continuous_cols': list(self.schema.continuous_feature_names),
            'categorical_cols': list(self.schema.categorical_feature_names),
            'continuous_dim': len(self.numerical_indices),
            'categorical_dim': len(self.categorical_indices),
            'categorical_cardinality': self.cardinalities,
            'target': ['dummy_target'],  # Required, but not used

            # --- Model Params ---
            'output_dim': out_targets,
            **kwargs
        }

        # Add common params that most models need
        if 'loss' not in config_dict:
            config_dict['loss'] = 'NotUsed'
        if 'metrics' not in config_dict:
            config_dict['metrics'] = []

        return DictConfig(config_dict)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Accepts a single tensor and converts it to the dict
        that pytorch_tabular models expect.
        """
        # 1. Split the single tensor input
        x_cont = x[:, self.numerical_indices].float()
        x_cat = x[:, self.categorical_indices].long()

        # 2. Create the input dict
        input_dict = {
            'continuous': x_cont,
            'categorical': x_cat
        }

        # 3. Pass to the internal pytorch_tabular model
        # The model returns a dict, we extract the logits
        model_output_dict = self.internal_model(input_dict)

        # 4. Return the logits tensor
        return model_output_dict['logits']

    def get_architecture_config(self) -> Dict[str, Any]:
        """Returns the full configuration of the model."""
        # Deconstruct schema into a JSON-friendly dict
        schema_dict = {
            'feature_names': self.schema.feature_names,
            'continuous_feature_names': self.schema.continuous_feature_names,
            'categorical_feature_names': self.schema.categorical_feature_names,
            'categorical_index_map': self.schema.categorical_index_map,
            'categorical_mappings': self.schema.categorical_mappings
        }

        config = {
            'schema_dict': schema_dict,
            'out_targets': self.out_targets,
            **self.model_hparams
        }
        return config

    @classmethod
    def load(cls: type, file_or_dir: Union[str, Path], verbose: bool = True) -> nn.Module:
        """Loads a model architecture from a JSON file."""
        user_path = make_fullpath(file_or_dir)

        if user_path.is_dir():
            json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
            target_path = make_fullpath(user_path / json_filename, enforce="file")
        elif user_path.is_file():
            target_path = user_path
        else:
            _LOGGER.error(f"Invalid path: '{file_or_dir}'")
            raise IOError()

        with open(target_path, 'r') as f:
            saved_data = json.load(f)

        saved_class_name = saved_data[PytorchModelArchitectureKeys.MODEL]
        config = saved_data[PytorchModelArchitectureKeys.CONFIG]

        if saved_class_name != cls.__name__:
            _LOGGER.error(f"Model class mismatch. File specifies '{saved_class_name}', but '{cls.__name__}' was expected.")
            raise ValueError()

        # --- RECONSTRUCTION LOGIC ---
        if 'schema_dict' not in config:
            _LOGGER.error("Invalid architecture file: missing 'schema_dict'. This file may be from an older version.")
            raise ValueError("Missing 'schema_dict' in config.")

        schema_data = config.pop('schema_dict')

        # JSON saves all dict keys as strings, convert them back to int.
        raw_index_map = schema_data['categorical_index_map']
        if raw_index_map is not None:
            rehydrated_index_map = {int(k): v for k, v in raw_index_map.items()}
        else:
            rehydrated_index_map = None

        # JSON deserializes tuples as lists, convert them back.
        schema = FeatureSchema(
            feature_names=tuple(schema_data['feature_names']),
            continuous_feature_names=tuple(schema_data['continuous_feature_names']),
            categorical_feature_names=tuple(schema_data['categorical_feature_names']),
            categorical_index_map=rehydrated_index_map,
            categorical_mappings=schema_data['categorical_mappings']
        )

        config['schema'] = schema
        # --- End Reconstruction ---

        model = cls(**config)
        if verbose:
            _LOGGER.info(f"Successfully loaded architecture for '{saved_class_name}'")
        return model

    def __repr__(self) -> str:
        internal_model_str = str(self.internal_model)
        # Grab the first line of the internal model's repr
        internal_repr = internal_model_str.split('\n')[0]
        return f"{self.model_name}(internal_model={internal_repr})"
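The adapter's central trick is the single-tensor forward pass above: one flat feature tensor is split by column index into the {'continuous': ..., 'categorical': ...} dict that pytorch_tabular models consume. A standalone sketch of that split, using a hypothetical 4-column layout (columns 2 and 3 categorical with cardinality 3):

    import torch

    # Hypothetical layout, mirroring FeatureSchema.categorical_index_map
    categorical_index_map = {2: 3, 3: 3}  # {column_index: cardinality}
    categorical_indices = list(categorical_index_map.keys())              # [2, 3]
    numerical_indices = sorted(set(range(4)) - set(categorical_indices))  # [0, 1]

    x = torch.tensor([[35.0, 52000.0, 1.0, 2.0],
                      [41.0, 61000.0, 0.0, 1.0]])

    input_dict = {
        'continuous': x[:, numerical_indices].float(),    # shape (2, 2), float values
        'categorical': x[:, categorical_indices].long(),  # shape (2, 2), integer codes
    }

The listing continues with the two public adapters built on this base.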
class DragonGateModel(_BasePytabWrapper):
    """
    Adapter for the Gated Additive Tree Ensemble (GATE) model from the 'pytorch_tabular' library.

    GATE is a hybrid model that uses Gated Feature Learning Units (GFLUs) to
    learn powerful feature representations. These learned features are then
    fed into an additive ensemble of differentiable decision trees, combining
    the representation learning of deep networks with the structured
    decision-making of tree ensembles.
    """
    def __init__(self, *,
                 schema: FeatureSchema,
                 out_targets: int,
                 embedding_dim: int = 32,
                 gflu_stages: int = 6,
                 num_trees: int = 20,
                 tree_depth: int = 5,
                 dropout: float = 0.1):
        """
        Args:
            schema (FeatureSchema):
                The definitive schema object from data_exploration.
            out_targets (int):
                Number of output targets.
            embedding_dim (int):
                Dimension of the categorical embeddings. (Recommended: 16 to 64)
            gflu_stages (int):
                Number of Gated Feature Learning Unit (GFLU) stages. (Recommended: 2 to 6)
            num_trees (int):
                Number of trees in the ensemble. (Recommended: 10 to 50)
            tree_depth (int):
                Depth of each tree. (Recommended: 4 to 8)
            dropout (float):
                Dropout rate for the GFLU.
        """
        super().__init__(schema)
        self.model_name = "DragonGateModel"
        self.out_targets = out_targets

        # Store hparams for saving/loading
        self.model_hparams = {
            'embedding_dim': embedding_dim,
            'gflu_stages': gflu_stages,
            'num_trees': num_trees,
            'tree_depth': tree_depth,
            'dropout': dropout
        }

        # Build the minimal config for the GateModel
        pt_config = self._build_pt_config(
            out_targets=out_targets,
            embedding_dim=embedding_dim,
            gflu_stages=gflu_stages,
            num_trees=num_trees,
            tree_depth=tree_depth,
            dropout=dropout,
            # GATE-specific params
            gflu_dropout=dropout,
            chain_trees=False,
        )

        # Instantiate the internal pytorch_tabular model
        self.internal_model = GatedAdditiveTreeEnsembleModel(config=pt_config)
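A minimal construction sketch for the GATE adapter above (the NODE adapter below is used the same way). The FeatureSchema here is hand-built with hypothetical features; in the real pipeline it comes from data_exploration, and pytorch_tabular plus omegaconf must be installed:

    import torch
    from ml_tools._schema import FeatureSchema
    from ml_tools.ML_models_advanced import DragonGateModel

    # Hypothetical schema: two continuous features, two categorical (cardinality 3 each)
    schema = FeatureSchema(
        feature_names=("age", "income", "color", "size"),
        continuous_feature_names=("age", "income"),
        categorical_feature_names=("color", "size"),
        categorical_index_map={2: 3, 3: 3},
        categorical_mappings={"color": {"red": 0, "green": 1, "blue": 2},
                              "size": {"S": 0, "M": 1, "L": 2}},
    )

    model = DragonGateModel(schema=schema, out_targets=1)
    logits = model(torch.tensor([[35.0, 52000.0, 1.0, 2.0]]))  # -> shape (1, 1)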
class DragonNodeModel(_BasePytabWrapper):
    """
    Adapter for the Neural Oblivious Decision Ensembles (NODE) model from the 'pytorch_tabular' library.

    NODE is a model based on an ensemble of differentiable 'oblivious'
    decision trees. An oblivious tree uses the same splitting feature and
    threshold across all nodes at the same depth. This structure, combined
    with a differentiable formulation, allows the model to be trained
    end-to-end with gradient descent, learning feature interactions and
    splitting thresholds simultaneously.
    """
    def __init__(self, *,
                 schema: FeatureSchema,
                 out_targets: int,
                 embedding_dim: int = 32,
                 num_trees: int = 1024,
                 tree_depth: int = 6,
                 dropout: float = 0.1):
        """
        Args:
            schema (FeatureSchema):
                The definitive schema object from data_exploration.
            out_targets (int):
                Number of output targets.
            embedding_dim (int):
                Dimension of the categorical embeddings. (Recommended: 16 to 64)
            num_trees (int):
                Total number of trees in the ensemble. (Recommended: 256 to 2048)
            tree_depth (int):
                Depth of each tree. (Recommended: 4 to 8)
            dropout (float):
                Dropout rate.
        """
        super().__init__(schema)
        self.model_name = "DragonNodeModel"
        self.out_targets = out_targets

        # Store hparams for saving/loading
        self.model_hparams = {
            'embedding_dim': embedding_dim,
            'num_trees': num_trees,
            'tree_depth': tree_depth,
            'dropout': dropout
        }

        # Build the minimal config for the NodeModel
        pt_config = self._build_pt_config(
            out_targets=out_targets,
            embedding_dim=embedding_dim,
            num_trees=num_trees,
            tree_depth=tree_depth,
            # NODE-specific params
            num_layers=1,  # NODE uses num_layers=1 for a single ensemble
            total_trees=num_trees,
            dropout_rate=dropout,
        )

        # Instantiate the internal pytorch_tabular model
        self.internal_model = NODEModel(config=pt_config)


def info():
    _script_info(__all__)
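Because get_architecture_config() serializes the FeatureSchema next to the hyperparameters, a saved architecture can be rebuilt without re-running data exploration. A usage sketch of the load() path defined above (the directory path is hypothetical; this restores the architecture only, not trained weights):

    from ml_tools.ML_models_advanced import DragonGateModel

    # load() rehydrates the schema (string keys back to int, lists back to tuples)
    # and then calls cls(**config) with the saved hyperparameters.
    model = DragonGateModel.load("/path/to/architecture_dir", verbose=True)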
ml_tools/ML_optimization.py
CHANGED
@@ -17,9 +17,10 @@ from ._script_info import _script_info
 from .ML_inference import PyTorchInferenceHandler
 from .keys import PyTorchInferenceKeys
 from .SQL import DatabaseManager
-from .optimization_tools import _save_result
+from .optimization_tools import _save_result, create_optimization_bounds
 from .utilities import save_dataframe_filename
 from .math_utilities import discretize_categorical_values
+from ._schema import FeatureSchema
 
 
 __all__ = [
@@ -40,66 +41,76 @@ class MLOptimizer:
     SNES and CEM algorithms do not accept bounds, the given bounds will be used as an initial starting point.
 
     Example:
-    >>> # 1. Get
-    >>>
-    >>>
-    >>>
-    >>> # Assumes feature_C is at index 2 (cardinality 2) and feature_D is at index 3 (cardinality 2)
-    >>> cat_index_map = {2: 2, 3: 2}
+    >>> # 1. Get the final schema from data exploration
+    >>> schema = data_exploration.finalize_feature_schema(...)
+    >>> # 2. Define bounds for continuous features
+    >>> cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}
     >>>
-    >>> #
+    >>> # 3. Initialize the optimizer
     >>> optimizer = MLOptimizer(
     ...     inference_handler=my_handler,
-    ...
+    ...     schema=schema,
+    ...     continuous_bounds_map=cont_bounds,
     ...     task="max",
     ...     algorithm="Genetic",
-    ...     categorical_index_map=cat_index_map,
-    ...     categorical_mappings=cat_mappings,
     ... )
-    >>> #
+    >>> # 4. Run the optimization
     >>> best_result = optimizer.run(
     ...     num_generations=100,
     ...     target_name="my_target",
-    ...     feature_names=my_feature_names,
     ...     save_dir="/path/to/results",
     ...     save_format="csv"
     ... )
     """
     def __init__(self,
                  inference_handler: PyTorchInferenceHandler,
-
+                 schema: FeatureSchema,
+                 continuous_bounds_map: Dict[str, Tuple[float, float]],
                  task: Literal["min", "max"],
                  algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
                  population_size: int = 200,
-                 categorical_index_map: Optional[Dict[int, int]] = None,
-                 categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None,
                  discretize_start_at_zero: bool = True,
                  **searcher_kwargs):
         """
         Initializes the optimizer by creating the EvoTorch problem and searcher.
 
         Args:
-            inference_handler (PyTorchInferenceHandler):
-
-
+            inference_handler (PyTorchInferenceHandler):
+                An initialized inference handler containing the model.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            continuous_bounds_map (Dict[str, Tuple[float, float]]):
+                A dictionary mapping the *name* of each **continuous** feature
+                to its (min_bound, max_bound) tuple.
             task (str): The optimization goal, either "min" or "max".
             algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
            population_size (int): Population size for CEM and GeneticAlgorithm.
-            categorical_index_map (Dict[int, int] | None): Used to discretize values after optimization. Maps {column_index: cardinality}.
-            categorical_mappings (Dict[str, Dict[str, int]] | None): Used to map discrete integer values back to strings (e.g., {0: 'Category_A'}) before saving.
             discretize_start_at_zero (bool):
                 True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
                 False if it starts at 1 (e.g., [1, 2, 3]).
-            **searcher_kwargs: Additional keyword arguments for the selected
+            **searcher_kwargs: Additional keyword arguments for the selected
+                search algorithm's constructor.
         """
-        #
+        # --- Store schema ---
+        self.schema = schema
+
+        # --- 1. Create bounds from schema ---
+        # This is the new, robust way to get bounds
+        bounds = create_optimization_bounds(
+            schema=schema,
+            continuous_bounds_map=continuous_bounds_map,
+            start_at_zero=discretize_start_at_zero
+        )
+
+        # --- 2. Make a fitness function ---
         self.evaluator = FitnessEvaluator(
             inference_handler=inference_handler,
-
+            # Get categorical info from the schema
+            categorical_index_map=schema.categorical_index_map,
             discretize_start_at_zero=discretize_start_at_zero
         )
 
-        #
+        # --- 3. Create the problem and searcher factory ---
         self.problem, self.searcher_factory = create_pytorch_problem(
             evaluator=self.evaluator,
             bounds=bounds,
@@ -108,36 +119,36 @@ class MLOptimizer:
             population_size=population_size,
             **searcher_kwargs
         )
-
-
-        self.categorical_mappings = categorical_mappings
+
+        # --- 4. Store other info needed by run() ---
         self.discretize_start_at_zero = discretize_start_at_zero
 
     def run(self,
             num_generations: int,
             target_name: str,
             save_dir: Union[str, Path],
-            feature_names: Optional[List[str]],
             save_format: Literal['csv', 'sqlite', 'both'],
             repetitions: int = 1,
             verbose: bool = True) -> Optional[dict]:
         """
         Runs the evolutionary optimization process using the pre-configured settings.
 
+        The `feature_names` are automatically pulled from the `FeatureSchema`
+        provided during initialization.
+
         Args:
             num_generations (int): The total number of generations for each repetition.
             target_name (str): Target name used for the CSV filename and/or SQL table.
             save_dir (str | Path): The directory where result files will be saved.
-            feature_names (List[str] | None): Names of the solution features for labeling output.
-                If None, generic names like 'feature_0', 'feature_1', ... , will be created.
             save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
             repetitions (int): The number of independent times to run the optimization.
             verbose (bool): If True, enables detailed logging.
 
         Returns:
-            Optional[dict]: A dictionary with the best result if repetitions is 1,
+            Optional[dict]: A dictionary with the best result if repetitions is 1,
+                otherwise None.
         """
-        # Call the existing run function
+        # Call the existing run function, passing info from the schema
         return run_optimization(
             problem=self.problem,
             searcher_factory=self.searcher_factory,
@@ -145,11 +156,13 @@ class MLOptimizer:
             target_name=target_name,
             save_dir=save_dir,
             save_format=save_format,
-
+            # Get the definitive feature names (as a list) from the schema
+            feature_names=list(self.schema.feature_names),
+            # Get categorical info from the schema
+            categorical_map=self.schema.categorical_index_map,
+            categorical_mappings=self.schema.categorical_mappings,
             repetitions=repetitions,
             verbose=verbose,
-            categorical_map=self.categorical_map,
-            categorical_mappings=self.categorical_mappings,
             discretize_start_at_zero=self.discretize_start_at_zero
         )
 
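The net effect of this refactor is that bound construction moves out of the caller and into create_optimization_bounds. Its body is not part of this diff, so the sketch below is only an illustrative guess at the contract implied by the call site: continuous features take their (min, max) from continuous_bounds_map by name, while categorical features presumably get bounds spanning their valid integer codes:

    from typing import Dict, List, Tuple

    # Illustrative guess, not the library's code.
    def sketch_create_optimization_bounds(schema,
                                          continuous_bounds_map: Dict[str, Tuple[float, float]],
                                          start_at_zero: bool = True) -> Tuple[List[float], List[float]]:
        cat_map = schema.categorical_index_map or {}
        lows: List[float] = []
        highs: List[float] = []
        for index, name in enumerate(schema.feature_names):
            if index in cat_map:
                # Categorical: bound the search to the valid integer codes.
                first_code = 0 if start_at_zero else 1
                lows.append(float(first_code))
                highs.append(float(first_code + cat_map[index] - 1))
            else:
                low, high = continuous_bounds_map[name]
                lows.append(float(low))
                highs.append(float(high))
        return lows, highs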