dragon-ml-toolbox 13.0.0__py3-none-any.whl → 13.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/RECORD +14 -14
- ml_tools/ML_datasetmaster.py +144 -63
- ml_tools/ML_models.py +119 -55
- ml_tools/ML_optimization.py +49 -36
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_schema.py +19 -0
- ml_tools/data_exploration.py +75 -46
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +1 -2
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_optimization.py
CHANGED
@@ -17,9 +17,10 @@ from ._script_info import _script_info
 from .ML_inference import PyTorchInferenceHandler
 from .keys import PyTorchInferenceKeys
 from .SQL import DatabaseManager
-from .optimization_tools import _save_result
+from .optimization_tools import _save_result, create_optimization_bounds
 from .utilities import save_dataframe_filename
 from .math_utilities import discretize_categorical_values
+from ._schema import FeatureSchema


 __all__ = [
@@ -40,66 +41,76 @@ class MLOptimizer:
     SNES and CEM algorithms do not accept bounds, the given bounds will be used as an initial starting point.

     Example:
-        >>> # 1. Get
-        >>>
-        >>>
-        >>>
-        >>> # Assumes feature_C is at index 2 (cardinality 2) and feature_D is at index 3 (cardinality 2)
-        >>> cat_index_map = {2: 2, 3: 2}
+        >>> # 1. Get the final schema from data exploration
+        >>> schema = data_exploration.finalize_feature_schema(...)
+        >>> # 2. Define bounds for continuous features
+        >>> cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}
         >>>
-        >>> #
+        >>> # 3. Initialize the optimizer
         >>> optimizer = MLOptimizer(
         ...     inference_handler=my_handler,
-        ...
+        ...     schema=schema,
+        ...     continuous_bounds_map=cont_bounds,
         ...     task="max",
         ...     algorithm="Genetic",
-        ...     categorical_index_map=cat_index_map,
-        ...     categorical_mappings=cat_mappings,
         ... )
-        >>> #
+        >>> # 4. Run the optimization
         >>> best_result = optimizer.run(
         ...     num_generations=100,
         ...     target_name="my_target",
-        ...     feature_names=my_feature_names,
         ...     save_dir="/path/to/results",
         ...     save_format="csv"
         ... )
     """
     def __init__(self,
                  inference_handler: PyTorchInferenceHandler,
-
+                 schema: FeatureSchema,
+                 continuous_bounds_map: Dict[str, Tuple[float, float]],
                  task: Literal["min", "max"],
                  algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
                  population_size: int = 200,
-                 categorical_index_map: Optional[Dict[int, int]] = None,
-                 categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None,
                  discretize_start_at_zero: bool = True,
                  **searcher_kwargs):
         """
         Initializes the optimizer by creating the EvoTorch problem and searcher.

         Args:
-            inference_handler (PyTorchInferenceHandler):
-
-
+            inference_handler (PyTorchInferenceHandler):
+                An initialized inference handler containing the model.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            continuous_bounds_map (Dict[str, Tuple[float, float]]):
+                A dictionary mapping the *name* of each **continuous** feature
+                to its (min_bound, max_bound) tuple.
             task (str): The optimization goal, either "min" or "max".
             algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
             population_size (int): Population size for CEM and GeneticAlgorithm.
-            categorical_index_map (Dict[int, int] | None): Used to discretize values after optimization. Maps {column_index: cardinality}.
-            categorical_mappings (Dict[str, Dict[str, int]] | None): Used to map discrete integer values back to strings (e.g., {0: 'Category_A'}) before saving.
             discretize_start_at_zero (bool):
                 True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
                 False if it starts at 1 (e.g., [1, 2, 3]).
-            **searcher_kwargs: Additional keyword arguments for the selected
+            **searcher_kwargs: Additional keyword arguments for the selected
+                search algorithm's constructor.
         """
-        #
+        # --- Store schema ---
+        self.schema = schema
+
+        # --- 1. Create bounds from schema ---
+        # This is the new, robust way to get bounds
+        bounds = create_optimization_bounds(
+            schema=schema,
+            continuous_bounds_map=continuous_bounds_map,
+            start_at_zero=discretize_start_at_zero
+        )
+
+        # --- 2. Make a fitness function ---
         self.evaluator = FitnessEvaluator(
             inference_handler=inference_handler,
-
+            # Get categorical info from the schema
+            categorical_index_map=schema.categorical_index_map,
             discretize_start_at_zero=discretize_start_at_zero
         )

-        #
+        # --- 3. Create the problem and searcher factory ---
         self.problem, self.searcher_factory = create_pytorch_problem(
             evaluator=self.evaluator,
             bounds=bounds,
@@ -108,36 +119,36 @@ class MLOptimizer:
             population_size=population_size,
             **searcher_kwargs
         )
-
-
-        self.categorical_mappings = categorical_mappings
+
+        # --- 4. Store other info needed by run() ---
         self.discretize_start_at_zero = discretize_start_at_zero

     def run(self,
             num_generations: int,
             target_name: str,
             save_dir: Union[str, Path],
-            feature_names: Optional[List[str]],
             save_format: Literal['csv', 'sqlite', 'both'],
             repetitions: int = 1,
             verbose: bool = True) -> Optional[dict]:
         """
         Runs the evolutionary optimization process using the pre-configured settings.

+        The `feature_names` are automatically pulled from the `FeatureSchema`
+        provided during initialization.
+
         Args:
             num_generations (int): The total number of generations for each repetition.
             target_name (str): Target name used for the CSV filename and/or SQL table.
             save_dir (str | Path): The directory where result files will be saved.
-            feature_names (List[str] | None): Names of the solution features for labeling output.
-                If None, generic names like 'feature_0', 'feature_1', ... , will be created.
             save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
             repetitions (int): The number of independent times to run the optimization.
             verbose (bool): If True, enables detailed logging.

         Returns:
-            Optional[dict]: A dictionary with the best result if repetitions is 1,
+            Optional[dict]: A dictionary with the best result if repetitions is 1,
+                otherwise None.
         """
-        # Call the existing run function
+        # Call the existing run function, passing info from the schema
         return run_optimization(
             problem=self.problem,
             searcher_factory=self.searcher_factory,
@@ -145,11 +156,13 @@ class MLOptimizer:
             target_name=target_name,
             save_dir=save_dir,
             save_format=save_format,
-
+            # Get the definitive feature names (as a list) from the schema
+            feature_names=list(self.schema.feature_names),
+            # Get categorical info from the schema
+            categorical_map=self.schema.categorical_index_map,
+            categorical_mappings=self.schema.categorical_mappings,
             repetitions=repetitions,
             verbose=verbose,
-            categorical_map=self.categorical_map,
-            categorical_mappings=self.categorical_mappings,
             discretize_start_at_zero=self.discretize_start_at_zero
         )
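For context, a minimal usage sketch of the new schema-driven MLOptimizer workflow, assembled from the docstring above; `my_handler`, `df_features`, `mappings`, the bounds values, and the save path are illustrative placeholders, not values shipped with the package:

# Hedged sketch of the 13.1.0 MLOptimizer workflow; my_handler, df_features,
# mappings, the bounds values, and the save path are placeholders only.
from ml_tools import data_exploration
from ml_tools.ML_optimization import MLOptimizer

# 1. Build the schema once, at the end of feature engineering
schema = data_exploration.finalize_feature_schema(df_features, mappings)

# 2. Bounds are supplied only for continuous features; categorical bounds
#    are derived automatically from the schema
optimizer = MLOptimizer(
    inference_handler=my_handler,  # an initialized PyTorchInferenceHandler
    schema=schema,
    continuous_bounds_map={"feature_A": (0, 100), "feature_B": (-10, 10)},
    task="max",
    algorithm="Genetic",
)

# 3. feature_names and categorical info are no longer passed to run()
best_result = optimizer.run(
    num_generations=100,
    target_name="my_target",
    save_dir="/path/to/results",
    save_format="csv",
)
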
ml_tools/PSO_optimization.py
CHANGED
@@ -17,6 +17,10 @@ from ._script_info import _script_info
 from .SQL import DatabaseManager
 from .optimization_tools import _save_result

+"""
+DEPRECATED
+"""
+

 __all__ = [
     "ObjectiveFunction",
@@ -46,7 +50,7 @@ class ObjectiveFunction():
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
-        self._artifact = deserialize_object(trained_model_path, verbose=False
+        self._artifact = deserialize_object(trained_model_path, verbose=False)
         self.model = self._get_from_artifact(EnsembleKeys.MODEL)
         self.feature_names: Optional[list[str]] = self._get_from_artifact(EnsembleKeys.FEATURES)  # type: ignore
         self.target_name: Optional[str] = self._get_from_artifact(EnsembleKeys.TARGET)  # type: ignore
ml_tools/_schema.py
ADDED
@@ -0,0 +1,19 @@
+from typing import NamedTuple, Tuple, Optional, Dict
+
+class FeatureSchema(NamedTuple):
+    """Holds the final, definitive schema for the model pipeline."""
+
+    # The final, ordered list of all feature names
+    feature_names: Tuple[str, ...]
+
+    # List of all continuous feature names
+    continuous_feature_names: Tuple[str, ...]
+
+    # List of all categorical feature names
+    categorical_feature_names: Tuple[str, ...]
+
+    # Map of {column_index: cardinality} for categorical features
+    categorical_index_map: Optional[Dict[int, int]]
+
+    # The original string-to-int mappings (e.g., {'color': {'red': 0, 'blue': 1}})
+    categorical_mappings: Optional[Dict[str, Dict[str, int]]]
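Since FeatureSchema is a plain NamedTuple, it can also be constructed and inspected directly; the field names below come from the diff, but all values are invented for illustration:

# Illustrative FeatureSchema instance; feature names and mappings are made up.
from ml_tools._schema import FeatureSchema

schema = FeatureSchema(
    feature_names=("temp", "pressure", "color"),
    continuous_feature_names=("temp", "pressure"),
    categorical_feature_names=("color",),
    categorical_index_map={2: 2},  # 'color' sits at column index 2, 2 classes
    categorical_mappings={"color": {"red": 0, "blue": 1}},
)

assert schema.feature_names[2] in schema.categorical_feature_names
# NamedTuple fields are read-only, so the schema can be shared safely
# across datasets, models, and optimizers.
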
ml_tools/data_exploration.py
CHANGED
@@ -11,7 +11,7 @@ from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
 from .utilities import save_dataframe_filename
-
+from ._schema import FeatureSchema

 # Keep track of all available tools, show using `info()`
 __all__ = [
@@ -32,9 +32,9 @@ __all__ = [
     "drop_outlier_samples",
     "match_and_filter_columns_by_regex",
     "standardize_percentages",
-    "create_transformer_categorical_map",
     "reconstruct_one_hot",
-    "reconstruct_binary"
+    "reconstruct_binary",
+    "finalize_feature_schema"
 ]


@@ -977,49 +977,6 @@ def standardize_percentages(
     return df_copy


-def create_transformer_categorical_map(
-    df: pd.DataFrame,
-    mappings: Dict[str, Dict[str, int]],
-    verbose: bool = True
-) -> Dict[int, int]:
-    """
-    Creates the `categorical_map` required by a `TabularTransformer` model.
-
-    This function should be called late in the preprocessing pipeline, after all
-    column additions, deletions, or reordering have occurred. It uses the final
-    DataFrame's column order to map the correct column index to its cardinality.
-
-    Args:
-        df (pd.DataFrame): The final, processed DataFrame.
-        mappings (Dict[str, Dict[str, int]]): The mappings dictionary generated by
-            `encode_categorical_features`, containing the category-to-integer
-            mapping for each categorical column.
-        verbose (bool): If True, prints mapping progress.
-
-    Returns:
-        (Dict[int, int]): The final `categorical_map` for the transformer,
-            mapping each column's current index to its cardinality (e.g., {0: 3}).
-    """
-    transformer_map = {}
-    categorical_column_names = mappings.keys()
-
-    _LOGGER.info("Creating categorical map for TabularTransformer.")
-    for col_name in categorical_column_names:
-        if col_name in df.columns:
-            col_idx = df.columns.get_loc(col_name)
-
-            # Get cardinality directly from the length of the mapping dictionary
-            cardinality = len(mappings[col_name])
-
-            transformer_map[col_idx] = cardinality
-            if verbose:
-                print(f" - Mapping column '{col_name}' at index {col_idx} with cardinality {cardinality}.")
-        else:
-            _LOGGER.warning(f"Categorical column '{col_name}' not found in the final DataFrame. Skipping.")
-
-    return transformer_map
-
-
 def reconstruct_one_hot(
     df: pd.DataFrame,
     features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
@@ -1274,6 +1231,78 @@ def reconstruct_binary(
     return new_df


+def finalize_feature_schema(
+    df_features: pd.DataFrame,
+    categorical_mappings: Optional[Dict[str, Dict[str, int]]]
+) -> FeatureSchema:
+    """
+    Analyzes the final features DataFrame to create a definitive schema.
+
+    This function is the "single source of truth" for column order
+    and type (categorical vs. continuous) for the entire ML pipeline.
+
+    It should be called at the end of the feature engineering process.
+
+    Args:
+        df_features (pd.DataFrame):
+            The final, processed DataFrame containing *only* feature columns
+            in the exact order they will be fed to the model.
+        categorical_mappings (Dict[str, Dict[str, int]] | None):
+            The mappings dictionary generated by
+            `encode_categorical_features`. Can be None if no
+            categorical features exist.
+
+    Returns:
+        FeatureSchema: A NamedTuple containing all necessary metadata for the pipeline.
+    """
+    feature_names: List[str] = df_features.columns.to_list()
+
+    # Intermediate lists for building
+    continuous_feature_names_list: List[str] = []
+    categorical_feature_names_list: List[str] = []
+    categorical_index_map_dict: Dict[int, int] = {}
+
+    _LOGGER.info("Finalizing feature schema...")
+
+    if categorical_mappings:
+        # --- Categorical features are present ---
+        categorical_names_set = set(categorical_mappings.keys())
+
+        for index, name in enumerate(feature_names):
+            if name in categorical_names_set:
+                # This is a categorical feature
+                cardinality = len(categorical_mappings[name])
+                categorical_index_map_dict[index] = cardinality
+                categorical_feature_names_list.append(name)
+            else:
+                # This is a continuous feature
+                continuous_feature_names_list.append(name)
+
+        # Use the populated dict, or None if it's empty
+        final_index_map = categorical_index_map_dict if categorical_index_map_dict else None
+
+    else:
+        # --- No categorical features ---
+        _LOGGER.info("No categorical mappings provided. Treating all features as continuous.")
+        continuous_feature_names_list = list(feature_names)
+        # categorical_feature_names_list remains empty
+        # categorical_index_map_dict remains empty
+        final_index_map = None  # Explicitly set to None to match Optional type
+
+    _LOGGER.info(f"Schema created: {len(continuous_feature_names_list)} continuous, {len(categorical_feature_names_list)} categorical.")
+
+    # Create the final immutable instance
+    schema_instance = FeatureSchema(
+        feature_names=tuple(feature_names),
+        continuous_feature_names=tuple(continuous_feature_names_list),
+        categorical_feature_names=tuple(categorical_feature_names_list),
+        categorical_index_map=final_index_map,
+        categorical_mappings=categorical_mappings
+    )
+
+    return schema_instance
+
+
 def _validate_columns(df: pd.DataFrame, columns: list[str]):
     valid_columns = [column for column in columns if column in df.columns]
     return valid_columns
ml_tools/optimization_tools.py
CHANGED
@@ -9,6 +9,7 @@ from .utilities import yield_dataframes_from_dir
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .SQL import DatabaseManager
+from ._schema import FeatureSchema


 __all__ = [
@@ -19,35 +20,25 @@ __all__ = [


 def create_optimization_bounds(
-
+    schema: FeatureSchema,
     continuous_bounds_map: Dict[str, Tuple[float, float]],
-    categorical_map: Dict[int, int],
-    target_column: Optional[str] = None,
     start_at_zero: bool = True
 ) -> Tuple[List[float], List[float]]:
     """
-    Generates the lower and upper bounds lists for the optimizer from a
+    Generates the lower and upper bounds lists for the optimizer from a FeatureSchema.

     This helper function automates the creation of unbiased bounds for
     categorical features and combines them with user-defined bounds for
-    continuous features
-
-    It reads *only* the header of the provided CSV to determine the full
-    list of feature columns and their order, excluding the specified target.
-    This is memory-efficient as the full dataset is not loaded.
+    continuous features, using the schema as the single source of truth
+    for feature order and type.

     Args:
-
-
-
+        schema (FeatureSchema):
+            The definitive schema object created by
+            `data_exploration.finalize_feature_schema()`.
         continuous_bounds_map (Dict[str, Tuple[float, float]]):
             A dictionary mapping the *name* of each **continuous** feature
             to its (min_bound, max_bound) tuple.
-        categorical_map (Dict[int, int]):
-            The map from the *index* of each **categorical** feature to its cardinality.
-            (e.g., {2: 4} for a feature at index 2 with 4 categories).
-        target_column (Optional[str], optional):
-            The name of the target column to exclude. If None (default), the *last column* in the CSV is assumed to be the target.
         start_at_zero (bool):
             - If True, assumes categorical encoding is [0, 1, ..., k-1].
               Bounds will be set as [-0.5, k - 0.5].
@@ -59,98 +50,86 @@ def create_optimization_bounds(
         A tuple containing two lists: (lower_bounds, upper_bounds).

     Raises:
-        ValueError: If a feature is
-
-
+        ValueError: If a feature is missing from `continuous_bounds_map`
+            or if a feature name in the map is not a
+            continuous feature according to the schema.
     """
-    # 1.
-
-
-        df_header = pd.read_csv(full_csv_path, nrows=0, encoding="utf-8")
-    except Exception as e:
-        _LOGGER.error(f"Failed to read header from CSV: {e}")
-        raise
-
-    all_column_names = df_header.columns.to_list()
-    feature_names: List[str] = []
-
-    if target_column is None:
-        feature_names = all_column_names[:-1]
-        excluded_target = all_column_names[-1]
-        _LOGGER.info(f"No target_column provided. Assuming last column '{excluded_target}' is the target.")
-    else:
-        if target_column not in all_column_names:
-            _LOGGER.error(f"Target column '{target_column}' not found in CSV header.")
-            raise ValueError()
-        feature_names = [name for name in all_column_names if name != target_column]
-        _LOGGER.info(f"Excluding target column '{target_column}'.")
-
-    # 2. Initialize bound lists
+    # 1. Get feature names and map from schema
+    feature_names = schema.feature_names
+    categorical_index_map = schema.categorical_index_map
     total_features = len(feature_names)
+
     if total_features <= 0:
-        _LOGGER.error("
+        _LOGGER.error("Schema contains no features.")
         raise ValueError()
+
+    _LOGGER.info(f"Generating bounds for {total_features} total features...")

+    # 2. Initialize bound lists
     lower_bounds: List[Optional[float]] = [None] * total_features
     upper_bounds: List[Optional[float]] = [None] * total_features
-
-    _LOGGER.info(f"Generating bounds for {total_features} total features...")

     # 3. Populate categorical bounds (Index-based)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        upper_bounds[index] = high
+    if categorical_index_map:
+        for index, cardinality in categorical_index_map.items():
+            if not (0 <= index < total_features):
+                _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
+                raise ValueError()
+
+            if start_at_zero:
+                # Rule for [0, k-1]: bounds are [-0.5, k - 0.5]
+                low = -0.5
+                high = float(cardinality) - 0.5
+            else:
+                # Rule for [1, k]: bounds are [0.5, k + 0.5]
+                low = 0.5
+                high = float(cardinality) + 0.5
+
+            lower_bounds[index] = low
+            upper_bounds[index] = high

-
+        _LOGGER.info(f"Automatically set bounds for {len(categorical_index_map)} categorical features.")
+    else:
+        _LOGGER.info("No categorical features found in schema.")

     # 4. Populate continuous bounds (Name-based)
+    # Use schema.continuous_feature_names for robust checking
+    continuous_names_set = set(schema.continuous_feature_names)
+
+    if continuous_names_set != set(continuous_bounds_map.keys()):
+        missing_in_map = continuous_names_set - set(continuous_bounds_map.keys())
+        if missing_in_map:
+            _LOGGER.error(f"The following continuous features are missing from 'continuous_bounds_map': {list(missing_in_map)}")
+
+        extra_in_map = set(continuous_bounds_map.keys()) - continuous_names_set
+        if extra_in_map:
+            _LOGGER.error(f"The following features in 'continuous_bounds_map' are not defined as continuous in the schema: {list(extra_in_map)}")
+
+        raise ValueError("Mismatch between 'continuous_bounds_map' and schema's continuous features.")
+
     count_continuous = 0
     for name, (low, high) in continuous_bounds_map.items():
-
-
-
-
-            _LOGGER.warning(f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns.")
-            continue
-
+        # Map name to its index in the *feature-only* list
+        # This is guaranteed to be correct by the schema
+        index = feature_names.index(name)
+
         if lower_bounds[index] is not None:
-            # This
-            _LOGGER.error(f"Feature '{name}' (at index {index}) is defined
+            # This should be impossible if schema is correct, but good to check
+            _LOGGER.error(f"Schema conflict: Feature '{name}' (at index {index}) is defined as both continuous and categorical.")
             raise ValueError()
-
+
         lower_bounds[index] = float(low)
         upper_bounds[index] = float(high)
         count_continuous += 1

     _LOGGER.info(f"Manually set bounds for {count_continuous} continuous features.")

-    # 5. Validation
-
-
-        if lower_bounds[i] is None:
-            missing_indices.append(i)
-
-    if missing_indices:
+    # 5. Final Validation (all Nones should be filled)
+    if None in lower_bounds:
+        missing_indices = [i for i, b in enumerate(lower_bounds) if b is None]
         missing_names = [feature_names[i] for i in missing_indices]
-        _LOGGER.error(f"
-        raise
-
-    # _LOGGER.info("All bounds successfully created.")
+        _LOGGER.error(f"Failed to create all bounds. This indicates an internal logic error. Missing: {missing_names}")
+        raise RuntimeError("Internal error: Not all bounds were populated.")

     # Cast to float lists, as 'None' sentinels are gone
     return (
ml_tools/serde.py
CHANGED
@@ -116,8 +116,7 @@ def deserialize_object(
     # Can't do an isinstance check on 'Any', skip it.
     if type_to_check is not Any and not isinstance(obj, type_to_check):
         error_msg = (
-            f"Type mismatch: Expected an instance of '{expected_type}', "
-            f"but found '{type(obj)}' in '{true_filepath}'."
+            f"Type mismatch: Expected an instance of '{expected_type}', but found '{type(obj)}' in '{true_filepath}'."
         )
         _LOGGER.error(error_msg)
         raise TypeError()