dragon-ml-toolbox 13.0.0__py3-none-any.whl → 13.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -17,9 +17,10 @@ from ._script_info import _script_info
  from .ML_inference import PyTorchInferenceHandler
  from .keys import PyTorchInferenceKeys
  from .SQL import DatabaseManager
- from .optimization_tools import _save_result
+ from .optimization_tools import _save_result, create_optimization_bounds
  from .utilities import save_dataframe_filename
  from .math_utilities import discretize_categorical_values
+ from ._schema import FeatureSchema


  __all__ = [
@@ -40,66 +41,76 @@ class MLOptimizer:
      SNES and CEM algorithms do not accept bounds, the given bounds will be used as an initial starting point.

      Example:
-     >>> # 1. Get categorical info from preprocessing steps
-     >>> # e.g., from data_exploration.encode_categorical_features
-     >>> cat_mappings = {'feature_C': {'A': 0, 'B': 1}, 'feature_D': {'X': 0, 'Y': 1}}
-     >>> # e.g., from data_exploration.create_transformer_categorical_map
-     >>> # Assumes feature_C is at index 2 (cardinality 2) and feature_D is at index 3 (cardinality 2)
-     >>> cat_index_map = {2: 2, 3: 2}
+     >>> # 1. Get the final schema from data exploration
+     >>> schema = data_exploration.finalize_feature_schema(...)
+     >>> # 2. Define bounds for continuous features
+     >>> cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}
      >>>
-     >>> # 2. Initialize the optimizer
+     >>> # 3. Initialize the optimizer
      >>> optimizer = MLOptimizer(
      ...     inference_handler=my_handler,
-     ...     bounds=(lower_bounds, upper_bounds), # Bounds for ALL features
+     ...     schema=schema,
+     ...     continuous_bounds_map=cont_bounds,
      ...     task="max",
      ...     algorithm="Genetic",
-     ...     categorical_index_map=cat_index_map,
-     ...     categorical_mappings=cat_mappings,
      ... )
-     >>> # 3. Run the optimization
+     >>> # 4. Run the optimization
      >>> best_result = optimizer.run(
      ...     num_generations=100,
      ...     target_name="my_target",
-     ...     feature_names=my_feature_names,
      ...     save_dir="/path/to/results",
      ...     save_format="csv"
      ... )
      """
      def __init__(self,
                   inference_handler: PyTorchInferenceHandler,
-                  bounds: Tuple[List[float], List[float]],
+                  schema: FeatureSchema,
+                  continuous_bounds_map: Dict[str, Tuple[float, float]],
                   task: Literal["min", "max"],
                   algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
                   population_size: int = 200,
-                  categorical_index_map: Optional[Dict[int, int]] = None,
-                  categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None,
                   discretize_start_at_zero: bool = True,
                   **searcher_kwargs):
          """
          Initializes the optimizer by creating the EvoTorch problem and searcher.

          Args:
-             inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
-             bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for ALL solution features.
-                 Use the `optimization_tools.create_optimization_bounds()` helper to easily generate this and ensure unbiased categorical bounds.
+             inference_handler (PyTorchInferenceHandler):
+                 An initialized inference handler containing the model.
+             schema (FeatureSchema):
+                 The definitive schema object from data_exploration.
+             continuous_bounds_map (Dict[str, Tuple[float, float]]):
+                 A dictionary mapping the *name* of each **continuous** feature
+                 to its (min_bound, max_bound) tuple.
              task (str): The optimization goal, either "min" or "max".
              algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
              population_size (int): Population size for CEM and GeneticAlgorithm.
-             categorical_index_map (Dict[int, int] | None): Used to discretize values after optimization. Maps {column_index: cardinality}.
-             categorical_mappings (Dict[str, Dict[str, int]] | None): Used to map discrete integer values back to strings (e.g., {0: 'Category_A'}) before saving.
              discretize_start_at_zero (bool):
                  True if the discrete encoding starts at 0 (e.g., [0, 1, 2]).
                  False if it starts at 1 (e.g., [1, 2, 3]).
-             **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
+             **searcher_kwargs: Additional keyword arguments for the selected
+                 search algorithm's constructor.
          """
-         # Make a fitness function
+         # --- Store schema ---
+         self.schema = schema
+
+         # --- 1. Create bounds from schema ---
+         # This is the new, robust way to get bounds
+         bounds = create_optimization_bounds(
+             schema=schema,
+             continuous_bounds_map=continuous_bounds_map,
+             start_at_zero=discretize_start_at_zero
+         )
+
+         # --- 2. Make a fitness function ---
          self.evaluator = FitnessEvaluator(
              inference_handler=inference_handler,
-             categorical_index_map=categorical_index_map,
+             # Get categorical info from the schema
+             categorical_index_map=schema.categorical_index_map,
              discretize_start_at_zero=discretize_start_at_zero
          )

-         # Call the existing factory function to get the problem and searcher factory
+         # --- 3. Create the problem and searcher factory ---
          self.problem, self.searcher_factory = create_pytorch_problem(
              evaluator=self.evaluator,
              bounds=bounds,
@@ -108,36 +119,36 @@ class MLOptimizer:
              population_size=population_size,
              **searcher_kwargs
          )
-         # Store categorical info to pass to the run function
-         self.categorical_map = categorical_index_map
-         self.categorical_mappings = categorical_mappings
+
+         # --- 4. Store other info needed by run() ---
          self.discretize_start_at_zero = discretize_start_at_zero

      def run(self,
              num_generations: int,
              target_name: str,
              save_dir: Union[str, Path],
-             feature_names: Optional[List[str]],
              save_format: Literal['csv', 'sqlite', 'both'],
              repetitions: int = 1,
              verbose: bool = True) -> Optional[dict]:
          """
          Runs the evolutionary optimization process using the pre-configured settings.

+         The `feature_names` are automatically pulled from the `FeatureSchema`
+         provided during initialization.
+
          Args:
              num_generations (int): The total number of generations for each repetition.
              target_name (str): Target name used for the CSV filename and/or SQL table.
              save_dir (str | Path): The directory where result files will be saved.
-             feature_names (List[str] | None): Names of the solution features for labeling output.
-                 If None, generic names like 'feature_0', 'feature_1', ... , will be created.
              save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
              repetitions (int): The number of independent times to run the optimization.
              verbose (bool): If True, enables detailed logging.

          Returns:
-             Optional[dict]: A dictionary with the best result if repetitions is 1, otherwise None.
+             Optional[dict]: A dictionary with the best result if repetitions is 1,
+                 otherwise None.
          """
-         # Call the existing run function with the stored problem, searcher, and categorical info
+         # Call the existing run function, passing info from the schema
          return run_optimization(
              problem=self.problem,
              searcher_factory=self.searcher_factory,
@@ -145,11 +156,13 @@ class MLOptimizer:
              target_name=target_name,
              save_dir=save_dir,
              save_format=save_format,
-             feature_names=feature_names,
+             # Get the definitive feature names (as a list) from the schema
+             feature_names=list(self.schema.feature_names),
+             # Get categorical info from the schema
+             categorical_map=self.schema.categorical_index_map,
+             categorical_mappings=self.schema.categorical_mappings,
              repetitions=repetitions,
              verbose=verbose,
-             categorical_map=self.categorical_map,
-             categorical_mappings=self.categorical_mappings,
              discretize_start_at_zero=self.discretize_start_at_zero
          )

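The net effect of this refactor is that callers no longer assemble bounds and categorical maps by hand. A minimal before/after sketch, assuming a trained handler `my_handler`, a features DataFrame `df_features`, and encoder mappings `cat_mappings` (all placeholders):

```python
# 13.0.0: the caller built bounds and categorical maps manually
optimizer = MLOptimizer(
    inference_handler=my_handler,
    bounds=(lower_bounds, upper_bounds),  # bounds for ALL features
    task="max",
    categorical_index_map={2: 2, 3: 2},
    categorical_mappings={'feature_C': {'A': 0, 'B': 1}},
)

# 13.1.0: the FeatureSchema carries feature order and categorical info;
# categorical bounds are derived internally via create_optimization_bounds()
schema = finalize_feature_schema(df_features, cat_mappings)
optimizer = MLOptimizer(
    inference_handler=my_handler,
    schema=schema,
    continuous_bounds_map={'feature_A': (0, 100), 'feature_B': (-10, 10)},
    task="max",
)
best = optimizer.run(
    num_generations=100,
    target_name="my_target",
    save_dir="/path/to/results",
    save_format="csv",
)
```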
@@ -17,6 +17,10 @@ from ._script_info import _script_info
  from .SQL import DatabaseManager
  from .optimization_tools import _save_result

+ """
+ DEPRECATED
+ """
+

  __all__ = [
      "ObjectiveFunction",
@@ -46,7 +50,7 @@ class ObjectiveFunction():
          self.binary_features = binary_features
          self.is_hybrid = False if binary_features <= 0 else True
          self.use_noise = add_noise
-         self._artifact = deserialize_object(trained_model_path, verbose=False, raise_on_error=True)
+         self._artifact = deserialize_object(trained_model_path, verbose=False)
          self.model = self._get_from_artifact(EnsembleKeys.MODEL)
          self.feature_names: Optional[list[str]] = self._get_from_artifact(EnsembleKeys.FEATURES) # type: ignore
          self.target_name: Optional[str] = self._get_from_artifact(EnsembleKeys.TARGET) # type: ignore
ml_tools/_schema.py ADDED
@@ -0,0 +1,19 @@
+ from typing import NamedTuple, Tuple, Optional, Dict
+
+ class FeatureSchema(NamedTuple):
+     """Holds the final, definitive schema for the model pipeline."""
+
+     # The final, ordered list of all feature names
+     feature_names: Tuple[str, ...]
+
+     # List of all continuous feature names
+     continuous_feature_names: Tuple[str, ...]
+
+     # List of all categorical feature names
+     categorical_feature_names: Tuple[str, ...]
+
+     # Map of {column_index: cardinality} for categorical features
+     categorical_index_map: Optional[Dict[int, int]]
+
+     # The original string-to-int mappings (e.g., {'color': {'red': 0, 'blue': 1}})
+     categorical_mappings: Optional[Dict[str, Dict[str, int]]]
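Since `FeatureSchema` is a plain `NamedTuple`, instances are immutable and hashable, and fields are accessed by name or position. An illustrative sketch with made-up values (assuming the package imports as `ml_tools`):

```python
from ml_tools._schema import FeatureSchema

# Hypothetical schema: three features, 'city' categorical at index 2
schema = FeatureSchema(
    feature_names=("age", "income", "city"),
    continuous_feature_names=("age", "income"),
    categorical_feature_names=("city",),
    categorical_index_map={2: 3},  # column index 2, cardinality 3
    categorical_mappings={"city": {"NY": 0, "LA": 1, "SF": 2}},
)

assert schema.feature_names.index("city") == 2
# NamedTuple fields are read-only; derive a variant with _replace:
all_continuous = schema._replace(
    categorical_index_map=None, categorical_mappings=None
)
```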
@@ -11,7 +11,7 @@ from .path_manager import sanitize_filename, make_fullpath
  from ._script_info import _script_info
  from ._logger import _LOGGER
  from .utilities import save_dataframe_filename
-
+ from ._schema import FeatureSchema

  # Keep track of all available tools, show using `info()`
  __all__ = [
@@ -32,9 +32,9 @@ __all__ = [
      "drop_outlier_samples",
      "match_and_filter_columns_by_regex",
      "standardize_percentages",
-     "create_transformer_categorical_map",
      "reconstruct_one_hot",
-     "reconstruct_binary"
+     "reconstruct_binary",
+     "finalize_feature_schema"
  ]


@@ -977,49 +977,6 @@ def standardize_percentages(
      return df_copy


- def create_transformer_categorical_map(
-     df: pd.DataFrame,
-     mappings: Dict[str, Dict[str, int]],
-     verbose: bool = True
- ) -> Dict[int, int]:
-     """
-     Creates the `categorical_map` required by a `TabularTransformer` model.
-
-     This function should be called late in the preprocessing pipeline, after all
-     column additions, deletions, or reordering have occurred. It uses the final
-     DataFrame's column order to map the correct column index to its cardinality.
-
-     Args:
-         df (pd.DataFrame): The final, processed DataFrame.
-         mappings (Dict[str, Dict[str, int]]): The mappings dictionary generated by
-             `encode_categorical_features`, containing the category-to-integer
-             mapping for each categorical column.
-         verbose (bool): If True, prints mapping progress.
-
-     Returns:
-         (Dict[int, int]): The final `categorical_map` for the transformer,
-             mapping each column's current index to its cardinality (e.g., {0: 3}).
-     """
-     transformer_map = {}
-     categorical_column_names = mappings.keys()
-
-     _LOGGER.info("Creating categorical map for TabularTransformer.")
-     for col_name in categorical_column_names:
-         if col_name in df.columns:
-             col_idx = df.columns.get_loc(col_name)
-
-             # Get cardinality directly from the length of the mapping dictionary
-             cardinality = len(mappings[col_name])
-
-             transformer_map[col_idx] = cardinality
-             if verbose:
-                 print(f" - Mapping column '{col_name}' at index {col_idx} with cardinality {cardinality}.")
-         else:
-             _LOGGER.warning(f"Categorical column '{col_name}' not found in the final DataFrame. Skipping.")
-
-     return transformer_map
-
-
  def reconstruct_one_hot(
      df: pd.DataFrame,
      features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
@@ -1274,6 +1231,78 @@ def reconstruct_binary(
      return new_df


+ def finalize_feature_schema(
+     df_features: pd.DataFrame,
+     categorical_mappings: Optional[Dict[str, Dict[str, int]]]
+ ) -> FeatureSchema:
+     """
+     Analyzes the final features DataFrame to create a definitive schema.
+
+     This function is the "single source of truth" for column order
+     and type (categorical vs. continuous) for the entire ML pipeline.
+
+     It should be called at the end of the feature engineering process.
+
+     Args:
+         df_features (pd.DataFrame):
+             The final, processed DataFrame containing *only* feature columns
+             in the exact order they will be fed to the model.
+         categorical_mappings (Dict[str, Dict[str, int]] | None):
+             The mappings dictionary generated by
+             `encode_categorical_features`. Can be None if no
+             categorical features exist.
+
+     Returns:
+         FeatureSchema: A NamedTuple containing all necessary metadata for the pipeline.
+     """
+     feature_names: List[str] = df_features.columns.to_list()
+
+     # Intermediate lists for building
+     continuous_feature_names_list: List[str] = []
+     categorical_feature_names_list: List[str] = []
+     categorical_index_map_dict: Dict[int, int] = {}
+
+     _LOGGER.info("Finalizing feature schema...")
+
+     if categorical_mappings:
+         # --- Categorical features are present ---
+         categorical_names_set = set(categorical_mappings.keys())
+
+         for index, name in enumerate(feature_names):
+             if name in categorical_names_set:
+                 # This is a categorical feature
+                 cardinality = len(categorical_mappings[name])
+                 categorical_index_map_dict[index] = cardinality
+                 categorical_feature_names_list.append(name)
+             else:
+                 # This is a continuous feature
+                 continuous_feature_names_list.append(name)
+
+         # Use the populated dict, or None if it's empty
+         final_index_map = categorical_index_map_dict if categorical_index_map_dict else None
+
+     else:
+         # --- No categorical features ---
+         _LOGGER.info("No categorical mappings provided. Treating all features as continuous.")
+         continuous_feature_names_list = list(feature_names)
+         # categorical_feature_names_list remains empty
+         # categorical_index_map_dict remains empty
+         final_index_map = None  # Explicitly set to None to match Optional type
+
+     _LOGGER.info(f"Schema created: {len(continuous_feature_names_list)} continuous, {len(categorical_feature_names_list)} categorical.")
+
+     # Create the final immutable instance
+     schema_instance = FeatureSchema(
+         feature_names=tuple(feature_names),
+         continuous_feature_names=tuple(continuous_feature_names_list),
+         categorical_feature_names=tuple(categorical_feature_names_list),
+         categorical_index_map=final_index_map,
+         categorical_mappings=categorical_mappings
+     )
+
+     return schema_instance
+
+
  def _validate_columns(df: pd.DataFrame, columns: list[str]):
      valid_columns = [column for column in columns if column in df.columns]
      return valid_columns
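As a quick illustration of the classification logic above (column names and values are made up), features are split purely by membership in the mappings dict, preserving DataFrame column order:

```python
import pandas as pd

df_features = pd.DataFrame({
    "age":   [25, 40],
    "color": [0, 2],     # already integer-encoded
    "score": [0.1, 0.9],
})
mappings = {"color": {"red": 0, "green": 1, "blue": 2}}

schema = finalize_feature_schema(df_features, mappings)
# schema.feature_names             == ("age", "color", "score")
# schema.continuous_feature_names  == ("age", "score")
# schema.categorical_feature_names == ("color",)
# schema.categorical_index_map     == {1: 3}  # 'color' at index 1, 3 categories
```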
@@ -9,6 +9,7 @@ from .utilities import yield_dataframes_from_dir
  from ._logger import _LOGGER
  from ._script_info import _script_info
  from .SQL import DatabaseManager
+ from ._schema import FeatureSchema


  __all__ = [
@@ -19,35 +20,25 @@ __all__ = [


  def create_optimization_bounds(
-     csv_path: Union[str, Path],
+     schema: FeatureSchema,
      continuous_bounds_map: Dict[str, Tuple[float, float]],
-     categorical_map: Dict[int, int],
-     target_column: Optional[str] = None,
      start_at_zero: bool = True
  ) -> Tuple[List[float], List[float]]:
      """
-     Generates the lower and upper bounds lists for the optimizer from a CSV header.
+     Generates the lower and upper bounds lists for the optimizer from a FeatureSchema.

      This helper function automates the creation of unbiased bounds for
      categorical features and combines them with user-defined bounds for
-     continuous features.
-
-     It reads *only* the header of the provided CSV to determine the full
-     list of feature columns and their order, excluding the specified target.
-     This is memory-efficient as the full dataset is not loaded.
+     continuous features, using the schema as the single source of truth
+     for feature order and type.

      Args:
-         csv_path (Union[str, Path]):
-             Path to the final, preprocessed CSV file. The column order in
-             this file must match the order expected by the model.
+         schema (FeatureSchema):
+             The definitive schema object created by
+             `data_exploration.finalize_feature_schema()`.
          continuous_bounds_map (Dict[str, Tuple[float, float]]):
              A dictionary mapping the *name* of each **continuous** feature
              to its (min_bound, max_bound) tuple.
-         categorical_map (Dict[int, int]):
-             The map from the *index* of each **categorical** feature to its cardinality.
-             (e.g., {2: 4} for a feature at index 2 with 4 categories).
-         target_column (Optional[str], optional):
-             The name of the target column to exclude. If None (default), the *last column* in the CSV is assumed to be the target.
          start_at_zero (bool):
              - If True, assumes categorical encoding is [0, 1, ..., k-1].
                Bounds will be set as [-0.5, k - 0.5].
@@ -59,98 +50,86 @@ def create_optimization_bounds(
          A tuple containing two lists: (lower_bounds, upper_bounds).

      Raises:
-         ValueError: If a feature is defined in both maps, is missing from
-             both maps, or if a name in `continuous_bounds_map`
-             or `target_column` is not found in the CSV columns.
+         ValueError: If a feature is missing from `continuous_bounds_map`
+             or if a feature name in the map is not a
+             continuous feature according to the schema.
      """
-     # 1. Read header and determine feature names
-     full_csv_path = make_fullpath(csv_path, enforce="file")
-     try:
-         df_header = pd.read_csv(full_csv_path, nrows=0, encoding="utf-8")
-     except Exception as e:
-         _LOGGER.error(f"Failed to read header from CSV: {e}")
-         raise
-
-     all_column_names = df_header.columns.to_list()
-     feature_names: List[str] = []
-
-     if target_column is None:
-         feature_names = all_column_names[:-1]
-         excluded_target = all_column_names[-1]
-         _LOGGER.info(f"No target_column provided. Assuming last column '{excluded_target}' is the target.")
-     else:
-         if target_column not in all_column_names:
-             _LOGGER.error(f"Target column '{target_column}' not found in CSV header.")
-             raise ValueError()
-         feature_names = [name for name in all_column_names if name != target_column]
-         _LOGGER.info(f"Excluding target column '{target_column}'.")
-
-     # 2. Initialize bound lists
+     # 1. Get feature names and map from schema
+     feature_names = schema.feature_names
+     categorical_index_map = schema.categorical_index_map
      total_features = len(feature_names)
+
      if total_features <= 0:
-         _LOGGER.error("No feature columns remain after excluding the target.")
+         _LOGGER.error("Schema contains no features.")
          raise ValueError()
+
+     _LOGGER.info(f"Generating bounds for {total_features} total features...")

+     # 2. Initialize bound lists
      lower_bounds: List[Optional[float]] = [None] * total_features
      upper_bounds: List[Optional[float]] = [None] * total_features
-
-     _LOGGER.info(f"Generating bounds for {total_features} total features...")

      # 3. Populate categorical bounds (Index-based)
-     # The indices in categorical_map (e.g., {2: 4}) directly correspond
-     # to the indices in the `feature_names` list.
-     for index, cardinality in categorical_map.items():
-         if not (0 <= index < total_features):
-             _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
-             raise ValueError()
-
-         if start_at_zero:
-             # Rule for [0, k-1]: bounds are [-0.5, k - 0.5]
-             low = -0.5
-             high = float(cardinality) - 0.5
-         else:
-             # Rule for [1, k]: bounds are [0.5, k + 0.5]
-             low = 0.5
-             high = float(cardinality) + 0.5
-
-         lower_bounds[index] = low
-         upper_bounds[index] = high
+     if categorical_index_map:
+         for index, cardinality in categorical_index_map.items():
+             if not (0 <= index < total_features):
+                 _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
+                 raise ValueError()
+
+             if start_at_zero:
+                 # Rule for [0, k-1]: bounds are [-0.5, k - 0.5]
+                 low = -0.5
+                 high = float(cardinality) - 0.5
+             else:
+                 # Rule for [1, k]: bounds are [0.5, k + 0.5]
+                 low = 0.5
+                 high = float(cardinality) + 0.5
+
+             lower_bounds[index] = low
+             upper_bounds[index] = high

-     _LOGGER.info(f"Automatically set bounds for {len(categorical_map)} categorical features.")
+         _LOGGER.info(f"Automatically set bounds for {len(categorical_index_map)} categorical features.")
+     else:
+         _LOGGER.info("No categorical features found in schema.")

      # 4. Populate continuous bounds (Name-based)
+     # Use schema.continuous_feature_names for robust checking
+     continuous_names_set = set(schema.continuous_feature_names)
+
+     if continuous_names_set != set(continuous_bounds_map.keys()):
+         missing_in_map = continuous_names_set - set(continuous_bounds_map.keys())
+         if missing_in_map:
+             _LOGGER.error(f"The following continuous features are missing from 'continuous_bounds_map': {list(missing_in_map)}")
+
+         extra_in_map = set(continuous_bounds_map.keys()) - continuous_names_set
+         if extra_in_map:
+             _LOGGER.error(f"The following features in 'continuous_bounds_map' are not defined as continuous in the schema: {list(extra_in_map)}")
+
+         raise ValueError("Mismatch between 'continuous_bounds_map' and schema's continuous features.")
+
      count_continuous = 0
      for name, (low, high) in continuous_bounds_map.items():
-         try:
-             # Map name to its index in the *feature-only* list
-             index = feature_names.index(name)
-         except ValueError:
-             _LOGGER.warning(f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns.")
-             continue
-
+         # Map name to its index in the *feature-only* list
+         # This is guaranteed to be correct by the schema
+         index = feature_names.index(name)
+
          if lower_bounds[index] is not None:
-             # This index was already set by the categorical map
-             _LOGGER.error(f"Feature '{name}' (at index {index}) is defined in both 'categorical_map' and 'continuous_bounds_map'.")
+             # This should be impossible if schema is correct, but good to check
+             _LOGGER.error(f"Schema conflict: Feature '{name}' (at index {index}) is defined as both continuous and categorical.")
              raise ValueError()
-
+
          lower_bounds[index] = float(low)
          upper_bounds[index] = float(high)
          count_continuous += 1

      _LOGGER.info(f"Manually set bounds for {count_continuous} continuous features.")

-     # 5. Validation: Check for any remaining None values
-     missing_indices = []
-     for i in range(total_features):
-         if lower_bounds[i] is None:
-             missing_indices.append(i)
-
-     if missing_indices:
+     # 5. Final Validation (all Nones should be filled)
+     if None in lower_bounds:
+         missing_indices = [i for i, b in enumerate(lower_bounds) if b is None]
          missing_names = [feature_names[i] for i in missing_indices]
-         _LOGGER.error(f"Bounds not defined for all features. Missing: {missing_names}")
-         raise ValueError()
-
-     # _LOGGER.info("All bounds successfully created.")
+         _LOGGER.error(f"Failed to create all bounds. This indicates an internal logic error. Missing: {missing_names}")
+         raise RuntimeError("Internal error: Not all bounds were populated.")

      # Cast to float lists, as 'None' sentinels are gone
      return (
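Continuing the `finalize_feature_schema` sketch above, a sketch of what the new signature returns. The `[-0.5, k - 0.5]` rule gives each of the k categories an equal-width interval under rounding, and the bounds map must now exactly cover the schema's continuous features (a missing or extra key raises ValueError instead of being skipped with a warning):

```python
lower, upper = create_optimization_bounds(
    schema=schema,  # features ("age", "color", "score"), 'color' categorical
    continuous_bounds_map={"age": (18, 90), "score": (0.0, 1.0)},
    start_at_zero=True,
)
# lower == [18.0, -0.5, 0.0]
# upper == [90.0,  2.5, 1.0]  # 'color': cardinality 3 -> [-0.5, 2.5]
```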
ml_tools/serde.py CHANGED
@@ -116,8 +116,7 @@ def deserialize_object(
          # Can't do an isinstance check on 'Any', skip it.
          if type_to_check is not Any and not isinstance(obj, type_to_check):
              error_msg = (
-                 f"Type mismatch: Expected an instance of '{expected_type}', "
-                 f"but found '{type(obj)}' in '{true_filepath}'."
+                 f"Type mismatch: Expected an instance of '{expected_type}', but found '{type(obj)}' in '{true_filepath}'."
              )
              _LOGGER.error(error_msg)
              raise TypeError()