dragon-ml-toolbox 12.13.0__py3-none-any.whl → 13.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


dragon_ml_toolbox-13.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.13.0
+Version: 13.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox-13.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,41 @@
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
+ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
+ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
+ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
+ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
+ml_tools/ML_datasetmaster.py,sha256=7QJnOM6GWFklKt2fiukITM3DK49i3ThK8wazb5szwpE,34396
+ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
+ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
+ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
+ml_tools/ML_models.py,sha256=4Kb23pSusPMRH8h-R9ztK6JoH1lMuckxq7ihorll-H8,29965
+ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
+ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
+ml_tools/ML_trainer.py,sha256=9BP6JFClqGfe7GL-FGG3n5e-no9ssjEOLol7P6baGrI,29019
+ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
+ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
+ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
+ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
+ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
+ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
+ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ml_tools/_schema.py,sha256=MYYAO8CYygIvwv9TkGBAxzZpG7xQ2IV8_yB5zzFin0c,710
+ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
+ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
+ml_tools/data_exploration.py,sha256=aVcxjoXVqrmFBpwBSbLvrG8quzJfr92On48Sy3K58Vs,51900
+ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
+ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
+ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
+ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
+ml_tools/keys.py,sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI,2124
+ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
+ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
+ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
+ml_tools/serde.py,sha256=Wjf8N1thSfJ4r6Vm_pWxP2UTPcP2f3s2FiGz0z6kqKI,4925
+ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
+dragon_ml_toolbox-13.1.0.dist-info/METADATA,sha256=8n0bhl_rSVdg6MDh51r7tl5JflbqIOdqZx5gjaBWk0o,6166
+dragon_ml_toolbox-13.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-13.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-13.1.0.dist-info/RECORD,,
ml_tools/ML_callbacks.py CHANGED
@@ -5,7 +5,7 @@ from typing import Union, Literal, Optional
 from pathlib import Path
 
 from .path_manager import make_fullpath, sanitize_filename
-from .keys import PyTorchLogKeys
+from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
 from ._logger import _LOGGER
 from ._script_info import _script_info
 
@@ -189,7 +189,7 @@ class EarlyStopping(Callback):
 
 class ModelCheckpoint(Callback):
     """
-    Saves the model weights to a directory with automated filename generation and rotation.
+    Saves the model weights, optimizer state, LR scheduler state (if any), and epoch number to a directory with automated filename generation and rotation.
     """
     def __init__(self, save_dir: Union[str,Path], checkpoint_name: Optional[str]=None, monitor: str = PyTorchLogKeys.VAL_LOSS,
                  save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
@@ -200,7 +200,7 @@ class ModelCheckpoint(Callback):
         Args:
             save_dir (str): Directory where checkpoint files will be saved.
             checkpoint_name (str| None): If None, the filename will include the epoch and score.
-            monitor (str): Metric to monitor for `save_best_only=True`.
+            monitor (str): Metric to monitor.
             save_best_only (bool): If true, save only the best model.
             mode (str): One of {'auto', 'min', 'max'}.
             verbose (int): Verbosity mode.
@@ -270,15 +270,29 @@ class ModelCheckpoint(Callback):
             if self.verbose > 0:
                 _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")
 
+            # Update best score *before* saving
+            self.best = current
+
+            # Create a comprehensive checkpoint dictionary
+            checkpoint_data = {
+                PyTorchCheckpointKeys.EPOCH: epoch,
+                PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+                PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+                PyTorchCheckpointKeys.BEST_SCORE: self.best,
+            }
+
+            # Check for scheduler
+            if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+                checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
+
             # Save the new best model
-            torch.save(self.trainer.model.state_dict(), new_filepath) # type: ignore
+            torch.save(checkpoint_data, new_filepath)
 
             # Delete the old best model file
             if self.last_best_filepath and self.last_best_filepath.exists():
                 self.last_best_filepath.unlink()
 
             # Update state
-            self.best = current
             self.last_best_filepath = new_filepath
 
     def _save_rolling_checkpoints(self, epoch, logs):
@@ -292,7 +306,19 @@ class ModelCheckpoint(Callback):
 
         if self.verbose > 0:
             _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
-        torch.save(self.trainer.model.state_dict(), filepath) # type: ignore
+
+        # Create a comprehensive checkpoint dictionary
+        checkpoint_data = {
+            PyTorchCheckpointKeys.EPOCH: epoch,
+            PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.BEST_SCORE: self.best, # Save the current best score
+        }
+
+        if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+            checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
+
+        torch.save(checkpoint_data, filepath)
 
         self.saved_checkpoints.append(filepath)
 
@@ -309,19 +335,25 @@ class LRScheduler(Callback):
     """
    Callback to manage a PyTorch learning rate scheduler.
     """
-    def __init__(self, scheduler, monitor: Optional[str] = None):
+    def __init__(self, scheduler, monitor: Optional[str] = PyTorchLogKeys.VAL_LOSS):
         """
         This callback automatically calls the scheduler's `step()` method at the
         end of each epoch. It also logs a message when the learning rate changes.
 
         Args:
             scheduler: An initialized PyTorch learning rate scheduler.
-            monitor (str, optional): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
+            monitor (str): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
         """
         super().__init__()
         self.scheduler = scheduler
         self.monitor = monitor
         self.previous_lr = None
+
+    def set_trainer(self, trainer):
+        """This is called by the Trainer to associate itself with the callback."""
+        super().set_trainer(trainer)
+        # Register the scheduler with the trainer so it can be added to the checkpoint
+        self.trainer.scheduler = self.scheduler # type: ignore
 
     def on_train_begin(self, logs=None):
         """Store the initial learning rate."""
ml_tools/ML_datasetmaster.py CHANGED
@@ -17,6 +17,7 @@ from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
 from .keys import DatasetKeys
+from ._schema import FeatureSchema
 
 
 __all__ = [
@@ -35,7 +36,7 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
                  features_dtype: torch.dtype = torch.float32,
                  feature_names: Optional[List[str]] = None,
@@ -48,13 +49,16 @@ class _PytorchDataset(Dataset):
 
         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.values, dtype=features_dtype)
+        else: # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-            self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+            # Fallback for other types (not covered by the type hints)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)
 
         self._feature_names = feature_names
         self._target_names = target_names
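
Context for this hunk: .to_numpy() is the pandas-recommended accessor (the older .values it replaces is soft-deprecated), and the new elif branch is what lets multi-target labels arrive as a DataFrame. A standalone illustration of the conversion, independent of the package:

import pandas as pd
import torch

labels_df = pd.DataFrame({"t1": [0.1, 0.7], "t2": [1.2, 0.3]})

# A 2-D DataFrame becomes an (N, n_targets) float tensor, ready for
# nn.MSELoss or nn.BCEWithLogitsLoss.
labels = torch.tensor(labels_df.to_numpy(), dtype=torch.float32)
print(labels.shape)  # torch.Size([2, 2])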
@@ -98,27 +102,34 @@ class _BaseDatasetMaker(ABC):
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
-        """Internal helper to fit and apply a PytorchScaler."""
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-        if continuous_feature_columns:
-            if all(isinstance(c, str) for c in continuous_feature_columns):
-                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                try:
-                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                except KeyError as e:
-                    _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                    raise ValueError()
-            elif all(isinstance(c, int) for c in continuous_feature_columns):
-                continuous_feature_indices = continuous_feature_columns # type: ignore
-            else:
-                _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                raise TypeError()
+
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
 
         X_train_values = X_train.values
         X_test_values = X_test.values
 
+        # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
@@ -225,10 +236,8 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-    This class takes a DataFrame, automatically splits it into training and
-    testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+    It can also create and apply a PytorchScaler using the schema.
 
     Attributes:
         `scaler` -> PytorchScaler | None
@@ -242,92 +251,164 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind (Literal["regression", "classification"]):
+                The type of ML task. This determines the data type of the labels.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance; if None, a new scaler will be created.
         """
         super().__init__()
         self.scaler = scaler
 
-        # --- 1. Identify features and target (single-target logic) ---
-        features = pandas_df.iloc[:, :-1]
-        target = pandas_df.iloc[:, -1]
-        self._feature_names = features.columns.tolist()
-        self._target_names = [str(target.name)]
-        self._id = self._target_names[0]
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]
 
-        # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
         label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-        # --- 3. Scale ---
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
        )
 
-        # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+
 
-# --- New Multi-Target Class ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.
 
-    This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Targets dtype is torch.float32.
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            target_columns (list[str]): List of target column names.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance.
+
+        ## Note:
+        For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+        This loss function requires the labels to be torch.float32, which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
         self.scaler = scaler
 
+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-        self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
-        features = pandas_df[self._feature_names]
-        target = pandas_df[self._target_names]
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]
 
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
        )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-        label_dtype = torch.float32
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32
 
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
        )
 
+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
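A usage sketch of the schema-driven API introduced here. The FeatureSchema constructor call below is an assumption: this diff only shows that the object exposes feature_names and continuous_feature_names and normally comes from the data_exploration helpers, so mirror whatever those helpers actually produce.

import pandas as pd
from ml_tools._schema import FeatureSchema
from ml_tools.ML_datasetmaster import DatasetMaker

df = pd.DataFrame({
    "age": [25, 32, 47, 51, 38, 29],
    "income": [40_000, 52_000, 88_000, 61_000, 45_000, 73_000],
    "is_smoker": [0, 1, 0, 1, 0, 1],
    "label": [0, 1, 1, 0, 0, 1],  # the single non-feature column becomes the target
})

# Hypothetical construction; field names mirror the attributes used in the diff.
schema = FeatureSchema(
    feature_names=["age", "income", "is_smoker"],
    continuous_feature_names=["age", "income"],
)

# The target is inferred by set difference: exactly one column not in the schema.
maker = DatasetMaker(df, schema=schema, kind="classification")

DatasetMakerMulti is used the same way, with an explicit target_columns list that is validated against the schema.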
ml_tools/ML_evaluation.py CHANGED
@@ -19,6 +19,7 @@ import torch
 import shap
 from pathlib import Path
 from typing import Union, Optional, List, Literal
+import warnings
 
 from .path_manager import make_fullpath
 from ._logger import _LOGGER
@@ -298,8 +299,11 @@ def shap_summary_plot(model,
 
     background_data = background_data.to(device)
     instances_to_explain = instances_to_explain.to(device)
-
-    explainer = shap.DeepExplainer(model, background_data)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        explainer = shap.DeepExplainer(model, background_data)
+
     # print("Calculating SHAP values with DeepExplainer...")
     shap_values = explainer.shap_values(instances_to_explain)
     instances_to_explain_np = instances_to_explain.cpu().numpy()
ml_tools/ML_evaluation_multi.py CHANGED
@@ -20,6 +20,7 @@ from sklearn.metrics import (
 )
 from pathlib import Path
 from typing import Union, List, Literal
+import warnings
 
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
@@ -273,9 +274,12 @@ def multi_target_shap_summary_plot(
 
     background_data = background_data.to(device)
     instances_to_explain = instances_to_explain.to(device)
-
-    explainer = shap.DeepExplainer(model, background_data)
-    print("Calculating SHAP values with DeepExplainer...")
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        explainer = shap.DeepExplainer(model, background_data)
+
+    # print("Calculating SHAP values with DeepExplainer...")
     # DeepExplainer returns a list of arrays for multi-output models
     shap_values_list = explainer.shap_values(instances_to_explain)
     instances_to_explain_np = instances_to_explain.cpu().numpy()
@@ -304,7 +308,7 @@ def multi_target_shap_summary_plot(
             return output.cpu().numpy() # Return full multi-output array
 
     explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
-    print("Calculating SHAP values with KernelExplainer...")
+    # print("Calculating SHAP values with KernelExplainer...")
     # KernelExplainer also returns a list of arrays for multi-output models
     shap_values_list = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
     # instances_to_explain_np is already set
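
Both evaluation modules apply the same pattern: build shap.DeepExplainer under warnings.catch_warnings() so its UserWarning noise is silenced without mutating global warning filters. A self-contained illustration of the pattern; the noisy function stands in for the explainer construction:

import warnings

def build_explainer():
    # Stand-in for shap.DeepExplainer(model, background_data), which can
    # emit UserWarnings on some torch/shap version combinations.
    warnings.warn("deprecated internals", UserWarning)
    return object()

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    explainer = build_explainer()  # warning suppressed here

# Outside the block, the previous warning filters are restored.
warnings.warn("this one is still visible", UserWarning)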
ml_tools/ML_inference.py CHANGED
@@ -9,7 +9,7 @@ from .ML_scaler import PytorchScaler
 from ._script_info import _script_info
 from ._logger import _LOGGER
 from .path_manager import make_fullpath
-from .keys import PyTorchInferenceKeys
+from .keys import PyTorchInferenceKeys, PyTorchCheckpointKeys
 
 
 __all__ = [
@@ -56,11 +56,21 @@ class _BaseInferenceHandler(ABC):
         model_p = make_fullpath(state_dict, enforce="file")
 
         try:
-            # Load the state dictionary and apply it to the model structure
-            self.model.load_state_dict(torch.load(model_p, map_location=self.device))
+            # Load whatever is in the file
+            loaded_data = torch.load(model_p, map_location=self.device)
+
+            # Check if it's the new checkpoint dictionary or an old weights-only file
+            if isinstance(loaded_data, dict) and PyTorchCheckpointKeys.MODEL_STATE in loaded_data:
+                # It's a new training checkpoint, extract the weights
+                self.model.load_state_dict(loaded_data[PyTorchCheckpointKeys.MODEL_STATE])
+            else:
+                # It's an old-style file (or just a state_dict), load it directly
+                self.model.load_state_dict(loaded_data)
+
+            _LOGGER.info(f"Model state loaded from '{model_p.name}'.")
+
             self.model.to(self.device)
             self.model.eval() # Set the model to evaluation mode
-            _LOGGER.info(f"Model state loaded from '{model_p.name}' and set to evaluation mode.")
         except Exception as e:
             _LOGGER.error(f"Failed to load model state from '{model_p}': {e}")
             raise
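
The loader therefore accepts both formats: 13.1.0 training checkpoints and pre-13.x weights-only files. A condensed sketch of the same dispatch; load_weights is an illustrative name, not a function the package exports:

import torch
from torch import nn
from ml_tools.keys import PyTorchCheckpointKeys

def load_weights(model: nn.Module, path: str, device: str = "cpu") -> nn.Module:
    loaded = torch.load(path, map_location=device)
    # New-style training checkpoint: a dict carrying the weights under MODEL_STATE.
    if isinstance(loaded, dict) and PyTorchCheckpointKeys.MODEL_STATE in loaded:
        model.load_state_dict(loaded[PyTorchCheckpointKeys.MODEL_STATE])
    else:
        # Old-style file: a bare state_dict saved by pre-13.x ModelCheckpoint.
        model.load_state_dict(loaded)
    return model.to(device).eval()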