dragon-ml-toolbox 12.13.0__py3-none-any.whl → 13.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-13.1.0.dist-info/RECORD +41 -0
- ml_tools/ML_callbacks.py +40 -8
- ml_tools/ML_datasetmaster.py +144 -63
- ml_tools/ML_evaluation.py +6 -2
- ml_tools/ML_evaluation_multi.py +8 -4
- ml_tools/ML_inference.py +14 -4
- ml_tools/ML_models.py +119 -55
- ml_tools/ML_optimization.py +49 -36
- ml_tools/ML_trainer.py +98 -11
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_schema.py +19 -0
- ml_tools/data_exploration.py +75 -46
- ml_tools/keys.py +9 -0
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +1 -2
- dragon_ml_toolbox-12.13.0.dist-info/RECORD +0 -41
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/top_level.txt +0 -0
dragon_ml_toolbox-13.1.0.dist-info/RECORD
ADDED

@@ -0,0 +1,41 @@
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
+ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
+ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
+ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
+ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
+ml_tools/ML_datasetmaster.py,sha256=7QJnOM6GWFklKt2fiukITM3DK49i3ThK8wazb5szwpE,34396
+ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
+ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
+ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
+ml_tools/ML_models.py,sha256=4Kb23pSusPMRH8h-R9ztK6JoH1lMuckxq7ihorll-H8,29965
+ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
+ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
+ml_tools/ML_trainer.py,sha256=9BP6JFClqGfe7GL-FGG3n5e-no9ssjEOLol7P6baGrI,29019
+ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
+ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
+ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
+ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
+ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
+ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
+ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ml_tools/_schema.py,sha256=MYYAO8CYygIvwv9TkGBAxzZpG7xQ2IV8_yB5zzFin0c,710
+ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
+ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
+ml_tools/data_exploration.py,sha256=aVcxjoXVqrmFBpwBSbLvrG8quzJfr92On48Sy3K58Vs,51900
+ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
+ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
+ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
+ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
+ml_tools/keys.py,sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI,2124
+ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
+ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
+ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
+ml_tools/serde.py,sha256=Wjf8N1thSfJ4r6Vm_pWxP2UTPcP2f3s2FiGz0z6kqKI,4925
+ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
+dragon_ml_toolbox-13.1.0.dist-info/METADATA,sha256=8n0bhl_rSVdg6MDh51r7tl5JflbqIOdqZx5gjaBWk0o,6166
+dragon_ml_toolbox-13.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-13.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-13.1.0.dist-info/RECORD,,
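For reference, each RECORD row has the form path,sha256=<digest>,size, where the digest is an unpadded URL-safe base64 encoding of the file's SHA-256 hash. A minimal sketch of checking an installed file against its row (the site-packages path is hypothetical):

import base64
import hashlib
from pathlib import Path

def record_hash(path: Path) -> str:
    # Hash the file and encode the digest the way wheel RECORD files do:
    # URL-safe base64 with the trailing "=" padding stripped.
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# Expected to reproduce the keys.py row above:
# record_hash(Path("site-packages/ml_tools/keys.py"))
# -> "sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI"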
ml_tools/ML_callbacks.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Union, Literal, Optional
 from pathlib import Path
 
 from .path_manager import make_fullpath, sanitize_filename
-from .keys import PyTorchLogKeys
+from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
 from ._logger import _LOGGER
 from ._script_info import _script_info
 
@@ -189,7 +189,7 @@ class EarlyStopping(Callback):
 
 class ModelCheckpoint(Callback):
     """
-    Saves the model weights to a directory with automated filename generation and rotation.
+    Saves the model weights, optimizer state, LR scheduler state (if any), and epoch number to a directory with automated filename generation and rotation.
     """
     def __init__(self, save_dir: Union[str,Path], checkpoint_name: Optional[str]=None, monitor: str = PyTorchLogKeys.VAL_LOSS,
                  save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
@@ -200,7 +200,7 @@ class ModelCheckpoint(Callback):
         Args:
             save_dir (str): Directory where checkpoint files will be saved.
             checkpoint_name (str| None): If None, the filename will include the epoch and score.
-            monitor (str): Metric to monitor
+            monitor (str): Metric to monitor.
             save_best_only (bool): If true, save only the best model.
             mode (str): One of {'auto', 'min', 'max'}.
             verbose (int): Verbosity mode.
@@ -270,15 +270,29 @@ class ModelCheckpoint(Callback):
         if self.verbose > 0:
             _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")
 
+        # Update best score *before* saving
+        self.best = current
+
+        # Create a comprehensive checkpoint dictionary
+        checkpoint_data = {
+            PyTorchCheckpointKeys.EPOCH: epoch,
+            PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.BEST_SCORE: self.best,
+        }
+
+        # Check for scheduler
+        if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+            checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
+
         # Save the new best model
-        torch.save(
+        torch.save(checkpoint_data, new_filepath)
 
         # Delete the old best model file
         if self.last_best_filepath and self.last_best_filepath.exists():
             self.last_best_filepath.unlink()
 
         # Update state
-        self.best = current
         self.last_best_filepath = new_filepath
 
     def _save_rolling_checkpoints(self, epoch, logs):
@@ -292,7 +306,19 @@ class ModelCheckpoint(Callback):
 
         if self.verbose > 0:
             _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
 
+        # Create a comprehensive checkpoint dictionary
+        checkpoint_data = {
+            PyTorchCheckpointKeys.EPOCH: epoch,
+            PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.BEST_SCORE: self.best, # Save the current best score
+        }
+
+        if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+            checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
+
+        torch.save(checkpoint_data, filepath)
 
         self.saved_checkpoints.append(filepath)
 
@@ -309,19 +335,25 @@ class LRScheduler(Callback):
     """
     Callback to manage a PyTorch learning rate scheduler.
     """
-    def __init__(self, scheduler, monitor: Optional[str] =
+    def __init__(self, scheduler, monitor: Optional[str] = PyTorchLogKeys.VAL_LOSS):
         """
         This callback automatically calls the scheduler's `step()` method at the
         end of each epoch. It also logs a message when the learning rate changes.
 
         Args:
             scheduler: An initialized PyTorch learning rate scheduler.
-            monitor (str
+            monitor (str): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
         """
         super().__init__()
         self.scheduler = scheduler
         self.monitor = monitor
         self.previous_lr = None
+
+    def set_trainer(self, trainer):
+        """This is called by the Trainer to associate itself with the callback."""
+        super().set_trainer(trainer)
+        # Register the scheduler with the trainer so it can be added to the checkpoint
+        self.trainer.scheduler = self.scheduler # type: ignore
 
     def on_train_begin(self, logs=None):
         """Store the initial learning rate."""
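The checkpoint files written above now bundle everything needed to resume a run. A minimal sketch of consuming one, assuming a model, optimizer, and scheduler already constructed to match the training configuration (the file path is hypothetical; the keys are the PyTorchCheckpointKeys members used in the diff):

import torch
from ml_tools.keys import PyTorchCheckpointKeys

def resume_from_checkpoint(path, model, optimizer, scheduler=None):
    checkpoint = torch.load(path, map_location="cpu")

    # Restore the states saved by ModelCheckpoint
    model.load_state_dict(checkpoint[PyTorchCheckpointKeys.MODEL_STATE])
    optimizer.load_state_dict(checkpoint[PyTorchCheckpointKeys.OPTIMIZER_STATE])

    # The scheduler state is only present if an LRScheduler callback was used
    if scheduler is not None and PyTorchCheckpointKeys.SCHEDULER_STATE in checkpoint:
        scheduler.load_state_dict(checkpoint[PyTorchCheckpointKeys.SCHEDULER_STATE])

    start_epoch = checkpoint[PyTorchCheckpointKeys.EPOCH] + 1
    best_score = checkpoint[PyTorchCheckpointKeys.BEST_SCORE]
    return start_epoch, best_score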
ml_tools/ML_datasetmaster.py
CHANGED
@@ -17,6 +17,7 @@ from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
 from .keys import DatasetKeys
+from ._schema import FeatureSchema
 
 
 __all__ = [
@@ -35,7 +36,7 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
                  features_dtype: torch.dtype = torch.float32,
                  feature_names: Optional[List[str]] = None,
@@ -48,13 +49,16 @@ class _PytorchDataset(Dataset):
 
         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.
+        else: # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
+            # Fallback for other types (though your type hints don't cover this)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)
 
         self._feature_names = feature_names
         self._target_names = target_names
@@ -98,27 +102,34 @@ class _BaseDatasetMaker(ABC):
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
 
-    def _prepare_scaler(self,
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
 
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
 
         X_train_values = X_train.values
         X_test_values = X_test.values
 
+        # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
@@ -225,10 +236,8 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-    This class takes a DataFrame, automatically splits
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+    It can also create and apply a PytorchScaler using the schema.
 
     Attributes:
         `scaler` -> PytorchScaler | None
@@ -242,92 +251,164 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame):
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind (Literal["regression", "classification"]):
+                The type of ML task. This determines the data type of the labels.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance; if None, a new scaler will be created.
         """
         super().__init__()
         self.scaler = scaler
 
-        # --- 1. Identify features
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]
 
-        # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
         label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-        # ---
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype,
+            X_train, y_train, X_test, label_dtype, schema
        )
 
-        # ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train
-        self._test_ds = _PytorchDataset(X_test_final, y_test
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
 
-# ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.
 
-    This class takes a DataFrame,
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame):
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance.
+
+        ## Note:
+            For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+            This loss function requires the labels to be torch.float32, which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
         self.scaler = scaler
 
+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
 
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]
 
        X_train, X_test, y_train, y_test = train_test_split(
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
        )
        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32
 
+        # --- 4. Scale (using the schema) ---
        X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype,
+            X_train, y_train, X_test, label_dtype, schema
        )
 
+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
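Taken together, the new flow is: the schema names the features (and which of them are continuous), the single target is inferred by set difference, and the scaler is fitted from the schema. A hedged usage sketch; the FeatureSchema constructor shown here is an assumption, since the diff only shows that the object exposes feature_names and continuous_feature_names:

import pandas as pd
from ml_tools._schema import FeatureSchema
from ml_tools.ML_datasetmaster import DatasetMaker

df = pd.DataFrame({
    "temp": [1.0, 2.0, 3.0, 4.0],       # continuous feature
    "color": [0, 1, 0, 1],              # categorical feature
    "yield": [10.0, 20.0, 30.0, 40.0],  # not in the schema
})

# Hypothetical construction; the real FeatureSchema fields may differ.
schema = FeatureSchema(
    feature_names=["temp", "color"],
    continuous_feature_names=["temp"],
)

# "yield" is the one column the schema does not claim, so it is inferred as the target.
maker = DatasetMaker(df, schema=schema, kind="regression", test_size=0.25)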
ml_tools/ML_evaluation.py
CHANGED
@@ -19,6 +19,7 @@ import torch
 import shap
 from pathlib import Path
 from typing import Union, Optional, List, Literal
+import warnings
 
 from .path_manager import make_fullpath
 from ._logger import _LOGGER
@@ -298,8 +299,11 @@ def shap_summary_plot(model,
 
     background_data = background_data.to(device)
     instances_to_explain = instances_to_explain.to(device)
 
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        explainer = shap.DeepExplainer(model, background_data)
+
     # print("Calculating SHAP values with DeepExplainer...")
     shap_values = explainer.shap_values(instances_to_explain)
     instances_to_explain_np = instances_to_explain.cpu().numpy()
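The catch_warnings block is worth noting: it silences the UserWarning chatter that shap.DeepExplainer tends to emit, but only for the duration of the with block. The same scoping in isolation:

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    warnings.warn("suppressed inside the block", UserWarning)  # not shown

warnings.warn("emitted normally", UserWarning)  # filters are restored here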
ml_tools/ML_evaluation_multi.py
CHANGED
@@ -20,6 +20,7 @@ from sklearn.metrics import (
 )
 from pathlib import Path
 from typing import Union, List, Literal
+import warnings
 
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
@@ -273,9 +274,12 @@ def multi_target_shap_summary_plot(
 
     background_data = background_data.to(device)
     instances_to_explain = instances_to_explain.to(device)
 
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        explainer = shap.DeepExplainer(model, background_data)
+
+    # print("Calculating SHAP values with DeepExplainer...")
     # DeepExplainer returns a list of arrays for multi-output models
     shap_values_list = explainer.shap_values(instances_to_explain)
     instances_to_explain_np = instances_to_explain.cpu().numpy()
@@ -304,7 +308,7 @@ def multi_target_shap_summary_plot(
         return output.cpu().numpy() # Return full multi-output array
 
     explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
-    print("Calculating SHAP values with KernelExplainer...")
+    # print("Calculating SHAP values with KernelExplainer...")
     # KernelExplainer also returns a list of arrays for multi-output models
     shap_values_list = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
     # instances_to_explain_np is already set
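The KernelExplainer path relies on a prediction_wrapper that is not shown in full here: a function that takes a numpy array and returns the model's full multi-output array. A self-contained sketch of that shape, with a stand-in model (only the return comment is taken from the diff):

import numpy as np
import torch
import torch.nn as nn

model = nn.Linear(4, 3)  # stand-in for a multi-output network
model.eval()

def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
    # numpy in, numpy out, as KernelExplainer expects
    with torch.no_grad():
        output = model(torch.as_tensor(x_np, dtype=torch.float32))
    return output.cpu().numpy()  # Return full multi-output array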
ml_tools/ML_inference.py
CHANGED
@@ -9,7 +9,7 @@ from .ML_scaler import PytorchScaler
 from ._script_info import _script_info
 from ._logger import _LOGGER
 from .path_manager import make_fullpath
-from .keys import PyTorchInferenceKeys
+from .keys import PyTorchInferenceKeys, PyTorchCheckpointKeys
 
 
 __all__ = [
@@ -56,11 +56,21 @@ class _BaseInferenceHandler(ABC):
         model_p = make_fullpath(state_dict, enforce="file")
 
         try:
-            # Load
+            # Load whatever is in the file
+            loaded_data = torch.load(model_p, map_location=self.device)
+
+            # Check if it's the new checkpoint dictionary or an old weights-only file
+            if isinstance(loaded_data, dict) and PyTorchCheckpointKeys.MODEL_STATE in loaded_data:
+                # It's a new training checkpoint, extract the weights
+                self.model.load_state_dict(loaded_data[PyTorchCheckpointKeys.MODEL_STATE])
+            else:
+                # It's an old-style file (or just a state_dict), load it directly
+                self.model.load_state_dict(loaded_data)
+
+            _LOGGER.info(f"Model state loaded from '{model_p.name}'.")
+
             self.model.to(self.device)
             self.model.eval() # Set the model to evaluation mode
-            _LOGGER.info(f"Model state loaded from '{model_p.name}' and set to evaluation mode.")
         except Exception as e:
             _LOGGER.error(f"Failed to load model state from '{model_p}': {e}")
             raise
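The isinstance check is what keeps pre-13.x weight files loading: a bare state_dict is itself a dict, but it lacks the checkpoint key, so it falls through to the direct-load branch. A small round trip under that assumption, using a stand-in key name instead of the real PyTorchCheckpointKeys.MODEL_STATE string:

import torch
import torch.nn as nn

MODEL_STATE = "model_state"  # stand-in; the real key lives in ml_tools.keys

model = nn.Linear(2, 1)
torch.save(model.state_dict(), "old_style.pth")                  # pre-13.x format
torch.save({MODEL_STATE: model.state_dict()}, "new_style.pth")   # new checkpoint format

for path in ("old_style.pth", "new_style.pth"):
    loaded = torch.load(path)
    # Same dispatch as _BaseInferenceHandler above
    state = loaded[MODEL_STATE] if isinstance(loaded, dict) and MODEL_STATE in loaded else loaded
    model.load_state_dict(state)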
|