dragon-ml-toolbox 14.3.1__py3-none-any.whl → 16.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-14.3.1.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/METADATA +10 -5
- dragon_ml_toolbox-16.0.0.dist-info/RECORD +51 -0
- ml_tools/ETL_cleaning.py +20 -20
- ml_tools/ETL_engineering.py +23 -25
- ml_tools/GUI_tools.py +20 -20
- ml_tools/MICE_imputation.py +3 -3
- ml_tools/ML_callbacks.py +43 -26
- ml_tools/ML_configuration.py +309 -0
- ml_tools/ML_datasetmaster.py +220 -260
- ml_tools/ML_evaluation.py +317 -81
- ml_tools/ML_evaluation_multi.py +127 -36
- ml_tools/ML_inference.py +249 -207
- ml_tools/ML_models.py +13 -102
- ml_tools/ML_models_advanced.py +1 -1
- ml_tools/ML_optimization.py +12 -12
- ml_tools/ML_scaler.py +11 -11
- ml_tools/ML_sequence_datasetmaster.py +341 -0
- ml_tools/ML_sequence_evaluation.py +215 -0
- ml_tools/ML_sequence_inference.py +391 -0
- ml_tools/ML_sequence_models.py +139 -0
- ml_tools/ML_trainer.py +1247 -338
- ml_tools/ML_utilities.py +51 -2
- ml_tools/ML_vision_datasetmaster.py +262 -118
- ml_tools/ML_vision_evaluation.py +26 -6
- ml_tools/ML_vision_inference.py +117 -140
- ml_tools/ML_vision_models.py +15 -1
- ml_tools/ML_vision_transformers.py +233 -7
- ml_tools/PSO_optimization.py +6 -6
- ml_tools/SQL.py +4 -4
- ml_tools/{keys.py → _keys.py} +45 -1
- ml_tools/_schema.py +1 -1
- ml_tools/ensemble_evaluation.py +54 -11
- ml_tools/ensemble_inference.py +7 -33
- ml_tools/ensemble_learning.py +1 -1
- ml_tools/optimization_tools.py +2 -2
- ml_tools/path_manager.py +5 -5
- ml_tools/utilities.py +1 -2
- dragon_ml_toolbox-14.3.1.dist-info/RECORD +0 -48
- ml_tools/RNN_forecast.py +0 -56
- ml_tools/_ML_vision_recipe.py +0 -88
- {dragon_ml_toolbox-14.3.1.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-14.3.1.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-14.3.1.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-14.3.1.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/top_level.txt +0 -0
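Among these changes, the ml_tools/{keys.py → _keys.py} rename moves the keys module to a private name, which breaks direct imports that worked against 14.3.x. A minimal before/after sketch; that `DatasetKeys` was importable from the old `keys` module in 14.3.1 is an assumption, though both `DatasetKeys` and `MLTaskKeys` appear in the 16.0.0 imports shown in the diff below:

    # 14.3.1 (assumed): the module was public
    from ml_tools.keys import DatasetKeys

    # 16.0.0: the underscore prefix marks the module as internal
    from ml_tools._keys import DatasetKeys, MLTaskKeys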
ml_tools/ML_datasetmaster.py
CHANGED
@@ -3,27 +3,25 @@ from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
-from typing import Literal, Union, Tuple, List, Optional
-from abc import ABC, abstractmethod
-import matplotlib.pyplot as plt
+from typing import Literal, Union, List, Optional
+from abc import ABC
 from pathlib import Path

 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
-from .ML_scaler import PytorchScaler
-from .keys import DatasetKeys
+from .ML_scaler import DragonScaler
+from ._keys import DatasetKeys, MLTaskKeys
 from ._schema import FeatureSchema
+from .custom_logger import custom_logger


 __all__ = [
-    "DatasetMaker",
-    "DatasetMakerMulti",
-    "SequenceMaker"
+    "DragonDataset",
+    "DragonDatasetMulti"
 ]

-
 # --- Internal Helper Class ---
 class _PytorchDataset(Dataset):
     """
@@ -57,6 +55,7 @@ class _PytorchDataset(Dataset):

         self._feature_names = feature_names
         self._target_names = target_names
+        self.classes: List[str] = []

     def __len__(self):
         return len(self.features)
@@ -78,6 +77,7 @@ class _PytorchDataset(Dataset):
             return self._target_names
         else:
             _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
+            raise ValueError()


 # --- Abstract Base Class ---
@@ -88,23 +88,28 @@ class _BaseDatasetMaker(ABC):
     """
     def __init__(self):
         self._train_ds: Optional[Dataset] = None
+        self._val_ds: Optional[Dataset] = None
         self._test_ds: Optional[Dataset] = None
-        self.scaler: Optional[PytorchScaler] = None
+        self.scaler: Optional[DragonScaler] = None
        self._id: Optional[str] = None
         self._feature_names: List[str] = []
         self._target_names: List[str] = []
         self._X_train_shape = (0,0)
+        self._X_val_shape = (0,0)
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
+        self._y_val_shape = (0,)
         self._y_test_shape = (0,)
+        self.class_map: Optional[dict[str, int]] = None

     def _prepare_scaler(self,
                         X_train: pandas.DataFrame,
                         y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_val: pandas.DataFrame,
                         X_test: pandas.DataFrame,
                         label_dtype: torch.dtype,
                         schema: FeatureSchema):
-        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
+        """Internal helper to fit and apply a DragonScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None

         # Get continuous feature indices *from the schema*
@@ -122,26 +127,33 @@ class _BaseDatasetMaker(ABC):
             _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")

         X_train_values = X_train.to_numpy()
+        X_val_values = X_val.to_numpy()
         X_test_values = X_test.to_numpy()

         # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
-            _LOGGER.info("Fitting a new PytorchScaler on training data.")
+            _LOGGER.info("Fitting a new DragonScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
-            self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+            self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices)

         if self.scaler and self.scaler.mean_ is not None:
-            _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+            _LOGGER.info("Applying scaler transformation to train, validation, and test feature sets.")
             X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+            X_val_tensor = self.scaler.transform(torch.tensor(X_val_values, dtype=torch.float32))
             X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
-            return X_train_tensor.numpy(), X_test_tensor.numpy()
+            return X_train_tensor.numpy(), X_val_tensor.numpy(), X_test_tensor.numpy()

-        return X_train_values, X_test_values
+        return X_train_values, X_val_values, X_test_values

     @property
     def train_dataset(self) -> Dataset:
         if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
         return self._train_ds
+
+    @property
+    def validation_dataset(self) -> Dataset:
+        if self._val_ds is None: raise RuntimeError("Dataset not yet created.")
+        return self._val_ds

     @property
     def test_dataset(self) -> Dataset:
@@ -176,6 +188,7 @@ class _BaseDatasetMaker(ABC):
     def dataframes_info(self) -> None:
         print("--- DataFrame Shapes After Split ---")
         print(f"  X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+        print(f"  X_val shape: {self._X_val_shape}, y_val shape: {self._y_val_shape}")
         print(f"  X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
         print("------------------------------------")

@@ -195,7 +208,7 @@ class _BaseDatasetMaker(ABC):

     def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """
-        Saves the fitted PytorchScaler's state to a .pth file.
+        Saves the fitted DragonScaler's state to a .pth file.

         The filename is automatically generated based on the dataset id.

@@ -215,6 +228,24 @@ class _BaseDatasetMaker(ABC):
         self.scaler.save(filepath, verbose=False)
         if verbose:
             _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+    def save_class_map(self, directory: Union[str,Path], verbose: bool=True) -> None:
+        """
+        Saves the class to index mapping {str: int} to a directory.
+        """
+        if not self.class_map:
+            _LOGGER.warning(f"No class_map defined. Skipping.")
+            return
+
+        log_name = f"Class_to_Index_{self.id}" if self.id else "Class_to_Index"
+
+        custom_logger(data=self.class_map,
+                      save_directory=directory,
+                      log_name=log_name,
+                      add_timestamp=False,
+                      dict_as="json")
+        if verbose:
+            _LOGGER.info(f"Class map for '{self.id}' saved as '{log_name}.json'.")

     def save_artifacts(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """
@@ -224,19 +255,22 @@ class _BaseDatasetMaker(ABC):
         self.save_target_names(directory=directory, verbose=verbose)
         if self.scaler is not None:
             self.save_scaler(directory=directory, verbose=verbose)
+        if self.class_map is not None:
+            self.save_class_map(directory=directory, verbose=verbose)


 # Single target dataset
-class DatasetMaker(_BaseDatasetMaker):
+class DragonDataset(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.

     This class takes a DataFrame, and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
-    It can also create and apply a PytorchScaler using the schema.
+    It can also create and apply a DragonScaler using the schema.

     Attributes:
-        `scaler` -> PytorchScaler | None
+        `scaler` -> DragonScaler | None
         `train_dataset` -> PyTorch Dataset
+        `validation_dataset` -> PyTorch Dataset
         `test_dataset` -> PyTorch Dataset
         `feature_names` -> list[str]
         `target_names` -> list[str]
@@ -247,9 +281,10 @@ class DatasetMaker(_BaseDatasetMaker):
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  schema: FeatureSchema,
-                 kind: Literal["regression", "classification"],
-                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-
+                 kind: Literal["regression", "binary classification", "multiclass classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                 validation_size: float = 0.2,
+                 test_size: float = 0.1,
                  random_state: int = 42):
         """
         Args:
@@ -257,32 +292,45 @@ class DatasetMaker(_BaseDatasetMaker):
                 The pre-processed input DataFrame containing all columns. (features and single target).
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
-            kind (
-                The type of ML task.
-            scaler ("fit" | "none" | PytorchScaler):
+            kind (str):
+                The type of ML task. Must be one of:
+                - "regression"
+                - "binary classification"
+                - "multiclass classification"
+            scaler ("fit" | "none" | DragonScaler):
                 Strategy for data scaling:
-                - "fit": Fit a new PytorchScaler on continuous features.
+                - "fit": Fit a new DragonScaler on continuous features.
                 - "none": Do not scale data (e.g., for TabularTransformer).
-                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+                - DragonScaler instance: Use a pre-fitted scaler to transform data.
+            validation_size (float):
+                The proportion of the *original* dataset to allocate to the validation split.
             test_size (float):
-                The proportion of the dataset to allocate to the test split.
+                The proportion of the dataset to allocate to the test split (can be 0).
             random_state (int):
                 The seed for the random number of generator for reproducibility.

         """
         super().__init__()

+        # --- Validation for split sizes ---
+        if (validation_size + test_size) >= 1.0:
+            _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+            raise ValueError()
+        elif validation_size <= 0.0:
+            _LOGGER.error(f"Invalid validation split of {validation_size}.")
+            raise ValueError()
+
         _apply_scaling: bool = False
         if scaler == "fit":
             self.scaler = None # To be created
             _apply_scaling = True
         elif scaler == "none":
             self.scaler = None
-        elif isinstance(scaler, PytorchScaler):
+        elif isinstance(scaler, DragonScaler):
             self.scaler = scaler # Use the provided one
             _apply_scaling = True
         else:
-            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
             raise ValueError()

         # --- 1. Identify features (from schema) ---
@@ -298,7 +346,7 @@ class DatasetMaker(_BaseDatasetMaker):
             _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
             raise ValueError("No target column found in DataFrame.")
         if len(target_cols_set) > 1:
-            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}.")
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. One target required.")
             raise ValueError("Ambiguous target: More than one non-feature column found.")

         target_name = list(target_cols_set)[0]
@@ -308,35 +356,105 @@ class DatasetMaker(_BaseDatasetMaker):
         # --- 3. Split Data ---
         features_df = pandas_df[self._feature_names]
         target_series = pandas_df[target_name]
-
-        X_train, X_test, y_train, y_test = train_test_split(
+
+        # First split: (Train + Val) vs TesT
+        X_train_val, X_test, y_train_val, y_test = train_test_split(
             features_df,
             target_series,
             test_size=test_size,
             random_state=random_state
         )
-
-
+        # Calculate validation split size relative to the (Train + Val) set
+        val_split_size = validation_size / (1.0 - test_size)

-
+        # Second split: Train vs Val
+        X_train, X_val, y_train, y_val = train_test_split(
+            X_train_val,
+            y_train_val,
+            test_size=val_split_size,
+            random_state=random_state
+        )
+
+        self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+        self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
+
+        # --- label_dtype logic ---
+        if kind == MLTaskKeys.REGRESSION or kind == MLTaskKeys.BINARY_CLASSIFICATION:
+            label_dtype = torch.float32
+        elif kind == MLTaskKeys.MULTICLASS_CLASSIFICATION:
+            label_dtype = torch.int64
+        else:
+            _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.REGRESSION}', '{MLTaskKeys.BINARY_CLASSIFICATION}', or '{MLTaskKeys.MULTICLASS_CLASSIFICATION}'.")
+            raise ValueError()
+        self.kind = kind

         # --- 4. Scale (using the schema) ---
         if _apply_scaling:
-            X_train_final, X_test_final = self._prepare_scaler(
-                X_train, y_train, X_test, label_dtype, schema
+            X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_val, X_test, label_dtype, schema
             )
         else:
             _LOGGER.info("Features have not been scaled as specified.")
             X_train_final = X_train.to_numpy()
+            X_val_final = X_val.to_numpy()
             X_test_final = X_test.to_numpy()

         # --- 5. Create Datasets ---
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+
+    def set_class_map(self, class_map: dict[str, int]) -> None:
+        """
+        Sets a map of class_name -> integer_label.

+        This is used by the InferenceHandler and to finalize the model after training.
+
+        Args:
+            class_map (Dict[str, int]): A dictionary mapping the integer label
+                to its string name.
+                Example: {'cat': 0, 'dog': 1, 'bird': 2}
+        """
+        if self.kind == MLTaskKeys.REGRESSION:
+            _LOGGER.warning(f"Class Map is for classifications tasks only.")
+            return
+
+        self.class_map = class_map
+
+        try:
+            sorted_items = sorted(class_map.items(), key=lambda item: item[1])
+            class_list = [item[0] for item in sorted_items]
+        except Exception as e:
+            _LOGGER.error(f"Could not sort class map. Ensure it is a dict of {str: int}. Error: {e}")
+            raise TypeError()
+
+        if self._train_ds:
+            self._train_ds.classes = class_list # type: ignore
+        if self._val_ds:
+            self._val_ds.classes = class_list # type: ignore
+        if self._test_ds:
+            self._test_ds.classes = class_list # type: ignore
+
+        _LOGGER.info(f"Class map set for dataset '{self.id}':\n{class_map}")
+
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+        s += f"  Target: {self.target_names[0]}\n"
+        s += f"  Features: {self.number_of_features}\n"
+        s += f"  Scaler: {'Fitted' if self.scaler else 'None'}\n"
+
+        if self._train_ds:
+            s += f"  Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._val_ds:
+            s += f"  Validation Samples: {len(self._val_ds)}\n" # type: ignore
+        if self._test_ds:
+            s += f"  Test Samples: {len(self._test_ds)}\n" # type: ignore
+
+        return s
+

 # --- Multi-Target Class ---
-class DatasetMakerMulti(_BaseDatasetMaker):
+class DragonDatasetMulti(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with
     multiple target columns.
@@ -345,15 +463,15 @@ class DatasetMakerMulti(_BaseDatasetMaker):
     *target_columns*. It validates that the schema's features and the
     target columns are mutually exclusive and together account for all
     columns in the DataFrame.
-
-    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
                  schema: FeatureSchema,
-                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-
+                 kind: Literal["multitarget regression", "multilabel binary classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                 validation_size: float = 0.2,
+                 test_size: float = 0.1,
                  random_state: int = 42):
         """
         Args:
@@ -364,11 +482,17 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                 List of target column names.
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
-            scaler ("fit" | "none" | PytorchScaler):
+            kind (str):
+                The type of multi-target ML task. Must be one of:
+                - "multitarget regression"
+                - "multilabel binary classification"
+            scaler ("fit" | "none" | DragonScaler):
                 Strategy for data scaling:
-                - "fit": Fit a new PytorchScaler on continuous features.
+                - "fit": Fit a new DragonScaler on continuous features.
                 - "none": Do not scale data (e.g., for TabularTransformer).
-                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+                - DragonScaler instance: Use a pre-fitted scaler to transform data.
+            validation_size (float):
+                The proportion of the dataset to allocate to the validation split.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
@@ -376,21 +500,34 @@ class DatasetMakerMulti(_BaseDatasetMaker):

         ## Note:
         For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
-        This loss function requires the labels to be torch.float32 which is the same type required for
+        This loss function requires the labels to be torch.float32 which is the same type required for multi-regression tasks.
         """
         super().__init__()

+        # --- Validation for split sizes ---
+        if (validation_size + test_size) >= 1.0:
+            _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+            raise ValueError("validation_size and test_size sum must be < 1.0")
+        elif validation_size <= 0.0:
+            _LOGGER.error(f"Invalid validation split of {validation_size}.")
+            raise ValueError()
+
+        # --- Validate kind parameter ---
+        if kind not in [MLTaskKeys.MULTITARGET_REGRESSION, MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION]:
+            _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.MULTITARGET_REGRESSION}' or '{MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION}'.")
+            raise ValueError()
+
         _apply_scaling: bool = False
         if scaler == "fit":
             self.scaler = None
             _apply_scaling = True
         elif scaler == "none":
             self.scaler = None
-        elif isinstance(scaler, PytorchScaler):
+        elif isinstance(scaler, DragonScaler):
             self.scaler = scaler # Use the provided one
             _apply_scaling = True
         else:
-            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
             raise ValueError()

         # --- 1. Get features and targets from schema/args ---
@@ -420,240 +557,63 @@ class DatasetMakerMulti(_BaseDatasetMaker):
         # --- 3. Split Data ---
         features_df = pandas_df[self._feature_names]
         target_df = pandas_df[self._target_names]
-
-        X_train, X_test, y_train, y_test = train_test_split(
+
+        # First split: (Train + Val) vs Test
+        X_train_val, X_test, y_train_val, y_test = train_test_split(
             features_df,
             target_df,
             test_size=test_size,
             random_state=random_state
         )
-
-
+
+        # Calculate validation split size relative to the (Train + Val) set
+        val_split_size = validation_size / (1.0 - test_size)
+
+        # Second split: Train vs Val
+        X_train, X_val, y_train, y_val = train_test_split(
+            X_train_val,
+            y_train_val,
+            test_size=val_split_size,
+            random_state=random_state
+        )
+
+        self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+        self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape

         # Multi-target for regression or multi-binary
         label_dtype = torch.float32

         # --- 4. Scale (using the schema) ---
         if _apply_scaling:
-            X_train_final, X_test_final = self._prepare_scaler(
-                X_train, y_train, X_test, label_dtype, schema
+            X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_val, X_test, label_dtype, schema
             )
         else:
             _LOGGER.info("Features have not been scaled as specified.")
             X_train_final = X_train.to_numpy()
+            X_val_final = X_val.to_numpy()
             X_test_final = X_test.to_numpy()

         # --- 5. Create Datasets ---
         # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)

-
-
-
-
-
-    """
-    def __init__(self):
-        self._train_dataset = None
-        self._test_dataset = None
-        self._val_dataset = None
-
-    @abstractmethod
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """
-        The primary method to retrieve the final, processed PyTorch datasets.
-        Must be implemented by all subclasses.
-        """
-        pass
-
-
-# --- SequenceMaker ---
-class SequenceMaker(_BaseMaker):
-    """
-    Creates windowed PyTorch datasets from time-series data.
-
-    Pipeline:
-
-    1. `.split_data()`: Separate time series into training and testing portions.
-    2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
-    3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
-    4. `.get_datasets()`: Return Pytorch train and test datasets.
-    """
-    def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
-        super().__init__()
-        self.sequence_length = sequence_length
-        self.scaler = None
-
-        if isinstance(data, pandas.DataFrame):
-            self.time_axis = data.index.values
-            self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
-        elif isinstance(data, pandas.Series):
-            self.time_axis = data.index.values
-            self.sequence = data.values.astype(numpy.float32)
-        elif isinstance(data, numpy.ndarray):
-            self.time_axis = numpy.arange(len(data))
-            self.sequence = data.astype(numpy.float32)
-        else:
-            _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
-            raise TypeError()
-
-        self.train_sequence = None
-        self.test_sequence = None
-
-        self._is_split = False
-        self._is_normalized = False
-        self._are_windows_generated = False
-
-    def normalize_data(self) -> 'SequenceMaker':
-        """
-        Normalizes the sequence data using PytorchScaler. Must be called AFTER
-        splitting to prevent data leakage from the test set.
-        """
-        if not self._is_split:
-            _LOGGER.error("Data must be split BEFORE normalizing. Call .split_data() first.")
-            raise RuntimeError()
-
-        if self.scaler:
-            _LOGGER.warning("Data has already been normalized.")
-            return self
-
-        # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
-        # The scaler expects 2D data [n_samples, n_features].
-        train_features = self.train_sequence.reshape(-1, 1) # type: ignore
-
-        # _PytorchDataset needs labels, so we create dummy ones.
-        dummy_labels = numpy.zeros(len(train_features))
-        temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
-
-        # 2. Fit the PytorchScaler on the temporary training dataset.
-        # The sequence is a single feature, so its index is [0].
-        _LOGGER.info("Fitting PytorchScaler on the training data...")
-        self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
-
-        # 3. Transform sequences using the fitted scaler.
-        # The transform method requires a tensor, so we convert, transform, and convert back.
-        train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-        test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-
-        self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
-        self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
-
-        self._is_normalized = True
-        _LOGGER.info("Sequence data normalized using PytorchScaler.")
-        return self
-
-    def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
-        """Splits the sequence into training and testing portions."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        split_idx = int(len(self.sequence) * (1 - test_size))
-        self.train_sequence = self.sequence[:split_idx]
-        self.test_sequence = self.sequence[split_idx - self.sequence_length:]
-
-        self.train_time_axis = self.time_axis[:split_idx]
-        self.test_time_axis = self.time_axis[split_idx:]
-
-        self._is_split = True
-        _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
-        return self
-
-    def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
-        """
-        Generates overlapping windows for features and labels.
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+        s += f"  Targets: {self.number_of_targets}\n"
+        s += f"  Features: {self.number_of_features}\n"
+        s += f"  Scaler: {'Fitted' if self.scaler else 'None'}\n"

-
-
-        if
-
-
-
-        self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence) # type: ignore
-        self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence) # type: ignore
-
-        self._are_windows_generated = True
-        _LOGGER.info("Feature and label windows generated for train and test sets.")
-        return self
-
-    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
-        """Efficiently creates windowed features and labels using numpy."""
-        if len(data) <= self.sequence_length:
-            _LOGGER.error("Data length must be greater than the sequence_length to create at least one window.")
-            raise ValueError()
-
-        if not use_sequence_labels:
-            features = data[:-1]
-            labels = data[self.sequence_length:]
+        if self._train_ds:
+            s += f"  Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._val_ds:
+            s += f"  Validation Samples: {len(self._val_ds)}\n" # type: ignore
+        if self._test_ds:
+            s += f"  Test Samples: {len(self._test_ds)}\n" # type: ignore

-
-            bytes_per_item = features.strides[0]
-            strided_features = numpy.lib.stride_tricks.as_strided(
-                features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
-            )
-            return _PytorchDataset(strided_features, labels, labels_dtype=torch.float32)
-
-        else:
-            x_data = data[:-1]
-            y_data = data[1:]
-
-            n_windows = len(x_data) - self.sequence_length + 1
-            bytes_per_item = x_data.strides[0]
-
-            strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-            strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-
-            return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
-    def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-        """Applies inverse transformation using the stored PytorchScaler."""
-        if self.scaler is None:
-            _LOGGER.error("Data was not normalized. Cannot denormalize.")
-            raise RuntimeError()
-
-        # Ensure data is a torch.Tensor
-        if isinstance(data, numpy.ndarray):
-            tensor_data = torch.tensor(data, dtype=torch.float32)
-        else:
-            tensor_data = data
-
-        # Reshape for the scaler [n_samples, n_features]
-        if tensor_data.ndim == 1:
-            tensor_data = tensor_data.view(-1, 1)
-
-        # Apply inverse transform and convert back to a flat numpy array
-        original_scale_tensor = self.scaler.inverse_transform(tensor_data)
-        return original_scale_tensor.cpu().numpy().flatten()
-
-    def plot(self, predictions: Optional[numpy.ndarray] = None):
-        """Plots the original training and testing data, with optional predictions."""
-        if not self._is_split:
-            _LOGGER.error("Cannot plot before splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        plt.figure(figsize=(15, 6))
-        plt.title("Time Series Data")
-        plt.grid(True)
-        plt.xlabel("Time")
-        plt.ylabel("Value")
-
-        plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data') # type: ignore
-        plt.plot(self.test_time_axis, self.scaler.inverse_transform(self.test_sequence[self.sequence_length-1:].reshape(-1, 1)), label='Test Data') # type: ignore
-
-        if predictions is not None:
-            pred_time_axis = self.test_time_axis[:len(predictions)]
-            plt.plot(pred_time_axis, predictions, label='Predictions', c='red')
-
-        plt.legend()
-        plt.show()
-
-    def get_datasets(self) -> Tuple[Dataset, Dataset]:
-        """Returns the final train and test datasets."""
-        if not self._are_windows_generated:
-            _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
-            raise RuntimeError()
-        return self._train_dataset, self._test_dataset
+        return s


 def info():
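Taken together, the hunks above replace the old two-way train/test split of DatasetMaker with a three-way train/validation/test split in DragonDataset, add a `kind` argument that fixes the label dtype, and attach an optional class map. A minimal usage sketch of the new constructor; `df` and `schema` are placeholders for a pre-processed DataFrame and a FeatureSchema built elsewhere in the pipeline:

    from ml_tools.ML_datasetmaster import DragonDataset

    # df: pre-processed DataFrame (feature columns + one target column)
    # schema: FeatureSchema from data_exploration (placeholder here)
    dataset = DragonDataset(
        pandas_df=df,
        schema=schema,
        kind="multiclass classification",  # or "regression" / "binary classification"
        scaler="fit",             # fit a new DragonScaler on continuous features
        validation_size=0.2,      # fraction of the *original* rows
        test_size=0.1,
    )

    # The constructor first carves out the test split, then rescales the
    # validation fraction to the remaining rows:
    #     val_split_size = 0.2 / (1.0 - 0.1) ≈ 0.222
    # i.e. about 22.2% of the remaining 90%, which is again 20% of the original.

    dataset.set_class_map({"cat": 0, "dog": 1, "bird": 2})  # classification only
    train_ds = dataset.train_dataset
    val_ds = dataset.validation_dataset
    test_ds = dataset.test_dataset
    dataset.save_artifacts("artifacts/")  # feature/target names, scaler state, class map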