dragon-ml-toolbox 14.7.0__py3-none-any.whl → 16.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +9 -5
- dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
- ml_tools/ETL_cleaning.py +20 -20
- ml_tools/ETL_engineering.py +23 -25
- ml_tools/GUI_tools.py +20 -20
- ml_tools/MICE_imputation.py +3 -3
- ml_tools/ML_callbacks.py +43 -26
- ml_tools/ML_configuration.py +704 -24
- ml_tools/ML_datasetmaster.py +235 -280
- ml_tools/ML_evaluation.py +144 -39
- ml_tools/ML_evaluation_multi.py +103 -35
- ml_tools/ML_inference.py +290 -208
- ml_tools/ML_models.py +13 -102
- ml_tools/ML_models_advanced.py +1 -1
- ml_tools/ML_optimization.py +12 -12
- ml_tools/ML_scaler.py +11 -11
- ml_tools/ML_sequence_datasetmaster.py +341 -0
- ml_tools/ML_sequence_evaluation.py +219 -0
- ml_tools/ML_sequence_inference.py +391 -0
- ml_tools/ML_sequence_models.py +139 -0
- ml_tools/ML_trainer.py +1342 -386
- ml_tools/ML_utilities.py +1 -1
- ml_tools/ML_vision_datasetmaster.py +120 -72
- ml_tools/ML_vision_evaluation.py +30 -6
- ml_tools/ML_vision_inference.py +129 -152
- ml_tools/ML_vision_models.py +1 -1
- ml_tools/ML_vision_transformers.py +121 -40
- ml_tools/PSO_optimization.py +6 -6
- ml_tools/SQL.py +4 -4
- ml_tools/{keys.py → _keys.py} +45 -0
- ml_tools/_schema.py +1 -1
- ml_tools/ensemble_evaluation.py +1 -1
- ml_tools/ensemble_inference.py +7 -33
- ml_tools/ensemble_learning.py +1 -1
- ml_tools/optimization_tools.py +2 -2
- ml_tools/path_manager.py +5 -5
- ml_tools/utilities.py +1 -2
- dragon_ml_toolbox-14.7.0.dist-info/RECORD +0 -49
- ml_tools/RNN_forecast.py +0 -56
- ml_tools/_ML_vision_recipe.py +0 -88
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_datasetmaster.py
CHANGED
@@ -3,27 +3,25 @@ from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
-from typing import Literal, Union, Tuple, List, Optional
-from abc import ABC, abstractmethod
-import matplotlib.pyplot as plt
+from typing import Literal, Union, List, Optional
+from abc import ABC
 from pathlib import Path
 
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
-from .ML_scaler import PytorchScaler
-from .keys import DatasetKeys
+from .ML_scaler import DragonScaler
+from ._keys import DatasetKeys, MLTaskKeys
 from ._schema import FeatureSchema
+from .custom_logger import custom_logger
 
 
 __all__ = [
-    "DatasetMaker",
-    "DatasetMakerMulti",
-    "SequenceMaker"
+    "DragonDataset",
+    "DragonDatasetMulti"
 ]
 
-
 # --- Internal Helper Class ---
 class _PytorchDataset(Dataset):
     """
@@ -57,6 +55,8 @@ class _PytorchDataset(Dataset):
 
         self._feature_names = feature_names
         self._target_names = target_names
+        self._classes: List[str] = []
+        self._class_map: dict[str,int] = dict()
 
     def __len__(self):
         return len(self.features)
@@ -78,6 +78,15 @@ class _PytorchDataset(Dataset):
             return self._target_names
         else:
             _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
+            raise ValueError()
+
+    @property
+    def classes(self):
+        return self._classes
+
+    @property
+    def class_map(self):
+        return self._class_map
 
 
 # --- Abstract Base Class ---
@@ -88,23 +97,29 @@ class _BaseDatasetMaker(ABC):
     """
     def __init__(self):
         self._train_ds: Optional[Dataset] = None
+        self._val_ds: Optional[Dataset] = None
         self._test_ds: Optional[Dataset] = None
-        self.scaler: Optional[PytorchScaler] = None
+        self.scaler: Optional[DragonScaler] = None
         self._id: Optional[str] = None
         self._feature_names: List[str] = []
         self._target_names: List[str] = []
         self._X_train_shape = (0,0)
+        self._X_val_shape = (0,0)
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
+        self._y_val_shape = (0,)
         self._y_test_shape = (0,)
+        self.class_map: dict[str, int] = dict()
+        self.classes: list[str] = list()
 
     def _prepare_scaler(self,
                         X_train: pandas.DataFrame,
                         y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_val: pandas.DataFrame,
                         X_test: pandas.DataFrame,
                         label_dtype: torch.dtype,
                         schema: FeatureSchema):
-        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
+        """Internal helper to fit and apply a DragonScaler using a FeatureSchema."""
        continuous_feature_indices: Optional[List[int]] = None
 
         # Get continuous feature indices *from the schema*
@@ -122,26 +137,33 @@ class _BaseDatasetMaker(ABC):
             _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
 
         X_train_values = X_train.to_numpy()
+        X_val_values = X_val.to_numpy()
         X_test_values = X_test.to_numpy()
 
         # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
-            _LOGGER.info("Fitting a new PytorchScaler on training data.")
+            _LOGGER.info("Fitting a new DragonScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
-            self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+            self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices)
 
         if self.scaler and self.scaler.mean_ is not None:
-            _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+            _LOGGER.info("Applying scaler transformation to train, validation, and test feature sets.")
             X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+            X_val_tensor = self.scaler.transform(torch.tensor(X_val_values, dtype=torch.float32))
             X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
-            return X_train_tensor.numpy(), X_test_tensor.numpy()
+            return X_train_tensor.numpy(), X_val_tensor.numpy(), X_test_tensor.numpy()
 
-        return X_train_values, X_test_values
+        return X_train_values, X_val_values, X_test_values
 
     @property
     def train_dataset(self) -> Dataset:
         if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
         return self._train_ds
+
+    @property
+    def validation_dataset(self) -> Dataset:
+        if self._val_ds is None: raise RuntimeError("Dataset not yet created.")
+        return self._val_ds
 
     @property
     def test_dataset(self) -> Dataset:
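The `_prepare_scaler` changes above now return three transformed arrays (train/validation/test), with statistics fitted on the training split only. A minimal sketch of the column-restricted standardization this implies — illustrative only, not the `DragonScaler` API:

```python
import torch

def standardize(x: torch.Tensor, mean: torch.Tensor, std: torch.Tensor,
                idx: list[int]) -> torch.Tensor:
    # Scale only the selected (continuous) columns; leave the rest untouched.
    out = x.clone()
    out[:, idx] = (x[:, idx] - mean) / std
    return out

X_train = torch.tensor([[1.0, 0.0], [3.0, 1.0], [5.0, 0.0]])
idx = [0]                            # column 1 (e.g. one-hot) is skipped
mean = X_train[:, idx].mean(dim=0)   # statistics come from the training split only
std = X_train[:, idx].std(dim=0)
print(standardize(X_train, mean, std, idx))  # column 0 becomes [-1, 0, 1]
```

Validation and test tensors would be transformed with the same `mean`/`std`, which is why the method fits once and transforms all three splits.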
@@ -176,6 +198,7 @@ class _BaseDatasetMaker(ABC):
     def dataframes_info(self) -> None:
         print("--- DataFrame Shapes After Split ---")
         print(f"  X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+        print(f"  X_val shape: {self._X_val_shape}, y_val shape: {self._y_val_shape}")
         print(f"  X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
         print("------------------------------------")
 
@@ -195,7 +218,7 @@ class _BaseDatasetMaker(ABC):
 
     def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """
-        Saves the fitted PytorchScaler's state to a .pth file.
+        Saves the fitted DragonScaler's state to a .pth file.
 
         The filename is automatically generated based on the dataset id.
 
@@ -215,6 +238,24 @@ class _BaseDatasetMaker(ABC):
         self.scaler.save(filepath, verbose=False)
         if verbose:
             _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+    def save_class_map(self, directory: Union[str,Path], verbose: bool=True) -> None:
+        """
+        Saves the class to index mapping {str: int} to a directory.
+        """
+        if not self.class_map:
+            _LOGGER.warning(f"No class_map defined. Skipping.")
+            return
+
+        log_name = f"Class_to_Index_{self.id}" if self.id else "Class_to_Index"
+
+        custom_logger(data=self.class_map,
+                      save_directory=directory,
+                      log_name=log_name,
+                      add_timestamp=False,
+                      dict_as="json")
+        if verbose:
+            _LOGGER.info(f"Class map for '{self.id}' saved as '{log_name}.json'.")
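`save_class_map` delegates persistence to the package's `custom_logger` with `dict_as="json"`. Assuming that writes the dictionary as a plain JSON file named after `log_name` (an assumption; `custom_logger` itself is not shown in this diff), the resulting artifact would be equivalent to:

```python
import json
from pathlib import Path

# Hypothetical equivalent of the Class_to_Index_<id>.json artifact.
class_map = {"cat": 0, "dog": 1, "bird": 2}
path = Path("Class_to_Index_demo.json")  # illustrative file name
path.write_text(json.dumps(class_map, indent=2))
print(path.read_text())
```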
@@ -224,19 +265,22 @@ class _BaseDatasetMaker(ABC):
         self.save_target_names(directory=directory, verbose=verbose)
         if self.scaler is not None:
             self.save_scaler(directory=directory, verbose=verbose)
+        if self.class_map:
+            self.save_class_map(directory=directory, verbose=verbose)
 
 
 # Single target dataset
-class DatasetMaker(_BaseDatasetMaker):
+class DragonDataset(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
     This class takes a DataFrame, and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
-    It can also create and apply a PytorchScaler using the schema.
+    It can also create and apply a DragonScaler using the schema.
 
     Attributes:
-        `scaler` -> PytorchScaler | None
+        `scaler` -> DragonScaler | None
         `train_dataset` -> PyTorch Dataset
+        `validation_dataset` -> PyTorch Dataset
         `test_dataset` -> PyTorch Dataset
         `feature_names` -> list[str]
         `target_names` -> list[str]
@@ -247,9 +291,11 @@ class DatasetMaker(_BaseDatasetMaker):
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  schema: FeatureSchema,
-                 kind: Literal["regression", "classification"],
-                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-                 test_size: float = 0.2,
+                 kind: Literal["regression", "binary classification", "multiclass classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                 validation_size: float = 0.2,
+                 test_size: float = 0.1,
+                 class_map: Optional[dict[str,int]]=None,
                  random_state: int = 42):
         """
         Args:
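The new `kind` parameter mirrors constants from the renamed `_keys` module (`MLTaskKeys`), whose definitions are not part of this diff. A hypothetical reconstruction of the values, inferred from the `Literal[...]` annotations and the error messages below — not the actual source:

```python
# Inferred sketch of _keys.MLTaskKeys; the real module may differ.
class MLTaskKeys:
    REGRESSION = "regression"
    BINARY_CLASSIFICATION = "binary classification"
    MULTICLASS_CLASSIFICATION = "multiclass classification"
    MULTITARGET_REGRESSION = "multitarget regression"
    MULTILABEL_BINARY_CLASSIFICATION = "multilabel binary classification"
```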
@@ -257,32 +303,46 @@ class DatasetMaker(_BaseDatasetMaker):
             The pre-processed input DataFrame containing all columns. (features and single target).
         schema (FeatureSchema):
             The definitive schema object from data_exploration.
-        kind ("regression" | "classification"):
-            The type of ML task.
-        scaler ("fit" | "none" | PytorchScaler):
+        kind (str):
+            The type of ML task. Must be one of:
+            - "regression"
+            - "binary classification"
+            - "multiclass classification"
+        scaler ("fit" | "none" | DragonScaler):
             Strategy for data scaling:
-            - "fit": Fit a new PytorchScaler on continuous features.
+            - "fit": Fit a new DragonScaler on continuous features.
             - "none": Do not scale data (e.g., for TabularTransformer).
-            - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            - DragonScaler instance: Use a pre-fitted scaler to transform data.
+        validation_size (float):
+            The proportion of the *original* dataset to allocate to the validation split.
         test_size (float):
-            The proportion of the dataset to allocate to the test split.
+            The proportion of the dataset to allocate to the test split (can be 0).
+        class_map (dict[str,int] | None): Optional class map for the target classes in classification tasks. Can be set later using `.set_class_map()`.
         random_state (int):
             The seed for the random number generator, for reproducibility.
 
         """
         super().__init__()
 
+        # --- Validation for split sizes ---
+        if (validation_size + test_size) >= 1.0:
+            _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+            raise ValueError()
+        elif validation_size <= 0.0:
+            _LOGGER.error(f"Invalid validation split of {validation_size}.")
+            raise ValueError()
+
         _apply_scaling: bool = False
         if scaler == "fit":
             self.scaler = None # To be created
             _apply_scaling = True
         elif scaler == "none":
             self.scaler = None
-        elif isinstance(scaler, PytorchScaler):
+        elif isinstance(scaler, DragonScaler):
             self.scaler = scaler # Use the provided one
             _apply_scaling = True
         else:
-            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
             raise ValueError()
 
         # --- 1. Identify features (from schema) ---
@@ -298,7 +358,7 @@ class DatasetMaker(_BaseDatasetMaker):
             _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
             raise ValueError("No target column found in DataFrame.")
         if len(target_cols_set) > 1:
-            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. …
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. One target required.")
             raise ValueError("Ambiguous target: More than one non-feature column found.")
 
         target_name = list(target_cols_set)[0]
@@ -308,32 +368,112 @@ class DatasetMaker(_BaseDatasetMaker):
         # --- 3. Split Data ---
         features_df = pandas_df[self._feature_names]
         target_series = pandas_df[target_name]
-
-        X_train, X_test, y_train, y_test = train_test_split(
+
+        # First split: (Train + Val) vs Test
+        X_train_val, X_test, y_train_val, y_test = train_test_split(
             features_df,
             target_series,
             test_size=test_size,
             random_state=random_state
         )
-        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+        # Calculate validation split size relative to the (Train + Val) set
+        val_split_size = validation_size / (1.0 - test_size)
+
+        # Second split: Train vs Val
+        X_train, X_val, y_train, y_val = train_test_split(
+            X_train_val,
+            y_train_val,
+            test_size=val_split_size,
+            random_state=random_state
+        )
+
+        self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+        self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
 
-        label_dtype = torch.float32 if kind == "regression" else torch.int64
+        # --- label_dtype logic ---
+        if kind == MLTaskKeys.REGRESSION or kind == MLTaskKeys.BINARY_CLASSIFICATION:
+            label_dtype = torch.float32
+        elif kind == MLTaskKeys.MULTICLASS_CLASSIFICATION:
+            label_dtype = torch.int64
+        else:
+            _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.REGRESSION}', '{MLTaskKeys.BINARY_CLASSIFICATION}', or '{MLTaskKeys.MULTICLASS_CLASSIFICATION}'.")
+            raise ValueError()
+        self.kind = kind
 
         # --- 4. Scale (using the schema) ---
         if _apply_scaling:
-            X_train_final, X_test_final = self._prepare_scaler(
-                X_train, y_train, X_test, label_dtype, schema
+            X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_val, X_test, label_dtype, schema
             )
         else:
             _LOGGER.info("Features have not been scaled as specified.")
             X_train_final = X_train.to_numpy()
+            X_val_final = X_val.to_numpy()
             X_test_final = X_test.to_numpy()
 
         # --- 5. Create Datasets ---
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-
+
+        # --- 6. create class map if given ---
+        if self.kind != MLTaskKeys.REGRESSION:
+            if class_map is None:
+                self.class_map = dict()
+            else:
+                self.set_class_map(class_map)
+        else:
+            self.class_map = dict()
+
+    def set_class_map(self, class_map: dict[str, int], force_overwrite: bool=False) -> None:
+        """
+        Sets a map of class_name -> integer_label.
+
+        This is used by the InferenceHandler and to finalize the model after training.
+
+        Args:
+            class_map (Dict[str, int]): A dictionary mapping the class name
+                to its integer label.
+                Example: {'cat': 0, 'dog': 1, 'bird': 2}
+            force_overwrite (bool): Required to overwrite a previously set class map.
+        """
+        if self.kind == MLTaskKeys.REGRESSION:
+            _LOGGER.warning(f"Class Map is for classification tasks only.")
+            return
+
+        if self.class_map:
+            warning_message = f"Class map was previously set."
+            if not force_overwrite:
+                warning_message += " Use `force_overwrite=True` to set new values."
+                _LOGGER.warning(warning_message)
+                return
+            else:
+                warning_message += ". Setting new values..."
+                _LOGGER.warning(warning_message)
+
+        self.class_map = class_map
+
+        try:
+            sorted_items = sorted(class_map.items(), key=lambda item: item[1])
+            class_list = [item[0] for item in sorted_items]
+        except Exception as e:
+            _LOGGER.error(f"Could not sort class map. Ensure it is a dict of {str: int}. Error: {e}")
+            raise TypeError()
+        else:
+            self.classes = class_list
+
+        if self._train_ds:
+            self._train_ds._classes = class_list # type: ignore
+            self._train_ds._class_map = class_map # type: ignore
+        if self._val_ds:
+            self._val_ds._classes = class_list # type: ignore
+            self._val_ds._class_map = class_map # type: ignore
+        if self._test_ds:
+            self._test_ds._classes = class_list # type: ignore
+            self._test_ds._class_map = class_map # type: ignore
+
+        _LOGGER.info(f"Class map set for dataset '{self.id}' and its subsets:\n{class_map}")
+
     def __repr__(self) -> str:
         s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
         s += f"  Target: {self.target_names[0]}\n"
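The two-stage split above rescales the validation fraction because the second `train_test_split` only sees the `1 - test_size` share left over from the first. A standalone check of the arithmetic with the new defaults (illustrative data, not package code):

```python
import numpy as np
from sklearn.model_selection import train_test_split

X, y = np.arange(100).reshape(50, 2), np.arange(50)
validation_size, test_size = 0.2, 0.1

# First split: hold out the test set (5 of 50 rows).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42)

# 0.2 / (1 - 0.1) ≈ 0.222 of the remaining 45 rows -> 10 rows, i.e. 20% overall.
val_split_size = validation_size / (1.0 - test_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=val_split_size, random_state=42)

print(len(X_train), len(X_val), len(X_test))  # 35 10 5
```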
@@ -342,6 +482,8 @@ class DatasetMaker(_BaseDatasetMaker):
 
         if self._train_ds:
             s += f"  Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._val_ds:
+            s += f"  Validation Samples: {len(self._val_ds)}\n" # type: ignore
         if self._test_ds:
             s += f"  Test Samples: {len(self._test_ds)}\n" # type: ignore
 
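`set_class_map` (added in the hunk above) derives the ordered class list by sorting names on their integer labels, so that `classes[i]` is the name of label `i`. The ordering logic in isolation:

```python
# Standalone sketch of the ordering performed inside set_class_map.
class_map = {"cat": 0, "dog": 1, "bird": 2}
classes = [name for name, _ in sorted(class_map.items(), key=lambda kv: kv[1])]
assert classes == ["cat", "dog", "bird"]  # classes[i] names label i
```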
@@ -349,7 +491,7 @@ class DatasetMaker(_BaseDatasetMaker):
 
 
 # --- Multi-Target Class ---
-class DatasetMakerMulti(_BaseDatasetMaker):
+class DragonDatasetMulti(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with
     multiple target columns.
@@ -358,15 +500,15 @@ class DatasetMakerMulti(_BaseDatasetMaker):
     *target_columns*. It validates that the schema's features and the
     target columns are mutually exclusive and together account for all
     columns in the DataFrame.
-
-    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
                  schema: FeatureSchema,
-                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-                 test_size: float = 0.2,
+                 kind: Literal["multitarget regression", "multilabel binary classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                 validation_size: float = 0.2,
+                 test_size: float = 0.1,
                  random_state: int = 42):
         """
         Args:
@@ -377,11 +519,17 @@ class DatasetMakerMulti(_BaseDatasetMaker):
             List of target column names.
         schema (FeatureSchema):
             The definitive schema object from data_exploration.
-        scaler ("fit" | "none" | PytorchScaler):
+        kind (str):
+            The type of multi-target ML task. Must be one of:
+            - "multitarget regression"
+            - "multilabel binary classification"
+        scaler ("fit" | "none" | DragonScaler):
             Strategy for data scaling:
-            - "fit": Fit a new PytorchScaler on continuous features.
+            - "fit": Fit a new DragonScaler on continuous features.
             - "none": Do not scale data (e.g., for TabularTransformer).
-            - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            - DragonScaler instance: Use a pre-fitted scaler to transform data.
+        validation_size (float):
+            The proportion of the dataset to allocate to the validation split.
         test_size (float):
             The proportion of the dataset to allocate to the test split.
         random_state (int):
@@ -389,21 +537,34 @@ class DatasetMakerMulti(_BaseDatasetMaker):
 
         ## Note:
         For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
-        This loss function requires the labels to be torch.float32 which is the same type required for …
+        This loss function requires the labels to be torch.float32 which is the same type required for multi-regression tasks.
         """
         super().__init__()
 
+        # --- Validation for split sizes ---
+        if (validation_size + test_size) >= 1.0:
+            _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+            raise ValueError("validation_size and test_size sum must be < 1.0")
+        elif validation_size <= 0.0:
+            _LOGGER.error(f"Invalid validation split of {validation_size}.")
+            raise ValueError()
+
+        # --- Validate kind parameter ---
+        if kind not in [MLTaskKeys.MULTITARGET_REGRESSION, MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION]:
+            _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.MULTITARGET_REGRESSION}' or '{MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION}'.")
+            raise ValueError()
+
         _apply_scaling: bool = False
         if scaler == "fit":
             self.scaler = None
             _apply_scaling = True
         elif scaler == "none":
             self.scaler = None
-        elif isinstance(scaler, PytorchScaler):
+        elif isinstance(scaler, DragonScaler):
             self.scaler = scaler # Use the provided one
             _apply_scaling = True
         else:
-            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
             raise ValueError()
 
         # --- 1. Get features and targets from schema/args ---
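The docstring note above about float32 labels is easy to verify: `nn.BCEWithLogitsLoss` expects floating-point multi-label targets (it raises on integer targets), the same dtype multi-target regression uses.

```python
import torch
import torch.nn as nn

logits = torch.randn(4, 3)                     # 4 samples, 3 binary labels
targets = torch.randint(0, 2, (4, 3)).float()  # must be float32, not int64
print(nn.BCEWithLogitsLoss()(logits, targets).item())
```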
@@ -433,32 +594,47 @@ class DatasetMakerMulti(_BaseDatasetMaker):
         # --- 3. Split Data ---
         features_df = pandas_df[self._feature_names]
         target_df = pandas_df[self._target_names]
-
-        X_train, X_test, y_train, y_test = train_test_split(
+
+        # First split: (Train + Val) vs Test
+        X_train_val, X_test, y_train_val, y_test = train_test_split(
             features_df,
             target_df,
             test_size=test_size,
             random_state=random_state
         )
-        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+
+        # Calculate validation split size relative to the (Train + Val) set
+        val_split_size = validation_size / (1.0 - test_size)
+
+        # Second split: Train vs Val
+        X_train, X_val, y_train, y_val = train_test_split(
+            X_train_val,
+            y_train_val,
+            test_size=val_split_size,
+            random_state=random_state
+        )
+
+        self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+        self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
 
         # Multi-target for regression or multi-binary
         label_dtype = torch.float32
 
         # --- 4. Scale (using the schema) ---
         if _apply_scaling:
-            X_train_final, X_test_final = self._prepare_scaler(
-                X_train, y_train, X_test, label_dtype, schema
+            X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_val, X_test, label_dtype, schema
             )
         else:
             _LOGGER.info("Features have not been scaled as specified.")
             X_train_final = X_train.to_numpy()
+            X_val_final = X_val.to_numpy()
             X_test_final = X_test.to_numpy()
 
         # --- 5. Create Datasets ---
         # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
     def __repr__(self) -> str:
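The final hunk below removes the `SequenceMaker` pipeline (windowed time-series datasets now appear to live in the new `ml_tools/ML_sequence_*.py` modules listed at the top of this diff). For reference, a standalone demo of the `as_strided` windowing trick used by the removed `_create_windowed_dataset`:

```python
import numpy as np

# One-step-ahead windowing: window i is data[i : i+seq_len],
# and its label is data[i + seq_len].
data = np.arange(8, dtype=np.float32)
seq_len = 3
features = data[:-1]
labels = data[seq_len:]
n_windows = len(features) - seq_len + 1
step = features.strides[0]
windows = np.lib.stride_tricks.as_strided(
    features, shape=(n_windows, seq_len), strides=(step, step))
print(windows)  # [[0 1 2] [1 2 3] [2 3 4] [3 4 5] [4 5 6]]
print(labels)   # [3. 4. 5. 6. 7.]
```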
@@ -469,234 +645,13 @@ class DatasetMakerMulti(_BaseDatasetMaker):
 
         if self._train_ds:
             s += f"  Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._val_ds:
+            s += f"  Validation Samples: {len(self._val_ds)}\n" # type: ignore
         if self._test_ds:
             s += f"  Test Samples: {len(self._test_ds)}\n" # type: ignore
 
         return s
 
 
-# --- Private Base Class ---
-class _BaseMaker(ABC):
-    """
-    Abstract Base Class for extra dataset makers.
-    """
-    def __init__(self):
-        self._train_dataset = None
-        self._test_dataset = None
-        self._val_dataset = None
-
-    @abstractmethod
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """
-        The primary method to retrieve the final, processed PyTorch datasets.
-        Must be implemented by all subclasses.
-        """
-        pass
-
-
-# --- SequenceMaker ---
-class SequenceMaker(_BaseMaker):
-    """
-    Creates windowed PyTorch datasets from time-series data.
-
-    Pipeline:
-
-    1. `.split_data()`: Separate time series into training and testing portions.
-    2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
-    3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
-    4. `.get_datasets()`: Return Pytorch train and test datasets.
-    """
-    def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
-        super().__init__()
-        self.sequence_length = sequence_length
-        self.scaler = None
-
-        if isinstance(data, pandas.DataFrame):
-            self.time_axis = data.index.values
-            self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
-        elif isinstance(data, pandas.Series):
-            self.time_axis = data.index.values
-            self.sequence = data.values.astype(numpy.float32)
-        elif isinstance(data, numpy.ndarray):
-            self.time_axis = numpy.arange(len(data))
-            self.sequence = data.astype(numpy.float32)
-        else:
-            _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
-            raise TypeError()
-
-        self.train_sequence = None
-        self.test_sequence = None
-
-        self._is_split = False
-        self._is_normalized = False
-        self._are_windows_generated = False
-
-    def normalize_data(self) -> 'SequenceMaker':
-        """
-        Normalizes the sequence data using PytorchScaler. Must be called AFTER
-        splitting to prevent data leakage from the test set.
-        """
-        if not self._is_split:
-            _LOGGER.error("Data must be split BEFORE normalizing. Call .split_data() first.")
-            raise RuntimeError()
-
-        if self.scaler:
-            _LOGGER.warning("Data has already been normalized.")
-            return self
-
-        # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
-        # The scaler expects 2D data [n_samples, n_features].
-        train_features = self.train_sequence.reshape(-1, 1) # type: ignore
-
-        # _PytorchDataset needs labels, so we create dummy ones.
-        dummy_labels = numpy.zeros(len(train_features))
-        temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
-
-        # 2. Fit the PytorchScaler on the temporary training dataset.
-        # The sequence is a single feature, so its index is [0].
-        _LOGGER.info("Fitting PytorchScaler on the training data...")
-        self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
-
-        # 3. Transform sequences using the fitted scaler.
-        # The transform method requires a tensor, so we convert, transform, and convert back.
-        train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-        test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-
-        self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
-        self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
-
-        self._is_normalized = True
-        _LOGGER.info("Sequence data normalized using PytorchScaler.")
-        return self
-
-    def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
-        """Splits the sequence into training and testing portions."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        split_idx = int(len(self.sequence) * (1 - test_size))
-        self.train_sequence = self.sequence[:split_idx]
-        self.test_sequence = self.sequence[split_idx - self.sequence_length:]
-
-        self.train_time_axis = self.time_axis[:split_idx]
-        self.test_time_axis = self.time_axis[split_idx:]
-
-        self._is_split = True
-        _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
-        return self
-
-    def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
-        """
-        Generates overlapping windows for features and labels.
-
-        "sequence-to-sequence": Label vectors are of the same size as the feature vectors instead of a single future prediction.
-        """
-        if not self._is_split:
-            _LOGGER.error("Cannot generate windows before splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence) # type: ignore
-        self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence) # type: ignore
-
-        self._are_windows_generated = True
-        _LOGGER.info("Feature and label windows generated for train and test sets.")
-        return self
-
-    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
-        """Efficiently creates windowed features and labels using numpy."""
-        if len(data) <= self.sequence_length:
-            _LOGGER.error("Data length must be greater than the sequence_length to create at least one window.")
-            raise ValueError()
-
-        if not use_sequence_labels:
-            features = data[:-1]
-            labels = data[self.sequence_length:]
-
-            n_windows = len(features) - self.sequence_length + 1
-            bytes_per_item = features.strides[0]
-            strided_features = numpy.lib.stride_tricks.as_strided(
-                features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
-            )
-            return _PytorchDataset(strided_features, labels, labels_dtype=torch.float32)
-
-        else:
-            x_data = data[:-1]
-            y_data = data[1:]
-
-            n_windows = len(x_data) - self.sequence_length + 1
-            bytes_per_item = x_data.strides[0]
-
-            strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-            strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-
-            return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
-    def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-        """Applies inverse transformation using the stored PytorchScaler."""
-        if self.scaler is None:
-            _LOGGER.error("Data was not normalized. Cannot denormalize.")
-            raise RuntimeError()
-
-        # Ensure data is a torch.Tensor
-        if isinstance(data, numpy.ndarray):
-            tensor_data = torch.tensor(data, dtype=torch.float32)
-        else:
-            tensor_data = data
-
-        # Reshape for the scaler [n_samples, n_features]
-        if tensor_data.ndim == 1:
-            tensor_data = tensor_data.view(-1, 1)
-
-        # Apply inverse transform and convert back to a flat numpy array
-        original_scale_tensor = self.scaler.inverse_transform(tensor_data)
-        return original_scale_tensor.cpu().numpy().flatten()
-
-    def plot(self, predictions: Optional[numpy.ndarray] = None):
-        """Plots the original training and testing data, with optional predictions."""
-        if not self._is_split:
-            _LOGGER.error("Cannot plot before splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        plt.figure(figsize=(15, 6))
-        plt.title("Time Series Data")
-        plt.grid(True)
-        plt.xlabel("Time")
-        plt.ylabel("Value")
-
-        plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data') # type: ignore
-        plt.plot(self.test_time_axis, self.scaler.inverse_transform(self.test_sequence[self.sequence_length-1:].reshape(-1, 1)), label='Test Data') # type: ignore
-
-        if predictions is not None:
-            pred_time_axis = self.test_time_axis[:len(predictions)]
-            plt.plot(pred_time_axis, predictions, label='Predictions', c='red')
-
-        plt.legend()
-        plt.show()
-
-    def get_datasets(self) -> Tuple[Dataset, Dataset]:
-        """Returns the final train and test datasets."""
-        if not self._are_windows_generated:
-            _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
-            raise RuntimeError()
-        return self._train_dataset, self._test_dataset
-
-    def __repr__(self) -> str:
-        s = f"<{self.__class__.__name__}>:\n"
-        s += f"  Sequence Length (Window): {self.sequence_length}\n"
-        s += f"  Total Data Points: {len(self.sequence)}\n"
-        s += "  --- Status ---\n"
-        s += f"  Split: {self._is_split}\n"
-        s += f"  Normalized: {self._is_normalized}\n"
-        s += f"  Windows Generated: {self._are_windows_generated}\n"
-
-        if self._are_windows_generated:
-            train_len = len(self._train_dataset) if self._train_dataset else 0 # type: ignore
-            test_len = len(self._test_dataset) if self._test_dataset else 0 # type: ignore
-            s += f"  Datasets (Train/Test): {train_len} / {test_len} windows\n"
-
-        return s
-
-
 def info():
     _script_info(__all__)