dragon-ml-toolbox 13.3.0__py3-none-any.whl → 16.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +20 -6
- dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
- ml_tools/ETL_cleaning.py +20 -20
- ml_tools/ETL_engineering.py +23 -25
- ml_tools/GUI_tools.py +20 -20
- ml_tools/MICE_imputation.py +207 -5
- ml_tools/ML_callbacks.py +43 -26
- ml_tools/ML_configuration.py +788 -0
- ml_tools/ML_datasetmaster.py +303 -448
- ml_tools/ML_evaluation.py +351 -93
- ml_tools/ML_evaluation_multi.py +139 -42
- ml_tools/ML_inference.py +290 -209
- ml_tools/ML_models.py +33 -106
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +12 -12
- ml_tools/ML_scaler.py +11 -11
- ml_tools/ML_sequence_datasetmaster.py +341 -0
- ml_tools/ML_sequence_evaluation.py +219 -0
- ml_tools/ML_sequence_inference.py +391 -0
- ml_tools/ML_sequence_models.py +139 -0
- ml_tools/ML_trainer.py +1604 -179
- ml_tools/ML_utilities.py +351 -4
- ml_tools/ML_vision_datasetmaster.py +1540 -0
- ml_tools/ML_vision_evaluation.py +284 -0
- ml_tools/ML_vision_inference.py +405 -0
- ml_tools/ML_vision_models.py +641 -0
- ml_tools/ML_vision_transformers.py +284 -0
- ml_tools/PSO_optimization.py +6 -6
- ml_tools/SQL.py +4 -4
- ml_tools/_keys.py +171 -0
- ml_tools/_schema.py +1 -1
- ml_tools/custom_logger.py +37 -14
- ml_tools/data_exploration.py +502 -93
- ml_tools/ensemble_evaluation.py +54 -11
- ml_tools/ensemble_inference.py +7 -33
- ml_tools/ensemble_learning.py +1 -1
- ml_tools/math_utilities.py +1 -1
- ml_tools/optimization_tools.py +2 -2
- ml_tools/path_manager.py +5 -5
- ml_tools/serde.py +2 -2
- ml_tools/utilities.py +192 -4
- dragon_ml_toolbox-13.3.0.dist-info/RECORD +0 -41
- ml_tools/RNN_forecast.py +0 -56
- ml_tools/keys.py +0 -87
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_datasetmaster.py
CHANGED
@@ -1,34 +1,27 @@
 import torch
-from torch.utils.data import Dataset, Subset
+from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
-from typing import Literal, Union, Tuple, List, Optional
-from abc import ABC
-from PIL import Image, ImageOps
-from torchvision.datasets import ImageFolder
-from torchvision import transforms
-import matplotlib.pyplot as plt
+from typing import Literal, Union, List, Optional
+from abc import ABC
 from pathlib import Path
 
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
-from .ML_scaler import PytorchScaler
-from .keys import
+from .ML_scaler import DragonScaler
+from ._keys import DatasetKeys, MLTaskKeys
 from ._schema import FeatureSchema
+from .custom_logger import custom_logger
 
 
 __all__ = [
-    "DatasetMaker",
-    "DatasetMakerMulti",
-    "VisionDatasetMaker",
-    "SequenceMaker",
-    "ResizeAspectFill",
+    "DragonDataset",
+    "DragonDatasetMulti"
 ]
 
-
 # --- Internal Helper Class ---
 class _PytorchDataset(Dataset):
     """
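The module's public surface shrinks to the two tabular makers, renamed with the library's new `Dragon` prefix (`DatasetMaker` → `DragonDataset`, `DatasetMakerMulti` → `DragonDatasetMulti`), and the scaler import switches from `PytorchScaler` to `DragonScaler`. A minimal import sketch against 16.2.0, with module paths taken from the file list above:

```python
# Sketch only: how the renamed entry points would be imported in 16.2.0.
from ml_tools.ML_datasetmaster import DragonDataset, DragonDatasetMulti
from ml_tools.ML_scaler import DragonScaler
```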
@@ -62,6 +55,8 @@ class _PytorchDataset(Dataset):
 
         self._feature_names = feature_names
         self._target_names = target_names
+        self._classes: List[str] = []
+        self._class_map: dict[str,int] = dict()
 
     def __len__(self):
         return len(self.features)
@@ -83,6 +78,15 @@ class _PytorchDataset(Dataset):
             return self._target_names
         else:
             _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
+            raise ValueError()
+
+    @property
+    def classes(self):
+        return self._classes
+
+    @property
+    def class_map(self):
+        return self._class_map
 
 
 # --- Abstract Base Class ---
@@ -93,23 +97,29 @@ class _BaseDatasetMaker(ABC):
     """
     def __init__(self):
         self._train_ds: Optional[Dataset] = None
+        self._val_ds: Optional[Dataset] = None
         self._test_ds: Optional[Dataset] = None
-        self.scaler: Optional[PytorchScaler] = None
+        self.scaler: Optional[DragonScaler] = None
         self._id: Optional[str] = None
         self._feature_names: List[str] = []
         self._target_names: List[str] = []
         self._X_train_shape = (0,0)
+        self._X_val_shape = (0,0)
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
+        self._y_val_shape = (0,)
         self._y_test_shape = (0,)
+        self.class_map: dict[str, int] = dict()
+        self.classes: list[str] = list()
 
     def _prepare_scaler(self,
                         X_train: pandas.DataFrame,
                         y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_val: pandas.DataFrame,
                         X_test: pandas.DataFrame,
                         label_dtype: torch.dtype,
                         schema: FeatureSchema):
-        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
+        """Internal helper to fit and apply a DragonScaler using a FeatureSchema."""
        continuous_feature_indices: Optional[List[int]] = None
 
        # Get continuous feature indices *from the schema*
@@ -126,27 +136,34 @@ class _BaseDatasetMaker(ABC):
         else:
             _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
 
-        X_train_values = X_train.to_numpy()
-        X_test_values = X_test.to_numpy()
+        X_train_values = X_train.to_numpy()
+        X_val_values = X_val.to_numpy()
+        X_test_values = X_test.to_numpy()
 
         # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
-            _LOGGER.info("Fitting a new PytorchScaler on training data.")
+            _LOGGER.info("Fitting a new DragonScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
-            self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+            self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices)
 
         if self.scaler and self.scaler.mean_ is not None:
-            _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+            _LOGGER.info("Applying scaler transformation to train, validation, and test feature sets.")
             X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+            X_val_tensor = self.scaler.transform(torch.tensor(X_val_values, dtype=torch.float32))
             X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
-            return X_train_tensor.numpy(), X_test_tensor.numpy()
+            return X_train_tensor.numpy(), X_val_tensor.numpy(), X_test_tensor.numpy()
 
-        return X_train_values, X_test_values
+        return X_train_values, X_val_values, X_test_values
 
     @property
     def train_dataset(self) -> Dataset:
         if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
         return self._train_ds
+
+    @property
+    def validation_dataset(self) -> Dataset:
+        if self._val_ds is None: raise RuntimeError("Dataset not yet created.")
+        return self._val_ds
 
     @property
     def test_dataset(self) -> Dataset:
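`_prepare_scaler` now threads the validation split through the same fit-on-train pipeline: the scaler is fitted only on the training split and then applied to all three splits, so no validation or test statistics leak into the normalization. A sketch of that pattern in isolation, using the `DragonScaler.fit`/`transform` calls shown in the hunk; the dataset and array variables are assumed to exist:

```python
# Sketch: fit on the training split only, then transform every split.
# temp_train_ds, continuous_feature_indices and the *_values numpy
# arrays are assumed inputs, mirroring the hunk above.
import torch

scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices)
X_train_t = scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
X_val_t = scaler.transform(torch.tensor(X_val_values, dtype=torch.float32))
X_test_t = scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
```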
@@ -181,6 +198,7 @@ class _BaseDatasetMaker(ABC):
     def dataframes_info(self) -> None:
         print("--- DataFrame Shapes After Split ---")
         print(f" X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+        print(f" X_val shape: {self._X_val_shape}, y_val shape: {self._y_val_shape}")
         print(f" X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
         print("------------------------------------")
 
@@ -200,7 +218,7 @@ class _BaseDatasetMaker(ABC):
 
     def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """
-        Saves the fitted PytorchScaler's state to a .pth file.
+        Saves the fitted DragonScaler's state to a .pth file.
 
         The filename is automatically generated based on the dataset id.
 
@@ -220,6 +238,24 @@ class _BaseDatasetMaker(ABC):
         self.scaler.save(filepath, verbose=False)
         if verbose:
             _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+    def save_class_map(self, directory: Union[str,Path], verbose: bool=True) -> None:
+        """
+        Saves the class to index mapping {str: int} to a directory.
+        """
+        if not self.class_map:
+            _LOGGER.warning(f"No class_map defined. Skipping.")
+            return
+
+        log_name = f"Class_to_Index_{self.id}" if self.id else "Class_to_Index"
+
+        custom_logger(data=self.class_map,
+                      save_directory=directory,
+                      log_name=log_name,
+                      add_timestamp=False,
+                      dict_as="json")
+        if verbose:
+            _LOGGER.info(f"Class map for '{self.id}' saved as '{log_name}.json'.")
 
     def save_artifacts(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """
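The new `save_class_map` writes the mapping through `custom_logger` with `dict_as="json"` and no timestamp, so the file name is deterministic: `Class_to_Index_<id>.json` when the dataset has an id, `Class_to_Index.json` otherwise. A hedged usage sketch; the `dataset` variable and directory are illustrative:

```python
# Illustrative: any maker with a non-empty class_map set.
dataset.save_class_map("artifacts/")  # -> artifacts/Class_to_Index_<id>.json
```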
@@ -229,19 +265,22 @@ class _BaseDatasetMaker(ABC):
         self.save_target_names(directory=directory, verbose=verbose)
         if self.scaler is not None:
             self.save_scaler(directory=directory, verbose=verbose)
+        if self.class_map:
+            self.save_class_map(directory=directory, verbose=verbose)
 
 
 # Single target dataset
-class DatasetMaker(_BaseDatasetMaker):
+class DragonDataset(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
     This class takes a DataFrame, and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
-    It can also create and apply a PytorchScaler using the schema.
+    It can also create and apply a DragonScaler using the schema.
 
     Attributes:
-        `scaler` -> PytorchScaler | None
+        `scaler` -> DragonScaler | None
         `train_dataset` -> PyTorch Dataset
+        `validation_dataset` -> PyTorch Dataset
         `test_dataset` -> PyTorch Dataset
         `feature_names` -> list[str]
         `target_names` -> list[str]
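With this change `save_artifacts` becomes a one-call export: feature names, target names, the fitted scaler (if any), and now the class map (if set) all land in the same directory. Illustrative call:

```python
dataset.save_artifacts("artifacts/run_01")  # directory name is illustrative
```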
@@ -252,27 +291,59 @@ class DatasetMaker(_BaseDatasetMaker):
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  schema: FeatureSchema,
-                 kind: Literal["regression", "classification"],
-                 test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 kind: Literal["regression", "binary classification", "multiclass classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                 validation_size: float = 0.2,
+                 test_size: float = 0.1,
+                 class_map: Optional[dict[str,int]]=None,
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
                 The pre-processed input DataFrame containing all columns. (features and single target).
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
-            kind (
-                The type of ML task.
+            kind (str):
+                The type of ML task. Must be one of:
+                - "regression"
+                - "binary classification"
+                - "multiclass classification"
+            scaler ("fit" | "none" | DragonScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new DragonScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - DragonScaler instance: Use a pre-fitted scaler to transform data.
+            validation_size (float):
+                The proportion of the *original* dataset to allocate to the validation split.
             test_size (float):
-                The proportion of the dataset to allocate to the test split.
+                The proportion of the dataset to allocate to the test split (can be 0).
+            class_map (dict[str,int] | None): Optional class map for the target classes in classification tasks. Can be set later using `.set_class_map()`.
             random_state (int):
                 The seed for the random number of generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance, if None a new scaler will be created.
+
         """
         super().__init__()
-        self.scaler = scaler
+
+        # --- Validation for split sizes ---
+        if (validation_size + test_size) >= 1.0:
+            _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+            raise ValueError()
+        elif validation_size <= 0.0:
+            _LOGGER.error(f"Invalid validation split of {validation_size}.")
+            raise ValueError()
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None # To be created
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, DragonScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
+            raise ValueError()
 
         # --- 1. Identify features (from schema) ---
         self._feature_names = list(schema.feature_names)
|
|
|
287
358
|
_LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
|
|
288
359
|
raise ValueError("No target column found in DataFrame.")
|
|
289
360
|
if len(target_cols_set) > 1:
|
|
290
|
-
_LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}.
|
|
361
|
+
_LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. One target required.")
|
|
291
362
|
raise ValueError("Ambiguous target: More than one non-feature column found.")
|
|
292
363
|
|
|
293
364
|
target_name = list(target_cols_set)[0]
|
|
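The hunk below splits twice: test data is carved off first, then the validation fraction is rescaled against the remaining train+val pool so that the user-facing `validation_size` stays a fraction of the *original* dataset. A quick arithmetic check with the defaults:

```python
validation_size, test_size = 0.2, 0.1
val_split_size = validation_size / (1.0 - test_size)  # ~0.2222
# The second split takes ~22.2% of the remaining 90%,
# which is exactly 20% of the original dataset.
assert abs((1.0 - test_size) * val_split_size - validation_size) < 1e-12
```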
@@ -297,30 +368,130 @@ class DatasetMaker(_BaseDatasetMaker):
         # --- 3. Split Data ---
         features_df = pandas_df[self._feature_names]
         target_series = pandas_df[target_name]
-
-        X_train, X_test, y_train, y_test = train_test_split(
+
+        # First split: (Train + Val) vs TesT
+        X_train_val, X_test, y_train_val, y_test = train_test_split(
             features_df,
             target_series,
             test_size=test_size,
             random_state=random_state
         )
-        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+        # Calculate validation split size relative to the (Train + Val) set
+        val_split_size = validation_size / (1.0 - test_size)
+
+        # Second split: Train vs Val
+        X_train, X_val, y_train, y_val = train_test_split(
+            X_train_val,
+            y_train_val,
+            test_size=val_split_size,
+            random_state=random_state
+        )
 
-        label_dtype = torch.int64 if kind == "classification" else torch.float32
+        self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+        self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
+
+        # --- label_dtype logic ---
+        if kind == MLTaskKeys.REGRESSION or kind == MLTaskKeys.BINARY_CLASSIFICATION:
+            label_dtype = torch.float32
+        elif kind == MLTaskKeys.MULTICLASS_CLASSIFICATION:
+            label_dtype = torch.int64
+        else:
+            _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.REGRESSION}', '{MLTaskKeys.BINARY_CLASSIFICATION}', or '{MLTaskKeys.MULTICLASS_CLASSIFICATION}'.")
+            raise ValueError()
+        self.kind = kind
 
         # --- 4. Scale (using the schema) ---
-        X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, schema
-        )
+        if _apply_scaling:
+            X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_val, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_val_final = X_val.to_numpy()
+            X_test_final = X_test.to_numpy()
 
         # --- 5. Create Datasets ---
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
+        # --- 6. create class map if given ---
+        if self.kind != MLTaskKeys.REGRESSION:
+            if class_map is None:
+                self.class_map = dict()
+            else:
+                self.set_class_map(class_map)
+        else:
+            self.class_map = dict()
+
+    def set_class_map(self, class_map: dict[str, int], force_overwrite: bool=False) -> None:
+        """
+        Sets a map of class_name -> integer_label.
+
+        This is used by the InferenceHandler and to finalize the model after training.
+
+        Args:
+            class_map (Dict[str, int]): A dictionary mapping the integer label
+                to its string name.
+                Example: {'cat': 0, 'dog': 1, 'bird': 2}
+            force_overwrite (bool): Required to overwrite a previously set class map.
+        """
+        if self.kind == MLTaskKeys.REGRESSION:
+            _LOGGER.warning(f"Class Map is for classifications tasks only.")
+            return
+
+        if self.class_map:
+            warning_message = f"Class map was previously set."
+            if not force_overwrite:
+                warning_message += " Use `force_overwrite=True` to set new values."
+                _LOGGER.warning(warning_message)
+                return
+            else:
+                warning_message += ". Setting new values..."
+                _LOGGER.warning(warning_message)
+
+        self.class_map = class_map
+
+        try:
+            sorted_items = sorted(class_map.items(), key=lambda item: item[1])
+            class_list = [item[0] for item in sorted_items]
+        except Exception as e:
+            _LOGGER.error(f"Could not sort class map. Ensure it is a dict of {str: int}. Error: {e}")
+            raise TypeError()
+        else:
+            self.classes = class_list
+
+        if self._train_ds:
+            self._train_ds._classes = class_list # type: ignore
+            self._train_ds._class_map = class_map # type: ignore
+        if self._val_ds:
+            self._val_ds._classes = class_list # type: ignore
+            self._val_ds._class_map = class_map # type: ignore
+        if self._test_ds:
+            self._test_ds._classes = class_list # type: ignore
+            self._test_ds._class_map = class_map # type: ignore
+
+        _LOGGER.info(f"Class map set for dataset '{self.id}' and its subsets:\n{class_map}")
+
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+        s += f" Target: {self.target_names[0]}\n"
+        s += f" Features: {self.number_of_features}\n"
+        s += f" Scaler: {'Fitted' if self.scaler else 'None'}\n"
+
+        if self._train_ds:
+            s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._val_ds:
+            s += f" Validation Samples: {len(self._val_ds)}\n" # type: ignore
+        if self._test_ds:
+            s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore
+
+        return s
+
 
 # --- Multi-Target Class ---
-class DatasetMakerMulti(_BaseDatasetMaker):
+class DragonDatasetMulti(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with
     multiple target columns.
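`set_class_map` stores the mapping, derives `classes` by sorting the names on their integer labels, and pushes both down into the train/validation/test `_PytorchDataset` instances; a second call is refused unless `force_overwrite=True`. A hedged usage sketch:

```python
# Sketch: dataset is a classification DragonDataset.
dataset.set_class_map({"cat": 0, "dog": 1, "bird": 2})
print(dataset.classes)  # ['cat', 'dog', 'bird'] (sorted by label)
dataset.set_class_map({"cat": 0, "dog": 1}, force_overwrite=True)  # required to replace
```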
@@ -329,16 +500,16 @@ class DatasetMakerMulti(_BaseDatasetMaker):
     *target_columns*. It validates that the schema's features and the
     target columns are mutually exclusive and together account for all
     columns in the DataFrame.
-
-    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
                  schema: FeatureSchema,
-                 test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 kind: Literal["multitarget regression", "multilabel binary classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                 validation_size: float = 0.2,
+                 test_size: float = 0.1,
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
@@ -348,20 +519,54 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                 List of target column names.
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
+            kind (str):
+                The type of multi-target ML task. Must be one of:
+                - "multitarget regression"
+                - "multilabel binary classification"
+            scaler ("fit" | "none" | DragonScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new DragonScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - DragonScaler instance: Use a pre-fitted scaler to transform data.
+            validation_size (float):
+                The proportion of the dataset to allocate to the validation split.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
                 The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance.
 
         ## Note:
         For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
-        This loss function requires the labels to be torch.float32 which is the same type required for
+        This loss function requires the labels to be torch.float32 which is the same type required for multi-regression tasks.
         """
         super().__init__()
-        self.scaler = scaler
-
+
+        # --- Validation for split sizes ---
+        if (validation_size + test_size) >= 1.0:
+            _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+            raise ValueError("validation_size and test_size sum must be < 1.0")
+        elif validation_size <= 0.0:
+            _LOGGER.error(f"Invalid validation split of {validation_size}.")
+            raise ValueError()
+
+        # --- Validate kind parameter ---
+        if kind not in [MLTaskKeys.MULTITARGET_REGRESSION, MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION]:
+            _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.MULTITARGET_REGRESSION}' or '{MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION}'.")
+            raise ValueError()
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, DragonScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
+            raise ValueError()
+
         # --- 1. Get features and targets from schema/args ---
         self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
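Construction of the multi-target maker mirrors the single-target API; labels stay `torch.float32`, which suits both multi-target regression and `nn.BCEWithLogitsLoss` for multi-label work. A sketch, with `df`, `schema`, and the target column names assumed as before:

```python
# Sketch: df, schema and the target column names are assumed inputs.
dataset = DragonDatasetMulti(
    pandas_df=df,
    target_columns=["target_a", "target_b"],
    schema=schema,
    kind="multilabel binary classification",
    scaler="fit",
    validation_size=0.2,
    test_size=0.1,
    random_state=42,
)
```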
@@ -389,413 +594,63 @@ class DatasetMakerMulti(_BaseDatasetMaker):
         # --- 3. Split Data ---
         features_df = pandas_df[self._feature_names]
         target_df = pandas_df[self._target_names]
-
-        X_train, X_test, y_train, y_test = train_test_split(
+
+        # First split: (Train + Val) vs Test
+        X_train_val, X_test, y_train_val, y_test = train_test_split(
             features_df,
             target_df,
             test_size=test_size,
             random_state=random_state
         )
-        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+
+        # Calculate validation split size relative to the (Train + Val) set
+        val_split_size = validation_size / (1.0 - test_size)
+
+        # Second split: Train vs Val
+        X_train, X_val, y_train, y_val = train_test_split(
+            X_train_val,
+            y_train_val,
+            test_size=val_split_size,
+            random_state=random_state
+        )
+
+        self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+        self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
 
         # Multi-target for regression or multi-binary
         label_dtype = torch.float32
 
         # --- 4. Scale (using the schema) ---
-        X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, schema
-        )
+        if _apply_scaling:
+            X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_val, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_val_final = X_val.to_numpy()
+            X_test_final = X_test.to_numpy()
 
         # --- 5. Create Datasets ---
         # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
-
-
-
-
-
-
-
-
-        self.
-
-
-
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """
-        The primary method to retrieve the final, processed PyTorch datasets.
-        Must be implemented by all subclasses.
-        """
-        pass
-
-
-# --- VisionDatasetMaker ---
-class VisionDatasetMaker(_BaseMaker):
-    """
-    Creates processed PyTorch datasets for computer vision tasks from an
-    image folder directory.
-
-    Uses online augmentations per epoch (image augmentation without creating new files).
-    """
-    def __init__(self, full_dataset: ImageFolder):
-        super().__init__()
-        self.full_dataset = full_dataset
-        self.labels = [s[1] for s in self.full_dataset.samples]
-        self.class_map = full_dataset.class_to_idx
-
-        self._is_split = False
-        self._are_transforms_configured = False
-
-    @classmethod
-    def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-        """Creates a maker instance from a root directory of images."""
-        initial_transform = transforms.Compose([transforms.ToTensor()])
-        full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-        _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-        return cls(full_dataset)
-
-    @staticmethod
-    def inspect_folder(path: Union[str, Path]):
-        """
-        Logs a report of the types, sizes, and channels of image files
-        found in the directory and its subdirectories.
-        """
-        path_obj = make_fullpath(path)
-
-        non_image_files = set()
-        img_types = set()
-        img_sizes = set()
-        img_channels = set()
-        img_counter = 0
-
-        _LOGGER.info(f"Inspecting folder: {path_obj}...")
-        # Use rglob to recursively find all files
-        for filepath in path_obj.rglob('*'):
-            if filepath.is_file():
-                try:
-                    # Using PIL to open is a more reliable check
-                    with Image.open(filepath) as img:
-                        img_types.add(img.format)
-                        img_sizes.add(img.size)
-                        img_channels.update(img.getbands())
-                    img_counter += 1
-                except (IOError, SyntaxError):
-                    non_image_files.add(filepath.name)
-
-        if non_image_files:
-            _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-        report = (
-            f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-            f"Total images found: {img_counter}\n"
-            f"Image formats: {img_types or 'None'}\n"
-            f"Image sizes (WxH): {img_sizes or 'None'}\n"
-            f"Image channels (bands): {img_channels or 'None'}\n"
-            f"--------------------------------------"
-        )
-        print(report)
-
-    def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                   stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-        """Splits the dataset into training, validation, and optional test sets."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        if val_size + test_size >= 1.0:
-            _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-            raise ValueError()
-
-        indices = list(range(len(self.full_dataset)))
-        labels_for_split = self.labels if stratify else None
-
-        train_indices, val_test_indices = train_test_split(
-            indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-        )
-
-        if test_size > 0:
-            val_test_labels = [self.labels[i] for i in val_test_indices]
-            stratify_val_test = val_test_labels if stratify else None
-            val_indices, test_indices = train_test_split(
-                val_test_indices, test_size=(test_size / (val_size + test_size)),
-                random_state=random_state, stratify=stratify_val_test
-            )
-            self._test_dataset = Subset(self.full_dataset, test_indices)
-            _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-        else:
-            val_indices = val_test_indices
-
-        self._train_dataset = Subset(self.full_dataset, train_indices)
-        self._val_dataset = Subset(self.full_dataset, val_indices)
-        self._is_split = True
-
-        _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-        return self
-
-    def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                             mean: List[float] = [0.485, 0.456, 0.406],
-                             std: List[float] = [0.229, 0.224, 0.225],
-                             extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-        """Configures and applies the image transformations (augmentations)."""
-        if not self._is_split:
-            _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-        if extra_train_transforms:
-            base_train_transforms.extend(extra_train_transforms)
-
-        final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-        val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-        train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-        self._train_dataset.dataset.transform = train_transform # type: ignore
-        self._val_dataset.dataset.transform = val_transform # type: ignore
-        if self._test_dataset:
-            self._test_dataset.dataset.transform = val_transform # type: ignore
-
-        self._are_transforms_configured = True
-        _LOGGER.info("Image transforms configured and applied.")
-        return self
-
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """Returns the final train, validation, and optional test datasets."""
-        if not self._is_split:
-            _LOGGER.error("Data has not been split. Call .split_data() first.")
-            raise RuntimeError()
-        if not self._are_transforms_configured:
-            _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-        if self._test_dataset:
-            return self._train_dataset, self._val_dataset, self._test_dataset
-        return self._train_dataset, self._val_dataset
-
-
-# --- SequenceMaker ---
-class SequenceMaker(_BaseMaker):
-    """
-    Creates windowed PyTorch datasets from time-series data.
-
-    Pipeline:
-
-    1. `.split_data()`: Separate time series into training and testing portions.
-    2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
-    3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
-    4. `.get_datasets()`: Return Pytorch train and test datasets.
-    """
-    def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
-        super().__init__()
-        self.sequence_length = sequence_length
-        self.scaler = None
-
-        if isinstance(data, pandas.DataFrame):
-            self.time_axis = data.index.values
-            self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
-        elif isinstance(data, pandas.Series):
-            self.time_axis = data.index.values
-            self.sequence = data.values.astype(numpy.float32)
-        elif isinstance(data, numpy.ndarray):
-            self.time_axis = numpy.arange(len(data))
-            self.sequence = data.astype(numpy.float32)
-        else:
-            _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
-            raise TypeError()
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+        s += f" Targets: {self.number_of_targets}\n"
+        s += f" Features: {self.number_of_features}\n"
+        s += f" Scaler: {'Fitted' if self.scaler else 'None'}\n"
+
+        if self._train_ds:
+            s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._val_ds:
+            s += f" Validation Samples: {len(self._val_ds)}\n" # type: ignore
+        if self._test_ds:
+            s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore
 
-
-        self.test_sequence = None
-
-        self._is_split = False
-        self._is_normalized = False
-        self._are_windows_generated = False
-
-    def normalize_data(self) -> 'SequenceMaker':
-        """
-        Normalizes the sequence data using PytorchScaler. Must be called AFTER
-        splitting to prevent data leakage from the test set.
-        """
-        if not self._is_split:
-            _LOGGER.error("Data must be split BEFORE normalizing. Call .split_data() first.")
-            raise RuntimeError()
-
-        if self.scaler:
-            _LOGGER.warning("Data has already been normalized.")
-            return self
-
-        # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
-        # The scaler expects 2D data [n_samples, n_features].
-        train_features = self.train_sequence.reshape(-1, 1) # type: ignore
-
-        # _PytorchDataset needs labels, so we create dummy ones.
-        dummy_labels = numpy.zeros(len(train_features))
-        temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
-
-        # 2. Fit the PytorchScaler on the temporary training dataset.
-        # The sequence is a single feature, so its index is [0].
-        _LOGGER.info("Fitting PytorchScaler on the training data...")
-        self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
-
-        # 3. Transform sequences using the fitted scaler.
-        # The transform method requires a tensor, so we convert, transform, and convert back.
-        train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-        test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-
-        self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
-        self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
-
-        self._is_normalized = True
-        _LOGGER.info("Sequence data normalized using PytorchScaler.")
-        return self
-
-    def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
-        """Splits the sequence into training and testing portions."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        split_idx = int(len(self.sequence) * (1 - test_size))
-        self.train_sequence = self.sequence[:split_idx]
-        self.test_sequence = self.sequence[split_idx - self.sequence_length:]
-
-        self.train_time_axis = self.time_axis[:split_idx]
-        self.test_time_axis = self.time_axis[split_idx:]
-
-        self._is_split = True
-        _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
-        return self
-
-    def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
-        """
-        Generates overlapping windows for features and labels.
-
-        "sequence-to-sequence": Label vectors are of the same size as the feature vectors instead of a single future prediction.
-        """
-        if not self._is_split:
-            _LOGGER.error("Cannot generate windows before splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence) # type: ignore
-        self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence) # type: ignore
-
-        self._are_windows_generated = True
-        _LOGGER.info("Feature and label windows generated for train and test sets.")
-        return self
-
-    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
-        """Efficiently creates windowed features and labels using numpy."""
-        if len(data) <= self.sequence_length:
-            _LOGGER.error("Data length must be greater than the sequence_length to create at least one window.")
-            raise ValueError()
-
-        if not use_sequence_labels:
-            features = data[:-1]
-            labels = data[self.sequence_length:]
-
-            n_windows = len(features) - self.sequence_length + 1
-            bytes_per_item = features.strides[0]
-            strided_features = numpy.lib.stride_tricks.as_strided(
-                features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
-            )
-            return _PytorchDataset(strided_features, labels, labels_dtype=torch.float32)
-
-        else:
-            x_data = data[:-1]
-            y_data = data[1:]
-
-            n_windows = len(x_data) - self.sequence_length + 1
-            bytes_per_item = x_data.strides[0]
-
-            strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-            strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-
-            return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
-    def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-        """Applies inverse transformation using the stored PytorchScaler."""
-        if self.scaler is None:
-            _LOGGER.error("Data was not normalized. Cannot denormalize.")
-            raise RuntimeError()
-
-        # Ensure data is a torch.Tensor
-        if isinstance(data, numpy.ndarray):
-            tensor_data = torch.tensor(data, dtype=torch.float32)
-        else:
-            tensor_data = data
-
-        # Reshape for the scaler [n_samples, n_features]
-        if tensor_data.ndim == 1:
-            tensor_data = tensor_data.view(-1, 1)
-
-        # Apply inverse transform and convert back to a flat numpy array
-        original_scale_tensor = self.scaler.inverse_transform(tensor_data)
-        return original_scale_tensor.cpu().numpy().flatten()
-
-    def plot(self, predictions: Optional[numpy.ndarray] = None):
-        """Plots the original training and testing data, with optional predictions."""
-        if not self._is_split:
-            _LOGGER.error("Cannot plot before splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        plt.figure(figsize=(15, 6))
-        plt.title("Time Series Data")
-        plt.grid(True)
-        plt.xlabel("Time")
-        plt.ylabel("Value")
-
-        plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data') # type: ignore
-        plt.plot(self.test_time_axis, self.scaler.inverse_transform(self.test_sequence[self.sequence_length-1:].reshape(-1, 1)), label='Test Data') # type: ignore
-
-        if predictions is not None:
-            pred_time_axis = self.test_time_axis[:len(predictions)]
-            plt.plot(pred_time_axis, predictions, label='Predictions', c='red')
-
-        plt.legend()
-        plt.show()
-
-    def get_datasets(self) -> Tuple[Dataset, Dataset]:
-        """Returns the final train and test datasets."""
-        if not self._are_windows_generated:
-            _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
-            raise RuntimeError()
-        return self._train_dataset, self._test_dataset
-
-
-# --- Custom Vision Transform Class ---
-class ResizeAspectFill:
-    """
-    Custom transformation to make an image square by padding it to match the
-    longest side, preserving the aspect ratio. The image is finally centered.
-
-    Args:
-        pad_color (Union[str, int]): Color to use for the padding.
-            Defaults to "black".
-    """
-    def __init__(self, pad_color: Union[str, int] = "black") -> None:
-        self.pad_color = pad_color
-
-    def __call__(self, image: Image.Image) -> Image.Image:
-        if not isinstance(image, Image.Image):
-            _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-            raise TypeError()
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
+        return s
 
 
 def info():