dragon-ml-toolbox 2.4.0__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/METADATA +7 -4
- dragon_ml_toolbox-3.1.0.dist-info/RECORD +25 -0
- ml_tools/ETL_engineering.py +49 -19
- ml_tools/GUI_tools.py +24 -25
- ml_tools/MICE_imputation.py +8 -4
- ml_tools/ML_callbacks.py +341 -0
- ml_tools/ML_evaluation.py +255 -0
- ml_tools/ML_trainer.py +344 -0
- ml_tools/ML_tutorial.py +300 -0
- ml_tools/PSO_optimization.py +27 -20
- ml_tools/RNN_forecast.py +49 -0
- ml_tools/VIF_factor.py +6 -5
- ml_tools/data_exploration.py +2 -2
- ml_tools/datasetmaster.py +601 -527
- ml_tools/ensemble_learning.py +12 -9
- ml_tools/handle_excel.py +9 -10
- ml_tools/logger.py +45 -8
- ml_tools/utilities.py +18 -1
- dragon_ml_toolbox-2.4.0.dist-info/RECORD +0 -22
- ml_tools/trainer.py +0 -346
- ml_tools/vision_helpers.py +0 -231
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/top_level.txt +0 -0
- /ml_tools/{pytorch_models.py → _pytorch_models.py} +0 -0
ml_tools/datasetmaster.py
CHANGED
|
@@ -1,606 +1,680 @@
|
|
|
1
|
-
import torch
|
|
2
|
-
from torch.utils.data import Dataset,
|
|
1
|
+
import torch
|
|
2
|
+
from torch.utils.data import Dataset, Subset
|
|
3
3
|
from torch import nn
|
|
4
4
|
import pandas
|
|
5
5
|
import numpy
|
|
6
6
|
from sklearn.model_selection import train_test_split
|
|
7
7
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
|
8
|
-
from typing import Literal, Union
|
|
8
|
+
from typing import Literal, Union, Tuple, List, Optional
|
|
9
9
|
from imblearn.combine import SMOTETomek
|
|
10
|
-
from
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from PIL import Image, ImageOps
|
|
11
12
|
from torchvision.datasets import ImageFolder
|
|
12
13
|
from torchvision import transforms
|
|
13
14
|
import matplotlib.pyplot as plt
|
|
15
|
+
from pathlib import Path
|
|
14
16
|
from .utilities import _script_info
|
|
17
|
+
from .logger import _LOGGER
|
|
15
18
|
|
|
16
19
|
|
|
20
|
+
# --- public-facing API ---
|
|
17
21
|
__all__ = [
|
|
18
22
|
"DatasetMaker",
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
23
|
+
"VisionDatasetMaker",
|
|
24
|
+
"SequenceMaker",
|
|
25
|
+
"ResizeAspectFill",
|
|
22
26
|
]
|
|
23
27
|
|
|
24
28
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
if cat_method not in ["one-hot", "embed", None]:
|
|
70
|
-
raise TypeError("cat_method must be 'one-hot', 'embed' or None.")
|
|
71
|
-
# Validate test size
|
|
72
|
-
if not isinstance(test_size, (float, int)):
|
|
73
|
-
raise TypeError("test_size must be a float in the range 0.0 to 1.0")
|
|
74
|
-
if not (1.0 >= test_size >= 0.0):
|
|
75
|
-
raise ValueError("test_size must be a float in the range 0.0 to 1.0")
|
|
76
|
-
# Validate random state
|
|
77
|
-
if not (isinstance(random_state, int) or random_state is None):
|
|
78
|
-
raise TypeError("random_state must be an integer or None.")
|
|
79
|
-
# validate normalize
|
|
80
|
-
if not (normalize in ["standard", "minmax"] or normalize is None):
|
|
81
|
-
raise TypeError("normalize must be 'standard', 'minmax' or None.")
|
|
82
|
-
# Validate cast labels
|
|
83
|
-
if not isinstance(cast_labels, bool):
|
|
84
|
-
raise TypeError("cast_labels must be either True or False.")
|
|
85
|
-
|
|
86
|
-
# Start-o
|
|
87
|
-
self._labels = pandas_df[label_col]
|
|
88
|
-
pandas_df = pandas_df.drop(columns=label_col)
|
|
89
|
-
# Set None parameters
|
|
90
|
-
self._categorical = None
|
|
91
|
-
self._continuous = None
|
|
92
|
-
self.labels_train = None
|
|
93
|
-
self.labels_test = None
|
|
94
|
-
self.labels_map = None
|
|
95
|
-
self.features_test = None
|
|
96
|
-
self.features_train = None
|
|
29
|
+
# --- Custom Vision Transform Class ---
|
|
30
|
+
class ResizeAspectFill:
    """
    Image transform that pads a PIL image into a square canvas.

    The shorter dimension is padded (split as evenly as possible between both
    sides) until it matches the longer one, so the original content keeps its
    aspect ratio and ends up centered. Square images are returned untouched.

    Args:
        pad_color (Union[str, int]): Fill value handed to `ImageOps.expand`
            for the added border. Defaults to "black".
    """
    def __init__(self, pad_color: Union[str, int] = "black") -> None:
        self.pad_color = pad_color

    def __call__(self, image: Image.Image) -> Image.Image:
        """
        Pad `image` to a square.

        Raises:
            TypeError: If `image` is not a `PIL.Image.Image`.
        """
        if not isinstance(image, Image.Image):
            raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")

        width, height = image.size
        target_side = max(width, height)
        missing_w = target_side - width
        missing_h = target_side - height

        # Nothing missing on either axis means the image is already square.
        if not (missing_w or missing_h):
            return image

        # Border order expected by ImageOps.expand: (left, top, right, bottom).
        # Any odd leftover pixel goes to the right/bottom side.
        left = missing_w // 2
        top = missing_h // 2
        border = (left, top, missing_w - left, missing_h - top)

        return ImageOps.expand(image, border, fill=self.pad_color)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# --- Internal Helper Class ---
|
|
64
|
+
class _PytorchDataset(Dataset):
    """
    Internal PyTorch `Dataset` wrapping a feature table and its labels.

    Both inputs are converted to tensors once, at construction time. NumPy
    arrays are used directly; any other input is expected to expose `.values`
    (as pandas DataFrames and Series do).

    Args:
        features: Feature matrix as a NumPy array or pandas DataFrame.
        labels: Targets as a NumPy array or pandas Series.
        features_dtype: Tensor dtype for the features (default `torch.float32`).
        labels_dtype: Tensor dtype for the labels (default `torch.int64`).
    """
    def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
                 labels: Union[numpy.ndarray, pandas.Series],
                 features_dtype: torch.dtype = torch.float32,
                 labels_dtype: torch.dtype = torch.int64):

        raw_features = features if isinstance(features, numpy.ndarray) else features.values
        self.features = torch.tensor(raw_features, dtype=features_dtype)

        raw_labels = labels if isinstance(labels, numpy.ndarray) else labels.values
        self.labels = torch.tensor(raw_labels, dtype=labels_dtype)

    def __len__(self):
        """Number of samples (rows of the feature tensor)."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the `(features, label)` pair stored at `index`."""
        return self.features[index], self.labels[index]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# --- Private Base Class ---
|
|
92
|
+
class _BaseMaker(ABC):
    """
    Abstract parent of every dataset maker in this module.

    It reserves the three dataset slots (train / test / validation) and obliges
    subclasses to expose their results through `get_datasets()`.
    """
    def __init__(self):
        # Slots start empty; subclasses fill the ones they produce.
        self._train_dataset = self._test_dataset = self._val_dataset = None

    @abstractmethod
    def get_datasets(self) -> Tuple[Dataset, ...]:
        """
        Return the final, processed PyTorch datasets.

        Every concrete maker must override this method.
        """
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# --- Refactored DatasetMaker ---
|
|
112
|
+
class DatasetMaker(_BaseMaker):
    """
    Creates processed PyTorch datasets from a Pandas DataFrame using a fluent, step-by-step interface.

    Recommended pipeline:

    - Full Control (step-by-step):
        1. Process categorical features `.process_categoricals()`
        2. Split train-test datasets `.split_data()`
        3. Normalize continuous features `.normalize_continuous()`; `.denormalize()` becomes available.
        4. [Optional][Classification only] Balance classes `.balance_data()`
        5. Get PyTorch datasets: `train, test = .get_datasets()`
        6. [Optional] Inspect the processed data as DataFrames `X_train, X_test, y_train, y_test = .inspect_dataframes()`

    - Automated (single call):
    ```python
    maker = DatasetMaker(df, label_col='target')
    maker.process() # uses simplified arguments
    train_ds, test_ds = maker.get_datasets()
    ```
    """
    def __init__(self, pandas_df: pandas.DataFrame, label_col: str):
        """
        Args:
            pandas_df: Full table holding both the features and the label column.
            label_col: Name of the column to use as the target.

        Raises:
            TypeError: If `pandas_df` is not a DataFrame.
            ValueError: If `label_col` is not a column of `pandas_df`.
        """
        super().__init__()
        if not isinstance(pandas_df, pandas.DataFrame):
            raise TypeError("Input must be a pandas.DataFrame.")
        if label_col not in pandas_df.columns:
            raise ValueError(f"Label column '{label_col}' not found in DataFrame.")

        self.labels = pandas_df[label_col]
        self.features = pandas_df.drop(columns=label_col)
        self.labels_map = None
        self.scaler = None
        # Columns the scaler was fitted on; set by normalize_continuous().
        self.scaler_columns = None

        self._is_split = False
        self._is_balanced = False
        self._is_normalized = False
        self._is_categoricals_processed = False

        self.features_train = None
        self.features_test = None
        self.labels_train = None
        self.labels_test = None

        self.continuous_columns = None

    def process_categoricals(self, method: Literal["one-hot", "embed"] = "one-hot",
                             cat_features: Union[list[str], None] = None, **kwargs) -> 'DatasetMaker':
        """
        Encodes categorical features using the specified method.

        Args:
            method (str, optional): 'one-hot' (default) or 'embed'.
            cat_features (list, optional): A list of categorical column names.
                If None, they will be inferred from the DataFrame's dtypes.
            **kwargs: Additional keyword arguments to pass to the underlying
                pandas.get_dummies() or torch.nn.Embedding() functions.
                For 'one-hot' encoding, it is often recommended to add
                `drop_first=True` to help reduce multicollinearity.
                For 'embed', `random_state` may also be given to seed the
                embedding initialisation.

        Returns:
            The maker itself, to allow call chaining.

        Raises:
            RuntimeError: If called after `.split_data()`.
            ValueError: If `method` is not 'one-hot' or 'embed'.
        """
        if self._is_split:
            raise RuntimeError("Categoricals must be processed before splitting data to avoid data leakage.")

        if cat_features is None:
            cat_columns = self.features.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
        else:
            cat_columns = list(cat_features)

        if not cat_columns:
            _LOGGER.info("No categorical features to process.")
            # With nothing to encode, every numeric feature is continuous.
            # Recording them here keeps normalize_continuous() from silently
            # skipping the scaling step.
            self.continuous_columns = self.features.select_dtypes(include="number").columns.tolist()
            self._is_categoricals_processed = True
            return self

        continuous_df = self.features.drop(columns=cat_columns)
        # store continuous column names
        self.continuous_columns = continuous_df.columns.tolist()

        categorical_df = self.features[cat_columns].copy()

        if method == "one-hot":
            # get_dummies only expands object/string/category columns by default,
            # so numeric-coded categoricals named in `cat_features` must be cast
            # first or they would pass through unencoded.
            categorical_df = categorical_df.astype("category")
            processed_cats = pandas.get_dummies(categorical_df, dtype=numpy.int32, **kwargs)
        elif method == "embed":
            processed_cats = self._embed_categorical(categorical_df, **kwargs)
        else:
            raise ValueError("`method` must be 'one-hot' or 'embed'.")

        self.features = pandas.concat([continuous_df, processed_cats], axis=1)
        self._is_categoricals_processed = True
        _LOGGER.info("Categorical features processed.")
        return self

    def normalize_continuous(self, method: Literal["standard", "minmax"] = "standard") -> 'DatasetMaker':
        """
        Normalizes the continuous features and saves the fitted scaler.

        The scaler is fitted on the training split only and then applied to
        both splits. The fitted scaler is stored in `self.scaler` and the
        scaled column names in `self.scaler_columns`.

        Args:
            method: 'standard' (StandardScaler) or 'minmax' (MinMaxScaler).

        Returns:
            The maker itself, to allow call chaining.

        Raises:
            RuntimeError: If the data has not been split yet.
            ValueError: If `method` is not 'standard' or 'minmax'.
        """
        if not self._is_split:
            raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
        if self._is_normalized:
            _LOGGER.warning("Data has already been normalized.")
            return self

        if self.continuous_columns is None:
            # process_categoricals() was never called: fall back to the numeric
            # columns of the training split instead of skipping normalization.
            self.continuous_columns = self.features_train.select_dtypes(include="number").columns.tolist()  # type: ignore

        # Use continuous features columns
        self.scaler_columns = self.continuous_columns
        if not self.scaler_columns:
            _LOGGER.info("No continuous features to normalize.")
            self._is_normalized = True
            return self

        if method == "standard":
            self.scaler = StandardScaler()
        elif method == "minmax":
            self.scaler = MinMaxScaler()
        else:
            raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")

        # Fit on training data only, then transform both
        self.features_train[self.scaler_columns] = self.scaler.fit_transform(self.features_train[self.scaler_columns])  # type: ignore
        self.features_test[self.scaler_columns] = self.scaler.transform(self.features_test[self.scaler_columns])  # type: ignore

        self._is_normalized = True
        _LOGGER.info(f"Continuous features normalized using {self.scaler.__class__.__name__}. Scaler stored in `self.scaler`.")
        return self

    def split_data(self, test_size: float = 0.2, stratify: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
        """
        Splits the data into training and testing sets.

        Non-numeric labels (object, category or string dtype) are first encoded
        as integer codes; the code -> original value mapping is stored in
        `self.labels_map`.

        Args:
            test_size: Passed to `train_test_split`.
            stratify: If True, stratify the split on the labels.
            random_state: Passed to `train_test_split`.

        Returns:
            The maker itself, to allow call chaining.
        """
        if self._is_split:
            _LOGGER.warning("Data has already been split.")
            return self

        # 'string' is included so labels are treated like the feature columns
        # that process_categoricals() considers categorical.
        if self.labels.dtype.name in ("object", "category", "string"):
            labels_numeric = self.labels.astype("category")
            self.labels_map = dict(enumerate(labels_numeric.cat.categories))
            self.labels = pandas.Series(labels_numeric.cat.codes, index=self.labels.index)
            _LOGGER.info("Labels have been encoded. Mapping stored in `self.labels_map`.")

        stratify_array = self.labels if stratify else None

        self.features_train, self.features_test, self.labels_train, self.labels_test = train_test_split(
            self.features, self.labels, test_size=test_size, random_state=random_state, stratify=stratify_array
        )

        self._is_split = True
        _LOGGER.info(f"Data split into training ({len(self.features_train)} samples) and testing ({len(self.features_test)} samples).")
        return self

    def balance_data(self, resampler=None, **kwargs) -> 'DatasetMaker':
        """
        Only useful for classification tasks.

        Balances the training data using a specified resampler.

        Defaults to `SMOTETomek`.

        Args:
            resampler: Object exposing `fit_resample(X, y)`. If None, a
                `SMOTETomek(**kwargs)` instance is created.
            **kwargs: Only used to build the default `SMOTETomek` resampler.

        Returns:
            The maker itself, to allow call chaining.

        Raises:
            RuntimeError: If the data has not been split yet.
        """
        if not self._is_split:
            raise RuntimeError("Cannot balance data before it has been split. Call .split_data() first.")
        if self._is_balanced:
            _LOGGER.warning("Training data has already been balanced.")
            return self

        if resampler is None:
            resampler = SMOTETomek(**kwargs)

        _LOGGER.info(f"Balancing training data with {resampler.__class__.__name__}...")
        # Only the training split is resampled; the test split is left untouched.
        self.features_train, self.labels_train = resampler.fit_resample(self.features_train, self.labels_train)  # type: ignore

        self._is_balanced = True
        _LOGGER.info(f"Balancing complete. New training set size: {len(self.features_train)} samples.")
        return self

    def process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
                balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
        """
        Runs a standard, fully automated preprocessing pipeline.

        Steps: encode categoricals, stratified train/test split, normalize
        continuous features and, if `balance` is True, balance the training set.

        Args:
            test_size: Fraction forwarded to `.split_data()`.
            cat_method: Encoding used by `.process_categoricals()`.
            normalize_method: Scaler used by `.normalize_continuous()`.
            balance: Whether to call `.balance_data()` with its defaults.
            random_state: Seed for the split and, with 'embed', for the
                embedding initialisation.

        Returns:
            The maker itself, to allow call chaining.
        """
        _LOGGER.info("--- Running Automated Processing Pipeline ---")
        if cat_method == "embed":
            # Forward the seed so the whole automated pipeline is reproducible;
            # it is not a valid pandas.get_dummies() argument, hence the branch.
            self.process_categoricals(method=cat_method, random_state=random_state)
        else:
            self.process_categoricals(method=cat_method)
        self.split_data(test_size=test_size, stratify=True, random_state=random_state)
        self.normalize_continuous(method=normalize_method)
        if balance:
            self.balance_data()
        _LOGGER.info("--- Automated Processing Complete ---")
        return self

    def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:
        """
        Applies inverse transformation to denormalize data, preserving DataFrame
        structure if provided.

        Args:
            data: The normalized data to be transformed back to its original scale.
                Can be a PyTorch Tensor, NumPy array, or Pandas DataFrame.
                If a DataFrame, it must contain the columns that were originally scaled.

        Returns:
            The denormalized data. Returns a Pandas DataFrame if the input was a
            DataFrame, otherwise returns a NumPy array.

        Raises:
            RuntimeError: If no scaler has been fitted.
            ValueError: If required columns are missing (DataFrame input) or the
                number of columns differs from what the scaler was fitted on.
        """
        if self.scaler is None:
            raise RuntimeError("Data was not normalized. Cannot denormalize.")

        if isinstance(data, pandas.DataFrame):
            # Work on a copy so the caller's DataFrame is left unchanged.
            if not all(col in data.columns for col in self.scaler_columns):  # type: ignore
                raise ValueError(f"Input DataFrame is missing one or more required columns for denormalization. Required: {self.scaler_columns}")

            output_df = data.copy()
            output_df[self.scaler_columns] = self.scaler.inverse_transform(data[self.scaler_columns])  # type: ignore
            return output_df

        # Handle tensor or numpy array input
        if isinstance(data, torch.Tensor):
            # detach() first: .numpy() refuses tensors that require grad
            # (e.g. raw model outputs).
            data_np = data.detach().cpu().numpy()
        else:  # It's already a numpy array
            data_np = data

        if data_np.ndim == 1:
            data_np = data_np.reshape(-1, 1)

        if data_np.shape[1] != len(self.scaler_columns):  # type: ignore
            raise ValueError(f"Input array has {data_np.shape[1]} columns, but scaler was fitted on {len(self.scaler_columns)} columns.")  # type: ignore

        return self.scaler.inverse_transform(data_np)

    def get_datasets(self) -> Tuple[_PytorchDataset, _PytorchDataset]:
        """
        Primary method to get the final PyTorch Datasets.

        Returns:
            `(train_dataset, test_dataset)` built from the current splits.

        Raises:
            RuntimeError: If the data has not been split yet.
        """
        if not self._is_split:
            raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")

        # NOTE(review): the datasets are built with _PytorchDataset's default
        # dtypes, so labels are cast to torch.int64 — confirm this is intended
        # for non-integer targets.
        self._train_dataset = _PytorchDataset(self.features_train, self.labels_train)  # type: ignore
        self._test_dataset = _PytorchDataset(self.features_test, self.labels_test)  # type: ignore

        return self._train_dataset, self._test_dataset

    def inspect_dataframes(self) -> Tuple[pandas.DataFrame, pandas.DataFrame, pandas.Series, pandas.Series]:
        """
        Utility method to inspect the processed data as Pandas DataFrames.

        Returns:
            `(features_train, features_test, labels_train, labels_test)`.

        Raises:
            RuntimeError: If the data has not been split yet.
        """
        if not self._is_split:
            raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
        return self.features_train, self.features_test, self.labels_train, self.labels_test  # type: ignore

    @staticmethod
    def _embed_categorical(cat_df: pandas.DataFrame, random_state: Optional[int] = None, **kwargs) -> pandas.DataFrame:
        """
        Internal helper to perform embedding on categorical features.

        Each column is converted to category codes and passed through a freshly
        initialised `nn.Embedding` of dimension `min(50, (n_categories + 1) // 2)`.
        The embeddings are not trained here.

        Args:
            cat_df: DataFrame holding only the categorical columns.
            random_state: If not None, seeds torch before each embedding is created.
            **kwargs: Forwarded to `nn.Embedding`.

        Returns:
            DataFrame indexed like `cat_df` with columns `<col>_1 ... <col>_<dim>`.
        """
        embedded_tensors = []
        new_columns = []
        for col in cat_df.columns:
            cat_series = cat_df[col].astype("category")
            num_categories = len(cat_series.cat.categories)
            embedding_dim = min(50, (num_categories + 1) // 2)

            # `is not None` rather than truthiness, so that a seed of 0 is honoured.
            if random_state is not None:
                torch.manual_seed(random_state)

            embedder = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_dim, **kwargs)

            with torch.no_grad():
                codes = torch.tensor(cat_series.cat.codes.to_numpy(), dtype=torch.long)
                embedded_tensors.append(embedder(codes))

            new_columns.extend(f"{col}_{i + 1}" for i in range(embedding_dim))

        with torch.no_grad():
            full_tensor = torch.cat(embedded_tensors, dim=1)
        return pandas.DataFrame(full_tensor.numpy(), columns=new_columns, index=cat_df.index)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
# --- VisionDatasetMaker ---
|
|
373
|
+
class VisionDatasetMaker(_BaseMaker):
|
|
374
|
+
"""
|
|
375
|
+
Creates processed PyTorch datasets for computer vision tasks from an
|
|
376
|
+
image folder directory.
|
|
312
377
|
|
|
378
|
+
Uses online augmentations per epoch (image augmentation without creating new files).
|
|
379
|
+
"""
|
|
380
|
+
def __init__(self, full_dataset: ImageFolder):
    """
    Wrap an already-loaded image dataset.

    Args:
        full_dataset: Dataset whose `samples` entries carry the class index
            in position 1 and which exposes `class_to_idx`.
    """
    super().__init__()

    # Pipeline state flags.
    self._is_split = False
    self._are_transforms_configured = False

    self.full_dataset = full_dataset
    self.class_map = full_dataset.class_to_idx
    # One class index per sample, in dataset order (used for stratified splits).
    self.labels = [sample[1] for sample in full_dataset.samples]
|
|
388
|
+
|
|
389
|
+
@classmethod
def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
    """
    Alternate constructor: build a maker from a root directory of images.

    The directory is loaded through `ImageFolder` with a `ToTensor`-only
    transform, and the number of images and classes found is logged.
    """
    to_tensor_only = transforms.Compose([transforms.ToTensor()])
    dataset = ImageFolder(root=root_dir, transform=to_tensor_only)

    n_images = len(dataset)
    n_classes = len(dataset.classes)
    _LOGGER.info(f"Found {n_images} images in {n_classes} classes.")

    return cls(dataset)
|
|
396
|
+
|
|
313
397
|
@staticmethod
def inspect_folder(path: Union[str, Path]):
    """
    Walk a directory tree and log a summary of the image files it contains:
    formats, sizes and channel bands.

    Files that PIL cannot open are collected and reported in a warning.
    If `path` is not a directory, an error is logged and nothing else happens.
    """
    root = Path(path)
    if not root.is_dir():
        _LOGGER.error(f"Path is not a valid directory: {root}")
        return

    unreadable = set()
    formats = set()
    sizes = set()
    bands = set()
    n_images = 0

    _LOGGER.info(f"Inspecting folder: {root}...")
    # rglob('*') yields every entry below `root`, at any depth.
    for candidate in root.rglob('*'):
        if not candidate.is_file():
            continue
        try:
            # Let PIL decide whether the file is an image.
            with Image.open(candidate) as img:
                formats.add(img.format)
                sizes.add(img.size)
                bands.update(img.getbands())
                n_images += 1
        except (IOError, SyntaxError):
            unreadable.add(candidate.name)

    if unreadable:
        _LOGGER.warning(f"Non-image or corrupted files found and ignored: {unreadable}")

    report = (
        f"\n--- Inspection Report for '{root.name}' ---\n"
        f"Total images found: {n_images}\n"
        f"Image formats: {formats or 'None'}\n"
        f"Image sizes (WxH): {sizes or 'None'}\n"
        f"Image channels (bands): {bands or 'None'}\n"
        f"--------------------------------------"
    )
    _LOGGER.info(report)
|
|
440
|
+
|
|
441
|
+
def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
               stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
    """
    Splits the dataset into training, validation, and optional test sets.

    Args:
        val_size: Fraction of the full dataset used for validation. Must be > 0.
        test_size: Fraction of the full dataset used for testing. Must be >= 0;
            with 0 no test set is created.
        stratify: If True, `self.labels` is passed to `train_test_split` so the
            class proportions are preserved in every split.
        random_state: Seed forwarded to both `train_test_split` calls.

    Returns:
        self, to allow method chaining.

    Raises:
        ValueError: If `val_size` is not positive, `test_size` is negative, or
            their sum is not smaller than 1.
    """
    if self._is_split:
        _LOGGER.warning("Data has already been split.")
        return self

    # Validate the fractions up front. A negative test_size would otherwise
    # silently skip the test split while still shrinking the hold-out set,
    # and a non-positive val_size only fails later inside train_test_split
    # with a message that does not mention val_size.
    if val_size <= 0.0:
        raise ValueError("val_size must be greater than 0.")
    if test_size < 0.0:
        raise ValueError("test_size must not be negative.")
    if val_size + test_size >= 1.0:
        raise ValueError("The sum of val_size and test_size must be less than 1.")

    indices = list(range(len(self.full_dataset)))
    labels_for_split = self.labels if stratify else None

    # First cut: training indices versus a combined validation(+test) hold-out.
    train_indices, val_test_indices = train_test_split(
        indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
    )

    if test_size > 0:
        # Second cut: divide the hold-out into validation and test, keeping the
        # requested proportions relative to the hold-out size.
        val_test_labels = [self.labels[i] for i in val_test_indices]
        stratify_val_test = val_test_labels if stratify else None
        val_indices, test_indices = train_test_split(
            val_test_indices, test_size=(test_size / (val_size + test_size)),
            random_state=random_state, stratify=stratify_val_test
        )
        self._test_dataset = Subset(self.full_dataset, test_indices)
        _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
    else:
        val_indices = val_test_indices

    self._train_dataset = Subset(self.full_dataset, train_indices)
    self._val_dataset = Subset(self.full_dataset, val_indices)
    self._is_split = True

    _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
    return self
|
|
476
|
+
|
|
477
|
+
def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
                         mean: List[float] = [0.485, 0.456, 0.406],
                         std: List[float] = [0.229, 0.224, 0.225],
                         extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
    """
    Configures and applies the image transformations (augmentations).

    The training pipeline is RandomResizedCrop(crop_size) + RandomHorizontalFlip
    (+ `extra_train_transforms`), followed by ToTensor and Normalize. The
    validation/test pipeline is Resize(resize_size) + CenterCrop(crop_size),
    followed by ToTensor and Normalize.

    Args:
        resize_size: Target size for the validation/test resize step.
        crop_size: Crop size used by both pipelines.
        mean: Per-channel means passed to Normalize (not modified here).
        std: Per-channel standard deviations passed to Normalize (not modified here).
        extra_train_transforms: Optional transforms appended to the training
            augmentations before ToTensor/Normalize.

    Returns:
        self, to allow method chaining.

    Raises:
        RuntimeError: If called before `.split_data()`.
    """
    import copy  # local import: only needed for the per-split dataset copies below

    if not self._is_split:
        raise RuntimeError("Transforms must be configured AFTER splitting data. Call .split_data() first.")

    base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
    if extra_train_transforms:
        base_train_transforms.extend(extra_train_transforms)

    final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]

    val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
    train_transform = transforms.Compose([*base_train_transforms, *final_transforms])

    def _with_own_transform(subset, transform):
        # All subsets created by split_data() wrap the SAME underlying dataset
        # object, so assigning `subset.dataset.transform` on one of them would
        # overwrite the transform of the others (the last assignment wins and
        # training would lose its augmentations). A shallow copy gives each
        # split its own `transform` attribute while still sharing the sample
        # list. NOTE(review): assumes the underlying dataset reads
        # `self.transform` at item access time (as the original code did).
        private_view = copy.copy(subset.dataset)
        private_view.transform = transform
        return Subset(private_view, subset.indices)

    self._train_dataset = _with_own_transform(self._train_dataset, train_transform)  # type: ignore
    self._val_dataset = _with_own_transform(self._val_dataset, val_transform)  # type: ignore
    if self._test_dataset is not None:
        self._test_dataset = _with_own_transform(self._test_dataset, val_transform)  # type: ignore

    self._are_transforms_configured = True
    _LOGGER.info("Image transforms configured and applied.")
    return self
|
|
502
|
+
|
|
503
|
+
def get_datasets(self) -> Tuple[Dataset, ...]:
    """
    Return the prepared datasets as a tuple.

    The tuple is (train, validation) or, when a test set exists,
    (train, validation, test). A warning is logged if transforms were never
    configured.

    Raises:
        RuntimeError: If the data has not been split yet.
    """
    if not self._is_split:
        raise RuntimeError("Data has not been split. Call .split_data() first.")
    if not self._are_transforms_configured:
        _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")

    # Train and validation are always present; the test set is appended only if it exists.
    splits = [self._train_dataset, self._val_dataset]
    if self._test_dataset:
        splits.append(self._test_dataset)
    return tuple(splits)
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
# --- SequenceMaker ---
|
|
516
|
+
class SequenceMaker(_BaseMaker):
    """
    Creates windowed PyTorch datasets from time-series data.

    Pipeline:

    1. `.split_data()`: Separate time series into training and testing portions.
    2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
    3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
    """
    def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
        """
        Store the raw sequence and its time axis.

        Args:
            data: DataFrame (first column is used), Series, or numpy array.
                For pandas inputs the index becomes the time axis; for arrays
                a 0..n-1 range is used.
            sequence_length: Number of points in each feature window.

        Raises:
            TypeError: If `data` is not one of the supported types.
        """
        super().__init__()
        self.sequence_length = sequence_length
        self.scaler = None

        if isinstance(data, pandas.DataFrame):
            self.time_axis = data.index.values
            self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
        elif isinstance(data, pandas.Series):
            self.time_axis = data.index.values
            self.sequence = data.values.astype(numpy.float32)
        elif isinstance(data, numpy.ndarray):
            self.time_axis = numpy.arange(len(data))
            self.sequence = data.astype(numpy.float32)
        else:
            raise TypeError("Data must be a pandas DataFrame/Series or a numpy array.")

        # Populated by split_data().
        self.train_sequence = None
        self.test_sequence = None
        self.train_time_axis = None
        self.test_time_axis = None

        self._is_split = False
        self._is_normalized = False
        self._are_windows_generated = False

    def normalize_data(self, method: Literal["standard", "minmax"] = "minmax") -> 'SequenceMaker':
        """
        Normalizes the sequence data. Must be called AFTER splitting to prevent data leakage from the test set.

        Args:
            method: "standard" uses StandardScaler, "minmax" uses
                MinMaxScaler with feature_range=(-1, 1).

        Returns:
            self, to allow method chaining.

        Raises:
            RuntimeError: If the data has not been split yet.
            ValueError: If `method` is not a supported name.
        """
        if not self._is_split:
            raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")

        # Rely on the explicit flag rather than the truthiness of the scaler
        # object: the flag is only set once the transform actually succeeded.
        if self._is_normalized:
            _LOGGER.warning("Data has already been normalized.")
            return self

        if method == "standard":
            self.scaler = StandardScaler()
        elif method == "minmax":
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
        else:
            raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")

        # Fit scaler ONLY on the training data
        self.scaler.fit(self.train_sequence.reshape(-1, 1))  # type: ignore

        # Transform both train and test data using the fitted scaler
        self.train_sequence = self.scaler.transform(self.train_sequence.reshape(-1, 1)).flatten()  # type: ignore
        self.test_sequence = self.scaler.transform(self.test_sequence.reshape(-1, 1)).flatten()  # type: ignore

        self._is_normalized = True
        _LOGGER.info(f"Sequence data normalized using {self.scaler.__class__.__name__}. Scaler was fit on the training set only.")
        return self

    def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
        """
        Splits the sequence into training and testing portions.

        The test portion is prefixed with the last `sequence_length` training
        points so that the first test window can predict the first test point.

        Args:
            test_size: Fraction of the sequence reserved for testing,
                strictly between 0 and 1.

        Returns:
            self, to allow method chaining.

        Raises:
            ValueError: If `test_size` is out of range, or if the resulting
                split leaves no test points or a training portion that is not
                longer than `sequence_length`.
        """
        if self._is_split:
            _LOGGER.warning("Data has already been split.")
            return self

        if not 0.0 < test_size < 1.0:
            raise ValueError("test_size must be strictly between 0 and 1.")

        split_idx = int(len(self.sequence) * (1 - test_size))
        if split_idx >= len(self.sequence):
            raise ValueError("test_size is too small: the split leaves no test points.")
        # Without this check `split_idx - sequence_length` can become negative,
        # and a negative slice start silently wraps around to the end of the array.
        if split_idx <= self.sequence_length:
            raise ValueError("The training portion must be longer than sequence_length. "
                             "Use more data, a smaller test_size, or a shorter sequence_length.")

        self.train_sequence = self.sequence[:split_idx]
        self.test_sequence = self.sequence[split_idx - self.sequence_length:]

        self.train_time_axis = self.time_axis[:split_idx]
        self.test_time_axis = self.time_axis[split_idx:]

        self._is_split = True
        _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
        return self

    def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
        """
        Generates overlapping windows for features and labels.

        "sequence-to-sequence": Label vectors are of the same size as the feature vectors instead of a single future prediction.

        Returns:
            self, to allow method chaining.

        Raises:
            RuntimeError: If the data has not been split yet.
        """
        if not self._is_split:
            raise RuntimeError("Cannot generate windows before splitting data. Call .split_data() first.")

        self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence)  # type: ignore
        self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence)  # type: ignore

        self._are_windows_generated = True
        _LOGGER.info("Feature and label windows generated for train and test sets.")
        return self

    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> _PytorchDataset:
        """
        Efficiently creates windowed features and labels using numpy.

        Feature window i is data[i : i + sequence_length]. The label is either
        the single following point data[i + sequence_length], or (sequence
        labels) the window shifted one step ahead, data[i + 1 : i + 1 + sequence_length].

        Raises:
            ValueError: If `data` is not longer than `sequence_length`.
        """
        if len(data) <= self.sequence_length:
            raise ValueError("Data length must be greater than the sequence_length to create at least one window.")

        features = data[:-1]
        n_windows = len(features) - self.sequence_length + 1
        window_shape = (n_windows, self.sequence_length)

        def _strided(values: numpy.ndarray) -> numpy.ndarray:
            # Overlapping windows as a view: consecutive rows and consecutive
            # columns both advance by one item of the source array.
            step = values.strides[0]
            return numpy.lib.stride_tricks.as_strided(values, shape=window_shape, strides=(step, step))

        if not use_sequence_labels:
            labels = data[self.sequence_length:]
            return _PytorchDataset(_strided(features), labels, labels_dtype=torch.float32)

        shifted = data[1:]
        return _PytorchDataset(_strided(features), _strided(shifted), labels_dtype=torch.float32)

    def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
        """
        Applies inverse transformation using the stored scaler.

        Returns:
            A flattened numpy array in the original scale.

        Raises:
            RuntimeError: If no scaler has been fitted.
        """
        if self.scaler is None:
            raise RuntimeError("Data was not normalized. Cannot denormalize.")

        if isinstance(data, torch.Tensor):
            data_np = data.cpu().detach().numpy()
        else:
            data_np = numpy.asarray(data)

        return self.scaler.inverse_transform(data_np.reshape(-1, 1)).flatten()

    def _to_original_scale(self, values: numpy.ndarray) -> numpy.ndarray:
        """Undo the normalization for plotting; returns `values` unchanged if no scaler was fitted."""
        if self.scaler is None:
            return values
        return self.scaler.inverse_transform(values.reshape(-1, 1))

    def plot(self, predictions: Optional[numpy.ndarray] = None):
        """
        Plots the original training and testing data, with optional predictions.

        Works whether or not `.normalize_data()` was called: normalized data is
        mapped back to the original scale, unnormalized data is plotted as is.

        Args:
            predictions: Optional values plotted against the first
                `len(predictions)` points of the test time axis.

        Raises:
            RuntimeError: If the data has not been split yet.
        """
        if not self._is_split:
            raise RuntimeError("Cannot plot before splitting data. Call .split_data() first.")

        plt.figure(figsize=(15, 6))
        plt.title("Time Series Data")
        plt.grid(True)
        plt.xlabel("Time")
        plt.ylabel("Value")

        plt.plot(self.train_time_axis, self._to_original_scale(self.train_sequence), label='Train Data')  # type: ignore
        # test_sequence starts with `sequence_length` look-back points taken from
        # the training portion; skip exactly those so the remaining values line
        # up one-to-one with test_time_axis.
        test_points = self.test_sequence[self.sequence_length:]  # type: ignore
        plt.plot(self.test_time_axis, self._to_original_scale(test_points), label='Test Data')  # type: ignore

        if predictions is not None:
            pred_time_axis = self.test_time_axis[:len(predictions)]  # type: ignore
            plt.plot(pred_time_axis, predictions, label='Predictions', c='red')

        plt.legend()
        plt.show()

    def get_datasets(self) -> Tuple[_PytorchDataset, _PytorchDataset]:
        """
        Returns the final train and test datasets.

        Raises:
            RuntimeError: If `.generate_windows()` has not been called.
        """
        if not self._are_windows_generated:
            raise RuntimeError("Windows have not been generated. Call .generate_windows() first.")
        return self._train_dataset, self._test_dataset
|
|
604
678
|
|
|
605
679
|
|
|
606
680
|
def info():
|