dragon-ml-toolbox 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dragon_ml_toolbox-1.1.2.dist-info/METADATA +114 -0
- dragon_ml_toolbox-1.1.2.dist-info/RECORD +16 -0
- dragon_ml_toolbox-1.1.2.dist-info/WHEEL +5 -0
- dragon_ml_toolbox-1.1.2.dist-info/top_level.txt +1 -0
- ml_tools/MICE_imputation.py +178 -0
- ml_tools/__init__.py +0 -0
- ml_tools/data_exploration.py +751 -0
- ml_tools/datasetmaster.py +595 -0
- ml_tools/ensemble_learning.py +701 -0
- ml_tools/handle_excel.py +310 -0
- ml_tools/logger.py +145 -0
- ml_tools/particle_swarm_optimization.py +467 -0
- ml_tools/pytorch_models.py +227 -0
- ml_tools/trainer.py +366 -0
- ml_tools/utilities.py +168 -0
- ml_tools/vision_helpers.py +218 -0
@@ -0,0 +1,595 @@
import torch
from torch.utils.data import Dataset, TensorDataset
from torch import nn
import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Literal, Union
from imblearn.combine import SMOTETomek
from PIL import Image
from torchvision.datasets import ImageFolder
from torchvision import transforms
import matplotlib.pyplot as plt


class DatasetMaker():
    def __init__(self, *, pandas_df: pandas.DataFrame, label_col: str, cat_features: Union[list[str], None]=None,
                 cat_method: Union[Literal["one-hot", "embed"], None]="one-hot", test_size: float=0.2, random_state: Union[int, None]=None,
                 normalize: Union[Literal["standard", "minmax"], None]="standard", cast_labels: bool=True, balance: bool=False, **kwargs):
        """
        Create Train-Test datasets from a Pandas DataFrame. Four datasets will be created:

        1. Features Train
        2. Features Test
        3. Labels Train
        4. Labels Test

        Use the method `to_pytorch()` to quickly get Train and Test PytorchDataset objects.

        `label_col` Name of the label column. If label encoding is required (str -> int), set `cast_labels=True` (default).
        A dictionary with the label mapping {code: original_name} will be stored in `labels_map`.

        `cat_features` List of column names to one-hot-encode or embed as categorical features.
        Any categorical column not in the list will not be returned.
        If `None` (default), columns containing categorical data will be inferred from the dtypes object, string and category, if any.

        `cat_method` can be set to:

        * `'one-hot'` (default) to perform One-Hot-Encoding using Pandas `get_dummies`.
        * `'embed'` to perform Embedding using PyTorch `nn.Embedding`.
        * `None` to treat all data as continuous.

        `normalize` If not None, continuous features will be normalized using Scikit-Learn's StandardScaler or MinMaxScaler.

        `balance` If True, attempts to balance the minority class(es) in the training data using Imbalanced-Learn's `SMOTETomek` algorithm.

        `**kwargs` Additional keyword parameters passed to `pandas.get_dummies()` or `torch.nn.Embedding()`,
        e.g. pandas' `drop_first=False`.
        """

        # Validate dataframe
        if not isinstance(pandas_df, pandas.DataFrame):
            raise TypeError("pandas_df must be a pandas.DataFrame object.")
        # Validate label column
        if not isinstance(label_col, (str, list)):
            raise TypeError("label_col must be a string or list of strings.")
        # Validate categorical features
        if not (isinstance(cat_features, list) or cat_features is None):
            raise TypeError("cat_features must be a list of strings or None.")
        if cat_method not in ["one-hot", "embed", None]:
            raise ValueError("cat_method must be 'one-hot', 'embed' or None.")
        # Validate test size
        if not isinstance(test_size, (float, int)):
            raise TypeError("test_size must be a float in the range 0.0 to 1.0")
        if not (1.0 >= test_size >= 0.0):
            raise ValueError("test_size must be a float in the range 0.0 to 1.0")
        # Validate random state
        if not (isinstance(random_state, int) or random_state is None):
            raise TypeError("random_state must be an integer or None.")
        # Validate normalize
        if not (normalize in ["standard", "minmax"] or normalize is None):
            raise ValueError("normalize must be 'standard', 'minmax' or None.")
        # Validate cast labels
        if not isinstance(cast_labels, bool):
            raise TypeError("cast_labels must be either True or False.")

        # Split labels from features
        self._labels = pandas_df[label_col]
        pandas_df = pandas_df.drop(columns=label_col)
        # Set None parameters
        self._categorical = None
        self._continuous = None
        self.labels_train = None
        self.labels_test = None
        self.labels_map = None
        self.features_test = None
        self.features_train = None

        # Find categorical
        cat_columns = list()
        if cat_method is not None:
            if cat_features is None:
                # find categorical columns from Object, String or Category dtypes automatically
                for column_ in pandas_df.columns:
                    if pandas_df[column_].dtype == object or pandas_df[column_].dtype == 'string' or pandas_df[column_].dtype.name == 'category':
                        cat_columns.append(column_)
            else:
                cat_columns = cat_features

        # Handle categorical data if required
        if len(cat_columns) > 0:
            # Set continuous/categorical data if categorical detected
            self._continuous = pandas_df.drop(columns=cat_columns)
            self._categorical = pandas_df[cat_columns].copy()

            # Perform one-hot-encoding
            if cat_method == "one-hot":
                for col_ in cat_columns:
                    self._categorical[col_] = self._categorical[col_].astype("category")
                self._categorical = pandas.get_dummies(data=self._categorical, dtype=numpy.int32, **kwargs)
            # Perform embedding
            else:
                self._categorical = self.embed_categorical(cat_df=self._categorical, random_state=random_state, **kwargs)

            # Something went wrong?
            if self._categorical.empty:
                raise ValueError("Categorical data couldn't be processed.")
        else:
            # Assume all data is continuous
            if not pandas_df.empty:
                self._continuous = pandas_df

        # Map labels
        if cast_labels:
            labels_ = self._labels.astype("category")
            # Get mapping {code: original_name}
            self.labels_map = {key: value for key, value in enumerate(labels_.cat.categories)}
            self._labels = labels_.cat.codes

        # Train-Test splits
        if self._continuous is not None and self._categorical is not None:
            (continuous_train, continuous_test,
             categorical_train, categorical_test,
             self.labels_train, self.labels_test) = train_test_split(self._continuous, self._categorical, self._labels,
                                                                     test_size=test_size, random_state=random_state)
        elif self._categorical is None:
            continuous_train, continuous_test, self.labels_train, self.labels_test = train_test_split(self._continuous, self._labels,
                                                                                                      test_size=test_size, random_state=random_state)
        elif self._continuous is None:
            categorical_train, categorical_test, self.labels_train, self.labels_test = train_test_split(self._categorical, self._labels,
                                                                                                        test_size=test_size, random_state=random_state)

        # Normalize continuous features
        if normalize is not None and self._continuous is not None:
            continuous_train, continuous_test = self.normalize_continuous(train_set=continuous_train, test_set=continuous_test, method=normalize)

        # Merge continuous and categorical
        if self._categorical is not None and self._continuous is not None:
            self.features_train = pandas.concat(objs=[continuous_train, categorical_train], axis=1)
            self.features_test = pandas.concat(objs=[continuous_test, categorical_test], axis=1)
        elif self._continuous is not None:
            self.features_train = continuous_train
            self.features_test = continuous_test
        elif self._categorical is not None:
            self.features_train = categorical_train
            self.features_test = categorical_test

        # Balance train dataset
        if balance and self.features_train is not None and self.labels_train is not None:
            self.features_train, self.labels_train = self.balance_classes(train_features=self.features_train, train_labels=self.labels_train)

    def to_pytorch(self):
        """
        Convert the train and test features and labels to PytorchDataset objects with default dtypes.

        Returns: tuple (train dataset, test dataset)
        """
        train = None
        test = None
        # Train set
        if self.labels_train is not None and self.features_train is not None:
            train = PytorchDataset(features=self.features_train, labels=self.labels_train)
        # Test set
        if self.labels_test is not None and self.features_test is not None:
            test = PytorchDataset(features=self.features_test, labels=self.labels_test)

        return train, test

    @staticmethod
    def embed_categorical(cat_df: pandas.DataFrame, random_state: Union[int, None]=None, **kwargs) -> pandas.DataFrame:
        """
        Takes a DataFrame object containing categorical data only.

        Calculates an embedding dimension for each categorical feature as `(number_of_categories + 1) // 2`, capped at 50.

        Applies embedding using PyTorch and returns a Pandas DataFrame with the embedded features.

        Note: the `nn.Embedding` weights are randomly initialized and never trained here, so the result is a fixed random projection of the category codes.
        """
        df = cat_df.copy()
        embedded_tensors = list()
        columns = list()
        for col in df.columns:
            df[col] = df[col].astype("category")
            # Get number of categories
            size: int = df[col].cat.categories.size
            # Embedding dimension
            embedding_dim: int = min(50, (size + 1) // 2)
            # Create an instance of the Embedding layer
            with torch.no_grad():
                # Compare against None explicitly so that a seed of 0 is not ignored
                if random_state is not None:
                    torch.manual_seed(random_state)
                embedder = nn.Embedding(num_embeddings=size, embedding_dim=embedding_dim, **kwargs)
            # Embed the column of features and store the tensor
            embedded_tensors.append(embedder(torch.LongTensor(df[col].cat.codes.copy().values)))
            # Preserve column names for embedded features
            for i in range(1, embedding_dim + 1):
                columns.append(f"{col}_{i}")

        # Concatenate tensors
        with torch.no_grad():
            tensor = torch.cat(tensors=embedded_tensors, dim=1)
        # Convert to dataframe
        return pandas.DataFrame(data=tensor.numpy(), columns=columns)

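    # Example (an illustrative sketch, not part of the released module; the toy
    # column names and values are made up):
    #
    # >>> df = pandas.DataFrame({"color": ["red", "green", "blue", "red"],
    # ...                        "size": ["S", "M", "L", "M"]})
    # >>> embedded = DatasetMaker.embed_categorical(cat_df=df, random_state=42)
    # >>> list(embedded.columns)   # 3 categories -> (3 + 1) // 2 = 2 dims per column
    # ['color_1', 'color_2', 'size_1', 'size_2']
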
    @staticmethod
    def normalize_continuous(train_set: Union[numpy.ndarray, pandas.DataFrame, pandas.Series], test_set: Union[numpy.ndarray, pandas.DataFrame, pandas.Series],
                             method: Literal["standard", "minmax"]="standard"):
        """
        Takes a train and a test dataset, then returns the scaled datasets as a tuple (train, test).

        `method`: Standardization by mean and variance ('standard') or Min-Max normalization ('minmax').

        The scaler is fitted on the training set only, so no information leaks from the test set.

        Output type matches input type: nD-array, DataFrame or Series.
        """
        if method == "standard":
            scaler = StandardScaler()
        elif method == "minmax":
            scaler = MinMaxScaler()
        else:
            raise ValueError("Normalization method must be 'standard' or 'minmax'.")

        if isinstance(train_set, pandas.Series):
            # Scikit-learn scalers expect 2D input; reshape the Series values and flatten back
            X_train = pandas.Series(data=scaler.fit_transform(train_set.values.reshape(-1, 1)).ravel(), index=train_set.index)
            X_test = pandas.Series(data=scaler.transform(test_set.values.reshape(-1, 1)).ravel(), index=test_set.index)
        elif isinstance(train_set, pandas.DataFrame):
            cols = train_set.columns
            X_train = pandas.DataFrame(data=scaler.fit_transform(train_set), index=train_set.index, columns=cols)
            X_test = pandas.DataFrame(data=scaler.transform(test_set), index=test_set.index, columns=cols)
        else:
            X_train = scaler.fit_transform(train_set)
            X_test = scaler.transform(test_set)

        return X_train, X_test

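    # Example (an illustrative sketch): the scaler is fitted on the train set and
    # only applied to the test set.
    # >>> train = pandas.DataFrame({"x": [1.0, 2.0, 3.0]})
    # >>> test = pandas.DataFrame({"x": [4.0]})
    # >>> tr, te = DatasetMaker.normalize_continuous(train_set=train, test_set=test, method="minmax")
    # >>> float(te["x"].iloc[0])   # (4 - 1) / (3 - 1), using the train min/max
    # 1.5
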
    @staticmethod
    def balance_classes(train_features, train_labels, **kwargs):
        """
        Attempts to balance the minority class(es) using Imbalanced-Learn's `SMOTETomek` algorithm.
        """
        resampler = SMOTETomek(**kwargs)
        X, y = resampler.fit_resample(X=train_features, y=train_labels)

        return X, y


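# Example usage of DatasetMaker (an illustrative sketch; the toy DataFrame and
# column names are made up for the demo):
#
# >>> df = pandas.DataFrame({"age": [25, 32, 47, 51, 19, 38],
# ...                        "city": ["A", "B", "A", "B", "A", "B"],
# ...                        "target": ["yes", "no", "yes", "no", "no", "yes"]})
# >>> maker = DatasetMaker(pandas_df=df, label_col="target", test_size=0.5, random_state=42)
# >>> maker.labels_map   # 'city' is one-hot encoded, 'age' is standardized
# {0: 'no', 1: 'yes'}
# >>> train_ds, test_ds = maker.to_pytorch()   # ready for a torch DataLoader
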
class PytorchDataset(Dataset):
    def __init__(self, features: Union[numpy.ndarray, pandas.Series, pandas.DataFrame], labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                 features_dtype: torch.dtype=torch.float32, labels_dtype: torch.dtype=torch.int64, balance: bool=False) -> None:
        """
        Make a PyTorch dataset of Features and Labels cast to Tensors.

        Defaults: `float32` for features and `int64` for labels.

        If `balance=True`, attempts to balance the minority class(es) using Imbalanced-Learn's `SMOTETomek` algorithm.
        Note: only train data should be balanced.
        """
        # Validate features
        if not isinstance(features, (pandas.DataFrame, pandas.Series, numpy.ndarray)):
            raise TypeError("features must be a numpy.ndarray, pandas.Series or pandas.DataFrame")
        # Validate labels
        if not isinstance(labels, (pandas.DataFrame, pandas.Series, numpy.ndarray)):
            raise TypeError("labels must be a numpy.ndarray, pandas.Series or pandas.DataFrame")

        # Balance classes
        if balance:
            features, labels = self.balance_classes(train_features=features, train_labels=labels)

        # Cast features
        if isinstance(features, numpy.ndarray):
            self.features = torch.tensor(features, dtype=features_dtype)
        else:
            self.features = torch.tensor(features.values, dtype=features_dtype)

        # Cast labels
        if isinstance(labels, numpy.ndarray):
            self.labels = torch.tensor(labels, dtype=labels_dtype)
        else:
            self.labels = torch.tensor(labels.values, dtype=labels_dtype)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

    @staticmethod
    def balance_classes(train_features, train_labels, **kwargs):
        """
        Attempts to balance the minority class(es) using Imbalanced-Learn's `SMOTETomek` algorithm.
        """
        resampler = SMOTETomek(**kwargs)
        X, y = resampler.fit_resample(X=train_features, y=train_labels)

        return X, y


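# Example (an illustrative sketch): wrapping numpy arrays and batching with a DataLoader.
# >>> from torch.utils.data import DataLoader
# >>> X = numpy.random.rand(8, 3)
# >>> y = numpy.array([0, 1, 0, 1, 0, 1, 0, 1])
# >>> ds = PytorchDataset(features=X, labels=y)
# >>> xb, yb = next(iter(DataLoader(ds, batch_size=4)))
# >>> xb.shape, yb.dtype
# (torch.Size([4, 3]), torch.int64)
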
def make_vision_dataset(inputs: Union[list[Image.Image], numpy.ndarray, str], labels: Union[list[int], numpy.ndarray, None], resize: int=256,
                        transform: Union[transforms.Compose, None]=None, test_set: bool=False):
    """
    Make a Torchvision Dataset of images to be used in a Convolutional Neural Network.

    If no transform object is given, images undergo the following default transformations: `RandomHorizontalFlip`, `RandomRotation`,
    `Resize`, `CenterCrop`, `ToTensor`, `Normalize`, except when `test_set=True`.

    Args:
        `inputs`: List of PIL Image objects | Numpy array of image arrays | Path to a root directory whose subdirectories classify the image files.

        `labels`: List of integer values | Numpy array of labels. Labels size must match `inputs` size.
        If a path to a directory is given, then `labels` must be None.

        `transform`: Custom transformations to use. If None, use default transformations.

        `test_set`: Flip, rotation and center-crop transformations will not be applied.

    Returns:
        `Dataset`: Either a `TensorDataset` or an `ImageFolder` instance, depending on the inputs given.
        Data dimensions: (samples, color channels, height, width).
    """
    # Validate inputs
    if not isinstance(inputs, (list, numpy.ndarray, str)):
        raise TypeError("Inputs must be one of the following:\n\ta) List of PIL Image objects.\n\tb) Numpy array of 2D or 3D arrays.\n\tc) Directory path to image files.")
    # Validate labels
    if not (isinstance(labels, (list, numpy.ndarray)) or labels is None):
        raise TypeError("Labels must be one of the following:\n\ta) List of labels (integers).\n\tb) Numpy array of labels.\n\tc) None if an inputs path is given.\nLabels size must match Inputs size.")
    # Validate resize shape
    if not isinstance(resize, int):
        raise TypeError("Resize must be an integer value for a square image of shape (W, H).")
    # Validate transform
    if isinstance(transform, transforms.Compose):
        pass
    elif transform is None:
        if test_set:
            transform = transforms.Compose([
                transforms.Resize(size=(resize, resize)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
        else:
            transform = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomRotation(degrees=30),
                transforms.Resize(size=(int(resize*1.2), int(resize*1.2))),
                transforms.CenterCrop(size=resize),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
    else:
        raise TypeError("Transform must be a `torchvision.transforms.Compose` object or None to use a default transform.")

    dataset = None

    # CASE A: input is a path to image files; labels is None
    if labels is None:
        if isinstance(inputs, str):
            dataset = ImageFolder(root=inputs, transform=transform)
        else:
            raise TypeError("Labels can only be None if a 'path' to inputs is provided; they will be inferred from the subdirectory names.")
    # CASE B: input is a Numpy array or a list of PIL Images; labels is a Numpy array or list of integers
    elif not isinstance(inputs, str):
        # Transform labels to tensor
        labels_ = torch.tensor(labels, dtype=torch.int64)

        # Transform each image to tensor
        transformed = list()
        for img_ in inputs:
            transformed.append(transform(img_))
        # Stack image tensors
        features_ = torch.stack(transformed, dim=0)

        # Make a dataset with images and labels
        dataset = TensorDataset(features_, labels_)
    else:
        raise TypeError("Labels must be None if a 'path' to inputs is provided; they will be inferred from the subdirectory names.")

    return dataset


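# Example (an illustrative sketch): building a TensorDataset from in-memory PIL images.
# >>> imgs = [Image.new(mode="RGB", size=(300, 300)) for _ in range(4)]
# >>> ds = make_vision_dataset(inputs=imgs, labels=[0, 1, 0, 1], resize=64)
# >>> ds.tensors[0].shape   # (samples, color channels, height, width)
# torch.Size([4, 3, 64, 64])
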
class SequenceDataset():
    def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_size: int, last_seq_test: bool=True,
                 seq_labels: bool=True, normalize: Union[Literal["standard", "minmax"], None]="minmax"):
        """
        Make train/test datasets from a single time-stamped sequence.

        Creates an object containing 2 PytorchDataset objects to be used in a Recurrent Neural Network:

        1. Train Dataset
        2. Test Dataset

        To plot, call the static method `plot()`.

        If normalization is used, a scaler object will be stored.
        The scaler can be used to invert the normalization of a Tensor/Array using the method `denormalize()`.

        Args:
        * `data`: Pandas DataFrame with 2 columns [datetime, sequence] | 1-column DataFrame or Series sequence, where the index is the datetime.
        * `sequence_size (int)`: Length of each subsequence that will be used for training.
        * `last_seq_test (bool)`: Use the last sequence as the test set; if False, a dummy test set is returned. Default is True.
        * `seq_labels (bool)`: Return labels as sequences; if False, return single values for 1 future timestamp.
        * `normalize`: Whether to normalize ('minmax'), standardize ('standard') or do neither (None). Default is 'minmax'.
        """
        # Validate data
        if not isinstance(data, (pandas.Series, pandas.DataFrame, numpy.ndarray)):
            raise TypeError("Data must be a pandas dataframe, pandas series or numpy array.")
        # Validate window size
        if not isinstance(sequence_size, int):
            raise TypeError("Sequence size must be an integer.")
        elif len(data) % sequence_size != 0:
            raise ValueError(f"data with length {len(data)} is not divisible into sequences of {sequence_size} values.")
        # Validate test sequence
        if not isinstance(last_seq_test, bool):
            raise TypeError("last_seq_test must be True or False.")
        # Validate normalize
        if not (normalize in ["standard", "minmax"] or normalize is None):
            raise ValueError("normalize must be 'standard', 'minmax' or None.")

        # Extract the sequence (and time axis, if any) as arrays
        self.time_axis = None
        if isinstance(data, pandas.DataFrame):
            if len(data.columns) == 2:
                self.sequence = data[data.columns[1]].values.astype("float")
                self.time_axis = data[data.columns[0]].values
            elif len(data.columns) == 1:
                self.sequence = data[data.columns[0]].values.astype("float")
                self.time_axis = data.index.values
            else:
                raise ValueError("Dataframe contains more than 2 columns.")
        elif isinstance(data, pandas.Series):
            self.sequence = data.values.astype("float")
            self.time_axis = data.index.values
        else:
            self.sequence = data.astype("float")

        # Save last sequence
        self._last_sequence = self.sequence[-sequence_size:]

        # Last sequence as test (with one extra subsequence of context)
        train_sequence = self.sequence
        test_sequence = None
        if last_seq_test:
            test_sequence = self.sequence[-(sequence_size*2):]
            train_sequence = self.sequence[:-sequence_size]

        # Normalize values
        norm_train_sequence = train_sequence
        norm_test_sequence = test_sequence
        if normalize is not None:
            # Define scaler
            if normalize == "standard":
                self.scaler = StandardScaler()
            elif normalize == "minmax":
                self.scaler = MinMaxScaler(feature_range=(-1, 1))
            # Scale and transform training set + reshape
            self.scaler.fit(train_sequence.reshape(-1, 1))
            norm_train_sequence = self.scaler.transform(train_sequence.reshape(-1, 1))
            norm_train_sequence = norm_train_sequence.reshape(-1)
            # Scale test if it exists + reshape
            if last_seq_test:
                norm_test_sequence = self.scaler.transform(test_sequence.reshape(-1, 1))
                norm_test_sequence = norm_test_sequence.reshape(-1)

        # Divide train sequence into subsequences
        train_features_list = list()
        train_labels_list = list()
        train_size = len(norm_train_sequence)
        for i in range(train_size - sequence_size - 1):
            subsequence = norm_train_sequence[i:sequence_size + i]
            train_features_list.append(subsequence.reshape(1, -1))
            # Labels as sequence
            if seq_labels:
                label = norm_train_sequence[i + 1:sequence_size + i + 1]
                train_labels_list.append(label.reshape(1, -1))
            # Single value label
            else:
                label = norm_train_sequence[sequence_size + i + 1]
                train_labels_list.append(label)

        # Divide test sequence into subsequences
        if last_seq_test:
            test_features_list = list()
            test_labels_list = list()
            test_size = len(norm_test_sequence)
            for i in range(test_size - sequence_size - 1):
                subsequence = norm_test_sequence[i:sequence_size + i]
                test_features_list.append(subsequence.reshape(1, -1))
                # Labels as sequence
                if seq_labels:
                    label = norm_test_sequence[i + 1:sequence_size + i + 1]
                    test_labels_list.append(label.reshape(1, -1))
                # Single value label
                else:
                    label = norm_test_sequence[sequence_size + i + 1]
                    test_labels_list.append(label)

        # Create training arrays, then cast to a pytorch dataset
        train_features = numpy.concatenate(train_features_list, axis=0)
        # Check if labels are a sequence
        if seq_labels:
            train_labels = numpy.concatenate(train_labels_list, axis=0)
        else:
            train_labels = numpy.array(train_labels_list).reshape(-1, 1)
        self.train_dataset = PytorchDataset(features=train_features, labels=train_labels, labels_dtype=torch.float32)

        # Create test arrays, then cast to a pytorch dataset
        if last_seq_test:
            test_features = numpy.concatenate(test_features_list, axis=0)
            # Check if labels are a sequence
            if seq_labels:
                test_labels = numpy.concatenate(test_labels_list, axis=0)
            else:
                test_labels = numpy.array(test_labels_list).reshape(-1, 1)
            self.test_dataset = PytorchDataset(features=test_features, labels=test_labels, labels_dtype=torch.float32)
        else:
            self.test_dataset = PytorchDataset(features=numpy.ones(shape=(10, sequence_size)), labels=numpy.ones(shape=(10, 1)), labels_dtype=torch.float32)

        # Attempt to plot the sequence
        if self.time_axis is not None:
            try:
                self.plot(self.time_axis, self.sequence)
            except Exception:
                print("Plot failed, try it manually to find the problem.")

    @staticmethod
    def plot(x_axis, y_axis, x_pred=None, y_pred=None):
        """
        Plot Time-values (X) vs Data-values (Y).
        """
        plt.figure(figsize=(12, 5))
        plt.title('Sequence')
        plt.grid(True)
        plt.autoscale(axis='x', tight=True)
        plt.plot(x_axis, y_axis)
        if x_pred is not None and y_pred is not None:
            plt.plot(x_pred, y_pred)
        plt.show()

    def denormalize(self, input: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
        """
        Applies the inverse transformation of the object's stored scaler to a tensor or array.

        Args:
        `input`: Tensor/Array predicted using the current sequence.

        Returns: numpy.ndarray with default index.
        """
        if isinstance(input, torch.Tensor):
            with torch.no_grad():
                array = input.numpy().reshape(-1, 1)
        elif isinstance(input, numpy.ndarray):
            array = input.reshape(-1, 1)
        else:
            raise TypeError("Input must be a Pytorch tensor or Numpy array.")
        return self.scaler.inverse_transform(array)

    def get_last_sequence(self, normalize: bool=True, to_tensor: bool=True):
        """
        Returns the last subsequence of the sequence.

        Args:
        `normalize`: Normalize using the object's stored scaler. Defaults to True.

        `to_tensor`: Cast to a Pytorch tensor. Defaults to True.

        Returns: numpy.ndarray or torch.Tensor
        """
        last_seq = self._last_sequence.reshape(-1, 1)
        if normalize:
            last_seq = self.scaler.transform(last_seq)
        if to_tensor:
            last_seq = torch.Tensor(last_seq)
        return last_seq

    def __len__(self):
        # __len__ must return an integer; the human-readable summary lives in __repr__
        return len(self.train_dataset) + len(self.test_dataset)

    def __repr__(self):
        return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
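
# Example usage of SequenceDataset (an illustrative sketch; the sine series is
# made up, and the constructor may also pop up a plot of the sequence):
# >>> ts = pandas.Series(numpy.sin(numpy.linspace(0, 12, 40)))
# >>> sd = SequenceDataset(data=ts, sequence_size=10)
# >>> len(sd.train_dataset), len(sd.test_dataset)
# (19, 9)
# >>> sd.denormalize(sd.get_last_sequence()).shape   # invert the stored scaler
# (10, 1)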