dragon-ml-toolbox 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

@@ -0,0 +1,595 @@
import torch
from torch.utils.data import Dataset, TensorDataset
from torch import nn
import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Literal, Union
from imblearn.combine import SMOTETomek
from PIL import Image
from torchvision.datasets import ImageFolder
from torchvision import transforms
import matplotlib.pyplot as plt


class DatasetMaker():
    def __init__(self, *, pandas_df: pandas.DataFrame, label_col: str, cat_features: Union[list[str], None]=None,
                 cat_method: Union[Literal["one-hot", "embed"], None]="one-hot", test_size: float=0.2, random_state: Union[int, None]=None,
                 normalize: Union[Literal["standard", "minmax"], None]="standard", cast_labels: bool=True, balance: bool=False, **kwargs):
        """
        Create train-test datasets from a Pandas DataFrame. Four datasets will be created:

        1. Features Train
        2. Features Test
        3. Labels Train
        4. Labels Test

        Use the method `to_pytorch()` to quickly get Train and Test `PytorchDataset` objects.

        `label_col` Name of the label column. If label encoding is required (str -> int), set `cast_labels=True` (default).
        A dictionary with the label mapping {code: original_name} will be stored in `labels_map`.

        `cat_features` List of column names on which to perform embedding or one-hot encoding of categorical features.
        Any categorical column not in the list will not be returned.
        If `None` (default), categorical columns will be inferred from the dtypes object, string and category, if any.

        `cat_method` can be set to:

        * `'one-hot'` (default) to perform one-hot encoding using pandas `get_dummies`.
        * `'embed'` to perform embedding using PyTorch `nn.Embedding`.
        * `None` to treat all data as continuous.

        `normalize` If not None, continuous features will be normalized using Scikit-learn's StandardScaler or MinMaxScaler.

        If `balance=True`, attempts to balance the minority class(es) in the training data using Imbalanced-learn's `SMOTETomek` algorithm.

        `**kwargs` Pass any additional keyword parameters to `pandas.get_dummies()` or `torch.nn.Embedding()`,
        e.g. `drop_first=False` for pandas.
        """
        # Validate dataframe
        if not isinstance(pandas_df, pandas.DataFrame):
            raise TypeError("pandas_df must be a pandas.DataFrame object.")
        # Validate label column
        if not isinstance(label_col, (str, list)):
            raise TypeError("label_col must be a string or list of strings.")
        # Validate categorical features
        if not (isinstance(cat_features, list) or cat_features is None):
            raise TypeError("cat_features must be a list of strings or None.")
        if cat_method not in ["one-hot", "embed", None]:
            raise TypeError("cat_method must be 'one-hot', 'embed' or None.")
        # Validate test size
        if not isinstance(test_size, (float, int)):
            raise TypeError("test_size must be a float in the range 0.0 to 1.0")
        if not (1.0 >= test_size >= 0.0):
            raise ValueError("test_size must be a float in the range 0.0 to 1.0")
        # Validate random state
        if not (isinstance(random_state, int) or random_state is None):
            raise TypeError("random_state must be an integer or None.")
        # Validate normalize
        if not (normalize in ["standard", "minmax"] or normalize is None):
            raise TypeError("normalize must be 'standard', 'minmax' or None.")
        # Validate cast labels
        if not isinstance(cast_labels, bool):
            raise TypeError("cast_labels must be either True or False.")

        # Split off the labels
        self._labels = pandas_df[label_col]
        pandas_df = pandas_df.drop(columns=label_col)
        # Set None parameters
        self._categorical = None
        self._continuous = None
        self.labels_train = None
        self.labels_test = None
        self.labels_map = None
        self.features_test = None
        self.features_train = None

        # Find categorical columns
        cat_columns = list()
        if cat_method is not None:
            if cat_features is None:
                # Infer categorical columns from Object, String or Category dtypes
                for column_ in pandas_df.columns:
                    if pandas_df[column_].dtype == object or pandas_df[column_].dtype == 'string' or pandas_df[column_].dtype.name == 'category':
                        cat_columns.append(column_)
            else:
                cat_columns = cat_features

        # Handle categorical data if required
        if len(cat_columns) > 0:
            # Separate continuous and categorical data
            self._continuous = pandas_df.drop(columns=cat_columns)
            self._categorical = pandas_df[cat_columns].copy()

            # Perform one-hot encoding
            if cat_method == "one-hot":
                for col_ in cat_columns:
                    self._categorical[col_] = self._categorical[col_].astype("category")
                self._categorical = pandas.get_dummies(data=self._categorical, dtype=numpy.int32, **kwargs)
            # Perform embedding
            else:
                self._categorical = self.embed_categorical(cat_df=self._categorical, random_state=random_state, **kwargs)

            # Something went wrong?
            if self._categorical.empty:
                raise ValueError("Categorical data couldn't be processed.")
        else:
            # Assume all data is continuous
            if not pandas_df.empty:
                self._continuous = pandas_df

        # Map labels
        if cast_labels:
            labels_ = self._labels.astype("category")
            # Get the mapping {code: original_name}
            self.labels_map = {key: value for key, value in enumerate(labels_.cat.categories)}
            self._labels = labels_.cat.codes

        # Train-test splits
        if self._continuous is not None and self._categorical is not None:
            continuous_train, continuous_test, categorical_train, categorical_test, self.labels_train, self.labels_test = train_test_split(
                self._continuous, self._categorical, self._labels, test_size=test_size, random_state=random_state)
        elif self._categorical is None:
            continuous_train, continuous_test, self.labels_train, self.labels_test = train_test_split(self._continuous, self._labels,
                                                                                                      test_size=test_size, random_state=random_state)
        elif self._continuous is None:
            categorical_train, categorical_test, self.labels_train, self.labels_test = train_test_split(self._categorical, self._labels,
                                                                                                        test_size=test_size, random_state=random_state)

        # Normalize continuous features
        if normalize is not None and self._continuous is not None:
            continuous_train, continuous_test = self.normalize_continuous(train_set=continuous_train, test_set=continuous_test, method=normalize)

        # Merge continuous and categorical
        if self._categorical is not None and self._continuous is not None:
            self.features_train = pandas.concat(objs=[continuous_train, categorical_train], axis=1)
            self.features_test = pandas.concat(objs=[continuous_test, categorical_test], axis=1)
        elif self._continuous is not None:
            self.features_train = continuous_train
            self.features_test = continuous_test
        elif self._categorical is not None:
            self.features_train = categorical_train
            self.features_test = categorical_test

        # Balance train dataset
        if balance and self.features_train is not None and self.labels_train is not None:
            self.features_train, self.labels_train = self.balance_classes(train_features=self.features_train, train_labels=self.labels_train)

    def to_pytorch(self):
        """
        Convert the train and test features and labels to PyTorch Datasets with default dtypes.

        Returns: Tuple (Train Dataset, Test Dataset)
        """
        train = None
        test = None
        # Train set
        if self.labels_train is not None and self.features_train is not None:
            train = PytorchDataset(features=self.features_train, labels=self.labels_train)
        # Test set
        if self.labels_test is not None and self.features_test is not None:
            test = PytorchDataset(features=self.features_test, labels=self.labels_test)

        return train, test

    @staticmethod
    def embed_categorical(cat_df: pandas.DataFrame, random_state: Union[int, None]=None, **kwargs) -> pandas.DataFrame:
        """
        Takes a DataFrame object containing categorical data only.

        Calculates the embedding dimension for each categorical feature as `(number_of_categories + 1) // 2`, up to a maximum of 50.

        Applies embedding using PyTorch and returns a Pandas DataFrame with the embedded features.
        """
        df = cat_df.copy()
        embedded_tensors = list()
        columns = list()
        for col in df.columns:
            df[col] = df[col].astype("category")
            # Get the number of categories
            size: int = df[col].cat.categories.size
            # Embedding dimension: half the number of categories, capped at 50
            embedding_dim: int = min(50, (size + 1) // 2)
            with torch.no_grad():
                # Compare against None so that a seed of 0 is not silently skipped
                if random_state is not None:
                    torch.manual_seed(random_state)
                embedder = nn.Embedding(num_embeddings=size, embedding_dim=embedding_dim, **kwargs)
                # Embed the column of features and store the tensor
                embedded_tensors.append(embedder(torch.LongTensor(df[col].cat.codes.copy().values)))
            # Preserve column names for the embedded features
            for i in range(1, embedding_dim + 1):
                columns.append(f"{col}_{i}")

        # Concatenate tensors
        with torch.no_grad():
            tensor = torch.cat(tensors=embedded_tensors, dim=1)
        # Convert to a DataFrame, keeping the original index so it aligns with the continuous features on concat
        return pandas.DataFrame(data=tensor.numpy(), columns=columns, index=cat_df.index)

    @staticmethod
    def normalize_continuous(train_set: Union[numpy.ndarray, pandas.DataFrame, pandas.Series], test_set: Union[numpy.ndarray, pandas.DataFrame, pandas.Series],
                             method: Literal["standard", "minmax"]="standard"):
        """
        Takes a train and a test dataset, then returns the standardized datasets as a tuple (train, test).

        `method`: standardization by mean and variance, or min-max normalization.

        The transformer is fitted on the training set only, so there is no data leak from the test set.

        The output type matches the input type: 2D ndarray, DataFrame or Series.
        """
        if method == "standard":
            scaler = StandardScaler()
        elif method == "minmax":
            scaler = MinMaxScaler()
        else:
            raise ValueError("Normalization method must be 'standard' or 'minmax'.")

        # Scikit-learn scalers expect 2D input, so a Series is reshaped to a single column and back
        if isinstance(train_set, pandas.Series):
            X_train = scaler.fit_transform(train_set.values.reshape(-1, 1))
            X_test = scaler.transform(test_set.values.reshape(-1, 1))
            X_train = pandas.Series(data=X_train.reshape(-1), index=train_set.index)
            X_test = pandas.Series(data=X_test.reshape(-1), index=test_set.index)
        else:
            X_train = scaler.fit_transform(train_set)
            X_test = scaler.transform(test_set)
            if isinstance(train_set, pandas.DataFrame):
                X_train = pandas.DataFrame(data=X_train, index=train_set.index, columns=train_set.columns)
                X_test = pandas.DataFrame(data=X_test, index=test_set.index, columns=test_set.columns)

        return X_train, X_test

    @staticmethod
    def balance_classes(train_features, train_labels, **kwargs):
        """
        Attempts to balance the minority class(es) using Imbalanced-learn's `SMOTETomek` algorithm.
        """
        resampler = SMOTETomek(**kwargs)
        X, y = resampler.fit_resample(X=train_features, y=train_labels)

        return X, y

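A minimal usage sketch of the `DatasetMaker` flow described above. The frame and its column names (`age`, `city`, `target`) are made up for illustration:

```python
import pandas

# Toy frame with one continuous feature, one categorical feature and a label column
df = pandas.DataFrame({
    "age": [22, 35, 58, 41, 29, 63, 47, 33],
    "city": ["a", "b", "a", "c", "b", "c", "a", "b"],
    "target": ["no", "yes", "yes", "no", "no", "yes", "yes", "no"],
})

maker = DatasetMaker(pandas_df=df, label_col="target", cat_method="one-hot",
                     test_size=0.25, random_state=42, normalize="standard")
print(maker.labels_map)             # {0: 'no', 1: 'yes'}
train_ds, test_ds = maker.to_pytorch()
print(len(train_ds), len(test_ds))  # 6 2

# The embedding helper can also be used standalone; its dimension rule is min(50, (n_categories + 1) // 2)
emb = DatasetMaker.embed_categorical(cat_df=df[["city"]], random_state=42)
print(emb.shape)                    # (8, 2) -> 3 categories give embedding_dim = 2
```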

class PytorchDataset(Dataset):
    def __init__(self, features: Union[numpy.ndarray, pandas.Series, pandas.DataFrame], labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                 features_dtype: torch.dtype=torch.float32, labels_dtype: torch.dtype=torch.int64, balance: bool=False) -> None:
        """
        Make a PyTorch dataset of features and labels cast to tensors.

        Defaults: `float32` for features and `int64` for labels.

        If `balance=True`, attempts to balance the minority class(es) using Imbalanced-learn's `SMOTETomek` algorithm.
        Note: only train data should be balanced.
        """
        # Validate features
        if not isinstance(features, (pandas.DataFrame, pandas.Series, numpy.ndarray)):
            raise TypeError("features must be a numpy.ndarray, pandas.Series or pandas.DataFrame")
        # Validate labels
        if not isinstance(labels, (pandas.DataFrame, pandas.Series, numpy.ndarray)):
            raise TypeError("labels must be a numpy.ndarray, pandas.Series or pandas.DataFrame")

        # Balance classes
        if balance:
            features, labels = self.balance_classes(train_features=features, train_labels=labels)

        # Cast features
        if isinstance(features, numpy.ndarray):
            self.features = torch.tensor(features, dtype=features_dtype)
        else:
            self.features = torch.tensor(features.values, dtype=features_dtype)

        # Cast labels
        if isinstance(labels, numpy.ndarray):
            self.labels = torch.tensor(labels, dtype=labels_dtype)
        else:
            self.labels = torch.tensor(labels.values, dtype=labels_dtype)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

    @staticmethod
    def balance_classes(train_features, train_labels, **kwargs):
        """
        Attempts to balance the minority class(es) using Imbalanced-learn's `SMOTETomek` algorithm.
        """
        resampler = SMOTETomek(**kwargs)
        X, y = resampler.fit_resample(X=train_features, y=train_labels)

        return X, y

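A minimal sketch of feeding a `PytorchDataset` into a standard `DataLoader`; the array shapes below are arbitrary, chosen only for illustration:

```python
import numpy
from torch.utils.data import DataLoader

# 100 samples with 4 features each, binary labels (illustrative data)
X = numpy.random.rand(100, 4)
y = numpy.random.randint(0, 2, size=100)

dataset = PytorchDataset(features=X, labels=y)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

for batch_features, batch_labels in loader:
    print(batch_features.shape, batch_labels.shape)  # torch.Size([16, 4]) torch.Size([16])
    break
```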
def make_vision_dataset(inputs: Union[list[Image.Image], numpy.ndarray, str], labels: Union[list[int], numpy.ndarray, None], resize: int=256,
                        transform: Union[transforms.Compose, None]=None, test_set: bool=False):
    """
    Make a Torchvision Dataset of images to be used in a Convolutional Neural Network.

    If no transform object is given, images will undergo the following transformations by default: `RandomHorizontalFlip`, `RandomRotation`,
    `Resize`, `CenterCrop`, `ToTensor`, `Normalize`; except if `test_set=True`.

    Args:
        `inputs`: List of PIL Image objects | Numpy array of image arrays | Path to a root directory whose subdirectories classify the image files.

        `labels`: List of integer values | Numpy array of labels. Labels size must match `inputs` size.
            If a path to a directory is given, then `labels` must be None.

        `transform`: Custom transformations to use. If None, use the default transformations.

        `test_set`: If True, the flip, rotation and center-crop transformations will not be applied.

    Returns:
        `Dataset`: Either a `TensorDataset` or an `ImageFolder` instance, depending on the input type.
            Data dimensions: (samples, color channels, height, width).
    """
    # Validate inputs
    if not isinstance(inputs, (list, numpy.ndarray, str)):
        raise TypeError("Inputs must be one of the following:\n\ta) List of PIL Image objects.\n\tb) Numpy array of 2D or 3D arrays.\n\tc) Directory path to image files.")
    # Validate labels
    if not (isinstance(labels, (list, numpy.ndarray)) or labels is None):
        raise TypeError("Labels must be one of the following:\n\ta) List of labels (integers).\n\tb) Numpy array of labels.\n\tc) None, if an inputs path is given.\nLabels size must match inputs size.")
    # Validate resize shape
    if not isinstance(resize, int):
        raise TypeError("Resize must be an integer value for a square image of shape (W, H).")
    # Validate transform
    if isinstance(transform, transforms.Compose):
        pass
    elif transform is None:
        if test_set:
            transform = transforms.Compose([
                transforms.Resize(size=(resize, resize)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
        else:
            transform = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomRotation(degrees=30),
                transforms.Resize(size=(int(resize * 1.2), int(resize * 1.2))),
                transforms.CenterCrop(size=resize),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
    else:
        raise TypeError("Transform must be a `torchvision.transforms.Compose` object or None to use a default transform.")

    dataset = None

    # CASE A: inputs is a path to image files, labels is None
    if labels is None:
        if isinstance(inputs, str):
            dataset = ImageFolder(root=inputs, transform=transform)
        else:
            raise TypeError("Labels can be None only if a directory path is given as inputs.")
    # CASE B: inputs is a Numpy array or a list of PIL Images; labels is a Numpy array or a list of integers
    elif not isinstance(inputs, str):
        # Transform labels to a tensor
        labels_ = torch.tensor(labels, dtype=torch.int64)

        # Transform each image to a tensor
        transformed = list()
        for img_ in inputs:
            # PIL-based transforms expect PIL Images, so raw arrays are converted first
            if isinstance(img_, numpy.ndarray):
                img_ = Image.fromarray(img_)
            transformed.append(transform(img_))
        # Stack the image tensors
        features_ = torch.stack(transformed, dim=0)

        # Make a dataset with images and labels
        dataset = TensorDataset(features_, labels_)
    else:
        raise TypeError("Labels must be None if a 'path' to inputs is provided. Labels will be inferred from subdirectory names in 'path'.")

    return dataset

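A quick sketch of `make_vision_dataset` with synthetic PIL images; the image sizes, class count and the commented directory path are arbitrary placeholders:

```python
import numpy
from PIL import Image

# Four random RGB images with integer class labels (illustrative data)
images = [Image.fromarray(numpy.random.randint(0, 255, size=(64, 64, 3), dtype=numpy.uint8))
          for _ in range(4)]
labels = [0, 1, 0, 1]

train_ds = make_vision_dataset(inputs=images, labels=labels, resize=32)
x, y = train_ds[0]
print(x.shape, y)  # torch.Size([3, 32, 32]) tensor(0)

# For a directory of images sorted into class subfolders:
# test_ds = make_vision_dataset(inputs="path/to/root", labels=None, resize=32, test_set=True)
```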

class SequenceDataset():
    def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_size: int, last_seq_test: bool=True,
                 seq_labels: bool=True, normalize: Union[Literal["standard", "minmax"], None]="minmax"):
        """
        Make train/test datasets from a single timestamped sequence.

        Creates an object containing 2 `PytorchDataset` objects to be used in a Recurrent Neural Network:

        1. Train Dataset
        2. Test Dataset

        To plot, call the static method `plot()`.

        If normalization is used, a scaler object will be stored.
        The scaler object can be used to invert normalization on a Tensor/Array using the method `denormalize()`.

        Args:
        * `data`: Pandas DataFrame with 2 columns [datetime, sequence] | 1-column DataFrame or Series sequence, where the index is the datetime.
        * `sequence_size (int)`: Length of each subsequence that will be used for training.
        * `last_seq_test (bool)`: The last sequence will be used as the test set; if False, a dummy test set will be returned. Default is True.
        * `seq_labels (bool)`: Labels will be returned as sequences; if False, return single values for 1 future timestamp.
        * `normalize`: Whether to normalize ('minmax'), standardize ('standard') or do neither (None). Default is 'minmax'.
        """
        # Validate data
        if not isinstance(data, (pandas.Series, pandas.DataFrame, numpy.ndarray)):
            raise TypeError("Data must be a pandas DataFrame, pandas Series or numpy array.")
        # Validate window size
        if not isinstance(sequence_size, int):
            raise TypeError("Sequence size must be an integer.")
        elif len(data) % sequence_size != 0:
            raise ValueError(f"Data with length {len(data)} is not divisible into sequences of {sequence_size} values.")
        # Validate test sequence
        if not isinstance(last_seq_test, bool):
            raise TypeError("Last sequence treated as test set must be True or False.")
        # Validate normalize
        if not (normalize in ["standard", "minmax"] or normalize is None):
            raise TypeError("normalize must be 'standard', 'minmax' or None.")

        # Handle data -> array
        self.time_axis = None
        if isinstance(data, pandas.DataFrame):
            if len(data.columns) == 2:
                self.sequence = data[data.columns[1]].values.astype("float")
                self.time_axis = data[data.columns[0]].values
            elif len(data.columns) == 1:
                self.sequence = data[data.columns[0]].values.astype("float")
                self.time_axis = data.index.values
            else:
                raise ValueError("DataFrame contains more than 2 columns.")
        elif isinstance(data, pandas.Series):
            self.sequence = data.values.astype("float")
            self.time_axis = data.index.values
        else:
            self.sequence = data.astype("float")

        # Save the last sequence
        self._last_sequence = self.sequence[-sequence_size:]

        # Last sequence as test
        train_sequence = self.sequence
        test_sequence = None
        if last_seq_test:
            test_sequence = self.sequence[-(sequence_size * 2):]
            train_sequence = self.sequence[:-sequence_size]

        # Normalize values
        self.scaler = None
        norm_train_sequence = train_sequence
        norm_test_sequence = test_sequence
        if normalize is not None:
            # Define scaler
            if normalize == "standard":
                self.scaler = StandardScaler()
            elif normalize == "minmax":
                self.scaler = MinMaxScaler(feature_range=(-1, 1))
            # Fit and transform the training set (scalers expect 2D input, hence the reshapes)
            self.scaler.fit(train_sequence.reshape(-1, 1))
            norm_train_sequence = self.scaler.transform(train_sequence.reshape(-1, 1))
            norm_train_sequence = norm_train_sequence.reshape(-1)
            # Scale the test set if it exists
            if last_seq_test:
                norm_test_sequence = self.scaler.transform(test_sequence.reshape(-1, 1))
                norm_test_sequence = norm_test_sequence.reshape(-1)

        # Divide the train sequence into subsequences
        train_features_list = list()
        train_labels_list = list()
        train_size = len(norm_train_sequence)
        for i in range(train_size - sequence_size - 1):
            subsequence = norm_train_sequence[i:sequence_size + i]
            train_features_list.append(subsequence.reshape(1, -1))
            # Labels as the sequence shifted one step ahead
            if seq_labels:
                label = norm_train_sequence[i + 1:sequence_size + i + 1]
                train_labels_list.append(label.reshape(1, -1))
            # Single-value label: the first timestamp after the subsequence
            else:
                label = norm_train_sequence[sequence_size + i]
                train_labels_list.append(label)

        # Divide the test sequence into subsequences
        if last_seq_test:
            test_features_list = list()
            test_labels_list = list()
            test_size = len(norm_test_sequence)
            for i in range(test_size - sequence_size - 1):
                subsequence = norm_test_sequence[i:sequence_size + i]
                test_features_list.append(subsequence.reshape(1, -1))
                # Labels as the sequence shifted one step ahead
                if seq_labels:
                    label = norm_test_sequence[i + 1:sequence_size + i + 1]
                    test_labels_list.append(label.reshape(1, -1))
                # Single-value label: the first timestamp after the subsequence
                else:
                    label = norm_test_sequence[sequence_size + i]
                    test_labels_list.append(label)

        # Create training arrays, then cast to a PyTorch dataset
        train_features = numpy.concatenate(train_features_list, axis=0)
        # Check if labels are a sequence
        if seq_labels:
            train_labels = numpy.concatenate(train_labels_list, axis=0)
        else:
            train_labels = numpy.array(train_labels_list).reshape(-1, 1)
        self.train_dataset = PytorchDataset(features=train_features, labels=train_labels, labels_dtype=torch.float32)

        # Create test arrays, then cast to a PyTorch dataset
        if last_seq_test:
            test_features = numpy.concatenate(test_features_list, axis=0)
            # Check if labels are a sequence
            if seq_labels:
                test_labels = numpy.concatenate(test_labels_list, axis=0)
            else:
                test_labels = numpy.array(test_labels_list).reshape(-1, 1)
            self.test_dataset = PytorchDataset(features=test_features, labels=test_labels, labels_dtype=torch.float32)
        else:
            self.test_dataset = PytorchDataset(features=numpy.ones(shape=(10, sequence_size)), labels=numpy.ones(shape=(10, 1)), labels_dtype=torch.float32)

        # Attempt to plot the sequence
        if self.time_axis is not None:
            try:
                self.plot(self.time_axis, self.sequence)
            except Exception:
                print("Plot failed; try it manually to find the problem.")

    @staticmethod
    def plot(x_axis, y_axis, x_pred=None, y_pred=None):
        """
        Plot time values (X) vs. data values (Y).
        """
        plt.figure(figsize=(12, 5))
        plt.title('Sequence')
        plt.grid(True)
        plt.autoscale(axis='x', tight=True)
        plt.plot(x_axis, y_axis)
        if x_pred is not None and y_pred is not None:
            plt.plot(x_pred, y_pred)
        plt.show()

    def denormalize(self, input: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
        """
        Applies the inverse transformation of the object's stored scaler to a tensor or array.

        Args:
            `input`: Tensor/Array predicted using the current sequence.

        Returns: numpy.ndarray with a default index.
        """
        if self.scaler is None:
            raise RuntimeError("No scaler was fitted; the dataset was created with normalize=None.")
        if isinstance(input, torch.Tensor):
            with torch.no_grad():
                array = input.numpy().reshape(-1, 1)
        elif isinstance(input, numpy.ndarray):
            array = input.reshape(-1, 1)
        else:
            raise TypeError("Input must be a PyTorch tensor or Numpy array.")
        return self.scaler.inverse_transform(array)

    def get_last_sequence(self, normalize: bool=True, to_tensor: bool=True):
        """
        Returns the last subsequence of the sequence.

        Args:
            `normalize`: Normalize using the object's stored scaler. Defaults to True.

            `to_tensor`: Cast to a PyTorch tensor. Defaults to True.

        Returns: numpy.ndarray or torch.Tensor
        """
        last_seq = self._last_sequence.reshape(-1, 1)
        if normalize:
            if self.scaler is None:
                raise RuntimeError("No scaler was fitted; the dataset was created with normalize=None.")
            last_seq = self.scaler.transform(last_seq)
        if to_tensor:
            last_seq = torch.Tensor(last_seq)
        return last_seq

    def __len__(self):
        # Total number of samples; __len__ must return an integer
        return len(self.train_dataset) + len(self.test_dataset)

    def __repr__(self):
        return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
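A minimal sketch of building train/test RNN datasets from a synthetic sine wave; the series length and `sequence_size` are arbitrary, chosen only so the length is divisible by the sequence size:

```python
import numpy
import pandas

# 120 daily timestamps of a noisy sine wave (synthetic data)
index = pandas.date_range("2024-01-01", periods=120, freq="D")
values = numpy.sin(numpy.linspace(0, 12, 120)) + numpy.random.normal(0, 0.1, 120)
series = pandas.Series(values, index=index)

# Building the dataset also attempts to plot the full series
seq_data = SequenceDataset(data=series, sequence_size=12, last_seq_test=True, normalize="minmax")
features, labels = seq_data.train_dataset[0]
print(features.shape, labels.shape)        # torch.Size([12]) torch.Size([12])

# Invert the min-max scaling on predictions
print(seq_data.denormalize(labels).shape)  # (12, 1)
```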