dragon-ml-toolbox 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ml_tools/datasetmaster.py CHANGED
@@ -1,606 +1,680 @@
- import torch
- from torch.utils.data import Dataset, TensorDataset
+ import torch
+ from torch.utils.data import Dataset, Subset
  from torch import nn
  import pandas
  import numpy
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import StandardScaler, MinMaxScaler
- from typing import Literal, Union
+ from typing import Literal, Union, Tuple, List, Optional
  from imblearn.combine import SMOTETomek
- from PIL import Image
+ from abc import ABC, abstractmethod
+ from PIL import Image, ImageOps
  from torchvision.datasets import ImageFolder
  from torchvision import transforms
  import matplotlib.pyplot as plt
+ from pathlib import Path
  from .utilities import _script_info
+ from .logger import _LOGGER


+ # --- public-facing API ---
  __all__ = [
      "DatasetMaker",
-     "PytorchDataset",
-     "make_vision_dataset",
-     "SequenceDataset",
+     "VisionDatasetMaker",
+     "SequenceMaker",
+     "ResizeAspectFill",
  ]
- class DatasetMaker():
-     def __init__(self, *, pandas_df: pandas.DataFrame, label_col: str, cat_features: Union[list[str], None]=None,
-                  cat_method: Union[Literal["one-hot", "embed"], None]="one-hot", test_size: float=0.2, random_state: Union[int, None]=None,
-                  normalize: Union[Literal["standard", "minmax"], None]="standard", cast_labels: bool=True, balance: bool=False, **kwargs):
-         """
-         Create Train-Test datasets from a Pandas DataFrame. Four datasets will be created:
-
-         1. Features Train
-         2. Features Test
-         3. Labels Train
-         4. Labels Test
-
-         Use the method `to_pytorch()` to quickly get Train and Test PytorchDataset objects.
-
-         `label_col` Specify the name of the label column. If label encoding is required (str -> int) set `cast_labels=True` (default).
-         A dictionary will be created with the label mapping {code: original_name}.
-
-         `cat_features` List of column names to perform embedding or one-hot-encoding of categorical features.
-         Any categorical column not in the list will not be returned.
-         If `None` (default), columns containing categorical data will be inferred from dtypes: object, string and category, if any.
-
-         `cat_method` can be set to:
-
-         * `'one-hot'` (default) to perform One-Hot-Encoding using Pandas "get_dummies".
-         * `'embed'` to perform Embedding using PyTorch "nn.Embedding".
-         * `None` all data will be considered to be continuous.
-
-         `normalize` if not None, continuous features will be normalized using Scikit-Learn's StandardScaler or MinMaxScaler.
-
-         If `balance=True` attempts to balance the minority class(es) in the training data using Imbalanced-Learn's `SMOTETomek` algorithm.
-
-         `**kwargs` Pass any additional keyword parameters to `pandas.get_dummies()` or `torch.nn.Embedding()`.
-         i.e. pandas `drop_first=False`.
-         """
-
-         # Validate dataframe
-         if not isinstance(pandas_df, pandas.DataFrame):
-             raise TypeError("pandas_df must be a pandas.DataFrame object.")
-         # Validate label column
-         if not isinstance(label_col, (str, list)):
-             raise TypeError("label_col must be a string or list of strings.")
-         # Validate categorical features
-         if not (isinstance(cat_features, list) or cat_features is None):
-             raise TypeError("cat_features must be a list of strings or None.")
-         if cat_method not in ["one-hot", "embed", None]:
-             raise TypeError("cat_method must be 'one-hot', 'embed' or None.")
-         # Validate test size
-         if not isinstance(test_size, (float, int)):
-             raise TypeError("test_size must be a float in the range 0.0 to 1.0")
-         if not (1.0 >= test_size >= 0.0):
-             raise ValueError("test_size must be a float in the range 0.0 to 1.0")
-         # Validate random state
-         if not (isinstance(random_state, int) or random_state is None):
-             raise TypeError("random_state must be an integer or None.")
-         # validate normalize
-         if not (normalize in ["standard", "minmax"] or normalize is None):
-             raise TypeError("normalize must be 'standard', 'minmax' or None.")
-         # Validate cast labels
-         if not isinstance(cast_labels, bool):
-             raise TypeError("cast_labels must be either True or False.")
-
-         # Start-o
-         self._labels = pandas_df[label_col]
-         pandas_df = pandas_df.drop(columns=label_col)
-         # Set None parameters
-         self._categorical = None
-         self._continuous = None
-         self.labels_train = None
-         self.labels_test = None
-         self.labels_map = None
-         self.features_test = None
-         self.features_train = None
+ # --- Custom Vision Transform Class ---
+ class ResizeAspectFill:
+     """
+     Custom transformation to make an image square by padding it to match the
+     longest side, preserving the aspect ratio. The image is finally centered.
+
+     Args:
+         pad_color (Union[str, int]): Color to use for the padding.
+             Defaults to "black".
+     """
+     def __init__(self, pad_color: Union[str, int] = "black") -> None:
+         self.pad_color = pad_color
+
+     def __call__(self, image: Image.Image) -> Image.Image:
+         if not isinstance(image, Image.Image):
+             raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")
+
+         w, h = image.size
+         if w == h:
+             return image
+
+         # Determine padding to center the image
+         if w > h:
+             top_padding = (w - h) // 2
+             bottom_padding = w - h - top_padding
+             padding = (0, top_padding, 0, bottom_padding)
+         else:  # h > w
+             left_padding = (h - w) // 2
+             right_padding = h - w - left_padding
+             padding = (left_padding, 0, right_padding, 0)
+
+         return ImageOps.expand(image, padding, fill=self.pad_color)
+
+
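To orient readers of the diff: the new `ResizeAspectFill` composes with standard torchvision pipelines. A minimal usage sketch (illustrative only, not part of the package):

```python
from PIL import Image
from torchvision import transforms

# Square-pad first, then resize: no aspect-ratio distortion.
pipeline = transforms.Compose([
    ResizeAspectFill(pad_color="black"),  # 300x200 -> 300x300 (pads top/bottom)
    transforms.Resize(224),
    transforms.ToTensor(),
])

img = Image.new("RGB", (300, 200))  # stand-in for a real photo
tensor = pipeline(img)              # shape: (3, 224, 224)
```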
+ # --- Internal Helper Class ---
+ class _PytorchDataset(Dataset):
+     """
+     Internal helper class to create a PyTorch Dataset.
+     Converts numpy/pandas data into tensors for model consumption.
+     """
+     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
+                  labels: Union[numpy.ndarray, pandas.Series],
+                  features_dtype: torch.dtype = torch.float32,
+                  labels_dtype: torch.dtype = torch.int64):

-         # Find categorical
-         cat_columns = list()
-         if cat_method is not None:
-             if cat_features is None:
-                 # find categorical columns from Object, String or Category dtypes automatically
-                 for column_ in pandas_df.columns:
-                     if pandas_df[column_].dtype == object or pandas_df[column_].dtype == 'string' or pandas_df[column_].dtype.name == 'category':
-                         cat_columns.append(column_)
-             else:
-                 cat_columns = cat_features
-
-         # Handle categorical data if required
-         if len(cat_columns) > 0:
-             # Set continuous/categorical data if categorical detected
-             self._continuous = pandas_df.drop(columns=cat_columns)
-             self._categorical = pandas_df[cat_columns].copy()
-
-             # Perform one-hot-encoding
-             if cat_method == "one-hot":
-                 for col_ in cat_columns:
-                     self._categorical[col_] = self._categorical[col_].astype("category")
-                 self._categorical = pandas.get_dummies(data=self._categorical, dtype=numpy.int32, **kwargs)
-             # Perform embedding
-             else:
-                 self._categorical = self.embed_categorical(cat_df=self._categorical, random_state=random_state, **kwargs)
-
-             # Something went wrong?
-             if self._categorical.empty:
-                 raise AttributeError("Categorical data couldn't be processed")
+         if isinstance(features, numpy.ndarray):
+             self.features = torch.tensor(features, dtype=features_dtype)
          else:
-             # Assume all data is continuous
-             if not pandas_df.empty:
-                 self._continuous = pandas_df
-
-         # Map labels
-         if cast_labels:
-             labels_ = self._labels.astype("category")
-             # Get mapping
-             self.labels_map = {key: value for key, value in enumerate(labels_.cat.categories)}
-             self._labels = labels_.cat.codes
-
-         # Train-Test splits
-         if self._continuous is not None and self._categorical is not None:
-             continuous_train, continuous_test, categorical_train, categorical_test, self.labels_train, self.labels_test = train_test_split(self._continuous,
-                                                                                                                                            self._categorical,
-                                                                                                                                            self._labels,
-                                                                                                                                            test_size=test_size,
-                                                                                                                                            random_state=random_state)
-         elif self._categorical is None:
-             continuous_train, continuous_test, self.labels_train, self.labels_test = train_test_split(self._continuous, self._labels,
-                                                                                                       test_size=test_size, random_state=random_state)
-         elif self._continuous is None:
-             categorical_train, categorical_test, self.labels_train, self.labels_test = train_test_split(self._categorical, self._labels,
-                                                                                                         test_size=test_size, random_state=random_state)
-
-         # Normalize continuous features
-         if normalize is not None and self._continuous is not None:
-             continuous_train, continuous_test = self.normalize_continuous(train_set=continuous_train, test_set=continuous_test, method=normalize)
-
-         # Merge continuous and categorical
-         if self._categorical is not None and self._continuous is not None:
-             self.features_train = pandas.concat(objs=[continuous_train, categorical_train], axis=1)
-             self.features_test = pandas.concat(objs=[continuous_test, categorical_test], axis=1)
-         elif self._continuous is not None:
-             self.features_train = continuous_train
-             self.features_test = continuous_test
-         elif self._categorical is not None:
-             self.features_train = categorical_train
-             self.features_test = categorical_test
-
-         # Balance train dataset
-         if balance and self.features_train is not None and self.labels_train is not None:
-             self.features_train, self.labels_train = self.balance_classes(train_features=self.features_train, train_labels=self.labels_train)
-
-     def to_pytorch(self):
-         """
-         Convert the train and test features and labels to Pytorch Datasets with default dtypes.
-
-         Returns: Tuple(Train Dataset, Test Dataset)
+             self.features = torch.tensor(features.values, dtype=features_dtype)
+
+         if isinstance(labels, numpy.ndarray):
+             self.labels = torch.tensor(labels, dtype=labels_dtype)
+         else:
+             self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+
+     def __len__(self):
+         return len(self.features)
+
+     def __getitem__(self, index):
+         return self.features[index], self.labels[index]
+
+
+ # --- Private Base Class ---
92
+ class _BaseMaker(ABC):
93
+ """
94
+ Abstract Base Class for all dataset makers.
95
+ Ensures a consistent API across the library.
96
+ """
97
+ def __init__(self):
98
+ self._train_dataset = None
99
+ self._test_dataset = None
100
+ self._val_dataset = None
101
+
102
+ @abstractmethod
103
+ def get_datasets(self) -> Tuple[Dataset, ...]:
177
104
  """
178
- train = None
179
- test = None
180
- # Train set
181
- if self.labels_train is not None and self.features_train is not None:
182
- train = PytorchDataset(features=self.features_train, labels=self.labels_train)
183
- # Test set
184
- if self.labels_test is not None and self.features_test is not None:
185
- test = PytorchDataset(features=self.features_test, labels=self.labels_test)
186
-
187
- return train, test
188
-
189
- @staticmethod
190
- def embed_categorical(cat_df: pandas.DataFrame, random_state: Union[int, None]=None, **kwargs) -> pandas.DataFrame:
105
+ The primary method to retrieve the final, processed PyTorch datasets.
106
+ Must be implemented by all subclasses.
191
107
  """
192
- Takes a DataFrame object containing categorical data only.
108
+ pass
109
+
110
+
111
+ # --- Refactored DatasetMaker ---
+ class DatasetMaker(_BaseMaker):
+     """
+     Creates processed PyTorch datasets from a Pandas DataFrame using a fluent, step-by-step interface.
+
+     Recommended pipeline:
+
+     - Full Control (step-by-step):
+         1. Process categorical features `.process_categoricals()`
+         2. Split train-test datasets `.split_data()`
+         3. Normalize continuous features `.normalize_continuous()`; `.denormalize()` becomes available.
+         4. [Optional][Classification only] Balance classes `.balance_data()`
+         5. Get PyTorch datasets: `train, test = .get_datasets()`
+         6. [Optional] Inspect the processed data as DataFrames `X_train, X_test, y_train, y_test = .inspect_dataframes()`
+
+     - Automated (single call):
+     ```python
+     maker = DatasetMaker(df, label_col='target')
+     maker.process()  # uses simplified arguments
+     train_ds, test_ds = maker.get_datasets()
+     ```
+     """
+     def __init__(self, pandas_df: pandas.DataFrame, label_col: str):
+         super().__init__()
+         if not isinstance(pandas_df, pandas.DataFrame):
+             raise TypeError("Input must be a pandas.DataFrame.")
+         if label_col not in pandas_df.columns:
+             raise ValueError(f"Label column '{label_col}' not found in DataFrame.")
+
+         self.labels = pandas_df[label_col]
+         self.features = pandas_df.drop(columns=label_col)
+         self.labels_map = None
+         self.scaler = None
+
+         self._is_split = False
+         self._is_balanced = False
+         self._is_normalized = False
+         self._is_categoricals_processed = False

-         Calculates embedding dimensions for each categorical feature. Using `(Number_of_categories + 1) // 2` up to a maximum value of 50.
+         self.features_train = None
+         self.features_test = None
+         self.labels_train = None
+         self.labels_test = None

-         Applies embedding using PyTorch and returns a Pandas Dataframe with embedded features.
+         self.continuous_columns = None
+
+     def process_categoricals(self, method: Literal["one-hot", "embed"] = "one-hot",
+                              cat_features: Union[list[str], None] = None, **kwargs) -> 'DatasetMaker':
          """
-         df = cat_df.copy()
-         embedded_tensors = list()
-         columns = list()
-         for col in df.columns:
-             df[col] = df[col].astype("category")
-             # Get number of categories
-             size: int = df[col].cat.categories.size
-             # Embedding dimension
-             embedding_dim: int = min(50, (size+1)//2)
-             # Create instance of Embedding tensor using half the value for embedding dimensions
-             with torch.no_grad():
-                 if random_state:
-                     torch.manual_seed(random_state)
-                 embedder = nn.Embedding(num_embeddings=size, embedding_dim=embedding_dim, **kwargs)
-                 # Embed column of features and store tensor
-                 embedded_tensors.append(embedder(torch.LongTensor(df[col].cat.codes.copy().values)))
-             # Preserve column names for embedded features
-             for i in range(1, embedding_dim+1):
-                 columns.append(f"{col}_{i}")
-
-         # Concatenate tensors
-         with torch.no_grad():
-             tensor = torch.cat(tensors=embedded_tensors, dim=1)
-         # Convert to dataframe
-         return pandas.DataFrame(data=tensor.numpy(), columns=columns)
+         Encodes categorical features using the specified method.

-     @staticmethod
-     def normalize_continuous(train_set: Union[numpy.ndarray, pandas.DataFrame, pandas.Series], test_set: Union[numpy.ndarray, pandas.DataFrame, pandas.Series],
-                              method: Literal["standard", "minmax"]="standard"):
+         Args:
+             method (str, optional): 'one-hot' (default) or 'embed'.
+             cat_features (list, optional): A list of categorical column names.
+                 If None, they will be inferred from the DataFrame's dtypes.
+             **kwargs: Additional keyword arguments to pass to the underlying
+                 pandas.get_dummies() or torch.nn.Embedding() functions.
+                 For 'one-hot' encoding, it is often recommended to add
+                 `drop_first=True` to help reduce multicollinearity.
          """
-         Takes a train and a test dataset, then returns the standardized datasets as a tuple (train, test).
+         if self._is_split:
+             raise RuntimeError("Categoricals must be processed before splitting data to avoid data leakage.")

-         `method`: Standardization by the mean and variance or MinMax Normalization.
-
-         The transformer is fitted on the training set, so there is no data-leak of the test set.
+         if cat_features is None:
+             cat_columns = self.features.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
+         else:
+             cat_columns = cat_features
+
+         if not cat_columns:
+             _LOGGER.info("No categorical features to process.")
+             self._is_categoricals_processed = True
+             return self
+
+         continuous_df = self.features.drop(columns=cat_columns)
+         # store continuous column names
+         self.continuous_columns = continuous_df.columns.tolist()

-         Output type is the same as Input type: nD-array, DataFrame or Series.
-         """
+         categorical_df = self.features[cat_columns].copy()
+
+         if method == "one-hot":
+             processed_cats = pandas.get_dummies(categorical_df, dtype=numpy.int32, **kwargs)
+         elif method == "embed":
+             processed_cats = self._embed_categorical(categorical_df, **kwargs)
+         else:
+             raise ValueError("`method` must be 'one-hot' or 'embed'.")
+
+         self.features = pandas.concat([continuous_df, processed_cats], axis=1)
+         self._is_categoricals_processed = True
+         _LOGGER.info("Categorical features processed.")
+         return self
+
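A minimal sketch of `process_categoricals` on a toy frame (column names are illustrative); `drop_first=True` is forwarded to `pandas.get_dummies`, as suggested in the Args above:

```python
import pandas

df = pandas.DataFrame({
    "color": ["red", "green", "blue", "red", "green", "blue", "red", "green"],
    "size_cm": [1.0, 2.5, 3.0, 4.2, 2.2, 3.3, 1.8, 2.9],
    "target": [0, 1, 0, 1, 0, 1, 0, 1],
})

maker = DatasetMaker(df, label_col="target")
maker.process_categoricals(method="one-hot", drop_first=True)
# 'color' becomes k-1 = 2 indicator columns; 'size_cm' passes through untouched.
```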
+     def normalize_continuous(self, method: Literal["standard", "minmax"] = "standard") -> 'DatasetMaker':
+         """Normalizes all numeric features and saves the scaler."""
+         if not self._is_split:
+             raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
+         if self._is_normalized:
+             _LOGGER.warning("Data has already been normalized.")
+             return self
+
+         # Use continuous features columns
+         self.scaler_columns = self.continuous_columns
+         if not self.scaler_columns:
+             _LOGGER.info("No continuous features to normalize.")
+             self._is_normalized = True
+             return self
+
          if method == "standard":
-             scaler = StandardScaler()
+             self.scaler = StandardScaler()
          elif method == "minmax":
-             scaler = MinMaxScaler()
-         else:
-             raise ValueError("Normalization method must be 'standard' or 'minmax'.")
-
-         X_train = scaler.fit_transform(train_set)
-         X_test = scaler.transform(test_set)
-
-         if isinstance(train_set, pandas.DataFrame):
-             train_indexes = train_set.index
-             test_indexes = test_set.index
-             cols = train_set.columns
-             X_train = pandas.DataFrame(data=X_train, index=train_indexes, columns=cols)
-             X_test = pandas.DataFrame(data=X_test, index=test_indexes, columns=cols)
-         elif isinstance(train_set, pandas.Series):
-             train_indexes = train_set.index
-             test_indexes = test_set.index
-             X_train = pandas.Series(data=X_train, index=train_indexes)
-             X_test = pandas.Series(data=X_test, index=test_indexes)
+             self.scaler = MinMaxScaler()
          else:
-             pass
+             raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")
+
+         # Fit on training data only, then transform both
+         self.features_train[self.scaler_columns] = self.scaler.fit_transform(self.features_train[self.scaler_columns])  # type: ignore
+         self.features_test[self.scaler_columns] = self.scaler.transform(self.features_test[self.scaler_columns])  # type: ignore

-         return X_train, X_test
-
-     @staticmethod
-     def balance_classes(train_features, train_labels, **kwargs):
+         self._is_normalized = True
+         _LOGGER.info(f"Continuous features normalized using {self.scaler.__class__.__name__}. Scaler stored in `self.scaler`.")
+         return self
+
+     def split_data(self, test_size: float = 0.2, stratify: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
+         """Splits the data into training and testing sets."""
+         if self._is_split:
+             _LOGGER.warning("Data has already been split.")
+             return self
+
+         if self.labels.dtype == 'object' or self.labels.dtype.name == 'category':
+             labels_numeric = self.labels.astype("category")
+             self.labels_map = {code: val for code, val in enumerate(labels_numeric.cat.categories)}
+             self.labels = pandas.Series(labels_numeric.cat.codes, index=self.labels.index)
+             _LOGGER.info("Labels have been encoded. Mapping stored in `self.labels_map`.")
+
+         stratify_array = self.labels if stratify else None
+
+         self.features_train, self.features_test, self.labels_train, self.labels_test = train_test_split(
+             self.features, self.labels, test_size=test_size, random_state=random_state, stratify=stratify_array
+         )
+
+         self._is_split = True
+         _LOGGER.info(f"Data split into training ({len(self.features_train)} samples) and testing ({len(self.features_test)} samples).")
+         return self
+
+
254
+ def balance_data(self, resampler=None, **kwargs) -> 'DatasetMaker':
264
255
  """
265
- Attempts to balance the minority class(es) using Imbalanced-Learn's `SMOTETomek` algorithm.
256
+ Only useful for classification tasks.
257
+
258
+ Balances the training data using a specified resampler.
259
+
260
+ Defaults to `SMOTETomek`.
266
261
  """
267
- resampler = SMOTETomek(**kwargs)
268
- X, y = resampler.fit_resample(X=train_features, y=train_labels)
262
+ if not self._is_split:
263
+ raise RuntimeError("Cannot balance data before it has been split. Call .split_data() first.")
264
+ if self._is_balanced:
265
+ _LOGGER.warning("Training data has already been balanced.")
266
+ return self
267
+
268
+ if resampler is None:
269
+ resampler = SMOTETomek(**kwargs)
270
+
271
+ _LOGGER.info(f"Balancing training data with {resampler.__class__.__name__}...")
272
+ self.features_train, self.labels_train = resampler.fit_resample(self.features_train, self.labels_train) # type: ignore
269
273
 
270
- return X, y
274
+ self._is_balanced = True
275
+ _LOGGER.info(f"Balancing complete. New training set size: {len(self.features_train)} samples.")
276
+ return self
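The `resampler` hook accepts any object exposing `fit_resample(X, y)`, so other imbalanced-learn strategies drop in directly. A sketch swapping in plain `SMOTE` for the `SMOTETomek` default (assumes an imbalanced training fold with enough minority samples for SMOTE's k-nearest-neighbors step):

```python
from imblearn.over_sampling import SMOTE

maker.balance_data(resampler=SMOTE(random_state=42))
```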

+     def process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
+                 balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
+         """Runs a standard, fully automated preprocessing pipeline."""
+         _LOGGER.info("--- Running Automated Processing Pipeline ---")
+         self.process_categoricals(method=cat_method)
+         self.split_data(test_size=test_size, stratify=True, random_state=random_state)
+         self.normalize_continuous(method=normalize_method)
+         if balance:
+             self.balance_data()
+         _LOGGER.info("--- Automated Processing Complete ---")
+         return self
+
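Both routes end at `get_datasets()`, whose outputs are ordinary PyTorch datasets. A sketch of the hand-off to a `DataLoader` (batch size is arbitrary):

```python
from torch.utils.data import DataLoader

train_ds, test_ds = maker.get_datasets()
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

for batch_features, batch_labels in train_loader:
    ...  # training step goes here
```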
+     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:
+         """
+         Applies inverse transformation to denormalize data, preserving DataFrame
+         structure if provided.
+
+         Args:
+             data: The normalized data to be transformed back to its original scale.
+                 Can be a PyTorch Tensor, NumPy array, or Pandas DataFrame.
+                 If a DataFrame, it must contain the columns that were originally scaled.

- class PytorchDataset(Dataset):
-     def __init__(self, features: Union[numpy.ndarray, pandas.Series, pandas.DataFrame], labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
-                  features_dtype: torch.dtype=torch.float32, labels_dtype: torch.dtype=torch.int64, balance: bool=False) -> None:
+         Returns:
+             The denormalized data. Returns a Pandas DataFrame if the input was a
+             DataFrame, otherwise returns a NumPy array.
          """
-         Make a PyTorch dataset of Features and Labels casted to Tensors.
+         if self.scaler is None:
+             raise RuntimeError("Data was not normalized. Cannot denormalize.")
+
+         if isinstance(data, pandas.DataFrame):
+             # If input is a DataFrame, denormalize in place and return a copy
+             if not all(col in data.columns for col in self.scaler_columns):  # type: ignore
+                 raise ValueError(f"Input DataFrame is missing one or more required columns for denormalization. Required: {self.scaler_columns}")
+
+             output_df = data.copy()
+             output_df[self.scaler_columns] = self.scaler.inverse_transform(data[self.scaler_columns])  # type: ignore
+             return output_df
+
+         # Handle tensor or numpy array input
+         if isinstance(data, torch.Tensor):
+             data_np = data.cpu().numpy()
+         else:  # It's already a numpy array
+             data_np = data
+
+         if data_np.ndim == 1:
+             data_np = data_np.reshape(-1, 1)
+
+         if data_np.shape[1] != len(self.scaler_columns):  # type: ignore
+             raise ValueError(f"Input array has {data_np.shape[1]} columns, but scaler was fitted on {len(self.scaler_columns)} columns.")  # type: ignore
+
+         return self.scaler.inverse_transform(data_np)
+
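A round-trip sketch for `denormalize`, continuing the toy `maker` above; a DataFrame input returns a DataFrame, arrays and tensors return arrays:

```python
restored = maker.denormalize(maker.features_train)  # back to the original scale
# Array path: columns must match the scaler's column count.
restored_arr = maker.denormalize(maker.features_train[maker.scaler_columns].to_numpy())
```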
+     def get_datasets(self) -> Tuple[_PytorchDataset, _PytorchDataset]:
+         """Primary method to get the final PyTorch Datasets."""
+         if not self._is_split:
+             raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")

-         Defaults: `float32` for features and `int64` for labels.
+         self._train_dataset = _PytorchDataset(self.features_train, self.labels_train)  # type: ignore
+         self._test_dataset = _PytorchDataset(self.features_test, self.labels_test)  # type: ignore

-         If `balance=True` attempts to balance the minority class(es) using Imbalanced-Learn's `SMOTETomek` algorithm.
-         Note: Only Train-Data should be balanced.
-         """
-         # Validate features
-         if not isinstance(features, (pandas.DataFrame, pandas.Series, numpy.ndarray)):
-             raise TypeError("features must be a numpy.ndarray, pandas.Series or pandas.DataFrame")
-         # Validate labels
-         if not isinstance(labels, (pandas.DataFrame, pandas.Series, numpy.ndarray)):
-             raise TypeError("labels must be a numpy.ndarray, pandas.Series or pandas.DataFrame")
-
-         # Balance classes
-         if balance:
-             features, labels = self.balance_classes(train_features=features, train_labels=labels)
+         return self._train_dataset, self._test_dataset

-         # Cast features
-         if isinstance(features, numpy.ndarray):
-             self.features = torch.tensor(features, dtype=features_dtype)
-         else:
-             self.features = torch.tensor(features.values, dtype=features_dtype)
+     def inspect_dataframes(self) -> Tuple[pandas.DataFrame, pandas.DataFrame, pandas.Series, pandas.Series]:
+         """Utility method to inspect the processed data as Pandas DataFrames."""
+         if not self._is_split:
+             raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
+         return self.features_train, self.features_test, self.labels_train, self.labels_test  # type: ignore
+
+     @staticmethod
+     def _embed_categorical(cat_df: pandas.DataFrame, random_state: Optional[int] = None, **kwargs) -> pandas.DataFrame:
+         """Internal helper to perform embedding on categorical features."""
+         embedded_tensors = []
+         new_columns = []
+         for col in cat_df.columns:
+             cat_series = cat_df[col].astype("category")
+             num_categories = len(cat_series.cat.categories)
+             embedding_dim = min(50, (num_categories + 1) // 2)
+
+             if random_state:
+                 torch.manual_seed(random_state)
+
+             embedder = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_dim, **kwargs)
+
+             with torch.no_grad():
+                 codes = torch.LongTensor(cat_series.cat.codes.values)
+                 embedded_tensors.append(embedder(codes))
+
+             new_columns.extend([f"{col}_{i+1}" for i in range(embedding_dim)])

-         # Cast labels
-         if isinstance(labels, numpy.ndarray):
-             self.labels = torch.tensor(labels, dtype=labels_dtype)
-         else:
-             self.labels = torch.tensor(labels.values, dtype=labels_dtype)
-
-     def __len__(self):
-         return len(self.features)
-
-     def __getitem__(self, index):
-         return self.features[index], self.labels[index]
+         with torch.no_grad():
+             full_tensor = torch.cat(embedded_tensors, dim=1)
+         return pandas.DataFrame(full_tensor.numpy(), columns=new_columns, index=cat_df.index)
+
+
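The `min(50, (num_categories + 1) // 2)` rule in `_embed_categorical` caps embedding width at 50. A quick check of what it yields:

```python
for n in (2, 5, 10, 99, 500):
    print(f"{n} categories -> {min(50, (n + 1) // 2)} embedding dims")
# 2 -> 1, 5 -> 3, 10 -> 5, 99 -> 50, 500 -> 50
```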
+ # --- VisionDatasetMaker ---
+ class VisionDatasetMaker(_BaseMaker):
+     """
+     Creates processed PyTorch datasets for computer vision tasks from an
+     image folder directory.

+     Uses online augmentations per epoch (image augmentation without creating new files).
+     """
+     def __init__(self, full_dataset: ImageFolder):
+         super().__init__()
+         self.full_dataset = full_dataset
+         self.labels = [s[1] for s in self.full_dataset.samples]
+         self.class_map = full_dataset.class_to_idx
+
+         self._is_split = False
+         self._are_transforms_configured = False
+
+     @classmethod
+     def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
+         """Creates a maker instance from a root directory of images."""
+         initial_transform = transforms.Compose([transforms.ToTensor()])
+         full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
+         _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
+         return cls(full_dataset)
+
      @staticmethod
-     def balance_classes(train_features, train_labels, **kwargs):
+     def inspect_folder(path: Union[str, Path]):
          """
-         Attempts to balance the minority class(es) using Imbalanced-Learn's `SMOTETomek` algorithm.
+         Logs a report of the types, sizes, and channels of image files
+         found in the directory and its subdirectories.
          """
-         resampler = SMOTETomek(**kwargs)
-         X, y = resampler.fit_resample(X=train_features, y=train_labels)
-
-         return X, y
+         path_obj = Path(path)
+         if not path_obj.is_dir():
+             _LOGGER.error(f"Path is not a valid directory: {path_obj}")
+             return

+         non_image_files = set()
+         img_types = set()
+         img_sizes = set()
+         img_channels = set()
+         img_counter = 0

- def make_vision_dataset(inputs: Union[list[Image.Image], numpy.ndarray, str], labels: Union[list[int], numpy.ndarray, None], resize: int=256,
-                         transform: Union[transforms.Compose, None]=None, test_set: bool=False):
-     """
-     Make a Torchvision Dataset of images to be used in a Convolutional Neural Network.
-
-     If no transform object is given, Images will undergo the following transformations by default: `RandomHorizontalFlip`, `RandomRotation`,
-     `Resize`, `CenterCrop`, `ToTensor`, `Normalize`. Except if 'test_set=True'.
+         _LOGGER.info(f"Inspecting folder: {path_obj}...")
+         # Use rglob to recursively find all files
+         for filepath in path_obj.rglob('*'):
+             if filepath.is_file():
+                 try:
+                     # Using PIL to open is a more reliable check
+                     with Image.open(filepath) as img:
+                         img_types.add(img.format)
+                         img_sizes.add(img.size)
+                         img_channels.update(img.getbands())
+                         img_counter += 1
+                 except (IOError, SyntaxError):
+                     non_image_files.add(filepath.name)

-     Args:
-     `inputs`: List of PIL Image objects | Numpy array of image arrays | Path to root directory containing subdirectories that classify image files.
+         if non_image_files:
+             _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
+
+         report = (
+             f"\n--- Inspection Report for '{path_obj.name}' ---\n"
+             f"Total images found: {img_counter}\n"
+             f"Image formats: {img_types or 'None'}\n"
+             f"Image sizes (WxH): {img_sizes or 'None'}\n"
+             f"Image channels (bands): {img_channels or 'None'}\n"
+             f"--------------------------------------"
+         )
+         _LOGGER.info(report)
+
+     def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
+                    stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
+         """Splits the dataset into training, validation, and optional test sets."""
+         if self._is_split:
+             _LOGGER.warning("Data has already been split.")
+             return self
+
+         if val_size + test_size >= 1.0:
+             raise ValueError("The sum of val_size and test_size must be less than 1.")
+
+         indices = list(range(len(self.full_dataset)))
+         labels_for_split = self.labels if stratify else None
+
+         train_indices, val_test_indices = train_test_split(
+             indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
+         )
+
+         if test_size > 0:
+             val_test_labels = [self.labels[i] for i in val_test_indices]
+             stratify_val_test = val_test_labels if stratify else None
+             val_indices, test_indices = train_test_split(
+                 val_test_indices, test_size=(test_size / (val_size + test_size)),
+                 random_state=random_state, stratify=stratify_val_test
+             )
+             self._test_dataset = Subset(self.full_dataset, test_indices)
+             _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
+         else:
+             val_indices = val_test_indices

-     `labels`: List of integer values | Numpy array of labels. Labels size must match `inputs` size.
-     If a path to a directory is given, then `labels` must be None.
+         self._train_dataset = Subset(self.full_dataset, train_indices)
+         self._val_dataset = Subset(self.full_dataset, val_indices)
+         self._is_split = True

-     `transform`: Custom transformations to use. If None, use default transformations.
+         _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
+         return self
+
+     def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
+                              mean: List[float] = [0.485, 0.456, 0.406],
+                              std: List[float] = [0.229, 0.224, 0.225],
+                              extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
+         """Configures and applies the image transformations (augmentations)."""
+         if not self._is_split:
+             raise RuntimeError("Transforms must be configured AFTER splitting data. Call .split_data() first.")
+
+         base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
+         if extra_train_transforms:
+             base_train_transforms.extend(extra_train_transforms)

-     `test_set`: Flip, rotation and center-crop transformations will not be applied.
+         final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
+
+         val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
+         train_transform = transforms.Compose([*base_train_transforms, *final_transforms])

-     Returns:
-     `Dataset`: Either a `TensorDataset` or `ImageFolder` instance, depending on the method used.
-     Data dimensions: (samples, color channels, height, width).
+         self._train_dataset.dataset.transform = train_transform  # type: ignore
+         self._val_dataset.dataset.transform = val_transform  # type: ignore
+         if self._test_dataset:
+             self._test_dataset.dataset.transform = val_transform  # type: ignore
+
+         self._are_transforms_configured = True
+         _LOGGER.info("Image transforms configured and applied.")
+         return self
+
+     def get_datasets(self) -> Tuple[Dataset, ...]:
+         """Returns the final train, validation, and optional test datasets."""
+         if not self._is_split:
+             raise RuntimeError("Data has not been split. Call .split_data() first.")
+         if not self._are_transforms_configured:
+             _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
+
+         if self._test_dataset:
+             return self._train_dataset, self._val_dataset, self._test_dataset
+         return self._train_dataset, self._val_dataset
+
+
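The vision maker follows the same fluent pattern end to end. A sketch assuming an ImageFolder-style layout (`data/train_images/<class_name>/*.jpg`; the path is hypothetical):

```python
maker = (VisionDatasetMaker.from_folder("data/train_images")
         .split_data(val_size=0.2, test_size=0.1, random_state=42)
         .configure_transforms(resize_size=256, crop_size=224))

train_ds, val_ds, test_ds = maker.get_datasets()  # 3 datasets because test_size > 0
```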
+ # --- SequenceMaker ---
+ class SequenceMaker(_BaseMaker):
      """
-     # Validate inputs
-     if not isinstance(inputs, (list, numpy.ndarray, str)):
-         raise TypeError("Inputs must be one of the following:\n\ta) List of PIL Image objects.\n\tb) Numpy array of 2D or 3D arrays.\
-             \n\tc) Directory path to image files.")
-     # Validate labels
-     if not (isinstance(labels, (list, numpy.ndarray)) or labels is None):
-         raise TypeError("Inputs must be one of the following:\n\ta) List of labels (integers).\n\tb) Numpy array of 2D or 3D arrays.\
-             \n\tc) None if inputs path is given.\nLabels size must match Inputs size.")
-     # Validate resize shape
-     if not isinstance(resize, int):
-         raise TypeError("Resize must be an integer value for a square image of shape (W, H).")
-     # Validate transform
-     if isinstance(transform, transforms.Compose):
-         pass
-     elif transform is None:
-         if test_set:
-             transform = transforms.Compose([
-                 transforms.Resize(size=(resize,resize)),
-                 transforms.ToTensor(),
-                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-             ])
-         else:
-             transform = transforms.Compose([
-                 transforms.RandomHorizontalFlip(p=0.5),
-                 transforms.RandomRotation(degrees=30),
-                 transforms.Resize(size=(int(resize*1.2),int(resize*1.2))),
-                 transforms.CenterCrop(size=resize),
-                 transforms.ToTensor(),
-                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-             ])
-     else:
-         raise TypeError("Transform must be a `torchvision.transforms.Compose` object or None to use a default transform.")
+     Creates windowed PyTorch datasets from time-series data.

-     # Start-o
-     dataset = None
+     Pipeline:

-     # CASE A: input is a path to image files, Labels is None
-     if labels is None:
-         if isinstance(inputs, str):
-             dataset = ImageFolder(root=inputs, transform=transform)
+     1. `.split_data()`: Separate time series into training and testing portions.
+     2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
+     3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
+     """
+     def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
+         super().__init__()
+         self.sequence_length = sequence_length
+         self.scaler = None
+
+         if isinstance(data, pandas.DataFrame):
+             self.time_axis = data.index.values
+             self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
+         elif isinstance(data, pandas.Series):
+             self.time_axis = data.index.values
+             self.sequence = data.values.astype(numpy.float32)
+         elif isinstance(data, numpy.ndarray):
+             self.time_axis = numpy.arange(len(data))
+             self.sequence = data.astype(numpy.float32)
          else:
-             raise TypeError("Labels must be None if 'path' to inputs is provided. Labels will be inferred from subdirectory names in 'path'.")
-     # CASE B: input is Numpy array or a list of PIL Images. Labels is Numpy array or List of integers
-     elif not isinstance(inputs, str):
-         # Transform labels to tensor
-         labels_ = torch.tensor(labels, dtype=torch.int64)
-
-         # Transform each image to tensor
-         transformed = list()
-         for img_ in inputs:
-             transformed.append(transform(img_))
-         # Stack image tensors
-         features_ = torch.stack(transformed, dim=0)
-
-         # Make a dataset with images and labels
-         dataset = TensorDataset(features_, labels_)
-     else:
-         raise TypeError("Labels must be None if 'path' to inputs is provided. Labels will be inferred from subdirectory names in 'path'.")
-
-     return dataset
-
+             raise TypeError("Data must be a pandas DataFrame/Series or a numpy array.")
+
+         self.train_sequence = None
+         self.test_sequence = None
+
+         self._is_split = False
+         self._is_normalized = False
+         self._are_windows_generated = False

- class SequenceDataset():
-     def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_size: int, last_seq_test: bool=True,
-                  seq_labels: bool=True, normalize: Union[Literal["standard", "minmax"], None]="minmax"):
+     def normalize_data(self, method: Literal["standard", "minmax"] = "minmax") -> 'SequenceMaker':
+         """
+         Normalizes the sequence data. Must be called AFTER splitting to prevent data leakage from the test set.
          """
-         Make train/test datasets from a single timestamp sequence.
+         if not self._is_split:
+             raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")

-         Create an object containing 2 PyTorchDataset objects to be used in a Recurrent Neural Network:
+         if self.scaler:
+             _LOGGER.warning("Data has already been normalized.")
+             return self
+
+         if method == "standard":
+             self.scaler = StandardScaler()
+         elif method == "minmax":
+             self.scaler = MinMaxScaler(feature_range=(-1, 1))
+         else:
+             raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")
+
+         # Fit scaler ONLY on the training data
+         self.scaler.fit(self.train_sequence.reshape(-1, 1))  # type: ignore

-         1. Train Dataset
-         2. Test Dataset
+         # Transform both train and test data using the fitted scaler
+         self.train_sequence = self.scaler.transform(self.train_sequence.reshape(-1, 1)).flatten()  # type: ignore
+         self.test_sequence = self.scaler.transform(self.test_sequence.reshape(-1, 1)).flatten()  # type: ignore

-         To plot call the static method `plot()`.
+         self._is_normalized = True
+         _LOGGER.info(f"Sequence data normalized using {self.scaler.__class__.__name__}. Scaler was fit on the training set only.")
+         return self
+
+     def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
+         """Splits the sequence into training and testing portions."""
+         if self._is_split:
+             _LOGGER.warning("Data has already been split.")
+             return self
+
+         split_idx = int(len(self.sequence) * (1 - test_size))
+         self.train_sequence = self.sequence[:split_idx]
+         self.test_sequence = self.sequence[split_idx - self.sequence_length:]

-         If normalization is used, an scaler object will be stored.
-         The scaler object can be used to invert normalization on a Tensor/Array using the method `self.denormalize()`.
+         self.train_time_axis = self.time_axis[:split_idx]
+         self.test_time_axis = self.time_axis[split_idx:]

-         Args:
-         * `data`: Pandas Dataframe with 2 columns [datetime, sequence] | 1-column Dataframe or Series sequence, where index is the datetime.
-         * `sequence_size (int)`: Length of each subsequence that will be used for training.
-         * `last_seq_test (bool)`: Last sequence will be used as test_set, if false a dummy test set will be returned. Default is True.
-         * `seq_labels (bool)`: Labels will be returned as sequences, if false return single values for 1 future timestamp.
-         * `normalize`: Whether to normalize ('minmax'), standardize ('standard') or ignore (None). Default is 'minmax'.
+         self._is_split = True
+         _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
+         return self
+
+     def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
          """
-         # Validate data
-         if not isinstance(data, (pandas.Series, pandas.DataFrame, numpy.ndarray)):
-             raise TypeError("Data must be pandas dataframe, pandas series or numpy array.")
-         # Validate window size
-         if not isinstance(sequence_size, int):
-             raise TypeError("Sequence size must be an integer.")
-         elif len(data) % sequence_size != 0:
-             raise ValueError(f"data with length {len(data)} is not divisible in sequences of {sequence_size} values.")
-         # Validate test sequence
-         if not isinstance(last_seq_test, bool):
-             raise TypeError("Last sequence treated as Test-set must be True or False.")
-         # validate normalize
-         if not (normalize in ["standard", "minmax"] or normalize is None):
-             raise TypeError("normalize must be 'standard', 'minmax' or None.")
-
-         # Handle data -> array
-         self.time_axis = None
-         if isinstance(data, pandas.DataFrame):
-             if len(data.columns) == 2:
-                 self.sequence = data[data.columns[1]].values.astype("float")
-                 self.time_axis = data[data.columns[0]].values
-             elif len(data.columns) == 1:
-                 self.sequence = data[data.columns[0]].values.astype("float")
-                 self.time_axis = data.index.values
-             else:
-                 raise ValueError("Dataframe contains more than 2 columns.")
-         elif isinstance(data, pandas.Series):
-             self.sequence = data.values.astype("float")
-             self.time_axis = data.index.values
+         Generates overlapping windows for features and labels.
+
+         "sequence-to-sequence": Label vectors are of the same size as the feature vectors instead of a single future prediction.
+         """
+         if not self._is_split:
+             raise RuntimeError("Cannot generate windows before splitting data. Call .split_data() first.")
+
+         self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence)  # type: ignore
+         self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence)  # type: ignore
+
+         self._are_windows_generated = True
+         _LOGGER.info("Feature and label windows generated for train and test sets.")
+         return self
+
+     def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> _PytorchDataset:
+         """Efficiently creates windowed features and labels using numpy."""
+         if len(data) <= self.sequence_length:
+             raise ValueError("Data length must be greater than the sequence_length to create at least one window.")
+
+         if not use_sequence_labels:
+             features = data[:-1]
+             labels = data[self.sequence_length:]
+
+             n_windows = len(features) - self.sequence_length + 1
+             bytes_per_item = features.strides[0]
+             strided_features = numpy.lib.stride_tricks.as_strided(
+                 features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
+             )
+             return _PytorchDataset(strided_features, labels, labels_dtype=torch.float32)
+
          else:
-             self.sequence = data.astype("float")
+             x_data = data[:-1]
+             y_data = data[1:]

-         # Save last sequence
-         self._last_sequence = self.sequence[-sequence_size:]
-
-         # Last sequence as test
-         train_sequence = self.sequence
-         test_sequence = None
-         if last_seq_test:
-             test_sequence = self.sequence[-(sequence_size*2):]
-             train_sequence = self.sequence[:-sequence_size]
-
-         # Normalize values
-         norm_train_sequence = train_sequence
-         norm_test_sequence = test_sequence
-         if normalize is not None:
-             # Define scaler
-             if normalize == "standard":
-                 self.scaler = StandardScaler()
-             elif normalize == "minmax":
-                 self.scaler = MinMaxScaler(feature_range=(-1,1))
-             # Scale and transform training set + reshape
-             self.scaler.fit(train_sequence.reshape(-1,1))
-             norm_train_sequence = self.scaler.transform(train_sequence.reshape(-1,1))
-             norm_train_sequence = norm_train_sequence.reshape(-1)
-             # Scale test if it exists + reshape
-             if last_seq_test:
-                 norm_test_sequence = self.scaler.transform(test_sequence.reshape(-1,1))
-                 norm_test_sequence = norm_test_sequence.reshape(-1)
-
-         # Divide train sequence into subsequences
-         train_features_list = list()
-         train_labels_list = list()
-         train_size = len(norm_train_sequence)
-         for i in range(train_size - sequence_size - 1):
-             subsequence = norm_train_sequence[i:sequence_size + i]
-             train_features_list.append(subsequence.reshape(1,-1))
-             # Labels as sequence
-             if seq_labels:
-                 label = norm_train_sequence[i + 1:sequence_size + i + 1]
-                 train_labels_list.append(label.reshape(1,-1))
-             # Single value label
-             else:
-                 label = norm_train_sequence[sequence_size + i + 1]
-                 train_labels_list.append(label)
+             n_windows = len(x_data) - self.sequence_length + 1
+             bytes_per_item = x_data.strides[0]

-         # Divide test sequence into subsequences
-         if last_seq_test:
-             test_features_list = list()
-             test_labels_list = list()
-             test_size = len(norm_test_sequence)
-             for i in range(test_size - sequence_size - 1):
-                 subsequence = norm_test_sequence[i:sequence_size + i]
-                 test_features_list.append(subsequence.reshape(1,-1))
-                 # Labels as sequence
-                 if seq_labels:
-                     label = norm_test_sequence[i + 1:sequence_size + i + 1]
-                     test_labels_list.append(label.reshape(1,-1))
-                 # Single value label
-                 else:
-                     label = norm_test_sequence[sequence_size + i + 1]
-                     test_labels_list.append(label)
+             strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
+             strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))

-         # Create training arrays then cast to pytorch dataset
-         train_features = numpy.concatenate(train_features_list, axis=0)
-         # Check if labels are a sequence
-         if seq_labels:
-             train_labels = numpy.concatenate(train_labels_list, axis=0)
-         else:
-             train_labels = numpy.array(train_labels_list).reshape(-1,1)
-         self.train_dataset = PytorchDataset(features=train_features, labels=train_labels, labels_dtype=torch.float32)
-
-         # Create test arrays then cast to pytorch dataset
-         if last_seq_test:
-             test_features = numpy.concatenate(test_features_list, axis=0)
-             # Check if labels are a sequence
-             if seq_labels:
-                 test_labels = numpy.concatenate(test_labels_list, axis=0)
-             else:
-                 test_labels = numpy.array(test_labels_list).reshape(-1,1)
-             self.test_dataset = PytorchDataset(features=test_features, labels=test_labels, labels_dtype=torch.float32)
-         else:
-             self.test_dataset = PytorchDataset(features=numpy.ones(shape=(10, sequence_size)), labels=numpy.ones(shape=(10,1)), labels_dtype=torch.float32)
+             return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
+
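The `as_strided` call above materializes every overlapping window without copying the underlying buffer. The same idea on a toy array with `sequence_length=3`:

```python
import numpy

data = numpy.arange(6, dtype=numpy.float32)  # [0, 1, 2, 3, 4, 5]
seq_len = 3

features = data[:-1]       # windows come from [0, 1, 2, 3, 4]
labels = data[seq_len:]    # one-step-ahead targets: [3, 4, 5]

n_windows = len(features) - seq_len + 1
step = features.strides[0]
windows = numpy.lib.stride_tricks.as_strided(
    features, shape=(n_windows, seq_len), strides=(step, step)
)
# windows -> [[0,1,2], [1,2,3], [2,3,4]]; window [0,1,2] pairs with label 3.
```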
+     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
+         """Applies inverse transformation using the stored scaler."""
+         if self.scaler is None:
+             raise RuntimeError("Data was not normalized. Cannot denormalize.")

-         # Attempt to plot the sequence
-         if self.time_axis is not None:
-             try:
-                 self.plot(self.time_axis, self.sequence)
-             except:
-                 print("Plot failed, try it manually to find the problem.")
+         if isinstance(data, torch.Tensor):
+             data_np = data.cpu().detach().numpy()
+         else:
+             data_np = data
+
+         return self.scaler.inverse_transform(data_np.reshape(-1, 1)).flatten()

-     @staticmethod
-     def plot(x_axis, y_axis, x_pred=None, y_pred=None):
-         """
-         Plot Time-values (X) Vs Data-values (Y).
-         """
-         plt.figure(figsize=(12,5))
-         plt.title('Sequence')
+     def plot(self, predictions: Optional[numpy.ndarray] = None):
+         """Plots the original training and testing data, with optional predictions."""
+         if not self._is_split:
+             raise RuntimeError("Cannot plot before splitting data. Call .split_data() first.")
+
+         plt.figure(figsize=(15, 6))
+         plt.title("Time Series Data")
          plt.grid(True)
-         plt.autoscale(axis='x', tight=True)
-         plt.plot(x_axis, y_axis)
-         if x_pred is not None and y_pred is not None:
-             plt.plot(x_pred, y_pred)
-         plt.show()
+         plt.xlabel("Time")
+         plt.ylabel("Value")

-     def denormalize(self, input: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-         """
-         Applies the inverse transformation of the object's stored scaler to a tensor or array.
+         plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data')  # type: ignore
+         plt.plot(self.test_time_axis, self.scaler.inverse_transform(self.test_sequence[self.sequence_length-1:].reshape(-1, 1)), label='Test Data')  # type: ignore

-         Args:
-         `input`: Tensor/Array predicted using the current sequence.
-
-         Returns: numpy.ndarray with default index.
-         """
-         if isinstance(input, torch.Tensor):
-             with torch.no_grad():
-                 array = input.numpy().reshape(-1,1)
-         elif isinstance(input, numpy.ndarray):
-             array = input.reshape(-1,1)
-         else:
-             raise TypeError("Input must be a Pytorch tensor or Numpy array.")
-         return self.scaler.inverse_transform(array)
-
-     def get_last_sequence(self, normalize: bool=True, to_tensor: bool=True):
-         """
-         Returns the last subsequence of the sequence.
+         if predictions is not None:
+             pred_time_axis = self.test_time_axis[:len(predictions)]
+             plt.plot(pred_time_axis, predictions, label='Predictions', c='red')

-         Args:
-         `normalize`: Normalize using the object's stored scaler. Defaults to True.
-
-         `to_tensor`: Cast to Pytorch tensor. Defaults to True.
+         plt.legend()
+         plt.show()

-         Returns: numpy.ndarray or torch.Tensor
-         """
-         last_seq = self._last_sequence.reshape(-1,1)
-         if normalize:
-             last_seq = self.scaler.transform(last_seq)
-         if to_tensor:
-             last_seq = torch.Tensor(last_seq)
-         return last_seq
-
-     def __len__(self):
-         return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
+     def get_datasets(self) -> Tuple[_PytorchDataset, _PytorchDataset]:
+         """Returns the final train and test datasets."""
+         if not self._are_windows_generated:
+             raise RuntimeError("Windows have not been generated. Call .generate_windows() first.")
+         return self._train_dataset, self._test_dataset
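Putting the sequence pipeline together in the order the class docstring prescribes (toy sine-wave data; the window length is arbitrary):

```python
import numpy
import pandas

series = pandas.Series(numpy.sin(numpy.linspace(0, 20, 200)))

maker = (SequenceMaker(series, sequence_length=12)
         .split_data(test_size=0.2)
         .normalize_data(method="minmax")
         .generate_windows())

train_ds, test_ds = maker.get_datasets()
first_window = maker.denormalize(train_ds[0][0])  # invert scaling on any tensor
```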
  def info():