dragon-ml-toolbox 6.4.1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-7.0.0.dist-info}/METADATA +3 -1
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-7.0.0.dist-info}/RECORD +12 -11
- ml_tools/ML_datasetmaster.py +197 -399
- ml_tools/ML_evaluation.py +115 -50
- ml_tools/ML_inference.py +27 -5
- ml_tools/ML_models.py +472 -47
- ml_tools/ML_scaler.py +197 -0
- ml_tools/ML_trainer.py +81 -5
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-7.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-7.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-7.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-7.0.0.dist-info}/top_level.txt +0 -0
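
The headline change in 7.0.0 is the new `ml_tools/ML_scaler.py` module (+197 lines): the sklearn scalers (`StandardScaler`, `MinMaxScaler`) and the `imblearn` balancing step are dropped from `ML_datasetmaster.py` in favor of a torch-native `PytorchScaler`. The sketch below is inferred solely from the calls visible in this diff (`PytorchScaler.fit(...)`, `.transform(...)`, `.inverse_transform(...)`, `.save(...)`, and the `.mean_` attribute); the exact signatures live in `ML_scaler.py`, which is not shown here.

```python
import torch
from torch.utils.data import TensorDataset
from ml_tools.ML_scaler import PytorchScaler  # new module in 7.0.0

# Hypothetical round-trip, assuming fit() accepts any (features, labels)
# Dataset; this diff only ever passes it the internal _PytorchDataset.
features = torch.rand(100, 3)
dataset = TensorDataset(features, torch.zeros(100))

scaler = PytorchScaler.fit(dataset, continuous_feature_indices=[0, 1, 2])
scaled = scaler.transform(features)           # standardize selected columns
restored = scaler.inverse_transform(scaled)   # map back to the original scale
scaler.save("scaler_target.pth")              # persist state as a .pth file
```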
ml_tools/ML_datasetmaster.py
CHANGED
@@ -1,68 +1,29 @@
 import torch
 from torch.utils.data import Dataset, Subset
-from torch import nn
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from typing import Literal, Union, Tuple, List, Optional
-from imblearn.combine import SMOTETomek
 from abc import ABC, abstractmethod
 from PIL import Image, ImageOps
 from torchvision.datasets import ImageFolder
 from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path
-from .path_manager import make_fullpath
+from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
+from .ML_scaler import PytorchScaler
 
-
-# --- public-facing API ---
 __all__ = [
     "DatasetMaker",
-    "SimpleDatasetMaker",
     "VisionDatasetMaker",
     "SequenceMaker",
     "ResizeAspectFill",
 ]
 
 
-# --- Custom Vision Transform Class ---
-class ResizeAspectFill:
-    """
-    Custom transformation to make an image square by padding it to match the
-    longest side, preserving the aspect ratio. The image is finally centered.
-
-    Args:
-        pad_color (Union[str, int]): Color to use for the padding.
-            Defaults to "black".
-    """
-    def __init__(self, pad_color: Union[str, int] = "black") -> None:
-        self.pad_color = pad_color
-
-    def __call__(self, image: Image.Image) -> Image.Image:
-        if not isinstance(image, Image.Image):
-            raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
-
-
 # --- Internal Helper Class ---
 class _PytorchDataset(Dataset):
     """
@@ -71,8 +32,13 @@ class _PytorchDataset(Dataset):
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
                  labels: Union[numpy.ndarray, pandas.Series],
-
-
+                 labels_dtype: torch.dtype,
+                 features_dtype: torch.dtype = torch.float32):
+        """
+        integer labels for classification.
+
+        float labels for regression.
+        """
 
         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
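
`_PytorchDataset` now takes an explicit `labels_dtype` instead of an implicit default. The convention used throughout the rest of the file: `torch.int64` for classification labels, `torch.float32` for regression. A minimal illustration (the class is internal; this only mirrors how the makers below call it):

```python
import numpy
import torch

features = numpy.random.rand(8, 4)

# Classification: integer class codes -> torch.int64 labels.
cls_ds = _PytorchDataset(features, numpy.random.randint(0, 3, size=8),
                         labels_dtype=torch.int64)

# Regression: continuous targets -> torch.float32 labels.
reg_ds = _PytorchDataset(features, numpy.random.rand(8),
                         labels_dtype=torch.float32)
```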
@@ -91,337 +57,41 @@ class _PytorchDataset(Dataset):
         return self.features[index], self.labels[index]
 
 
-# --- Private Base Class ---
-class _BaseMaker(ABC):
-    """
-    Abstract Base Class for all dataset makers.
-    Ensures a consistent API across the library.
-    """
-    def __init__(self):
-        self._train_dataset = None
-        self._test_dataset = None
-        self._val_dataset = None
-
-    @abstractmethod
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """
-        The primary method to retrieve the final, processed PyTorch datasets.
-        Must be implemented by all subclasses.
-        """
-        pass
-
-
-# --- Refactored DatasetMaker ---
-class DatasetMaker(_BaseMaker):
-    """
-    Creates processed PyTorch datasets from a Pandas DataFrame using a fluent, step-by-step interface.
-
-    Recommended pipeline:
-
-    - Full Control (step-by-step):
-        1. Process categorical features `.process_categoricals()`
-        2. Split train-test datasets `.split_data()`
-        3. Normalize continuous features `.normalize_continuous()`; `.denormalize()` becomes available.
-        4. [Optional][Classification only] Balance classes `.balance_data()`
-        5. Get PyTorch datasets: `train, test = .get_datasets()`
-        6. [Optional] Inspect the processed data as DataFrames `X_train, X_test, y_train, y_test = .inspect_dataframes()`
-
-    - Automated (single call):
-    ```python
-    maker = DatasetMaker(df, label_col='target')
-    maker.auto_process() # uses simplified arguments
-    train_ds, test_ds = maker.get_datasets()
-    ```
-    """
-    def __init__(self, pandas_df: pandas.DataFrame, label_col: str, kind: Literal["regression", "classification"]):
-        super().__init__()
-        if not isinstance(pandas_df, pandas.DataFrame):
-            raise TypeError("Input must be a pandas.DataFrame.")
-        if label_col not in pandas_df.columns:
-            raise ValueError(f"Label column '{label_col}' not found in DataFrame.")
-
-        self.kind = kind
-        self.labels = pandas_df[label_col]
-        self.features = pandas_df.drop(columns=label_col)
-        self.labels_map = None
-        self.scaler = None
-
-        self._feature_names = self.features.columns.tolist()
-        self._target_name = str(self.labels.name)
-
-        self._is_split = False
-        self._is_balanced = False
-        self._is_normalized = False
-        self._is_categoricals_processed = False
-
-        self.features_train = None
-        self.features_test = None
-        self.labels_train = None
-        self.labels_test = None
-
-        self.continuous_columns = None
-
-    def process_categoricals(self, method: Literal["one-hot", "embed"] = "one-hot",
-                             cat_features: Union[list[str], None] = None, **kwargs) -> 'DatasetMaker':
-        """
-        Encodes categorical features using the specified method.
-
-        Args:
-            method (str, optional): 'one-hot' (default) or 'embed'.
-            cat_features (list, optional): A list of categorical column names.
-                If None, they will be inferred from the DataFrame's dtypes.
-            **kwargs: Additional keyword arguments to pass to the underlying
-                pandas.get_dummies() or torch.nn.Embedding() functions.
-                For 'one-hot' encoding, it is often recommended to add
-                `drop_first=True` to help reduce multicollinearity.
-        """
-        if self._is_split:
-            raise RuntimeError("Categoricals must be processed before splitting data to avoid data leakage.")
-
-        if cat_features is None:
-            cat_columns = self.features.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
-        else:
-            cat_columns = cat_features
-
-        if not cat_columns:
-            _LOGGER.info("No categorical features to process.")
-            self._is_categoricals_processed = True
-            return self
-
-        continuous_df = self.features.drop(columns=cat_columns)
-        # store continuous column names
-        self.continuous_columns = continuous_df.columns.tolist()
-
-        categorical_df = self.features[cat_columns].copy()
-
-        if method == "one-hot":
-            processed_cats = pandas.get_dummies(categorical_df, dtype=numpy.int32, **kwargs)
-        elif method == "embed":
-            processed_cats = self._embed_categorical(categorical_df, **kwargs)
-        else:
-            raise ValueError("`method` must be 'one-hot' or 'embed'.")
-
-        self.features = pandas.concat([continuous_df, processed_cats], axis=1)
-        self._is_categoricals_processed = True
-        _LOGGER.info("Categorical features processed.")
-        return self
-
-    def normalize_continuous(self, method: Literal["standard", "minmax"] = "standard") -> 'DatasetMaker':
-        """Normalizes all numeric features and saves the scaler."""
-        if not self._is_split:
-            raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
-        if self._is_normalized:
-            _LOGGER.warning("⚠️ Data has already been normalized.")
-            return self
-
-        # Use continuous features columns
-        self.scaler_columns = self.continuous_columns
-        if not self.scaler_columns:
-            _LOGGER.info("No continuous features to normalize.")
-            self._is_normalized = True
-            return self
-
-        if method == "standard":
-            self.scaler = StandardScaler()
-        elif method == "minmax":
-            self.scaler = MinMaxScaler()
-        else:
-            raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")
-
-        # Fit on training data only, then transform both
-        self.features_train[self.scaler_columns] = self.scaler.fit_transform(self.features_train[self.scaler_columns]) # type: ignore
-        self.features_test[self.scaler_columns] = self.scaler.transform(self.features_test[self.scaler_columns]) # type: ignore
-
-        self._is_normalized = True
-        _LOGGER.info(f"Continuous features normalized using {self.scaler.__class__.__name__}. Scaler stored in `self.scaler`.")
-        return self
-
-    def split_data(self, test_size: float = 0.2, stratify: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
-        """Splits the data into training and testing sets."""
-        if self._is_split:
-            _LOGGER.warning("⚠️ Data has already been split.")
-            return self
-
-        if self.labels.dtype == 'object' or self.labels.dtype.name == 'category':
-            labels_numeric = self.labels.astype("category")
-            self.labels_map = {code: val for code, val in enumerate(labels_numeric.cat.categories)}
-            self.labels = pandas.Series(labels_numeric.cat.codes, index=self.labels.index)
-            _LOGGER.info("Labels have been encoded. Mapping stored in `self.labels_map`.")
-
-        stratify_array = self.labels if stratify else None
-
-        self.features_train, self.features_test, self.labels_train, self.labels_test = train_test_split(
-            self.features, self.labels, test_size=test_size, random_state=random_state, stratify=stratify_array
-        )
-
-        self._is_split = True
-        _LOGGER.info(f"Data split into training ({len(self.features_train)} samples) and testing ({len(self.features_test)} samples).")
-        return self
-
-    def balance_data(self, resampler=None, **kwargs) -> 'DatasetMaker':
-        """
-        Only useful for classification tasks.
-
-        Balances the training data using a specified resampler.
-
-        Defaults to `SMOTETomek`.
-        """
-        if not self._is_split:
-            raise RuntimeError("❌ Cannot balance data before it has been split. Call .split_data() first.")
-        if self._is_balanced:
-            _LOGGER.warning("⚠️ Training data has already been balanced.")
-            return self
-
-        if resampler is None:
-            resampler = SMOTETomek(**kwargs)
-
-        _LOGGER.info(f"Balancing training data with {resampler.__class__.__name__}...")
-        self.features_train, self.labels_train = resampler.fit_resample(self.features_train, self.labels_train) # type: ignore
-
-        self._is_balanced = True
-        _LOGGER.info(f"Balancing complete. New training set size: {len(self.features_train)} samples.")
-        return self
-
-    def auto_process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
-                     balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
-        """Runs a standard, fully automated preprocessing pipeline."""
-        _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
-        self.process_categoricals(method=cat_method)
-        self.split_data(test_size=test_size, stratify=True, random_state=random_state)
-        self.normalize_continuous(method=normalize_method)
-        if balance:
-            self.balance_data()
-        _LOGGER.info("--- 🤖 Automated Processing Complete ---")
-        return self
-
-    def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:
-        """
-        Applies inverse transformation to denormalize data, preserving DataFrame
-        structure if provided.
-
-        Args:
-            data: The normalized data to be transformed back to its original scale.
-                Can be a PyTorch Tensor, NumPy array, or Pandas DataFrame.
-                If a DataFrame, it must contain the columns that were originally scaled.
-
-        Returns:
-            The denormalized data. Returns a Pandas DataFrame if the input was a
-            DataFrame, otherwise returns a NumPy array.
-        """
-        if self.scaler is None:
-            raise RuntimeError("Data was not normalized. Cannot denormalize.")
-
-        if isinstance(data, pandas.DataFrame):
-            # If input is a DataFrame, denormalize in place and return a copy
-            if not all(col in data.columns for col in self.scaler_columns): # type: ignore
-                raise ValueError(f"Input DataFrame is missing one or more required columns for denormalization. Required: {self.scaler_columns}")
-
-            output_df = data.copy()
-            output_df[self.scaler_columns] = self.scaler.inverse_transform(data[self.scaler_columns]) # type: ignore
-            return output_df
-
-        # Handle tensor or numpy array input
-        if isinstance(data, torch.Tensor):
-            data_np = data.cpu().numpy()
-        else: # It's already a numpy array
-            data_np = data
-
-        if data_np.ndim == 1:
-            data_np = data_np.reshape(-1, 1)
-
-        if data_np.shape[1] != len(self.scaler_columns): # type: ignore
-            raise ValueError(f"Input array has {data_np.shape[1]} columns, but scaler was fitted on {len(self.scaler_columns)} columns.") # type: ignore
-
-        return self.scaler.inverse_transform(data_np)
-
-    def get_datasets(self) -> Tuple[Dataset, Dataset]:
-        """Primary method to get the final PyTorch Datasets."""
-        if not self._is_split:
-            raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
-
-        label_dtype = torch.float32 if self.kind == "regression" else torch.int64
-
-        self._train_dataset = _PytorchDataset(self.features_train, self.labels_train, labels_dtype=label_dtype) # type: ignore
-        self._test_dataset = _PytorchDataset(self.features_test, self.labels_test, labels_dtype=label_dtype) # type: ignore
-
-        return self._train_dataset, self._test_dataset
-
-    def inspect_dataframes(self) -> Tuple[pandas.DataFrame, pandas.DataFrame, pandas.Series, pandas.Series]:
-        """Utility method to inspect the processed data as Pandas DataFrames."""
-        if not self._is_split:
-            raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
-        return self.features_train, self.features_test, self.labels_train, self.labels_test # type: ignore
-
-    @property
-    def feature_names(self) -> list[str]:
-        """Returns the list of feature column names."""
-        return self._feature_names
-
-    @property
-    def target_name(self) -> str:
-        """Returns the name of the target column."""
-        return self._target_name
-
-    def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
-        """Saves a list of feature names as a text file"""
-        save_list_strings(list_strings=self._feature_names,
-                          directory=directory,
-                          filename="feature_names",
-                          verbose=verbose)
-
-    @staticmethod
-    def _embed_categorical(cat_df: pandas.DataFrame, random_state: Optional[int] = None, **kwargs) -> pandas.DataFrame:
-        """Internal helper to perform embedding on categorical features."""
-        embedded_tensors = []
-        new_columns = []
-        for col in cat_df.columns:
-            cat_series = cat_df[col].astype("category")
-            num_categories = len(cat_series.cat.categories)
-            embedding_dim = min(50, (num_categories + 1) // 2)
-
-            if random_state:
-                torch.manual_seed(random_state)
-
-            embedder = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_dim, **kwargs)
-
-            with torch.no_grad():
-                codes = torch.LongTensor(cat_series.cat.codes.values)
-                embedded_tensors.append(embedder(codes))
-
-            new_columns.extend([f"{col}_{i+1}" for i in range(embedding_dim)])
-
-        with torch.no_grad():
-            full_tensor = torch.cat(embedded_tensors, dim=1)
-        return pandas.DataFrame(full_tensor.numpy(), columns=new_columns, index=cat_df.index)
-
-
 # Streamlined DatasetMaker version
-class SimpleDatasetMaker:
+class DatasetMaker:
     """
     A simplified dataset maker for pre-processed, numerical pandas DataFrames.
 
     This class takes a DataFrame, automatically splits it into training and
     testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column.
-
-
-
-
-
-
-
-
-
+    target variable is the last column. It can also create, apply, and
+    save a PytorchScaler for standardizing continuous features.
+
+    Attributes:
+        `scaler` -> PytorchScaler | None
+        `train_dataset` -> PyTorch Dataset
+        `test_dataset` -> PyTorch Dataset
+        `feature_names` -> list[str]
+        `target_name` -> str
+        `id` -> str | None
+
+    The ID can be manually set to any string if needed, it is `None` by default.
     """
-    def __init__(self,
+    def __init__(self,
+                 pandas_df: pandas.DataFrame,
+                 kind: Literal["regression", "classification"],
+                 test_size: float = 0.2,
+                 random_state: int = 42,
+                 scaler: Optional[PytorchScaler] = None,
+                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
         """
-
-
-
-
-
-
-
-        The ID can be manually set to any string if needed, it is `None` by default.
+        Args:
+            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
+            test_size (float): The proportion of the dataset to allocate to the test split.
+            random_state (int): The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
+            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
         """
         # Validation
         if not isinstance(pandas_df, pandas.DataFrame):
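
The old fluent `DatasetMaker` is gone; the class formerly named `SimpleDatasetMaker` takes over the `DatasetMaker` name and does everything in the constructor. A usage sketch based on the signature above (the DataFrame and column names are illustrative; the target must be the last column):

```python
import numpy
import pandas

df = pandas.DataFrame(numpy.random.rand(100, 3),
                      columns=["feat_a", "feat_b", "target"])

maker = DatasetMaker(
    df,
    kind="regression",
    test_size=0.2,
    random_state=42,
    continuous_feature_columns=["feat_a", "feat_b"],  # triggers scaler fitting
)
train_ds, test_ds = maker.train_dataset, maker.test_dataset
```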
@@ -438,6 +108,8 @@ class SimpleDatasetMaker:
 
         #set id
         self._id: Optional[str] = None
+        # set scaler
+        self.scaler = scaler
 
         # 2. Split the data
         X_train, X_test, y_train, y_test = train_test_split(
@@ -448,12 +120,47 @@ class SimpleDatasetMaker:
         self._X_test_shape = X_test.shape
         self._y_train_shape = y_train.shape
         self._y_test_shape = y_test.shape
-
-        # 3. Convert to PyTorch Datasets with the correct label dtype
-        label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-
-
+        # 3. Handle Column to Index Conversion
+        continuous_feature_indices: Optional[List[int]] = None
+        if continuous_feature_columns:
+            if all(isinstance(c, str) for c in continuous_feature_columns):
+                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
+                try:
+                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
+                except KeyError as e:
+                    raise ValueError(f"Feature column '{e.args[0]}' not found in DataFrame.")
+            elif all(isinstance(c, int) for c in continuous_feature_columns):
+                continuous_feature_indices = continuous_feature_columns # type: ignore
+            else:
+                raise TypeError("`continuous_feature_columns` must be a list of all strings or all integers.")
+
+        # 4. Handle Scaling
+        X_train_values = X_train.values
+        X_test_values = X_test.values
+
+        # If no scaler is provided, fit a new one from the training data
+        if self.scaler is None:
+            if continuous_feature_indices:
+                _LOGGER.info("Feature indices provided. Fitting a new PytorchScaler on training data.")
+                # A temporary dataset is needed for the PytorchScaler.fit method
+                temp_label_dtype = torch.float32 if kind == "regression" else torch.int64
+                temp_train_ds = _PytorchDataset(X_train_values, y_train.values, labels_dtype=temp_label_dtype)
+                self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+
+        # If a scaler exists (either passed in or just fitted), apply it
+        if self.scaler and self.scaler.mean_ is not None:
+            _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+            X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+            X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
+            # Convert back to numpy for the _PytorchDataset class
+            X_train_values = X_train_tensor.numpy()
+            X_test_values = X_test_tensor.numpy()
+
+        # 5. Convert to final PyTorch Datasets
+        label_dtype = torch.float32 if kind == "regression" else torch.int64
+        self._train_ds = _PytorchDataset(X_train_values, y_train.values, labels_dtype=label_dtype)
+        self._test_ds = _PytorchDataset(X_test_values, y_test.values, labels_dtype=label_dtype)
 
     @property
     def train_dataset(self) -> Dataset:
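
Because fitting only happens when no scaler is passed in, statistics learned on one DataFrame can be reused on another, e.g. to keep a follow-up dataset on the same scale. A sketch (both DataFrames are illustrative):

```python
import numpy
import pandas

cols = ["a", "b", "y"]
df_a = pandas.DataFrame(numpy.random.rand(100, 3), columns=cols)
df_b = pandas.DataFrame(numpy.random.rand(50, 3), columns=cols)

# Fit a new PytorchScaler on df_a's training split...
maker_a = DatasetMaker(df_a, kind="regression",
                       continuous_feature_columns=["a", "b"])

# ...then pass it in: the constructor skips fitting and only transforms.
maker_b = DatasetMaker(df_b, kind="regression", scaler=maker_a.scaler)
```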
@@ -502,6 +209,47 @@ class SimpleDatasetMaker:
                           directory=directory,
                           filename="feature_names",
                           verbose=verbose)
+
+    def save_scaler(self, save_dir: Union[str, Path]):
+        """
+        Saves the fitted PytorchScaler's state to a .pth file.
+
+        The filename is automatically generated based on the target name.
+
+        Args:
+            save_dir (str | Path): The directory where the scaler will be saved.
+        """
+        if not self.scaler:
+            _LOGGER.error("❌ No scaler was fitted or provided.")
+            return
+
+        save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+        # Sanitize the target name for use in a filename
+        sanitized_target = sanitize_filename(self.target_name)
+        filename = f"scaler_{sanitized_target}.pth"
+
+        filepath = save_path / filename
+        self.scaler.save(filepath)
+
+
+# --- Private Base Class ---
+class _BaseMaker(ABC):
+    """
+    Abstract Base Class for extra dataset makers.
+    """
+    def __init__(self):
+        self._train_dataset = None
+        self._test_dataset = None
+        self._val_dataset = None
+
+    @abstractmethod
+    def get_datasets(self) -> Tuple[Dataset, ...]:
+        """
+        The primary method to retrieve the final, processed PyTorch datasets.
+        Must be implemented by all subclasses.
+        """
+        pass
 
 
 # --- VisionDatasetMaker ---
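
`save_scaler` derives its filename from the target column, so persisting the preprocessing artifacts is two calls. Continuing the sketch above (the directory name is illustrative):

```python
# Writes artifacts/scaler_<sanitized target name>.pth
maker_a.save_scaler("artifacts")

# Writes the feature-name list alongside it
maker_a.save_feature_names("artifacts")
```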
@@ -654,6 +402,7 @@ class SequenceMaker(_BaseMaker):
     1. `.split_data()`: Separate time series into training and testing portions.
     2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
     3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
+    4. `.get_datasets()`: Return Pytorch train and test datasets.
     """
     def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
         super().__init__()
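
The four steps chain, since each setup method returns `self`. A sketch with a synthetic series; `sequence_length` and `test_size` values are illustrative, and `generate_windows()` is called with defaults because its arguments are not shown in this diff:

```python
import numpy

series = numpy.sin(numpy.linspace(0, 20, 500))
maker = SequenceMaker(series, sequence_length=24)

train_ds, test_ds = (
    maker.split_data(test_size=0.2)  # 1. chronological split
         .normalize_data()           # 2. scaler fitted on the train portion
         .generate_windows()         # 3. windowed features/labels
         .get_datasets()             # 4. final PyTorch datasets
)
```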
@@ -679,33 +428,41 @@ class SequenceMaker(_BaseMaker):
         self._is_normalized = False
         self._are_windows_generated = False
 
-    def normalize_data(self, method: Literal["standard", "minmax"] = "standard") -> 'SequenceMaker':
+    def normalize_data(self) -> 'SequenceMaker':
         """
-        Normalizes the sequence data. Must be called AFTER
+        Normalizes the sequence data using PytorchScaler. Must be called AFTER
+        splitting to prevent data leakage from the test set.
         """
         if not self._is_split:
             raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")
-
+
         if self.scaler:
             _LOGGER.warning("⚠️ Data has already been normalized.")
             return self
-
-        if method == "standard":
-            self.scaler = StandardScaler()
-        elif method == "minmax":
-            self.scaler = MinMaxScaler(feature_range=(-1, 1))
-        else:
-            raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")
 
-        #
-
-
-
-
-
+        # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
+        # The scaler expects 2D data [n_samples, n_features].
+        train_features = self.train_sequence.reshape(-1, 1) # type: ignore
+
+        # _PytorchDataset needs labels, so we create dummy ones.
+        dummy_labels = numpy.zeros(len(train_features))
+        temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
+
+        # 2. Fit the PytorchScaler on the temporary training dataset.
+        # The sequence is a single feature, so its index is [0].
+        _LOGGER.info("Fitting PytorchScaler on the training data...")
+        self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
+
+        # 3. Transform sequences using the fitted scaler.
+        # The transform method requires a tensor, so we convert, transform, and convert back.
+        train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+        test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+
+        self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
+        self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
+
         self._is_normalized = True
-        _LOGGER.info(
+        _LOGGER.info("✅ Sequence data normalized using PytorchScaler.")
         return self
 
     def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
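
For reference, the transformation applied here is a plain z-score whose statistics come from the training portion only; the `DatasetMaker` docstring describes the scaler as "standardizing". A minimal torch equivalent of the intent (the exact std convention, biased vs. unbiased, is an internal detail of `PytorchScaler`):

```python
import torch

train = torch.tensor([1.0, 2.0, 3.0, 4.0])
test = torch.tensor([5.0, 6.0])

mean, std = train.mean(), train.std()
train_scaled = (train - mean) / std
test_scaled = (test - mean) / std  # test reuses the train statistics
```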
@@ -741,7 +498,7 @@ class SequenceMaker(_BaseMaker):
         _LOGGER.info("Feature and label windows generated for train and test sets.")
         return self
 
-    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) ->
+    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
         """Efficiently creates windowed features and labels using numpy."""
         if len(data) <= self.sequence_length:
             raise ValueError("Data length must be greater than the sequence_length to create at least one window.")
@@ -768,18 +525,25 @@ class SequenceMaker(_BaseMaker):
         strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
 
         return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
+
     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-        """Applies inverse transformation using the stored scaler."""
+        """Applies inverse transformation using the stored PytorchScaler."""
         if self.scaler is None:
             raise RuntimeError("Data was not normalized. Cannot denormalize.")
-
-
-
+
+        # Ensure data is a torch.Tensor
+        if isinstance(data, numpy.ndarray):
+            tensor_data = torch.tensor(data, dtype=torch.float32)
         else:
-
-
-
+            tensor_data = data
+
+        # Reshape for the scaler [n_samples, n_features]
+        if tensor_data.ndim == 1:
+            tensor_data = tensor_data.view(-1, 1)
+
+        # Apply inverse transform and convert back to a flat numpy array
+        original_scale_tensor = self.scaler.inverse_transform(tensor_data)
+        return original_scale_tensor.cpu().numpy().flatten()
 
     def plot(self, predictions: Optional[numpy.ndarray] = None):
         """Plots the original training and testing data, with optional predictions."""
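
Typical use of `denormalize`: model outputs produced in normalized space are mapped back before plotting or scoring. Continuing the `SequenceMaker` sketch above (the predictions tensor is a hypothetical stand-in):

```python
import torch

# Stand-in for predictions gathered from an evaluation loop.
preds_scaled = torch.randn(32)

# Accepts a tensor or numpy array; 1-D input is reshaped to
# [n_samples, 1] and a flat numpy array comes back in original units.
preds = maker.denormalize(preds_scaled)
```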
@@ -802,12 +566,46 @@ class SequenceMaker(_BaseMaker):
         plt.legend()
         plt.show()
 
-    def get_datasets(self) -> Tuple[
+    def get_datasets(self) -> Tuple[Dataset, Dataset]:
         """Returns the final train and test datasets."""
         if not self._are_windows_generated:
             raise RuntimeError("Windows have not been generated. Call .generate_windows() first.")
         return self._train_dataset, self._test_dataset
 
 
+# --- Custom Vision Transform Class ---
+class ResizeAspectFill:
+    """
+    Custom transformation to make an image square by padding it to match the
+    longest side, preserving the aspect ratio. The image is finally centered.
+
+    Args:
+        pad_color (Union[str, int]): Color to use for the padding.
+            Defaults to "black".
+    """
+    def __init__(self, pad_color: Union[str, int] = "black") -> None:
+        self.pad_color = pad_color
+
+    def __call__(self, image: Image.Image) -> Image.Image:
+        if not isinstance(image, Image.Image):
+            raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")
+
+        w, h = image.size
+        if w == h:
+            return image
+
+        # Determine padding to center the image
+        if w > h:
+            top_padding = (w - h) // 2
+            bottom_padding = w - h - top_padding
+            padding = (0, top_padding, 0, bottom_padding)
+        else: # h > w
+            left_padding = (h - w) // 2
+            right_padding = h - w - left_padding
+            padding = (left_padding, 0, right_padding, 0)
+
+        return ImageOps.expand(image, padding, fill=self.pad_color)
+
+
 def info():
     _script_info(__all__)
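
`ResizeAspectFill` itself is unchanged, only relocated to the bottom of the module. Since it is a plain callable, it slots into a torchvision pipeline ahead of the resize step; a sketch (the target size is illustrative):

```python
from torchvision import transforms

# Pad to square first so Resize does not distort the aspect ratio.
pipeline = transforms.Compose([
    ResizeAspectFill(pad_color="black"),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
```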