dragon-ml-toolbox 6.4.1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox has been flagged as potentially problematic.
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-8.0.0.dist-info}/METADATA +4 -1
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-8.0.0.dist-info}/RECORD +14 -11
- ml_tools/ML_datasetmaster.py +285 -438
- ml_tools/ML_evaluation.py +119 -51
- ml_tools/ML_evaluation_multi.py +296 -0
- ml_tools/ML_inference.py +251 -31
- ml_tools/ML_models.py +468 -47
- ml_tools/ML_scaler.py +197 -0
- ml_tools/ML_trainer.py +246 -73
- ml_tools/_ML_optimization_multi.py +231 -0
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-8.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-8.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-8.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-6.4.1.dist-info → dragon_ml_toolbox-8.0.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_datasetmaster.py
CHANGED
@@ -1,68 +1,30 @@
 import torch
 from torch.utils.data import Dataset, Subset
-from torch import nn
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from typing import Literal, Union, Tuple, List, Optional
-from imblearn.combine import SMOTETomek
 from abc import ABC, abstractmethod
 from PIL import Image, ImageOps
 from torchvision.datasets import ImageFolder
 from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path
-from .path_manager import make_fullpath
+from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
+from .ML_scaler import PytorchScaler

-
-# --- public-facing API ---
 __all__ = [
     "DatasetMaker",
-    "
+    "DatasetMakerMulti",
     "VisionDatasetMaker",
     "SequenceMaker",
     "ResizeAspectFill",
 ]


-# --- Custom Vision Transform Class ---
-class ResizeAspectFill:
-    """
-    Custom transformation to make an image square by padding it to match the
-    longest side, preserving the aspect ratio. The image is finally centered.
-
-    Args:
-        pad_color (Union[str, int]): Color to use for the padding.
-            Defaults to "black".
-    """
-    def __init__(self, pad_color: Union[str, int] = "black") -> None:
-        self.pad_color = pad_color
-
-    def __call__(self, image: Image.Image) -> Image.Image:
-        if not isinstance(image, Image.Image):
-            raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
-
-
 # --- Internal Helper Class ---
 class _PytorchDataset(Dataset):
     """
@@ -71,8 +33,13 @@ class _PytorchDataset(Dataset):
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
                  labels: Union[numpy.ndarray, pandas.Series],
-
-
+                 labels_dtype: torch.dtype,
+                 features_dtype: torch.dtype = torch.float32):
+        """
+        integer labels for classification.
+
+        float labels for regression.
+        """

         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
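Note: the now-explicit `labels_dtype` argument matters because PyTorch loss functions are dtype-sensitive — classification losses want integer class indices, regression losses want floats, which is what the truncated docstring fragments above refer to. A minimal, self-contained illustration in plain PyTorch (not package code):

import torch

# CrossEntropyLoss requires integer (torch.int64) class labels.
logits = torch.randn(4, 3)                                   # batch of 4, 3 classes
class_labels = torch.tensor([0, 2, 1, 2], dtype=torch.int64)
print(torch.nn.CrossEntropyLoss()(logits, class_labels))

# MSELoss requires float labels of matching shape.
preds = torch.randn(4, 1)
float_labels = torch.rand(4, 1, dtype=torch.float32)
print(torch.nn.MSELoss()(preds, float_labels))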
@@ -91,417 +58,247 @@ class _PytorchDataset(Dataset):
         return self.features[index], self.labels[index]


-# ---
-
+# --- Abstract Base Class (New) ---
+# --- Abstract Base Class (Corrected) ---
+class _BaseDatasetMaker(ABC):
     """
-    Abstract
-
+    Abstract base class for dataset makers. Contains shared logic for
+    splitting, scaling, and accessing datasets to reduce code duplication.
     """
     def __init__(self):
-        self.
-        self.
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raise ValueError(f"Label column '{label_col}' not found in DataFrame.")
-
-        self.kind = kind
-        self.labels = pandas_df[label_col]
-        self.features = pandas_df.drop(columns=label_col)
-        self.labels_map = None
-        self.scaler = None
-
-        self._feature_names = self.features.columns.tolist()
-        self._target_name = str(self.labels.name)
-
-        self._is_split = False
-        self._is_balanced = False
-        self._is_normalized = False
-        self._is_categoricals_processed = False
-
-        self.features_train = None
-        self.features_test = None
-        self.labels_train = None
-        self.labels_test = None
-
-        self.continuous_columns = None
-
-    def process_categoricals(self, method: Literal["one-hot", "embed"] = "one-hot",
-                             cat_features: Union[list[str], None] = None, **kwargs) -> 'DatasetMaker':
-        """
-        Encodes categorical features using the specified method.
-
-        Args:
-            method (str, optional): 'one-hot' (default) or 'embed'.
-            cat_features (list, optional): A list of categorical column names.
-                If None, they will be inferred from the DataFrame's dtypes.
-            **kwargs: Additional keyword arguments to pass to the underlying
-                pandas.get_dummies() or torch.nn.Embedding() functions.
-                For 'one-hot' encoding, it is often recommended to add
-                `drop_first=True` to help reduce multicollinearity.
-        """
-        if self._is_split:
-            raise RuntimeError("Categoricals must be processed before splitting data to avoid data leakage.")
-
-        if cat_features is None:
-            cat_columns = self.features.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
-        else:
-            cat_columns = cat_features
-
-        if not cat_columns:
-            _LOGGER.info("No categorical features to process.")
-            self._is_categoricals_processed = True
-            return self
-
-        continuous_df = self.features.drop(columns=cat_columns)
-        # store continuous column names
-        self.continuous_columns = continuous_df.columns.tolist()
-
-        categorical_df = self.features[cat_columns].copy()
-
-        if method == "one-hot":
-            processed_cats = pandas.get_dummies(categorical_df, dtype=numpy.int32, **kwargs)
-        elif method == "embed":
-            processed_cats = self._embed_categorical(categorical_df, **kwargs)
-        else:
-            raise ValueError("`method` must be 'one-hot' or 'embed'.")
-
-        self.features = pandas.concat([continuous_df, processed_cats], axis=1)
-        self._is_categoricals_processed = True
-        _LOGGER.info("Categorical features processed.")
-        return self
-
-    def normalize_continuous(self, method: Literal["standard", "minmax"] = "standard") -> 'DatasetMaker':
-        """Normalizes all numeric features and saves the scaler."""
-        if not self._is_split:
-            raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
-        if self._is_normalized:
-            _LOGGER.warning("⚠️ Data has already been normalized.")
-            return self
+        self._train_ds: Optional[Dataset] = None
+        self._test_ds: Optional[Dataset] = None
+        self.scaler: Optional[PytorchScaler] = None
+        self._id: Optional[str] = None
+        self._feature_names: List[str] = []
+        self._X_train_shape = (0,0)
+        self._X_test_shape = (0,0)
+        self._y_train_shape = (0,)
+        self._y_test_shape = (0,)
+
+    def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
+        """Internal helper to fit and apply a PytorchScaler."""
+        continuous_feature_indices: Optional[List[int]] = None
+        if continuous_feature_columns:
+            if all(isinstance(c, str) for c in continuous_feature_columns):
+                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
+                try:
+                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
+                except KeyError as e:
+                    raise ValueError(f"Feature column '{e.args[0]}' not found.")
+            elif all(isinstance(c, int) for c in continuous_feature_columns):
+                continuous_feature_indices = continuous_feature_columns # type: ignore
+            else:
+                raise TypeError("`continuous_feature_columns` must be a list of all strings or all integers.")
+
+        X_train_values = X_train.values
+        X_test_values = X_test.values
+
+        if self.scaler is None and continuous_feature_indices:
+            _LOGGER.info("Fitting a new PytorchScaler on training data.")
+            temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
+            self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+
+        if self.scaler and self.scaler.mean_ is not None:
+            _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+            X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+            X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
+            return X_train_tensor.numpy(), X_test_tensor.numpy()
+
+        return X_train_values, X_test_values

-
-
-        if not
-
-        self._is_normalized = True
-        return self
+    @property
+    def train_dataset(self) -> Dataset:
+        if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
+        return self._train_ds

-
-
-
-
-        else:
-            raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")
+    @property
+    def test_dataset(self) -> Dataset:
+        if self._test_ds is None: raise RuntimeError("Dataset not yet created.")
+        return self._test_ds

-
-
-
-
-        self._is_normalized = True
-        _LOGGER.info(f"Continuous features normalized using {self.scaler.__class__.__name__}. Scaler stored in `self.scaler`.")
-        return self
+    @property
+    def feature_names(self) -> list[str]:
+        return self._feature_names

-
-
-
-            _LOGGER.warning("⚠️ Data has already been split.")
-            return self
+    @property
+    def id(self) -> Optional[str]:
+        return self._id

-
-
-
-
-            _LOGGER.info("Labels have been encoded. Mapping stored in `self.labels_map`.")
+    @id.setter
+    def id(self, dataset_id: str):
+        if not isinstance(dataset_id, str): raise ValueError("ID must be a string.")
+        self._id = dataset_id

-
-
-
-
-        )
-
-
-
-
+    def dataframes_info(self) -> None:
+        print("--- DataFrame Shapes After Split ---")
+        print(f"  X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+        print(f"  X_test shape:  {self._X_test_shape}, y_test shape:  {self._y_test_shape}")
+        print("------------------------------------")
+
+    def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """Saves a list of feature names as a text file"""
+        save_list_strings(list_strings=self._feature_names,
+                          directory=directory,
+                          filename="feature_names",
+                          verbose=verbose)

-    def
-        """
-        Only useful for classification tasks.
-
-        Balances the training data using a specified resampler.
-
-        Defaults to `SMOTETomek`.
+    def save_scaler(self, save_dir: Union[str, Path]):
         """
-
-            raise RuntimeError("❌ Cannot balance data before it has been split. Call .split_data() first.")
-        if self._is_balanced:
-            _LOGGER.warning("⚠️ Training data has already been balanced.")
-            return self
+        Saves the fitted PytorchScaler's state to a .pth file.

-
-        resampler = SMOTETomek(**kwargs)
-
-        _LOGGER.info(f"Balancing training data with {resampler.__class__.__name__}...")
-        self.features_train, self.labels_train = resampler.fit_resample(self.features_train, self.labels_train) # type: ignore
+        The filename is automatically generated based on the dataset id.

-
-
-        return self
-
-    def auto_process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
-                     balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
-        """Runs a standard, fully automated preprocessing pipeline."""
-        _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
-        self.process_categoricals(method=cat_method)
-        self.split_data(test_size=test_size, stratify=True, random_state=random_state)
-        self.normalize_continuous(method=normalize_method)
-        if balance:
-            self.balance_data()
-        _LOGGER.info("--- 🤖 Automated Processing Complete ---")
-        return self
-
-    def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:
+        Args:
+            save_dir (str | Path): The directory where the scaler will be saved.
         """
-
-
+        if not self.scaler: raise RuntimeError("No scaler was fitted or provided.")
+        if not self.id: raise ValueError("Must set the `id` before saving scaler.")
+        save_path = make_fullpath(save_dir, make=True, enforce="directory")
+        sanitized_id = sanitize_filename(self.id)
+        filename = f"scaler_{sanitized_id}.pth"
+        filepath = save_path / filename
+        self.scaler.save(filepath)
+        _LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")
+
+
+# Single target dataset
+class DatasetMaker(_BaseDatasetMaker):
+    """
+    Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.

+    This class takes a DataFrame, automatically splits it into training and
+    testing sets, and converts them into PyTorch Datasets. It assumes the
+    target variable is the last column. It can also create, apply, and
+    save a PytorchScaler for standardizing continuous features.
+
+    Attributes:
+        `scaler` -> PytorchScaler | None
+        `train_dataset` -> PyTorch Dataset
+        `test_dataset` -> PyTorch Dataset
+        `feature_names` -> list[str]
+        `target_name` -> str
+        `id` -> str
+
+    The ID can be manually set to any string if needed, it is the target name by default.
+    """
+    def __init__(self,
+                 pandas_df: pandas.DataFrame,
+                 kind: Literal["regression", "classification"],
+                 test_size: float = 0.2,
+                 random_state: int = 42,
+                 scaler: Optional[PytorchScaler] = None,
+                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+        """
         Args:
-
-
-
-
-
-
-            DataFrame, otherwise returns a NumPy array.
+            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
+            test_size (float): The proportion of the dataset to allocate to the test split.
+            random_state (int): The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
+            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
         """
-
-
-
-        if isinstance(data, pandas.DataFrame):
-            # If input is a DataFrame, denormalize in place and return a copy
-            if not all(col in data.columns for col in self.scaler_columns): # type: ignore
-                raise ValueError(f"Input DataFrame is missing one or more required columns for denormalization. Required: {self.scaler_columns}")
-
-            output_df = data.copy()
-            output_df[self.scaler_columns] = self.scaler.inverse_transform(data[self.scaler_columns]) # type: ignore
-            return output_df
+        super().__init__()
+        self.scaler = scaler

-        #
-
-
-
-
-
-        if data_np.ndim == 1:
-            data_np = data_np.reshape(-1, 1)
-
-        if data_np.shape[1] != len(self.scaler_columns): # type: ignore
-            raise ValueError(f"Input array has {data_np.shape[1]} columns, but scaler was fitted on {len(self.scaler_columns)} columns.") # type: ignore
-
-        return self.scaler.inverse_transform(data_np)
+        # --- 1. Identify features and target (single-target logic) ---
+        features = pandas_df.iloc[:, :-1]
+        target = pandas_df.iloc[:, -1]
+        self._feature_names = features.columns.tolist()
+        self._target_name = str(target.name)
+        self._id = self._target_name

-
-
-
-
-
-
+        # --- 2. Split ---
+        X_train, X_test, y_train, y_test = train_test_split(
+            features, target, test_size=test_size, random_state=random_state
+        )
+        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
+        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape

-
-
+        label_dtype = torch.float32 if kind == "regression" else torch.int64
+
+        # --- 3. Scale ---
+        X_train_final, X_test_final = self._prepare_scaler(
+            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+        )

-
-
-
-        """Utility method to inspect the processed data as Pandas DataFrames."""
-        if not self._is_split:
-            raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
-        return self.features_train, self.features_test, self.labels_train, self.labels_test # type: ignore
-
-    @property
-    def feature_names(self) -> list[str]:
-        """Returns the list of feature column names."""
-        return self._feature_names
+        # --- 4. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train.values, label_dtype)
+        self._test_ds = _PytorchDataset(X_test_final, y_test.values, label_dtype)

     @property
     def target_name(self) -> str:
-        """Returns the name of the target column."""
         return self._target_name
-
-    def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
-        """Saves a list of feature names as a text file"""
-        save_list_strings(list_strings=self._feature_names,
-                          directory=directory,
-                          filename="feature_names",
-                          verbose=verbose)
-
-    @staticmethod
-    def _embed_categorical(cat_df: pandas.DataFrame, random_state: Optional[int] = None, **kwargs) -> pandas.DataFrame:
-        """Internal helper to perform embedding on categorical features."""
-        embedded_tensors = []
-        new_columns = []
-        for col in cat_df.columns:
-            cat_series = cat_df[col].astype("category")
-            num_categories = len(cat_series.cat.categories)
-            embedding_dim = min(50, (num_categories + 1) // 2)
-
-            if random_state:
-                torch.manual_seed(random_state)
-
-            embedder = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_dim, **kwargs)
-
-            with torch.no_grad():
-                codes = torch.LongTensor(cat_series.cat.codes.values)
-                embedded_tensors.append(embedder(codes))
-
-            new_columns.extend([f"{col}_{i+1}" for i in range(embedding_dim)])
-
-        with torch.no_grad():
-            full_tensor = torch.cat(embedded_tensors, dim=1)
-        return pandas.DataFrame(full_tensor.numpy(), columns=new_columns, index=cat_df.index)


-#
-class
+# --- New Multi-Target Class ---
+class DatasetMakerMulti(_BaseDatasetMaker):
     """
-
-
-    This class takes a DataFrame, automatically splits it into training and
-    testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column.
+    Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.

-
-        pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-        kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-        test_size (float): The proportion of the dataset to allocate to the
-            test split.
-        random_state (int): The seed for the random number generator for
-            reproducibility.
+    This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
     """
-    def __init__(self,
+    def __init__(self,
+                 pandas_df: pandas.DataFrame,
+                 target_columns: List[str],
+                 test_size: float = 0.2,
+                 random_state: int = 42,
+                 scaler: Optional[PytorchScaler] = None,
+                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
         """
-
-
-
-
-
-
-
-        The ID can be manually set to any string if needed, it is `None` by default.
+        Args:
+            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+            target_columns (list[str]): List of target column names.
+            test_size (float): The proportion of the dataset to allocate to the test split.
+            random_state (int): The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
+            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
         """
-
-
-            raise TypeError("Input must be a pandas.DataFrame.")
-        if kind not in ["regression", "classification"]:
-            raise ValueError("`kind` must be 'regression' or 'classification'.")
+        super().__init__()
+        self.scaler = scaler

-
-
-
+        self._target_names = target_columns
+        self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
+        features = pandas_df[self._feature_names]
+        target = pandas_df[self._target_names]

-        self._feature_names = features.columns.tolist()
-        self._target_name = str(target.name)
-
-        #set id
-        self._id: Optional[str] = None
-
-        # 2. Split the data
         X_train, X_test, y_train, y_test = train_test_split(
             features, target, test_size=test_size, random_state=random_state
         )
-
-        self.
-        self._X_test_shape = X_test.shape
-        self._y_train_shape = y_train.shape
-        self._y_test_shape = y_test.shape
-
-        # 3. Convert to PyTorch Datasets with the correct label dtype
-        label_dtype = torch.float32 if kind == "regression" else torch.int64
+        self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
+        self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape

-
-        self._test_ds = _PytorchDataset(X_test.values, y_test.values, labels_dtype=label_dtype)
+        label_dtype = torch.float32

-
-
-
-
+        X_train_final, X_test_final = self._prepare_scaler(
+            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+        )
+
+        self._train_ds = _PytorchDataset(X_train_final, y_train, label_dtype)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, label_dtype)

     @property
-    def
-
-        return self._test_ds
+    def target_names(self) -> list[str]:
+        return self._target_names

-    @property
-    def feature_names(self) -> list[str]:
-        """Returns the list of feature column names."""
-        return self._feature_names

-
-
-
-
-
-
-
-
-
-
-    @id.setter
-    def id(self, dataset_id: str):
-        """Sets the ID value"""
-        if not isinstance(dataset_id, str):
-            raise ValueError(f"Dataset ID '{type(dataset_id)}' is not a string.")
-        self._id = dataset_id
+# --- Private Base Class ---
+class _BaseMaker(ABC):
+    """
+    Abstract Base Class for extra dataset makers.
+    """
+    def __init__(self):
+        self._train_dataset = None
+        self._test_dataset = None
+        self._val_dataset = None

-
-
-
-
-
-
-
-        print("-------------------------------------------")
-
-    def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
-        """Saves a list of feature names as a text file"""
-        save_list_strings(list_strings=self._feature_names,
-                          directory=directory,
-                          filename="feature_names",
-                          verbose=verbose)
+    @abstractmethod
+    def get_datasets(self) -> Tuple[Dataset, ...]:
+        """
+        The primary method to retrieve the final, processed PyTorch datasets.
+        Must be implemented by all subclasses.
+        """
+        pass


 # --- VisionDatasetMaker ---
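Note: taken together, this hunk replaces the old mutable DatasetMaker pipeline (process_categoricals / normalize_continuous / balance_data / auto_process) with constructor-driven classes built on _BaseDatasetMaker. A usage sketch based only on the signatures added above; the column names, data, and output directory are invented for illustration:

import numpy as np
import pandas as pd
from ml_tools.ML_datasetmaster import DatasetMaker, DatasetMakerMulti

rng = np.random.default_rng(0)

# Single target: the last column is taken as the target.
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f1", "f2", "target"])
maker = DatasetMaker(df, kind="regression",
                     continuous_feature_columns=["f1", "f2"])  # fits a new PytorchScaler
train_ds, test_ds = maker.train_dataset, maker.test_dataset
maker.save_scaler("outputs")  # writes scaler_<id>.pth; id defaults to the target name

# Multiple targets: labels are always float32 (see label_dtype above).
df_multi = pd.DataFrame(rng.normal(size=(100, 4)), columns=["f1", "f2", "t1", "t2"])
multi = DatasetMakerMulti(df_multi, target_columns=["t1", "t2"])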
@@ -654,6 +451,7 @@ class SequenceMaker(_BaseMaker):
     1. `.split_data()`: Separate time series into training and testing portions.
     2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
     3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
+    4. `.get_datasets()`: Return Pytorch train and test datasets.
     """
     def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
         super().__init__()
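Note: the four numbered steps in the docstring above map directly onto method calls. A sketch of the intended call order — the methods return self per the annotations in this diff, and generate_windows() may take arguments not shown here, so treat this as an assumption-laden outline rather than the package's documented usage:

import numpy as np
from torch.utils.data import DataLoader
from ml_tools.ML_datasetmaster import SequenceMaker

series = np.sin(np.linspace(0, 20, 500))       # toy univariate series
sm = SequenceMaker(series, sequence_length=24)
sm.split_data(test_size=0.2)                   # 1. split first (avoids leakage)
sm.normalize_data()                            # 2. scaler is fitted on the train portion
sm.generate_windows()                          # 3. build windowed sequences
train_ds, test_ds = sm.get_datasets()          # 4. retrieve the PyTorch datasets
loader = DataLoader(train_ds, batch_size=32, shuffle=True)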
@@ -679,33 +477,41 @@ class SequenceMaker(_BaseMaker):
         self._is_normalized = False
         self._are_windows_generated = False

-    def normalize_data(self
+    def normalize_data(self) -> 'SequenceMaker':
         """
-        Normalizes the sequence data. Must be called AFTER
+        Normalizes the sequence data using PytorchScaler. Must be called AFTER
+        splitting to prevent data leakage from the test set.
         """
         if not self._is_split:
             raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")
-
+
         if self.scaler:
             _LOGGER.warning("⚠️ Data has already been normalized.")
             return self
-
-        if method == "standard":
-            self.scaler = StandardScaler()
-        elif method == "minmax":
-            self.scaler = MinMaxScaler(feature_range=(-1, 1))
-        else:
-            raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")

-        #
-
-
-
-
-
-
+        # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
+        # The scaler expects 2D data [n_samples, n_features].
+        train_features = self.train_sequence.reshape(-1, 1) # type: ignore
+
+        # _PytorchDataset needs labels, so we create dummy ones.
+        dummy_labels = numpy.zeros(len(train_features))
+        temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
+
+        # 2. Fit the PytorchScaler on the temporary training dataset.
+        # The sequence is a single feature, so its index is [0].
+        _LOGGER.info("Fitting PytorchScaler on the training data...")
+        self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
+
+        # 3. Transform sequences using the fitted scaler.
+        # The transform method requires a tensor, so we convert, transform, and convert back.
+        train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+        test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+
+        self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
+        self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
+
         self._is_normalized = True
-        _LOGGER.info(
+        _LOGGER.info("✅ Sequence data normalized using PytorchScaler.")
         return self

     def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
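Note: the dummy-label construction above exists only because PytorchScaler.fit() consumes a Dataset. What it computes is ordinary train-only standardization; in plain torch terms (assuming PytorchScaler standardizes with the train mean/std, as its mean_ attribute suggests):

import torch

train = torch.randn(400)               # stand-in for self.train_sequence
test = torch.randn(100)                # stand-in for self.test_sequence
mean, std = train.mean(), train.std()  # statistics come from the TRAIN split only
train_norm = (train - mean) / std
test_norm = (test - mean) / std        # test reuses train statistics -> no leakage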
@@ -741,7 +547,7 @@ class SequenceMaker(_BaseMaker):
         _LOGGER.info("Feature and label windows generated for train and test sets.")
         return self

-    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) ->
+    def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
         """Efficiently creates windowed features and labels using numpy."""
         if len(data) <= self.sequence_length:
             raise ValueError("Data length must be greater than the sequence_length to create at least one window.")
@@ -768,18 +574,25 @@ class SequenceMaker(_BaseMaker):
         strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))

         return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
+
     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-        """Applies inverse transformation using the stored
+        """Applies inverse transformation using the stored PytorchScaler."""
         if self.scaler is None:
             raise RuntimeError("Data was not normalized. Cannot denormalize.")
-
-
-
+
+        # Ensure data is a torch.Tensor
+        if isinstance(data, numpy.ndarray):
+            tensor_data = torch.tensor(data, dtype=torch.float32)
         else:
-
-
-
+            tensor_data = data
+
+        # Reshape for the scaler [n_samples, n_features]
+        if tensor_data.ndim == 1:
+            tensor_data = tensor_data.view(-1, 1)
+
+        # Apply inverse transform and convert back to a flat numpy array
+        original_scale_tensor = self.scaler.inverse_transform(tensor_data)
+        return original_scale_tensor.cpu().numpy().flatten()

     def plot(self, predictions: Optional[numpy.ndarray] = None):
         """Plots the original training and testing data, with optional predictions."""
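Note: the as_strided call in the context above builds every window as a view into the same buffer — each row is shifted by one element, with no copying. The same idiom in isolation, plus the bounds-checked modern equivalent:

import numpy as np

data = np.arange(10, dtype=np.float64)
seq_len = 4
n_windows = len(data) - seq_len
item = data.itemsize  # bytes per element; both strides advance one element
windows = np.lib.stride_tricks.as_strided(
    data, shape=(n_windows, seq_len), strides=(item, item))
print(windows)  # rows: [0..3], [1..4], [2..5], ...

# NumPy >= 1.20 provides a safer version of the same trick:
print(np.lib.stride_tricks.sliding_window_view(data, seq_len)[:n_windows])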
@@ -802,12 +615,46 @@ class SequenceMaker(_BaseMaker):
         plt.legend()
         plt.show()

-    def get_datasets(self) -> Tuple[
+    def get_datasets(self) -> Tuple[Dataset, Dataset]:
         """Returns the final train and test datasets."""
         if not self._are_windows_generated:
             raise RuntimeError("Windows have not been generated. Call .generate_windows() first.")
         return self._train_dataset, self._test_dataset


+# --- Custom Vision Transform Class ---
+class ResizeAspectFill:
+    """
+    Custom transformation to make an image square by padding it to match the
+    longest side, preserving the aspect ratio. The image is finally centered.
+
+    Args:
+        pad_color (Union[str, int]): Color to use for the padding.
+            Defaults to "black".
+    """
+    def __init__(self, pad_color: Union[str, int] = "black") -> None:
+        self.pad_color = pad_color
+
+    def __call__(self, image: Image.Image) -> Image.Image:
+        if not isinstance(image, Image.Image):
+            raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")
+
+        w, h = image.size
+        if w == h:
+            return image
+
+        # Determine padding to center the image
+        if w > h:
+            top_padding = (w - h) // 2
+            bottom_padding = w - h - top_padding
+            padding = (0, top_padding, 0, bottom_padding)
+        else: # h > w
+            left_padding = (h - w) // 2
+            right_padding = h - w - left_padding
+            padding = (left_padding, 0, right_padding, 0)
+
+        return ImageOps.expand(image, padding, fill=self.pad_color)
+
+
 def info():
     _script_info(__all__)