dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +72 -34
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_datasetmaster.py
CHANGED
```diff
@@ -1,27 +1,26 @@
 import torch
-from torch.utils.data import Dataset, Subset
+from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
 from typing import Literal, Union, Tuple, List, Optional
 from abc import ABC, abstractmethod
-from PIL import Image, ImageOps
-from torchvision.datasets import ImageFolder
-from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path
+
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
+from .keys import DatasetKeys
+from ._schema import FeatureSchema
+
 
 __all__ = [
     "DatasetMaker",
     "DatasetMakerMulti",
-    "VisionDatasetMaker",
-    "SequenceMaker",
-    "ResizeAspectFill",
+    "SequenceMaker"
 ]
 
 
@@ -32,9 +31,11 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
-                 features_dtype: torch.dtype = torch.float32
+                 features_dtype: torch.dtype = torch.float32,
+                 feature_names: Optional[List[str]] = None,
+                 target_names: Optional[List[str]] = None):
         """
         integer labels for classification.
 
@@ -43,23 +44,43 @@ class _PytorchDataset(Dataset):
 
         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.values, dtype=features_dtype)
+        else: # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-            self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+            # Fallback for other types (though your type hints don't cover this)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)
+
+        self._feature_names = feature_names
+        self._target_names = target_names
 
     def __len__(self):
         return len(self.features)
 
     def __getitem__(self, index):
         return self.features[index], self.labels[index]
+
+    @property
+    def feature_names(self):
+        if self._feature_names is not None:
+            return self._feature_names
+        else:
+            _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any feature names.")
+            raise ValueError()
+
+    @property
+    def target_names(self):
+        if self._target_names is not None:
+            return self._target_names
+        else:
+            _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
 
 
-# --- Abstract Base Class
-# --- Abstract Base Class (Corrected) ---
+# --- Abstract Base Class ---
 class _BaseDatasetMaker(ABC):
     """
     Abstract base class for dataset makers. Contains shared logic for
```
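The `_PytorchDataset` changes above let labels arrive as a `pandas.DataFrame` (multi-target) in addition to `numpy.ndarray` and `pandas.Series`, with pandas inputs converted via `.to_numpy()`. A minimal standalone sketch of that conversion pattern (illustrative only; `_PytorchDataset` itself is private to `ml_tools`):

```python
import pandas
import torch

features = pandas.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
labels = pandas.DataFrame({"t1": [0.0, 1.0], "t2": [1.0, 0.0]})  # multi-target labels

# The same conversion the new __init__ applies to pandas inputs
X = torch.tensor(features.to_numpy(), dtype=torch.float32)
y = torch.tensor(labels.to_numpy(), dtype=torch.float32)
print(X.shape, y.shape)  # torch.Size([2, 2]) torch.Size([2, 2])
```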
```diff
@@ -71,31 +92,39 @@ class _BaseDatasetMaker(ABC):
         self.scaler: Optional[PytorchScaler] = None
         self._id: Optional[str] = None
         self._feature_names: List[str] = []
+        self._target_names: List[str] = []
         self._X_train_shape = (0,0)
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self,
-
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-        if continuous_feature_columns:
-            if all(isinstance(c, str) for c in continuous_feature_columns):
-                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                try:
-                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                except KeyError as e:
-                    _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                    raise ValueError()
-            elif all(isinstance(c, int) for c in continuous_feature_columns):
-                continuous_feature_indices = continuous_feature_columns # type: ignore
-            else:
-                _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                raise TypeError()
-
-        X_train_values = X_train.values
-        X_test_values = X_test.values
 
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
+
+        X_train_values = X_train.to_numpy()
+        X_test_values = X_test.to_numpy()
+
+        # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
```
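`_prepare_scaler` no longer takes a `continuous_feature_columns` argument; the continuous columns now come from `schema.continuous_feature_names` and are mapped to positional indices with `list.index`. A small sketch of that mapping, using hypothetical column names in place of a real training DataFrame:

```python
# Hypothetical stand-ins for X_train.columns.to_list() and
# FeatureSchema.continuous_feature_names
train_cols_list = ["age", "color_red", "height"]
continuous_feature_names = ["age", "height"]

continuous_feature_indices = [train_cols_list.index(name) for name in continuous_feature_names]
print(continuous_feature_indices)  # [0, 2]
```

A name missing from the training columns raises `ValueError` from `list.index`, which the new code catches and logs.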
```diff
@@ -122,6 +151,18 @@ class _BaseDatasetMaker(ABC):
     @property
     def feature_names(self) -> list[str]:
         return self._feature_names
+
+    @property
+    def target_names(self) -> list[str]:
+        return self._target_names
+
+    @property
+    def number_of_features(self) -> int:
+        return len(self._feature_names)
+
+    @property
+    def number_of_targets(self) -> int:
+        return len(self._target_names)
 
     @property
     def id(self) -> Optional[str]:
@@ -142,30 +183,47 @@ class _BaseDatasetMaker(ABC):
         """Saves a list of feature names as a text file"""
         save_list_strings(list_strings=self._feature_names,
                           directory=directory,
-                          filename=
-                          verbose=verbose)
+                          filename=DatasetKeys.FEATURE_NAMES,
+                          verbose=verbose)
+
+    def save_target_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """Saves a list of target names as a text file"""
+        save_list_strings(list_strings=self._target_names,
+                          directory=directory,
+                          filename=DatasetKeys.TARGET_NAMES,
+                          verbose=verbose)
 
-    def save_scaler(self,
+    def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """
         Saves the fitted PytorchScaler's state to a .pth file.
 
         The filename is automatically generated based on the dataset id.
 
         Args:
-
+            directory (str | Path): The directory where the scaler will be saved.
         """
         if not self.scaler:
             _LOGGER.error("No scaler was fitted or provided.")
             raise RuntimeError()
         if not self.id:
-            _LOGGER.error("Must set the `id` before saving scaler.")
+            _LOGGER.error("Must set the dataset `id` before saving scaler.")
             raise ValueError()
-        save_path = make_fullpath(
+        save_path = make_fullpath(directory, make=True, enforce="directory")
         sanitized_id = sanitize_filename(self.id)
-        filename = f"
+        filename = f"{DatasetKeys.SCALER_PREFIX}{sanitized_id}.pth"
         filepath = save_path / filename
-        self.scaler.save(filepath)
-
+        self.scaler.save(filepath, verbose=False)
+        if verbose:
+            _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+    def save_artifacts(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """
+        Convenience method to save feature names, target names, and the scaler (if a scaler was fitted)
+        """
+        self.save_feature_names(directory=directory, verbose=verbose)
+        self.save_target_names(directory=directory, verbose=verbose)
+        if self.scaler is not None:
+            self.save_scaler(directory=directory, verbose=verbose)
 
 
 # Single target dataset
@@ -173,119 +231,222 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-    This class takes a DataFrame, automatically splits
-
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame, and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+    It can also create and apply a PytorchScaler using the schema.
 
     Attributes:
         `scaler` -> PytorchScaler | None
         `train_dataset` -> PyTorch Dataset
         `test_dataset` -> PyTorch Dataset
         `feature_names` -> list[str]
-        `target_name` -> str
+        `target_names` -> list[str]
         `id` -> str
 
     The ID can be manually set to any string if needed, it is the target name by default.
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 random_state: int = 42):
         """
         Args:
-            pandas_df (pandas.DataFrame):
-
-
-
-
-
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns. (features and single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind ("regression" | "classification"):
+                The type of ML task. This determines the data type of the labels.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+
         """
         super().__init__()
-        self.scaler = scaler
 
-
-
-
-
-
-
-
-
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None # To be created
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
+
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]
+
         X_train, X_test, y_train, y_test = train_test_split(
-
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
         label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-        # ---
-
-
-
+        # --- 4. Scale (using the schema) ---
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
+
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
-        # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, label_dtype)
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, label_dtype)
-
-    @property
-    def target_name(self) -> str:
-        return self._target_name
-
 
-# ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.
 
-    This class takes a DataFrame,
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 random_state: int = 42):
         """
         Args:
-            pandas_df (pandas.DataFrame):
-
-
-
-
-
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+
+        ## Note:
+            For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+            This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
-
-
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
+
+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-
-
-
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]
 
         X_train, X_test, y_train, y_test = train_test_split(
-
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32
 
-
-
-
+        # --- 4. Scale (using the schema) ---
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
 
-
-
-
-
-    def target_names(self) -> list[str]:
-        return self._target_names
+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
 
 # --- Private Base Class ---
```
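Taken together, `DatasetMaker` now infers the single target as the one DataFrame column not named in the schema, selects the label dtype from `kind`, and scales according to the `scaler` strategy; `save_artifacts` then bundles the feature-name, target-name, and scaler exports. A hypothetical usage sketch: the `FeatureSchema` constructor is not shown in this diff, so a stand-in object exposing the two attributes this code reads (`feature_names` and `continuous_feature_names`) is used here:

```python
from types import SimpleNamespace

import pandas
from ml_tools.ML_datasetmaster import DatasetMaker

df = pandas.DataFrame({
    "age":    [20.0, 30.0, 40.0, 50.0],
    "height": [1.6, 1.7, 1.8, 1.9],
    "price":  [0.5, 1.0, 1.5, 2.0],  # the one non-schema column becomes the target
})
# Stand-in for a FeatureSchema produced by ml_tools.data_exploration
schema = SimpleNamespace(feature_names=["age", "height"],
                         continuous_feature_names=["age", "height"])

maker = DatasetMaker(df, schema=schema, kind="regression",
                     scaler="fit", test_size=0.25, random_state=42)
maker.save_artifacts("artifacts")  # feature names, target names, and the scaler
```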
```diff
@@ -307,149 +468,6 @@ class _BaseMaker(ABC):
         pass
 
 
-# --- VisionDatasetMaker ---
-class VisionDatasetMaker(_BaseMaker):
-    """
-    Creates processed PyTorch datasets for computer vision tasks from an
-    image folder directory.
-
-    Uses online augmentations per epoch (image augmentation without creating new files).
-    """
-    def __init__(self, full_dataset: ImageFolder):
-        super().__init__()
-        self.full_dataset = full_dataset
-        self.labels = [s[1] for s in self.full_dataset.samples]
-        self.class_map = full_dataset.class_to_idx
-
-        self._is_split = False
-        self._are_transforms_configured = False
-
-    @classmethod
-    def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-        """Creates a maker instance from a root directory of images."""
-        initial_transform = transforms.Compose([transforms.ToTensor()])
-        full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-        _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-        return cls(full_dataset)
-
-    @staticmethod
-    def inspect_folder(path: Union[str, Path]):
-        """
-        Logs a report of the types, sizes, and channels of image files
-        found in the directory and its subdirectories.
-        """
-        path_obj = make_fullpath(path)
-
-        non_image_files = set()
-        img_types = set()
-        img_sizes = set()
-        img_channels = set()
-        img_counter = 0
-
-        _LOGGER.info(f"Inspecting folder: {path_obj}...")
-        # Use rglob to recursively find all files
-        for filepath in path_obj.rglob('*'):
-            if filepath.is_file():
-                try:
-                    # Using PIL to open is a more reliable check
-                    with Image.open(filepath) as img:
-                        img_types.add(img.format)
-                        img_sizes.add(img.size)
-                        img_channels.update(img.getbands())
-                        img_counter += 1
-                except (IOError, SyntaxError):
-                    non_image_files.add(filepath.name)
-
-        if non_image_files:
-            _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-        report = (
-            f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-            f"Total images found: {img_counter}\n"
-            f"Image formats: {img_types or 'None'}\n"
-            f"Image sizes (WxH): {img_sizes or 'None'}\n"
-            f"Image channels (bands): {img_channels or 'None'}\n"
-            f"--------------------------------------"
-        )
-        print(report)
-
-    def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                   stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-        """Splits the dataset into training, validation, and optional test sets."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        if val_size + test_size >= 1.0:
-            _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-            raise ValueError()
-
-        indices = list(range(len(self.full_dataset)))
-        labels_for_split = self.labels if stratify else None
-
-        train_indices, val_test_indices = train_test_split(
-            indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-        )
-
-        if test_size > 0:
-            val_test_labels = [self.labels[i] for i in val_test_indices]
-            stratify_val_test = val_test_labels if stratify else None
-            val_indices, test_indices = train_test_split(
-                val_test_indices, test_size=(test_size / (val_size + test_size)),
-                random_state=random_state, stratify=stratify_val_test
-            )
-            self._test_dataset = Subset(self.full_dataset, test_indices)
-            _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-        else:
-            val_indices = val_test_indices
-
-        self._train_dataset = Subset(self.full_dataset, train_indices)
-        self._val_dataset = Subset(self.full_dataset, val_indices)
-        self._is_split = True
-
-        _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-        return self
-
-    def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                             mean: List[float] = [0.485, 0.456, 0.406],
-                             std: List[float] = [0.229, 0.224, 0.225],
-                             extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-        """Configures and applies the image transformations (augmentations)."""
-        if not self._is_split:
-            _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-        if extra_train_transforms:
-            base_train_transforms.extend(extra_train_transforms)
-
-        final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-        val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-        train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-        self._train_dataset.dataset.transform = train_transform # type: ignore
-        self._val_dataset.dataset.transform = val_transform # type: ignore
-        if self._test_dataset:
-            self._test_dataset.dataset.transform = val_transform # type: ignore
-
-        self._are_transforms_configured = True
-        _LOGGER.info("Image transforms configured and applied.")
-        return self
-
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """Returns the final train, validation, and optional test datasets."""
-        if not self._is_split:
-            _LOGGER.error("Data has not been split. Call .split_data() first.")
-            raise RuntimeError()
-        if not self._are_transforms_configured:
-            _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-        if self._test_dataset:
-            return self._train_dataset, self._val_dataset, self._test_dataset
-        return self._train_dataset, self._val_dataset
-
-
 # --- SequenceMaker ---
 class SequenceMaker(_BaseMaker):
     """
@@ -638,40 +656,5 @@ class SequenceMaker(_BaseMaker):
         return self._train_dataset, self._test_dataset
 
 
-# --- Custom Vision Transform Class ---
-class ResizeAspectFill:
-    """
-    Custom transformation to make an image square by padding it to match the
-    longest side, preserving the aspect ratio. The image is finally centered.
-
-    Args:
-        pad_color (Union[str, int]): Color to use for the padding.
-            Defaults to "black".
-    """
-    def __init__(self, pad_color: Union[str, int] = "black") -> None:
-        self.pad_color = pad_color
-
-    def __call__(self, image: Image.Image) -> Image.Image:
-        if not isinstance(image, Image.Image):
-            _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-            raise TypeError()
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
-
-
 def info():
     _script_info(__all__)
```