dragon-ml-toolbox 10.1.1__py3-none-any.whl → 14.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (48)
  1. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
  2. dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
  3. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
  4. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
  5. ml_tools/ETL_cleaning.py +175 -59
  6. ml_tools/ETL_engineering.py +506 -70
  7. ml_tools/GUI_tools.py +2 -1
  8. ml_tools/MICE_imputation.py +212 -7
  9. ml_tools/ML_callbacks.py +73 -40
  10. ml_tools/ML_datasetmaster.py +267 -284
  11. ml_tools/ML_evaluation.py +119 -58
  12. ml_tools/ML_evaluation_multi.py +107 -32
  13. ml_tools/ML_inference.py +15 -5
  14. ml_tools/ML_models.py +234 -170
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +321 -97
  17. ml_tools/ML_scaler.py +10 -5
  18. ml_tools/ML_trainer.py +585 -40
  19. ml_tools/ML_utilities.py +528 -0
  20. ml_tools/ML_vision_datasetmaster.py +1315 -0
  21. ml_tools/ML_vision_evaluation.py +260 -0
  22. ml_tools/ML_vision_inference.py +428 -0
  23. ml_tools/ML_vision_models.py +627 -0
  24. ml_tools/ML_vision_transformers.py +58 -0
  25. ml_tools/PSO_optimization.py +10 -7
  26. ml_tools/RNN_forecast.py +2 -0
  27. ml_tools/SQL.py +22 -9
  28. ml_tools/VIF_factor.py +4 -3
  29. ml_tools/_ML_vision_recipe.py +88 -0
  30. ml_tools/__init__.py +1 -0
  31. ml_tools/_logger.py +0 -2
  32. ml_tools/_schema.py +96 -0
  33. ml_tools/constants.py +79 -0
  34. ml_tools/custom_logger.py +164 -16
  35. ml_tools/data_exploration.py +1092 -109
  36. ml_tools/ensemble_evaluation.py +48 -1
  37. ml_tools/ensemble_inference.py +6 -7
  38. ml_tools/ensemble_learning.py +4 -3
  39. ml_tools/handle_excel.py +1 -0
  40. ml_tools/keys.py +80 -0
  41. ml_tools/math_utilities.py +259 -0
  42. ml_tools/optimization_tools.py +198 -24
  43. ml_tools/path_manager.py +144 -45
  44. ml_tools/serde.py +192 -0
  45. ml_tools/utilities.py +287 -227
  46. dragon_ml_toolbox-10.1.1.dist-info/RECORD +0 -36
  47. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
  48. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
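
ml_tools/ML_datasetmaster.py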
@@ -1,27 +1,26 @@
 import torch
-from torch.utils.data import Dataset, Subset
+from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
 from typing import Literal, Union, Tuple, List, Optional
 from abc import ABC, abstractmethod
-from PIL import Image, ImageOps
-from torchvision.datasets import ImageFolder
-from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path
+
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
+from .keys import DatasetKeys
+from ._schema import FeatureSchema
+
 
 __all__ = [
     "DatasetMaker",
     "DatasetMakerMulti",
-    "VisionDatasetMaker",
-    "SequenceMaker",
-    "ResizeAspectFill",
+    "SequenceMaker"
 ]
 
 
@@ -32,9 +31,11 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
-                 features_dtype: torch.dtype = torch.float32):
+                 features_dtype: torch.dtype = torch.float32,
+                 feature_names: Optional[List[str]] = None,
+                 target_names: Optional[List[str]] = None):
         """
         integer labels for classification.
 
@@ -43,23 +44,43 @@ class _PytorchDataset(Dataset):
 
         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.values, dtype=features_dtype)
+        else: # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-            self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+            # Fallback for other types (though your type hints don't cover this)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)
+
+        self._feature_names = feature_names
+        self._target_names = target_names
 
     def __len__(self):
         return len(self.features)
 
     def __getitem__(self, index):
         return self.features[index], self.labels[index]
+
+    @property
+    def feature_names(self):
+        if self._feature_names is not None:
+            return self._feature_names
+        else:
+            _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any feature names.")
+            raise ValueError()
+
+    @property
+    def target_names(self):
+        if self._target_names is not None:
+            return self._target_names
+        else:
+            _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
 
 
-# --- Abstract Base Class (New) ---
-# --- Abstract Base Class (Corrected) ---
+# --- Abstract Base Class ---
 class _BaseDatasetMaker(ABC):
     """
     Abstract base class for dataset makers. Contains shared logic for
@@ -71,31 +92,39 @@ class _BaseDatasetMaker(ABC):
         self.scaler: Optional[PytorchScaler] = None
         self._id: Optional[str] = None
         self._feature_names: List[str] = []
+        self._target_names: List[str] = []
         self._X_train_shape = (0,0)
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
-        """Internal helper to fit and apply a PytorchScaler."""
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-        if continuous_feature_columns:
-            if all(isinstance(c, str) for c in continuous_feature_columns):
-                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                try:
-                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                except KeyError as e:
-                    _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                    raise ValueError()
-            elif all(isinstance(c, int) for c in continuous_feature_columns):
-                continuous_feature_indices = continuous_feature_columns # type: ignore
-            else:
-                _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                raise TypeError()
-
-        X_train_values = X_train.values
-        X_test_values = X_test.values
 
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
+
+        X_train_values = X_train.to_numpy()
+        X_test_values = X_test.to_numpy()
+
+        # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
@@ -122,6 +151,18 @@ class _BaseDatasetMaker(ABC):
     @property
     def feature_names(self) -> list[str]:
         return self._feature_names
+
+    @property
+    def target_names(self) -> list[str]:
+        return self._target_names
+
+    @property
+    def number_of_features(self) -> int:
+        return len(self._feature_names)
+
+    @property
+    def number_of_targets(self) -> int:
+        return len(self._target_names)
 
     @property
     def id(self) -> Optional[str]:
@@ -142,30 +183,47 @@ class _BaseDatasetMaker(ABC):
         """Saves a list of feature names as a text file"""
         save_list_strings(list_strings=self._feature_names,
                           directory=directory,
-                          filename="feature_names",
-                          verbose=verbose)
+                          filename=DatasetKeys.FEATURE_NAMES,
+                          verbose=verbose)
+
+    def save_target_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """Saves a list of target names as a text file"""
+        save_list_strings(list_strings=self._target_names,
+                          directory=directory,
+                          filename=DatasetKeys.TARGET_NAMES,
+                          verbose=verbose)
 
-    def save_scaler(self, save_dir: Union[str, Path]):
+    def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
         """
         Saves the fitted PytorchScaler's state to a .pth file.
 
         The filename is automatically generated based on the dataset id.
 
         Args:
-            save_dir (str | Path): The directory where the scaler will be saved.
+            directory (str | Path): The directory where the scaler will be saved.
         """
         if not self.scaler:
             _LOGGER.error("No scaler was fitted or provided.")
             raise RuntimeError()
         if not self.id:
-            _LOGGER.error("Must set the `id` before saving scaler.")
+            _LOGGER.error("Must set the dataset `id` before saving scaler.")
             raise ValueError()
-        save_path = make_fullpath(save_dir, make=True, enforce="directory")
+        save_path = make_fullpath(directory, make=True, enforce="directory")
         sanitized_id = sanitize_filename(self.id)
-        filename = f"scaler_{sanitized_id}.pth"
+        filename = f"{DatasetKeys.SCALER_PREFIX}{sanitized_id}.pth"
         filepath = save_path / filename
-        self.scaler.save(filepath)
-        _LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")
+        self.scaler.save(filepath, verbose=False)
+        if verbose:
+            _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+    def save_artifacts(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """
+        Convenience method to save feature names, target names, and the scaler (if a scaler was fitted)
+        """
+        self.save_feature_names(directory=directory, verbose=verbose)
+        self.save_target_names(directory=directory, verbose=verbose)
+        if self.scaler is not None:
+            self.save_scaler(directory=directory, verbose=verbose)
 
 
 # Single target dataset
@@ -173,119 +231,222 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-    This class takes a DataFrame, automatically splits it into training and
-    testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+    It can also create and apply a PytorchScaler using the schema.
 
     Attributes:
         `scaler` -> PytorchScaler | None
         `train_dataset` -> PyTorch Dataset
         `test_dataset` -> PyTorch Dataset
         `feature_names` -> list[str]
-        `target_name` -> str
+        `target_names` -> list[str]
         `id` -> str
 
     The ID can be manually set to any string if needed, it is the target name by default.
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 random_state: int = 42):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and a single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind ("regression" | "classification"):
+                The type of ML task. This determines the data type of the labels.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+
         """
         super().__init__()
-        self.scaler = scaler
 
-        # --- 1. Identify features and target (single-target logic) ---
-        features = pandas_df.iloc[:, :-1]
-        target = pandas_df.iloc[:, -1]
-        self._feature_names = features.columns.tolist()
-        self._target_name = str(target.name)
-        self._id = self._target_name
-
-        # --- 2. Split ---
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None # To be created
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
+
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]
+
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
         label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-        # --- 3. Scale ---
-        X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
-        )
+        # --- 4. Scale (using the schema) ---
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
+
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
-        # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, label_dtype)
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, label_dtype)
-
-    @property
-    def target_name(self) -> str:
-        return self._target_name
-
 
-# --- New Multi-Target Class ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.
 
-    This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 random_state: int = 42):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            target_columns (list[str]): List of target column names.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+
+        ## Note:
+        For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+        This loss function requires the labels to be torch.float32, which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
-        self.scaler = scaler
-
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
+
+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-        self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
-        features = pandas_df[self._feature_names]
-        target = pandas_df[self._target_names]
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]
 
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
        )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-        label_dtype = torch.float32
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32
 
-        X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
-        )
+        # --- 4. Scale (using the schema) ---
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
 
-        self._train_ds = _PytorchDataset(X_train_final, y_train, label_dtype)
-        self._test_ds = _PytorchDataset(X_test_final, y_test, label_dtype)
-
-    @property
-    def target_names(self) -> list[str]:
-        return self._target_names
+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
 
 # --- Private Base Class ---
@@ -307,149 +468,6 @@ class _BaseMaker(ABC):
         pass
 
 
-# --- VisionDatasetMaker ---
-class VisionDatasetMaker(_BaseMaker):
-    """
-    Creates processed PyTorch datasets for computer vision tasks from an
-    image folder directory.
-
-    Uses online augmentations per epoch (image augmentation without creating new files).
-    """
-    def __init__(self, full_dataset: ImageFolder):
-        super().__init__()
-        self.full_dataset = full_dataset
-        self.labels = [s[1] for s in self.full_dataset.samples]
-        self.class_map = full_dataset.class_to_idx
-
-        self._is_split = False
-        self._are_transforms_configured = False
-
-    @classmethod
-    def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-        """Creates a maker instance from a root directory of images."""
-        initial_transform = transforms.Compose([transforms.ToTensor()])
-        full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-        _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-        return cls(full_dataset)
-
-    @staticmethod
-    def inspect_folder(path: Union[str, Path]):
-        """
-        Logs a report of the types, sizes, and channels of image files
-        found in the directory and its subdirectories.
-        """
-        path_obj = make_fullpath(path)
-
-        non_image_files = set()
-        img_types = set()
-        img_sizes = set()
-        img_channels = set()
-        img_counter = 0
-
-        _LOGGER.info(f"Inspecting folder: {path_obj}...")
-        # Use rglob to recursively find all files
-        for filepath in path_obj.rglob('*'):
-            if filepath.is_file():
-                try:
-                    # Using PIL to open is a more reliable check
-                    with Image.open(filepath) as img:
-                        img_types.add(img.format)
-                        img_sizes.add(img.size)
-                        img_channels.update(img.getbands())
-                        img_counter += 1
-                except (IOError, SyntaxError):
-                    non_image_files.add(filepath.name)
-
-        if non_image_files:
-            _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-        report = (
-            f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-            f"Total images found: {img_counter}\n"
-            f"Image formats: {img_types or 'None'}\n"
-            f"Image sizes (WxH): {img_sizes or 'None'}\n"
-            f"Image channels (bands): {img_channels or 'None'}\n"
-            f"--------------------------------------"
-        )
-        print(report)
-
-    def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                   stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-        """Splits the dataset into training, validation, and optional test sets."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        if val_size + test_size >= 1.0:
-            _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-            raise ValueError()
-
-        indices = list(range(len(self.full_dataset)))
-        labels_for_split = self.labels if stratify else None
-
-        train_indices, val_test_indices = train_test_split(
-            indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-        )
-
-        if test_size > 0:
-            val_test_labels = [self.labels[i] for i in val_test_indices]
-            stratify_val_test = val_test_labels if stratify else None
-            val_indices, test_indices = train_test_split(
-                val_test_indices, test_size=(test_size / (val_size + test_size)),
-                random_state=random_state, stratify=stratify_val_test
-            )
-            self._test_dataset = Subset(self.full_dataset, test_indices)
-            _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-        else:
-            val_indices = val_test_indices
-
-        self._train_dataset = Subset(self.full_dataset, train_indices)
-        self._val_dataset = Subset(self.full_dataset, val_indices)
-        self._is_split = True
-
-        _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-        return self
-
-    def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                             mean: List[float] = [0.485, 0.456, 0.406],
-                             std: List[float] = [0.229, 0.224, 0.225],
-                             extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-        """Configures and applies the image transformations (augmentations)."""
-        if not self._is_split:
-            _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-        if extra_train_transforms:
-            base_train_transforms.extend(extra_train_transforms)
-
-        final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-        val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-        train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-        self._train_dataset.dataset.transform = train_transform # type: ignore
-        self._val_dataset.dataset.transform = val_transform # type: ignore
-        if self._test_dataset:
-            self._test_dataset.dataset.transform = val_transform # type: ignore
-
-        self._are_transforms_configured = True
-        _LOGGER.info("Image transforms configured and applied.")
-        return self
-
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """Returns the final train, validation, and optional test datasets."""
-        if not self._is_split:
-            _LOGGER.error("Data has not been split. Call .split_data() first.")
-            raise RuntimeError()
-        if not self._are_transforms_configured:
-            _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-        if self._test_dataset:
-            return self._train_dataset, self._val_dataset, self._test_dataset
-        return self._train_dataset, self._val_dataset
-
-
 # --- SequenceMaker ---
 class SequenceMaker(_BaseMaker):
     """
@@ -638,40 +656,5 @@ class SequenceMaker(_BaseMaker):
         return self._train_dataset, self._test_dataset
 
 
-# --- Custom Vision Transform Class ---
-class ResizeAspectFill:
-    """
-    Custom transformation to make an image square by padding it to match the
-    longest side, preserving the aspect ratio. The image is finally centered.
-
-    Args:
-        pad_color (Union[str, int]): Color to use for the padding.
-            Defaults to "black".
-    """
-    def __init__(self, pad_color: Union[str, int] = "black") -> None:
-        self.pad_color = pad_color
-
-    def __call__(self, image: Image.Image) -> Image.Image:
-        if not isinstance(image, Image.Image):
-            _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-            raise TypeError()
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
-
-
 
 def info():
     _script_info(__all__)
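
For orientation, here is a minimal usage sketch of the schema-driven API this diff introduces, assembled only from the signatures visible above. The FeatureSchema constructor call is hypothetical (only the feature_names and continuous_feature_names attributes that the diff reads are assumed); in the intended workflow the schema object is produced by the data_exploration module.

import pandas as pd
from ml_tools._schema import FeatureSchema          # assumed import path, mirrors `from ._schema import FeatureSchema`
from ml_tools.ML_datasetmaster import DatasetMaker

df = pd.DataFrame({
    "temp": [20.1, 22.4, 19.8, 23.0],   # continuous feature
    "mode": [0, 1, 1, 0],               # already-encoded categorical feature
    "yield_": [0.81, 0.92, 0.77, 0.95], # the one column absent from the schema -> inferred as the target
})

# Hypothetical construction: only the two attributes this diff reads are assumed.
schema = FeatureSchema(
    feature_names=["temp", "mode"],
    continuous_feature_names=["temp"],
)

maker = DatasetMaker(
    pandas_df=df,
    schema=schema,
    kind="regression",
    scaler="fit",        # fit a new PytorchScaler on the continuous feature "temp"
    test_size=0.25,
    random_state=42,
)

train_ds = maker.train_dataset      # PyTorch Dataset, per the documented attributes
test_ds = maker.test_dataset
maker.save_artifacts("artifacts")   # writes feature names, target names, and the fitted scaler

DatasetMakerMulti follows the same pattern, but takes an explicit target_columns list and validates that the schema's features and the targets are mutually exclusive.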