dragon-ml-toolbox 14.8.0__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (44)
  1. {dragon_ml_toolbox-14.8.0.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/METADATA +9 -5
  2. dragon_ml_toolbox-16.0.0.dist-info/RECORD +51 -0
  3. ml_tools/ETL_cleaning.py +20 -20
  4. ml_tools/ETL_engineering.py +23 -25
  5. ml_tools/GUI_tools.py +20 -20
  6. ml_tools/MICE_imputation.py +3 -3
  7. ml_tools/ML_callbacks.py +43 -26
  8. ml_tools/ML_configuration.py +204 -11
  9. ml_tools/ML_datasetmaster.py +198 -280
  10. ml_tools/ML_evaluation.py +132 -41
  11. ml_tools/ML_evaluation_multi.py +96 -35
  12. ml_tools/ML_inference.py +249 -207
  13. ml_tools/ML_models.py +13 -102
  14. ml_tools/ML_models_advanced.py +1 -1
  15. ml_tools/ML_optimization.py +12 -12
  16. ml_tools/ML_scaler.py +11 -11
  17. ml_tools/ML_sequence_datasetmaster.py +341 -0
  18. ml_tools/ML_sequence_evaluation.py +215 -0
  19. ml_tools/ML_sequence_inference.py +391 -0
  20. ml_tools/ML_sequence_models.py +139 -0
  21. ml_tools/ML_trainer.py +1237 -354
  22. ml_tools/ML_utilities.py +1 -1
  23. ml_tools/ML_vision_datasetmaster.py +73 -67
  24. ml_tools/ML_vision_evaluation.py +26 -6
  25. ml_tools/ML_vision_inference.py +117 -140
  26. ml_tools/ML_vision_models.py +1 -1
  27. ml_tools/ML_vision_transformers.py +121 -40
  28. ml_tools/PSO_optimization.py +6 -6
  29. ml_tools/SQL.py +4 -4
  30. ml_tools/{keys.py → _keys.py} +43 -0
  31. ml_tools/_schema.py +1 -1
  32. ml_tools/ensemble_evaluation.py +1 -1
  33. ml_tools/ensemble_inference.py +7 -33
  34. ml_tools/ensemble_learning.py +1 -1
  35. ml_tools/optimization_tools.py +2 -2
  36. ml_tools/path_manager.py +5 -5
  37. ml_tools/utilities.py +1 -2
  38. dragon_ml_toolbox-14.8.0.dist-info/RECORD +0 -49
  39. ml_tools/RNN_forecast.py +0 -56
  40. ml_tools/_ML_vision_recipe.py +0 -88
  41. {dragon_ml_toolbox-14.8.0.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/WHEEL +0 -0
  42. {dragon_ml_toolbox-14.8.0.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/licenses/LICENSE +0 -0
  43. {dragon_ml_toolbox-14.8.0.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  44. {dragon_ml_toolbox-14.8.0.dist-info → dragon_ml_toolbox-16.0.0.dist-info}/top_level.txt +0 -0
@@ -3,27 +3,25 @@ from torch.utils.data import Dataset
  import pandas
  import numpy
  from sklearn.model_selection import train_test_split
- from typing import Literal, Union, Tuple, List, Optional
- from abc import ABC, abstractmethod
- import matplotlib.pyplot as plt
+ from typing import Literal, Union, List, Optional
+ from abc import ABC
  from pathlib import Path

  from .path_manager import make_fullpath, sanitize_filename
  from ._logger import _LOGGER
  from ._script_info import _script_info
  from .custom_logger import save_list_strings
- from .ML_scaler import PytorchScaler
- from .keys import DatasetKeys
+ from .ML_scaler import DragonScaler
+ from ._keys import DatasetKeys, MLTaskKeys
  from ._schema import FeatureSchema
+ from .custom_logger import custom_logger


  __all__ = [
-     "DatasetMaker",
-     "DatasetMakerMulti",
-     "SequenceMaker"
+     "DragonDataset",
+     "DragonDatasetMulti"
  ]

-
  # --- Internal Helper Class ---
  class _PytorchDataset(Dataset):
      """
@@ -57,6 +55,7 @@ class _PytorchDataset(Dataset):

          self._feature_names = feature_names
          self._target_names = target_names
+         self.classes: List[str] = []

      def __len__(self):
          return len(self.features)
@@ -78,6 +77,7 @@ class _PytorchDataset(Dataset):
              return self._target_names
          else:
              _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
+             raise ValueError()


  # --- Abstract Base Class ---
@@ -88,23 +88,28 @@ class _BaseDatasetMaker(ABC):
      """
      def __init__(self):
          self._train_ds: Optional[Dataset] = None
+         self._val_ds: Optional[Dataset] = None
          self._test_ds: Optional[Dataset] = None
-         self.scaler: Optional[PytorchScaler] = None
+         self.scaler: Optional[DragonScaler] = None
          self._id: Optional[str] = None
          self._feature_names: List[str] = []
          self._target_names: List[str] = []
          self._X_train_shape = (0,0)
+         self._X_val_shape = (0,0)
          self._X_test_shape = (0,0)
          self._y_train_shape = (0,)
+         self._y_val_shape = (0,)
          self._y_test_shape = (0,)
+         self.class_map: Optional[dict[str, int]] = None

      def _prepare_scaler(self,
                          X_train: pandas.DataFrame,
                          y_train: Union[pandas.Series, pandas.DataFrame],
+                         X_val: pandas.DataFrame,
                          X_test: pandas.DataFrame,
                          label_dtype: torch.dtype,
                          schema: FeatureSchema):
-         """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
+         """Internal helper to fit and apply a DragonScaler using a FeatureSchema."""
          continuous_feature_indices: Optional[List[int]] = None

          # Get continuous feature indices *from the schema*
@@ -122,26 +127,33 @@ class _BaseDatasetMaker(ABC):
              _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")

          X_train_values = X_train.to_numpy()
+         X_val_values = X_val.to_numpy()
          X_test_values = X_test.to_numpy()

          # continuous_feature_indices is derived
          if self.scaler is None and continuous_feature_indices:
-             _LOGGER.info("Fitting a new PytorchScaler on training data.")
+             _LOGGER.info("Fitting a new DragonScaler on training data.")
              temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
-             self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+             self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices)

          if self.scaler and self.scaler.mean_ is not None:
-             _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+             _LOGGER.info("Applying scaler transformation to train, validation, and test feature sets.")
              X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+             X_val_tensor = self.scaler.transform(torch.tensor(X_val_values, dtype=torch.float32))
              X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
-             return X_train_tensor.numpy(), X_test_tensor.numpy()
+             return X_train_tensor.numpy(), X_val_tensor.numpy(), X_test_tensor.numpy()

-         return X_train_values, X_test_values
+         return X_train_values, X_val_values, X_test_values

      @property
      def train_dataset(self) -> Dataset:
          if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
          return self._train_ds
+
+     @property
+     def validation_dataset(self) -> Dataset:
+         if self._val_ds is None: raise RuntimeError("Dataset not yet created.")
+         return self._val_ds

      @property
      def test_dataset(self) -> Dataset:
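The hunk above extends `_prepare_scaler` to a three-way return: the scaler is fitted on the training split only, and the same transform is then applied to train, validation, and test. A minimal standalone sketch of that no-leakage pattern in plain PyTorch (shapes and data are made up; `DragonScaler` itself is not part of this diff):

```python
import torch

# Hypothetical splits: rows are samples, columns are features.
X_train = torch.randn(80, 4)
X_val = torch.randn(10, 4)
X_test = torch.randn(10, 4)

# Fit statistics on the training split ONLY.
mean = X_train.mean(dim=0)
std = X_train.std(dim=0).clamp_min(1e-8)  # guard against zero variance

# Apply the identical transform to every split, as _prepare_scaler does.
X_train_s = (X_train - mean) / std
X_val_s = (X_val - mean) / std
X_test_s = (X_test - mean) / std
```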
@@ -176,6 +188,7 @@ class _BaseDatasetMaker(ABC):
      def dataframes_info(self) -> None:
          print("--- DataFrame Shapes After Split ---")
          print(f" X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+         print(f" X_val shape: {self._X_val_shape}, y_val shape: {self._y_val_shape}")
          print(f" X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
          print("------------------------------------")

@@ -195,7 +208,7 @@ class _BaseDatasetMaker(ABC):

      def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
          """
-         Saves the fitted PytorchScaler's state to a .pth file.
+         Saves the fitted DragonScaler's state to a .pth file.

          The filename is automatically generated based on the dataset id.

@@ -215,6 +228,24 @@ class _BaseDatasetMaker(ABC):
          self.scaler.save(filepath, verbose=False)
          if verbose:
              _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+     def save_class_map(self, directory: Union[str,Path], verbose: bool=True) -> None:
+         """
+         Saves the class to index mapping {str: int} to a directory.
+         """
+         if not self.class_map:
+             _LOGGER.warning(f"No class_map defined. Skipping.")
+             return
+
+         log_name = f"Class_to_Index_{self.id}" if self.id else "Class_to_Index"
+
+         custom_logger(data=self.class_map,
+                       save_directory=directory,
+                       log_name=log_name,
+                       add_timestamp=False,
+                       dict_as="json")
+         if verbose:
+             _LOGGER.info(f"Class map for '{self.id}' saved as '{log_name}.json'.")

      def save_artifacts(self, directory: Union[str, Path], verbose: bool=True) -> None:
          """
@@ -224,19 +255,22 @@ class _BaseDatasetMaker(ABC):
          self.save_target_names(directory=directory, verbose=verbose)
          if self.scaler is not None:
              self.save_scaler(directory=directory, verbose=verbose)
+         if self.class_map is not None:
+             self.save_class_map(directory=directory, verbose=verbose)


  # Single target dataset
- class DatasetMaker(_BaseDatasetMaker):
+ class DragonDataset(_BaseDatasetMaker):
      """
      Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.

      This class takes a DataFrame, and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
-     It can also create and apply a PytorchScaler using the schema.
+     It can also create and apply a DragonScaler using the schema.

      Attributes:
-         `scaler` -> PytorchScaler | None
+         `scaler` -> DragonScaler | None
          `train_dataset` -> PyTorch Dataset
+         `validation_dataset` -> PyTorch Dataset
          `test_dataset` -> PyTorch Dataset
          `feature_names` -> list[str]
          `target_names` -> list[str]
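The new `save_class_map` (previous hunk, wired into `save_artifacts` here) serializes the `{class: index}` dict through `custom_logger` with `dict_as="json"`. A minimal standard-library sketch of the same effect, assuming only the `Class_to_Index` naming pattern visible in the diff (the exact file layout `custom_logger` produces is not shown here):

```python
import json
from pathlib import Path

def save_class_map_sketch(class_map: dict, directory, dataset_id=None) -> Path:
    # Mirror the diff's naming scheme: "Class_to_Index_<id>.json" or "Class_to_Index.json".
    log_name = f"Class_to_Index_{dataset_id}" if dataset_id else "Class_to_Index"
    path = Path(directory) / f"{log_name}.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(class_map, indent=2))
    return path

print(save_class_map_sketch({"cat": 0, "dog": 1, "bird": 2}, "artifacts", "demo"))
```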
@@ -247,9 +281,10 @@ class DatasetMaker(_BaseDatasetMaker):
      def __init__(self,
                   pandas_df: pandas.DataFrame,
                   schema: FeatureSchema,
-                  kind: Literal["regression", "classification"],
-                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-                  test_size: float = 0.2,
+                  kind: Literal["regression", "binary classification", "multiclass classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                  validation_size: float = 0.2,
+                  test_size: float = 0.1,
                   random_state: int = 42):
          """
          Args:
@@ -257,32 +292,45 @@ class DatasetMaker(_BaseDatasetMaker):
              The pre-processed input DataFrame containing all columns. (features and single target).
          schema (FeatureSchema):
              The definitive schema object from data_exploration.
-         kind ("regression" | "classification"):
-             The type of ML task. This determines the data type of the labels.
-         scaler ("fit" | "none" | PytorchScaler):
+         kind (str):
+             The type of ML task. Must be one of:
+             - "regression"
+             - "binary classification"
+             - "multiclass classification"
+         scaler ("fit" | "none" | DragonScaler):
              Strategy for data scaling:
-             - "fit": Fit a new PytorchScaler on continuous features.
+             - "fit": Fit a new DragonScaler on continuous features.
              - "none": Do not scale data (e.g., for TabularTransformer).
-             - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+             - DragonScaler instance: Use a pre-fitted scaler to transform data.
+         validation_size (float):
+             The proportion of the *original* dataset to allocate to the validation split.
          test_size (float):
-             The proportion of the dataset to allocate to the test split.
+             The proportion of the dataset to allocate to the test split (can be 0).
          random_state (int):
              The seed for the random number of generator for reproducibility.

          """
          super().__init__()

+         # --- Validation for split sizes ---
+         if (validation_size + test_size) >= 1.0:
+             _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+             raise ValueError()
+         elif validation_size <= 0.0:
+             _LOGGER.error(f"Invalid validation split of {validation_size}.")
+             raise ValueError()
+
          _apply_scaling: bool = False
          if scaler == "fit":
              self.scaler = None # To be created
              _apply_scaling = True
          elif scaler == "none":
              self.scaler = None
-         elif isinstance(scaler, PytorchScaler):
+         elif isinstance(scaler, DragonScaler):
              self.scaler = scaler # Use the provided one
              _apply_scaling = True
          else:
-             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
              raise ValueError()

          # --- 1. Identify features (from schema) ---
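Per the docstring above, `validation_size` and `test_size` are both fractions of the *original* dataset; the constructor (see the split hunk below) therefore rescales the second split by `1 - test_size` so the proportions come out right. A runnable arithmetic check with scikit-learn, using made-up data:

```python
import numpy as np
from sklearn.model_selection import train_test_split

X, y = np.arange(100).reshape(100, 1), np.arange(100)
validation_size, test_size = 0.2, 0.1

# First split: hold out the test set (10 of 100 rows).
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

# Second split: 0.2 / (1 - 0.1) ~= 0.2222 of the remaining 90 rows,
# i.e. 20 rows -> exactly 20% of the original dataset.
val_split_size = validation_size / (1.0 - test_size)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=val_split_size, random_state=42)

print(len(X_train), len(X_val), len(X_test))  # 70 20 10
```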
@@ -298,7 +346,7 @@ class DatasetMaker(_BaseDatasetMaker):
              _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
              raise ValueError("No target column found in DataFrame.")
          if len(target_cols_set) > 1:
-             _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+             _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. One target required.")
              raise ValueError("Ambiguous target: More than one non-feature column found.")

          target_name = list(target_cols_set)[0]
@@ -308,32 +356,87 @@ class DatasetMaker(_BaseDatasetMaker):
          # --- 3. Split Data ---
          features_df = pandas_df[self._feature_names]
          target_series = pandas_df[target_name]
-
-         X_train, X_test, y_train, y_test = train_test_split(
+
+         # First split: (Train + Val) vs TesT
+         X_train_val, X_test, y_train_val, y_test = train_test_split(
              features_df,
              target_series,
              test_size=test_size,
              random_state=random_state
          )
-         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+         # Calculate validation split size relative to the (Train + Val) set
+         val_split_size = validation_size / (1.0 - test_size)
+
+         # Second split: Train vs Val
+         X_train, X_val, y_train, y_val = train_test_split(
+             X_train_val,
+             y_train_val,
+             test_size=val_split_size,
+             random_state=random_state
+         )

-         label_dtype = torch.float32 if kind == "regression" else torch.int64
+         self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+         self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
+
+         # --- label_dtype logic ---
+         if kind == MLTaskKeys.REGRESSION or kind == MLTaskKeys.BINARY_CLASSIFICATION:
+             label_dtype = torch.float32
+         elif kind == MLTaskKeys.MULTICLASS_CLASSIFICATION:
+             label_dtype = torch.int64
+         else:
+             _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.REGRESSION}', '{MLTaskKeys.BINARY_CLASSIFICATION}', or '{MLTaskKeys.MULTICLASS_CLASSIFICATION}'.")
+             raise ValueError()
+         self.kind = kind

          # --- 4. Scale (using the schema) ---
          if _apply_scaling:
-             X_train_final, X_test_final = self._prepare_scaler(
-                 X_train, y_train, X_test, label_dtype, schema
+             X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_val, X_test, label_dtype, schema
              )
          else:
              _LOGGER.info("Features have not been scaled as specified.")
              X_train_final = X_train.to_numpy()
+             X_val_final = X_val.to_numpy()
              X_test_final = X_test.to_numpy()

          # --- 5. Create Datasets ---
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
          self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-
+
+     def set_class_map(self, class_map: dict[str, int]) -> None:
+         """
+         Sets a map of class_name -> integer_label.
+
+         This is used by the InferenceHandler and to finalize the model after training.
+
+         Args:
+             class_map (Dict[str, int]): A dictionary mapping the integer label
+                 to its string name.
+                 Example: {'cat': 0, 'dog': 1, 'bird': 2}
+         """
+         if self.kind == MLTaskKeys.REGRESSION:
+             _LOGGER.warning(f"Class Map is for classifications tasks only.")
+             return
+
+         self.class_map = class_map
+
+         try:
+             sorted_items = sorted(class_map.items(), key=lambda item: item[1])
+             class_list = [item[0] for item in sorted_items]
+         except Exception as e:
+             _LOGGER.error(f"Could not sort class map. Ensure it is a dict of {str: int}. Error: {e}")
+             raise TypeError()
+
+         if self._train_ds:
+             self._train_ds.classes = class_list # type: ignore
+         if self._val_ds:
+             self._val_ds.classes = class_list # type: ignore
+         if self._test_ds:
+             self._test_ds.classes = class_list # type: ignore
+
+         _LOGGER.info(f"Class map set for dataset '{self.id}':\n{class_map}")
+
      def __repr__(self) -> str:
          s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
          s += f" Target: {self.target_names[0]}\n"
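The new `set_class_map` also derives an index-ordered list of class names and pushes it onto each split's `.classes` attribute. A standalone sketch of just that sorting step, using the docstring's own example:

```python
class_map = {"cat": 0, "dog": 1, "bird": 2}

# Order class names by their integer label, exactly as set_class_map does.
sorted_items = sorted(class_map.items(), key=lambda item: item[1])
class_list = [name for name, _ in sorted_items]

print(class_list)  # ['cat', 'dog', 'bird']
```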
@@ -342,6 +445,8 @@ class DatasetMaker(_BaseDatasetMaker):

          if self._train_ds:
              s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+         if self._val_ds:
+             s += f" Validation Samples: {len(self._val_ds)}\n" # type: ignore
          if self._test_ds:
              s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore

@@ -349,7 +454,7 @@ class DatasetMaker(_BaseDatasetMaker):


  # --- Multi-Target Class ---
- class DatasetMakerMulti(_BaseDatasetMaker):
+ class DragonDatasetMulti(_BaseDatasetMaker):
      """
      Dataset maker for pre-processed, numerical pandas DataFrames with
      multiple target columns.
@@ -358,15 +463,15 @@ class DatasetMakerMulti(_BaseDatasetMaker):
      *target_columns*. It validates that the schema's features and the
      target columns are mutually exclusive and together account for all
      columns in the DataFrame.
-
-     Targets dtype is torch.float32
      """
      def __init__(self,
                   pandas_df: pandas.DataFrame,
                   target_columns: List[str],
                   schema: FeatureSchema,
-                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-                  test_size: float = 0.2,
+                  kind: Literal["multitarget regression", "multilabel binary classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                  validation_size: float = 0.2,
+                  test_size: float = 0.1,
                   random_state: int = 42):
          """
          Args:
@@ -377,11 +482,17 @@ class DatasetMakerMulti(_BaseDatasetMaker):
              List of target column names.
          schema (FeatureSchema):
              The definitive schema object from data_exploration.
-         scaler ("fit" | "none" | PytorchScaler):
+         kind (str):
+             The type of multi-target ML task. Must be one of:
+             - "multitarget regression"
+             - "multilabel binary classification"
+         scaler ("fit" | "none" | DragonScaler):
              Strategy for data scaling:
-             - "fit": Fit a new PytorchScaler on continuous features.
+             - "fit": Fit a new DragonScaler on continuous features.
              - "none": Do not scale data (e.g., for TabularTransformer).
-             - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+             - DragonScaler instance: Use a pre-fitted scaler to transform data.
+         validation_size (float):
+             The proportion of the dataset to allocate to the validation split.
          test_size (float):
              The proportion of the dataset to allocate to the test split.
          random_state (int):
@@ -389,21 +500,34 @@ class DatasetMakerMulti(_BaseDatasetMaker):

          ## Note:
          For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
-         This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
+         This loss function requires the labels to be torch.float32 which is the same type required for multi-regression tasks.
          """
          super().__init__()

+         # --- Validation for split sizes ---
+         if (validation_size + test_size) >= 1.0:
+             _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+             raise ValueError("validation_size and test_size sum must be < 1.0")
+         elif validation_size <= 0.0:
+             _LOGGER.error(f"Invalid validation split of {validation_size}.")
+             raise ValueError()
+
+         # --- Validate kind parameter ---
+         if kind not in [MLTaskKeys.MULTITARGET_REGRESSION, MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION]:
+             _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.MULTITARGET_REGRESSION}' or '{MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION}'.")
+             raise ValueError()
+
          _apply_scaling: bool = False
          if scaler == "fit":
              self.scaler = None
              _apply_scaling = True
          elif scaler == "none":
              self.scaler = None
-         elif isinstance(scaler, PytorchScaler):
+         elif isinstance(scaler, DragonScaler):
              self.scaler = scaler # Use the provided one
              _apply_scaling = True
          else:
-             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
              raise ValueError()

          # --- 1. Get features and targets from schema/args ---
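The note above is why both multi-target kinds share `label_dtype = torch.float32`: `nn.BCEWithLogitsLoss` expects float targets shaped like the logits, while single-label multiclass tasks use `int64` class indices with `nn.CrossEntropyLoss` instead. A small runnable contrast with made-up tensors:

```python
import torch
import torch.nn as nn

logits = torch.randn(4, 3)  # batch of 4, three outputs

# Multilabel binary (and plain binary) targets: float32, same shape as the logits.
multilabel_targets = torch.tensor(
    [[1, 0, 1], [0, 0, 1], [1, 1, 0], [0, 1, 0]], dtype=torch.float32)
print(nn.BCEWithLogitsLoss()(logits, multilabel_targets))

# Single-label multiclass targets: int64 class indices with CrossEntropyLoss instead.
multiclass_targets = torch.tensor([0, 2, 1, 1], dtype=torch.int64)
print(nn.CrossEntropyLoss()(logits, multiclass_targets))
```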
@@ -433,32 +557,47 @@ class DatasetMakerMulti(_BaseDatasetMaker):
          # --- 3. Split Data ---
          features_df = pandas_df[self._feature_names]
          target_df = pandas_df[self._target_names]
-
-         X_train, X_test, y_train, y_test = train_test_split(
+
+         # First split: (Train + Val) vs Test
+         X_train_val, X_test, y_train_val, y_test = train_test_split(
              features_df,
              target_df,
              test_size=test_size,
              random_state=random_state
          )
-         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+
+         # Calculate validation split size relative to the (Train + Val) set
+         val_split_size = validation_size / (1.0 - test_size)
+
+         # Second split: Train vs Val
+         X_train, X_val, y_train, y_val = train_test_split(
+             X_train_val,
+             y_train_val,
+             test_size=val_split_size,
+             random_state=random_state
+         )
+
+         self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+         self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape

          # Multi-target for regression or multi-binary
          label_dtype = torch.float32

          # --- 4. Scale (using the schema) ---
          if _apply_scaling:
-             X_train_final, X_test_final = self._prepare_scaler(
-                 X_train, y_train, X_test, label_dtype, schema
+             X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_val, X_test, label_dtype, schema
              )
          else:
              _LOGGER.info("Features have not been scaled as specified.")
              X_train_final = X_train.to_numpy()
+             X_val_final = X_val.to_numpy()
              X_test_final = X_test.to_numpy()

          # --- 5. Create Datasets ---
          # _PytorchDataset now correctly handles y_train (a DataFrame)
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
          self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)

      def __repr__(self) -> str:
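Tying the multi-target changes together, a hedged construction sketch; `df` and `schema` are assumed inputs (produced by the package's ETL and data_exploration steps, which this diff does not show), the column names are illustrative, and the kind strings are the new 16.0.0 vocabulary shown above:

```python
from ml_tools.ML_datasetmaster import DragonDatasetMulti

# `df` (pandas.DataFrame) and `schema` (FeatureSchema) are assumed to exist.
dataset = DragonDatasetMulti(
    pandas_df=df,
    target_columns=["target_a", "target_b"],
    schema=schema,
    kind="multilabel binary classification",  # or "multitarget regression"
    scaler="fit",
    validation_size=0.2,
    test_size=0.1,
)
train_ds = dataset.train_dataset
val_ds = dataset.validation_dataset
test_ds = dataset.test_dataset
```

The same three-property access pattern applies to the single-target `DragonDataset`.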
@@ -469,234 +608,13 @@ class DatasetMakerMulti(_BaseDatasetMaker):

          if self._train_ds:
              s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+         if self._val_ds:
+             s += f" Validation Samples: {len(self._val_ds)}\n" # type: ignore
          if self._test_ds:
              s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore

          return s


- # --- Private Base Class ---
- class _BaseMaker(ABC):
-     """
-     Abstract Base Class for extra dataset makers.
-     """
-     def __init__(self):
-         self._train_dataset = None
-         self._test_dataset = None
-         self._val_dataset = None
-
-     @abstractmethod
-     def get_datasets(self) -> Tuple[Dataset, ...]:
-         """
-         The primary method to retrieve the final, processed PyTorch datasets.
-         Must be implemented by all subclasses.
-         """
-         pass
-
-
- # --- SequenceMaker ---
- class SequenceMaker(_BaseMaker):
-     """
-     Creates windowed PyTorch datasets from time-series data.
-
-     Pipeline:
-
-     1. `.split_data()`: Separate time series into training and testing portions.
-     2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
-     3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
-     4. `.get_datasets()`: Return Pytorch train and test datasets.
-     """
-     def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
-         super().__init__()
-         self.sequence_length = sequence_length
-         self.scaler = None
-
-         if isinstance(data, pandas.DataFrame):
-             self.time_axis = data.index.values
-             self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
-         elif isinstance(data, pandas.Series):
-             self.time_axis = data.index.values
-             self.sequence = data.values.astype(numpy.float32)
-         elif isinstance(data, numpy.ndarray):
-             self.time_axis = numpy.arange(len(data))
-             self.sequence = data.astype(numpy.float32)
-         else:
-             _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
-             raise TypeError()
-
-         self.train_sequence = None
-         self.test_sequence = None
-
-         self._is_split = False
-         self._is_normalized = False
-         self._are_windows_generated = False
-
-     def normalize_data(self) -> 'SequenceMaker':
-         """
-         Normalizes the sequence data using PytorchScaler. Must be called AFTER
-         splitting to prevent data leakage from the test set.
-         """
-         if not self._is_split:
-             _LOGGER.error("Data must be split BEFORE normalizing. Call .split_data() first.")
-             raise RuntimeError()
-
-         if self.scaler:
-             _LOGGER.warning("Data has already been normalized.")
-             return self
-
-         # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
-         # The scaler expects 2D data [n_samples, n_features].
-         train_features = self.train_sequence.reshape(-1, 1) # type: ignore
-
-         # _PytorchDataset needs labels, so we create dummy ones.
-         dummy_labels = numpy.zeros(len(train_features))
-         temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
-
-         # 2. Fit the PytorchScaler on the temporary training dataset.
-         # The sequence is a single feature, so its index is [0].
-         _LOGGER.info("Fitting PytorchScaler on the training data...")
-         self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
-
-         # 3. Transform sequences using the fitted scaler.
-         # The transform method requires a tensor, so we convert, transform, and convert back.
-         train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-         test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-
-         self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
-         self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
-
-         self._is_normalized = True
-         _LOGGER.info("Sequence data normalized using PytorchScaler.")
-         return self
-
-     def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
-         """Splits the sequence into training and testing portions."""
-         if self._is_split:
-             _LOGGER.warning("Data has already been split.")
-             return self
-
-         split_idx = int(len(self.sequence) * (1 - test_size))
-         self.train_sequence = self.sequence[:split_idx]
-         self.test_sequence = self.sequence[split_idx - self.sequence_length:]
-
-         self.train_time_axis = self.time_axis[:split_idx]
-         self.test_time_axis = self.time_axis[split_idx:]
-
-         self._is_split = True
-         _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
-         return self
-
-     def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
-         """
-         Generates overlapping windows for features and labels.
-
-         "sequence-to-sequence": Label vectors are of the same size as the feature vectors instead of a single future prediction.
-         """
-         if not self._is_split:
-             _LOGGER.error("Cannot generate windows before splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence) # type: ignore
-         self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence) # type: ignore
-
-         self._are_windows_generated = True
-         _LOGGER.info("Feature and label windows generated for train and test sets.")
-         return self
-
-     def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
-         """Efficiently creates windowed features and labels using numpy."""
-         if len(data) <= self.sequence_length:
-             _LOGGER.error("Data length must be greater than the sequence_length to create at least one window.")
-             raise ValueError()
-
-         if not use_sequence_labels:
-             features = data[:-1]
-             labels = data[self.sequence_length:]
-
-             n_windows = len(features) - self.sequence_length + 1
-             bytes_per_item = features.strides[0]
-             strided_features = numpy.lib.stride_tricks.as_strided(
-                 features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
-             )
-             return _PytorchDataset(strided_features, labels, labels_dtype=torch.float32)
-
-         else:
-             x_data = data[:-1]
-             y_data = data[1:]
-
-             n_windows = len(x_data) - self.sequence_length + 1
-             bytes_per_item = x_data.strides[0]
-
-             strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-             strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-
-             return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
-     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-         """Applies inverse transformation using the stored PytorchScaler."""
-         if self.scaler is None:
-             _LOGGER.error("Data was not normalized. Cannot denormalize.")
-             raise RuntimeError()
-
-         # Ensure data is a torch.Tensor
-         if isinstance(data, numpy.ndarray):
-             tensor_data = torch.tensor(data, dtype=torch.float32)
-         else:
-             tensor_data = data
-
-         # Reshape for the scaler [n_samples, n_features]
-         if tensor_data.ndim == 1:
-             tensor_data = tensor_data.view(-1, 1)
-
-         # Apply inverse transform and convert back to a flat numpy array
-         original_scale_tensor = self.scaler.inverse_transform(tensor_data)
-         return original_scale_tensor.cpu().numpy().flatten()
-
-     def plot(self, predictions: Optional[numpy.ndarray] = None):
-         """Plots the original training and testing data, with optional predictions."""
-         if not self._is_split:
-             _LOGGER.error("Cannot plot before splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         plt.figure(figsize=(15, 6))
-         plt.title("Time Series Data")
-         plt.grid(True)
-         plt.xlabel("Time")
-         plt.ylabel("Value")
-
-         plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data') # type: ignore
-         plt.plot(self.test_time_axis, self.scaler.inverse_transform(self.test_sequence[self.sequence_length-1:].reshape(-1, 1)), label='Test Data') # type: ignore
-
-         if predictions is not None:
-             pred_time_axis = self.test_time_axis[:len(predictions)]
-             plt.plot(pred_time_axis, predictions, label='Predictions', c='red')
-
-         plt.legend()
-         plt.show()
-
-     def get_datasets(self) -> Tuple[Dataset, Dataset]:
-         """Returns the final train and test datasets."""
-         if not self._are_windows_generated:
-             _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
-             raise RuntimeError()
-         return self._train_dataset, self._test_dataset
-
-     def __repr__(self) -> str:
-         s = f"<{self.__class__.__name__}>:\n"
-         s += f" Sequence Length (Window): {self.sequence_length}\n"
-         s += f" Total Data Points: {len(self.sequence)}\n"
-         s += " --- Status ---\n"
-         s += f" Split: {self._is_split}\n"
-         s += f" Normalized: {self._is_normalized}\n"
-         s += f" Windows Generated: {self._are_windows_generated}\n"
-
-         if self._are_windows_generated:
-             train_len = len(self._train_dataset) if self._train_dataset else 0 # type: ignore
-             test_len = len(self._test_dataset) if self._test_dataset else 0 # type: ignore
-             s += f" Datasets (Train/Test): {train_len} / {test_len} windows\n"
-
-         return s
-
-
  def info():
      _script_info(__all__)