dragon-ml-toolbox 14.7.0__py3-none-any.whl → 16.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +9 -5
  2. dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
  3. ml_tools/ETL_cleaning.py +20 -20
  4. ml_tools/ETL_engineering.py +23 -25
  5. ml_tools/GUI_tools.py +20 -20
  6. ml_tools/MICE_imputation.py +3 -3
  7. ml_tools/ML_callbacks.py +43 -26
  8. ml_tools/ML_configuration.py +704 -24
  9. ml_tools/ML_datasetmaster.py +235 -280
  10. ml_tools/ML_evaluation.py +144 -39
  11. ml_tools/ML_evaluation_multi.py +103 -35
  12. ml_tools/ML_inference.py +290 -208
  13. ml_tools/ML_models.py +13 -102
  14. ml_tools/ML_models_advanced.py +1 -1
  15. ml_tools/ML_optimization.py +12 -12
  16. ml_tools/ML_scaler.py +11 -11
  17. ml_tools/ML_sequence_datasetmaster.py +341 -0
  18. ml_tools/ML_sequence_evaluation.py +219 -0
  19. ml_tools/ML_sequence_inference.py +391 -0
  20. ml_tools/ML_sequence_models.py +139 -0
  21. ml_tools/ML_trainer.py +1342 -386
  22. ml_tools/ML_utilities.py +1 -1
  23. ml_tools/ML_vision_datasetmaster.py +120 -72
  24. ml_tools/ML_vision_evaluation.py +30 -6
  25. ml_tools/ML_vision_inference.py +129 -152
  26. ml_tools/ML_vision_models.py +1 -1
  27. ml_tools/ML_vision_transformers.py +121 -40
  28. ml_tools/PSO_optimization.py +6 -6
  29. ml_tools/SQL.py +4 -4
  30. ml_tools/{keys.py → _keys.py} +45 -0
  31. ml_tools/_schema.py +1 -1
  32. ml_tools/ensemble_evaluation.py +1 -1
  33. ml_tools/ensemble_inference.py +7 -33
  34. ml_tools/ensemble_learning.py +1 -1
  35. ml_tools/optimization_tools.py +2 -2
  36. ml_tools/path_manager.py +5 -5
  37. ml_tools/utilities.py +1 -2
  38. dragon_ml_toolbox-14.7.0.dist-info/RECORD +0 -49
  39. ml_tools/RNN_forecast.py +0 -56
  40. ml_tools/_ML_vision_recipe.py +0 -88
  41. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
  42. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
  43. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  44. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
@@ -3,27 +3,25 @@ from torch.utils.data import Dataset
  import pandas
  import numpy
  from sklearn.model_selection import train_test_split
- from typing import Literal, Union, Tuple, List, Optional
- from abc import ABC, abstractmethod
- import matplotlib.pyplot as plt
+ from typing import Literal, Union, List, Optional
+ from abc import ABC
  from pathlib import Path
  
  from .path_manager import make_fullpath, sanitize_filename
  from ._logger import _LOGGER
  from ._script_info import _script_info
  from .custom_logger import save_list_strings
- from .ML_scaler import PytorchScaler
- from .keys import DatasetKeys
+ from .ML_scaler import DragonScaler
+ from ._keys import DatasetKeys, MLTaskKeys
  from ._schema import FeatureSchema
+ from .custom_logger import custom_logger
  
  
  __all__ = [
-     "DatasetMaker",
-     "DatasetMakerMulti",
-     "SequenceMaker"
+     "DragonDataset",
+     "DragonDatasetMulti"
  ]
  
-
  # --- Internal Helper Class ---
  class _PytorchDataset(Dataset):
      """
@@ -57,6 +55,8 @@ class _PytorchDataset(Dataset):
  
          self._feature_names = feature_names
          self._target_names = target_names
+         self._classes: List[str] = []
+         self._class_map: dict[str,int] = dict()
  
      def __len__(self):
          return len(self.features)
@@ -78,6 +78,15 @@ class _PytorchDataset(Dataset):
              return self._target_names
          else:
              _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
+             raise ValueError()
+
+     @property
+     def classes(self):
+         return self._classes
+
+     @property
+     def class_map(self):
+         return self._class_map
  
  
  # --- Abstract Base Class ---
@@ -88,23 +97,29 @@ class _BaseDatasetMaker(ABC):
      """
      def __init__(self):
          self._train_ds: Optional[Dataset] = None
+         self._val_ds: Optional[Dataset] = None
          self._test_ds: Optional[Dataset] = None
-         self.scaler: Optional[PytorchScaler] = None
+         self.scaler: Optional[DragonScaler] = None
          self._id: Optional[str] = None
          self._feature_names: List[str] = []
          self._target_names: List[str] = []
          self._X_train_shape = (0,0)
+         self._X_val_shape = (0,0)
          self._X_test_shape = (0,0)
          self._y_train_shape = (0,)
+         self._y_val_shape = (0,)
          self._y_test_shape = (0,)
+         self.class_map: dict[str, int] = dict()
+         self.classes: list[str] = list()
  
      def _prepare_scaler(self,
                          X_train: pandas.DataFrame,
                          y_train: Union[pandas.Series, pandas.DataFrame],
+                         X_val: pandas.DataFrame,
                          X_test: pandas.DataFrame,
                          label_dtype: torch.dtype,
                          schema: FeatureSchema):
-         """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
+         """Internal helper to fit and apply a DragonScaler using a FeatureSchema."""
          continuous_feature_indices: Optional[List[int]] = None
  
          # Get continuous feature indices *from the schema*
@@ -122,26 +137,33 @@ class _BaseDatasetMaker(ABC):
              _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
  
          X_train_values = X_train.to_numpy()
+         X_val_values = X_val.to_numpy()
          X_test_values = X_test.to_numpy()
  
          # continuous_feature_indices is derived
          if self.scaler is None and continuous_feature_indices:
-             _LOGGER.info("Fitting a new PytorchScaler on training data.")
+             _LOGGER.info("Fitting a new DragonScaler on training data.")
              temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
-             self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+             self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices)
  
          if self.scaler and self.scaler.mean_ is not None:
-             _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+             _LOGGER.info("Applying scaler transformation to train, validation, and test feature sets.")
              X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+             X_val_tensor = self.scaler.transform(torch.tensor(X_val_values, dtype=torch.float32))
              X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
-             return X_train_tensor.numpy(), X_test_tensor.numpy()
+             return X_train_tensor.numpy(), X_val_tensor.numpy(), X_test_tensor.numpy()
  
-         return X_train_values, X_test_values
+         return X_train_values, X_val_values, X_test_values
  
      @property
      def train_dataset(self) -> Dataset:
          if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
          return self._train_ds
+
+     @property
+     def validation_dataset(self) -> Dataset:
+         if self._val_ds is None: raise RuntimeError("Dataset not yet created.")
+         return self._val_ds
  
      @property
      def test_dataset(self) -> Dataset:
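The widened `_prepare_scaler` return keeps the leakage discipline intact: statistics come from the training split only and are then applied unchanged to the validation and test splits. A standalone sketch of the same pattern in plain PyTorch (illustrative only, not the DragonScaler internals):

```python
import torch

X_train, X_val, X_test = torch.randn(700, 5), torch.randn(200, 5), torch.randn(100, 5)

# Fit on train only: the mean/std never see validation or test rows.
mean = X_train.mean(dim=0)
std = X_train.std(dim=0).clamp_min(1e-8)  # guard against zero-variance columns

scale = lambda X: (X - mean) / std
X_train_s, X_val_s, X_test_s = scale(X_train), scale(X_val), scale(X_test)
```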
@@ -176,6 +198,7 @@ class _BaseDatasetMaker(ABC):
      def dataframes_info(self) -> None:
          print("--- DataFrame Shapes After Split ---")
          print(f" X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+         print(f" X_val shape: {self._X_val_shape}, y_val shape: {self._y_val_shape}")
          print(f" X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
          print("------------------------------------")
  
@@ -195,7 +218,7 @@ class _BaseDatasetMaker(ABC):
  
      def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
          """
-         Saves the fitted PytorchScaler's state to a .pth file.
+         Saves the fitted DragonScaler's state to a .pth file.
  
          The filename is automatically generated based on the dataset id.
  
@@ -215,6 +238,24 @@ class _BaseDatasetMaker(ABC):
          self.scaler.save(filepath, verbose=False)
          if verbose:
              _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+     def save_class_map(self, directory: Union[str,Path], verbose: bool=True) -> None:
+         """
+         Saves the class to index mapping {str: int} to a directory.
+         """
+         if not self.class_map:
+             _LOGGER.warning(f"No class_map defined. Skipping.")
+             return
+
+         log_name = f"Class_to_Index_{self.id}" if self.id else "Class_to_Index"
+
+         custom_logger(data=self.class_map,
+                       save_directory=directory,
+                       log_name=log_name,
+                       add_timestamp=False,
+                       dict_as="json")
+         if verbose:
+             _LOGGER.info(f"Class map for '{self.id}' saved as '{log_name}.json'.")
  
      def save_artifacts(self, directory: Union[str, Path], verbose: bool=True) -> None:
          """
@@ -224,19 +265,22 @@ class _BaseDatasetMaker(ABC):
          self.save_target_names(directory=directory, verbose=verbose)
          if self.scaler is not None:
              self.save_scaler(directory=directory, verbose=verbose)
+         if self.class_map:
+             self.save_class_map(directory=directory, verbose=verbose)
  
  
  # Single target dataset
- class DatasetMaker(_BaseDatasetMaker):
+ class DragonDataset(_BaseDatasetMaker):
      """
      Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
  
      This class takes a DataFrame, and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
-     It can also create and apply a PytorchScaler using the schema.
+     It can also create and apply a DragonScaler using the schema.
  
      Attributes:
-         `scaler` -> PytorchScaler | None
+         `scaler` -> DragonScaler | None
          `train_dataset` -> PyTorch Dataset
+         `validation_dataset` -> PyTorch Dataset
          `test_dataset` -> PyTorch Dataset
          `feature_names` -> list[str]
          `target_names` -> list[str]
@@ -247,9 +291,11 @@ class DatasetMaker(_BaseDatasetMaker):
      def __init__(self,
                   pandas_df: pandas.DataFrame,
                   schema: FeatureSchema,
-                  kind: Literal["regression", "classification"],
-                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-                  test_size: float = 0.2,
+                  kind: Literal["regression", "binary classification", "multiclass classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                  validation_size: float = 0.2,
+                  test_size: float = 0.1,
+                  class_map: Optional[dict[str,int]]=None,
                   random_state: int = 42):
          """
          Args:
@@ -257,32 +303,46 @@ class DatasetMaker(_BaseDatasetMaker):
                  The pre-processed input DataFrame containing all columns (features and single target).
              schema (FeatureSchema):
                  The definitive schema object from data_exploration.
-             kind ("regression" | "classification"):
-                 The type of ML task. This determines the data type of the labels.
-             scaler ("fit" | "none" | PytorchScaler):
+             kind (str):
+                 The type of ML task. Must be one of:
+                 - "regression"
+                 - "binary classification"
+                 - "multiclass classification"
+             scaler ("fit" | "none" | DragonScaler):
                  Strategy for data scaling:
-                 - "fit": Fit a new PytorchScaler on continuous features.
+                 - "fit": Fit a new DragonScaler on continuous features.
                  - "none": Do not scale data (e.g., for TabularTransformer).
-                 - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+                 - DragonScaler instance: Use a pre-fitted scaler to transform data.
+             validation_size (float):
+                 The proportion of the *original* dataset to allocate to the validation split.
              test_size (float):
-                 The proportion of the dataset to allocate to the test split.
+                 The proportion of the dataset to allocate to the test split (can be 0).
+             class_map (dict[str,int] | None): Optional class map for the target classes in classification tasks. Can be set later using `.set_class_map()`.
              random_state (int):
                  The seed for the random number generator for reproducibility.
  
          """
          super().__init__()
  
+         # --- Validation for split sizes ---
+         if (validation_size + test_size) >= 1.0:
+             _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+             raise ValueError()
+         elif validation_size <= 0.0:
+             _LOGGER.error(f"Invalid validation split of {validation_size}.")
+             raise ValueError()
+
          _apply_scaling: bool = False
          if scaler == "fit":
              self.scaler = None # To be created
              _apply_scaling = True
          elif scaler == "none":
              self.scaler = None
-         elif isinstance(scaler, PytorchScaler):
+         elif isinstance(scaler, DragonScaler):
              self.scaler = scaler # Use the provided one
              _apply_scaling = True
          else:
-             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
              raise ValueError()
  
          # --- 1. Identify features (from schema) ---
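Putting the new signature together, a constructor call might look like the following sketch (the DataFrame, schema, and label names are placeholders; only the parameter names and defaults come from this diff):

```python
dataset = DragonDataset(
    pandas_df=df,                          # features + exactly one target column
    schema=schema,                         # FeatureSchema from data_exploration
    kind="binary classification",
    scaler="fit",                          # fit a new DragonScaler on continuous features
    validation_size=0.2,
    test_size=0.1,
    class_map={"negative": 0, "positive": 1},
)
train_ds = dataset.train_dataset
val_ds = dataset.validation_dataset
test_ds = dataset.test_dataset
```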
@@ -298,7 +358,7 @@ class DatasetMaker(_BaseDatasetMaker):
              _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
              raise ValueError("No target column found in DataFrame.")
          if len(target_cols_set) > 1:
-             _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+             _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. One target required.")
              raise ValueError("Ambiguous target: More than one non-feature column found.")
  
          target_name = list(target_cols_set)[0]
@@ -308,32 +368,112 @@ class DatasetMaker(_BaseDatasetMaker):
          # --- 3. Split Data ---
          features_df = pandas_df[self._feature_names]
          target_series = pandas_df[target_name]
-
+
+         # First split: (Train + Val) vs Test
+         X_train_val, X_test, y_train_val, y_test = train_test_split(
              features_df,
              target_series,
              test_size=test_size,
              random_state=random_state
          )
-         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+         # Calculate validation split size relative to the (Train + Val) set
+         val_split_size = validation_size / (1.0 - test_size)
+
+         # Second split: Train vs Val
+         X_train, X_val, y_train, y_val = train_test_split(
+             X_train_val,
+             y_train_val,
+             test_size=val_split_size,
+             random_state=random_state
+         )
+
+         self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+         self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
  
-         label_dtype = torch.float32 if kind == "regression" else torch.int64
+         # --- label_dtype logic ---
+         if kind == MLTaskKeys.REGRESSION or kind == MLTaskKeys.BINARY_CLASSIFICATION:
+             label_dtype = torch.float32
+         elif kind == MLTaskKeys.MULTICLASS_CLASSIFICATION:
+             label_dtype = torch.int64
+         else:
+             _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.REGRESSION}', '{MLTaskKeys.BINARY_CLASSIFICATION}', or '{MLTaskKeys.MULTICLASS_CLASSIFICATION}'.")
+             raise ValueError()
+         self.kind = kind
  
          # --- 4. Scale (using the schema) ---
          if _apply_scaling:
-             X_train_final, X_test_final = self._prepare_scaler(
-                 X_train, y_train, X_test, label_dtype, schema
+             X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_val, X_test, label_dtype, schema
              )
          else:
              _LOGGER.info("Features have not been scaled as specified.")
              X_train_final = X_train.to_numpy()
+             X_val_final = X_val.to_numpy()
              X_test_final = X_test.to_numpy()
  
          # --- 5. Create Datasets ---
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
          self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-
+
+         # --- 6. Create class map if given ---
+         if self.kind != MLTaskKeys.REGRESSION:
+             if class_map is None:
+                 self.class_map = dict()
+             else:
+                 self.set_class_map(class_map)
+         else:
+             self.class_map = dict()
+
+     def set_class_map(self, class_map: dict[str, int], force_overwrite: bool=False) -> None:
+         """
+         Sets a map of class_name -> integer_label.
+
+         This is used by the InferenceHandler and to finalize the model after training.
+
+         Args:
+             class_map (Dict[str, int]): A dictionary mapping each class name
+                 to its integer label.
+                 Example: {'cat': 0, 'dog': 1, 'bird': 2}
+             force_overwrite (bool): Required to overwrite a previously set class map.
+         """
+         if self.kind == MLTaskKeys.REGRESSION:
+             _LOGGER.warning("Class map is for classification tasks only.")
+             return
+
+         if self.class_map:
+             warning_message = "Class map was previously set."
+             if not force_overwrite:
+                 warning_message += " Use `force_overwrite=True` to set new values."
+                 _LOGGER.warning(warning_message)
+                 return
+             else:
+                 warning_message += " Setting new values..."
+                 _LOGGER.warning(warning_message)
+
+         self.class_map = class_map
+
+         try:
+             sorted_items = sorted(class_map.items(), key=lambda item: item[1])
+             class_list = [item[0] for item in sorted_items]
+         except Exception as e:
+             _LOGGER.error(f"Could not sort class map. Ensure it is a dict of {{str: int}}. Error: {e}")
+             raise TypeError()
+         else:
+             self.classes = class_list
+
+         if self._train_ds:
+             self._train_ds._classes = class_list # type: ignore
+             self._train_ds._class_map = class_map # type: ignore
+         if self._val_ds:
+             self._val_ds._classes = class_list # type: ignore
+             self._val_ds._class_map = class_map # type: ignore
+         if self._test_ds:
+             self._test_ds._classes = class_list # type: ignore
+             self._test_ds._class_map = class_map # type: ignore
+
+         _LOGGER.info(f"Class map set for dataset '{self.id}' and its subsets:\n{class_map}")
+
      def __repr__(self) -> str:
          s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
          s += f" Target: {self.target_names[0]}\n"
@@ -342,6 +482,8 @@ class DatasetMaker(_BaseDatasetMaker):
  
          if self._train_ds:
              s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+         if self._val_ds:
+             s += f" Validation Samples: {len(self._val_ds)}\n" # type: ignore
          if self._test_ds:
              s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore
  
@@ -349,7 +491,7 @@ class DatasetMaker(_BaseDatasetMaker):
  
  
  # --- Multi-Target Class ---
- class DatasetMakerMulti(_BaseDatasetMaker):
+ class DragonDatasetMulti(_BaseDatasetMaker):
      """
      Dataset maker for pre-processed, numerical pandas DataFrames with
      multiple target columns.
@@ -358,15 +500,15 @@ class DatasetMakerMulti(_BaseDatasetMaker):
      *target_columns*. It validates that the schema's features and the
      target columns are mutually exclusive and together account for all
      columns in the DataFrame.
-
-     Targets dtype is torch.float32
      """
      def __init__(self,
                   pandas_df: pandas.DataFrame,
                   target_columns: List[str],
                   schema: FeatureSchema,
-                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
-                  test_size: float = 0.2,
+                  kind: Literal["multitarget regression", "multilabel binary classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                  validation_size: float = 0.2,
+                  test_size: float = 0.1,
                   random_state: int = 42):
          """
          Args:
@@ -377,11 +519,17 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                  List of target column names.
              schema (FeatureSchema):
                  The definitive schema object from data_exploration.
-             scaler ("fit" | "none" | PytorchScaler):
+             kind (str):
+                 The type of multi-target ML task. Must be one of:
+                 - "multitarget regression"
+                 - "multilabel binary classification"
+             scaler ("fit" | "none" | DragonScaler):
                  Strategy for data scaling:
-                 - "fit": Fit a new PytorchScaler on continuous features.
+                 - "fit": Fit a new DragonScaler on continuous features.
                  - "none": Do not scale data (e.g., for TabularTransformer).
-                 - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+                 - DragonScaler instance: Use a pre-fitted scaler to transform data.
+             validation_size (float):
+                 The proportion of the dataset to allocate to the validation split.
              test_size (float):
                  The proportion of the dataset to allocate to the test split.
              random_state (int):
@@ -389,21 +537,34 @@ class DatasetMakerMulti(_BaseDatasetMaker):
  
          ## Note:
          For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
-         This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
+         This loss function requires the labels to be torch.float32, which is the same type required for multi-regression tasks.
          """
          super().__init__()
  
+         # --- Validation for split sizes ---
+         if (validation_size + test_size) >= 1.0:
+             _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+             raise ValueError("validation_size and test_size sum must be < 1.0")
+         elif validation_size <= 0.0:
+             _LOGGER.error(f"Invalid validation split of {validation_size}.")
+             raise ValueError()
+
+         # --- Validate kind parameter ---
+         if kind not in [MLTaskKeys.MULTITARGET_REGRESSION, MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION]:
+             _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.MULTITARGET_REGRESSION}' or '{MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION}'.")
+             raise ValueError()
+
          _apply_scaling: bool = False
          if scaler == "fit":
              self.scaler = None
              _apply_scaling = True
          elif scaler == "none":
              self.scaler = None
-         elif isinstance(scaler, PytorchScaler):
+         elif isinstance(scaler, DragonScaler):
              self.scaler = scaler # Use the provided one
              _apply_scaling = True
          else:
-             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
              raise ValueError()
  
          # --- 1. Get features and targets from schema/args ---
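The float32 note above is checkable in isolation: `nn.BCEWithLogitsLoss` expects raw logits and float targets of the same shape, which is why multilabel rows are encoded as 0.0/1.0 vectors rather than int64 class indices. A minimal standalone check (plain PyTorch, not library code):

```python
import torch
import torch.nn as nn

logits = torch.randn(4, 3)                      # batch of 4 samples, 3 independent binary labels
targets = torch.tensor([[1., 0., 1.],
                        [0., 0., 1.],
                        [1., 1., 0.],
                        [0., 1., 0.]])          # torch.float32, same shape as the logits
loss = nn.BCEWithLogitsLoss()(logits, targets)  # int64 targets would raise a dtype error
```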
@@ -433,32 +594,47 @@ class DatasetMakerMulti(_BaseDatasetMaker):
          # --- 3. Split Data ---
          features_df = pandas_df[self._feature_names]
          target_df = pandas_df[self._target_names]
-
+
+         # First split: (Train + Val) vs Test
+         X_train_val, X_test, y_train_val, y_test = train_test_split(
              features_df,
              target_df,
              test_size=test_size,
              random_state=random_state
          )
-         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+
+         # Calculate validation split size relative to the (Train + Val) set
+         val_split_size = validation_size / (1.0 - test_size)
+
+         # Second split: Train vs Val
+         X_train, X_val, y_train, y_val = train_test_split(
+             X_train_val,
+             y_train_val,
+             test_size=val_split_size,
+             random_state=random_state
+         )
+
+         self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+         self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
  
          # Multi-target for regression or multi-binary
          label_dtype = torch.float32
  
          # --- 4. Scale (using the schema) ---
          if _apply_scaling:
-             X_train_final, X_test_final = self._prepare_scaler(
-                 X_train, y_train, X_test, label_dtype, schema
+             X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_val, X_test, label_dtype, schema
              )
          else:
              _LOGGER.info("Features have not been scaled as specified.")
              X_train_final = X_train.to_numpy()
+             X_val_final = X_val.to_numpy()
              X_test_final = X_test.to_numpy()
  
          # --- 5. Create Datasets ---
          # _PytorchDataset now correctly handles y_train (a DataFrame)
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
          self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
  
      def __repr__(self) -> str:
@@ -469,234 +645,13 @@ class DatasetMakerMulti(_BaseDatasetMaker):
  
          if self._train_ds:
              s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+         if self._val_ds:
+             s += f" Validation Samples: {len(self._val_ds)}\n" # type: ignore
          if self._test_ds:
              s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore
  
          return s
  
  
- # --- Private Base Class ---
- class _BaseMaker(ABC):
-     """
-     Abstract Base Class for extra dataset makers.
-     """
-     def __init__(self):
-         self._train_dataset = None
-         self._test_dataset = None
-         self._val_dataset = None
-
-     @abstractmethod
-     def get_datasets(self) -> Tuple[Dataset, ...]:
-         """
-         The primary method to retrieve the final, processed PyTorch datasets.
-         Must be implemented by all subclasses.
-         """
-         pass
-
-
- # --- SequenceMaker ---
- class SequenceMaker(_BaseMaker):
-     """
-     Creates windowed PyTorch datasets from time-series data.
-
-     Pipeline:
-
-     1. `.split_data()`: Separate time series into training and testing portions.
-     2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
-     3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
-     4. `.get_datasets()`: Return Pytorch train and test datasets.
-     """
-     def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
-         super().__init__()
-         self.sequence_length = sequence_length
-         self.scaler = None
-
-         if isinstance(data, pandas.DataFrame):
-             self.time_axis = data.index.values
-             self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
-         elif isinstance(data, pandas.Series):
-             self.time_axis = data.index.values
-             self.sequence = data.values.astype(numpy.float32)
-         elif isinstance(data, numpy.ndarray):
-             self.time_axis = numpy.arange(len(data))
-             self.sequence = data.astype(numpy.float32)
-         else:
-             _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
-             raise TypeError()
-
-         self.train_sequence = None
-         self.test_sequence = None
-
-         self._is_split = False
-         self._is_normalized = False
-         self._are_windows_generated = False
-
-     def normalize_data(self) -> 'SequenceMaker':
-         """
-         Normalizes the sequence data using PytorchScaler. Must be called AFTER
-         splitting to prevent data leakage from the test set.
-         """
-         if not self._is_split:
-             _LOGGER.error("Data must be split BEFORE normalizing. Call .split_data() first.")
-             raise RuntimeError()
-
-         if self.scaler:
-             _LOGGER.warning("Data has already been normalized.")
-             return self
-
-         # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
-         #    The scaler expects 2D data [n_samples, n_features].
-         train_features = self.train_sequence.reshape(-1, 1) # type: ignore
-
-         # _PytorchDataset needs labels, so we create dummy ones.
-         dummy_labels = numpy.zeros(len(train_features))
-         temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
-
-         # 2. Fit the PytorchScaler on the temporary training dataset.
-         #    The sequence is a single feature, so its index is [0].
-         _LOGGER.info("Fitting PytorchScaler on the training data...")
-         self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
-
-         # 3. Transform sequences using the fitted scaler.
-         #    The transform method requires a tensor, so we convert, transform, and convert back.
-         train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-         test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
-
-         self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
-         self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
-
-         self._is_normalized = True
-         _LOGGER.info("Sequence data normalized using PytorchScaler.")
-         return self
-
-     def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
-         """Splits the sequence into training and testing portions."""
-         if self._is_split:
-             _LOGGER.warning("Data has already been split.")
-             return self
-
-         split_idx = int(len(self.sequence) * (1 - test_size))
-         self.train_sequence = self.sequence[:split_idx]
-         self.test_sequence = self.sequence[split_idx - self.sequence_length:]
-
-         self.train_time_axis = self.time_axis[:split_idx]
-         self.test_time_axis = self.time_axis[split_idx:]
-
-         self._is_split = True
-         _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
-         return self
-
-     def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
-         """
-         Generates overlapping windows for features and labels.
-
-         "sequence-to-sequence": Label vectors are of the same size as the feature vectors instead of a single future prediction.
-         """
-         if not self._is_split:
-             _LOGGER.error("Cannot generate windows before splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence) # type: ignore
-         self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence) # type: ignore
-
-         self._are_windows_generated = True
-         _LOGGER.info("Feature and label windows generated for train and test sets.")
-         return self
-
-     def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
-         """Efficiently creates windowed features and labels using numpy."""
-         if len(data) <= self.sequence_length:
-             _LOGGER.error("Data length must be greater than the sequence_length to create at least one window.")
-             raise ValueError()
-
-         if not use_sequence_labels:
-             features = data[:-1]
-             labels = data[self.sequence_length:]
-
-             n_windows = len(features) - self.sequence_length + 1
-             bytes_per_item = features.strides[0]
-             strided_features = numpy.lib.stride_tricks.as_strided(
-                 features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
-             )
-             return _PytorchDataset(strided_features, labels, labels_dtype=torch.float32)
-
-         else:
-             x_data = data[:-1]
-             y_data = data[1:]
-
-             n_windows = len(x_data) - self.sequence_length + 1
-             bytes_per_item = x_data.strides[0]
-
-             strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-             strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-
-             return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
-     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-         """Applies inverse transformation using the stored PytorchScaler."""
-         if self.scaler is None:
-             _LOGGER.error("Data was not normalized. Cannot denormalize.")
-             raise RuntimeError()
-
-         # Ensure data is a torch.Tensor
-         if isinstance(data, numpy.ndarray):
-             tensor_data = torch.tensor(data, dtype=torch.float32)
-         else:
-             tensor_data = data
-
-         # Reshape for the scaler [n_samples, n_features]
-         if tensor_data.ndim == 1:
-             tensor_data = tensor_data.view(-1, 1)
-
-         # Apply inverse transform and convert back to a flat numpy array
-         original_scale_tensor = self.scaler.inverse_transform(tensor_data)
-         return original_scale_tensor.cpu().numpy().flatten()
-
-     def plot(self, predictions: Optional[numpy.ndarray] = None):
-         """Plots the original training and testing data, with optional predictions."""
-         if not self._is_split:
-             _LOGGER.error("Cannot plot before splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         plt.figure(figsize=(15, 6))
-         plt.title("Time Series Data")
-         plt.grid(True)
-         plt.xlabel("Time")
-         plt.ylabel("Value")
-
-         plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data') # type: ignore
-         plt.plot(self.test_time_axis, self.scaler.inverse_transform(self.test_sequence[self.sequence_length-1:].reshape(-1, 1)), label='Test Data') # type: ignore
-
-         if predictions is not None:
-             pred_time_axis = self.test_time_axis[:len(predictions)]
-             plt.plot(pred_time_axis, predictions, label='Predictions', c='red')
-
-         plt.legend()
-         plt.show()
-
-     def get_datasets(self) -> Tuple[Dataset, Dataset]:
-         """Returns the final train and test datasets."""
-         if not self._are_windows_generated:
-             _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
-             raise RuntimeError()
-         return self._train_dataset, self._test_dataset
-
-     def __repr__(self) -> str:
-         s = f"<{self.__class__.__name__}>:\n"
-         s += f" Sequence Length (Window): {self.sequence_length}\n"
-         s += f" Total Data Points: {len(self.sequence)}\n"
-         s += " --- Status ---\n"
-         s += f" Split: {self._is_split}\n"
-         s += f" Normalized: {self._is_normalized}\n"
-         s += f" Windows Generated: {self._are_windows_generated}\n"
-
-         if self._are_windows_generated:
-             train_len = len(self._train_dataset) if self._train_dataset else 0 # type: ignore
-             test_len = len(self._test_dataset) if self._test_dataset else 0 # type: ignore
-             s += f" Datasets (Train/Test): {train_len} / {test_len} windows\n"
-
-         return s
-
-
  def info():
      _script_info(__all__)