dragon-ml-toolbox 13.3.0__py3-none-any.whl → 16.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +20 -6
  2. dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
  3. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
  4. ml_tools/ETL_cleaning.py +20 -20
  5. ml_tools/ETL_engineering.py +23 -25
  6. ml_tools/GUI_tools.py +20 -20
  7. ml_tools/MICE_imputation.py +207 -5
  8. ml_tools/ML_callbacks.py +43 -26
  9. ml_tools/ML_configuration.py +788 -0
  10. ml_tools/ML_datasetmaster.py +303 -448
  11. ml_tools/ML_evaluation.py +351 -93
  12. ml_tools/ML_evaluation_multi.py +139 -42
  13. ml_tools/ML_inference.py +290 -209
  14. ml_tools/ML_models.py +33 -106
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +12 -12
  17. ml_tools/ML_scaler.py +11 -11
  18. ml_tools/ML_sequence_datasetmaster.py +341 -0
  19. ml_tools/ML_sequence_evaluation.py +219 -0
  20. ml_tools/ML_sequence_inference.py +391 -0
  21. ml_tools/ML_sequence_models.py +139 -0
  22. ml_tools/ML_trainer.py +1604 -179
  23. ml_tools/ML_utilities.py +351 -4
  24. ml_tools/ML_vision_datasetmaster.py +1540 -0
  25. ml_tools/ML_vision_evaluation.py +284 -0
  26. ml_tools/ML_vision_inference.py +405 -0
  27. ml_tools/ML_vision_models.py +641 -0
  28. ml_tools/ML_vision_transformers.py +284 -0
  29. ml_tools/PSO_optimization.py +6 -6
  30. ml_tools/SQL.py +4 -4
  31. ml_tools/_keys.py +171 -0
  32. ml_tools/_schema.py +1 -1
  33. ml_tools/custom_logger.py +37 -14
  34. ml_tools/data_exploration.py +502 -93
  35. ml_tools/ensemble_evaluation.py +54 -11
  36. ml_tools/ensemble_inference.py +7 -33
  37. ml_tools/ensemble_learning.py +1 -1
  38. ml_tools/math_utilities.py +1 -1
  39. ml_tools/optimization_tools.py +2 -2
  40. ml_tools/path_manager.py +5 -5
  41. ml_tools/serde.py +2 -2
  42. ml_tools/utilities.py +192 -4
  43. dragon_ml_toolbox-13.3.0.dist-info/RECORD +0 -41
  44. ml_tools/RNN_forecast.py +0 -56
  45. ml_tools/keys.py +0 -87
  46. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
  47. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
  48. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
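Downstream imports change with this release: keys.py becomes the private _keys.py, PytorchScaler becomes DragonScaler, and the dataset makers are renamed (see the ml_tools/ML_datasetmaster.py diff below). A minimal migration sketch, using only identifiers that appear in this diff:

    # 13.3.0
    from ml_tools.ML_datasetmaster import DatasetMaker, DatasetMakerMulti
    from ml_tools.ML_scaler import PytorchScaler

    # 16.2.0
    from ml_tools.ML_datasetmaster import DragonDataset, DragonDatasetMulti
    from ml_tools.ML_scaler import DragonScaler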
ml_tools/ML_datasetmaster.py
@@ -1,34 +1,27 @@
  import torch
- from torch.utils.data import Dataset, Subset
+ from torch.utils.data import Dataset
  import pandas
  import numpy
  from sklearn.model_selection import train_test_split
- from typing import Literal, Union, Tuple, List, Optional
- from abc import ABC, abstractmethod
- from PIL import Image, ImageOps
- from torchvision.datasets import ImageFolder
- from torchvision import transforms
- import matplotlib.pyplot as plt
+ from typing import Literal, Union, List, Optional
+ from abc import ABC
  from pathlib import Path

  from .path_manager import make_fullpath, sanitize_filename
  from ._logger import _LOGGER
  from ._script_info import _script_info
  from .custom_logger import save_list_strings
- from .ML_scaler import PytorchScaler
- from .keys import DatasetKeys
+ from .ML_scaler import DragonScaler
+ from ._keys import DatasetKeys, MLTaskKeys
  from ._schema import FeatureSchema
+ from .custom_logger import custom_logger


  __all__ = [
-     "DatasetMaker",
-     "DatasetMakerMulti",
-     "VisionDatasetMaker",
-     "SequenceMaker",
-     "ResizeAspectFill",
+     "DragonDataset",
+     "DragonDatasetMulti"
  ]

-
  # --- Internal Helper Class ---
  class _PytorchDataset(Dataset):
      """
@@ -62,6 +55,8 @@ class _PytorchDataset(Dataset):

          self._feature_names = feature_names
          self._target_names = target_names
+         self._classes: List[str] = []
+         self._class_map: dict[str, int] = dict()

      def __len__(self):
          return len(self.features)
@@ -83,6 +78,15 @@ class _PytorchDataset(Dataset):
              return self._target_names
          else:
              _LOGGER.error(f"Dataset {self.__class__} has not been initialized with any target names.")
+             raise ValueError()
+
+     @property
+     def classes(self):
+         return self._classes
+
+     @property
+     def class_map(self):
+         return self._class_map


  # --- Abstract Base Class ---
@@ -93,23 +97,29 @@ class _BaseDatasetMaker(ABC):
      """
      def __init__(self):
          self._train_ds: Optional[Dataset] = None
+         self._val_ds: Optional[Dataset] = None
          self._test_ds: Optional[Dataset] = None
-         self.scaler: Optional[PytorchScaler] = None
+         self.scaler: Optional[DragonScaler] = None
          self._id: Optional[str] = None
          self._feature_names: List[str] = []
          self._target_names: List[str] = []
          self._X_train_shape = (0, 0)
+         self._X_val_shape = (0, 0)
          self._X_test_shape = (0, 0)
          self._y_train_shape = (0,)
+         self._y_val_shape = (0,)
          self._y_test_shape = (0,)
+         self.class_map: dict[str, int] = dict()
+         self.classes: list[str] = list()

      def _prepare_scaler(self,
                          X_train: pandas.DataFrame,
                          y_train: Union[pandas.Series, pandas.DataFrame],
+                         X_val: pandas.DataFrame,
                          X_test: pandas.DataFrame,
                          label_dtype: torch.dtype,
                          schema: FeatureSchema):
-         """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
+         """Internal helper to fit and apply a DragonScaler using a FeatureSchema."""
          continuous_feature_indices: Optional[List[int]] = None

          # Get continuous feature indices *from the schema*
@@ -126,27 +136,34 @@ class _BaseDatasetMaker(ABC):
          else:
              _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")

-         X_train_values = X_train.values
-         X_test_values = X_test.values
+         X_train_values = X_train.to_numpy()
+         X_val_values = X_val.to_numpy()
+         X_test_values = X_test.to_numpy()

          # continuous_feature_indices is derived
          if self.scaler is None and continuous_feature_indices:
-             _LOGGER.info("Fitting a new PytorchScaler on training data.")
+             _LOGGER.info("Fitting a new DragonScaler on training data.")
              temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype)  # type: ignore
-             self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+             self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices)

          if self.scaler and self.scaler.mean_ is not None:
-             _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+             _LOGGER.info("Applying scaler transformation to train, validation, and test feature sets.")
              X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+             X_val_tensor = self.scaler.transform(torch.tensor(X_val_values, dtype=torch.float32))
              X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
-             return X_train_tensor.numpy(), X_test_tensor.numpy()
+             return X_train_tensor.numpy(), X_val_tensor.numpy(), X_test_tensor.numpy()

-         return X_train_values, X_test_values
+         return X_train_values, X_val_values, X_test_values

      @property
      def train_dataset(self) -> Dataset:
          if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
          return self._train_ds
+
+     @property
+     def validation_dataset(self) -> Dataset:
+         if self._val_ds is None: raise RuntimeError("Dataset not yet created.")
+         return self._val_ds

      @property
      def test_dataset(self) -> Dataset:
@@ -181,6 +198,7 @@ class _BaseDatasetMaker(ABC):
      def dataframes_info(self) -> None:
          print("--- DataFrame Shapes After Split ---")
          print(f" X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+         print(f" X_val shape: {self._X_val_shape}, y_val shape: {self._y_val_shape}")
          print(f" X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
          print("------------------------------------")

@@ -200,7 +218,7 @@ class _BaseDatasetMaker(ABC):
      def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
          """
-         Saves the fitted PytorchScaler's state to a .pth file.
+         Saves the fitted DragonScaler's state to a .pth file.

          The filename is automatically generated based on the dataset id.

@@ -220,6 +238,24 @@ class _BaseDatasetMaker(ABC):
              self.scaler.save(filepath, verbose=False)
          if verbose:
              _LOGGER.info(f"Scaler for dataset '{self.id}' saved as '{filepath.name}'.")
+
+     def save_class_map(self, directory: Union[str, Path], verbose: bool=True) -> None:
+         """
+         Saves the class-to-index mapping {str: int} to a directory.
+         """
+         if not self.class_map:
+             _LOGGER.warning("No class_map defined. Skipping.")
+             return
+
+         log_name = f"Class_to_Index_{self.id}" if self.id else "Class_to_Index"
+
+         custom_logger(data=self.class_map,
+                       save_directory=directory,
+                       log_name=log_name,
+                       add_timestamp=False,
+                       dict_as="json")
+         if verbose:
+             _LOGGER.info(f"Class map for '{self.id}' saved as '{log_name}.json'.")

      def save_artifacts(self, directory: Union[str, Path], verbose: bool=True) -> None:
          """
@@ -229,19 +265,22 @@ class _BaseDatasetMaker(ABC):
          self.save_target_names(directory=directory, verbose=verbose)
          if self.scaler is not None:
              self.save_scaler(directory=directory, verbose=verbose)
+         if self.class_map:
+             self.save_class_map(directory=directory, verbose=verbose)


  # Single target dataset
- class DatasetMaker(_BaseDatasetMaker):
+ class DragonDataset(_BaseDatasetMaker):
      """
      Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.

      This class takes a DataFrame and a FeatureSchema, automatically splitting and converting them into PyTorch Datasets.
-     It can also create and apply a PytorchScaler using the schema.
+     It can also create and apply a DragonScaler using the schema.

      Attributes:
-         `scaler` -> PytorchScaler | None
+         `scaler` -> DragonScaler | None
          `train_dataset` -> PyTorch Dataset
+         `validation_dataset` -> PyTorch Dataset
          `test_dataset` -> PyTorch Dataset
          `feature_names` -> list[str]
          `target_names` -> list[str]
@@ -252,27 +291,59 @@ class DatasetMaker(_BaseDatasetMaker):
      def __init__(self,
                   pandas_df: pandas.DataFrame,
                   schema: FeatureSchema,
-                  kind: Literal["regression", "classification"],
-                  test_size: float = 0.2,
-                  random_state: int = 42,
-                  scaler: Optional[PytorchScaler] = None):
+                  kind: Literal["regression", "binary classification", "multiclass classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                  validation_size: float = 0.2,
+                  test_size: float = 0.1,
+                  class_map: Optional[dict[str, int]] = None,
+                  random_state: int = 42):
          """
          Args:
              pandas_df (pandas.DataFrame):
                  The pre-processed input DataFrame containing all columns (features and single target).
              schema (FeatureSchema):
                  The definitive schema object from data_exploration.
-             kind (Literal["regression", "classification"]):
-                 The type of ML task. This determines the data type of the labels.
+             kind (str):
+                 The type of ML task. Must be one of:
+                 - "regression"
+                 - "binary classification"
+                 - "multiclass classification"
+             scaler ("fit" | "none" | DragonScaler):
+                 Strategy for data scaling:
+                 - "fit": Fit a new DragonScaler on continuous features.
+                 - "none": Do not scale data (e.g., for TabularTransformer).
+                 - DragonScaler instance: Use a pre-fitted scaler to transform data.
+             validation_size (float):
+                 The proportion of the *original* dataset to allocate to the validation split.
              test_size (float):
-                 The proportion of the dataset to allocate to the test split.
+                 The proportion of the dataset to allocate to the test split (can be 0).
+             class_map (dict[str, int] | None): Optional class map for the target classes in classification tasks. Can be set later using `.set_class_map()`.
              random_state (int):
                  The seed for the random number generator for reproducibility.
-             scaler (PytorchScaler | None):
-                 A pre-fitted PytorchScaler instance, if None a new scaler will be created.
+
          """
          super().__init__()
-         self.scaler = scaler
+
+         # --- Validation for split sizes ---
+         if (validation_size + test_size) >= 1.0:
+             _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+             raise ValueError()
+         elif validation_size <= 0.0:
+             _LOGGER.error(f"Invalid validation split of {validation_size}.")
+             raise ValueError()
+
+         _apply_scaling: bool = False
+         if scaler == "fit":
+             self.scaler = None  # To be created
+             _apply_scaling = True
+         elif scaler == "none":
+             self.scaler = None
+         elif isinstance(scaler, DragonScaler):
+             self.scaler = scaler  # Use the provided one
+             _apply_scaling = True
+         else:
+             _LOGGER.error("Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
+             raise ValueError()

          # --- 1. Identify features (from schema) ---
          self._feature_names = list(schema.feature_names)
@@ -287,7 +358,7 @@ class DatasetMaker(_BaseDatasetMaker):
              _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
              raise ValueError("No target column found in DataFrame.")
          if len(target_cols_set) > 1:
-             _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+             _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. One target required.")
              raise ValueError("Ambiguous target: More than one non-feature column found.")

          target_name = list(target_cols_set)[0]
@@ -297,30 +368,130 @@ class DatasetMaker(_BaseDatasetMaker):
          # --- 3. Split Data ---
          features_df = pandas_df[self._feature_names]
          target_series = pandas_df[target_name]
-
-         X_train, X_test, y_train, y_test = train_test_split(
+
+         # First split: (Train + Val) vs Test
+         X_train_val, X_test, y_train_val, y_test = train_test_split(
              features_df,
              target_series,
              test_size=test_size,
              random_state=random_state
          )
-         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+         # Calculate validation split size relative to the (Train + Val) set
+         val_split_size = validation_size / (1.0 - test_size)
+
+         # Second split: Train vs Val
+         X_train, X_val, y_train, y_val = train_test_split(
+             X_train_val,
+             y_train_val,
+             test_size=val_split_size,
+             random_state=random_state
+         )

-         label_dtype = torch.float32 if kind == "regression" else torch.int64
+         self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+         self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape
+
+         # --- label_dtype logic ---
+         if kind == MLTaskKeys.REGRESSION or kind == MLTaskKeys.BINARY_CLASSIFICATION:
+             label_dtype = torch.float32
+         elif kind == MLTaskKeys.MULTICLASS_CLASSIFICATION:
+             label_dtype = torch.int64
+         else:
+             _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.REGRESSION}', '{MLTaskKeys.BINARY_CLASSIFICATION}', or '{MLTaskKeys.MULTICLASS_CLASSIFICATION}'.")
+             raise ValueError()
+         self.kind = kind

          # --- 4. Scale (using the schema) ---
-         X_train_final, X_test_final = self._prepare_scaler(
-             X_train, y_train, X_test, label_dtype, schema
-         )
+         if _apply_scaling:
+             X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_val, X_test, label_dtype, schema
+             )
+         else:
+             _LOGGER.info("Features have not been scaled, as specified.")
+             X_train_final = X_train.to_numpy()
+             X_val_final = X_val.to_numpy()
+             X_test_final = X_test.to_numpy()

          # --- 5. Create Datasets ---
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
          self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)

+         # --- 6. Create class map if given ---
+         if self.kind != MLTaskKeys.REGRESSION:
+             if class_map is None:
+                 self.class_map = dict()
+             else:
+                 self.set_class_map(class_map)
+         else:
+             self.class_map = dict()
+
+     def set_class_map(self, class_map: dict[str, int], force_overwrite: bool=False) -> None:
+         """
+         Sets a map of class_name -> integer_label.
+
+         This is used by the InferenceHandler and to finalize the model after training.
+
+         Args:
+             class_map (dict[str, int]): A dictionary mapping each class name
+                 to its integer label.
+                 Example: {'cat': 0, 'dog': 1, 'bird': 2}
+             force_overwrite (bool): Required to overwrite a previously set class map.
+         """
+         if self.kind == MLTaskKeys.REGRESSION:
+             _LOGGER.warning("Class map is for classification tasks only.")
+             return
+
+         if self.class_map:
+             warning_message = "Class map was previously set."
+             if not force_overwrite:
+                 warning_message += " Use `force_overwrite=True` to set new values."
+                 _LOGGER.warning(warning_message)
+                 return
+             else:
+                 warning_message += " Setting new values..."
+                 _LOGGER.warning(warning_message)
+
+         self.class_map = class_map
+
+         try:
+             sorted_items = sorted(class_map.items(), key=lambda item: item[1])
+             class_list = [item[0] for item in sorted_items]
+         except Exception as e:
+             _LOGGER.error(f"Could not sort class map. Ensure it is a dict of {{str: int}}. Error: {e}")
+             raise TypeError()
+         else:
+             self.classes = class_list
+
+         if self._train_ds:
+             self._train_ds._classes = class_list  # type: ignore
+             self._train_ds._class_map = class_map  # type: ignore
+         if self._val_ds:
+             self._val_ds._classes = class_list  # type: ignore
+             self._val_ds._class_map = class_map  # type: ignore
+         if self._test_ds:
+             self._test_ds._classes = class_list  # type: ignore
+             self._test_ds._class_map = class_map  # type: ignore
+
+         _LOGGER.info(f"Class map set for dataset '{self.id}' and its subsets:\n{class_map}")
+
+     def __repr__(self) -> str:
+         s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+         s += f"  Target: {self.target_names[0]}\n"
+         s += f"  Features: {self.number_of_features}\n"
+         s += f"  Scaler: {'Fitted' if self.scaler else 'None'}\n"
+
+         if self._train_ds:
+             s += f"  Train Samples: {len(self._train_ds)}\n"  # type: ignore
+         if self._val_ds:
+             s += f"  Validation Samples: {len(self._val_ds)}\n"  # type: ignore
+         if self._test_ds:
+             s += f"  Test Samples: {len(self._test_ds)}\n"  # type: ignore
+
+         return s
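Taken together, the hunk above defines the new single-target workflow: a three-way split, tri-state scaling, and an optional class map. A minimal usage sketch based only on the names visible in this diff; `df` and `schema` are hypothetical stand-ins for a pre-processed DataFrame (schema features plus exactly one target column) and a FeatureSchema from data_exploration:

    from ml_tools.ML_datasetmaster import DragonDataset

    ds = DragonDataset(
        pandas_df=df,                  # hypothetical pre-processed DataFrame
        schema=schema,                 # hypothetical FeatureSchema
        kind="multiclass classification",
        scaler="fit",                  # or "none", or a pre-fitted DragonScaler
        validation_size=0.2,
        test_size=0.1,
        class_map={"cat": 0, "dog": 1, "bird": 2},
    )
    # Split arithmetic: the test split takes 10% first; the second split then
    # takes 0.2 / (1 - 0.1) ~= 0.222 of the remaining 90%, i.e. 20% overall.
    train_ds = ds.train_dataset
    val_ds = ds.validation_dataset
    test_ds = ds.test_dataset
    ds.save_artifacts("artifacts/")    # names, scaler, and class map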
+

  # --- Multi-Target Class ---
- class DatasetMakerMulti(_BaseDatasetMaker):
+ class DragonDatasetMulti(_BaseDatasetMaker):
      """
      Dataset maker for pre-processed, numerical pandas DataFrames with
      multiple target columns.
@@ -329,16 +500,16 @@ class DatasetMakerMulti(_BaseDatasetMaker):
      *target_columns*. It validates that the schema's features and the
      target columns are mutually exclusive and together account for all
      columns in the DataFrame.
-
-     Targets dtype is torch.float32
      """
      def __init__(self,
                   pandas_df: pandas.DataFrame,
                   target_columns: List[str],
                   schema: FeatureSchema,
-                  test_size: float = 0.2,
-                  random_state: int = 42,
-                  scaler: Optional[PytorchScaler] = None):
+                  kind: Literal["multitarget regression", "multilabel binary classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], DragonScaler],
+                  validation_size: float = 0.2,
+                  test_size: float = 0.1,
+                  random_state: int = 42):
          """
          Args:
              pandas_df (pandas.DataFrame):
@@ -348,20 +519,54 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                  List of target column names.
              schema (FeatureSchema):
                  The definitive schema object from data_exploration.
+             kind (str):
+                 The type of multi-target ML task. Must be one of:
+                 - "multitarget regression"
+                 - "multilabel binary classification"
+             scaler ("fit" | "none" | DragonScaler):
+                 Strategy for data scaling:
+                 - "fit": Fit a new DragonScaler on continuous features.
+                 - "none": Do not scale data (e.g., for TabularTransformer).
+                 - DragonScaler instance: Use a pre-fitted scaler to transform data.
+             validation_size (float):
+                 The proportion of the dataset to allocate to the validation split.
              test_size (float):
                  The proportion of the dataset to allocate to the test split.
              random_state (int):
                  The seed for the random number generator for reproducibility.
-             scaler (PytorchScaler | None):
-                 A pre-fitted PytorchScaler instance.

          ## Note:
              For multilabel binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
-             This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
+             This loss function requires the labels to be torch.float32, which is the same type required for multitarget regression tasks.
          """
          super().__init__()
-         self.scaler = scaler
-
+
+         # --- Validation for split sizes ---
+         if (validation_size + test_size) >= 1.0:
+             _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+             raise ValueError("validation_size and test_size sum must be < 1.0")
+         elif validation_size <= 0.0:
+             _LOGGER.error(f"Invalid validation split of {validation_size}.")
+             raise ValueError()
+
+         # --- Validate kind parameter ---
+         if kind not in [MLTaskKeys.MULTITARGET_REGRESSION, MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION]:
+             _LOGGER.error(f"Invalid 'kind' {kind}. Must be '{MLTaskKeys.MULTITARGET_REGRESSION}' or '{MLTaskKeys.MULTILABEL_BINARY_CLASSIFICATION}'.")
+             raise ValueError()
+
+         _apply_scaling: bool = False
+         if scaler == "fit":
+             self.scaler = None
+             _apply_scaling = True
+         elif scaler == "none":
+             self.scaler = None
+         elif isinstance(scaler, DragonScaler):
+             self.scaler = scaler  # Use the provided one
+             _apply_scaling = True
+         else:
+             _LOGGER.error("Invalid 'scaler' argument. Must be 'fit', 'none', or a DragonScaler instance.")
+             raise ValueError()

          # --- 1. Get features and targets from schema/args ---
          self._feature_names = list(schema.feature_names)
          self._target_names = target_columns
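The multi-target maker follows the same pattern. A hedged sketch, assuming a hypothetical DataFrame `df_multi` and hypothetical target column names (everything else is taken from the diff):

    import torch
    from ml_tools.ML_datasetmaster import DragonDatasetMulti

    ds_multi = DragonDatasetMulti(
        pandas_df=df_multi,                       # hypothetical DataFrame
        target_columns=["toxic", "spam"],         # hypothetical target names
        schema=schema,                            # hypothetical FeatureSchema
        kind="multilabel binary classification",
        scaler="none",                            # e.g. when feeding a TabularTransformer
        validation_size=0.2,
        test_size=0.1,
    )
    # Labels are torch.float32, matching nn.BCEWithLogitsLoss as noted above.
    criterion = torch.nn.BCEWithLogitsLoss()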
@@ -389,413 +594,63 @@ class DatasetMakerMulti(_BaseDatasetMaker):
          # --- 3. Split Data ---
          features_df = pandas_df[self._feature_names]
          target_df = pandas_df[self._target_names]
-
-         X_train, X_test, y_train, y_test = train_test_split(
+
+         # First split: (Train + Val) vs Test
+         X_train_val, X_test, y_train_val, y_test = train_test_split(
              features_df,
              target_df,
              test_size=test_size,
              random_state=random_state
          )
-         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
-         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
+
+         # Calculate validation split size relative to the (Train + Val) set
+         val_split_size = validation_size / (1.0 - test_size)
+
+         # Second split: Train vs Val
+         X_train, X_val, y_train, y_val = train_test_split(
+             X_train_val,
+             y_train_val,
+             test_size=val_split_size,
+             random_state=random_state
+         )
+
+         self._X_train_shape, self._X_val_shape, self._X_test_shape = X_train.shape, X_val.shape, X_test.shape
+         self._y_train_shape, self._y_val_shape, self._y_test_shape = y_train.shape, y_val.shape, y_test.shape

          # Multi-target for regression or multi-binary
          label_dtype = torch.float32

          # --- 4. Scale (using the schema) ---
-         X_train_final, X_test_final = self._prepare_scaler(
-             X_train, y_train, X_test, label_dtype, schema
-         )
+         if _apply_scaling:
+             X_train_final, X_val_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_val, X_test, label_dtype, schema
+             )
+         else:
+             _LOGGER.info("Features have not been scaled, as specified.")
+             X_train_final = X_train.to_numpy()
+             X_val_final = X_val.to_numpy()
+             X_test_final = X_test.to_numpy()

          # --- 5. Create Datasets ---
          # _PytorchDataset now correctly handles y_train (a DataFrame)
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         self._val_ds = _PytorchDataset(X_val_final, y_val, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
          self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)

-
- # --- Private Base Class ---
- class _BaseMaker(ABC):
-     """
-     Abstract Base Class for extra dataset makers.
-     """
-     def __init__(self):
-         self._train_dataset = None
-         self._test_dataset = None
-         self._val_dataset = None
-
-     @abstractmethod
-     def get_datasets(self) -> Tuple[Dataset, ...]:
-         """
-         The primary method to retrieve the final, processed PyTorch datasets.
-         Must be implemented by all subclasses.
-         """
-         pass
-
-
- # --- VisionDatasetMaker ---
- class VisionDatasetMaker(_BaseMaker):
-     """
-     Creates processed PyTorch datasets for computer vision tasks from an
-     image folder directory.
-
-     Uses online augmentations per epoch (image augmentation without creating new files).
-     """
-     def __init__(self, full_dataset: ImageFolder):
-         super().__init__()
-         self.full_dataset = full_dataset
-         self.labels = [s[1] for s in self.full_dataset.samples]
-         self.class_map = full_dataset.class_to_idx
-
-         self._is_split = False
-         self._are_transforms_configured = False
-
-     @classmethod
-     def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-         """Creates a maker instance from a root directory of images."""
-         initial_transform = transforms.Compose([transforms.ToTensor()])
-         full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-         _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-         return cls(full_dataset)
-
-     @staticmethod
-     def inspect_folder(path: Union[str, Path]):
-         """
-         Logs a report of the types, sizes, and channels of image files
-         found in the directory and its subdirectories.
-         """
-         path_obj = make_fullpath(path)
-
-         non_image_files = set()
-         img_types = set()
-         img_sizes = set()
-         img_channels = set()
-         img_counter = 0
-
-         _LOGGER.info(f"Inspecting folder: {path_obj}...")
-         # Use rglob to recursively find all files
-         for filepath in path_obj.rglob('*'):
-             if filepath.is_file():
-                 try:
-                     # Using PIL to open is a more reliable check
-                     with Image.open(filepath) as img:
-                         img_types.add(img.format)
-                         img_sizes.add(img.size)
-                         img_channels.update(img.getbands())
-                         img_counter += 1
-                 except (IOError, SyntaxError):
-                     non_image_files.add(filepath.name)
-
-         if non_image_files:
-             _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-         report = (
-             f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-             f"Total images found: {img_counter}\n"
-             f"Image formats: {img_types or 'None'}\n"
-             f"Image sizes (WxH): {img_sizes or 'None'}\n"
-             f"Image channels (bands): {img_channels or 'None'}\n"
-             f"--------------------------------------"
-         )
-         print(report)
-
-     def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                    stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-         """Splits the dataset into training, validation, and optional test sets."""
-         if self._is_split:
-             _LOGGER.warning("Data has already been split.")
-             return self
-
-         if val_size + test_size >= 1.0:
-             _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-             raise ValueError()
-
-         indices = list(range(len(self.full_dataset)))
-         labels_for_split = self.labels if stratify else None
-
-         train_indices, val_test_indices = train_test_split(
-             indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-         )
-
-         if test_size > 0:
-             val_test_labels = [self.labels[i] for i in val_test_indices]
-             stratify_val_test = val_test_labels if stratify else None
-             val_indices, test_indices = train_test_split(
-                 val_test_indices, test_size=(test_size / (val_size + test_size)),
-                 random_state=random_state, stratify=stratify_val_test
-             )
-             self._test_dataset = Subset(self.full_dataset, test_indices)
-             _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-         else:
-             val_indices = val_test_indices
-
-         self._train_dataset = Subset(self.full_dataset, train_indices)
-         self._val_dataset = Subset(self.full_dataset, val_indices)
-         self._is_split = True
-
-         _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-         return self
-
-     def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                              mean: List[float] = [0.485, 0.456, 0.406],
-                              std: List[float] = [0.229, 0.224, 0.225],
-                              extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-         """Configures and applies the image transformations (augmentations)."""
-         if not self._is_split:
-             _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-         if extra_train_transforms:
-             base_train_transforms.extend(extra_train_transforms)
-
-         final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-         val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-         train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-         self._train_dataset.dataset.transform = train_transform  # type: ignore
-         self._val_dataset.dataset.transform = val_transform  # type: ignore
-         if self._test_dataset:
-             self._test_dataset.dataset.transform = val_transform  # type: ignore
-
-         self._are_transforms_configured = True
-         _LOGGER.info("Image transforms configured and applied.")
-         return self
-
-     def get_datasets(self) -> Tuple[Dataset, ...]:
-         """Returns the final train, validation, and optional test datasets."""
-         if not self._is_split:
-             _LOGGER.error("Data has not been split. Call .split_data() first.")
-             raise RuntimeError()
-         if not self._are_transforms_configured:
-             _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-         if self._test_dataset:
-             return self._train_dataset, self._val_dataset, self._test_dataset
-         return self._train_dataset, self._val_dataset
-
-
- # --- SequenceMaker ---
- class SequenceMaker(_BaseMaker):
-     """
-     Creates windowed PyTorch datasets from time-series data.
-
-     Pipeline:
-
-     1. `.split_data()`: Separate time series into training and testing portions.
-     2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
-     3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
-     4. `.get_datasets()`: Return Pytorch train and test datasets.
-     """
-     def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
-         super().__init__()
-         self.sequence_length = sequence_length
-         self.scaler = None
-
-         if isinstance(data, pandas.DataFrame):
-             self.time_axis = data.index.values
-             self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
-         elif isinstance(data, pandas.Series):
-             self.time_axis = data.index.values
-             self.sequence = data.values.astype(numpy.float32)
-         elif isinstance(data, numpy.ndarray):
-             self.time_axis = numpy.arange(len(data))
-             self.sequence = data.astype(numpy.float32)
-         else:
-             _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
-             raise TypeError()
+     def __repr__(self) -> str:
+         s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+         s += f"  Targets: {self.number_of_targets}\n"
+         s += f"  Features: {self.number_of_features}\n"
+         s += f"  Scaler: {'Fitted' if self.scaler else 'None'}\n"
+
+         if self._train_ds:
+             s += f"  Train Samples: {len(self._train_ds)}\n"  # type: ignore
+         if self._val_ds:
+             s += f"  Validation Samples: {len(self._val_ds)}\n"  # type: ignore
+         if self._test_ds:
+             s += f"  Test Samples: {len(self._test_ds)}\n"  # type: ignore

-         self.train_sequence = None
-         self.test_sequence = None
-
-         self._is_split = False
-         self._is_normalized = False
-         self._are_windows_generated = False
-
-     def normalize_data(self) -> 'SequenceMaker':
-         """
-         Normalizes the sequence data using PytorchScaler. Must be called AFTER
-         splitting to prevent data leakage from the test set.
-         """
-         if not self._is_split:
-             _LOGGER.error("Data must be split BEFORE normalizing. Call .split_data() first.")
-             raise RuntimeError()
-
-         if self.scaler:
-             _LOGGER.warning("Data has already been normalized.")
-             return self
-
-         # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
-         # The scaler expects 2D data [n_samples, n_features].
-         train_features = self.train_sequence.reshape(-1, 1)  # type: ignore
-
-         # _PytorchDataset needs labels, so we create dummy ones.
-         dummy_labels = numpy.zeros(len(train_features))
-         temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
-
-         # 2. Fit the PytorchScaler on the temporary training dataset.
-         # The sequence is a single feature, so its index is [0].
-         _LOGGER.info("Fitting PytorchScaler on the training data...")
-         self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
-
-         # 3. Transform sequences using the fitted scaler.
-         # The transform method requires a tensor, so we convert, transform, and convert back.
-         train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32)  # type: ignore
-         test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32)  # type: ignore
-
-         self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
-         self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
-
-         self._is_normalized = True
-         _LOGGER.info("Sequence data normalized using PytorchScaler.")
-         return self
-
-     def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
-         """Splits the sequence into training and testing portions."""
-         if self._is_split:
-             _LOGGER.warning("Data has already been split.")
-             return self
-
-         split_idx = int(len(self.sequence) * (1 - test_size))
-         self.train_sequence = self.sequence[:split_idx]
-         self.test_sequence = self.sequence[split_idx - self.sequence_length:]
-
-         self.train_time_axis = self.time_axis[:split_idx]
-         self.test_time_axis = self.time_axis[split_idx:]
-
-         self._is_split = True
-         _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)} points) and testing ({len(self.test_sequence)} points).")
-         return self
-
-     def generate_windows(self, sequence_to_sequence: bool = False) -> 'SequenceMaker':
-         """
-         Generates overlapping windows for features and labels.
-
-         "sequence-to-sequence": Label vectors are of the same size as the feature vectors instead of a single future prediction.
-         """
-         if not self._is_split:
-             _LOGGER.error("Cannot generate windows before splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         self._train_dataset = self._create_windowed_dataset(self.train_sequence, sequence_to_sequence)  # type: ignore
-         self._test_dataset = self._create_windowed_dataset(self.test_sequence, sequence_to_sequence)  # type: ignore
-
-         self._are_windows_generated = True
-         _LOGGER.info("Feature and label windows generated for train and test sets.")
-         return self
-
-     def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
-         """Efficiently creates windowed features and labels using numpy."""
-         if len(data) <= self.sequence_length:
-             _LOGGER.error("Data length must be greater than the sequence_length to create at least one window.")
-             raise ValueError()
-
-         if not use_sequence_labels:
-             features = data[:-1]
-             labels = data[self.sequence_length:]
-
-             n_windows = len(features) - self.sequence_length + 1
-             bytes_per_item = features.strides[0]
-             strided_features = numpy.lib.stride_tricks.as_strided(
-                 features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
-             )
-             return _PytorchDataset(strided_features, labels, labels_dtype=torch.float32)
-
-         else:
-             x_data = data[:-1]
-             y_data = data[1:]
-
-             n_windows = len(x_data) - self.sequence_length + 1
-             bytes_per_item = x_data.strides[0]
-
-             strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-             strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
-
-             return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
-     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-         """Applies inverse transformation using the stored PytorchScaler."""
-         if self.scaler is None:
-             _LOGGER.error("Data was not normalized. Cannot denormalize.")
-             raise RuntimeError()
-
-         # Ensure data is a torch.Tensor
-         if isinstance(data, numpy.ndarray):
-             tensor_data = torch.tensor(data, dtype=torch.float32)
-         else:
-             tensor_data = data
-
-         # Reshape for the scaler [n_samples, n_features]
-         if tensor_data.ndim == 1:
-             tensor_data = tensor_data.view(-1, 1)
-
-         # Apply inverse transform and convert back to a flat numpy array
-         original_scale_tensor = self.scaler.inverse_transform(tensor_data)
-         return original_scale_tensor.cpu().numpy().flatten()
-
-     def plot(self, predictions: Optional[numpy.ndarray] = None):
-         """Plots the original training and testing data, with optional predictions."""
-         if not self._is_split:
-             _LOGGER.error("Cannot plot before splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         plt.figure(figsize=(15, 6))
-         plt.title("Time Series Data")
-         plt.grid(True)
-         plt.xlabel("Time")
-         plt.ylabel("Value")
-
-         plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data')  # type: ignore
-         plt.plot(self.test_time_axis, self.scaler.inverse_transform(self.test_sequence[self.sequence_length-1:].reshape(-1, 1)), label='Test Data')  # type: ignore
-
-         if predictions is not None:
-             pred_time_axis = self.test_time_axis[:len(predictions)]
-             plt.plot(pred_time_axis, predictions, label='Predictions', c='red')
-
-         plt.legend()
-         plt.show()
-
-     def get_datasets(self) -> Tuple[Dataset, Dataset]:
-         """Returns the final train and test datasets."""
-         if not self._are_windows_generated:
-             _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
-             raise RuntimeError()
-         return self._train_dataset, self._test_dataset
-
-
- # --- Custom Vision Transform Class ---
- class ResizeAspectFill:
-     """
-     Custom transformation to make an image square by padding it to match the
-     longest side, preserving the aspect ratio. The image is finally centered.
-
-     Args:
-         pad_color (Union[str, int]): Color to use for the padding.
-             Defaults to "black".
-     """
-     def __init__(self, pad_color: Union[str, int] = "black") -> None:
-         self.pad_color = pad_color
-
-     def __call__(self, image: Image.Image) -> Image.Image:
-         if not isinstance(image, Image.Image):
-             _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-             raise TypeError()
-
-         w, h = image.size
-         if w == h:
-             return image
-
-         # Determine padding to center the image
-         if w > h:
-             top_padding = (w - h) // 2
-             bottom_padding = w - h - top_padding
-             padding = (0, top_padding, 0, bottom_padding)
-         else:  # h > w
-             left_padding = (h - w) // 2
-             right_padding = h - w - left_padding
-             padding = (left_padding, 0, right_padding, 0)
-
-         return ImageOps.expand(image, padding, fill=self.pad_color)
+         return s


  def info():
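The vision and sequence tooling removed above is not gone from the package: per the file list, dedicated modules (ml_tools/ML_vision_datasetmaster.py, ml_tools/ML_vision_transformers.py, ml_tools/ML_sequence_datasetmaster.py) were added in 16.2.0. Their class names are not shown in this diff, so only the module paths below are confirmed; anything more specific should be verified against those files:

    # Module paths come from the file list above; class names are not shown in this diff.
    from ml_tools import ML_vision_datasetmaster, ML_sequence_datasetmaster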