dragon-ml-toolbox 14.7.0__py3-none-any.whl → 16.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +9 -5
  2. dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
  3. ml_tools/ETL_cleaning.py +20 -20
  4. ml_tools/ETL_engineering.py +23 -25
  5. ml_tools/GUI_tools.py +20 -20
  6. ml_tools/MICE_imputation.py +3 -3
  7. ml_tools/ML_callbacks.py +43 -26
  8. ml_tools/ML_configuration.py +704 -24
  9. ml_tools/ML_datasetmaster.py +235 -280
  10. ml_tools/ML_evaluation.py +144 -39
  11. ml_tools/ML_evaluation_multi.py +103 -35
  12. ml_tools/ML_inference.py +290 -208
  13. ml_tools/ML_models.py +13 -102
  14. ml_tools/ML_models_advanced.py +1 -1
  15. ml_tools/ML_optimization.py +12 -12
  16. ml_tools/ML_scaler.py +11 -11
  17. ml_tools/ML_sequence_datasetmaster.py +341 -0
  18. ml_tools/ML_sequence_evaluation.py +219 -0
  19. ml_tools/ML_sequence_inference.py +391 -0
  20. ml_tools/ML_sequence_models.py +139 -0
  21. ml_tools/ML_trainer.py +1342 -386
  22. ml_tools/ML_utilities.py +1 -1
  23. ml_tools/ML_vision_datasetmaster.py +120 -72
  24. ml_tools/ML_vision_evaluation.py +30 -6
  25. ml_tools/ML_vision_inference.py +129 -152
  26. ml_tools/ML_vision_models.py +1 -1
  27. ml_tools/ML_vision_transformers.py +121 -40
  28. ml_tools/PSO_optimization.py +6 -6
  29. ml_tools/SQL.py +4 -4
  30. ml_tools/{keys.py → _keys.py} +45 -0
  31. ml_tools/_schema.py +1 -1
  32. ml_tools/ensemble_evaluation.py +1 -1
  33. ml_tools/ensemble_inference.py +7 -33
  34. ml_tools/ensemble_learning.py +1 -1
  35. ml_tools/optimization_tools.py +2 -2
  36. ml_tools/path_manager.py +5 -5
  37. ml_tools/utilities.py +1 -2
  38. dragon_ml_toolbox-14.7.0.dist-info/RECORD +0 -49
  39. ml_tools/RNN_forecast.py +0 -56
  40. ml_tools/_ML_vision_recipe.py +0 -88
  41. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
  42. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
  43. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  44. {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_models.py CHANGED
@@ -7,16 +7,15 @@ import json
  from ._logger import _LOGGER
  from .path_manager import make_fullpath
  from ._script_info import _script_info
- from .keys import PytorchModelArchitectureKeys
+ from ._keys import PytorchModelArchitectureKeys
  from ._schema import FeatureSchema


  __all__ = [
- "MultilayerPerceptron",
- "AttentionMLP",
- "MultiHeadAttentionMLP",
- "TabularTransformer",
- "SequencePredictorLSTM",
+ "DragonMLP",
+ "DragonAttentionMLP",
+ "DragonMultiHeadAttentionNet",
+ "DragonTabularTransformer"
  ]


@@ -174,7 +173,7 @@ class _BaseAttention(_BaseMLP):
  return logits, attention_weights


- class MultilayerPerceptron(_BaseMLP):
+ class DragonMLP(_BaseMLP):
  """
  Creates a versatile Multilayer Perceptron (MLP) for regression or classification tasks.
  """
@@ -208,10 +207,10 @@ class MultilayerPerceptron(_BaseMLP):
  # Extracts the number of neurons from each nn.Linear layer
  layer_sizes = [str(layer.in_features) for layer in self.mlp if isinstance(layer, nn.Linear)]

- return self._repr_helper(name="MultilayerPerceptron", mlp_layers=layer_sizes)
+ return self._repr_helper(name="DragonMLP", mlp_layers=layer_sizes)


- class AttentionMLP(_BaseAttention):
+ class DragonAttentionMLP(_BaseAttention):
  """
  A Multilayer Perceptron (MLP) that incorporates an Attention layer to dynamically weigh input features.

@@ -244,10 +243,10 @@ class AttentionMLP(_BaseAttention):
  if isinstance(layer, nn.Linear):
  arch.append(str(layer.in_features))

- return self._repr_helper(name="AttentionMLP", mlp_layers=arch)
+ return self._repr_helper(name="DragonAttentionMLP", mlp_layers=arch)


- class MultiHeadAttentionMLP(_BaseAttention):
+ class DragonMultiHeadAttentionNet(_BaseAttention):
  """
  An MLP that incorporates a standard `nn.MultiheadAttention` layer to process
  the input features.
@@ -292,10 +291,10 @@ class MultiHeadAttentionMLP(_BaseAttention):
  )
  arch_str = f"{self.in_features} -> [MultiHead(h={self.num_heads})] -> {mlp_part}"

- return f"MultiHeadAttentionMLP(arch: {arch_str})"
+ return f"DragonMultiHeadAttentionNet(arch: {arch_str})"


- class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
+ class DragonTabularTransformer(nn.Module, _ArchitectureHandlerMixin):
  """
  A Transformer-based model for tabular data tasks.

@@ -502,7 +501,7 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):

  arch_str = " -> ".join(parts)

- return f"TabularTransformer(arch: {arch_str})"
+ return f"DragonTabularTransformer(arch: {arch_str})"


  class _FeatureTokenizer(nn.Module):
@@ -662,93 +661,5 @@ class _MultiHeadAttentionLayer(nn.Module):
  return out, attn_weights.squeeze()


- class SequencePredictorLSTM(nn.Module, _ArchitectureHandlerMixin):
- """
- A simple LSTM-based network for sequence-to-sequence prediction tasks.
-
- This model is designed for datasets where each input sequence maps to an
- output sequence of the same length. It's suitable for forecasting problems
- prepared by the `SequenceMaker` class.
-
- The expected input shape is `(batch_size, sequence_length, features)`.
-
- Args:
- features (int): The number of features in the input sequence. Defaults to 1.
- hidden_size (int): The number of features in the LSTM's hidden state.
- Defaults to 100.
- recurrent_layers (int): The number of recurrent LSTM layers. Defaults to 1.
- dropout (float): The dropout probability for all but the last LSTM layer.
- Defaults to 0.
- """
- def __init__(self, features: int = 1, hidden_size: int = 100,
- recurrent_layers: int = 1, dropout: float = 0):
- super().__init__()
-
- # --- Validation ---
- if not isinstance(features, int) or features < 1:
- raise ValueError("features must be a positive integer.")
- if not isinstance(hidden_size, int) or hidden_size < 1:
- raise ValueError("hidden_size must be a positive integer.")
- if not isinstance(recurrent_layers, int) or recurrent_layers < 1:
- raise ValueError("recurrent_layers must be a positive integer.")
- if not (0.0 <= dropout < 1.0):
- raise ValueError("dropout must be a float between 0.0 and 1.0.")
-
- # --- Save configuration ---
- self.features = features
- self.hidden_size = hidden_size
- self.recurrent_layers = recurrent_layers
- self.dropout = dropout
-
- # Build model
- self.lstm = nn.LSTM(
- input_size=features,
- hidden_size=hidden_size,
- num_layers=recurrent_layers,
- dropout=dropout,
- batch_first=True # This is crucial for (batch, seq, feature) input
- )
- self.linear = nn.Linear(in_features=hidden_size, out_features=features)
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- """
- Defines the forward pass.
-
- Args:
- x (torch.Tensor): The input tensor with shape
- (batch_size, sequence_length, features).
-
- Returns:
- torch.Tensor: The output tensor with shape
- (batch_size, sequence_length, features).
- """
- # The LSTM returns the full output sequence and the final hidden/cell states
- lstm_out, _ = self.lstm(x)
-
- # Pass the LSTM's output sequence to the linear layer
- predictions = self.linear(lstm_out)
-
- return predictions
-
- def get_architecture_config(self) -> dict:
- """Returns the configuration of the model."""
- return {
- 'features': self.features,
- 'hidden_size': self.hidden_size,
- 'recurrent_layers': self.recurrent_layers,
- 'dropout': self.dropout
- }
-
- def __repr__(self) -> str:
- """Returns the developer-friendly string representation of the model."""
- return (
- f"SequencePredictorLSTM(features={self.lstm.input_size}, "
- f"hidden_size={self.lstm.hidden_size}, "
- f"recurrent_layers={self.lstm.num_layers})"
- )
-
-
- # ---- PyTorch models ---
-
  def info():
  _script_info(__all__)
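Note: the public model classes in ml_tools/ML_models.py were renamed in this release. A minimal import-migration sketch, with the names taken directly from the __all__ change above; SequencePredictorLSTM was removed and appears to be superseded by the new ML_sequence_* modules added in this release:

# dragon-ml-toolbox 14.7.0
# from ml_tools.ML_models import MultilayerPerceptron, AttentionMLP, MultiHeadAttentionMLP, TabularTransformer

# dragon-ml-toolbox 16.2.0 (one-to-one renames)
from ml_tools.ML_models import (
    DragonMLP,                    # was MultilayerPerceptron
    DragonAttentionMLP,           # was AttentionMLP
    DragonMultiHeadAttentionNet,  # was MultiHeadAttentionMLP
    DragonTabularTransformer,     # was TabularTransformer
)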
ml_tools/ML_models_advanced.py CHANGED
@@ -6,7 +6,7 @@ import json

  from ._logger import _LOGGER
  from .path_manager import make_fullpath
- from .keys import PytorchModelArchitectureKeys
+ from ._keys import PytorchModelArchitectureKeys
  from ._schema import FeatureSchema
  from ._script_info import _script_info
  from .ML_models import _ArchitectureHandlerMixin
ml_tools/ML_optimization.py CHANGED
@@ -14,9 +14,9 @@ from functools import partial
  from .path_manager import make_fullpath, sanitize_filename
  from ._logger import _LOGGER
  from ._script_info import _script_info
- from .ML_inference import PyTorchInferenceHandler
- from .keys import PyTorchInferenceKeys
- from .SQL import DatabaseManager
+ from .ML_inference import DragonInferenceHandler
+ from ._keys import PyTorchInferenceKeys
+ from .SQL import DragonSQL
  from .optimization_tools import _save_result, create_optimization_bounds
  from .utilities import save_dataframe_filename
  from .math_utilities import discretize_categorical_values
@@ -24,14 +24,14 @@ from ._schema import FeatureSchema


  __all__ = [
- "MLOptimizer",
+ "DragonOptimizer",
  "FitnessEvaluator",
  "create_pytorch_problem",
  "run_optimization"
  ]


- class MLOptimizer:
+ class DragonOptimizer:
  """
  A wrapper class for setting up and running EvoTorch optimization tasks.

@@ -47,7 +47,7 @@ class MLOptimizer:
  >>> cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}
  >>>
  >>> # 3. Initialize the optimizer
- >>> optimizer = MLOptimizer(
+ >>> optimizer = DragonOptimizer(
  ... inference_handler=my_handler,
  ... schema=schema,
  ... continuous_bounds_map=cont_bounds,
@@ -63,7 +63,7 @@ class MLOptimizer:
  ... )
  """
  def __init__(self,
- inference_handler: PyTorchInferenceHandler,
+ inference_handler: DragonInferenceHandler,
  schema: FeatureSchema,
  continuous_bounds_map: Dict[str, Tuple[float, float]],
  task: Literal["min", "max"],
@@ -75,7 +75,7 @@ class MLOptimizer:
  Initializes the optimizer by creating the EvoTorch problem and searcher.

  Args:
- inference_handler (PyTorchInferenceHandler):
+ inference_handler (DragonInferenceHandler):
  An initialized inference handler containing the model.
  schema (FeatureSchema):
  The definitive schema object from data_exploration.
@@ -172,18 +172,18 @@ class FitnessEvaluator:
  A callable class that wraps the PyTorch model inference handler and performs
  on-the-fly discretization for the EvoTorch fitness function.

- This class is automatically instantiated by MLOptimizer and passed to
+ This class is automatically instantiated by DragonOptimizer and passed to
  create_pytorch_problem, encapsulating the evaluation logic.
  """
  def __init__(self,
- inference_handler: PyTorchInferenceHandler,
+ inference_handler: DragonInferenceHandler,
  categorical_index_map: Optional[Dict[int, int]] = None,
  discretize_start_at_zero: bool = True):
  """
  Initializes the fitness evaluator.

  Args:
- inference_handler (PyTorchInferenceHandler):
+ inference_handler (DragonInferenceHandler):
  An initialized inference handler containing the model.
  categorical_index_map (Dict[int, int] | None):
  Maps {column_index: cardinality} for discretization.
@@ -426,7 +426,7 @@ def run_optimization(
  _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")

  first_run_logger = None # To store the logger from the first rep
- db_context = DatabaseManager(db_path) if save_format in ['sqlite', 'both'] else nullcontext()
+ db_context = DragonSQL(db_path) if save_format in ['sqlite', 'both'] else nullcontext()

  with db_context as db_manager:
  # --- Setup Database Schema (if applicable) ---
ml_tools/ML_scaler.py CHANGED
@@ -9,11 +9,11 @@ from .path_manager import make_fullpath


  __all__ = [
- "PytorchScaler"
+ "DragonScaler"
  ]


- class PytorchScaler:
+ class DragonScaler:
  """
  Standardizes continuous features in a PyTorch dataset by subtracting the
  mean and dividing by the standard deviation.
@@ -38,7 +38,7 @@ class PytorchScaler:
  self.continuous_feature_indices = continuous_feature_indices

  @classmethod
- def fit(cls, dataset: Dataset, continuous_feature_indices: List[int], batch_size: int = 64) -> 'PytorchScaler':
+ def fit(cls, dataset: Dataset, continuous_feature_indices: List[int], batch_size: int = 64) -> 'DragonScaler':
  """
  Fits the scaler by computing the mean and std dev from a dataset using a
  fast, single-pass, vectorized algorithm.
@@ -50,7 +50,7 @@ class PytorchScaler:
  batch_size (int): The batch size for iterating through the dataset.

  Returns:
- PytorchScaler: A new, fitted instance of the scaler.
+ DragonScaler: A new, fitted instance of the scaler.
  """
  if not continuous_feature_indices:
  _LOGGER.error("No continuous feature indices provided. Scaler will not be fitted.")
@@ -167,10 +167,10 @@ class PytorchScaler:
  }
  torch.save(state, path_obj)
  if verbose:
- _LOGGER.info(f"PytorchScaler state saved as '{path_obj.name}'.")
+ _LOGGER.info(f"DragonScaler state saved as '{path_obj.name}'.")

  @staticmethod
- def load(filepath: Union[str, Path], verbose: bool=True) -> 'PytorchScaler':
+ def load(filepath: Union[str, Path], verbose: bool=True) -> 'DragonScaler':
  """
  Loads a scaler's state from a .pth file.

@@ -178,13 +178,13 @@
  filepath (str | Path): The path to the saved scaler file.

  Returns:
- PytorchScaler: An instance of the scaler with the loaded state.
+ DragonScaler: An instance of the scaler with the loaded state.
  """
  path_obj = make_fullpath(filepath, enforce="file")
  state = torch.load(path_obj)
  if verbose:
- _LOGGER.info(f"PytorchScaler state loaded from '{path_obj.name}'.")
- return PytorchScaler(
+ _LOGGER.info(f"DragonScaler state loaded from '{path_obj.name}'.")
+ return DragonScaler(
  mean=state['mean'],
  std=state['std'],
  continuous_feature_indices=state['continuous_feature_indices']
@@ -194,8 +194,8 @@
  """Returns the developer-friendly string representation of the scaler."""
  if self.continuous_feature_indices:
  num_features = len(self.continuous_feature_indices)
- return f"PytorchScaler(fitted for {num_features} features)"
- return "PytorchScaler(not fitted)"
+ return f"DragonScaler(fitted for {num_features} features)"
+ return "DragonScaler(not fitted)"


  def info():
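Note: PytorchScaler is now DragonScaler. A minimal sketch of the renamed API, limited to the methods visible in this diff (fit / save / load / transform); the toy TensorDataset and the choice of continuous columns are illustrative assumptions, not part of the package:

import torch
from torch.utils.data import TensorDataset
from ml_tools.ML_scaler import DragonScaler  # was PytorchScaler in 14.7.0

# Toy dataset: 100 samples, 3 features, dummy labels (illustrative only;
# any Dataset yielding (features, labels) pairs is assumed to work here).
features = torch.randn(100, 3)
labels = torch.zeros(100)
train_dataset = TensorDataset(features, labels)

# Treat columns 0 and 2 as the continuous features in this sketch.
scaler = DragonScaler.fit(train_dataset, continuous_feature_indices=[0, 2], batch_size=64)
scaler.save("scaler_state.pth")

restored = DragonScaler.load("scaler_state.pth")
scaled = restored.transform(features)  # presumably standardizes only the fitted columns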
ml_tools/ML_sequence_datasetmaster.py ADDED
@@ -0,0 +1,341 @@
+ import torch
+ from torch.utils.data import Dataset
+ import pandas
+ import numpy
+ from typing import Literal, Union, Tuple
+ import matplotlib.pyplot as plt
+ from pathlib import Path
+
+ from .path_manager import make_fullpath
+ from ._logger import _LOGGER
+ from ._script_info import _script_info
+ from .ML_scaler import DragonScaler
+ from .ML_datasetmaster import _PytorchDataset
+ from ._keys import DatasetKeys, MLTaskKeys
+
+
+ __all__ = [
+ "DragonDatasetSequence"
+ ]
+
+
+ # --- SequenceMaker ---
+ class DragonDatasetSequence:
+ """
+ Creates windowed PyTorch datasets from a univariate (one feature) sequential data.
+
+ Automatic Pipeline:
+
+ 1. Split Data: Separate data into training, validation, and testing portions.
+ 2. Normalize Data: Normalize the data. The scaler will be fitted on the training portion.
+ 3. Generate Windows: Create the windowed sequences from the normalized splits.
+ """
+ def __init__(self,
+ prediction_mode: Literal["sequence-to-sequence", "sequence-to-value"],
+ data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray],
+ sequence_length: int,
+ validation_size: float = 0.2,
+ test_size: float = 0.1):
+ """
+ Initializes the dataset manager and automatically processes the data.
+
+ The constructor runs the full pipeline:
+ 1. Splits the data chronologically (train, validation, test).
+ 2. Fits a DragonScaler on the training split.
+ 3. Normalizes all splits using the fitted scaler.
+ 4. Generates windowed datasets for training, validation, and testing.
+
+ Args:
+ prediction_mode: The type of sequence task.
+ data: The input univariate time-series data.
+ - If pandas.DataFrame: The index is used for the time axis
+ and the *first column* is used as the sequence.
+ - If pandas.Series: The index is used for the time axis.
+ - If numpy.ndarray: A simple integer range is used for the time axis.
+ sequence_length (int): The number of time steps in each input window (X).
+ validation_size (float): The fraction of data to hold out for validation.
+ test_size (float): The fraction of data to hold out for testing.
+ """
+ self._train_dataset = None
+ self._test_dataset = None
+ self._val_dataset = None
+ self.sequence_length = sequence_length
+ self.scaler = None
+
+ if not prediction_mode in [MLTaskKeys.SEQUENCE_SEQUENCE, MLTaskKeys.SEQUENCE_VALUE]:
+ _LOGGER.error(f"Unrecognized prediction mode: '{prediction_mode}'.")
+ raise ValueError()
+ else:
+ self.prediction_mode = prediction_mode
+
+ if isinstance(data, pandas.DataFrame):
+ self.time_axis = data.index.values
+ self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
+ elif isinstance(data, pandas.Series):
+ self.time_axis = data.index.values
+ self.sequence = data.values.astype(numpy.float32)
+ elif isinstance(data, numpy.ndarray):
+ self.time_axis = numpy.arange(len(data))
+ self.sequence = data.astype(numpy.float32)
+ else:
+ _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
+ raise TypeError()
+
+ self.train_sequence = None
+ self.val_sequence = None
+ self.test_sequence = None
+
+ self.train_time_axis = None
+ self.val_time_axis = None
+ self.test_time_axis = None
+
+ self._is_split = False
+ self._is_normalized = False
+ self._are_windows_generated = False
+
+ # Automation
+ self._split_data(validation_size=validation_size, test_size=test_size)
+ self._normalize_data()
+ self._generate_windows()
+
+ def _split_data(self, validation_size: float = 0.2, test_size: float = 0.1) -> None:
+ """
+ Splits the sequence chronologically into training, validation, and testing portions.
+
+ To prevent windowing errors, the validation and test sets include an overlap of `sequence_length` from the preceding data.
+ """
+ if self._is_split:
+ _LOGGER.warning("Data has already been split.")
+ return
+
+ if (validation_size + test_size) >= 1.0:
+ _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+ raise ValueError("validation_size and test_size sum must be < 1.0")
+
+ total_size = len(self.sequence)
+
+ # Calculate split indices
+ test_split_idx = int(total_size * (1 - test_size))
+ val_split_idx = int(total_size * (1 - test_size - validation_size))
+
+ # --- Create sequences ---
+ # Train sequence is from the beginning to the validation index
+ self.train_sequence = self.sequence[:val_split_idx]
+
+ # Validation sequence starts `sequence_length` before its split index for windowing
+ self.val_sequence = self.sequence[val_split_idx - self.sequence_length : test_split_idx]
+
+ # Test sequence starts `sequence_length` before its split index for windowing
+ self.test_sequence = self.sequence[test_split_idx - self.sequence_length:]
+
+ # --- Create time axes ---
+ self.train_time_axis = self.time_axis[:val_split_idx]
+ # The "plottable" validation/test time axes start from their respective split indices
+ self.val_time_axis = self.time_axis[val_split_idx : test_split_idx]
+ self.test_time_axis = self.time_axis[test_split_idx:]
+
+ self._is_split = True
+ _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)}), validation ({len(self.val_sequence)}), and testing ({len(self.test_sequence)}) points.")
+
+ def _normalize_data(self) -> None:
+ """
+ Normalizes the sequence data using DragonScaler. Must be called AFTER splitting to prevent data leakage from the test set.
+ """
+ if not self._is_split:
+ _LOGGER.error("Data must be split BEFORE normalizing.")
+ raise RuntimeError()
+
+ if self.scaler:
+ _LOGGER.warning("Data has already been normalized.")
+ return
+
+ # 1. DragonScaler requires a Dataset to fit. Create a temporary one.
+ # The scaler expects 2D data [n_samples, n_features].
+ train_features = self.train_sequence.reshape(-1, 1) # type: ignore
+
+ # _PytorchDataset needs labels, so we create dummy ones.
+ dummy_labels = numpy.zeros(len(train_features))
+ temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
+
+ # 2. Fit the DragonScaler on the temporary training dataset.
+ # The sequence is a single feature, so its index is [0].
+ _LOGGER.info("Fitting DragonScaler on the training data...")
+ self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices=[0])
+
+ # 3. Transform sequences using the fitted scaler.
+ # The transform method requires a tensor, so we convert, transform, and convert back.
+ train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+ val_tensor = torch.tensor(self.val_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+ test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+
+ self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
+ self.val_sequence = self.scaler.transform(val_tensor).numpy().flatten()
+ self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
+
+ self._is_normalized = True
+ _LOGGER.info("Sequence data normalized using DragonScaler.")
+
+ def _generate_windows(self) -> None:
+ """
+ Generates overlapping windows for features and labels.
+ """
+ if not self._is_split:
+ _LOGGER.error("Cannot generate windows before splitting data.")
+ raise RuntimeError()
+
+ if not self._is_normalized:
+ _LOGGER.error("Cannot generate windows before normalizing data.")
+ raise RuntimeError()
+
+ if self._are_windows_generated:
+ _LOGGER.warning("Windows have already been generated.")
+ return
+
+ self._train_dataset = self._create_windowed_dataset(self.train_sequence) # type: ignore
+ self._val_dataset = self._create_windowed_dataset(self.val_sequence) # type: ignore
+ self._test_dataset = self._create_windowed_dataset(self.test_sequence) # type: ignore
+
+ self._are_windows_generated = True
+ _LOGGER.info("Feature and label windows generated for train, validation, and test sets.")
+
+ def _create_windowed_dataset(self, data: numpy.ndarray) -> Dataset:
+ """Efficiently creates windowed features and labels using numpy."""
+ if len(data) <= self.sequence_length:
+ # Validation/Test sets of size 0 might be passed
+ _LOGGER.warning(f"Data length ({len(data)}) is not greater than sequence_length ({self.sequence_length}). Cannot create windows. Returning empty dataset.")
+ return _PytorchDataset(numpy.array([]), numpy.array([]), labels_dtype=torch.float32)
+
+ if self.prediction_mode == MLTaskKeys.SEQUENCE_VALUE:
+ # sequence-to-value
+ features = data[:-1]
+ labels = data[self.sequence_length:]
+
+ n_windows = len(features) - self.sequence_length + 1
+ bytes_per_item = features.strides[0]
+ strided_features = numpy.lib.stride_tricks.as_strided(
+ features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
+ )
+ # Ensure labels align with the end of each feature window
+ aligned_labels = labels[:n_windows]
+ return _PytorchDataset(strided_features, aligned_labels, labels_dtype=torch.float32)
+
+ else:
+ # Sequence-to-sequence
+ x_data = data[:-1]
+ y_data = data[1:]
+
+ n_windows = len(x_data) - self.sequence_length + 1
+ bytes_per_item = x_data.strides[0]
+
+ strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
+ strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
+
+ return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
+
+ def plot_splits(self, save_dir: Union[str, Path]):
+ """Plots the training, validation and testing data."""
+ if not self._is_split:
+ _LOGGER.error("Cannot plot before splitting data.")
+ raise RuntimeError()
+
+ if self.scaler is None:
+ _LOGGER.error("Cannot plot: data has not been normalized, or scaler is missing.")
+ return
+
+ save_path = make_fullpath(save_dir, make=True, enforce="directory")
+ full_path = save_path / "SequenceSplits.svg"
+
+ plt.figure(figsize=(15, 6))
+ plt.title("Sequential Data")
+ plt.grid(True)
+ plt.xlabel("Sequence")
+ plt.ylabel("Value")
+
+ # Plot denormalized training data
+ plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data') # type: ignore
+
+ # Plot denormalized validation data
+ # We must skip the overlapping 'sequence_length' part for plotting
+ val_plot_data = self.val_sequence[self.sequence_length:] # type: ignore
+ plt.plot(self.val_time_axis, self.scaler.inverse_transform(val_plot_data.reshape(-1, 1)), label='Validation Data', c='orange') # type: ignore
+
+ # Plot denormalized test data
+ # We must skip the overlapping 'sequence_length' part for plotting
+ test_plot_data = self.test_sequence[self.sequence_length:] # type: ignore
+ plt.plot(self.test_time_axis, self.scaler.inverse_transform(test_plot_data.reshape(-1, 1)), label='Test Data', c='green') # type: ignore
+
+ plt.legend()
+
+ plt.tight_layout()
+ plt.savefig(full_path)
+ _LOGGER.info(f"📈 Sequence data splits saved as '{full_path.name}'.")
+ plt.close()
+
+ def get_datasets(self) -> Tuple[Dataset, Dataset, Dataset]:
+ """Returns the final train, validation, and test datasets."""
+ if not self._are_windows_generated:
+ _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
+ raise RuntimeError()
+ return self._train_dataset, self._val_dataset, self._test_dataset # type: ignore
+
+ def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
+ """
+ Saves the fitted DragonScaler's state to a .pth file.
+
+ Args:
+ directory (str | Path): The directory where the scaler will be saved.
+ """
+ if not self.scaler:
+ _LOGGER.error("No scaler was fitted or provided.")
+ raise RuntimeError()
+
+ save_path = make_fullpath(directory, make=True, enforce="directory")
+
+ filename = f"{DatasetKeys.SCALER_PREFIX}{self.prediction_mode}.pth"
+ filepath = save_path / filename
+ self.scaler.save(filepath, verbose=False)
+ if verbose:
+ _LOGGER.info(f"Scaler saved as '{filepath.name}'.")
+
+ def get_last_training_sequence(self) -> numpy.ndarray:
+ """
+ Returns the final, un-scaled sequence from the training data.
+ """
+ if not self._is_split:
+ _LOGGER.error("Data has not been split. Cannot get last training sequence.")
+ raise RuntimeError()
+
+ # The length of train_time_axis is our validation split index
+ val_split_idx = len(self.train_time_axis) # type: ignore
+
+ if val_split_idx < self.sequence_length:
+ _LOGGER.error(f"Training data length ({val_split_idx}) is less than sequence_length ({self.sequence_length}).")
+ raise ValueError()
+
+ # Get the slice from the *original* sequence
+ start_idx = val_split_idx - self.sequence_length
+ end_idx = val_split_idx
+
+ return self.sequence[start_idx:end_idx]
+
+ def __repr__(self) -> str:
+ s = f"<{self.__class__.__name__}>:\n"
+ s += f" Prediction Mode: {self.prediction_mode}\n"
+ s += f" Sequence Length (Window): {self.sequence_length}\n"
+ s += f" Total Data Points: {len(self.sequence)}\n"
+ s += " --- Status ---\n"
+ s += f" Split: {self._is_split}\n"
+ s += f" Normalized: {self._is_normalized}\n"
+ s += f" Windows Generated: {self._are_windows_generated}\n"
+
+ if self._are_windows_generated:
+ train_len = len(self._train_dataset) if self._train_dataset else 0 # type: ignore
+ val_len = len(self._val_dataset) if self._val_dataset else 0 # type: ignore
+ test_len = len(self._test_dataset) if self._test_dataset else 0 # type: ignore
+ s += f" Datasets (Train | Validation | Test): {train_len} | {val_len} | {test_len} windows\n"
+
+ return s
+
+
+ def info():
+ _script_info(__all__)
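Note: a minimal usage sketch for the new DragonDatasetSequence class added above, following its constructor and public methods; the synthetic series, window length, and output directory are illustrative. The prediction_mode string must match MLTaskKeys.SEQUENCE_SEQUENCE, assumed here to equal "sequence-to-sequence" per the type hint:

import numpy
from ml_tools.ML_sequence_datasetmaster import DragonDatasetSequence

series = numpy.sin(numpy.linspace(0.0, 20.0, 500))  # toy univariate signal

# Splitting, DragonScaler fitting/normalization, and windowing run automatically in __init__.
maker = DragonDatasetSequence(
    prediction_mode="sequence-to-sequence",
    data=series,
    sequence_length=32,
    validation_size=0.2,
    test_size=0.1,
)

train_ds, val_ds, test_ds = maker.get_datasets()  # windowed PyTorch datasets
maker.save_scaler("artifacts/")                   # persists the fitted DragonScaler
maker.plot_splits("artifacts/")                   # saves SequenceSplits.svg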