dragon-ml-toolbox 14.7.0__py3-none-any.whl → 16.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +9 -5
- dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
- ml_tools/ETL_cleaning.py +20 -20
- ml_tools/ETL_engineering.py +23 -25
- ml_tools/GUI_tools.py +20 -20
- ml_tools/MICE_imputation.py +3 -3
- ml_tools/ML_callbacks.py +43 -26
- ml_tools/ML_configuration.py +704 -24
- ml_tools/ML_datasetmaster.py +235 -280
- ml_tools/ML_evaluation.py +144 -39
- ml_tools/ML_evaluation_multi.py +103 -35
- ml_tools/ML_inference.py +290 -208
- ml_tools/ML_models.py +13 -102
- ml_tools/ML_models_advanced.py +1 -1
- ml_tools/ML_optimization.py +12 -12
- ml_tools/ML_scaler.py +11 -11
- ml_tools/ML_sequence_datasetmaster.py +341 -0
- ml_tools/ML_sequence_evaluation.py +219 -0
- ml_tools/ML_sequence_inference.py +391 -0
- ml_tools/ML_sequence_models.py +139 -0
- ml_tools/ML_trainer.py +1342 -386
- ml_tools/ML_utilities.py +1 -1
- ml_tools/ML_vision_datasetmaster.py +120 -72
- ml_tools/ML_vision_evaluation.py +30 -6
- ml_tools/ML_vision_inference.py +129 -152
- ml_tools/ML_vision_models.py +1 -1
- ml_tools/ML_vision_transformers.py +121 -40
- ml_tools/PSO_optimization.py +6 -6
- ml_tools/SQL.py +4 -4
- ml_tools/{keys.py → _keys.py} +45 -0
- ml_tools/_schema.py +1 -1
- ml_tools/ensemble_evaluation.py +1 -1
- ml_tools/ensemble_inference.py +7 -33
- ml_tools/ensemble_learning.py +1 -1
- ml_tools/optimization_tools.py +2 -2
- ml_tools/path_manager.py +5 -5
- ml_tools/utilities.py +1 -2
- dragon_ml_toolbox-14.7.0.dist-info/RECORD +0 -49
- ml_tools/RNN_forecast.py +0 -56
- ml_tools/_ML_vision_recipe.py +0 -88
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-14.7.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_models.py
CHANGED
@@ -7,16 +7,15 @@ import json
from ._logger import _LOGGER
from .path_manager import make_fullpath
from ._script_info import _script_info
-from .
+from ._keys import PytorchModelArchitectureKeys
from ._schema import FeatureSchema


__all__ = [
-    "
-    "
-    "
-    "
-    "SequencePredictorLSTM",
+    "DragonMLP",
+    "DragonAttentionMLP",
+    "DragonMultiHeadAttentionNet",
+    "DragonTabularTransformer"
]


@@ -174,7 +173,7 @@ class _BaseAttention(_BaseMLP):
        return logits, attention_weights


-class
+class DragonMLP(_BaseMLP):
    """
    Creates a versatile Multilayer Perceptron (MLP) for regression or classification tasks.
    """
@@ -208,10 +207,10 @@ class MultilayerPerceptron(_BaseMLP):
        # Extracts the number of neurons from each nn.Linear layer
        layer_sizes = [str(layer.in_features) for layer in self.mlp if isinstance(layer, nn.Linear)]

-        return self._repr_helper(name="
+        return self._repr_helper(name="DragonMLP", mlp_layers=layer_sizes)


-class
+class DragonAttentionMLP(_BaseAttention):
    """
    A Multilayer Perceptron (MLP) that incorporates an Attention layer to dynamically weigh input features.

@@ -244,10 +243,10 @@ class AttentionMLP(_BaseAttention):
            if isinstance(layer, nn.Linear):
                arch.append(str(layer.in_features))

-        return self._repr_helper(name="
+        return self._repr_helper(name="DragonAttentionMLP", mlp_layers=arch)


-class
+class DragonMultiHeadAttentionNet(_BaseAttention):
    """
    An MLP that incorporates a standard `nn.MultiheadAttention` layer to process
    the input features.
@@ -292,10 +291,10 @@ class MultiHeadAttentionMLP(_BaseAttention):
        )
        arch_str = f"{self.in_features} -> [MultiHead(h={self.num_heads})] -> {mlp_part}"

-        return f"
+        return f"DragonMultiHeadAttentionNet(arch: {arch_str})"


-class
+class DragonTabularTransformer(nn.Module, _ArchitectureHandlerMixin):
    """
    A Transformer-based model for tabular data tasks.

@@ -502,7 +501,7 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):

        arch_str = " -> ".join(parts)

-        return f"
+        return f"DragonTabularTransformer(arch: {arch_str})"


class _FeatureTokenizer(nn.Module):
@@ -662,93 +661,5 @@ class _MultiHeadAttentionLayer(nn.Module):
        return out, attn_weights.squeeze()


-class SequencePredictorLSTM(nn.Module, _ArchitectureHandlerMixin):
-    """
-    A simple LSTM-based network for sequence-to-sequence prediction tasks.
-
-    This model is designed for datasets where each input sequence maps to an
-    output sequence of the same length. It's suitable for forecasting problems
-    prepared by the `SequenceMaker` class.
-
-    The expected input shape is `(batch_size, sequence_length, features)`.
-
-    Args:
-        features (int): The number of features in the input sequence. Defaults to 1.
-        hidden_size (int): The number of features in the LSTM's hidden state.
-            Defaults to 100.
-        recurrent_layers (int): The number of recurrent LSTM layers. Defaults to 1.
-        dropout (float): The dropout probability for all but the last LSTM layer.
-            Defaults to 0.
-    """
-    def __init__(self, features: int = 1, hidden_size: int = 100,
-                 recurrent_layers: int = 1, dropout: float = 0):
-        super().__init__()
-
-        # --- Validation ---
-        if not isinstance(features, int) or features < 1:
-            raise ValueError("features must be a positive integer.")
-        if not isinstance(hidden_size, int) or hidden_size < 1:
-            raise ValueError("hidden_size must be a positive integer.")
-        if not isinstance(recurrent_layers, int) or recurrent_layers < 1:
-            raise ValueError("recurrent_layers must be a positive integer.")
-        if not (0.0 <= dropout < 1.0):
-            raise ValueError("dropout must be a float between 0.0 and 1.0.")
-
-        # --- Save configuration ---
-        self.features = features
-        self.hidden_size = hidden_size
-        self.recurrent_layers = recurrent_layers
-        self.dropout = dropout
-
-        # Build model
-        self.lstm = nn.LSTM(
-            input_size=features,
-            hidden_size=hidden_size,
-            num_layers=recurrent_layers,
-            dropout=dropout,
-            batch_first=True # This is crucial for (batch, seq, feature) input
-        )
-        self.linear = nn.Linear(in_features=hidden_size, out_features=features)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Defines the forward pass.
-
-        Args:
-            x (torch.Tensor): The input tensor with shape
-                (batch_size, sequence_length, features).
-
-        Returns:
-            torch.Tensor: The output tensor with shape
-                (batch_size, sequence_length, features).
-        """
-        # The LSTM returns the full output sequence and the final hidden/cell states
-        lstm_out, _ = self.lstm(x)
-
-        # Pass the LSTM's output sequence to the linear layer
-        predictions = self.linear(lstm_out)
-
-        return predictions
-
-    def get_architecture_config(self) -> dict:
-        """Returns the configuration of the model."""
-        return {
-            'features': self.features,
-            'hidden_size': self.hidden_size,
-            'recurrent_layers': self.recurrent_layers,
-            'dropout': self.dropout
-        }
-
-    def __repr__(self) -> str:
-        """Returns the developer-friendly string representation of the model."""
-        return (
-            f"SequencePredictorLSTM(features={self.lstm.input_size}, "
-            f"hidden_size={self.lstm.hidden_size}, "
-            f"recurrent_layers={self.lstm.num_layers})"
-        )
-
-
-# ---- PyTorch models ---
-
def info():
    _script_info(__all__)
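Net effect of this file: the public tabular models keep their behavior but gain the `Dragon` prefix (the old names are still visible in the hunk headers above: `MultilayerPerceptron`, `AttentionMLP`, `MultiHeadAttentionMLP`, `TabularTransformer`), `SequencePredictorLSTM` is removed in favor of the new sequence modules, and architecture keys now come from the private `_keys` module. A minimal, hypothetical migration sketch for downstream imports follows; only the names exported in the new `__all__` are taken from this diff, and no constructor arguments are shown because they do not appear in these hunks.

# Hypothetical migration sketch (import names only, from this diff).

# dragon-ml-toolbox <= 14.7.0
# from ml_tools.ML_models import MultilayerPerceptron, TabularTransformer

# dragon-ml-toolbox 16.2.0
from ml_tools.ML_models import DragonMLP, DragonTabularTransformer  # renamed classes
# LSTM-style sequence models now live in the new ml_tools.ML_sequence_models module.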
ml_tools/ML_models_advanced.py
CHANGED
@@ -6,7 +6,7 @@ import json

from ._logger import _LOGGER
from .path_manager import make_fullpath
-from .
+from ._keys import PytorchModelArchitectureKeys
from ._schema import FeatureSchema
from ._script_info import _script_info
from .ML_models import _ArchitectureHandlerMixin
ml_tools/ML_optimization.py
CHANGED
@@ -14,9 +14,9 @@ from functools import partial
from .path_manager import make_fullpath, sanitize_filename
from ._logger import _LOGGER
from ._script_info import _script_info
-from .ML_inference import
-from .
-from .SQL import
+from .ML_inference import DragonInferenceHandler
+from ._keys import PyTorchInferenceKeys
+from .SQL import DragonSQL
from .optimization_tools import _save_result, create_optimization_bounds
from .utilities import save_dataframe_filename
from .math_utilities import discretize_categorical_values
@@ -24,14 +24,14 @@ from ._schema import FeatureSchema


__all__ = [
-    "
+    "DragonOptimizer",
    "FitnessEvaluator",
    "create_pytorch_problem",
    "run_optimization"
]


-class
+class DragonOptimizer:
    """
    A wrapper class for setting up and running EvoTorch optimization tasks.

@@ -47,7 +47,7 @@ class MLOptimizer:
    >>> cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}
    >>>
    >>> # 3. Initialize the optimizer
-    >>> optimizer =
+    >>> optimizer = DragonOptimizer(
    ...     inference_handler=my_handler,
    ...     schema=schema,
    ...     continuous_bounds_map=cont_bounds,
@@ -63,7 +63,7 @@ class MLOptimizer:
    ... )
    """
    def __init__(self,
-                 inference_handler:
+                 inference_handler: DragonInferenceHandler,
                 schema: FeatureSchema,
                 continuous_bounds_map: Dict[str, Tuple[float, float]],
                 task: Literal["min", "max"],
@@ -75,7 +75,7 @@ class MLOptimizer:
        Initializes the optimizer by creating the EvoTorch problem and searcher.

        Args:
-            inference_handler (
+            inference_handler (DragonInferenceHandler):
                An initialized inference handler containing the model.
            schema (FeatureSchema):
                The definitive schema object from data_exploration.
@@ -172,18 +172,18 @@ class FitnessEvaluator:
    A callable class that wraps the PyTorch model inference handler and performs
    on-the-fly discretization for the EvoTorch fitness function.

-    This class is automatically instantiated by
+    This class is automatically instantiated by DragonOptimizer and passed to
    create_pytorch_problem, encapsulating the evaluation logic.
    """
    def __init__(self,
-                 inference_handler:
+                 inference_handler: DragonInferenceHandler,
                 categorical_index_map: Optional[Dict[int, int]] = None,
                 discretize_start_at_zero: bool = True):
        """
        Initializes the fitness evaluator.

        Args:
-            inference_handler (
+            inference_handler (DragonInferenceHandler):
                An initialized inference handler containing the model.
            categorical_index_map (Dict[int, int] | None):
                Maps {column_index: cardinality} for discretization.
@@ -426,7 +426,7 @@ def run_optimization(
    _LOGGER.info(f"🏁 Starting optimal solution space analysis with {repetitions} repetitions...")

    first_run_logger = None # To store the logger from the first rep
-    db_context =
+    db_context = DragonSQL(db_path) if save_format in ['sqlite', 'both'] else nullcontext()

    with db_context as db_manager:
        # --- Setup Database Schema (if applicable) ---
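The optimizer keeps the docstring workflow shown above, only under the new name. A condensed sketch follows; it assumes `my_handler` (a `DragonInferenceHandler`) and `schema` (a `FeatureSchema`) already exist, copies keyword names from the docstring excerpt, and omits any constructor parameters that are not visible in these hunks.

# Sketch adapted from the DragonOptimizer docstring in this diff.
# 'my_handler' and 'schema' are assumed to exist; parameters not shown in the
# hunks above (anything after 'task' in __init__) are omitted here.
from ml_tools.ML_optimization import DragonOptimizer

cont_bounds = {'feature_A': (0, 100), 'feature_B': (-10, 10)}

optimizer = DragonOptimizer(
    inference_handler=my_handler,        # DragonInferenceHandler with the trained model
    schema=schema,                       # FeatureSchema from data exploration
    continuous_bounds_map=cont_bounds,   # bounds for continuous features
    task="max",                          # Literal["min", "max"]
)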
ml_tools/ML_scaler.py
CHANGED
@@ -9,11 +9,11 @@ from .path_manager import make_fullpath


__all__ = [
-    "
+    "DragonScaler"
]


-class
+class DragonScaler:
    """
    Standardizes continuous features in a PyTorch dataset by subtracting the
    mean and dividing by the standard deviation.
@@ -38,7 +38,7 @@ class PytorchScaler:
        self.continuous_feature_indices = continuous_feature_indices

    @classmethod
-    def fit(cls, dataset: Dataset, continuous_feature_indices: List[int], batch_size: int = 64) -> '
+    def fit(cls, dataset: Dataset, continuous_feature_indices: List[int], batch_size: int = 64) -> 'DragonScaler':
        """
        Fits the scaler by computing the mean and std dev from a dataset using a
        fast, single-pass, vectorized algorithm.
@@ -50,7 +50,7 @@ class PytorchScaler:
            batch_size (int): The batch size for iterating through the dataset.

        Returns:
-
+            DragonScaler: A new, fitted instance of the scaler.
        """
        if not continuous_feature_indices:
            _LOGGER.error("No continuous feature indices provided. Scaler will not be fitted.")
@@ -167,10 +167,10 @@ class PytorchScaler:
        }
        torch.save(state, path_obj)
        if verbose:
-            _LOGGER.info(f"
+            _LOGGER.info(f"DragonScaler state saved as '{path_obj.name}'.")

    @staticmethod
-    def load(filepath: Union[str, Path], verbose: bool=True) -> '
+    def load(filepath: Union[str, Path], verbose: bool=True) -> 'DragonScaler':
        """
        Loads a scaler's state from a .pth file.

@@ -178,13 +178,13 @@ class PytorchScaler:
            filepath (str | Path): The path to the saved scaler file.

        Returns:
-
+            DragonScaler: An instance of the scaler with the loaded state.
        """
        path_obj = make_fullpath(filepath, enforce="file")
        state = torch.load(path_obj)
        if verbose:
-            _LOGGER.info(f"
-        return
+            _LOGGER.info(f"DragonScaler state loaded from '{path_obj.name}'.")
+        return DragonScaler(
            mean=state['mean'],
            std=state['std'],
            continuous_feature_indices=state['continuous_feature_indices']
@@ -194,8 +194,8 @@ class PytorchScaler:
        """Returns the developer-friendly string representation of the scaler."""
        if self.continuous_feature_indices:
            num_features = len(self.continuous_feature_indices)
-            return f"
-        return "
+            return f"DragonScaler(fitted for {num_features} features)"
+        return "DragonScaler(not fitted)"


def info():
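Apart from the rename (the hunk headers show the old `PytorchScaler`), `fit`, `save`, and `load` keep the signatures shown above. A short sketch, assuming `train_ds` is a `torch.utils.data.Dataset` whose columns 0 and 1 hold continuous features:

# Minimal sketch using only methods visible in this diff; 'train_ds' is assumed.
from ml_tools.ML_scaler import DragonScaler

scaler = DragonScaler.fit(train_ds, continuous_feature_indices=[0, 1], batch_size=64)
scaler.save("scaler_state.pth")                  # logs: DragonScaler state saved as ...
restored = DragonScaler.load("scaler_state.pth")
print(restored)                                  # e.g. DragonScaler(fitted for 2 features)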
ml_tools/ML_sequence_datasetmaster.py
ADDED

@@ -0,0 +1,341 @@
+import torch
+from torch.utils.data import Dataset
+import pandas
+import numpy
+from typing import Literal, Union, Tuple
+import matplotlib.pyplot as plt
+from pathlib import Path
+
+from .path_manager import make_fullpath
+from ._logger import _LOGGER
+from ._script_info import _script_info
+from .ML_scaler import DragonScaler
+from .ML_datasetmaster import _PytorchDataset
+from ._keys import DatasetKeys, MLTaskKeys
+
+
+__all__ = [
+    "DragonDatasetSequence"
+]
+
+
+# --- SequenceMaker ---
+class DragonDatasetSequence:
+    """
+    Creates windowed PyTorch datasets from a univariate (one feature) sequential data.
+
+    Automatic Pipeline:
+
+    1. Split Data: Separate data into training, validation, and testing portions.
+    2. Normalize Data: Normalize the data. The scaler will be fitted on the training portion.
+    3. Generate Windows: Create the windowed sequences from the normalized splits.
+    """
+    def __init__(self,
+                 prediction_mode: Literal["sequence-to-sequence", "sequence-to-value"],
+                 data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray],
+                 sequence_length: int,
+                 validation_size: float = 0.2,
+                 test_size: float = 0.1):
+        """
+        Initializes the dataset manager and automatically processes the data.
+
+        The constructor runs the full pipeline:
+        1. Splits the data chronologically (train, validation, test).
+        2. Fits a DragonScaler on the training split.
+        3. Normalizes all splits using the fitted scaler.
+        4. Generates windowed datasets for training, validation, and testing.
+
+        Args:
+            prediction_mode: The type of sequence task.
+            data: The input univariate time-series data.
+                - If pandas.DataFrame: The index is used for the time axis
+                  and the *first column* is used as the sequence.
+                - If pandas.Series: The index is used for the time axis.
+                - If numpy.ndarray: A simple integer range is used for the time axis.
+            sequence_length (int): The number of time steps in each input window (X).
+            validation_size (float): The fraction of data to hold out for validation.
+            test_size (float): The fraction of data to hold out for testing.
+        """
+        self._train_dataset = None
+        self._test_dataset = None
+        self._val_dataset = None
+        self.sequence_length = sequence_length
+        self.scaler = None
+
+        if not prediction_mode in [MLTaskKeys.SEQUENCE_SEQUENCE, MLTaskKeys.SEQUENCE_VALUE]:
+            _LOGGER.error(f"Unrecognized prediction mode: '{prediction_mode}'.")
+            raise ValueError()
+        else:
+            self.prediction_mode = prediction_mode
+
+        if isinstance(data, pandas.DataFrame):
+            self.time_axis = data.index.values
+            self.sequence = data.iloc[:, 0].values.astype(numpy.float32)
+        elif isinstance(data, pandas.Series):
+            self.time_axis = data.index.values
+            self.sequence = data.values.astype(numpy.float32)
+        elif isinstance(data, numpy.ndarray):
+            self.time_axis = numpy.arange(len(data))
+            self.sequence = data.astype(numpy.float32)
+        else:
+            _LOGGER.error("Data must be a pandas DataFrame/Series or a numpy array.")
+            raise TypeError()
+
+        self.train_sequence = None
+        self.val_sequence = None
+        self.test_sequence = None
+
+        self.train_time_axis = None
+        self.val_time_axis = None
+        self.test_time_axis = None
+
+        self._is_split = False
+        self._is_normalized = False
+        self._are_windows_generated = False
+
+        # Automation
+        self._split_data(validation_size=validation_size, test_size=test_size)
+        self._normalize_data()
+        self._generate_windows()
+
+    def _split_data(self, validation_size: float = 0.2, test_size: float = 0.1) -> None:
+        """
+        Splits the sequence chronologically into training, validation, and testing portions.
+
+        To prevent windowing errors, the validation and test sets include an overlap of `sequence_length` from the preceding data.
+        """
+        if self._is_split:
+            _LOGGER.warning("Data has already been split.")
+            return
+
+        if (validation_size + test_size) >= 1.0:
+            _LOGGER.error(f"The sum of validation_size ({validation_size}) and test_size ({test_size}) must be less than 1.0.")
+            raise ValueError("validation_size and test_size sum must be < 1.0")
+
+        total_size = len(self.sequence)
+
+        # Calculate split indices
+        test_split_idx = int(total_size * (1 - test_size))
+        val_split_idx = int(total_size * (1 - test_size - validation_size))
+
+        # --- Create sequences ---
+        # Train sequence is from the beginning to the validation index
+        self.train_sequence = self.sequence[:val_split_idx]
+
+        # Validation sequence starts `sequence_length` before its split index for windowing
+        self.val_sequence = self.sequence[val_split_idx - self.sequence_length : test_split_idx]
+
+        # Test sequence starts `sequence_length` before its split index for windowing
+        self.test_sequence = self.sequence[test_split_idx - self.sequence_length:]
+
+        # --- Create time axes ---
+        self.train_time_axis = self.time_axis[:val_split_idx]
+        # The "plottable" validation/test time axes start from their respective split indices
+        self.val_time_axis = self.time_axis[val_split_idx : test_split_idx]
+        self.test_time_axis = self.time_axis[test_split_idx:]
+
+        self._is_split = True
+        _LOGGER.info(f"Sequence split into training ({len(self.train_sequence)}), validation ({len(self.val_sequence)}), and testing ({len(self.test_sequence)}) points.")
+
+    def _normalize_data(self) -> None:
+        """
+        Normalizes the sequence data using DragonScaler. Must be called AFTER splitting to prevent data leakage from the test set.
+        """
+        if not self._is_split:
+            _LOGGER.error("Data must be split BEFORE normalizing.")
+            raise RuntimeError()
+
+        if self.scaler:
+            _LOGGER.warning("Data has already been normalized.")
+            return
+
+        # 1. DragonScaler requires a Dataset to fit. Create a temporary one.
+        # The scaler expects 2D data [n_samples, n_features].
+        train_features = self.train_sequence.reshape(-1, 1) # type: ignore
+
+        # _PytorchDataset needs labels, so we create dummy ones.
+        dummy_labels = numpy.zeros(len(train_features))
+        temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
+
+        # 2. Fit the DragonScaler on the temporary training dataset.
+        # The sequence is a single feature, so its index is [0].
+        _LOGGER.info("Fitting DragonScaler on the training data...")
+        self.scaler = DragonScaler.fit(temp_train_ds, continuous_feature_indices=[0])
+
+        # 3. Transform sequences using the fitted scaler.
+        # The transform method requires a tensor, so we convert, transform, and convert back.
+        train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+        val_tensor = torch.tensor(self.val_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+        test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32) # type: ignore
+
+        self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
+        self.val_sequence = self.scaler.transform(val_tensor).numpy().flatten()
+        self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
+
+        self._is_normalized = True
+        _LOGGER.info("Sequence data normalized using DragonScaler.")
+
+    def _generate_windows(self) -> None:
+        """
+        Generates overlapping windows for features and labels.
+        """
+        if not self._is_split:
+            _LOGGER.error("Cannot generate windows before splitting data.")
+            raise RuntimeError()
+
+        if not self._is_normalized:
+            _LOGGER.error("Cannot generate windows before normalizing data.")
+            raise RuntimeError()
+
+        if self._are_windows_generated:
+            _LOGGER.warning("Windows have already been generated.")
+            return
+
+        self._train_dataset = self._create_windowed_dataset(self.train_sequence) # type: ignore
+        self._val_dataset = self._create_windowed_dataset(self.val_sequence) # type: ignore
+        self._test_dataset = self._create_windowed_dataset(self.test_sequence) # type: ignore
+
+        self._are_windows_generated = True
+        _LOGGER.info("Feature and label windows generated for train, validation, and test sets.")
+
+    def _create_windowed_dataset(self, data: numpy.ndarray) -> Dataset:
+        """Efficiently creates windowed features and labels using numpy."""
+        if len(data) <= self.sequence_length:
+            # Validation/Test sets of size 0 might be passed
+            _LOGGER.warning(f"Data length ({len(data)}) is not greater than sequence_length ({self.sequence_length}). Cannot create windows. Returning empty dataset.")
+            return _PytorchDataset(numpy.array([]), numpy.array([]), labels_dtype=torch.float32)
+
+        if self.prediction_mode == MLTaskKeys.SEQUENCE_VALUE:
+            # sequence-to-value
+            features = data[:-1]
+            labels = data[self.sequence_length:]
+
+            n_windows = len(features) - self.sequence_length + 1
+            bytes_per_item = features.strides[0]
+            strided_features = numpy.lib.stride_tricks.as_strided(
+                features, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item)
+            )
+            # Ensure labels align with the end of each feature window
+            aligned_labels = labels[:n_windows]
+            return _PytorchDataset(strided_features, aligned_labels, labels_dtype=torch.float32)
+
+        else:
+            # Sequence-to-sequence
+            x_data = data[:-1]
+            y_data = data[1:]
+
+            n_windows = len(x_data) - self.sequence_length + 1
+            bytes_per_item = x_data.strides[0]
+
+            strided_x = numpy.lib.stride_tricks.as_strided(x_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
+            strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
+
+            return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
+
+    def plot_splits(self, save_dir: Union[str, Path]):
+        """Plots the training, validation and testing data."""
+        if not self._is_split:
+            _LOGGER.error("Cannot plot before splitting data.")
+            raise RuntimeError()
+
+        if self.scaler is None:
+            _LOGGER.error("Cannot plot: data has not been normalized, or scaler is missing.")
+            return
+
+        save_path = make_fullpath(save_dir, make=True, enforce="directory")
+        full_path = save_path / "SequenceSplits.svg"
+
+        plt.figure(figsize=(15, 6))
+        plt.title("Sequential Data")
+        plt.grid(True)
+        plt.xlabel("Sequence")
+        plt.ylabel("Value")
+
+        # Plot denormalized training data
+        plt.plot(self.train_time_axis, self.scaler.inverse_transform(self.train_sequence.reshape(-1, 1)), label='Train Data') # type: ignore
+
+        # Plot denormalized validation data
+        # We must skip the overlapping 'sequence_length' part for plotting
+        val_plot_data = self.val_sequence[self.sequence_length:] # type: ignore
+        plt.plot(self.val_time_axis, self.scaler.inverse_transform(val_plot_data.reshape(-1, 1)), label='Validation Data', c='orange') # type: ignore
+
+        # Plot denormalized test data
+        # We must skip the overlapping 'sequence_length' part for plotting
+        test_plot_data = self.test_sequence[self.sequence_length:] # type: ignore
+        plt.plot(self.test_time_axis, self.scaler.inverse_transform(test_plot_data.reshape(-1, 1)), label='Test Data', c='green') # type: ignore
+
+        plt.legend()
+
+        plt.tight_layout()
+        plt.savefig(full_path)
+        _LOGGER.info(f"📈 Sequence data splits saved as '{full_path.name}'.")
+        plt.close()
+
+    def get_datasets(self) -> Tuple[Dataset, Dataset, Dataset]:
+        """Returns the final train, validation, and test datasets."""
+        if not self._are_windows_generated:
+            _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
+            raise RuntimeError()
+        return self._train_dataset, self._val_dataset, self._test_dataset # type: ignore
+
+    def save_scaler(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """
+        Saves the fitted DragonScaler's state to a .pth file.
+
+        Args:
+            directory (str | Path): The directory where the scaler will be saved.
+        """
+        if not self.scaler:
+            _LOGGER.error("No scaler was fitted or provided.")
+            raise RuntimeError()
+
+        save_path = make_fullpath(directory, make=True, enforce="directory")
+
+        filename = f"{DatasetKeys.SCALER_PREFIX}{self.prediction_mode}.pth"
+        filepath = save_path / filename
+        self.scaler.save(filepath, verbose=False)
+        if verbose:
+            _LOGGER.info(f"Scaler saved as '{filepath.name}'.")
+
+    def get_last_training_sequence(self) -> numpy.ndarray:
+        """
+        Returns the final, un-scaled sequence from the training data.
+        """
+        if not self._is_split:
+            _LOGGER.error("Data has not been split. Cannot get last training sequence.")
+            raise RuntimeError()
+
+        # The length of train_time_axis is our validation split index
+        val_split_idx = len(self.train_time_axis) # type: ignore
+
+        if val_split_idx < self.sequence_length:
+            _LOGGER.error(f"Training data length ({val_split_idx}) is less than sequence_length ({self.sequence_length}).")
+            raise ValueError()
+
+        # Get the slice from the *original* sequence
+        start_idx = val_split_idx - self.sequence_length
+        end_idx = val_split_idx
+
+        return self.sequence[start_idx:end_idx]
+
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__}>:\n"
+        s += f" Prediction Mode: {self.prediction_mode}\n"
+        s += f" Sequence Length (Window): {self.sequence_length}\n"
+        s += f" Total Data Points: {len(self.sequence)}\n"
+        s += " --- Status ---\n"
+        s += f" Split: {self._is_split}\n"
+        s += f" Normalized: {self._is_normalized}\n"
+        s += f" Windows Generated: {self._are_windows_generated}\n"
+
+        if self._are_windows_generated:
+            train_len = len(self._train_dataset) if self._train_dataset else 0 # type: ignore
+            val_len = len(self._val_dataset) if self._val_dataset else 0 # type: ignore
+            test_len = len(self._test_dataset) if self._test_dataset else 0 # type: ignore
+            s += f" Datasets (Train | Validation | Test): {train_len} | {val_len} | {test_len} windows\n"
+
+        return s
+
+
+def info():
+    _script_info(__all__)
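This new class replaces the `SequenceMaker` workflow referenced in the removed `SequencePredictorLSTM` docstring: constructing it splits, scales, and windows a univariate series in one step. A usage sketch follows; the constructor and method names come from the file above, while the toy series is made up for illustration.

# Sketch of the new sequence pipeline; the toy series below is illustrative only.
import pandas as pd
from ml_tools.ML_sequence_datasetmaster import DragonDatasetSequence

series = pd.Series(range(500), dtype="float32")    # stand-in univariate signal

maker = DragonDatasetSequence(
    prediction_mode="sequence-to-sequence",   # or "sequence-to-value" (per the Literal hint)
    data=series,
    sequence_length=24,                       # 24-step input windows
    validation_size=0.2,
    test_size=0.1,
)

train_ds, val_ds, test_ds = maker.get_datasets()   # windowed PyTorch datasets
maker.plot_splits("plots")                         # saves SequenceSplits.svg
maker.save_scaler("artifacts")                     # persists the fitted DragonScaler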