sinapsis-data-analysis 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_data_analysis/__init__.py +0 -0
- sinapsis_data_analysis/helpers/__init__.py +0 -0
- sinapsis_data_analysis/helpers/excluded_models.py +22 -0
- sinapsis_data_analysis/helpers/model_metrics.py +29 -0
- sinapsis_data_analysis/templates/__init__.py +29 -0
- sinapsis_data_analysis/templates/ml_base_inference.py +120 -0
- sinapsis_data_analysis/templates/ml_base_training.py +261 -0
- sinapsis_data_analysis/templates/sklearn_inference.py +28 -0
- sinapsis_data_analysis/templates/sklearn_manifold.py +160 -0
- sinapsis_data_analysis/templates/sklearn_train.py +204 -0
- sinapsis_data_analysis/templates/xgboost_inference.py +26 -0
- sinapsis_data_analysis/templates/xgboost_train.py +75 -0
- sinapsis_data_analysis-0.1.0.dist-info/METADATA +178 -0
- sinapsis_data_analysis-0.1.0.dist-info/RECORD +17 -0
- sinapsis_data_analysis-0.1.0.dist-info/WHEEL +5 -0
- sinapsis_data_analysis-0.1.0.dist-info/licenses/LICENSE +661 -0
- sinapsis_data_analysis-0.1.0.dist-info/top_level.txt +1 -0
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
# Names to skip when dynamically wrapping sklearn modules as templates.
# These are module-level helpers, data structures, or constants — not
# estimator classes — presumably filtered out by the dynamic wrapper
# configuration (TODO confirm against the wrapper templates).

# Path/solver helper functions rather than fittable models.
excluded_linear_models = [
    "enet_path",
    "lars_path",
    "lars_path_gram",
    "lasso_path",
    "orthogonal_mp",
    "orthogonal_mp_gram",
    "ridge_regression",
    "_sgd_fast",
]

# Trees, graph builders and metric constants rather than estimators.
# Fix: "sort_graph_by_row_values" was listed twice; the duplicate entry
# is redundant for membership-style exclusion checks and has been removed.
excluded_neighbors_models = [
    "_ball_tree.BallTree",
    "_ball_tree.KDTree",
    "sort_graph_by_row_values",
    "kneighbors_graph",
    "radius_neighbors_graph",
    "VALID_METRICS",
    "VALID_METRICS_SPARSE",
]

# Plotting/export helpers from the tree module.
excluded_tree_models = ["plot_tree", "export_text", "export_graphviz"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, ConfigDict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ModelMetrics(BaseModel):
    """Base class for storing model metrics.

    All fields default to None; only the metrics relevant to the model
    type (classification or regression) are expected to be populated.
    """

    # Classification metrics (filled by calculate_classification_metrics).
    accuracy: float | None = None
    precision: float | None = None
    recall: float | None = None
    f1_score: float | None = None

    # Regression metrics (filled by calculate_regression_metrics).
    r2_score: float | None = None
    mean_squared_error: float | None = None
    mean_absolute_error: float | None = None

    # Mean absolute percentage error.
    # NOTE(review): not populated by the training code visible in this
    # package — confirm it is set elsewhere or intended for future use.
    mape: float | None = None

    model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ModelPredictionResults(BaseModel):
    """Class to store model predictions and metrics"""

    # Raw output of the model's predict() call; typed Any because numpy
    # arrays are not standard pydantic types (hence the config below).
    predictions: Any
    # Metrics computed for these predictions against the ground truth.
    metrics: ModelMetrics
    model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import importlib
|
|
3
|
+
|
|
4
|
+
from sinapsis.templates import _import_template_package
|
|
5
|
+
from sinapsis_core.template_base import Template
|
|
6
|
+
|
|
7
|
+
# Root import path shared by every template module in this package.
_root_lib_path: str = "sinapsis_data_analysis.templates"

# Modules whose templates are discovered dynamically via
# `_import_template_package` below.
_ADDITIONAL_TEMPLATE_MODULES = [
    f"{_root_lib_path}.sklearn_manifold",
    f"{_root_lib_path}.sklearn_train",
    f"{_root_lib_path}.xgboost_train",
    f"{_root_lib_path}.xgboost_inference",
]
# Static template-name -> module-path mapping, extended below with the
# dynamically discovered templates.
# NOTE(review): xgboost_inference appears both here (statically) and in
# _ADDITIONAL_TEMPLATE_MODULES (dynamically) — confirm that is intended.
_template_lookup: dict = {
    "SKLearnInference": f"{_root_lib_path}.sklearn_inference",
    "XGBoostInference": f"{_root_lib_path}.xgboost_inference",
}
for t_module in _ADDITIONAL_TEMPLATE_MODULES:
    _template_lookup |= _import_template_package(t_module)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def __getattr__(name: str) -> Template:
    """Lazily import a template class the first time it is accessed.

    Args:
        name (str): Attribute name requested on this package.

    Returns:
        Template: The template class resolved from its module.

    Raises:
        AttributeError: If `name` is not a known template.
    """
    if name in _template_lookup:
        module = importlib.import_module(_template_lookup[name])
        return getattr(module, name)
    # Fix: previously this fell through and implicitly returned None,
    # violating the module __getattr__ contract (PEP 562) — unknown names
    # must raise AttributeError so `from pkg import X` fails cleanly.
    raise AttributeError(f"template `{name}` not found in {__name__}")


__all__ = list(_template_lookup.keys())
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from sinapsis_core.data_containers.data_packet import DataContainer
|
|
7
|
+
from sinapsis_core.template_base import TemplateAttributes
|
|
8
|
+
from sinapsis_core.template_base.template import Template
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MLBaseInference(Template):
    """Abstract base class for machine learning model inference.

    This class provides a framework for loading a trained model
    and using it to make predictions on new data.
    """

    class AttributesBaseModel(TemplateAttributes):
        """Attributes for the MLBaseInference template.

        Attributes:
            model_path (str): Path to the saved model file.
            generic_field_key (str): Key of the generic field where data is stored.
        """

        model_path: str
        generic_field_key: str

    def __init__(self, attributes: TemplateAttributes) -> None:
        """Initialize the template and eagerly load the model from disk.

        Args:
            attributes (TemplateAttributes): The attributes for this template.
        """
        super().__init__(attributes)
        # Loading happens at construction time, so a bad model_path fails
        # fast rather than on the first execute() call.
        self.model = self.load_model(self.attributes.model_path)

    def get_data(self, container: DataContainer) -> Any:
        """Get the data from the data container

        Args:
            container (DataContainer): The data container with the data

        Returns:
            Any: The data from the generic field
        """
        return self._get_generic_data(container, self.attributes.generic_field_key)

    @staticmethod
    def data_is_valid(data: Any) -> bool:
        """Check if the data is valid for inference

        Args:
            data (Any): The data to validate

        Returns:
            bool: True if the data is valid, False otherwise
        """
        return data is not None

    def preprocess_data(self, data: Any) -> Any:
        """
        This method can be overridden by subclasses to implement
        specific preprocessing steps

        Args:
            data (Any): The data to preprocess

        Returns:
            Any: The preprocessed data
        """
        # Default behavior drops a "target" column if present.
        # NOTE(review): pop() mutates the caller's data in place — confirm
        # callers do not rely on the original object keeping "target".
        try:
            data.pop("target")
        except (KeyError, IndexError):
            self.logger.info("No target column")
        return data

    @abstractmethod
    def load_model(self, model_path: str) -> Any:
        """
        This abstract method should be implemented by subclasses to define
        how the model should be loaded

        Args:
            model_path (str): Path to the saved model file

        Returns:
            Any: The loaded model
        """

    def predict(self, data: np.ndarray) -> np.ndarray:
        """Generate predictions using the loaded model

        Args:
            data (np.ndarray): The data to make predictions on

        Returns:
            np.ndarray: The model's predictions
        """

        return self.model.predict(data)

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Gets the data, validates it, preprocesses it, makes predictions,
        and stores the results

        Args:
            container (DataContainer): The data container with the input data

        Returns:
            DataContainer: The container with added predictions
        """
        data = self.get_data(container)

        # Skip inference entirely when no usable data is present.
        if not self.data_is_valid(data):
            self.logger.warning("Invalid or missing data")
            return container

        data = self.preprocess_data(data)
        predictions = self.predict(data)

        self._set_generic_data(container, predictions)

        return container
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import os
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from sinapsis_core.data_containers.data_packet import DataContainer
|
|
8
|
+
from sinapsis_core.template_base import TemplateAttributes
|
|
9
|
+
from sinapsis_core.template_base.dynamic_template import BaseDynamicWrapperTemplate
|
|
10
|
+
from sinapsis_data_readers.templates.datasets_readers.dataset_splitter import TabularDatasetSplit
|
|
11
|
+
from sklearn.base import is_classifier, is_regressor
|
|
12
|
+
from sklearn.metrics import (
|
|
13
|
+
accuracy_score,
|
|
14
|
+
f1_score,
|
|
15
|
+
mean_absolute_error,
|
|
16
|
+
mean_squared_error,
|
|
17
|
+
precision_score,
|
|
18
|
+
r2_score,
|
|
19
|
+
recall_score,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from sinapsis_data_analysis.helpers.model_metrics import (
|
|
23
|
+
ModelMetrics,
|
|
24
|
+
ModelPredictionResults,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MLBaseAttributes(TemplateAttributes):
    """Base attributes for machine learning model templates.

    Attributes:
        generic_field_key (str): Key of the generic field where datasets are stored.
        model_save_path (str): Path where the trained model will be saved.
    """

    # Key under which the dataset split is stored in the container.
    generic_field_key: str
    # Destination file path for the serialized trained model.
    model_save_path: str
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class MLBaseTraining(BaseDynamicWrapperTemplate):
    """
    This abstract class provides common functionality for loading data,
    training models, making predictions, calculating metrics, and saving
    models.


    """

    AttributesBaseModel = MLBaseAttributes

    def __init__(self, attributes: TemplateAttributes) -> None:
        """Initialize the MLBase template.

        Args:
            attributes (TemplateAttributes): The attributes for this template.
        """
        super().__init__(attributes)
        # Untrained estimator (the dynamically wrapped callable); the
        # fitted result lives in self.trained_model after train_model().
        self.model = self.wrapped_callable
        self.trained_model = None

    def get_dataset(self, container: DataContainer) -> Any:
        """Get the dataset from the data container.

        Args:
            container (DataContainer): The data container with the dataset.

        Returns:
            Any: The dataset from the generic field.
        """
        return self._get_generic_data(container, self.attributes.generic_field_key)

    @staticmethod
    def dataset_is_valid(dataset: Any) -> bool:
        """Check if the dataset is valid

        Args:
            dataset (Any): The dataset to validate.

        Returns:
            bool: True if the dataset is valid, False otherwise.
        """
        return dataset is not None

    def process_dataset(self, dataset: TabularDatasetSplit) -> tuple | None:
        """
        Extracts x_train, y_train, x_test, y_test from the dataset

        Args:
            dataset (TabularDatasetSplit): The dataset to process

        Returns:
            tuple | None: A tuple containing (x_train, y_train, x_test, y_test)
                or None if the dataset doesn't have the expected attributes
        """
        try:
            x_train = dataset.x_train
            y_train = dataset.y_train
            x_test = dataset.x_test
            y_test = dataset.y_test

            return x_train, y_train, x_test, y_test
        except AttributeError:
            self.logger.warning("Dataset doesn't have the expected attributes")
            return None

    def train_model(self, x_train: Any, y_train: Any) -> None:
        """Train the model using the training data

        Args:
            x_train (Any): The training features
            y_train (Any): The training targets
        """
        self.trained_model = self.model.fit(x_train, y_train)

    @staticmethod
    def calculate_classification_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> ModelMetrics:
        """Calculate metrics specific to classification models

        Args:
            y_true (np.ndarray): The ground truth labels
            y_pred (np.ndarray): The predicted labels

        Returns:
            ModelMetrics: Object containing classification metrics
        """
        metrics = ModelMetrics()
        metrics.accuracy = float(accuracy_score(y_true, y_pred))
        # Weighted averaging handles multi-class targets; zero_division=0
        # avoids warnings/NaN when a class receives no predictions.
        metrics.precision = float(precision_score(y_true, y_pred, average="weighted", zero_division=0))
        metrics.recall = float(recall_score(y_true, y_pred, average="weighted", zero_division=0))
        metrics.f1_score = float(f1_score(y_true, y_pred, average="weighted", zero_division=0))
        return metrics

    @staticmethod
    def calculate_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> ModelMetrics:
        """Calculate metrics specific to regression models

        Args:
            y_true (np.ndarray): The ground truth values
            y_pred (np.ndarray): The predicted values

        Returns:
            ModelMetrics: Object containing regression metrics
        """
        metrics = ModelMetrics()
        metrics.r2_score = float(r2_score(y_true, y_pred))
        metrics.mean_squared_error = float(mean_squared_error(y_true, y_pred))
        metrics.mean_absolute_error = float(mean_absolute_error(y_true, y_pred))

        return metrics

    def calculate_metrics(self, y_true: np.ndarray, y_pred: np.ndarray) -> ModelMetrics:
        """
        Detects whether the model is a classifier or regressor and calculates
        the appropriate metrics

        Args:
            y_true (np.ndarray): The ground truth values/labels
            y_pred (np.ndarray): The predicted values/labels

        Returns:
            ModelMetrics: Object containing the appropriate metrics
        """
        if self.trained_model is not None:
            if is_classifier(self.trained_model):
                return self.calculate_classification_metrics(y_true, y_pred)
            elif is_regressor(self.trained_model):
                return self.calculate_regression_metrics(y_true, y_pred)
        # Neither classifier nor regressor (or not trained): empty metrics.
        return ModelMetrics()

    def generate_predictions(self, x_test: np.ndarray, y_test: np.ndarray) -> ModelPredictionResults | None:
        """
        Uses the trained model to make predictions on the test data
        and calculates the appropriate metrics

        Args:
            x_test (np.ndarray): The test features
            y_test (np.ndarray): The test targets

        Returns:
            ModelPredictionResults | None: Object containing predictions and
                metrics, or None if no model has been trained.
        """
        if self.trained_model is not None:
            predictions = self.trained_model.predict(x_test)

            metrics = self.calculate_metrics(y_test, predictions)

            return ModelPredictionResults(predictions=predictions, metrics=metrics)
        return None

    def handle_model_training(self, processed_data: tuple) -> ModelPredictionResults | None:
        """Handle the model training and prediction workflow.

        Extracts data from the processed dataset, trains the model,
        and generates predictions.

        Args:
            processed_data (tuple): Tuple containing training and testing data.

        Returns:
            ModelPredictionResults | None: Object containing predictions and metrics.
        """
        x_train, y_train, x_test, y_test = processed_data

        self.train_model(x_train, y_train)
        return self.generate_predictions(x_test, y_test)

    def save_model(self) -> None:
        """
        Creates the necessary directories and calls the implementation-specific
        method to save the model to the path specified in attributes.
        If no trained model exists or an error occurs, it will be logged.
        """
        if self.trained_model is None:
            self.logger.error("No model to save")
            return

        try:
            save_dir = os.path.dirname(self.attributes.model_save_path)
            # Fix: os.makedirs("") raises FileNotFoundError when the save
            # path has no directory component (e.g. "model.pkl"), so only
            # create directories when there is a directory part.
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            self._save_model_implementation()
            self.logger.info(f"Model saved at {self.attributes.model_save_path}")
        except (MemoryError, TypeError, OSError) as e:
            # OSError added so filesystem failures (permissions, disk full)
            # are logged as the docstring promises, instead of propagating.
            self.logger.error(f"Error saving model: {e}")

    @abstractmethod
    def _save_model_implementation(self) -> None:
        """Save the trained model using an implementation-specific method.

        This abstract method should be implemented by subclasses to define
        how the model should be serialized and saved to disk.
        """

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Gets the dataset, validates it, processes it, trains the model,
        generates predictions, and stores the results

        Args:
            container (DataContainer): The data container with the dataset

        Returns:
            DataContainer: The container with added predictions and metrics
        """
        dataset = self.get_dataset(container)

        if not self.dataset_is_valid(dataset):
            self.logger.warning("Invalid or missing dataset")
            return container

        processed_data = self.process_dataset(dataset)

        if processed_data is None:
            self.logger.warning("Failed to process dataset")
            return container

        results = self.handle_model_training(processed_data)

        if results is not None:
            self._set_generic_data(container, results)
            self.save_model()

        return container
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import joblib
|
|
5
|
+
|
|
6
|
+
from sinapsis_data_analysis.templates.ml_base_inference import MLBaseInference
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SKLearnInference(MLBaseInference):
    """Template for inference using sklearn models.

    This template loads a saved sklearn model using joblib
    and uses it to make predictions on new data.
    """

    CATEGORY = "SKLearn"

    def load_model(self, model_path: str) -> Any:
        """Load a previously persisted sklearn estimator with joblib.

        Args:
            model_path (str): Path to the saved model file

        Returns:
            Any: The loaded sklearn model
        """
        estimator = joblib.load(model_path)
        return estimator
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from typing import cast
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pydantic import BaseModel, ConfigDict
|
|
7
|
+
from sinapsis_core.data_containers.data_packet import DataContainer
|
|
8
|
+
from sinapsis_core.template_base import Template, TemplateAttributes
|
|
9
|
+
from sinapsis_core.template_base.dynamic_template import (
|
|
10
|
+
BaseDynamicWrapperTemplate,
|
|
11
|
+
WrapperEntryConfig,
|
|
12
|
+
)
|
|
13
|
+
from sinapsis_core.template_base.dynamic_template_factory import make_dynamic_template
|
|
14
|
+
from sinapsis_core.utils.env_var_keys import SINAPSIS_BUILD_DOCS
|
|
15
|
+
from sinapsis_data_readers.templates.datasets_readers.dataset_splitter import (
|
|
16
|
+
TabularDatasetSplit,
|
|
17
|
+
)
|
|
18
|
+
from sklearn import manifold
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ManifoldResults(BaseModel):
    """Class to store the results of manifold learning.

    Attributes:
        labels (np.ndarray | list): The original labels.
        x_transformed (np.ndarray): The data after dimensionality reduction.
    """

    # Labels carried through unchanged so downstream consumers can color
    # or group the embedded points.
    labels: np.ndarray | list
    x_transformed: np.ndarray

    # numpy arrays are not native pydantic types, so arbitrary types must
    # be allowed for validation to accept the fields above.
    model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SKLearnManifold(BaseDynamicWrapperTemplate):
    """
    This template dynamically wraps sklearn's manifold module,
    providing access to dimensionality reduction techniques like
    TSNE, MDS, Isomap, etc.
    """

    # Wraps every public attribute of sklearn.manifold except
    # locally_linear_embedding (the function form; the class is kept).
    WrapperEntry = WrapperEntryConfig(
        wrapped_object=manifold,
        signature_from_doc_string=True,
        exclude_module_atts=["locally_linear_embedding"],
        force_init_as_method=False,
    )

    CATEGORY = "SKLearn"

    class AttributesBaseModel(TemplateAttributes):
        """Attributes for the SKLearnManifold template.

        Attributes:
            generic_field_key (str): Key of the generic field
                where the input data is stored.
        """

        generic_field_key: str

    def __init__(self, attributes: TemplateAttributes) -> None:
        super().__init__(attributes)
        # The wrapped sklearn manifold estimator instance.
        self.manifold_model = self.wrapped_callable

    @staticmethod
    def reshape_arrays(feature_arrays: pd.DataFrame) -> np.ndarray:
        """
        Converts a list of arrays into a 2D numpy array suitable for
        manifold learning algorithms

        Args:
            feature_arrays (pd.DataFrame): Feature data; anything np.array
                accepts (the docstring's "list" and the annotation's
                DataFrame are both valid inputs here).

        Returns:
            np.ndarray: Reshaped array suitable for manifold learning
        """
        array_data = np.array(feature_arrays)
        # Flatten all trailing dimensions so each sample is a 1D vector.
        return array_data.reshape(array_data.shape[0], -1)

    def get_dataset(self, container: DataContainer) -> TabularDatasetSplit | None:
        """Get the dataset from the data container

        Args:
            container (DataContainer): The data container with the dataset

        Returns:
            TabularDatasetSplit | None: The dataset from the generic field,
                or None if not found
        """
        dataset = self._get_generic_data(container, self.attributes.generic_field_key)
        # cast() is a static-typing no-op; the truthiness check below is
        # what actually filters out a missing/empty payload.
        dataset = cast(TabularDatasetSplit, dataset)
        if dataset:
            return dataset
        return None

    def process_dataset(self, dataset: TabularDatasetSplit) -> ManifoldResults | None:
        """
        Extracts the training data, reshapes it, and applies the
        manifold learning transformation

        Args:
            dataset (TabularDatasetSplit): The dataset to process

        Returns:
            ManifoldResults | None: Results of the manifold transformation,
                or None if the dataset is empty
        """
        x_train = dataset.x_train
        y_train = dataset.y_train

        # `.empty` implies x_train is DataFrame-like here — the test split
        # (x_test/y_test) is intentionally unused for embedding.
        if x_train is None or x_train.empty:
            return None

        x_train_reshaped = self.reshape_arrays(x_train)
        x_transformed = self.manifold_model.fit_transform(x_train_reshaped)

        return ManifoldResults(labels=y_train, x_transformed=x_transformed)

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Gets the dataset, processes it using the manifold learning algorithm,
        and stores the results

        Args:
            container (DataContainer): The data container with the dataset

        Returns:
            DataContainer: The container with added manifold learning results
        """
        dataset = self.get_dataset(container)
        if not dataset:
            self.logger.warning("There is no dataset to process")
            return container

        results = self.process_dataset(dataset)

        if results is not None:
            self._set_generic_data(container, results)

        return container
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def __getattr__(name: str) -> Template:
    """Instantiate a dynamic template lazily, on first import/access.

    Building templates on demand avoids constructing base models for every
    wrapped attribute up front, and sidesteps import errors from packages
    that are not installed.
    """
    if name not in SKLearnManifold.WrapperEntry.module_att_names:
        raise AttributeError(f"template `{name}` not found in {__name__}")
    return make_dynamic_template(name, SKLearnManifold)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Public API: one template per wrapped sklearn.manifold attribute.
__all__ = SKLearnManifold.WrapperEntry.module_att_names


if SINAPSIS_BUILD_DOCS:
    # When building documentation, eagerly materialize every dynamic
    # template so autodoc tools can discover them as real module attributes.
    dynamic_templates = [__getattr__(template_name) for template_name in __all__]
    for template in dynamic_templates:
        globals()[template.__name__] = template
    # Remove the leftover loop variable from the module namespace.
    # NOTE(review): raises NameError if __all__ is empty — confirm the
    # wrapped module always exposes at least one attribute.
    del template
|