sinapsis-data-analysis 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
File without changes
@@ -0,0 +1,22 @@
1
# -*- coding: utf-8 -*-
# Module-level attribute names excluded when dynamically wrapping the
# corresponding sklearn subpackages: these entries are functions, compiled
# helpers, or constants rather than estimator classes.
excluded_linear_models = [
    "enet_path",
    "lars_path",
    "lars_path_gram",
    "lasso_path",
    "orthogonal_mp",
    "orthogonal_mp_gram",
    "ridge_regression",
    "_sgd_fast",
]
excluded_neighbors_models = [
    "_ball_tree.BallTree",
    "_ball_tree.KDTree",
    # NOTE: the original list contained "sort_graph_by_row_values" twice;
    # the duplicate has been removed.
    "sort_graph_by_row_values",
    "kneighbors_graph",
    "radius_neighbors_graph",
    "VALID_METRICS",
    "VALID_METRICS_SPARSE",
]
excluded_tree_models = ["plot_tree", "export_text", "export_graphviz"]
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, ConfigDict
5
+
6
+
7
class ModelMetrics(BaseModel):
    """Base class for storing model metrics.

    All fields default to ``None``; only the metrics relevant to the
    trained model's type (classification vs. regression) are populated.
    """

    # Classification metrics
    accuracy: float | None = None
    precision: float | None = None
    recall: float | None = None
    f1_score: float | None = None

    # Regression metrics
    r2_score: float | None = None
    mean_squared_error: float | None = None
    mean_absolute_error: float | None = None

    # Mean absolute percentage error — not set by the trainers visible in
    # this file; presumably populated elsewhere (TODO confirm).
    mape: float | None = None

    model_config = ConfigDict(arbitrary_types_allowed=True)
22
+
23
+
24
class ModelPredictionResults(BaseModel):
    """Class to store model predictions and metrics."""

    # Raw model output; the type is intentionally open (hence
    # arbitrary_types_allowed) — presumably a numpy array from
    # `model.predict`, verify against callers.
    predictions: Any
    # Evaluation metrics computed against the test targets.
    metrics: ModelMetrics
    model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+ import importlib
3
+
4
+ from sinapsis.templates import _import_template_package
5
+ from sinapsis_core.template_base import Template
6
+
7
# Root import path for all template modules in this package.
_root_lib_path: str = "sinapsis_data_analysis.templates"

# Modules whose exported templates are discovered dynamically via
# `_import_template_package`.
_ADDITIONAL_TEMPLATE_MODULES = [
    f"{_root_lib_path}.sklearn_manifold",
    f"{_root_lib_path}.sklearn_train",
    f"{_root_lib_path}.xgboost_train",
    f"{_root_lib_path}.xgboost_inference",
]
# Static template-name -> module-path entries; extended below with the
# dynamically discovered templates. Used by `__getattr__` for lazy imports.
_template_lookup: dict = {
    "SKLearnInference": f"{_root_lib_path}.sklearn_inference",
    "XGBoostInference": f"{_root_lib_path}.xgboost_inference",
}
for t_module in _ADDITIONAL_TEMPLATE_MODULES:
    _template_lookup |= _import_template_package(t_module)
+
22
+
23
def __getattr__(name: str) -> Template:
    """Lazily import and return the template class registered as ``name``.

    Args:
        name (str): Attribute (template) name being looked up.

    Returns:
        Template: The template class imported from its owning module.

    Raises:
        AttributeError: If ``name`` is not a known template. PEP 562
            requires module ``__getattr__`` to raise instead of implicitly
            returning None, so that ``hasattr``/``getattr`` with a default
            behave correctly.
    """
    if name in _template_lookup:
        module = importlib.import_module(_template_lookup[name])
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__all__ = list(_template_lookup.keys())
@@ -0,0 +1,120 @@
1
+ # -*- coding: utf-8 -*-
2
+ from abc import abstractmethod
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ from sinapsis_core.data_containers.data_packet import DataContainer
7
+ from sinapsis_core.template_base import TemplateAttributes
8
+ from sinapsis_core.template_base.template import Template
9
+
10
+
11
class MLBaseInference(Template):
    """Abstract base class for machine learning model inference.

    Wires a subclass-provided model loader into the template lifecycle:
    fetch data from the container, validate it, preprocess it, run the
    model, and store the predictions back in the container.
    """

    class AttributesBaseModel(TemplateAttributes):
        """Attributes for the MLBaseInference template.

        Attributes:
            model_path (str): Path to the saved model file.
            generic_field_key (str): Key of the generic field where data is stored.
        """

        model_path: str
        generic_field_key: str

    def __init__(self, attributes: TemplateAttributes) -> None:
        super().__init__(attributes)
        # Load the model eagerly so a bad path fails at construction time.
        self.model = self.load_model(self.attributes.model_path)

    def get_data(self, container: DataContainer) -> Any:
        """Return the payload stored under ``generic_field_key``.

        Args:
            container (DataContainer): The data container with the data.

        Returns:
            Any: The data from the generic field.
        """
        return self._get_generic_data(container, self.attributes.generic_field_key)

    @staticmethod
    def data_is_valid(data: Any) -> bool:
        """Tell whether the payload can be fed to the model.

        Args:
            data (Any): The data to validate.

        Returns:
            bool: True when data is present, False otherwise.
        """
        return data is not None

    def preprocess_data(self, data: Any) -> Any:
        """Drop the ``target`` column, when present, before inference.

        Subclasses may override this to implement specific preprocessing
        steps.

        Args:
            data (Any): The data to preprocess.

        Returns:
            Any: The preprocessed data.
        """
        try:
            data.pop("target")
        except (KeyError, IndexError):
            self.logger.info("No target column")
        return data

    @abstractmethod
    def load_model(self, model_path: str) -> Any:
        """Load a trained model from disk.

        Implementations define how the model is deserialized.

        Args:
            model_path (str): Path to the saved model file.

        Returns:
            Any: The loaded model.
        """

    def predict(self, data: np.ndarray) -> np.ndarray:
        """Run the loaded model on the given data.

        Args:
            data (np.ndarray): The data to make predictions on.

        Returns:
            np.ndarray: The model's predictions.
        """
        return self.model.predict(data)

    def execute(self, container: DataContainer) -> DataContainer:
        """Fetch, validate, preprocess, predict, and store the results.

        Args:
            container (DataContainer): The data container with the input data.

        Returns:
            DataContainer: The container with added predictions.
        """
        payload = self.get_data(container)

        if not self.data_is_valid(payload):
            self.logger.warning("Invalid or missing data")
            return container

        preds = self.predict(self.preprocess_data(payload))
        self._set_generic_data(container, preds)
        return container
@@ -0,0 +1,261 @@
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ from abc import abstractmethod
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ from sinapsis_core.data_containers.data_packet import DataContainer
8
+ from sinapsis_core.template_base import TemplateAttributes
9
+ from sinapsis_core.template_base.dynamic_template import BaseDynamicWrapperTemplate
10
+ from sinapsis_data_readers.templates.datasets_readers.dataset_splitter import TabularDatasetSplit
11
+ from sklearn.base import is_classifier, is_regressor
12
+ from sklearn.metrics import (
13
+ accuracy_score,
14
+ f1_score,
15
+ mean_absolute_error,
16
+ mean_squared_error,
17
+ precision_score,
18
+ r2_score,
19
+ recall_score,
20
+ )
21
+
22
+ from sinapsis_data_analysis.helpers.model_metrics import (
23
+ ModelMetrics,
24
+ ModelPredictionResults,
25
+ )
26
+
27
+
28
class MLBaseAttributes(TemplateAttributes):
    """Base attributes for machine learning model templates.

    Attributes:
        generic_field_key (str): Key of the generic field where datasets are stored.
        model_save_path (str): Path where the trained model will be saved.
    """

    generic_field_key: str
    model_save_path: str
38
+
39
+
40
class MLBaseTraining(BaseDynamicWrapperTemplate):
    """Abstract base template for training machine learning models.

    Provides common functionality for loading data, training models,
    making predictions, calculating metrics, and saving models.
    Subclasses implement `_save_model_implementation` to define how the
    trained model is serialized to disk.
    """

    AttributesBaseModel = MLBaseAttributes

    def __init__(self, attributes: TemplateAttributes) -> None:
        """Initialize the MLBase template.

        Args:
            attributes (TemplateAttributes): The attributes for this template.
        """
        super().__init__(attributes)
        self.model = self.wrapped_callable
        self.trained_model = None

    def get_dataset(self, container: DataContainer) -> Any:
        """Get the dataset from the data container.

        Args:
            container (DataContainer): The data container with the dataset.

        Returns:
            Any: The dataset from the generic field.
        """
        return self._get_generic_data(container, self.attributes.generic_field_key)

    @staticmethod
    def dataset_is_valid(dataset: Any) -> bool:
        """Check if the dataset is valid.

        Args:
            dataset (Any): The dataset to validate.

        Returns:
            bool: True if the dataset is valid, False otherwise.
        """
        return dataset is not None

    def process_dataset(self, dataset: TabularDatasetSplit) -> tuple | None:
        """Extract x_train, y_train, x_test, y_test from the dataset.

        Args:
            dataset (TabularDatasetSplit): The dataset to process.

        Returns:
            tuple | None: A tuple containing (x_train, y_train, x_test, y_test)
                or None if the dataset doesn't have the expected attributes.
        """
        try:
            return dataset.x_train, dataset.y_train, dataset.x_test, dataset.y_test
        except AttributeError:
            self.logger.warning("Dataset doesn't have the expected attributes")
            return None

    def train_model(self, x_train: Any, y_train: Any) -> None:
        """Train the model using the training data.

        Args:
            x_train (Any): The training features.
            y_train (Any): The training targets.
        """
        self.trained_model = self.model.fit(x_train, y_train)

    @staticmethod
    def calculate_classification_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> ModelMetrics:
        """Calculate metrics specific to classification models.

        Args:
            y_true (np.ndarray): The ground truth labels.
            y_pred (np.ndarray): The predicted labels.

        Returns:
            ModelMetrics: Object containing classification metrics.
        """
        metrics = ModelMetrics()
        metrics.accuracy = float(accuracy_score(y_true, y_pred))
        # Weighted averaging accounts for class imbalance; zero_division=0
        # avoids warnings/NaNs for classes that receive no predictions.
        metrics.precision = float(precision_score(y_true, y_pred, average="weighted", zero_division=0))
        metrics.recall = float(recall_score(y_true, y_pred, average="weighted", zero_division=0))
        metrics.f1_score = float(f1_score(y_true, y_pred, average="weighted", zero_division=0))
        return metrics

    @staticmethod
    def calculate_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> ModelMetrics:
        """Calculate metrics specific to regression models.

        Args:
            y_true (np.ndarray): The ground truth values.
            y_pred (np.ndarray): The predicted values.

        Returns:
            ModelMetrics: Object containing regression metrics.
        """
        metrics = ModelMetrics()
        metrics.r2_score = float(r2_score(y_true, y_pred))
        metrics.mean_squared_error = float(mean_squared_error(y_true, y_pred))
        metrics.mean_absolute_error = float(mean_absolute_error(y_true, y_pred))
        return metrics

    def calculate_metrics(self, y_true: np.ndarray, y_pred: np.ndarray) -> ModelMetrics:
        """Calculate metrics appropriate to the trained model's type.

        Detects whether the model is a classifier or regressor and calculates
        the corresponding metrics. Returns an empty ModelMetrics when no
        model has been trained or its type is neither.

        Args:
            y_true (np.ndarray): The ground truth values/labels.
            y_pred (np.ndarray): The predicted values/labels.

        Returns:
            ModelMetrics: Object containing the appropriate metrics.
        """
        if self.trained_model is not None:
            if is_classifier(self.trained_model):
                return self.calculate_classification_metrics(y_true, y_pred)
            if is_regressor(self.trained_model):
                return self.calculate_regression_metrics(y_true, y_pred)
        return ModelMetrics()

    def generate_predictions(self, x_test: np.ndarray, y_test: np.ndarray) -> ModelPredictionResults | None:
        """Predict on the test data and compute the matching metrics.

        Args:
            x_test (np.ndarray): The test features.
            y_test (np.ndarray): The test targets.

        Returns:
            ModelPredictionResults | None: Predictions and metrics, or None
                when no model has been trained yet.
        """
        if self.trained_model is not None:
            predictions = self.trained_model.predict(x_test)
            metrics = self.calculate_metrics(y_test, predictions)
            return ModelPredictionResults(predictions=predictions, metrics=metrics)
        return None

    def handle_model_training(self, processed_data: tuple) -> ModelPredictionResults | None:
        """Handle the model training and prediction workflow.

        Extracts data from the processed dataset, trains the model,
        and generates predictions.

        Args:
            processed_data (tuple): Tuple containing training and testing data.

        Returns:
            ModelPredictionResults | None: Object containing predictions and metrics.
        """
        x_train, y_train, x_test, y_test = processed_data
        self.train_model(x_train, y_train)
        return self.generate_predictions(x_test, y_test)

    def save_model(self) -> None:
        """Persist the trained model to ``attributes.model_save_path``.

        Creates the necessary directories and delegates serialization to
        `_save_model_implementation`. Failures are logged rather than raised
        so a save error does not abort the pipeline.
        """
        if self.trained_model is None:
            self.logger.error("No model to save")
            return

        try:
            save_dir = os.path.dirname(self.attributes.model_save_path)
            # A bare filename has no directory component; os.makedirs("")
            # would raise FileNotFoundError, so only create directories when
            # a directory part actually exists.
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            self._save_model_implementation()
            self.logger.info(f"Model saved at {self.attributes.model_save_path}")
        except (OSError, MemoryError, TypeError) as e:
            # OSError covers filesystem failures (permissions, invalid paths)
            # that the original MemoryError/TypeError pair did not handle.
            self.logger.error(f"Error saving model: {e}")

    @abstractmethod
    def _save_model_implementation(self) -> None:
        """Save the trained model using an implementation-specific method.

        This abstract method should be implemented by subclasses to define
        how the model should be serialized and saved to disk.
        """

    def execute(self, container: DataContainer) -> DataContainer:
        """Run the full train/evaluate/store workflow.

        Gets the dataset, validates it, processes it, trains the model,
        generates predictions, and stores the results.

        Args:
            container (DataContainer): The data container with the dataset.

        Returns:
            DataContainer: The container with added predictions and metrics.
        """
        dataset = self.get_dataset(container)

        if not self.dataset_is_valid(dataset):
            self.logger.warning("Invalid or missing dataset")
            return container

        processed_data = self.process_dataset(dataset)

        if processed_data is None:
            self.logger.warning("Failed to process dataset")
            return container

        results = self.handle_model_training(processed_data)

        if results is not None:
            self._set_generic_data(container, results)
            self.save_model()

        return container
@@ -0,0 +1,28 @@
1
+ # -*- coding: utf-8 -*-
2
+ from typing import Any
3
+
4
+ import joblib
5
+
6
+ from sinapsis_data_analysis.templates.ml_base_inference import MLBaseInference
7
+
8
+
9
class SKLearnInference(MLBaseInference):
    """Template for inference using sklearn models.

    Loads a previously saved sklearn model with joblib and uses it to
    make predictions on new data.
    """

    CATEGORY = "SKLearn"

    def load_model(self, model_path: str) -> Any:
        """Deserialize an sklearn model saved with joblib.

        Args:
            model_path (str): Path to the saved model file.

        Returns:
            Any: The loaded sklearn model.
        """
        loaded_model = joblib.load(model_path)
        return loaded_model
@@ -0,0 +1,160 @@
1
+ # -*- coding: utf-8 -*-
2
+ from typing import cast
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from pydantic import BaseModel, ConfigDict
7
+ from sinapsis_core.data_containers.data_packet import DataContainer
8
+ from sinapsis_core.template_base import Template, TemplateAttributes
9
+ from sinapsis_core.template_base.dynamic_template import (
10
+ BaseDynamicWrapperTemplate,
11
+ WrapperEntryConfig,
12
+ )
13
+ from sinapsis_core.template_base.dynamic_template_factory import make_dynamic_template
14
+ from sinapsis_core.utils.env_var_keys import SINAPSIS_BUILD_DOCS
15
+ from sinapsis_data_readers.templates.datasets_readers.dataset_splitter import (
16
+ TabularDatasetSplit,
17
+ )
18
+ from sklearn import manifold
19
+
20
+
21
class ManifoldResults(BaseModel):
    """Class to store the results of manifold learning.

    Attributes:
        labels (np.ndarray | list): The original labels.
        x_transformed (np.ndarray): The data after dimensionality reduction.
    """

    labels: np.ndarray | list
    x_transformed: np.ndarray

    # numpy arrays are not pydantic-native types, so arbitrary types
    # must be explicitly allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)
33
+
34
+
35
class SKLearnManifold(BaseDynamicWrapperTemplate):
    """Dynamic wrapper around sklearn's manifold module.

    Exposes dimensionality reduction techniques such as TSNE, MDS and
    Isomap as templates.
    """

    WrapperEntry = WrapperEntryConfig(
        wrapped_object=manifold,
        signature_from_doc_string=True,
        exclude_module_atts=["locally_linear_embedding"],
        force_init_as_method=False,
    )

    CATEGORY = "SKLearn"

    class AttributesBaseModel(TemplateAttributes):
        """Attributes for the SKLearnManifold template.

        Attributes:
            generic_field_key (str): Key of the generic field
                where the input data is stored.
        """

        generic_field_key: str

    def __init__(self, attributes: TemplateAttributes) -> None:
        super().__init__(attributes)
        # The wrapped sklearn manifold estimator produced by the factory.
        self.manifold_model = self.wrapped_callable

    @staticmethod
    def reshape_arrays(feature_arrays: pd.DataFrame) -> np.ndarray:
        """Flatten feature arrays into a 2D matrix for manifold learning.

        Args:
            feature_arrays (pd.DataFrame): The feature arrays to reshape.

        Returns:
            np.ndarray: Array of shape (n_samples, n_flattened_features).
        """
        matrix = np.array(feature_arrays)
        return matrix.reshape(matrix.shape[0], -1)

    def get_dataset(self, container: DataContainer) -> TabularDatasetSplit | None:
        """Fetch the dataset stored under ``generic_field_key``.

        Args:
            container (DataContainer): The data container with the dataset.

        Returns:
            TabularDatasetSplit | None: The dataset from the generic field,
                or None when absent or empty.
        """
        dataset = cast(
            TabularDatasetSplit,
            self._get_generic_data(container, self.attributes.generic_field_key),
        )
        return dataset if dataset else None

    def process_dataset(self, dataset: TabularDatasetSplit) -> ManifoldResults | None:
        """Apply the manifold transformation to the training split.

        Args:
            dataset (TabularDatasetSplit): The dataset to process.

        Returns:
            ManifoldResults | None: Results of the manifold transformation,
                or None if the training data is missing or empty.
        """
        features, labels = dataset.x_train, dataset.y_train

        # x_train is presumably a pandas DataFrame (has `.empty`) — matches
        # the reshape_arrays annotation.
        if features is None or features.empty:
            return None

        flattened = self.reshape_arrays(features)
        embedded = self.manifold_model.fit_transform(flattened)
        return ManifoldResults(labels=labels, x_transformed=embedded)

    def execute(self, container: DataContainer) -> DataContainer:
        """Run manifold learning on the container's dataset and store results.

        Args:
            container (DataContainer): The data container with the dataset.

        Returns:
            DataContainer: The container with added manifold learning results.
        """
        dataset = self.get_dataset(container)
        if not dataset:
            self.logger.warning("There is no dataset to process")
            return container

        results = self.process_dataset(dataset)
        if results is not None:
            self._set_generic_data(container, results)

        return container
+ return container
141
+
142
+
143
def __getattr__(name: str) -> Template:
    """
    Only create a template if it's imported; this avoids creating all the
    base models for all templates and potential import errors due to not
    available packages.
    """
    if name not in SKLearnManifold.WrapperEntry.module_att_names:
        raise AttributeError(f"template `{name}` not found in {__name__}")
    return make_dynamic_template(name, SKLearnManifold)
151
+
152
+
153
__all__ = SKLearnManifold.WrapperEntry.module_att_names


if SINAPSIS_BUILD_DOCS:
    # Materialize every dynamic template so the docs builder can see them
    # as module-level names.
    dynamic_templates = [__getattr__(template_name) for template_name in __all__]
    for template in dynamic_templates:
        globals()[template.__name__] = template
    # `template` only exists if the loop ran at least once; deleting it
    # unconditionally raised NameError when __all__ was empty.
    if dynamic_templates:
        del template