ezyml-2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ezyml/__init__.py +9 -0
- ezyml/cli.py +102 -0
- ezyml/compiler/__init__.py +1 -0
- ezyml/compiler/compile.py +137 -0
- ezyml/core.py +1005 -0
- ezyml/deploy/__init__.py +5 -0
- ezyml/deploy/docker.py +21 -0
- ezyml/deploy/fastapi.py +30 -0
- ezyml/deploy/k8s.py +125 -0
- ezyml/deploy/openapi.py +19 -0
- ezyml/deploy/streamlit.py +205 -0
- ezyml/devx/__init___.py +1 -0
- ezyml/devx/doctor.py +7 -0
- ezyml/devx/init.py +6 -0
- ezyml/eda/__init__.py +0 -0
- ezyml/eda/auto_eda.py +22 -0
- ezyml/evaluation/__init__.py +0 -0
- ezyml/evaluation/evaluator.py +43 -0
- ezyml/evaluation/metrics.py +25 -0
- ezyml/evaluation/plots.py +23 -0
- ezyml/explain/__init__.py +0 -0
- ezyml/explain/learner.py +12 -0
- ezyml/monitoring/__init__.py +0 -0
- ezyml/monitoring/drift.py +9 -0
- ezyml/monitoring/fingerprint.py +8 -0
- ezyml/pipeline/__init__.py +0 -0
- ezyml/pipeline/loader.py +84 -0
- ezyml/pipeline/visualize.py +9 -0
- ezyml/training/__init__.py +0 -0
- ezyml/training/tuner.py +6 -0
- ezyml-2.dist-info/METADATA +341 -0
- ezyml-2.dist-info/RECORD +36 -0
- ezyml-2.dist-info/WHEEL +5 -0
- ezyml-2.dist-info/entry_points.txt +2 -0
- ezyml-2.dist-info/licenses/LICENSE +21 -0
- ezyml-2.dist-info/top_level.txt +1 -0
ezyml/core.py
ADDED
@@ -0,0 +1,1005 @@
# [file lines 1-658 omitted: two commented-out legacy revisions of this
#  module (the original ezyml/ezyml.py draft and an intermediate core.py
#  rewrite) that near-duplicate the live code below.]

# ezyml/core.py

import pandas as pd
import numpy as np
import pickle
import json

# ======================================================
# PREPROCESSING
# ======================================================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ======================================================
# MODELS
# ======================================================
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    ExtraTreesClassifier
)
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import xgboost as xgb

# ======================================================
# METRICS
# ======================================================
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    silhouette_score
)

# ======================================================
# MODEL REGISTRIES
# ======================================================
CLASSIFICATION_MODELS = {
    "logistic_regression": LogisticRegression,
    "random_forest": RandomForestClassifier,
    "xgboost": xgb.XGBClassifier,
    "svm": SVC,
    "naive_bayes": GaussianNB,
    "gradient_boosting": GradientBoostingClassifier,
    "extra_trees": ExtraTreesClassifier,
    "knn": KNeighborsClassifier,
}

REGRESSION_MODELS = {
    "linear_regression": LinearRegression,
    "ridge": Ridge,
    "lasso": Lasso,
    "elasticnet": ElasticNet,
    "random_forest": RandomForestRegressor,
    "xgboost": xgb.XGBRegressor,
    "svr": SVR,
    "gradient_boosting": GradientBoostingRegressor,
}

CLUSTERING_MODELS = {
    "kmeans": KMeans,
    "dbscan": DBSCAN,
    "agglo": AgglomerativeClustering,
}

DIM_REDUCTION_MODELS = {
    "pca": PCA,
    "tsne": TSNE,
}
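
# Example (illustrative, not part of the shipped module): the registries map
# string names to estimator classes, so a model choice resolves like this:
#
#     ModelCls = CLASSIFICATION_MODELS["xgboost"]   # -> xgb.XGBClassifier
#     clf = ModelCls()                              # instantiated with defaults
#
# EZTrainer.train() below performs exactly this lookup, passing random_state
# only when the estimator accepts it.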

# ======================================================
# EZTRAINER
# ======================================================
class EZTrainer:
    """
    Core trainer class used by:
    - CLI (train / reduce)
    - Pipeline
    - Compiler (compile)
    """

    def __init__(
        self,
        data,
        target=None,
        model="random_forest",
        task="auto",
        test_size=0.2,
        scale=True,
        n_components=None,
        random_state=42
    ):
        self.target = target
        self.model_name = model
        self.task = task
        self.test_size = test_size
        self.scale = scale
        self.n_components = n_components
        self.random_state = random_state

        # Load data
        self.df = self._load_data(data)
        self._auto_detect_task()

        # Data containers
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # ===== REQUIRED CONTRACT ATTRIBUTES =====
        self.pipeline = None          # full sklearn pipeline
        self.model = None             # trained estimator only
        self.y_pred = None            # predictions
        self.y_prob = None            # probabilities (if any)
        self.report = {}              # metrics
        self.transformed_data = None

    # ==================================================
    # INTERNAL HELPERS
    # ==================================================
    def _load_data(self, data):
        if isinstance(data, str):
            print(f"Loading data from {data}...")
            return pd.read_csv(data)
        elif isinstance(data, pd.DataFrame):
            print("Using provided DataFrame.")
            return data.copy()
        else:
            raise TypeError("Data must be a CSV path or pandas DataFrame.")

    def _auto_detect_task(self):
        if self.task != "auto":
            print(f"Task specified as: {self.task}")
            return

        if self.target:
            if self.target not in self.df.columns:
                raise ValueError(f"Target column '{self.target}' not found.")

            dtype = self.df[self.target].dtype
            uniq = self.df[self.target].nunique()

            # Heuristic: a numeric target with many distinct values is
            # treated as regression; otherwise classification.
            if pd.api.types.is_numeric_dtype(dtype) and uniq > 20:
                self.task = "regression"
            else:
                self.task = "classification"

        elif self.model_name in CLUSTERING_MODELS:
            self.task = "clustering"
        elif self.model_name in DIM_REDUCTION_MODELS:
            self.task = "dim_reduction"
        else:
            raise ValueError("Could not auto-detect task.")

        print(f"Auto-detected task as: {self.task}")

    def _get_preprocessor(self):
        numerical = self.X.select_dtypes(include=np.number).columns.tolist()
        categorical = self.X.select_dtypes(include=["object", "category"]).columns.tolist()

        print(f"Identified {len(numerical)} numerical features: {numerical}")
        print(f"Identified {len(categorical)} categorical features: {categorical}")

        num_steps = [("imputer", SimpleImputer(strategy="median"))]
        if self.scale:
            num_steps.append(("scaler", StandardScaler()))

        num_pipe = Pipeline(num_steps)
        cat_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])

        return ColumnTransformer([
            ("num", num_pipe, numerical),
            ("cat", cat_pipe, categorical)
        ])
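
    # Example (illustrative, not part of the shipped module): given a
    # hypothetical frame
    #
    #     df = pd.DataFrame({"age": [31, None, 45], "city": ["NY", "LA", None]})
    #
    # the returned ColumnTransformer median-imputes "age" (and scales it when
    # scale=True), while "city" is most-frequent-imputed and one-hot encoded,
    # with categories unseen at fit time ignored at predict time.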

    # ==================================================
    # METRICS
    # ==================================================
    def _calculate_metrics(self):
        print("Calculating metrics...")

        if self.task == "classification":
            self.report = {
                "accuracy": accuracy_score(self.y_test, self.y_pred),
                "f1_score": f1_score(self.y_test, self.y_pred, average="weighted"),
                "confusion_matrix": confusion_matrix(self.y_test, self.y_pred).tolist()
            }

            if self.y_prob is not None:
                try:
                    if self.y_prob.ndim == 1:
                        # Binary: y_prob holds P(class 1)
                        self.report["roc_auc"] = roc_auc_score(self.y_test, self.y_prob)
                    else:
                        # Multi-class: one-vs-rest averaging
                        self.report["roc_auc"] = roc_auc_score(
                            self.y_test, self.y_prob, multi_class="ovr"
                        )
                except Exception:
                    self.report["roc_auc"] = None

        elif self.task == "regression":
            self.report = {
                "r2": r2_score(self.y_test, self.y_pred),
                "mae": mean_absolute_error(self.y_test, self.y_pred),
                "rmse": np.sqrt(mean_squared_error(self.y_test, self.y_pred))
            }

        elif self.task == "clustering":
            labels = self.pipeline.named_steps["model"].labels_
            self.report = {
                "n_clusters": len(set(labels)),
                # Silhouette score requires at least 2 clusters
                "silhouette_score": silhouette_score(self.X, labels)
                if len(set(labels)) > 1 else None
            }

        print("Metrics report:")
        print(json.dumps(self.report, indent=4))
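
    # Example (illustrative, not part of the shipped module): a report from a
    # hypothetical binary classification run has this shape:
    #
    #     {
    #         "accuracy": 0.91,
    #         "f1_score": 0.90,
    #         "confusion_matrix": [[50, 4], [5, 41]],
    #         "roc_auc": 0.95
    #     }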

    # ==================================================
    # TRAIN
    # ==================================================
    def train(self):
        print(f"\n--- Starting Training for Task: {self.task.upper()} ---")

        if self.task in ["classification", "regression"]:
            self.X = self.df.drop(columns=[self.target])
            self.y = self.df[self.target]

            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                self.X,
                self.y,
                test_size=self.test_size,
                random_state=self.random_state
            )

            preprocessor = self._get_preprocessor()
            model_map = CLASSIFICATION_MODELS if self.task == "classification" else REGRESSION_MODELS

            if self.model_name not in model_map:
                raise ValueError(f"Model '{self.model_name}' not supported.")

            ModelCls = model_map[self.model_name]
            model = (
                ModelCls(random_state=self.random_state)
                if "random_state" in ModelCls().get_params()
                else ModelCls()
            )

            self.pipeline = Pipeline([
                ("preprocessor", preprocessor),
                ("model", model)
            ])

            print(f"Training {self.model_name} model...")
            self.pipeline.fit(self.X_train, self.y_train)

            # ===== REQUIRED EXPORTS =====
            self.model = self.pipeline.named_steps["model"]
            self.y_pred = self.pipeline.predict(self.X_test)

            if hasattr(self.model, "predict_proba"):
                probs = self.pipeline.predict_proba(self.X_test)
                self.y_prob = probs[:, 1] if probs.shape[1] == 2 else probs
            else:
                self.y_prob = None

            self._calculate_metrics()

        elif self.task == "clustering":
            self.X = self.df.copy()
            preprocessor = self._get_preprocessor()
            model = CLUSTERING_MODELS[self.model_name]()

            self.pipeline = Pipeline([
                ("preprocessor", preprocessor),
                ("model", model)
            ])

            self.pipeline.fit(self.X)
            self.model = model
            self._calculate_metrics()

        elif self.task == "dim_reduction":
            self.X = self.df.copy()
            preprocessor = self._get_preprocessor()
            ModelCls = DIM_REDUCTION_MODELS[self.model_name]

            model = (
                ModelCls(n_components=self.n_components, random_state=self.random_state)
                if self.n_components else ModelCls(random_state=self.random_state)
            )

            self.pipeline = Pipeline([
                ("preprocessor", preprocessor),
                ("model", model)
            ])

            self.transformed_data = self.pipeline.fit_transform(self.X)
            self.model = model
            print(f"Data transformed into {self.transformed_data.shape[1]} dimensions.")

        else:
            raise ValueError(f"Task '{self.task}' not supported.")

        print("--- Training Complete ---")
        return self
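
    # Example (illustrative, not part of the shipped module): train() returns
    # self, so a hypothetical end-to-end run chains naturally:
    #
    #     EZTrainer("data.csv", target="price", model="xgboost").train().save_model()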

    # ==================================================
    # UTILITIES
    # ==================================================
    def predict(self, X_new):
        if not self.pipeline:
            raise RuntimeError("Model not trained. Call .train() first.")

        if isinstance(X_new, str):
            X_new = pd.read_csv(X_new)
        return self.pipeline.predict(X_new)

    def save_model(self, path="model.pkl"):
        with open(path, "wb") as f:
            pickle.dump(self.pipeline, f)
        print(f"Model saved successfully to {path}")

    def save_report(self, path="report.json"):
        with open(path, "w") as f:
            json.dump(self.report, f, indent=4)
        print(f"Report saved successfully to {path}")

    def save_transformed(self, path="transformed_data.csv"):
        if self.transformed_data is None:
            raise RuntimeError("No transformed data available.")
        pd.DataFrame(self.transformed_data).to_csv(path, index=False)
        print(f"Transformed data saved to {path}")
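
A minimal usage sketch for the module above (the CSV paths and column name
are hypothetical, not shipped with the package):

    from ezyml.core import EZTrainer

    trainer = EZTrainer("housing.csv", target="price", model="random_forest")
    trainer.train()          # auto-detects regression, fits the pipeline
    trainer.save_report()    # writes r2 / mae / rmse to report.json
    preds = trainer.predict("new_houses.csv")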