ezyml-2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ezyml/core.py ADDED
@@ -0,0 +1,1005 @@
+ # ezyml/core.py
+
+ import pandas as pd
+ import numpy as np
+ import pickle
+ import json
+
+ # ======================================================
+ # PREPROCESSING
+ # ======================================================
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
+ from sklearn.compose import ColumnTransformer
+ from sklearn.pipeline import Pipeline
+ from sklearn.impute import SimpleImputer
+
+ # ======================================================
+ # MODELS
+ # ======================================================
+ from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
+ from sklearn.ensemble import (
+     RandomForestClassifier,
+     RandomForestRegressor,
+     GradientBoostingClassifier,
+     GradientBoostingRegressor,
+     ExtraTreesClassifier
+ )
+ from sklearn.svm import SVC, SVR
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+ from sklearn.decomposition import PCA
+ from sklearn.manifold import TSNE
+ import xgboost as xgb
+
+ # ======================================================
+ # METRICS
+ # ======================================================
+ from sklearn.metrics import (
+     accuracy_score,
+     f1_score,
+     roc_auc_score,
+     confusion_matrix,
+     mean_absolute_error,
+     mean_squared_error,
+     r2_score,
+     silhouette_score
+ )
+
+ # ======================================================
+ # MODEL REGISTRIES
+ # ======================================================
+ CLASSIFICATION_MODELS = {
+     "logistic_regression": LogisticRegression,
+     "random_forest": RandomForestClassifier,
+     "xgboost": xgb.XGBClassifier,
+     "svm": SVC,
+     "naive_bayes": GaussianNB,
+     "gradient_boosting": GradientBoostingClassifier,
+     "extra_trees": ExtraTreesClassifier,
+     "knn": KNeighborsClassifier,
+ }
+
+ REGRESSION_MODELS = {
+     "linear_regression": LinearRegression,
+     "ridge": Ridge,
+     "lasso": Lasso,
+     "elasticnet": ElasticNet,
+     "random_forest": RandomForestRegressor,
+     "xgboost": xgb.XGBRegressor,
+     "svr": SVR,
+     "gradient_boosting": GradientBoostingRegressor,
+ }
+
+ CLUSTERING_MODELS = {
+     "kmeans": KMeans,
+     "dbscan": DBSCAN,
+     "agglo": AgglomerativeClustering,
+ }
+
+ DIM_REDUCTION_MODELS = {
+     "pca": PCA,
+     "tsne": TSNE,
+ }
+
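+ # The registries above map CLI-friendly names to estimator classes. A minimal
+ # sketch of how train() resolves a name (illustrative only, not executed on import):
+ #
+ #     ModelCls = CLASSIFICATION_MODELS["random_forest"]   # -> RandomForestClassifier
+ #     model = ModelCls(random_state=42)                   # instantiated lazily in train()
+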
+ # ======================================================
+ # EZTRAINER
+ # ======================================================
+ class EZTrainer:
+     """
+     Core trainer class used by:
+       - CLI (train / reduce)
+       - Pipeline
+       - Compiler (compile)
+     """
+
+     def __init__(
+         self,
+         data,
+         target=None,
+         model="random_forest",
+         task="auto",
+         test_size=0.2,
+         scale=True,
+         n_components=None,
+         random_state=42
+     ):
+         """
+         Initialize the EZTrainer.
+
+         Args:
+             data (str or pd.DataFrame): Path to a CSV file or a pandas DataFrame.
+             target (str, optional): Name of the target column. Defaults to None.
+             model (str, optional): Model to use. Defaults to "random_forest".
+             task (str, optional): One of 'auto', 'classification', 'regression',
+                 'clustering', 'dim_reduction'. Defaults to "auto".
+             test_size (float, optional): Proportion of data held out for the test set. Defaults to 0.2.
+             scale (bool, optional): Whether to scale numerical features. Defaults to True.
+             n_components (int, optional): Number of components for dimensionality reduction. Defaults to None.
+             random_state (int, optional): Random state for reproducibility. Defaults to 42.
+         """
+         self.target = target
+         self.model_name = model
+         self.task = task
+         self.test_size = test_size
+         self.scale = scale
+         self.n_components = n_components
+         self.random_state = random_state
+
+         # Load data
+         self.df = self._load_data(data)
+         self._auto_detect_task()
+
+         # Data containers
+         self.X = None
+         self.y = None
+         self.X_train = None
+         self.X_test = None
+         self.y_train = None
+         self.y_test = None
+
+         # ===== REQUIRED CONTRACT ATTRIBUTES =====
+         self.pipeline = None          # full sklearn pipeline
+         self.model = None             # trained estimator only
+         self.y_pred = None            # predictions on the held-out test set
+         self.y_prob = None            # predicted probabilities (if available)
+         self.report = {}              # metrics
+         self.transformed_data = None
+
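+     # The attributes above are the downstream contract: after .train(),
+     # self.model, self.y_pred, self.y_prob, and self.report are populated
+     # for supervised tasks (see the REQUIRED EXPORTS block in train()).
+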
+     # ==================================================
+     # INTERNAL HELPERS
+     # ==================================================
+     def _load_data(self, data):
+         if isinstance(data, str):
+             print(f"Loading data from {data}...")
+             return pd.read_csv(data)
+         elif isinstance(data, pd.DataFrame):
+             print("Using provided DataFrame.")
+             return data.copy()
+         else:
+             raise TypeError("Data must be a CSV path or a pandas DataFrame.")
+
+     def _auto_detect_task(self):
+         if self.task != "auto":
+             print(f"Task specified as: {self.task}")
+             return
+
+         if self.target:
+             if self.target not in self.df.columns:
+                 raise ValueError(f"Target column '{self.target}' not found.")
+
+             dtype = self.df[self.target].dtype
+             uniq = self.df[self.target].nunique()
+
+             # Heuristic: a numeric target with many distinct values is a regression target.
+             if pd.api.types.is_numeric_dtype(dtype) and uniq > 20:
+                 self.task = "regression"
+             else:
+                 self.task = "classification"
+
+         elif self.model_name in CLUSTERING_MODELS:
+             self.task = "clustering"
+         elif self.model_name in DIM_REDUCTION_MODELS:
+             self.task = "dim_reduction"
+         else:
+             raise ValueError("Could not auto-detect task. Please specify the 'task' parameter.")
+
+         print(f"Auto-detected task as: {self.task}")
+
+     def _get_preprocessor(self):
+         numerical = self.X.select_dtypes(include=np.number).columns.tolist()
+         categorical = self.X.select_dtypes(include=["object", "category"]).columns.tolist()
+
+         print(f"Identified {len(numerical)} numerical features: {numerical}")
+         print(f"Identified {len(categorical)} categorical features: {categorical}")
+
+         # Median-impute numerics, then optionally standardize them.
+         num_steps = [("imputer", SimpleImputer(strategy="median"))]
+         if self.scale:
+             num_steps.append(("scaler", StandardScaler()))
+
+         num_pipe = Pipeline(num_steps)
+         # Mode-impute categoricals, then one-hot encode them.
+         cat_pipe = Pipeline([
+             ("imputer", SimpleImputer(strategy="most_frequent")),
+             ("onehot", OneHotEncoder(handle_unknown="ignore"))
+         ])
+
+         return ColumnTransformer([
+             ("num", num_pipe, numerical),
+             ("cat", cat_pipe, categorical)
+         ])
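+
+     # A minimal sketch of the resulting feature layout (column names are
+     # hypothetical): given age=float and city=str, the fitted transformer
+     # emits [scaled(age), onehot(city_*)]; a city unseen during fit encodes
+     # as all zeros because handle_unknown="ignore".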
+
+     # ==================================================
+     # METRICS
+     # ==================================================
+     def _calculate_metrics(self):
+         print("Calculating metrics...")
+
+         if self.task == "classification":
+             self.report = {
+                 "accuracy": accuracy_score(self.y_test, self.y_pred),
+                 "f1_score": f1_score(self.y_test, self.y_pred, average="weighted"),
+                 "confusion_matrix": confusion_matrix(self.y_test, self.y_pred).tolist()
+             }
+
+             if self.y_prob is not None:
+                 try:
+                     if self.y_prob.ndim == 1:
+                         # Binary: probabilities of the positive class.
+                         self.report["roc_auc"] = roc_auc_score(self.y_test, self.y_prob)
+                     else:
+                         # Multi-class: one-vs-rest averaging.
+                         self.report["roc_auc"] = roc_auc_score(
+                             self.y_test, self.y_prob, multi_class="ovr"
+                         )
+                 except Exception:
+                     self.report["roc_auc"] = None
+
+         elif self.task == "regression":
+             self.report = {
+                 "r2": r2_score(self.y_test, self.y_pred),
+                 "mae": mean_absolute_error(self.y_test, self.y_pred),
+                 "rmse": np.sqrt(mean_squared_error(self.y_test, self.y_pred))
+             }
+
+         elif self.task == "clustering":
+             labels = self.pipeline.named_steps["model"].labels_
+             # Score in the feature space the model actually saw, not the raw frame
+             # (silhouette_score cannot handle unencoded categorical columns).
+             X_trans = self.pipeline.named_steps["preprocessor"].transform(self.X)
+             self.report = {
+                 "n_clusters": len(set(labels)),
+                 # Silhouette requires at least 2 clusters.
+                 "silhouette_score": silhouette_score(X_trans, labels)
+                 if len(set(labels)) > 1 else None
+             }
+
+         print("Metrics report:")
+         print(json.dumps(self.report, indent=4))
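+
+     # Example report shape for a binary classification run (numbers are
+     # placeholders, not real results):
+     #
+     #     {"accuracy": 0.93, "f1_score": 0.92,
+     #      "confusion_matrix": [[50, 3], [4, 43]], "roc_auc": 0.97}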
+
+     # ==================================================
+     # TRAIN
+     # ==================================================
+     def train(self):
+         print(f"\n--- Starting Training for Task: {self.task.upper()} ---")
+
+         if self.task in ["classification", "regression"]:
+             self.X = self.df.drop(columns=[self.target])
+             self.y = self.df[self.target]
+
+             self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
+                 self.X,
+                 self.y,
+                 test_size=self.test_size,
+                 random_state=self.random_state
+             )
+
+             preprocessor = self._get_preprocessor()
+             model_map = CLASSIFICATION_MODELS if self.task == "classification" else REGRESSION_MODELS
+
+             if self.model_name not in model_map:
+                 raise ValueError(f"Model '{self.model_name}' not supported for {self.task}.")
+
+             # Pass random_state only to estimators that accept it.
+             ModelCls = model_map[self.model_name]
+             model = (
+                 ModelCls(random_state=self.random_state)
+                 if "random_state" in ModelCls().get_params()
+                 else ModelCls()
+             )
+
+             self.pipeline = Pipeline([
+                 ("preprocessor", preprocessor),
+                 ("model", model)
+             ])
+
+             print(f"Training {self.model_name} model...")
+             self.pipeline.fit(self.X_train, self.y_train)
+
+             # ===== REQUIRED EXPORTS =====
+             self.model = self.pipeline.named_steps["model"]
+             self.y_pred = self.pipeline.predict(self.X_test)
+
+             if hasattr(self.model, "predict_proba"):
+                 probs = self.pipeline.predict_proba(self.X_test)
+                 self.y_prob = probs[:, 1] if probs.shape[1] == 2 else probs
+             else:
+                 self.y_prob = None
+
+             self._calculate_metrics()
+
+         elif self.task == "clustering":
+             self.X = self.df.copy()
+             preprocessor = self._get_preprocessor()
+
+             if self.model_name not in CLUSTERING_MODELS:
+                 raise ValueError(f"Model '{self.model_name}' not supported for clustering.")
+
+             model = CLUSTERING_MODELS[self.model_name]()
+
+             self.pipeline = Pipeline([
+                 ("preprocessor", preprocessor),
+                 ("model", model)
+             ])
+
+             self.pipeline.fit(self.X)
+             self.model = model
+             self._calculate_metrics()
+
+         elif self.task == "dim_reduction":
+             self.X = self.df.copy()
+             preprocessor = self._get_preprocessor()
+
+             if self.model_name not in DIM_REDUCTION_MODELS:
+                 raise ValueError(f"Model '{self.model_name}' not supported for dimensionality reduction.")
+
+             ModelCls = DIM_REDUCTION_MODELS[self.model_name]
+             model = (
+                 ModelCls(n_components=self.n_components, random_state=self.random_state)
+                 if self.n_components else ModelCls(random_state=self.random_state)
+             )
+
+             self.pipeline = Pipeline([
+                 ("preprocessor", preprocessor),
+                 ("model", model)
+             ])
+
+             self.transformed_data = self.pipeline.fit_transform(self.X)
+             self.model = model
+             print(f"Data transformed into {self.transformed_data.shape[1]} dimensions.")
+
+         else:
+             raise ValueError(f"Task '{self.task}' not supported.")
+
+         print("--- Training Complete ---")
+         return self
+
+     # ==================================================
+     # UTILITIES
+     # ==================================================
+     def predict(self, X_new):
+         if not self.pipeline:
+             raise RuntimeError("Model has not been trained yet. Call .train() first.")
+         if isinstance(X_new, str):
+             X_new = pd.read_csv(X_new)
+         return self.pipeline.predict(X_new)
+
+     def save_model(self, path="model.pkl"):
+         if not self.pipeline:
+             raise RuntimeError("No model to save. Call .train() first.")
+         with open(path, "wb") as f:
+             pickle.dump(self.pipeline, f)
+         print(f"Model saved successfully to {path}")
+
+     def save_report(self, path="report.json"):
+         if not self.report:
+             raise RuntimeError("No report to save. Call .train() first.")
+         with open(path, "w") as f:
+             json.dump(self.report, f, indent=4)
+         print(f"Report saved successfully to {path}")
+
+     def save_transformed(self, path="transformed_data.csv"):
+         if self.transformed_data is None:
+             raise RuntimeError("No transformed data available.")
+         pd.DataFrame(self.transformed_data).to_csv(path, index=False)
+         print(f"Transformed data saved to {path}")