adamops-0.1.0-py3-none-any.whl
- adamops/__init__.py +40 -0
- adamops/cli.py +163 -0
- adamops/data/__init__.py +24 -0
- adamops/data/feature_engineering.py +284 -0
- adamops/data/loaders.py +922 -0
- adamops/data/preprocessors.py +227 -0
- adamops/data/splitters.py +218 -0
- adamops/data/validators.py +148 -0
- adamops/deployment/__init__.py +21 -0
- adamops/deployment/api.py +237 -0
- adamops/deployment/cloud.py +191 -0
- adamops/deployment/containerize.py +262 -0
- adamops/deployment/exporters.py +148 -0
- adamops/evaluation/__init__.py +24 -0
- adamops/evaluation/comparison.py +133 -0
- adamops/evaluation/explainability.py +143 -0
- adamops/evaluation/metrics.py +233 -0
- adamops/evaluation/reports.py +165 -0
- adamops/evaluation/visualization.py +238 -0
- adamops/models/__init__.py +21 -0
- adamops/models/automl.py +277 -0
- adamops/models/ensembles.py +228 -0
- adamops/models/modelops.py +308 -0
- adamops/models/registry.py +250 -0
- adamops/monitoring/__init__.py +21 -0
- adamops/monitoring/alerts.py +200 -0
- adamops/monitoring/dashboard.py +117 -0
- adamops/monitoring/drift.py +212 -0
- adamops/monitoring/performance.py +195 -0
- adamops/pipelines/__init__.py +15 -0
- adamops/pipelines/orchestrators.py +183 -0
- adamops/pipelines/workflows.py +212 -0
- adamops/utils/__init__.py +18 -0
- adamops/utils/config.py +457 -0
- adamops/utils/helpers.py +663 -0
- adamops/utils/logging.py +412 -0
- adamops-0.1.0.dist-info/METADATA +310 -0
- adamops-0.1.0.dist-info/RECORD +42 -0
- adamops-0.1.0.dist-info/WHEEL +5 -0
- adamops-0.1.0.dist-info/entry_points.txt +2 -0
- adamops-0.1.0.dist-info/licenses/LICENSE +21 -0
- adamops-0.1.0.dist-info/top_level.txt +1 -0
adamops/models/ensembles.py
@@ -0,0 +1,228 @@
"""
AdamOps Ensemble Models Module

Provides voting, stacking, blending, and weighted averaging ensembles.
"""

from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from sklearn.ensemble import (
    StackingClassifier,
    StackingRegressor,
    VotingClassifier,
    VotingRegressor,
)
from sklearn.linear_model import LogisticRegression, Ridge

from adamops.utils.logging import get_logger
from adamops.models.modelops import CLASSIFICATION_MODELS, REGRESSION_MODELS

logger = get_logger(__name__)


class VotingEnsemble:
    """Voting ensemble for classification or regression."""

    def __init__(self, estimators: List[Tuple[str, Any]], voting: str = "soft",
                 weights: Optional[List[float]] = None, task: str = "classification"):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.task = task

        if task == "classification":
            self.model = VotingClassifier(estimators, voting=voting, weights=weights)
        else:
            # VotingRegressor has no `voting` argument; it always averages
            self.model = VotingRegressor(estimators, weights=weights)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        if self.task == "classification" and hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X)
        raise ValueError(
            "predict_proba requires task='classification' and soft voting"
        )

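A quick usage sketch (editorial example, not part of the wheel; assumes scikit-learn's bundled iris data and default hyperparameters):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from adamops.models.ensembles import VotingEnsemble

    X, y = load_iris(return_X_y=True)
    ensemble = VotingEnsemble(
        estimators=[("lr", LogisticRegression(max_iter=1000)),
                    ("dt", DecisionTreeClassifier(random_state=0))],
        voting="soft",
        task="classification",
    )
    ensemble.fit(X, y)
    print(ensemble.predict_proba(X[:3]).shape)  # (3, 3): one column per class
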
class StackingEnsemble:
    """Stacking ensemble with meta-learner."""

    def __init__(self, estimators: List[Tuple[str, Any]],
                 final_estimator: Optional[Any] = None,
                 task: str = "classification", cv: int = 5):
        self.estimators = estimators
        self.task = task
        self.cv = cv

        if final_estimator is None:
            final_estimator = LogisticRegression() if task == "classification" else Ridge()

        if task == "classification":
            self.model = StackingClassifier(
                estimators, final_estimator=final_estimator, cv=cv
            )
        else:
            self.model = StackingRegressor(
                estimators, final_estimator=final_estimator, cv=cv
            )

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)

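Another editorial sketch (not from the package): stacking two base classifiers under the default logistic meta-learner, with out-of-fold meta-features generated by the internal cv splits:

    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.naive_bayes import GaussianNB
    from adamops.models.ensembles import StackingEnsemble

    X, y = load_breast_cancer(return_X_y=True)
    stack = StackingEnsemble(
        estimators=[("rf", RandomForestClassifier(n_estimators=50, random_state=0)),
                    ("nb", GaussianNB())],
        task="classification",
        cv=3,
    )
    stack.fit(X, y)
    print(stack.predict(X[:5]))
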
class BlendingEnsemble:
    """Blending ensemble (holdout-based stacking)."""

    def __init__(self, estimators: List[Tuple[str, Any]],
                 final_estimator: Optional[Any] = None,
                 task: str = "classification", blend_ratio: float = 0.2):
        self.estimators = estimators
        self.task = task
        self.blend_ratio = blend_ratio
        self.final_estimator = final_estimator or (
            LogisticRegression() if task == "classification" else Ridge()
        )
        self.fitted_estimators = []

    def fit(self, X, y):
        from sklearn.model_selection import train_test_split

        # Hold out a blend split for the meta-learner; stratify for
        # classification so the holdout keeps the class balance
        X_train, X_blend, y_train, y_blend = train_test_split(
            X, y, test_size=self.blend_ratio, random_state=42,
            stratify=y if self.task == "classification" else None,
        )

        # Fit base models on the train split and collect their holdout predictions
        blend_features = []
        self.fitted_estimators = []

        for name, estimator in self.estimators:
            estimator.fit(X_train, y_train)
            self.fitted_estimators.append((name, estimator))

            # Note: [:, 1] assumes binary classification (positive-class probability)
            if self.task == "classification" and hasattr(estimator, "predict_proba"):
                preds = estimator.predict_proba(X_blend)[:, 1]
            else:
                preds = estimator.predict(X_blend)
            blend_features.append(preds)

        # One column of meta-features per base model
        blend_X = np.column_stack(blend_features)

        # Fit meta-learner on the holdout predictions
        self.final_estimator.fit(blend_X, y_blend)

        return self

    def predict(self, X):
        # Recreate the meta-features from the fitted base models
        features = []
        for name, estimator in self.fitted_estimators:
            if self.task == "classification" and hasattr(estimator, "predict_proba"):
                preds = estimator.predict_proba(X)[:, 1]
            else:
                preds = estimator.predict(X)
            features.append(preds)

        meta_X = np.column_stack(features)
        return self.final_estimator.predict(meta_X)

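Blending trades data for speed: unlike stacking's cross-validated out-of-fold predictions, each base model is fit once and the meta-learner only ever sees the holdout split. A sketch under those assumptions (binary target, as the [:, 1] indexing above requires):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from adamops.models.ensembles import BlendingEnsemble

    X, y = make_classification(n_samples=500, random_state=0)
    blend = BlendingEnsemble(
        estimators=[("rf", RandomForestClassifier(n_estimators=50, random_state=0)),
                    ("gb", GradientBoostingClassifier(random_state=0))],
        task="classification",
        blend_ratio=0.25,  # hold out 25% of rows for the meta-learner
    )
    blend.fit(X, y)
    print((blend.predict(X) == y).mean())  # training accuracy, optimistic by design
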
class WeightedAverageEnsemble:
    """Weighted average ensemble."""

    def __init__(self, estimators: List[Tuple[str, Any]],
                 weights: Optional[List[float]] = None,
                 task: str = "classification"):
        self.estimators = estimators
        # Normalize weights so the weighted sum stays on the scale of a
        # single prediction (uniform weights by default)
        raw = weights or [1.0] * len(estimators)
        total = float(sum(raw))
        self.weights = [w / total for w in raw]
        self.task = task
        self.fitted_estimators = []

    def fit(self, X, y):
        self.fitted_estimators = []
        for name, estimator in self.estimators:
            estimator.fit(X, y)
            self.fitted_estimators.append((name, estimator))
        return self

    def predict(self, X):
        predictions = []
        for (name, estimator), weight in zip(self.fitted_estimators, self.weights):
            pred = estimator.predict(X)
            predictions.append(pred * weight)

        weighted_sum = sum(predictions)

        if self.task == "classification":
            # Assumes binary 0/1 labels: threshold the weighted vote at 0.5
            return (weighted_sum > 0.5).astype(int)
        return weighted_sum

    def predict_proba(self, X):
        if self.task != "classification":
            raise ValueError("predict_proba is not available for regression")

        probas = []
        for (name, estimator), weight in zip(self.fitted_estimators, self.weights):
            if not hasattr(estimator, "predict_proba"):
                raise ValueError(f"Estimator '{name}' does not support predict_proba")
            probas.append(estimator.predict_proba(X) * weight)

        return sum(probas)

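A regression-flavored sketch (editorial, not from the package); weights are normalized internally, so [0.3, 0.7] is used as-is:

    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import Ridge
    from adamops.models.ensembles import WeightedAverageEnsemble

    X, y = make_regression(n_samples=300, noise=10.0, random_state=0)
    wavg = WeightedAverageEnsemble(
        estimators=[("ridge", Ridge()),
                    ("rf", RandomForestRegressor(n_estimators=100, random_state=0))],
        weights=[0.3, 0.7],
        task="regression",
    )
    wavg.fit(X, y)
    print(wavg.predict(X[:5]))
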
def create_voting_ensemble(
    algorithms: List[str], task: str = "classification",
    voting: str = "soft", weights: Optional[List[float]] = None
) -> VotingEnsemble:
    """Create voting ensemble from algorithm names."""
    models = CLASSIFICATION_MODELS if task == "classification" else REGRESSION_MODELS
    # Unknown algorithm names are silently skipped
    estimators = [(alg, models[alg]()) for alg in algorithms if alg in models]
    return VotingEnsemble(estimators, voting=voting, weights=weights, task=task)


def create_stacking_ensemble(
    algorithms: List[str], task: str = "classification",
    final_estimator: Optional[Any] = None, cv: int = 5
) -> StackingEnsemble:
    """Create stacking ensemble from algorithm names."""
    models = CLASSIFICATION_MODELS if task == "classification" else REGRESSION_MODELS
    # Unknown algorithm names are silently skipped
    estimators = [(alg, models[alg]()) for alg in algorithms if alg in models]
    return StackingEnsemble(estimators, final_estimator, task, cv)

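The factories resolve names against the registries in modelops.py, so a one-liner is enough (sketch, not from the package):

    from sklearn.datasets import load_iris
    from adamops.models.ensembles import create_voting_ensemble

    X, y = load_iris(return_X_y=True)
    # Names must be keys of CLASSIFICATION_MODELS (see modelops.py below)
    ens = create_voting_ensemble(["logistic", "random_forest", "naive_bayes"],
                                 task="classification", voting="soft")
    ens.fit(X, y)
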
def auto_ensemble(
    X, y, task: str = "classification", top_n: int = 3, cv: int = 5
) -> Tuple[Any, Dict]:
    """
    Build ensembles from the top-N base models.

    Ranks base models with compare_models, fits voting and stacking
    ensembles over the top performers, and returns the stacking ensemble.

    Returns:
        (ensemble, results): Stacking ensemble and a summary dict.
    """
    from adamops.models.modelops import compare_models

    # Rank base models by cross-validated score
    comparison = compare_models(X, y, task, cv=cv)
    top_algorithms = comparison.head(top_n)["algorithm"].tolist()

    logger.info(f"Selected top {top_n} algorithms: {top_algorithms}")

    # Create and fit candidate ensembles
    results = {}

    # Voting
    voting = create_voting_ensemble(top_algorithms, task)
    voting.fit(X, y)
    results["voting"] = voting

    # Stacking
    stacking = create_stacking_ensemble(top_algorithms, task, cv=cv)
    stacking.fit(X, y)
    results["stacking"] = stacking

    return stacking, {"algorithms": top_algorithms, "ensembles": list(results.keys())}

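End-to-end sketch (editorial; the ranking depends on compare_models, defined in modelops.py below):

    from sklearn.datasets import load_breast_cancer
    from adamops.models.ensembles import auto_ensemble

    X, y = load_breast_cancer(return_X_y=True)
    ensemble, info = auto_ensemble(X, y, task="classification", top_n=3, cv=3)
    print(info)  # {'algorithms': [...], 'ensembles': ['voting', 'stacking']}
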
adamops/models/modelops.py
@@ -0,0 +1,308 @@
"""
AdamOps ModelOps Module

Provides model training for regression, classification, and clustering.
"""

from typing import Any, Dict, List, Optional, Tuple, Union
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    GradientBoostingClassifier, GradientBoostingRegressor,
    RandomForestClassifier, RandomForestRegressor
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

# Optional gradient-boosting backends
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False

try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except ImportError:
    LGB_AVAILABLE = False

from adamops.utils.logging import get_logger
from adamops.utils.helpers import infer_task_type

logger = get_logger(__name__)


# Model registries: algorithm name -> estimator class
REGRESSION_MODELS = {
    "ridge": Ridge,
    "lasso": Lasso,
    "elasticnet": ElasticNet,
    "decision_tree": DecisionTreeRegressor,
    "random_forest": RandomForestRegressor,
    "gradient_boosting": GradientBoostingRegressor,
    "knn": KNeighborsRegressor,
}

CLASSIFICATION_MODELS = {
    "logistic": LogisticRegression,
    "decision_tree": DecisionTreeClassifier,
    "random_forest": RandomForestClassifier,
    "gradient_boosting": GradientBoostingClassifier,
    "naive_bayes": GaussianNB,
    "knn": KNeighborsClassifier,
}

CLUSTERING_MODELS = {
    "kmeans": KMeans,
    "dbscan": DBSCAN,
    "hierarchical": AgglomerativeClustering,
    "gmm": GaussianMixture,
}

# Register XGBoost if available
if XGB_AVAILABLE:
    REGRESSION_MODELS["xgboost"] = xgb.XGBRegressor
    CLASSIFICATION_MODELS["xgboost"] = xgb.XGBClassifier

# Register LightGBM if available
if LGB_AVAILABLE:
    REGRESSION_MODELS["lightgbm"] = lgb.LGBMRegressor
    CLASSIFICATION_MODELS["lightgbm"] = lgb.LGBMClassifier

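The registries double as the package's extension point: optional backends only appear when their import succeeded. A quick check (editorial sketch):

    from adamops.models import modelops

    print(sorted(modelops.CLASSIFICATION_MODELS))  # registered algorithm names
    if modelops.XGB_AVAILABLE:
        clf = modelops.CLASSIFICATION_MODELS["xgboost"]()  # only when xgboost is installed
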
class TrainedModel:
    """Wrapper for trained models with metadata."""

    def __init__(self, model: Any, task: str, algorithm: str, params: Dict,
                 feature_names: Optional[List[str]] = None):
        self.model = model
        self.task = task
        self.algorithm = algorithm
        self.params = params
        self.feature_names = feature_names
        self.is_fitted = True

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """Make predictions."""
        return self.model.predict(X)

    def predict_proba(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """Predict probabilities (classification only)."""
        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(X)
        raise ValueError("Model does not support probability predictions")

    def save(self, filepath: Union[str, Path]) -> None:
        """Save model to file."""
        filepath = Path(filepath)
        filepath.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self, filepath)
        logger.info(f"Model saved to {filepath}")

    @classmethod
    def load(cls, filepath: Union[str, Path]) -> "TrainedModel":
        """Load model from file."""
        return joblib.load(filepath)

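Save/load round-trips through joblib (sketch; "artifacts/" is an illustrative path):

    from sklearn.datasets import load_iris
    from adamops.models.modelops import TrainedModel, train

    X, y = load_iris(return_X_y=True)
    tm = train(X, y, task="classification", algorithm="random_forest")
    tm.save("artifacts/rf.joblib")

    restored = TrainedModel.load("artifacts/rf.joblib")
    assert (restored.predict(X) == tm.predict(X)).all()
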
def get_available_models(task: str = "classification") -> List[str]:
    """Get list of available models for a task."""
    if task == "classification":
        return list(CLASSIFICATION_MODELS.keys())
    elif task == "regression":
        return list(REGRESSION_MODELS.keys())
    elif task == "clustering":
        return list(CLUSTERING_MODELS.keys())
    return []

def train(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
    task: str = "auto", algorithm: str = "random_forest",
    params: Optional[Dict] = None, random_state: int = 42
) -> TrainedModel:
    """
    Train a model.

    Args:
        X: Features.
        y: Target.
        task: 'classification', 'regression', or 'auto'.
        algorithm: Model algorithm name.
        params: Model hyperparameters.
        random_state: Random seed.

    Returns:
        TrainedModel: Trained model wrapper.
    """
    # Auto-detect task type
    if task == "auto":
        task = infer_task_type(y)
        logger.info(f"Auto-detected task: {task}")

    # Get model class
    if task in ["classification", "multiclass"]:
        if algorithm not in CLASSIFICATION_MODELS:
            raise ValueError(f"Unknown classification algorithm: {algorithm}")
        model_class = CLASSIFICATION_MODELS[algorithm]
    elif task == "regression":
        if algorithm not in REGRESSION_MODELS:
            raise ValueError(f"Unknown regression algorithm: {algorithm}")
        model_class = REGRESSION_MODELS[algorithm]
    else:
        raise ValueError(f"Unknown task: {task}")

    # Merge defaults with user params
    default_params = {"random_state": random_state}
    if params:
        default_params.update(params)

    # Keep only params the estimator's constructor accepts; anything else
    # (e.g. random_state for GaussianNB or KNN) is silently dropped
    import inspect
    sig = inspect.signature(model_class)
    valid_params = {k: v for k, v in default_params.items() if k in sig.parameters}

    # Create and train model
    logger.info(f"Training {algorithm} for {task}")
    model = model_class(**valid_params)
    model.fit(X, y)

    feature_names = X.columns.tolist() if isinstance(X, pd.DataFrame) else None

    return TrainedModel(
        model=model, task=task, algorithm=algorithm,
        params=valid_params, feature_names=feature_names
    )

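With task="auto" the target type decides the branch; infer_task_type lives in adamops/utils/helpers.py (not shown in this hunk), so the sketch assumes it reports "regression" for a continuous target:

    from sklearn.datasets import load_diabetes
    from adamops.models.modelops import train

    data = load_diabetes(as_frame=True)
    tm = train(data.data, data.target, task="auto",
               algorithm="gradient_boosting", params={"n_estimators": 200})
    print(tm.task, tm.algorithm, tm.feature_names[:3])
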
def train_regression(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
    algorithm: str = "ridge", params: Optional[Dict] = None
) -> TrainedModel:
    """Train a regression model."""
    return train(X, y, task="regression", algorithm=algorithm, params=params)


def train_classification(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
    algorithm: str = "random_forest", params: Optional[Dict] = None
) -> TrainedModel:
    """Train a classification model."""
    return train(X, y, task="classification", algorithm=algorithm, params=params)

def train_clustering(
    X: Union[pd.DataFrame, np.ndarray], algorithm: str = "kmeans",
    n_clusters: int = 3, params: Optional[Dict] = None
) -> Tuple[Any, np.ndarray]:
    """
    Train a clustering model.

    Returns:
        (model, labels): Fitted model and cluster labels.
    """
    if algorithm not in CLUSTERING_MODELS:
        raise ValueError(f"Unknown clustering algorithm: {algorithm}")

    model_class = CLUSTERING_MODELS[algorithm]
    default_params = {}

    # Set the cluster count where the algorithm supports it; DBSCAN infers
    # the number of clusters from density and ignores n_clusters
    if algorithm in ["kmeans", "hierarchical"]:
        default_params["n_clusters"] = n_clusters
    elif algorithm == "gmm":
        default_params["n_components"] = n_clusters

    if params:
        default_params.update(params)

    logger.info(f"Training {algorithm} clustering")
    model = model_class(**default_params)

    # GMM: fit, then assign each point to its most likely component
    if algorithm == "gmm":
        model.fit(X)
        labels = model.predict(X)
    else:
        labels = model.fit_predict(X)

    return model, labels

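Clustering sketch (editorial) on synthetic blobs:

    from sklearn.datasets import make_blobs
    from adamops.models.modelops import train_clustering

    X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
    model, labels = train_clustering(X, algorithm="kmeans", n_clusters=3)
    print(len(set(labels)))  # 3
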
def cross_validate(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
    task: str = "classification", algorithm: str = "random_forest",
    cv: int = 5, scoring: Optional[str] = None, params: Optional[Dict] = None
) -> Dict[str, Any]:
    """
    Cross-validate a model.

    Returns:
        Dict with train_scores, test_scores, and mean/std values.
    """
    from sklearn.model_selection import cross_validate as sklearn_cv

    # Get model
    if task == "classification":
        model = CLASSIFICATION_MODELS[algorithm]()
    else:
        model = REGRESSION_MODELS[algorithm]()

    if params:
        model.set_params(**params)

    if scoring is None:
        scoring = "accuracy" if task == "classification" else "r2"

    logger.info(f"Cross-validating {algorithm} with {cv} folds")

    results = sklearn_cv(model, X, y, cv=cv, scoring=scoring, return_train_score=True)

    return {
        "train_scores": results["train_score"].tolist(),
        "test_scores": results["test_score"].tolist(),
        "train_mean": float(results["train_score"].mean()),
        "train_std": float(results["train_score"].std()),
        "test_mean": float(results["test_score"].mean()),
        "test_std": float(results["test_score"].std()),
        "fit_time": float(results["fit_time"].mean()),
    }

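The module-level name shadows sklearn's cross_validate; internally it aliases the import, and callers only see the summary dict (sketch):

    from sklearn.datasets import load_breast_cancer
    from adamops.models.modelops import cross_validate

    X, y = load_breast_cancer(return_X_y=True)
    scores = cross_validate(X, y, task="classification",
                            algorithm="random_forest", cv=5, scoring="roc_auc")
    print(f"AUC {scores['test_mean']:.3f} +/- {scores['test_std']:.3f}")
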
def compare_models(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
    task: str = "classification", algorithms: Optional[List[str]] = None,
    cv: int = 5, scoring: Optional[str] = None
) -> pd.DataFrame:
    """
    Compare multiple models.

    Returns:
        DataFrame with model comparison results, best first.
    """
    if algorithms is None:
        algorithms = (list(CLASSIFICATION_MODELS.keys()) if task == "classification"
                      else list(REGRESSION_MODELS.keys()))

    results = []
    for algo in algorithms:
        try:
            cv_results = cross_validate(X, y, task, algo, cv, scoring)
            results.append({
                "algorithm": algo,
                "cv_mean": cv_results["test_mean"],
                "cv_std": cv_results["test_std"],
                "train_mean": cv_results["train_mean"],
                "fit_time": cv_results["fit_time"],
            })
        except Exception as e:
            logger.warning(f"Failed to train {algo}: {e}")

    if not results:
        raise RuntimeError("All algorithms failed; see warnings above")

    df = pd.DataFrame(results).sort_values("cv_mean", ascending=False)
    return df
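
Leaderboard sketch (editorial), restricting the comparison to a few registry names:

    from sklearn.datasets import load_iris
    from adamops.models.modelops import compare_models

    X, y = load_iris(return_X_y=True)
    board = compare_models(X, y, task="classification",
                           algorithms=["logistic", "random_forest", "knn"], cv=3)
    print(board[["algorithm", "cv_mean", "cv_std"]])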