adamops 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adamops/__init__.py +40 -0
- adamops/cli.py +163 -0
- adamops/data/__init__.py +24 -0
- adamops/data/feature_engineering.py +284 -0
- adamops/data/loaders.py +922 -0
- adamops/data/preprocessors.py +227 -0
- adamops/data/splitters.py +218 -0
- adamops/data/validators.py +148 -0
- adamops/deployment/__init__.py +21 -0
- adamops/deployment/api.py +237 -0
- adamops/deployment/cloud.py +191 -0
- adamops/deployment/containerize.py +262 -0
- adamops/deployment/exporters.py +148 -0
- adamops/evaluation/__init__.py +24 -0
- adamops/evaluation/comparison.py +133 -0
- adamops/evaluation/explainability.py +143 -0
- adamops/evaluation/metrics.py +233 -0
- adamops/evaluation/reports.py +165 -0
- adamops/evaluation/visualization.py +238 -0
- adamops/models/__init__.py +21 -0
- adamops/models/automl.py +277 -0
- adamops/models/ensembles.py +228 -0
- adamops/models/modelops.py +308 -0
- adamops/models/registry.py +250 -0
- adamops/monitoring/__init__.py +21 -0
- adamops/monitoring/alerts.py +200 -0
- adamops/monitoring/dashboard.py +117 -0
- adamops/monitoring/drift.py +212 -0
- adamops/monitoring/performance.py +195 -0
- adamops/pipelines/__init__.py +15 -0
- adamops/pipelines/orchestrators.py +183 -0
- adamops/pipelines/workflows.py +212 -0
- adamops/utils/__init__.py +18 -0
- adamops/utils/config.py +457 -0
- adamops/utils/helpers.py +663 -0
- adamops/utils/logging.py +412 -0
- adamops-0.1.0.dist-info/METADATA +310 -0
- adamops-0.1.0.dist-info/RECORD +42 -0
- adamops-0.1.0.dist-info/WHEEL +5 -0
- adamops-0.1.0.dist-info/entry_points.txt +2 -0
- adamops-0.1.0.dist-info/licenses/LICENSE +21 -0
- adamops-0.1.0.dist-info/top_level.txt +1 -0
adamops/evaluation/reports.py
ADDED
@@ -0,0 +1,165 @@
"""
AdamOps Reports Module

Generates HTML/PDF reports for model evaluation.
"""

from typing import Any, Dict, List, Optional
from pathlib import Path
from datetime import datetime
import json

import numpy as np
import pandas as pd

from adamops.utils.logging import get_logger
from adamops.evaluation.metrics import evaluate, classification_report

logger = get_logger(__name__)

HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>{title}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; }}
.container {{ max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
h1 {{ color: #333; border-bottom: 2px solid #4a90d9; padding-bottom: 10px; }}
h2 {{ color: #666; margin-top: 30px; }}
table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
th {{ background: #4a90d9; color: white; }}
tr:nth-child(even) {{ background: #f9f9f9; }}
.metric {{ font-size: 24px; font-weight: bold; color: #4a90d9; }}
.metric-card {{ display: inline-block; padding: 20px; margin: 10px; background: #f0f7ff; border-radius: 8px; text-align: center; min-width: 150px; }}
.metric-label {{ color: #666; font-size: 14px; }}
.section {{ margin: 30px 0; }}
.footer {{ margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd; color: #999; font-size: 12px; }}
</style>
</head>
<body>
<div class="container">
<h1>{title}</h1>
<p>Generated: {timestamp}</p>
{content}
<div class="footer">Generated by AdamOps v0.1.0</div>
</div>
</body>
</html>
"""


class EvaluationReport:
    """Model evaluation report generator."""

    def __init__(self, model_name: str = "Model"):
        self.model_name = model_name
        self.sections = []
        self.metrics = {}
        self.timestamp = datetime.now().isoformat()

    def add_metrics(self, metrics: Dict[str, float], title: str = "Performance Metrics"):
        """Add metrics section."""
        self.metrics.update(metrics)

        cards = ""
        for name, value in metrics.items():
            if isinstance(value, float):
                value_str = f"{value:.4f}"
            else:
                value_str = str(value)
            cards += f'<div class="metric-card"><div class="metric">{value_str}</div><div class="metric-label">{name}</div></div>'

        self.sections.append(f'<div class="section"><h2>{title}</h2>{cards}</div>')

    def add_table(self, df: pd.DataFrame, title: str = "Results"):
        """Add table section."""
        table_html = df.to_html(index=False, classes='results-table')
        self.sections.append(f'<div class="section"><h2>{title}</h2>{table_html}</div>')

    def add_text(self, text: str, title: str = "Notes"):
        """Add text section."""
        self.sections.append(f'<div class="section"><h2>{title}</h2><p>{text}</p></div>')

    def add_confusion_matrix(self, y_true: np.ndarray, y_pred: np.ndarray,
                             labels: Optional[List[str]] = None):
        """Add confusion matrix section."""
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(y_true, y_pred)
        cm_df = pd.DataFrame(cm,
                             index=[f'Actual: {l}' for l in (labels or range(len(cm)))],
                             columns=[f'Pred: {l}' for l in (labels or range(len(cm)))])
        self.add_table(cm_df, "Confusion Matrix")

    def add_classification_report(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Add classification report section."""
        report = classification_report(y_true, y_pred, output_dict=True)
        df = pd.DataFrame(report).T.reset_index().rename(columns={'index': 'class'})
        self.add_table(df.round(4), "Classification Report")

    def generate_html(self) -> str:
        """Generate HTML report."""
        content = "\n".join(self.sections)
        return HTML_TEMPLATE.format(
            title=f"{self.model_name} Evaluation Report",
            timestamp=self.timestamp,
            content=content
        )

    def save_html(self, filepath: str):
        """Save HTML report to file."""
        filepath = Path(filepath)
        filepath.parent.mkdir(parents=True, exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(self.generate_html())

        logger.info(f"Report saved to {filepath}")

    def save_json(self, filepath: str):
        """Save report data as JSON."""
        data = {
            "model_name": self.model_name,
            "timestamp": self.timestamp,
            "metrics": self.metrics,
        }

        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2, default=str)


def generate_report(
    model_name: str, y_true: np.ndarray, y_pred: np.ndarray,
    y_prob: Optional[np.ndarray] = None, task: str = "classification",
    save_path: Optional[str] = None
) -> EvaluationReport:
    """
    Generate evaluation report.

    Args:
        model_name: Name of the model.
        y_true: True labels.
        y_pred: Predicted labels.
        y_prob: Probability predictions.
        task: 'classification' or 'regression'.
        save_path: Optional path to save HTML report.

    Returns:
        EvaluationReport object.
    """
    report = EvaluationReport(model_name)

    # Add metrics
    metrics = evaluate(y_true, y_pred, task, y_prob)
    report.add_metrics(metrics)

    # Add task-specific sections
    if task == "classification":
        report.add_confusion_matrix(y_true, y_pred)
        report.add_classification_report(y_true, y_pred)

    if save_path:
        report.save_html(save_path)

    return report
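For orientation, a minimal usage sketch of the report builder above. It assumes adamops 0.1.0 plus numpy, pandas, and scikit-learn are installed; the toy arrays, metric values, and output paths are purely illustrative and not part of the package:

# Usage sketch (illustrative data and paths).
import numpy as np
from adamops.evaluation.reports import EvaluationReport

y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])

report = EvaluationReport(model_name="demo-classifier")
report.add_metrics({"accuracy": 4 / 6, "f1": 2 / 3})            # rendered as metric cards
report.add_confusion_matrix(y_true, y_pred, labels=["neg", "pos"])
report.add_text("Toy split used only to demonstrate the API.")
report.save_html("reports/demo.html")   # fills HTML_TEMPLATE and creates the parent dir
report.save_json("reports/demo.json")   # model name, timestamp, and metrics only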
adamops/evaluation/visualization.py
ADDED
@@ -0,0 +1,238 @@
"""
AdamOps Visualization Module

Provides plotting for model evaluation: confusion matrices, ROC curves, etc.
"""

from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import pandas as pd

try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    PLT_AVAILABLE = True
except ImportError:
    PLT_AVAILABLE = False  # plt is unbound here, so the return annotations below are strings

from sklearn import metrics as sklearn_metrics
from adamops.utils.logging import get_logger

logger = get_logger(__name__)


def _check_plt():
    if not PLT_AVAILABLE:
        raise ImportError("matplotlib and seaborn required. Install with: pip install matplotlib seaborn")


def plot_confusion_matrix(
    y_true: np.ndarray, y_pred: np.ndarray,
    labels: Optional[List[str]] = None, normalize: bool = False,
    figsize: Tuple[int, int] = (8, 6), cmap: str = "Blues",
    title: str = "Confusion Matrix", save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot confusion matrix."""
    _check_plt()

    cm = sklearn_metrics.confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
                cmap=cmap, xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig


def plot_roc_curve(
    y_true: np.ndarray, y_prob: np.ndarray,
    figsize: Tuple[int, int] = (8, 6), title: str = "ROC Curve",
    save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot ROC curve."""
    _check_plt()

    if y_prob.ndim == 2:
        y_prob = y_prob[:, 1]

    fpr, tpr, _ = sklearn_metrics.roc_curve(y_true, y_prob)
    auc = sklearn_metrics.roc_auc_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(fpr, tpr, 'b-', label=f'ROC (AUC = {auc:.3f})')
    ax.plot([0, 1], [0, 1], 'k--', label='Random')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig


def plot_precision_recall_curve(
    y_true: np.ndarray, y_prob: np.ndarray,
    figsize: Tuple[int, int] = (8, 6), save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot precision-recall curve."""
    _check_plt()

    if y_prob.ndim == 2:
        y_prob = y_prob[:, 1]

    precision, recall, _ = sklearn_metrics.precision_recall_curve(y_true, y_prob)
    ap = sklearn_metrics.average_precision_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(recall, precision, 'b-', label=f'PR (AP = {ap:.3f})')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    ax.legend()
    ax.grid(True, alpha=0.3)

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig


def plot_feature_importance(
    importance: np.ndarray, feature_names: List[str],
    top_n: int = 20, figsize: Tuple[int, int] = (10, 8),
    title: str = "Feature Importance", save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot feature importance."""
    _check_plt()

    indices = np.argsort(importance)[-top_n:]

    fig, ax = plt.subplots(figsize=figsize)
    ax.barh(range(len(indices)), importance[indices], color='steelblue')
    ax.set_yticks(range(len(indices)))
    ax.set_yticklabels([feature_names[i] for i in indices])
    ax.set_xlabel('Importance')
    ax.set_title(title)

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig


def plot_residuals(
    y_true: np.ndarray, y_pred: np.ndarray,
    figsize: Tuple[int, int] = (12, 5), save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot residuals for regression."""
    _check_plt()

    residuals = y_true - y_pred

    fig, axes = plt.subplots(1, 2, figsize=figsize)

    # Residuals vs Predicted
    axes[0].scatter(y_pred, residuals, alpha=0.5)
    axes[0].axhline(y=0, color='r', linestyle='--')
    axes[0].set_xlabel('Predicted')
    axes[0].set_ylabel('Residuals')
    axes[0].set_title('Residuals vs Predicted')

    # Residual distribution
    axes[1].hist(residuals, bins=30, edgecolor='black', alpha=0.7)
    axes[1].set_xlabel('Residuals')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Residual Distribution')

    plt.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig


def plot_actual_vs_predicted(
    y_true: np.ndarray, y_pred: np.ndarray,
    figsize: Tuple[int, int] = (8, 8), save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot actual vs predicted for regression."""
    _check_plt()

    fig, ax = plt.subplots(figsize=figsize)

    ax.scatter(y_true, y_pred, alpha=0.5)
    min_val = min(y_true.min(), y_pred.min())
    max_val = max(y_true.max(), y_pred.max())
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect')
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title('Actual vs Predicted')
    ax.legend()

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig


def plot_learning_curve(
    train_sizes: np.ndarray, train_scores: np.ndarray, val_scores: np.ndarray,
    figsize: Tuple[int, int] = (8, 6), save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot learning curve."""
    _check_plt()

    fig, ax = plt.subplots(figsize=figsize)

    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)

    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
    ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1)
    ax.plot(train_sizes, train_mean, 'o-', label='Training')
    ax.plot(train_sizes, val_mean, 'o-', label='Validation')

    ax.set_xlabel('Training Size')
    ax.set_ylabel('Score')
    ax.set_title('Learning Curve')
    ax.legend()
    ax.grid(True, alpha=0.3)

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig


def plot_model_comparison(
    results: pd.DataFrame, metric: str = "cv_mean",
    figsize: Tuple[int, int] = (10, 6), save_path: Optional[str] = None
) -> "plt.Figure":
    """Plot model comparison bar chart."""
    _check_plt()

    fig, ax = plt.subplots(figsize=figsize)

    results_sorted = results.sort_values(metric, ascending=True)
    ax.barh(results_sorted['algorithm'], results_sorted[metric], color='steelblue')
    ax.set_xlabel(metric)
    ax.set_title('Model Comparison')

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')

    return fig
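A short usage sketch of the plotting helpers above; it assumes matplotlib and seaborn are installed (otherwise _check_plt raises), and the toy scores and output filename below are illustrative only:

# Usage sketch: confusion matrix and ROC curve from toy predictions.
import numpy as np
from adamops.evaluation.visualization import plot_confusion_matrix, plot_roc_curve

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=200)
y_prob = np.clip(0.6 * y_true + rng.normal(0.2, 0.25, size=200), 0.0, 1.0)  # noisy scores
y_pred = (y_prob >= 0.5).astype(int)

fig_cm = plot_confusion_matrix(y_true, y_pred, labels=["neg", "pos"], normalize=True)
fig_roc = plot_roc_curve(y_true, y_prob, save_path="roc.png")  # also writes a PNG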
adamops/models/__init__.py
ADDED
@@ -0,0 +1,21 @@
"""
AdamOps Models Module

Provides model training and management capabilities:
- modelops: Train various ML models (regression, classification, clustering)
- registry: Version and track models with metadata
- ensembles: Create ensemble models (voting, stacking, blending)
- automl: Automated model selection and hyperparameter tuning
"""

from adamops.models import modelops
from adamops.models import registry
from adamops.models import ensembles
from adamops.models import automl

__all__ = [
    "modelops",
    "registry",
    "ensembles",
    "automl",
]
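As a quick orientation note (not from the package docs), these re-exports mean the subpackage can be used either through the adamops.models namespace or by importing a submodule directly; a tiny sketch with made-up toy data:

# Both forms resolve to the same module objects.
import numpy as np
from adamops import models                     # exposes modelops, registry, ensembles, automl
from adamops.models.automl import quick_run    # direct submodule import

X = np.random.rand(80, 4)
y = (X[:, 0] > 0.5).astype(int)
best = models.automl.quick_run(X, y, task="classification")   # same function as quick_run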
adamops/models/automl.py
ADDED
@@ -0,0 +1,277 @@
"""
AdamOps AutoML Module

Provides automated model selection and hyperparameter tuning.
"""

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

from adamops.utils.logging import get_logger
from adamops.utils.helpers import infer_task_type
from adamops.models.modelops import (
    CLASSIFICATION_MODELS, REGRESSION_MODELS, TrainedModel, train
)

logger = get_logger(__name__)

try:
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    OPTUNA_AVAILABLE = True
except ImportError:
    OPTUNA_AVAILABLE = False


# Default hyperparameter search spaces
PARAM_SPACES = {
    "random_forest": {
        "n_estimators": ("int", 50, 300),
        "max_depth": ("int", 3, 20),
        "min_samples_split": ("int", 2, 20),
        "min_samples_leaf": ("int", 1, 10),
    },
    "gradient_boosting": {
        "n_estimators": ("int", 50, 300),
        "max_depth": ("int", 3, 10),
        "learning_rate": ("float", 0.01, 0.3),
        "min_samples_split": ("int", 2, 20),
    },
    "xgboost": {
        "n_estimators": ("int", 50, 300),
        "max_depth": ("int", 3, 12),
        "learning_rate": ("float", 0.01, 0.3),
        "subsample": ("float", 0.6, 1.0),
        "colsample_bytree": ("float", 0.6, 1.0),
    },
    "lightgbm": {
        "n_estimators": ("int", 50, 300),
        "max_depth": ("int", 3, 12),
        "learning_rate": ("float", 0.01, 0.3),
        "num_leaves": ("int", 20, 100),
        "subsample": ("float", 0.6, 1.0),
    },
    "ridge": {
        "alpha": ("float", 0.001, 100.0, "log"),
    },
    "lasso": {
        "alpha": ("float", 0.001, 100.0, "log"),
    },
    "knn": {
        "n_neighbors": ("int", 1, 30),
        "weights": ("categorical", ["uniform", "distance"]),
    },
}


class AutoMLResult:
    """Results from AutoML run."""

    def __init__(self):
        self.best_model: Optional[TrainedModel] = None
        self.best_score: float = 0.0
        self.best_algorithm: str = ""
        self.best_params: Dict = {}
        self.leaderboard: List[Dict] = []
        self.time_elapsed: float = 0.0

    def summary(self) -> str:
        lines = [
            "=" * 50, "AutoML Results", "=" * 50,
            f"Best Algorithm: {self.best_algorithm}",
            f"Best Score: {self.best_score:.4f}",
            f"Time Elapsed: {self.time_elapsed:.1f}s",
            "", "Leaderboard:",
        ]
        for i, entry in enumerate(self.leaderboard[:10], 1):
            lines.append(f" {i}. {entry['algorithm']}: {entry['score']:.4f}")
        return "\n".join(lines)


def grid_search(
    X, y, algorithm: str, param_grid: Dict[str, List],
    task: str = "classification", cv: int = 5, scoring: Optional[str] = None
) -> Tuple[Dict, float]:
    """Grid search hyperparameter tuning."""
    from sklearn.model_selection import GridSearchCV

    models = CLASSIFICATION_MODELS if task == "classification" else REGRESSION_MODELS
    if algorithm not in models:
        raise ValueError(f"Unknown algorithm: {algorithm}")

    model = models[algorithm]()
    scoring = scoring or ("accuracy" if task == "classification" else "r2")

    grid = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    grid.fit(X, y)

    return grid.best_params_, grid.best_score_


def random_search(
    X, y, algorithm: str, param_distributions: Dict,
    task: str = "classification", cv: int = 5, n_iter: int = 50,
    scoring: Optional[str] = None
) -> Tuple[Dict, float]:
    """Random search hyperparameter tuning."""
    from sklearn.model_selection import RandomizedSearchCV

    models = CLASSIFICATION_MODELS if task == "classification" else REGRESSION_MODELS
    model = models[algorithm]()
    scoring = scoring or ("accuracy" if task == "classification" else "r2")

    search = RandomizedSearchCV(
        model, param_distributions, n_iter=n_iter, cv=cv,
        scoring=scoring, n_jobs=-1, random_state=42
    )
    search.fit(X, y)

    return search.best_params_, search.best_score_


def bayesian_search(
    X, y, algorithm: str, task: str = "classification",
    cv: int = 5, n_trials: int = 50, scoring: Optional[str] = None,
    param_space: Optional[Dict] = None
) -> Tuple[Dict, float]:
    """Bayesian optimization with Optuna."""
    if not OPTUNA_AVAILABLE:
        raise ImportError("Optuna required. Install with: pip install optuna")

    models = CLASSIFICATION_MODELS if task == "classification" else REGRESSION_MODELS
    scoring = scoring or ("accuracy" if task == "classification" else "r2")
    space = param_space or PARAM_SPACES.get(algorithm, {})

    def objective(trial):
        params = {}
        for name, spec in space.items():
            if spec[0] == "int":
                params[name] = trial.suggest_int(name, spec[1], spec[2])
            elif spec[0] == "float":
                log = len(spec) > 3 and spec[3] == "log"
                params[name] = trial.suggest_float(name, spec[1], spec[2], log=log)
            elif spec[0] == "categorical":
                params[name] = trial.suggest_categorical(name, spec[1])

        model = models[algorithm](**params)
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        return scores.mean()

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    return study.best_params, study.best_value


def run(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
    task: str = "auto", algorithms: Optional[List[str]] = None,
    tuning: str = "bayesian", cv: int = 5, time_limit: int = 3600,
    n_trials: int = 50, scoring: Optional[str] = None
) -> AutoMLResult:
    """
    Run AutoML.

    Args:
        X: Features.
        y: Target.
        task: 'classification', 'regression', or 'auto'.
        algorithms: Algorithms to try (None for all).
        tuning: 'grid', 'random', 'bayesian', or 'none'.
        cv: Cross-validation folds.
        time_limit: Max time in seconds.
        n_trials: Trials per algorithm for tuning.
        scoring: Scoring metric.

    Returns:
        AutoMLResult: Results with best model and leaderboard.
    """
    start_time = time.time()
    result = AutoMLResult()

    # Auto-detect task
    if task == "auto":
        task = infer_task_type(y)
        logger.info(f"Auto-detected task: {task}")

    # Get algorithms
    models = CLASSIFICATION_MODELS if task == "classification" else REGRESSION_MODELS
    if algorithms is None:
        algorithms = list(models.keys())

    scoring = scoring or ("accuracy" if task == "classification" else "r2")
    logger.info(f"Running AutoML with {len(algorithms)} algorithms")

    for algo in algorithms:
        if time.time() - start_time > time_limit:
            logger.warning("Time limit reached")
            break

        try:
            logger.info(f"Tuning {algo}...")

            if tuning == "bayesian" and algo in PARAM_SPACES and OPTUNA_AVAILABLE:
                best_params, score = bayesian_search(
                    X, y, algo, task, cv, min(n_trials, 30), scoring
                )
            elif tuning == "none":
                model = models[algo]()
                scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
                best_params, score = {}, scores.mean()
            else:
                # Default to random search
                from scipy.stats import randint, uniform
                param_dist = {}
                if algo in PARAM_SPACES:
                    for name, spec in PARAM_SPACES[algo].items():
                        if spec[0] == "int":
                            param_dist[name] = randint(spec[1], spec[2])
                        elif spec[0] == "float":
                            param_dist[name] = uniform(spec[1], spec[2] - spec[1])

                if param_dist:
                    best_params, score = random_search(
                        X, y, algo, param_dist, task, cv, min(n_trials, 20), scoring
                    )
                else:
                    model = models[algo]()
                    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
                    best_params, score = {}, scores.mean()

            result.leaderboard.append({
                "algorithm": algo, "score": score, "params": best_params
            })

            if score > result.best_score:
                result.best_score = score
                result.best_algorithm = algo
                result.best_params = best_params

        except Exception as e:
            logger.warning(f"Failed {algo}: {e}")

    # Sort leaderboard
    result.leaderboard.sort(key=lambda x: x["score"], reverse=True)

    # Train best model
    if result.best_algorithm:
        result.best_model = train(
            X, y, task, result.best_algorithm, result.best_params
        )

    result.time_elapsed = time.time() - start_time
    logger.info(f"AutoML complete. Best: {result.best_algorithm} ({result.best_score:.4f})")

    return result


def quick_run(
    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
    task: str = "auto"
) -> TrainedModel:
    """Quick AutoML run with defaults."""
    result = run(X, y, task, tuning="none", n_trials=10)
    return result.best_model
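Finally, a minimal end-to-end sketch of the AutoML entry point defined above. With tuning="none" it only needs scikit-learn (no Optuna), and the fields read at the end are those defined on AutoMLResult; the synthetic dataset is purely illustrative:

# Usage sketch: AutoML without hyperparameter tuning on a synthetic dataset.
from sklearn.datasets import make_classification
from adamops.models import automl

X, y = make_classification(n_samples=300, n_features=10, random_state=0)

result = automl.run(X, y, task="classification", tuning="none", cv=3)
print(result.summary())                               # leaderboard of algorithms with CV scores
print(result.best_algorithm, round(result.best_score, 4))
best_model = result.best_model                        # a TrainedModel (see adamops/models/modelops.py)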