omgkit 2.20.0 → 2.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -10
- package/package.json +1 -1
- package/plugin/agents/ai-architect-agent.md +282 -0
- package/plugin/agents/data-scientist-agent.md +221 -0
- package/plugin/agents/experiment-analyst-agent.md +318 -0
- package/plugin/agents/ml-engineer-agent.md +165 -0
- package/plugin/agents/mlops-engineer-agent.md +324 -0
- package/plugin/agents/model-optimizer-agent.md +287 -0
- package/plugin/agents/production-engineer-agent.md +360 -0
- package/plugin/agents/research-scientist-agent.md +274 -0
- package/plugin/commands/omgdata/augment.md +86 -0
- package/plugin/commands/omgdata/collect.md +81 -0
- package/plugin/commands/omgdata/label.md +83 -0
- package/plugin/commands/omgdata/split.md +83 -0
- package/plugin/commands/omgdata/validate.md +76 -0
- package/plugin/commands/omgdata/version.md +85 -0
- package/plugin/commands/omgdeploy/ab.md +94 -0
- package/plugin/commands/omgdeploy/cloud.md +89 -0
- package/plugin/commands/omgdeploy/edge.md +93 -0
- package/plugin/commands/omgdeploy/package.md +91 -0
- package/plugin/commands/omgdeploy/serve.md +92 -0
- package/plugin/commands/omgfeature/embed.md +93 -0
- package/plugin/commands/omgfeature/extract.md +93 -0
- package/plugin/commands/omgfeature/select.md +85 -0
- package/plugin/commands/omgfeature/store.md +97 -0
- package/plugin/commands/omgml/init.md +60 -0
- package/plugin/commands/omgml/status.md +82 -0
- package/plugin/commands/omgops/drift.md +87 -0
- package/plugin/commands/omgops/monitor.md +99 -0
- package/plugin/commands/omgops/pipeline.md +102 -0
- package/plugin/commands/omgops/registry.md +109 -0
- package/plugin/commands/omgops/retrain.md +91 -0
- package/plugin/commands/omgoptim/distill.md +90 -0
- package/plugin/commands/omgoptim/profile.md +92 -0
- package/plugin/commands/omgoptim/prune.md +81 -0
- package/plugin/commands/omgoptim/quantize.md +83 -0
- package/plugin/commands/omgtrain/baseline.md +78 -0
- package/plugin/commands/omgtrain/compare.md +99 -0
- package/plugin/commands/omgtrain/evaluate.md +85 -0
- package/plugin/commands/omgtrain/train.md +81 -0
- package/plugin/commands/omgtrain/tune.md +89 -0
- package/plugin/registry.yaml +252 -2
- package/plugin/skills/ml-systems/SKILL.md +65 -0
- package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
- package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
- package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
- package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
- package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
- package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
- package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
- package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
- package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
- package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
- package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
- package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
- package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
- package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
- package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
- package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
- package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
- package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
- package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
- package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
- package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
- package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
- package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
- package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
- package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
- package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
- package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
- package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
- package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
- package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
@@ -0,0 +1,416 @@
---
name: Model Evaluation Workflow
description: Comprehensive model evaluation workflow including performance metrics, error analysis, fairness assessment, and production readiness checks.
category: ml-systems
complexity: medium
agents:
- experiment-analyst-agent
- data-scientist-agent
---

# Model Evaluation Workflow

Comprehensive evaluation of ML models before deployment.

## Overview

```
┌─────────────────────────────────────────────────────────────┐
│                  MODEL EVALUATION WORKFLOW                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  1. PERFORMANCE      2. ERROR            3. FAIRNESS        │
│     METRICS             ANALYSIS            ASSESSMENT      │
│         ↓                   ↓                    ↓          │
│  Accuracy/F1         Confusion matrix    Demographic parity │
│  ROC/PR curves       Failure patterns    Equalized odds     │
│  Calibration         Edge cases          Bias detection     │
│                                                             │
│  4. ROBUSTNESS       5. EXPLAIN-         6. PRODUCTION      │
│     TESTING             ABILITY             READINESS       │
│         ↓                   ↓                    ↓          │
│  Adversarial         SHAP values         Latency check      │
│  Distribution shift  Feature import.     Memory footprint   │
│  Noise sensitivity   Model cards         Integration test   │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```

## Steps

### Step 1: Performance Metrics
**Agent**: experiment-analyst-agent

**Inputs**:
- Trained model
- Test dataset
- Metric requirements

**Actions**:
```bash
# Comprehensive evaluation
/omgtrain:evaluate --model model.pt --data test.csv --report full
```

```python
import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, log_loss, brier_score_loss
)

def expected_calibration_error(y_true, y_prob_pos, n_bins=10):
    # Reliability-diagram ECE: weighted gap between mean predicted
    # probability and observed positive rate, per confidence bin
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (y_prob_pos > lo) & (y_prob_pos <= hi)
        if mask.any():
            ece += mask.mean() * abs((np.asarray(y_true)[mask] == 1).mean() - y_prob_pos[mask].mean())
    return ece

def comprehensive_metrics(y_true, y_pred, y_prob):
    metrics = {}

    # Classification metrics
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, average='weighted')
    metrics['recall'] = recall_score(y_true, y_pred, average='weighted')
    metrics['f1'] = f1_score(y_true, y_pred, average='weighted')

    # Probability-based metrics
    metrics['roc_auc'] = roc_auc_score(y_true, y_prob, multi_class='ovr')
    metrics['pr_auc'] = average_precision_score(y_true, y_prob[:, 1])
    metrics['log_loss'] = log_loss(y_true, y_prob)

    # Calibration (binary assumption: column 1 holds positive-class probabilities)
    metrics['brier_score'] = brier_score_loss(y_true, y_prob[:, 1])
    metrics['ece'] = expected_calibration_error(y_true, y_prob[:, 1])

    return metrics
```

**Outputs**:
- Complete metrics report
- ROC/PR curves
- Calibration plots

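The metrics block above computes scores but not the ROC and calibration plots listed as outputs. A minimal plotting sketch, assuming binary labels, positive-class probabilities, and the scikit-learn/matplotlib stack; the `visualizations` directory is chosen to match the artifacts listed at the end of this workflow:

```python
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.metrics import RocCurveDisplay

def plot_evaluation_curves(y_true, y_prob_pos, out_dir='visualizations'):
    # ROC curve from positive-class probabilities
    RocCurveDisplay.from_predictions(y_true, y_prob_pos)
    plt.savefig(f'{out_dir}/roc_curve.png')
    plt.close()

    # Reliability diagram: mean predicted probability vs. observed positive rate
    frac_pos, mean_pred = calibration_curve(y_true, y_prob_pos, n_bins=10)
    plt.plot(mean_pred, frac_pos, marker='o', label='model')
    plt.plot([0, 1], [0, 1], linestyle='--', label='perfectly calibrated')
    plt.xlabel('Mean predicted probability')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.savefig(f'{out_dir}/calibration_plot.png')
    plt.close()
```
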
### Step 2: Error Analysis
**Agent**: data-scientist-agent

**Inputs**:
- Predictions
- Ground truth
- Feature data

**Actions**:
```python
import numpy as np
from collections import Counter
from scipy import stats
from sklearn.metrics import confusion_matrix

def error_analysis(model, X_test, y_test, feature_names):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)

    errors = y_pred != y_test
    error_indices = np.where(errors)[0]

    analysis = {
        'error_rate': errors.mean(),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'per_class_errors': {},
        'error_patterns': [],
        'confident_errors': []
    }

    # Per-class error analysis
    for cls in np.unique(y_test):
        mask = y_test == cls
        analysis['per_class_errors'][cls] = {
            'count': mask.sum(),
            'error_rate': errors[mask].mean(),
            'most_confused_with': Counter(y_pred[mask & errors]).most_common(3)
        }

    # High-confidence errors (most concerning)
    max_prob = y_prob.max(axis=1)
    confident_errors = error_indices[max_prob[error_indices] > 0.9]
    analysis['confident_errors'] = {
        'count': len(confident_errors),
        'examples': confident_errors[:10].tolist()
    }

    # Error patterns by feature (Mann-Whitney U test: does this feature's
    # distribution differ between misclassified and correct rows?)
    for i, feature in enumerate(feature_names):
        error_values = X_test[error_indices, i]
        correct_values = X_test[~errors, i]

        if len(error_values) > 10:
            stat, p_value = stats.mannwhitneyu(error_values, correct_values)
            if p_value < 0.01:
                analysis['error_patterns'].append({
                    'feature': feature,
                    'p_value': p_value,
                    'error_mean': error_values.mean(),
                    'correct_mean': correct_values.mean()
                })

    return analysis
```

**Outputs**:
- Error patterns
- Confusion analysis
- High-confidence errors

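A short usage sketch for `error_analysis`, under the same assumptions as above (numpy arrays and a fitted classifier); it surfaces the feature-level patterns most worth a manual look:

```python
# Rank the statistically strongest error patterns for manual review
analysis = error_analysis(model, X_test, y_test, feature_names)
for pattern in sorted(analysis['error_patterns'], key=lambda p: p['p_value'])[:5]:
    print(f"{pattern['feature']}: error mean {pattern['error_mean']:.3f} "
          f"vs correct mean {pattern['correct_mean']:.3f} (p={pattern['p_value']:.1e})")
```
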
### Step 3: Fairness Assessment
**Agent**: data-scientist-agent

**Inputs**:
- Predictions
- Sensitive attributes
- Fairness criteria

**Actions**:
```python
from sklearn.metrics import accuracy_score, precision_score, recall_score

def fairness_assessment(y_true, y_pred, y_prob, sensitive_attrs):
    from fairlearn.metrics import (
        demographic_parity_difference,
        equalized_odds_difference,
        MetricFrame
    )

    results = {}

    for attr_name, attr_values in sensitive_attrs.items():
        # Per-group metrics for this sensitive attribute
        metric_frame = MetricFrame(
            metrics={
                'accuracy': accuracy_score,
                'precision': precision_score,
                'recall': recall_score,
                'selection_rate': lambda y_t, y_p: y_p.mean()
            },
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=attr_values
        )

        results[attr_name] = {
            'by_group': metric_frame.by_group.to_dict(),
            'overall': metric_frame.overall.to_dict(),
            'demographic_parity_diff': demographic_parity_difference(
                y_true, y_pred, sensitive_features=attr_values
            ),
            'equalized_odds_diff': equalized_odds_difference(
                y_true, y_pred, sensitive_features=attr_values
            )
        }

    # Fairness summary (gates on demographic parity only; review
    # equalized odds in the detailed results as well)
    fairness_passed = all(
        abs(r['demographic_parity_diff']) < 0.1
        for r in results.values()
    )

    return {
        'detailed': results,
        'passed': fairness_passed,
        'threshold': 0.1
    }
```

**Outputs**:
- Fairness metrics
- Group disparities
- Recommendations

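A usage sketch showing the expected shape of `sensitive_attrs` (attribute name mapped to per-sample group labels); `df_test` and its column names are hypothetical:

```python
# Hypothetical inputs: one array of group labels per sensitive attribute
sensitive_attrs = {
    'gender': df_test['gender'].to_numpy(),
    'age_band': df_test['age_band'].to_numpy(),
}
report = fairness_assessment(y_test, y_pred, y_prob, sensitive_attrs)
print(report['passed'])
print(report['detailed']['gender']['demographic_parity_diff'])
```
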
### Step 4: Robustness Testing
**Agent**: research-scientist-agent

**Inputs**:
- Model
- Test data
- Perturbation types

**Actions**:
```python
import numpy as np
from sklearn.metrics import accuracy_score

def robustness_testing(model, X_test, y_test):
    results = {}

    # Noise robustness: accuracy under increasing Gaussian input noise
    noise_levels = [0.01, 0.05, 0.1, 0.2]
    results['noise'] = {}
    for noise in noise_levels:
        X_noisy = X_test + np.random.normal(0, noise, X_test.shape)
        y_pred = model.predict(X_noisy)
        results['noise'][noise] = accuracy_score(y_test, y_pred)

    # Feature dropout: accuracy lost when a single feature is zeroed out
    results['feature_dropout'] = {}
    baseline_acc = accuracy_score(y_test, model.predict(X_test))
    for i in range(X_test.shape[1]):
        X_dropped = X_test.copy()
        X_dropped[:, i] = 0
        acc = accuracy_score(y_test, model.predict(X_dropped))
        results['feature_dropout'][i] = baseline_acc - acc

    # Out-of-distribution detection via prediction entropy
    y_prob = model.predict_proba(X_test)
    entropy = -np.sum(y_prob * np.log(y_prob + 1e-10), axis=1)
    results['entropy_stats'] = {
        'mean': entropy.mean(),
        'std': entropy.std(),
        'high_entropy_pct': (entropy > 0.5).mean()
    }

    return results
```

**Outputs**:
- Noise sensitivity
- Feature importance
- OOD detection

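The overview lists distribution shift under robustness testing, which the block above does not cover. A minimal covariate-shift sketch using a per-feature two-sample Kolmogorov-Smirnov test, assuming `X_ref` (e.g. the training data) and `X_test` are numpy arrays:

```python
from scipy import stats

def distribution_shift_check(X_ref, X_test, feature_names, alpha=0.01):
    # KS test per feature; a small p-value flags a feature whose
    # test distribution differs from the reference distribution
    shifted = []
    for i, name in enumerate(feature_names):
        _, p_value = stats.ks_2samp(X_ref[:, i], X_test[:, i])
        if p_value < alpha:
            shifted.append({'feature': name, 'p_value': p_value})
    return shifted
```
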
### Step 5: Explainability
**Agent**: data-scientist-agent

**Inputs**:
- Model
- Sample data
- Explanation requirements

**Actions**:
```python
import numpy as np
import shap

def model_explainability(model, X_train, X_test, feature_names):
    # SHAP values (TreeExplainer assumes a tree-based model)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Global feature importance: mean absolute SHAP value per feature
    global_importance = np.abs(shap_values).mean(axis=0)
    importance_ranking = sorted(
        zip(feature_names, global_importance),
        key=lambda x: -x[1]
    )

    # Generate plots
    shap.summary_plot(shap_values, X_test, feature_names=feature_names)

    # Create model card (metric values are placeholders; fill in from Step 1)
    model_card = {
        'model_details': {
            'type': type(model).__name__,
            'framework': 'sklearn',
            'version': '1.0'
        },
        'intended_use': {
            'primary_use': 'Classification',
            'users': 'Data science team',
            'limitations': 'Not tested on populations outside training distribution'
        },
        'metrics': {
            'accuracy': 0.92,
            'auc': 0.95
        },
        'training_data': {
            'size': len(X_train),
            'features': len(feature_names)
        },
        'ethical_considerations': {
            'fairness_tested': True,
            'sensitive_features': ['age', 'gender']
        },
        'top_features': importance_ranking[:10]
    }

    return {
        'shap_values': shap_values,
        'importance': importance_ranking,
        'model_card': model_card
    }
```

**Outputs**:
- SHAP values
- Feature importance
- Model card

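The artifacts list at the end of this workflow expects a `model_card.md` file, while the step returns a dict. A minimal rendering sketch, assuming the dict layout produced above:

```python
def write_model_card(model_card, path='model_card.md'):
    # Render the nested dict from model_explainability as markdown sections
    lines = ['# Model Card', '']
    for section, fields in model_card.items():
        lines.append(f"## {section.replace('_', ' ').title()}")
        if isinstance(fields, dict):
            lines.extend(f"- **{key}**: {value}" for key, value in fields.items())
        else:  # e.g. top_features, a list of (feature, importance) pairs
            lines.extend(f"- {item}" for item in fields)
        lines.append('')
    with open(path, 'w') as f:
        f.write('\n'.join(lines))
```
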
### Step 6: Production Readiness
**Agent**: production-engineer-agent

**Inputs**:
- Model
- Production requirements
- Infrastructure constraints

**Actions**:
```python
import pickle
import time

import numpy as np

def production_readiness_check(model, X_sample, requirements):
    results = {
        'latency': {},
        'size': {},
        'integration': {}
    }

    # Latency test: single-row prediction, 100 repetitions
    latencies = []
    for _ in range(100):
        start = time.perf_counter()
        model.predict(X_sample[:1])
        latencies.append((time.perf_counter() - start) * 1000)

    results['latency'] = {
        'p50': np.percentile(latencies, 50),
        'p95': np.percentile(latencies, 95),
        'p99': np.percentile(latencies, 99),
        'meets_sla': np.percentile(latencies, 99) < requirements['max_latency_ms']
    }

    # Memory footprint (serialized model size)
    model_bytes = len(pickle.dumps(model))
    results['size'] = {
        'bytes': model_bytes,
        'mb': model_bytes / 1024 / 1024,
        'meets_limit': model_bytes < requirements['max_size_mb'] * 1024 * 1024
    }

    # Integration test: round-trip serialization
    try:
        model_path = '/tmp/test_model.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        with open(model_path, 'rb') as f:
            loaded = pickle.load(f)
        loaded.predict(X_sample[:1])
        results['integration']['serialization'] = 'passed'
    except Exception as e:
        results['integration']['serialization'] = f'failed: {str(e)}'

    results['ready'] = all([
        results['latency']['meets_sla'],
        results['size']['meets_limit'],
        results['integration']['serialization'] == 'passed'
    ])

    return results
```

**Outputs**:
- Latency benchmarks
- Memory footprint
- Production readiness

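A usage sketch documenting the `requirements` contract the check reads (`max_latency_ms` and `max_size_mb`); the threshold values here are illustrative:

```python
# Illustrative requirements matching the keys read by the check above
requirements = {
    'max_latency_ms': 50,   # p99 budget for single-row prediction
    'max_size_mb': 100,     # budget for the pickled model
}
readiness = production_readiness_check(model, X_test, requirements)
print(readiness['ready'], readiness['latency']['p99'])
```
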
## Artifacts

- `evaluation_report.json` - Complete metrics
- `error_analysis.json` - Error patterns
- `fairness_report.json` - Bias assessment
- `model_card.md` - Model documentation
- `visualizations/` - Plots and charts

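A minimal sketch of how the per-step results might be assembled into `evaluation_report.json`; all variable names here are hypothetical stand-ins for the returns of the earlier steps:

```python
import json

# Hypothetical aggregation of the per-step results into one artifact
report = {
    'metrics': metrics,                  # Step 1
    'error_analysis': error_report,      # Step 2
    'fairness': fairness_report,         # Step 3
    'robustness': robustness_results,    # Step 4
    'production': readiness,             # Step 6
}
with open('evaluation_report.json', 'w') as f:
    json.dump(report, f, indent=2, default=str)  # default=str copes with numpy types
```
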
## Next Workflows

After evaluation:
- → **model-optimization-workflow** if performance insufficient
- → **model-deployment-workflow** if ready for production

## Quality Gates

- [ ] All steps completed successfully
- [ ] Metrics meet defined thresholds
- [ ] Documentation updated
- [ ] Artifacts versioned and stored
- [ ] Stakeholder approval obtained