omgkit 2.20.0 → 2.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +125 -10
  2. package/package.json +1 -1
  3. package/plugin/agents/ai-architect-agent.md +282 -0
  4. package/plugin/agents/data-scientist-agent.md +221 -0
  5. package/plugin/agents/experiment-analyst-agent.md +318 -0
  6. package/plugin/agents/ml-engineer-agent.md +165 -0
  7. package/plugin/agents/mlops-engineer-agent.md +324 -0
  8. package/plugin/agents/model-optimizer-agent.md +287 -0
  9. package/plugin/agents/production-engineer-agent.md +360 -0
  10. package/plugin/agents/research-scientist-agent.md +274 -0
  11. package/plugin/commands/omgdata/augment.md +86 -0
  12. package/plugin/commands/omgdata/collect.md +81 -0
  13. package/plugin/commands/omgdata/label.md +83 -0
  14. package/plugin/commands/omgdata/split.md +83 -0
  15. package/plugin/commands/omgdata/validate.md +76 -0
  16. package/plugin/commands/omgdata/version.md +85 -0
  17. package/plugin/commands/omgdeploy/ab.md +94 -0
  18. package/plugin/commands/omgdeploy/cloud.md +89 -0
  19. package/plugin/commands/omgdeploy/edge.md +93 -0
  20. package/plugin/commands/omgdeploy/package.md +91 -0
  21. package/plugin/commands/omgdeploy/serve.md +92 -0
  22. package/plugin/commands/omgfeature/embed.md +93 -0
  23. package/plugin/commands/omgfeature/extract.md +93 -0
  24. package/plugin/commands/omgfeature/select.md +85 -0
  25. package/plugin/commands/omgfeature/store.md +97 -0
  26. package/plugin/commands/omgml/init.md +60 -0
  27. package/plugin/commands/omgml/status.md +82 -0
  28. package/plugin/commands/omgops/drift.md +87 -0
  29. package/plugin/commands/omgops/monitor.md +99 -0
  30. package/plugin/commands/omgops/pipeline.md +102 -0
  31. package/plugin/commands/omgops/registry.md +109 -0
  32. package/plugin/commands/omgops/retrain.md +91 -0
  33. package/plugin/commands/omgoptim/distill.md +90 -0
  34. package/plugin/commands/omgoptim/profile.md +92 -0
  35. package/plugin/commands/omgoptim/prune.md +81 -0
  36. package/plugin/commands/omgoptim/quantize.md +83 -0
  37. package/plugin/commands/omgtrain/baseline.md +78 -0
  38. package/plugin/commands/omgtrain/compare.md +99 -0
  39. package/plugin/commands/omgtrain/evaluate.md +85 -0
  40. package/plugin/commands/omgtrain/train.md +81 -0
  41. package/plugin/commands/omgtrain/tune.md +89 -0
  42. package/plugin/registry.yaml +252 -2
  43. package/plugin/skills/ml-systems/SKILL.md +65 -0
  44. package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
  45. package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
  46. package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
  47. package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
  48. package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
  49. package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
  50. package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
  51. package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
  52. package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
  53. package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
  54. package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
  55. package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
  56. package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
  57. package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
  58. package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
  59. package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
  60. package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
  61. package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
  62. package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
  63. package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
  64. package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
  65. package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
  66. package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
  67. package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
  68. package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
  69. package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
  70. package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
  71. package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
  72. package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
  73. package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
@@ -0,0 +1,416 @@
1
+ ---
2
+ name: Model Evaluation Workflow
3
+ description: Comprehensive model evaluation workflow including performance metrics, error analysis, fairness assessment, and production readiness checks.
4
+ category: ml-systems
5
+ complexity: medium
6
+ agents:
7
+ - experiment-analyst-agent
8
+ - data-scientist-agent
9
+ ---
10
+
11
+ # Model Evaluation Workflow
12
+
13
+ Comprehensive evaluation of ML models before deployment.
14
+
15
+ ## Overview
16
+
17
+ ```
18
+ ┌─────────────────────────────────────────────────────────────┐
19
+ │ MODEL EVALUATION WORKFLOW │
20
+ ├─────────────────────────────────────────────────────────────┤
21
+ │ │
22
+ │ 1. PERFORMANCE 2. ERROR 3. FAIRNESS │
23
+ │ METRICS ANALYSIS ASSESSMENT │
24
+ │ ↓ ↓ ↓ │
25
+ │ Accuracy/F1 Confusion matrix Demographic parity │
26
+ │ ROC/PR curves Failure patterns Equalized odds │
27
+ │ Calibration Edge cases Bias detection │
28
+ │ │
29
+ │ 4. ROBUSTNESS 5. EXPLAIN- 6. PRODUCTION │
30
+ │ TESTING ABILITY READINESS │
31
+ │ ↓ ↓ ↓ │
32
+ │ Adversarial SHAP values Latency check │
33
+ │ Distribution shift Feature import. Memory footprint │
34
+ │ Noise sensitivity Model cards Integration test │
35
+ │ │
36
+ └─────────────────────────────────────────────────────────────┘
37
+ ```
38
+
39
+ ## Steps
40
+
41
+ ### Step 1: Performance Metrics
42
+ **Agent**: experiment-analyst-agent
43
+
44
+ **Inputs**:
45
+ - Trained model
46
+ - Test dataset
47
+ - Metric requirements
48
+
49
+ **Actions**:
50
+ ```bash
51
+ # Comprehensive evaluation
52
+ /omgtrain:evaluate --model model.pt --data test.csv --report full
53
+ ```
54
+
55
+ ```python
56
import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, log_loss, brier_score_loss,
)


def expected_calibration_error(y_true, y_prob, n_bins=10):
    """Expected Calibration Error for binary positive-class probabilities.

    Bins predictions by confidence and returns the bin-weighted mean
    absolute gap between confidence and observed accuracy.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (y_prob > lo) & (y_prob <= hi)
        if mask.any():
            confidence = y_prob[mask].mean()
            accuracy = (y_true[mask] == (y_prob[mask] > 0.5)).mean()
            ece += mask.mean() * abs(confidence - accuracy)
    return ece


def comprehensive_metrics(y_true, y_pred, y_prob):
    """Compute classification, probability, and calibration metrics.

    Args:
        y_true: ground-truth labels, shape (n_samples,).
        y_pred: hard predictions, shape (n_samples,).
        y_prob: class probabilities, shape (n_samples, n_classes).

    Returns:
        dict of metric name -> value. Binary-only metrics (PR-AUC, Brier,
        ECE) are included only when n_classes == 2.
    """
    y_prob = np.asarray(y_prob)
    n_classes = y_prob.shape[1]
    metrics = {}

    # Threshold-based classification metrics (weighted to handle class imbalance).
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['f1'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    # Probability-based metrics. Binary scorers expect the positive-class column,
    # so branch explicitly instead of mixing binary and multiclass call forms.
    if n_classes == 2:
        pos_prob = y_prob[:, 1]
        metrics['roc_auc'] = roc_auc_score(y_true, pos_prob)
        metrics['pr_auc'] = average_precision_score(y_true, pos_prob)
        # Calibration metrics are only well-defined for the binary positive class.
        metrics['brier_score'] = brier_score_loss(y_true, pos_prob)
        metrics['ece'] = expected_calibration_error(y_true, pos_prob)
    else:
        metrics['roc_auc'] = roc_auc_score(y_true, y_prob, multi_class='ovr')
    metrics['log_loss'] = log_loss(y_true, y_prob)

    return metrics
80
+ ```
81
+
82
+ **Outputs**:
83
+ - Complete metrics report
84
+ - ROC/PR curves
85
+ - Calibration plots
86
+
87
+ ### Step 2: Error Analysis
88
+ **Agent**: data-scientist-agent
89
+
90
+ **Inputs**:
91
+ - Predictions
92
+ - Ground truth
93
+ - Feature data
94
+
95
+ **Actions**:
96
+ ```python
97
from collections import Counter

import numpy as np
from scipy import stats
from sklearn.metrics import confusion_matrix


def error_analysis(model, X_test, y_test, feature_names):
    """Characterize where and why a classifier fails on held-out data.

    Args:
        model: fitted classifier exposing predict / predict_proba.
        X_test: feature matrix, shape (n_samples, n_features).
        y_test: ground-truth labels, shape (n_samples,).
        feature_names: names aligned with X_test columns.

    Returns:
        dict with overall error_rate, confusion_matrix, per_class_errors,
        error_patterns (features whose distribution differs significantly
        between errors and correct predictions), and confident_errors
        (misclassifications made with predicted probability > 0.9).
    """
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)

    errors = y_pred != y_test
    error_indices = np.where(errors)[0]

    analysis = {
        'error_rate': errors.mean(),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'per_class_errors': {},
        'error_patterns': [],
        # Dict from the start: it is always overwritten with a dict below.
        'confident_errors': {},
    }

    # Per-class breakdown, including which classes each one is confused with.
    for cls in np.unique(y_test):
        mask = y_test == cls
        analysis['per_class_errors'][cls] = {
            'count': int(mask.sum()),
            'error_rate': errors[mask].mean(),
            'most_confused_with': Counter(y_pred[mask & errors]).most_common(3),
        }

    # High-confidence errors are the most concerning failure mode.
    max_prob = y_prob.max(axis=1)
    confident_errors = error_indices[max_prob[error_indices] > 0.9]
    analysis['confident_errors'] = {
        'count': len(confident_errors),
        'examples': confident_errors[:10].tolist(),
    }

    # Flag features whose values differ significantly on errors vs. hits.
    for i, feature in enumerate(feature_names):
        error_values = X_test[error_indices, i]
        correct_values = X_test[~errors, i]

        # Mann-Whitney U needs a minimum sample size to be meaningful.
        if len(error_values) > 10:
            stat, p_value = stats.mannwhitneyu(error_values, correct_values)
            if p_value < 0.01:
                analysis['error_patterns'].append({
                    'feature': feature,
                    'p_value': p_value,
                    'error_mean': error_values.mean(),
                    'correct_mean': correct_values.mean(),
                })

    return analysis
145
+ ```
146
+
147
+ **Outputs**:
148
+ - Error patterns
149
+ - Confusion analysis
150
+ - High-confidence errors
151
+
152
+ ### Step 3: Fairness Assessment
153
+ **Agent**: data-scientist-agent
154
+
155
+ **Inputs**:
156
+ - Predictions
157
+ - Sensitive attributes
158
+ - Fairness criteria
159
+
160
+ **Actions**:
161
+ ```python
162
from sklearn.metrics import accuracy_score, precision_score, recall_score


def fairness_assessment(y_true, y_pred, y_prob, sensitive_attrs):
    """Evaluate group fairness of predictions across sensitive attributes.

    Args:
        y_true: ground-truth labels.
        y_pred: hard predictions.
        y_prob: predicted probabilities (accepted for interface symmetry
            with the other evaluation steps; not used by the current
            group metrics).
        sensitive_attrs: mapping of attribute name -> per-sample group values.

    Returns:
        dict with per-attribute 'detailed' metrics, a 'passed' flag checked
        against the demographic-parity threshold, and the 'threshold' used.
    """
    from fairlearn.metrics import (
        demographic_parity_difference,
        equalized_odds_difference,
        MetricFrame,
    )

    results = {}

    for attr_name, attr_values in sensitive_attrs.items():
        # Per-group metric table for this sensitive attribute.
        metric_frame = MetricFrame(
            metrics={
                'accuracy': accuracy_score,
                'precision': precision_score,
                'recall': recall_score,
                # Fraction of positive predictions within each group.
                'selection_rate': lambda y_t, y_p: y_p.mean(),
            },
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=attr_values,
        )

        results[attr_name] = {
            'by_group': metric_frame.by_group.to_dict(),
            'overall': metric_frame.overall.to_dict(),
            'demographic_parity_diff': demographic_parity_difference(
                y_true, y_pred, sensitive_features=attr_values
            ),
            'equalized_odds_diff': equalized_odds_difference(
                y_true, y_pred, sensitive_features=attr_values
            ),
        }

    # Pass only if every attribute's demographic-parity gap is under threshold.
    fairness_passed = all(
        abs(r['demographic_parity_diff']) < 0.1
        for r in results.values()
    )

    return {
        'detailed': results,
        'passed': fairness_passed,
        'threshold': 0.1,
    }
207
+ ```
208
+
209
+ **Outputs**:
210
+ - Fairness metrics
211
+ - Group disparities
212
+ - Recommendations
213
+
214
+ ### Step 4: Robustness Testing
215
+ **Agent**: research-scientist-agent
216
+
217
+ **Inputs**:
218
+ - Model
219
+ - Test data
220
+ - Perturbation types
221
+
222
+ **Actions**:
223
+ ```python
224
import numpy as np
from sklearn.metrics import accuracy_score


def robustness_testing(model, X_test, y_test, random_state=None):
    """Probe model stability under input noise, feature dropout, and uncertainty.

    Args:
        model: fitted classifier exposing predict / predict_proba.
        X_test: feature matrix, shape (n_samples, n_features).
        y_test: ground-truth labels.
        random_state: optional seed so the noise perturbations — and therefore
            the reported robustness numbers — are reproducible.

    Returns:
        dict with accuracy under Gaussian noise levels, per-feature accuracy
        drop when that feature is zeroed, and prediction-entropy statistics
        (a cheap proxy for out-of-distribution detection).
    """
    rng = np.random.default_rng(random_state)
    results = {}

    # Accuracy degradation under increasing Gaussian input noise.
    noise_levels = [0.01, 0.05, 0.1, 0.2]
    results['noise'] = {}
    for noise in noise_levels:
        X_noisy = X_test + rng.normal(0, noise, X_test.shape)
        y_pred = model.predict(X_noisy)
        results['noise'][noise] = accuracy_score(y_test, y_pred)

    # Accuracy drop when each feature is ablated (zeroed) in turn.
    results['feature_dropout'] = {}
    baseline_acc = accuracy_score(y_test, model.predict(X_test))
    for i in range(X_test.shape[1]):
        X_dropped = X_test.copy()
        X_dropped[:, i] = 0
        acc = accuracy_score(y_test, model.predict(X_dropped))
        results['feature_dropout'][i] = baseline_acc - acc

    # Out-of-distribution proxy: entropy of the predictive distribution.
    # The epsilon guards against log(0) for confident (near-one-hot) rows.
    y_prob = model.predict_proba(X_test)
    entropy = -np.sum(y_prob * np.log(y_prob + 1e-10), axis=1)
    results['entropy_stats'] = {
        'mean': entropy.mean(),
        'std': entropy.std(),
        'high_entropy_pct': (entropy > 0.5).mean(),
    }

    return results
255
+ ```
256
+
257
+ **Outputs**:
258
+ - Noise sensitivity
259
+ - Feature importance
260
+ - OOD detection
261
+
262
+ ### Step 5: Explainability
263
+ **Agent**: data-scientist-agent
264
+
265
+ **Inputs**:
266
+ - Model
267
+ - Sample data
268
+ - Explanation requirements
269
+
270
+ **Actions**:
271
+ ```python
272
import numpy as np
import shap


def model_explainability(model, X_train, X_test, feature_names, metrics=None):
    """Generate SHAP explanations and a model card for a fitted tree model.

    Args:
        model: fitted tree-based model (compatible with shap.TreeExplainer).
        X_train: training features (used only for dataset stats in the card).
        X_test: features to explain.
        feature_names: names aligned with the feature columns.
        metrics: optional dict of real evaluation metrics to embed in the
            model card. Defaults to placeholder values — callers should pass
            measured metrics rather than ship the placeholders.

    Returns:
        dict with raw 'shap_values', global 'importance' ranking, and
        the 'model_card'.
    """
    # SHAP values quantify each feature's per-prediction contribution.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Global importance = mean absolute SHAP value across samples.
    global_importance = np.abs(shap_values).mean(axis=0)
    importance_ranking = sorted(
        zip(feature_names, global_importance),
        key=lambda item: -item[1],
    )

    # Side effect: renders the SHAP summary plot (matplotlib figure).
    shap.summary_plot(shap_values, X_test, feature_names=feature_names)

    model_card = {
        'model_details': {
            'type': type(model).__name__,
            'framework': 'sklearn',
            'version': '1.0',
        },
        'intended_use': {
            'primary_use': 'Classification',
            'users': 'Data science team',
            'limitations': 'Not tested on populations outside training distribution',
        },
        # Use supplied metrics; the defaults are placeholders, not measurements.
        'metrics': metrics if metrics is not None else {'accuracy': 0.92, 'auc': 0.95},
        'training_data': {
            'size': len(X_train),
            'features': len(feature_names),
        },
        'ethical_considerations': {
            'fairness_tested': True,
            'sensitive_features': ['age', 'gender'],
        },
        'top_features': importance_ranking[:10],
    }

    return {
        'shap_values': shap_values,
        'importance': importance_ranking,
        'model_card': model_card,
    }
321
+ ```
322
+
323
+ **Outputs**:
324
+ - SHAP values
325
+ - Feature importance
326
+ - Model card
327
+
328
+ ### Step 6: Production Readiness
329
+ **Agent**: production-engineer-agent
330
+
331
+ **Inputs**:
332
+ - Model
333
+ - Production requirements
334
+ - Infrastructure constraints
335
+
336
+ **Actions**:
337
+ ```python
338
import os
import pickle
import tempfile
import time

import numpy as np


def production_readiness_check(model, X_sample, requirements):
    """Check a model against production SLAs: latency, artifact size, serialization.

    Args:
        model: fitted model exposing predict().
        X_sample: representative feature matrix; its first row drives the
            latency benchmark.
        requirements: dict with 'max_latency_ms' and 'max_size_mb' thresholds.

    Returns:
        dict with latency percentiles, serialized size, integration status,
        and an overall 'ready' boolean.
    """
    results = {
        'latency': {},
        # NOTE(review): 'memory' is kept for interface compatibility but is
        # not populated here — the serialized size below is the proxy used.
        'memory': {},
        'size': {},
        'integration': {},
    }

    # Single-row latency over repeated calls; tail percentiles matter for SLAs.
    latencies = []
    for _ in range(100):
        start = time.perf_counter()
        model.predict(X_sample[:1])
        latencies.append((time.perf_counter() - start) * 1000)

    results['latency'] = {
        'p50': np.percentile(latencies, 50),
        'p95': np.percentile(latencies, 95),
        'p99': np.percentile(latencies, 99),
        'meets_sla': np.percentile(latencies, 99) < requirements['max_latency_ms'],
    }

    # Serialized artifact size as the deployable-footprint check.
    model_bytes = len(pickle.dumps(model))
    results['size'] = {
        'bytes': model_bytes,
        'mb': model_bytes / 1024 / 1024,
        'meets_limit': model_bytes < requirements['max_size_mb'] * 1024 * 1024,
    }

    # Serialization round-trip. Use a unique temp file (no fixed /tmp path),
    # close handles deterministically, and always remove the artifact.
    try:
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as tmp:
            model_path = tmp.name
            pickle.dump(model, tmp)
        try:
            with open(model_path, 'rb') as fh:
                loaded = pickle.load(fh)
            loaded.predict(X_sample[:1])
            results['integration']['serialization'] = 'passed'
        finally:
            os.unlink(model_path)
    except Exception as e:
        results['integration']['serialization'] = f'failed: {str(e)}'

    results['ready'] = all([
        results['latency']['meets_sla'],
        results['size']['meets_limit'],
        results['integration']['serialization'] == 'passed',
    ])

    return results
389
+ ```
390
+
391
+ **Outputs**:
392
+ - Latency benchmarks
393
+ - Memory footprint
394
+ - Production readiness
395
+
396
+ ## Artifacts
397
+
398
+ - `evaluation_report.json` - Complete metrics
399
+ - `error_analysis.json` - Error patterns
400
+ - `fairness_report.json` - Bias assessment
401
+ - `model_card.md` - Model documentation
402
+ - `visualizations/` - Plots and charts
403
+
404
+ ## Next Workflows
405
+
406
+ After evaluation:
407
+ - → **model-optimization-workflow** if performance insufficient
408
+ - → **model-deployment-workflow** if ready for production
409
+
410
+ ## Quality Gates
411
+
412
+ - [ ] All steps completed successfully
413
+ - [ ] Metrics meet defined thresholds
414
+ - [ ] Documentation updated
415
+ - [ ] Artifacts versioned and stored
416
+ - [ ] Stakeholder approval obtained