omgkit 2.20.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +125 -10
  2. package/package.json +1 -1
  3. package/plugin/agents/ai-architect-agent.md +282 -0
  4. package/plugin/agents/data-scientist-agent.md +221 -0
  5. package/plugin/agents/experiment-analyst-agent.md +318 -0
  6. package/plugin/agents/ml-engineer-agent.md +165 -0
  7. package/plugin/agents/mlops-engineer-agent.md +324 -0
  8. package/plugin/agents/model-optimizer-agent.md +287 -0
  9. package/plugin/agents/production-engineer-agent.md +360 -0
  10. package/plugin/agents/research-scientist-agent.md +274 -0
  11. package/plugin/commands/omgdata/augment.md +86 -0
  12. package/plugin/commands/omgdata/collect.md +81 -0
  13. package/plugin/commands/omgdata/label.md +83 -0
  14. package/plugin/commands/omgdata/split.md +83 -0
  15. package/plugin/commands/omgdata/validate.md +76 -0
  16. package/plugin/commands/omgdata/version.md +85 -0
  17. package/plugin/commands/omgdeploy/ab.md +94 -0
  18. package/plugin/commands/omgdeploy/cloud.md +89 -0
  19. package/plugin/commands/omgdeploy/edge.md +93 -0
  20. package/plugin/commands/omgdeploy/package.md +91 -0
  21. package/plugin/commands/omgdeploy/serve.md +92 -0
  22. package/plugin/commands/omgfeature/embed.md +93 -0
  23. package/plugin/commands/omgfeature/extract.md +93 -0
  24. package/plugin/commands/omgfeature/select.md +85 -0
  25. package/plugin/commands/omgfeature/store.md +97 -0
  26. package/plugin/commands/omgml/init.md +60 -0
  27. package/plugin/commands/omgml/status.md +82 -0
  28. package/plugin/commands/omgops/drift.md +87 -0
  29. package/plugin/commands/omgops/monitor.md +99 -0
  30. package/plugin/commands/omgops/pipeline.md +102 -0
  31. package/plugin/commands/omgops/registry.md +109 -0
  32. package/plugin/commands/omgops/retrain.md +91 -0
  33. package/plugin/commands/omgoptim/distill.md +90 -0
  34. package/plugin/commands/omgoptim/profile.md +92 -0
  35. package/plugin/commands/omgoptim/prune.md +81 -0
  36. package/plugin/commands/omgoptim/quantize.md +83 -0
  37. package/plugin/commands/omgtrain/baseline.md +78 -0
  38. package/plugin/commands/omgtrain/compare.md +99 -0
  39. package/plugin/commands/omgtrain/evaluate.md +85 -0
  40. package/plugin/commands/omgtrain/train.md +81 -0
  41. package/plugin/commands/omgtrain/tune.md +89 -0
  42. package/plugin/registry.yaml +252 -2
  43. package/plugin/skills/ml-systems/SKILL.md +65 -0
  44. package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
  45. package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
  46. package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
  47. package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
  48. package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
  49. package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
  50. package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
  51. package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
  52. package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
  53. package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
  54. package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
  55. package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
  56. package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
  57. package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
  58. package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
  59. package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
  60. package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
  61. package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
  62. package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
  63. package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
  64. package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
  65. package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
  66. package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
  67. package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
  68. package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
  69. package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
  70. package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
  71. package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
  72. package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
  73. package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
@@ -0,0 +1,401 @@
1
+ ---
2
+ name: Retraining Workflow
3
+ description: Automated model retraining workflow triggered by drift, scheduled intervals, or manual requests with validation and safe deployment.
4
+ category: ml-systems
5
+ complexity: medium
6
+ agents:
7
+ - ml-engineer-agent
8
+ - mlops-engineer-agent
9
+ - experiment-analyst-agent
10
+ ---
11
+
12
+ # Retraining Workflow
13
+
14
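+ An automated pipeline that retrains a production model when a trigger fires, validates the retrained model against the current production model, and deploys it through a monitored canary rollout.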
15
+
16
+ ## Overview
17
+
18
+ ```
19
+ ┌─────────────────────────────────────────────────────────────┐
20
+ │                     RETRAINING WORKFLOW                     │
21
+ ├─────────────────────────────────────────────────────────────┤
22
+ │                                                             │
23
+ │  1. TRIGGER          2. DATA             3. RETRAIN         │
24
+ │     ↓                   REFRESH             ↓               │
25
+ │  Drift/Schedule      New data            Train model        │
26
+ │  Manual              Validate            Hyperparams        │
27
+ │  Performance         Feature eng         Checkpoints        │
28
+ │                                                             │
29
+ │  4. VALIDATE         5. COMPARE          6. DEPLOY          │
30
+ │     ↓                   ↓                   ↓               │
31
+ │  Quality gates       vs Production       Canary rollout     │
32
+ │  Regression test     A/B ready           Monitor            │
33
+ │  Fairness            Approval            Rollback ready     │
34
+ │                                                             │
35
+ └─────────────────────────────────────────────────────────────┘
36
+ ```
37
+
38
+ ## Trigger Conditions
39
+
40
+ ```python
41
class RetrainingTrigger:
    """Evaluate the conditions under which a model should be retrained.

    ``config`` is indexed with the keys ``drift_threshold``,
    ``min_accuracy``, ``data_threshold`` and ``retraining_interval_days``;
    a missing key raises ``KeyError``.

    ``is_scheduled`` calls ``self.get_last_training_time()``, which this
    class does not define -- a subclass or mixin has to provide it.
    """

    def __init__(self, config):
        self.config = config

    def should_retrain(self, metrics):
        """Return every trigger that fires for the given monitoring metrics.

        Args:
            metrics: Mapping of monitoring values. A missing key falls back
                to a neutral default, so an absent metric never fires its
                trigger.

        Returns:
            A list of ``{'reason': ..., 'priority': ...}`` dicts in the
            fixed order scheduled, data_drift, performance_drop, new_data,
            concept_drift. An empty list means nothing fired.
        """
        triggers = []

        # 1. Scheduled retraining
        if self.is_scheduled():
            triggers.append({'reason': 'scheduled', 'priority': 'normal'})

        # 2. Drift detected
        if metrics.get('drift_psi', 0) > self.config['drift_threshold']:
            triggers.append({'reason': 'data_drift', 'priority': 'high'})

        # 3. Performance degradation
        if metrics.get('accuracy', 1) < self.config['min_accuracy']:
            triggers.append({'reason': 'performance_drop', 'priority': 'critical'})

        # 4. New data volume
        if metrics.get('new_data_count', 0) > self.config['data_threshold']:
            triggers.append({'reason': 'new_data', 'priority': 'normal'})

        # 5. Concept drift
        if metrics.get('concept_drift_detected', False):
            triggers.append({'reason': 'concept_drift', 'priority': 'high'})

        return triggers

    def is_scheduled(self):
        """Return True when the configured retraining interval has elapsed.

        A ``None`` last-training time is treated as "never trained", so a
        scheduled retraining is considered due.
        """
        last_training = self.get_last_training_time()
        if last_training is None:
            return True

        # Take "now" with the same timezone-awareness as the stored
        # timestamp: subtracting a naive datetime from an aware one (or the
        # reverse) raises TypeError.
        now = datetime.now(last_training.tzinfo)
        days_since = (now - last_training).days
        return days_since >= self.config['retraining_interval_days']
74
+ ```
75
+
76
+ ## Steps
77
+
78
+ ### Step 1: Trigger
79
+ **Agent**: mlops-engineer-agent
80
+
81
+ **Actions**:
82
+ ```bash
83
+ # Trigger retraining
84
+ /omgops:retrain --reason drift --priority high
85
+ ```
86
+
87
+ ```python
88
+ # Airflow DAG trigger
89
def trigger_retraining(reason, priority='normal', triggered_by='drift_monitor'):
    """Trigger the ``model_retraining`` Airflow DAG and log the request.

    Args:
        reason: Why retraining is requested; sent as ``trigger_reason``.
        priority: Priority label forwarded to the DAG run.
        triggered_by: Name of the component or person requesting the run.
            Defaults to ``'drift_monitor'``, the value that was previously
            hard-coded, so scheduled or manual triggers can now identify
            themselves correctly.
    """
    # Imported lazily so that merely importing this module does not require
    # Airflow to be installed.
    from airflow.api.client.local_client import Client

    client = Client(None, None)

    # Trigger with context
    client.trigger_dag(
        dag_id='model_retraining',
        conf={
            'trigger_reason': reason,
            'priority': priority,
            'triggered_at': datetime.now().isoformat(),
            'triggered_by': triggered_by,
        },
    )

    # Log trigger
    log_retraining_trigger(reason, priority)
107
+ ```
108
+
109
+ ### Step 2: Data Refresh
110
+ **Agent**: ml-engineer-agent
111
+
112
+ **Actions**:
113
+ ```python
114
class DataRefreshManager:
    """Assemble a fresh, versioned train/val/test dataset for retraining.

    Uses ``validate_data``, ``split_data``, ``version_data`` and
    ``compute_stats`` methods that this class does not define; a subclass
    or mixin has to provide them.
    """

    def __init__(self, data_sources, feature_store):
        self.sources = data_sources
        self.feature_store = feature_store

    def refresh_training_data(self, lookback_days=90, val_days=7, test_days=7):
        """Fetch, validate, featurize, split and version recent data.

        Args:
            lookback_days: Size of the fetch window, counted back from now.
            val_days: Length of the validation window in days.
            test_days: Length of the test window (the most recent data).

        Returns:
            Dict with keys ``train``, ``val``, ``test``, ``version`` and
            ``stats`` (statistics computed on the training split).

        Raises:
            ValueError: If no data source yielded anything to combine.
            DataValidationError: If ``validate_data`` reports a failure.
        """
        # Take the reference time once so that every source sees the same
        # window and the split boundaries line up with the fetch window.
        now = datetime.now()
        window_start = now - timedelta(days=lookback_days)

        # 1. Collect new data
        frames = [
            source.fetch(start_date=window_start, end_date=now)
            for source in self.sources
        ]
        if not frames:
            raise ValueError("No data sources configured; nothing to refresh")

        combined = pd.concat(frames)

        # 2. Validate new data
        validation = self.validate_data(combined)
        if not validation['passed']:
            raise DataValidationError(validation['errors'])

        # 3. Engineer features
        features = self.feature_store.compute_features(combined)

        # 4. Create train/val/test splits
        # NOTE(review): if lookback_days <= val_days + test_days, train_end
        # falls at or before the start of the fetch window -- confirm how
        # split_data handles that.
        train, val, test = self.split_data(
            features,
            strategy='temporal',
            train_end=now - timedelta(days=val_days + test_days),
            val_end=now - timedelta(days=test_days),
        )

        # 5. Version data
        version = self.version_data(train, val, test)

        return {
            'train': train,
            'val': val,
            'test': test,
            'version': version,
            'stats': self.compute_stats(train),
        }
157
+ ```
158
+
159
+ **Outputs**:
160
+ - Fresh training data
161
+ - Validation report
162
+ - Data version
163
+
164
+ ### Step 3: Retrain
165
+ **Agent**: ml-engineer-agent
166
+
167
+ **Actions**:
168
+ ```python
169
class RetrainingPipeline:
    """Retrain a model inside an MLflow run, by fine-tuning or from scratch.

    Depends on names this class does not define: the ``train_model``
    function and the ``evaluate`` / ``tune_hyperparameters`` methods.
    NOTE(review): confirm where these are provided.
    """

    def __init__(self, config):
        self.config = config

    def retrain(self, train_data, val_data, production_model):
        """Train a new model and record it in MLflow.

        Fine-tunes ``production_model`` when ``config['fine_tune']`` is
        truthy (the default) and a production model is actually given;
        otherwise trains from scratch.

        Returns:
            Tuple ``(model, run_id)`` where ``run_id`` identifies the MLflow
            run the model was logged under.
        """
        run_name = f"retrain_{datetime.now().strftime('%Y%m%d')}"
        with mlflow.start_run(run_name=run_name):
            # Log context
            mlflow.log_params({
                'trigger_reason': self.config['trigger_reason'],
                'train_samples': len(train_data),
                'val_samples': len(val_data),
            })

            # Fine-tuning needs a model to start from. Without this guard a
            # missing production model would be passed to train_model as
            # None -- the same call shape train_from_scratch uses -- but
            # with the reduced learning rate and epoch cap of fine-tuning.
            can_fine_tune = production_model is not None
            if self.config.get('fine_tune', True) and can_fine_tune:
                # Option 1: Fine-tune from production
                model = self.fine_tune(production_model, train_data, val_data)
            else:
                # Option 2: Train from scratch
                model = self.train_from_scratch(train_data, val_data)

            # Log metrics
            val_metrics = self.evaluate(model, val_data)
            mlflow.log_metrics(val_metrics)

            # Save model
            mlflow.pytorch.log_model(model, "model")

            return model, mlflow.active_run().info.run_id

    def fine_tune(self, base_model, train_data, val_data):
        """Continue training ``base_model`` with a gentler schedule.

        Works on a copy of the config so the pipeline's own settings are
        not modified: learning rate is scaled by 0.1 and epochs are capped
        at 10.
        """
        # Lower learning rate for fine-tuning
        config = self.config.copy()
        config['learning_rate'] *= 0.1
        config['epochs'] = min(config['epochs'], 10)

        return train_model(base_model, train_data, val_data, config)

    def train_from_scratch(self, train_data, val_data):
        """Train a new model with no base model.

        When ``config['tune_hyperparams']`` is truthy, the result of
        ``tune_hyperparameters`` is passed to ``train_model`` in place of
        the pipeline config.
        """
        # Full training with hyperparameter optimization
        if self.config.get('tune_hyperparams', False):
            best_params = self.tune_hyperparameters(train_data, val_data)
            return train_model(None, train_data, val_data, best_params)

        return train_model(None, train_data, val_data, self.config)
214
+ ```
215
+
216
+ **Outputs**:
217
+ - Retrained model
218
+ - Training metrics
219
+ - MLflow run ID
220
+
221
+ ### Step 4: Validate
222
+ **Agent**: experiment-analyst-agent
223
+
224
+ **Actions**:
225
+ ```python
226
class RetrainingValidator:
    """Decide whether a retrained model may replace the production model.

    Depends on an ``evaluate`` function and a ``check_fairness`` method
    that this class does not define.
    NOTE(review): confirm where these are provided.
    """

    # Class-level defaults keep subclasses that override __init__ without
    # calling it working exactly as before.
    regression_tolerance = 0.01
    lower_is_better = frozenset()

    def __init__(self, quality_thresholds, regression_tolerance=0.01,
                 lower_is_better=()):
        """Store the validation settings.

        Args:
            quality_thresholds: Mapping of metric name to the threshold the
                new model must meet.
            regression_tolerance: How much worse than production a metric
                may be before it counts as a regression. Defaults to 0.01,
                the value that was previously hard-coded.
            lower_is_better: Names of metrics where a smaller value is
                better (for example a latency or an error rate). Empty by
                default, which treats every metric as higher-is-better.
        """
        self.thresholds = quality_thresholds
        self.regression_tolerance = regression_tolerance
        self.lower_is_better = frozenset(lower_is_better)

    def validate(self, new_model, test_data, production_model):
        """Run quality gates, a production comparison and fairness checks.

        Returns:
            Dict with keys ``quality_gates``, ``comparison``, ``fairness``
            and ``approved``. ``approved`` is truthy only when every gate
            passed, no metric regressed and the fairness result passed.

        Raises:
            KeyError: If a thresholded metric is missing from the new
                model's metrics, or a metric is missing from the
                production model's metrics.
        """
        results = {
            'quality_gates': {},
            'comparison': {},
            'fairness': {}
        }
        tolerance = self.regression_tolerance

        # 1. Quality gates
        metrics = evaluate(new_model, test_data)
        for metric, threshold in self.thresholds.items():
            value = metrics[metric]
            if metric in self.lower_is_better:
                passed = value <= threshold
            else:
                passed = value >= threshold
            results['quality_gates'][metric] = {
                'value': value,
                'threshold': threshold,
                'passed': passed
            }

        # 2. Comparison with production
        prod_metrics = evaluate(production_model, test_data)
        for metric in metrics:
            new_value = metrics[metric]
            prod_value = prod_metrics[metric]
            if metric in self.lower_is_better:
                # Positive improvement always means "better".
                improvement = prod_value - new_value
                regression = new_value > prod_value + tolerance
            else:
                improvement = new_value - prod_value
                regression = new_value < prod_value - tolerance
            results['comparison'][metric] = {
                'new': new_value,
                'production': prod_value,
                'improvement': improvement,
                'regression': regression
            }

        # 3. Fairness checks
        results['fairness'] = self.check_fairness(new_model, test_data)

        # 4. Overall decision
        results['approved'] = (
            all(g['passed'] for g in results['quality_gates'].values()) and
            not any(c['regression'] for c in results['comparison'].values()) and
            results['fairness']['passed']
        )

        return results
267
+ ```
268
+
269
+ **Outputs**:
270
+ - Validation results
271
+ - Comparison report
272
+ - Approval decision
273
+
274
+ ### Step 5: Compare
275
+ **Agent**: experiment-analyst-agent
276
+
277
+ **Actions**:
278
+ ```python
279
def generate_comparison_report(new_model_id, production_model_id, test_data,
                               improvement_threshold=0.01,
                               regression_threshold=-0.005):
    """Compare a retrained model with the production model on ``test_data``.

    Args:
        new_model_id: Identifier passed to ``load_model`` for the new model.
        production_model_id: Identifier passed to ``load_model`` for the
            production model.
        test_data: Data both models are evaluated on.
        improvement_threshold: Accuracy gain above which the report
            recommends approval as a significant improvement.
        regression_threshold: Lowest accuracy delta still reported as
            "performance maintained"; anything below is a rejection.

    Returns:
        Dict with keys ``summary``, ``detailed_metrics``,
        ``error_analysis``, ``recommendations`` and ``significance``.
    """
    new_model = load_model(new_model_id)
    prod_model = load_model(production_model_id)

    report = {
        'summary': {},
        'detailed_metrics': {},
        'error_analysis': {},
        'recommendations': []
    }

    # Metrics comparison
    new_metrics = comprehensive_evaluate(new_model, test_data)
    prod_metrics = comprehensive_evaluate(prod_model, test_data)

    report['summary'] = {
        'accuracy_delta': new_metrics['accuracy'] - prod_metrics['accuracy'],
        'f1_delta': new_metrics['f1'] - prod_metrics['f1'],
        'latency_delta': new_metrics['latency'] - prod_metrics['latency']
    }

    # The report declares this section, so fill it with the full metric
    # sets behind the summary deltas instead of returning it empty.
    report['detailed_metrics'] = {
        'new': new_metrics,
        'production': prod_metrics
    }

    # Statistical significance
    report['significance'] = statistical_comparison(
        new_model, prod_model, test_data
    )

    # Error analysis
    report['error_analysis'] = {
        'new_errors': find_new_errors(new_model, prod_model, test_data),
        'fixed_errors': find_fixed_errors(new_model, prod_model, test_data)
    }

    # Recommendations (based on the accuracy delta only)
    accuracy_delta = report['summary']['accuracy_delta']
    if accuracy_delta > improvement_threshold:
        recommendation = 'Approve: Significant accuracy improvement'
    elif accuracy_delta >= regression_threshold:
        recommendation = 'Approve: Performance maintained'
    else:
        recommendation = 'Reject: Performance regression detected'
    report['recommendations'].append(recommendation)

    return report
320
+ ```
321
+
322
+ **Outputs**:
323
+ - Comparison report
324
+ - Significance tests
325
+ - Deployment recommendation
326
+
327
+ ### Step 6: Deploy
328
+ **Agent**: mlops-engineer-agent
329
+
330
+ **Actions**:
331
+ ```bash
332
+ # Deploy retrained model
333
+ /omgdeploy:cloud --model <run_id> --strategy canary --monitor-duration 2h
334
+ ```
335
+
336
+ ```python
337
class RetrainingDeployment:
    """Deploy an approved retrained model through a canary and staged rollout.

    Uses ``deploy_canary``, ``monitor_canary``, ``update_traffic``,
    ``check_health``, ``promote_to_production`` and ``rollback`` methods
    that this class does not define; a subclass or mixin has to provide
    them.
    """

    # Rollout parameters, overridable per subclass or instance.
    CANARY_TRAFFIC_PCT = 5
    CANARY_MONITOR_HOURS = 2
    ROLLOUT_STEPS = (10, 25, 50, 100)
    STEP_WAIT_SECONDS = 600  # 10 min between steps

    def deploy(self, model_run_id, validation_results):
        """Register and roll out the model from ``model_run_id``.

        Args:
            model_run_id: MLflow run ID whose ``model`` artifact is deployed.
            validation_results: Mapping with an ``approved`` entry. A
                missing or falsy entry blocks the deployment.

        Returns:
            ``{'deployed': True, 'version': ...}`` on success, otherwise
            ``{'deployed': False, 'reason': ...}`` with reason
            ``validation_failed``, ``canary_failed`` or ``rollout_failed``.
        """
        # Fail closed: results without an explicit approval must never
        # deploy, and must not crash with KeyError either.
        if not validation_results.get('approved', False):
            logging.warning("Model not approved, skipping deployment")
            return {'deployed': False, 'reason': 'validation_failed'}

        # 1. Register model
        model_version = mlflow.register_model(
            f"runs:/{model_run_id}/model",
            "production_model"
        )
        version = model_version.version

        try:
            failure_reason = self._roll_out(version)
        except Exception:
            # An unexpected error mid-rollout must not leave part of the
            # traffic on the new version.
            self.rollback()
            raise

        if failure_reason is not None:
            self.rollback()
            return {'deployed': False, 'reason': failure_reason}

        return {'deployed': True, 'version': version}

    def _roll_out(self, version):
        """Run canary, staged rollout and promotion; return a failure reason or None."""
        # 2. Deploy canary
        self.deploy_canary(version, traffic_pct=self.CANARY_TRAFFIC_PCT)

        # 3. Monitor canary
        monitoring_results = self.monitor_canary(
            duration_hours=self.CANARY_MONITOR_HOURS
        )
        if not monitoring_results['healthy']:
            return 'canary_failed'

        # 4. Gradual rollout
        for pct in self.ROLLOUT_STEPS:
            self.update_traffic(version, pct)
            time.sleep(self.STEP_WAIT_SECONDS)

            health = self.check_health()
            if not health['ok']:
                return 'rollout_failed'

        # 5. Promote to production
        self.promote_to_production(version)
        return None
374
+ ```
375
+
376
+ **Outputs**:
377
+ - Deployed model
378
+ - Deployment metrics
379
+ - Rollback capability
380
+
381
+ ## Artifacts
382
+
383
+ - `retraining/` - Retraining configs
384
+ - `data_versions/` - Versioned datasets
385
+ - `models/` - Model artifacts
386
+ - `reports/` - Comparison reports
387
+ - `logs/` - Pipeline logs
388
+
389
+ ## Next Workflows
390
+
391
+ After retraining:
392
+ - → **monitoring-drift-workflow** for ongoing monitoring
393
+ - → **model-evaluation-workflow** for deep analysis
394
+
395
+ ## Quality Gates
396
+
397
+ - [ ] All steps completed successfully
398
+ - [ ] Metrics meet defined thresholds
399
+ - [ ] Documentation updated
400
+ - [ ] Artifacts versioned and stored
401
+ - [ ] Stakeholder approval obtained