omgkit 2.20.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -10
- package/package.json +1 -1
- package/plugin/agents/ai-architect-agent.md +282 -0
- package/plugin/agents/data-scientist-agent.md +221 -0
- package/plugin/agents/experiment-analyst-agent.md +318 -0
- package/plugin/agents/ml-engineer-agent.md +165 -0
- package/plugin/agents/mlops-engineer-agent.md +324 -0
- package/plugin/agents/model-optimizer-agent.md +287 -0
- package/plugin/agents/production-engineer-agent.md +360 -0
- package/plugin/agents/research-scientist-agent.md +274 -0
- package/plugin/commands/omgdata/augment.md +86 -0
- package/plugin/commands/omgdata/collect.md +81 -0
- package/plugin/commands/omgdata/label.md +83 -0
- package/plugin/commands/omgdata/split.md +83 -0
- package/plugin/commands/omgdata/validate.md +76 -0
- package/plugin/commands/omgdata/version.md +85 -0
- package/plugin/commands/omgdeploy/ab.md +94 -0
- package/plugin/commands/omgdeploy/cloud.md +89 -0
- package/plugin/commands/omgdeploy/edge.md +93 -0
- package/plugin/commands/omgdeploy/package.md +91 -0
- package/plugin/commands/omgdeploy/serve.md +92 -0
- package/plugin/commands/omgfeature/embed.md +93 -0
- package/plugin/commands/omgfeature/extract.md +93 -0
- package/plugin/commands/omgfeature/select.md +85 -0
- package/plugin/commands/omgfeature/store.md +97 -0
- package/plugin/commands/omgml/init.md +60 -0
- package/plugin/commands/omgml/status.md +82 -0
- package/plugin/commands/omgops/drift.md +87 -0
- package/plugin/commands/omgops/monitor.md +99 -0
- package/plugin/commands/omgops/pipeline.md +102 -0
- package/plugin/commands/omgops/registry.md +109 -0
- package/plugin/commands/omgops/retrain.md +91 -0
- package/plugin/commands/omgoptim/distill.md +90 -0
- package/plugin/commands/omgoptim/profile.md +92 -0
- package/plugin/commands/omgoptim/prune.md +81 -0
- package/plugin/commands/omgoptim/quantize.md +83 -0
- package/plugin/commands/omgtrain/baseline.md +78 -0
- package/plugin/commands/omgtrain/compare.md +99 -0
- package/plugin/commands/omgtrain/evaluate.md +85 -0
- package/plugin/commands/omgtrain/train.md +81 -0
- package/plugin/commands/omgtrain/tune.md +89 -0
- package/plugin/registry.yaml +252 -2
- package/plugin/skills/ml-systems/SKILL.md +65 -0
- package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
- package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
- package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
- package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
- package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
- package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
- package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
- package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
- package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
- package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
- package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
- package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
- package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
- package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
- package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
- package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
- package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
- package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
- package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
- package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
- package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
- package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
- package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
- package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
- package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
- package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
- package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
- package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
- package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
- package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: MLOps Pipeline Workflow
|
|
3
|
+
description: Complete MLOps pipeline workflow covering CI/CD for ML, automated testing, model validation, and continuous deployment.
|
|
4
|
+
category: ml-systems
|
|
5
|
+
complexity: medium
|
|
6
|
+
agents:
|
|
7
|
+
- mlops-engineer-agent
|
|
8
|
+
- production-engineer-agent
- ml-engineer-agent
- experiment-analyst-agent
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# MLOps Pipeline Workflow
|
|
12
|
+
|
|
13
|
+
End-to-end MLOps automation pipeline.
|
|
14
|
+
|
|
15
|
+
## Overview
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
19
|
+
│ MLOPS PIPELINE WORKFLOW │
|
|
20
|
+
├─────────────────────────────────────────────────────────────┤
|
|
21
|
+
│ │
|
|
22
|
+
│ TRIGGER CI PIPELINE CD PIPELINE │
|
|
23
|
+
│ ─────── ─────────── ─────────── │
|
|
24
|
+
│ Push/PR Lint & Test Deploy Staging │
|
|
25
|
+
│ Schedule Train Model Validate │
|
|
26
|
+
│ Manual Validate Deploy Prod │
|
|
27
|
+
│ │
|
|
28
|
+
│ ┌─────────────────────────────────────────────────────────┐│
|
|
29
|
+
│ │ Data Change → Feature Eng → Training → Evaluation → ││
|
|
30
|
+
│ │ Registration → Staging → Validation → Production ││
|
|
31
|
+
│ └─────────────────────────────────────────────────────────┘│
|
|
32
|
+
│ │
|
|
33
|
+
└─────────────────────────────────────────────────────────────┘
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Pipeline Configuration
|
|
37
|
+
|
|
38
|
+
### GitHub Actions Workflow
|
|
39
|
+
```yaml
|
|
40
|
+
# .github/workflows/ml-pipeline.yml
name: ML Pipeline

# Triggers: code/model/data pushes, PRs into main, a weekly schedule,
# and manual dispatch with a target environment.
on:
  push:
    branches:
      - main
      - develop
    paths:
      - 'src/**'
      - 'models/**'
      - 'data/**'
  pull_request:
    branches:
      - main
  schedule:
    # Weekly retraining
    - cron: '0 2 * * 0'
  workflow_dispatch:
    inputs:
      environment:
        description: 'Target environment'
        required: true
        default: 'staging'

# Shared by every job; both values come from repository secrets.
env:
  MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_URI }}
  MODEL_REGISTRY: ${{ secrets.MODEL_REGISTRY }}
|
|
64
|
+
|
|
65
|
+
jobs:
|
|
66
|
+
# Static checks and unit tests; every later job depends on this one.
lint-and-test:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.10'

    - name: Install dependencies
      run: pip install -r requirements.txt -r requirements-dev.txt

    - name: Lint
      run: |
        black --check src/
        # Use the explicit `check` subcommand: the bare `ruff <path>` form
        # is deprecated and rejected by newer Ruff releases.
        ruff check src/
        mypy src/

    - name: Unit tests
      run: pytest tests/unit/ -v --cov=src --cov-report=xml
|
|
87
|
+
|
|
88
|
+
# Pulls the DVC-tracked data and runs the data-quality checks.
data-validation:
  runs-on: ubuntu-latest
  needs: lint-and-test
  steps:
    - uses: actions/checkout@v3

    # Each job starts on a fresh runner, so the tools used below have to be
    # installed here as well.
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.10'

    # NOTE(review): assumes dvc and great_expectations are listed in
    # requirements.txt - confirm, or install them explicitly here.
    - name: Install dependencies
      run: pip install -r requirements.txt

    - name: Pull data
      run: dvc pull

    - name: Validate data
      run: |
        python -m great_expectations checkpoint run data_quality
        python scripts/validate_data.py
|
|
101
|
+
|
|
102
|
+
# Trains on the GPU runner, registers the model, and exposes the MLflow run
# id and registered model version to downstream jobs.
train:
  runs-on: [self-hosted, gpu]
  needs: data-validation
  outputs:
    run_id: ${{ steps.train.outputs.run_id }}
    # model_version is written by the "Register model" step, so it must be
    # read from that step's id; reading it from `steps.train` is always empty.
    model_version: ${{ steps.register.outputs.model_version }}
  steps:
    - uses: actions/checkout@v3

    - name: Train model
      id: train
      run: |
        RUN_ID=$(python train.py --config configs/production.yaml)
        echo "run_id=$RUN_ID" >> "$GITHUB_OUTPUT"

    - name: Register model
      id: register
      run: |
        VERSION=$(python scripts/register_model.py --run-id ${{ steps.train.outputs.run_id }})
        echo "model_version=$VERSION" >> "$GITHUB_OUTPUT"
|
|
121
|
+
|
|
122
|
+
# Evaluates the freshly trained run and fails the pipeline on regression.
evaluate:
  runs-on: ubuntu-latest
  needs: train
  steps:
    # Each job starts on a fresh runner: without a checkout, evaluate.py and
    # scripts/ are not present.
    - uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.10'

    - name: Install dependencies
      run: pip install -r requirements.txt

    - name: Evaluate model
      run: |
        python evaluate.py --run-id ${{ needs.train.outputs.run_id }}
        python scripts/check_regression.py --baseline production
|
|
130
|
+
|
|
131
|
+
# Rolls the registered model version out to the staging cluster.
deploy-staging:
  runs-on: ubuntu-latest
  # `train` is listed explicitly because the `needs` context only exposes
  # outputs of direct dependencies; depending on `evaluate` alone leaves
  # needs.train.outputs.model_version empty.
  needs: [train, evaluate]
  environment: staging
  steps:
    # k8s/staging/ and tests/smoke/ live in the repository.
    - uses: actions/checkout@v3

    - name: Deploy to staging
      run: |
        kubectl apply -f k8s/staging/
        kubectl set image deployment/ml-model \
          model=registry/ml-model:${{ needs.train.outputs.model_version }}

    - name: Smoke tests
      run: python tests/smoke/test_staging.py
|
|
144
|
+
|
|
145
|
+
# Exercises the staging deployment: integration, load, and shadow tests.
validate-staging:
  runs-on: ubuntu-latest
  needs: deploy-staging
  env:
    # The load test reads $STAGING_URL, which nothing else in this workflow
    # defines. NOTE(review): assumes a repository variable named STAGING_URL
    # exists - confirm or adjust the source of this value.
    STAGING_URL: ${{ vars.STAGING_URL }}
  steps:
    # Each job starts on a fresh runner: tests/ and scripts/ need a checkout.
    - uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.10'

    # NOTE(review): assumes pytest and locust come from requirements-dev.txt.
    - name: Install dependencies
      run: pip install -r requirements.txt -r requirements-dev.txt

    - name: Integration tests
      run: pytest tests/integration/ -v

    - name: Load tests
      run: |
        locust -f tests/load/locustfile.py \
          --headless -u 100 -r 10 -t 5m \
          --host "$STAGING_URL"

    - name: Shadow comparison
      run: python scripts/shadow_compare.py --duration 1h
|
|
160
|
+
|
|
161
|
+
# Canary release to production, promoted only after the canary stays healthy.
deploy-production:
  runs-on: ubuntu-latest
  # `train` is a direct dependency so that needs.train.outputs.model_version
  # resolves; the `needs` context does not include transitive dependencies.
  needs: [train, validate-staging]
  if: github.ref == 'refs/heads/main'
  environment: production
  steps:
    # k8s/production/ and scripts/ live in the repository.
    - uses: actions/checkout@v3

    - name: Deploy canary
      run: |
        kubectl apply -f k8s/production/canary.yaml
        kubectl set image deployment/ml-model-canary \
          model=registry/ml-model:${{ needs.train.outputs.model_version }}

    - name: Monitor canary
      run: |
        python scripts/monitor_canary.py \
          --duration 30m \
          --error-threshold 0.01

    - name: Promote to production
      run: |
        kubectl apply -f k8s/production/
        kubectl set image deployment/ml-model \
          model=registry/ml-model:${{ needs.train.outputs.model_version }}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Steps
|
|
187
|
+
|
|
188
|
+
### Step 1: Continuous Integration
|
|
189
|
+
**Agent**: mlops-engineer-agent
|
|
190
|
+
|
|
191
|
+
**Components**:
|
|
192
|
+
```bash
|
|
193
|
+
# Setup CI pipeline
|
|
194
|
+
/omgops:pipeline --type ci --template ml-standard
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
**Tests**:
|
|
198
|
+
```python
|
|
199
|
+
# tests/unit/test_model.py
|
|
200
|
+
def test_model_forward():
    """Check that a forward pass maps a (1, 100) input to a (1, 10) output."""
    net = load_model()
    batch = torch.randn(1, 100)
    assert net(batch).shape == (1, 10)
|
|
205
|
+
|
|
206
|
+
# tests/unit/test_preprocessing.py
|
|
207
|
+
def test_preprocessing_pipeline():
    """Check that preprocessing a raw text record yields an ``input_ids`` entry."""
    features = preprocess({"text": "Hello world"})
    assert "input_ids" in features
|
|
211
|
+
|
|
212
|
+
# tests/integration/test_endpoint.py
|
|
213
|
+
def test_prediction_endpoint():
    """Check that POST ``/predict`` answers 200 with a ``prediction`` field."""
    # Without a timeout, requests waits indefinitely on an unresponsive
    # endpoint and the CI job hangs instead of failing.
    response = requests.post(f"{API_URL}/predict", json=sample_input, timeout=10)
    assert response.status_code == 200
    assert "prediction" in response.json()
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### Step 2: Continuous Training
|
|
220
|
+
**Agent**: ml-engineer-agent
|
|
221
|
+
|
|
222
|
+
**Training Automation**:
|
|
223
|
+
```python
|
|
224
|
+
# scripts/train.py
|
|
225
|
+
import mlflow
|
|
226
|
+
import argparse
|
|
227
|
+
|
|
228
|
+
def main():
    """Train a model from a config file and record the run in MLflow.

    Reads the required ``--config`` argument, loads that configuration, and
    inside a single MLflow run logs the parameters, trains on the configured
    train/val data, logs the validation metrics and stores the model under
    the ``"model"`` artifact path.

    Returns:
        The MLflow run id, which is also printed to stdout.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--config', required=True)
    config = load_config(cli.parse_args().config)

    with mlflow.start_run() as run:
        run_id = run.info.run_id

        # Record the configuration before any work is done.
        mlflow.log_params(config)

        data_cfg = config['data']
        train_data = load_data(data_cfg['train'])
        val_data = load_data(data_cfg['val'])

        model = train(train_data, val_data, config)

        mlflow.log_metrics(evaluate(model, val_data))
        mlflow.pytorch.log_model(model, "model")

        print(run_id)

    return run_id


if __name__ == '__main__':
    main()
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Step 3: Model Validation
|
|
261
|
+
**Agent**: experiment-analyst-agent
|
|
262
|
+
|
|
263
|
+
**Validation Gates**:
|
|
264
|
+
```python
|
|
265
|
+
# scripts/check_regression.py
|
|
266
|
+
def check_regression(new_run_id, baseline='production'):
    """Compare a new run's model against the baseline model and gate on it.

    Both models are loaded through MLflow and evaluated on the same test
    data. The new model passes when accuracy drops by at most 0.01, latency
    grows by at most 10%, and F1 drops by at most 0.02.

    Args:
        new_run_id: MLflow run id whose ``model`` artifact is the candidate.
        baseline: Stage or version of ``my_model`` in the registry to compare
            against.

    Raises:
        SystemExit: With status 1 when any check fails, after printing a
            per-check summary.
    """
    # Imported here so the snippet does not depend on a module-level import
    # that is not shown.
    import mlflow

    # NOTE(review): `evaluate` and `test_data` are not defined in this
    # snippet; they are assumed to be provided at module scope - confirm.
    baseline_model = mlflow.pyfunc.load_model(f"models:/my_model/{baseline}")
    baseline_metrics = evaluate(baseline_model, test_data)

    new_model = mlflow.pyfunc.load_model(f"runs:/{new_run_id}/model")
    new_metrics = evaluate(new_model, test_data)

    checks = {
        'accuracy': new_metrics['accuracy'] >= baseline_metrics['accuracy'] - 0.01,
        'latency': new_metrics['latency'] <= baseline_metrics['latency'] * 1.1,
        'f1': new_metrics['f1'] >= baseline_metrics['f1'] - 0.02,
    }

    if all(checks.values()):
        print("All checks passed!")
        return

    print("Regression detected!")
    for check, passed in checks.items():
        print(f" {check}: {'✓' if passed else '✗'}")
    # Same effect as sys.exit(1), without needing `sys` in scope (the
    # original referenced `sys` without importing it).
    raise SystemExit(1)
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### Step 4: Continuous Deployment
|
|
292
|
+
**Agent**: mlops-engineer-agent
|
|
293
|
+
|
|
294
|
+
**Deployment Strategy**:
|
|
295
|
+
```python
|
|
296
|
+
class DeploymentManager:
    """Drive canary rollout, promotion and rollback of the ``ml-model`` deployment.

    Every cluster operation is delegated to the injected client, which must
    provide ``apply(manifest)``, ``set_image(target, image)``,
    ``delete(target)`` and ``rollout_undo(target)``.
    """

    def __init__(self, k8s_client):
        self.k8s = k8s_client

    @staticmethod
    def _traffic_split_manifest(canary_pct):
        """Return the VirtualService manifest sending ``canary_pct``% to the canary."""
        return "\n".join((
            "apiVersion: networking.istio.io/v1alpha3",
            "kind: VirtualService",
            "spec:",
            "  http:",
            "  - route:",
            "    - destination:",
            "        host: ml-model",
            f"      weight: {100 - canary_pct}",
            "    - destination:",
            "        host: ml-model-canary",
            f"      weight: {canary_pct}",
            "",
        ))

    def canary_deploy(self, model_version, canary_pct=10):
        """Deploy ``model_version`` as a canary and route part of the traffic to it.

        Args:
            model_version: Image tag of ``registry/ml-model`` to run as canary.
            canary_pct: Percentage of traffic (0-100) routed to the canary.

        Raises:
            ValueError: If ``canary_pct`` is outside 0-100; such a value
                would produce a negative route weight.
        """
        if not 0 <= canary_pct <= 100:
            raise ValueError(
                f"canary_pct must be between 0 and 100, got {canary_pct!r}"
            )

        # NOTE(review): both manifests are abbreviated (no selector/labels on
        # the Deployment, no metadata/hosts on the VirtualService) - complete
        # them before applying to a real cluster.
        self.k8s.apply("\n".join((
            "apiVersion: apps/v1",
            "kind: Deployment",
            "metadata:",
            "  name: ml-model-canary",
            "spec:",
            "  replicas: 1",
            "  template:",
            "    spec:",
            "      containers:",
            "      - name: model",
            f"        image: registry/ml-model:{model_version}",
            "",
        )))

        self.k8s.apply(self._traffic_split_manifest(canary_pct))

    def promote(self, model_version):
        """Move production to ``model_version`` and retire the canary."""
        self.k8s.set_image('deployment/ml-model', f'model=registry/ml-model:{model_version}')

        # Send all traffic back to the stable deployment before deleting the
        # canary; otherwise the split keeps routing to a deleted deployment.
        self.k8s.apply(self._traffic_split_manifest(0))

        self.k8s.delete('deployment/ml-model-canary')

    def rollback(self):
        """Undo the latest rollout of the production deployment."""
        self.k8s.rollout_undo('deployment/ml-model')
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
### Step 5: Observability
|
|
343
|
+
**Agent**: mlops-engineer-agent
|
|
344
|
+
|
|
345
|
+
**Pipeline Metrics**:
|
|
346
|
+
```python
|
|
347
|
+
# Prometheus metrics for pipeline
|
|
348
|
+
from prometheus_client import Counter, Histogram, Gauge
|
|
349
|
+
|
|
350
|
+
# Pipeline executions, labelled by outcome via the `status` label.
PIPELINE_RUNS = Counter(
    name='mlops_pipeline_runs_total',
    documentation='Pipeline executions',
    labelnames=['status'],
)
# Duration of a whole pipeline run, in seconds.
PIPELINE_DURATION = Histogram(
    name='mlops_pipeline_duration_seconds',
    documentation='Pipeline duration',
)
# Number of model versions deployed.
MODEL_VERSIONS = Counter(
    name='mlops_model_versions_total',
    documentation='Model versions deployed',
)
# Duration of the training stage, in seconds.
TRAINING_TIME = Histogram(
    name='mlops_training_duration_seconds',
    documentation='Training duration',
)
|
|
354
|
+
|
|
355
|
+
# Dashboard metrics
|
|
356
|
+
# PromQL expressions backing the pipeline dashboard, keyed by panel name.
pipeline_metrics = {
    # increase() over 24h counts runs per day; rate() would give a
    # per-second average instead.
    'runs_per_day': 'sum(increase(mlops_pipeline_runs_total[24h]))',
    'success_rate': 'sum(mlops_pipeline_runs_total{status="success"}) / sum(mlops_pipeline_runs_total)',
    # A Histogram exports only _bucket/_sum/_count series, so the mean
    # duration is sum / count; the bare metric name matches no series.
    'avg_duration': 'sum(mlops_pipeline_duration_seconds_sum) / sum(mlops_pipeline_duration_seconds_count)',
    'deployments_per_week': 'sum(increase(mlops_model_versions_total[7d]))',
}
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
## Artifacts
|
|
365
|
+
|
|
366
|
+
- `.github/workflows/` - CI/CD definitions
|
|
367
|
+
- `scripts/` - Pipeline scripts
|
|
368
|
+
- `configs/` - Configuration files
|
|
369
|
+
- `k8s/` - Kubernetes manifests
|
|
370
|
+
- `tests/` - Test suites
|
|
371
|
+
|
|
372
|
+
## Next Workflows
|
|
373
|
+
|
|
374
|
+
After the MLOps pipeline:
|
|
375
|
+
- → **monitoring-drift-workflow** for production monitoring
|
|
376
|
+
- → **retraining-workflow** for automated retraining
|
|
377
|
+
|
|
378
|
+
## Quality Gates
|
|
379
|
+
|
|
380
|
+
- [ ] All steps completed successfully
|
|
381
|
+
- [ ] Metrics meet defined thresholds
|
|
382
|
+
- [ ] Documentation updated
|
|
383
|
+
- [ ] Artifacts versioned and stored
|
|
384
|
+
- [ ] Stakeholder approval obtained
|