omgkit 2.20.0 → 2.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -10
- package/package.json +1 -1
- package/plugin/agents/ai-architect-agent.md +282 -0
- package/plugin/agents/data-scientist-agent.md +221 -0
- package/plugin/agents/experiment-analyst-agent.md +318 -0
- package/plugin/agents/ml-engineer-agent.md +165 -0
- package/plugin/agents/mlops-engineer-agent.md +324 -0
- package/plugin/agents/model-optimizer-agent.md +287 -0
- package/plugin/agents/production-engineer-agent.md +360 -0
- package/plugin/agents/research-scientist-agent.md +274 -0
- package/plugin/commands/omgdata/augment.md +86 -0
- package/plugin/commands/omgdata/collect.md +81 -0
- package/plugin/commands/omgdata/label.md +83 -0
- package/plugin/commands/omgdata/split.md +83 -0
- package/plugin/commands/omgdata/validate.md +76 -0
- package/plugin/commands/omgdata/version.md +85 -0
- package/plugin/commands/omgdeploy/ab.md +94 -0
- package/plugin/commands/omgdeploy/cloud.md +89 -0
- package/plugin/commands/omgdeploy/edge.md +93 -0
- package/plugin/commands/omgdeploy/package.md +91 -0
- package/plugin/commands/omgdeploy/serve.md +92 -0
- package/plugin/commands/omgfeature/embed.md +93 -0
- package/plugin/commands/omgfeature/extract.md +93 -0
- package/plugin/commands/omgfeature/select.md +85 -0
- package/plugin/commands/omgfeature/store.md +97 -0
- package/plugin/commands/omgml/init.md +60 -0
- package/plugin/commands/omgml/status.md +82 -0
- package/plugin/commands/omgops/drift.md +87 -0
- package/plugin/commands/omgops/monitor.md +99 -0
- package/plugin/commands/omgops/pipeline.md +102 -0
- package/plugin/commands/omgops/registry.md +109 -0
- package/plugin/commands/omgops/retrain.md +91 -0
- package/plugin/commands/omgoptim/distill.md +90 -0
- package/plugin/commands/omgoptim/profile.md +92 -0
- package/plugin/commands/omgoptim/prune.md +81 -0
- package/plugin/commands/omgoptim/quantize.md +83 -0
- package/plugin/commands/omgtrain/baseline.md +78 -0
- package/plugin/commands/omgtrain/compare.md +99 -0
- package/plugin/commands/omgtrain/evaluate.md +85 -0
- package/plugin/commands/omgtrain/train.md +81 -0
- package/plugin/commands/omgtrain/tune.md +89 -0
- package/plugin/registry.yaml +252 -2
- package/plugin/skills/ml-systems/SKILL.md +65 -0
- package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
- package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
- package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
- package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
- package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
- package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
- package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
- package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
- package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
- package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
- package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
- package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
- package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
- package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
- package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
- package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
- package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
- package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
- package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
- package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
- package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
- package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
- package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
- package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
- package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
- package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
- package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
- package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
- package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
- package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0

package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md
@@ -0,0 +1,103 @@
---
name: ml-systems-fundamentals
description: Core ML systems concepts including ML lifecycle, system architecture, requirements, and design principles for production ML.
---

# ML Systems Fundamentals

Foundation concepts for building production ML systems.

## ML System Architecture

```
┌─────────────────────────────────────────────────────────────┐
│                   ML SYSTEM ARCHITECTURE                     │
├─────────────────────────────────────────────────────────────┤
│                                                               │
│  DATA LAYER                                                   │
│  ├── Data Collection      ├── Data Storage                    │
│  ├── Data Processing      └── Feature Store                   │
│                                                               │
│  MODEL LAYER                                                  │
│  ├── Training Pipeline    ├── Experiment Tracking             │
│  ├── Model Registry       └── Evaluation                      │
│                                                               │
│  SERVING LAYER                                                │
│  ├── Model Serving        ├── Feature Serving                 │
│  ├── Prediction Cache     └── Load Balancing                  │
│                                                               │
│  MONITORING LAYER                                             │
│  ├── Data Monitoring      ├── Model Monitoring                │
│  ├── System Metrics       └── Alerting                        │
│                                                               │
└─────────────────────────────────────────────────────────────┘
```

## ML Lifecycle

1. **Problem Definition** - Business goal → ML task
2. **Data Collection** - Gather relevant data
3. **Data Processing** - Clean, transform, validate
4. **Feature Engineering** - Create informative features
5. **Model Development** - Train, tune, evaluate
6. **Deployment** - Serve predictions
7. **Monitoring** - Track performance
8. **Iteration** - Improve based on feedback
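
A minimal sketch of how the first five stages chain together (scikit-learn and pandas assumed; the CSV path, column names, and the `label` target are placeholders, not part of this skill):

```python
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def collect_data() -> pd.DataFrame:
    # 2. Data Collection - placeholder source
    return pd.read_csv("data/raw.csv")

def process_data(df: pd.DataFrame) -> pd.DataFrame:
    # 3. Data Processing - drop missing values and duplicates
    return df.dropna().drop_duplicates()

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    # 4. Feature Engineering - one illustrative derived column
    df = df.copy()
    df["amount_per_visit"] = df["amount"] / df["visits"].clip(lower=1)
    return df

def develop_model(df: pd.DataFrame):
    # 5. Model Development - train and evaluate a simple candidate
    X, y = df.drop(columns=["label"]), df["label"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    return model, f1_score(y_test, model.predict(X_test), average="macro")

model, f1 = develop_model(engineer_features(process_data(collect_data())))
print(f"Candidate F1: {f1:.3f}")
# 6.-8. Deployment, monitoring, and iteration are covered by the
# deployment and MLOps skills in this plugin.
```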

## System Requirements

### Reliability
- Handle failures gracefully
- Maintain prediction quality
- Provide consistent latency

### Scalability
- Handle growing data
- Support more requests
- Enable parallel training

### Maintainability
- Easy to update models
- Clear documentation
- Reproducible experiments

### Adaptability
- Respond to data changes
- Support new features
- Enable quick iterations
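
One way to make the reliability requirements concrete is a serving wrapper that fails soft and flags latency-budget violations; a sketch only, where the fallback value and the 50 ms budget are illustrative assumptions:

```python
import logging
import time

logger = logging.getLogger("serving")

class ResilientPredictor:
    """Wraps a model so failures degrade gracefully instead of erroring out."""

    def __init__(self, model, fallback_value, latency_budget_ms=50):
        self.model = model
        self.fallback_value = fallback_value        # e.g. the majority class
        self.latency_budget_ms = latency_budget_ms  # consistency target

    def predict(self, features):
        start = time.perf_counter()
        try:
            prediction = self.model.predict([features])[0]
        except Exception:
            # Reliability: never fail the request; fall back and alert
            logger.exception("model inference failed, using fallback")
            prediction = self.fallback_value
        elapsed_ms = (time.perf_counter() - start) * 1000
        if elapsed_ms > self.latency_budget_ms:
            # Consistent latency: surface budget violations to monitoring
            logger.warning("prediction took %.1f ms (budget %d ms)",
                           elapsed_ms, self.latency_budget_ms)
        return prediction
```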

## Design Principles

```python
# 1. Start Simple
from sklearn.linear_model import LogisticRegression

baseline = LogisticRegression()
baseline.fit(X_train, y_train)
print(f"Baseline: {baseline.score(X_test, y_test)}")

# 2. Data Quality > Model Complexity
def validate_data(df):
    assert df.isnull().sum().sum() == 0, "missing values found"
    assert df.duplicated().sum() == 0, "duplicate rows found"
    return True

# 3. Version Everything
import mlflow
mlflow.log_param("model_version", "1.0.0")
mlflow.log_artifact("data/processed/")

# 4. Monitor Continuously
from scipy.stats import ks_2samp

def check_drift(reference, current):
    return ks_2samp(reference, current).pvalue < 0.05
```

## Commands
- `/omgml:init` - Initialize ML project
- `/omgml:status` - Project status

## Best Practices

1. Define clear success metrics
2. Establish baselines early
3. Invest in data quality
4. Automate everything possible
5. Monitor production models

package/plugin/skills/ml-systems/ml-workflow/SKILL.md
@@ -0,0 +1,162 @@
---
name: ml-workflow
description: ML development workflow covering experiment design, baseline establishment, iterative improvement, and experiment tracking best practices.
---

# ML Workflow

Systematic approach to ML model development.

## Development Lifecycle

```
┌─────────────────────────────────────────────────────────────┐
│                  ML DEVELOPMENT WORKFLOW                      │
├─────────────────────────────────────────────────────────────┤
│                                                               │
│   1. PROBLEM         2. BASELINE        3. EXPERIMENT         │
│      SETUP              MODEL              ITERATE            │
│        ↓                  ↓                   ↓               │
│   Define metrics     Simple model       Hypothesis            │
│   Success criteria   Benchmark          Test ideas            │
│   Constraints        Comparison         Track results         │
│                                                               │
│   4. EVALUATE        5. VALIDATE        6. DEPLOY             │
│        ↓                  ↓                   ↓               │
│   Full metrics       Production         Ship to prod          │
│   Error analysis     validation         Monitor               │
│   Fairness           A/B test           Iterate               │
│                                                               │
└─────────────────────────────────────────────────────────────┘
```
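
For stage 4, per-class metrics and a per-slice error breakdown usually say more than a single score; a small scikit-learn sketch (`y_test`/`y_pred` come from your evaluation split, and the `segment` grouping column is an assumed example):

```python
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Full metrics: per-class precision/recall/F1 plus the confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Error analysis: which data slices does the model get wrong?
errors = pd.DataFrame({
    "segment": X_test["segment"],   # assumed grouping column
    "correct": (y_test == y_pred),
})
print(errors.groupby("segment")["correct"].mean().sort_values())
```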

## Experiment Design

```python
import mlflow
from dataclasses import dataclass

@dataclass
class Experiment:
    name: str
    hypothesis: str
    metrics: list
    success_criteria: dict

experiment = Experiment(
    name="feature_engineering_v2",
    hypothesis="Adding temporal features improves prediction",
    metrics=["accuracy", "f1", "latency_ms"],
    success_criteria={"f1": 0.85, "latency_ms": 50}
)

# Track experiment
mlflow.set_experiment(experiment.name)
with mlflow.start_run():
    mlflow.log_param("hypothesis", experiment.hypothesis)
    # ... training code producing a `results` metrics dict ...
    mlflow.log_metrics(results)
```

## Baseline Models

```python
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

baselines = {
    "majority": DummyClassifier(strategy="most_frequent"),
    "logistic": LogisticRegression(),
    "random_forest": RandomForestClassifier(n_estimators=100)
}

results = {}
for name, model in baselines.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred, average="macro")
    }

# Best baseline
best = max(results.items(), key=lambda x: x[1]["f1"])
print(f"Best baseline: {best[0]} with F1={best[1]['f1']:.3f}")
```

## Experiment Tracking

```python
import mlflow
import mlflow.sklearn

# Start experiment
mlflow.set_tracking_uri("http://mlflow.example.com")
mlflow.set_experiment("churn_prediction")

with mlflow.start_run(run_name="xgboost_v3"):
    # Log parameters
    params = {
        "model_type": "xgboost",
        "max_depth": 6,
        "learning_rate": 0.1
    }
    mlflow.log_params(params)

    # Train model
    model = train_model(X_train, y_train, params)

    # Log metrics
    mlflow.log_metrics({
        "train_accuracy": train_acc,
        "val_accuracy": val_acc,
        "f1_score": f1
    })

    # Log model
    mlflow.sklearn.log_model(model, "model")

    # Log artifacts
    mlflow.log_artifact("feature_importance.png")
```

## Iterative Improvement

```python
class ExperimentIterator:
    def __init__(self, baseline_metrics):
        self.baseline = baseline_metrics
        self.experiments = []

    def run_experiment(self, name, model_fn, hypothesis):
        with mlflow.start_run(run_name=name):
            mlflow.log_param("hypothesis", hypothesis)
            model, metrics = model_fn()
            mlflow.log_metrics(metrics)

            # Delta vs. the baseline for every metric both runs share
            improvement = {k: metrics[k] - self.baseline[k]
                           for k in metrics if k in self.baseline}
            mlflow.log_metrics({f"{k}_improvement": v
                                for k, v in improvement.items()})

            self.experiments.append({
                "name": name,
                "hypothesis": hypothesis,
                "metrics": metrics,
                "improvement": improvement
            })

        return model, metrics
```

## Commands
- `/omgml:init` - Initialize project
- `/omgtrain:baseline` - Train baselines

## Best Practices

1. Always start with a baseline
2. Change one thing at a time
3. Track all experiments
4. Document hypotheses
5. Validate before deploying (see the sketch below)
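
A small sketch of practice 5, reusing the `success_criteria` dict from the Experiment Design section above; the convention that `*_ms` metrics are upper bounds and everything else a lower bound, and the candidate numbers, are illustrative assumptions:

```python
candidate_metrics = {"f1": 0.87, "latency_ms": 42}  # metrics from the candidate run

def meets_success_criteria(metrics: dict, criteria: dict) -> bool:
    """Return True only if every tracked metric satisfies its criterion."""
    for name, threshold in criteria.items():
        value = metrics.get(name)
        if value is None:
            return False
        if name.endswith("_ms"):
            # Latency-style metrics must stay at or below the threshold
            if value > threshold:
                return False
        elif value < threshold:
            # Quality metrics must meet or exceed the threshold
            return False
    return True

# Gate promotion on the agreed criteria
if meets_success_criteria(candidate_metrics, experiment.success_criteria):
    print("Validation passed - promote candidate")
else:
    print("Validation failed - keep iterating")
```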

package/plugin/skills/ml-systems/mlops/SKILL.md
@@ -0,0 +1,386 @@
---
name: mlops
description: MLOps practices including CI/CD for ML, experiment tracking, model monitoring, pipeline orchestration, and production ML operations.
---

# MLOps

Production ML operations and automation.

## MLOps Maturity Model

```
┌─────────────────────────────────────────────────────────────┐
│                   MLOPS MATURITY LEVELS                       │
├─────────────────────────────────────────────────────────────┤
│                                                               │
│   LEVEL 0            LEVEL 1            LEVEL 2               │
│   Manual             ML Pipeline        CI/CD for ML          │
│   ───────            ──────────         ──────────            │
│   Notebooks          Automated          Automated             │
│   Manual deploy      training           retraining            │
│   No monitoring      Basic pipeline     Full automation       │
│                                                               │
│   Components:                                                 │
│   ├── Version Control (Git, DVC)                              │
│   ├── Experiment Tracking (MLflow, W&B)                       │
│   ├── Feature Store (Feast, Tecton)                           │
│   ├── Model Registry (MLflow, Sagemaker)                      │
│   ├── Orchestration (Airflow, Kubeflow)                       │
│   └── Monitoring (Prometheus, Evidently)                      │
│                                                               │
└─────────────────────────────────────────────────────────────┘
```
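
The feature store is the one component in this list the sections below do not show; a minimal Feast sketch (the repo path, the `user_stats` feature view, and the `user_id` entity are placeholder names):

```python
from feast import FeatureStore

# Point at a Feast repo (feature definitions live in feature_store.yaml)
store = FeatureStore(repo_path=".")

# Online lookup at serving time
features = store.get_online_features(
    features=[
        "user_stats:avg_purchase",   # placeholder feature_view:feature
        "user_stats:visits_30d",
    ],
    entity_rows=[{"user_id": 1001}],
).to_dict()
```

The same feature definitions back `get_historical_features` when building training sets, which is where the train/serve consistency comes from.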

## Experiment Tracking

### MLflow Integration
```python
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# Set tracking server
mlflow.set_tracking_uri("http://mlflow.example.com:5000")
mlflow.set_experiment("churn_prediction")

# Start run with context manager
with mlflow.start_run(run_name="xgboost_v2") as run:
    # Log parameters
    params = {
        "model_type": "xgboost",
        "learning_rate": 0.1,
        "max_depth": 6,
        "n_estimators": 100
    }
    mlflow.log_params(params)

    # Train model
    model = train_model(params)

    # Log metrics
    mlflow.log_metrics({
        "accuracy": 0.92,
        "f1_score": 0.89,
        "auc_roc": 0.95
    })

    # Log artifacts
    mlflow.log_artifact("feature_importance.png")
    mlflow.log_artifact("confusion_matrix.png")

    # Log model
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="churn_predictor"
    )

    # Log custom metrics over time
    for epoch in range(100):
        mlflow.log_metric("loss", train_loss, step=epoch)

# Compare runs
client = MlflowClient()
runs = client.search_runs(
    experiment_ids=["1"],
    filter_string="metrics.f1_score > 0.85",
    order_by=["metrics.f1_score DESC"]
)
```

### Weights & Biases
```python
import wandb

wandb.init(
    project="ml-project",
    config={
        "learning_rate": 0.001,
        "architecture": "ResNet50",
        "epochs": 100
    }
)

# Log metrics
for epoch in range(100):
    wandb.log({
        "epoch": epoch,
        "loss": train_loss,
        "val_loss": val_loss,
        "accuracy": accuracy
    })

# Log images
wandb.log({"examples": [wandb.Image(img, caption=label) for img, label in samples]})

# Log model
wandb.save("model.pt")

# Hyperparameter sweeps
sweep_config = {
    "method": "bayes",
    "metric": {"name": "val_loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"min": 0.0001, "max": 0.1},
        "batch_size": {"values": [16, 32, 64]}
    }
}
sweep_id = wandb.sweep(sweep_config, project="ml-project")
wandb.agent(sweep_id, train_function, count=50)
```

## Model Registry

```python
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Register model
model_uri = f"runs:/{run_id}/model"
result = mlflow.register_model(model_uri, "production_model")

# Transition stages
client.transition_model_version_stage(
    name="production_model",
    version=result.version,
    stage="Staging"
)

# Add description and tags
client.update_model_version(
    name="production_model",
    version=result.version,
    description="XGBoost model trained on Q4 data"
)

client.set_model_version_tag(
    name="production_model",
    version=result.version,
    key="validation_status",
    value="passed"
)

# Load production model
model = mlflow.pyfunc.load_model("models:/production_model/Production")

# Compare versions
def compare_model_versions(model_name, version_a, version_b, test_data):
    model_a = mlflow.pyfunc.load_model(f"models:/{model_name}/{version_a}")
    model_b = mlflow.pyfunc.load_model(f"models:/{model_name}/{version_b}")

    metrics_a = evaluate(model_a, test_data)
    metrics_b = evaluate(model_b, test_data)

    return {
        "version_a": {"version": version_a, **metrics_a},
        "version_b": {"version": version_b, **metrics_b}
    }
```

## Pipeline Orchestration

### Airflow DAG
```python
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.sensors.filesystem import FileSensor
from datetime import datetime, timedelta

default_args = {
    'owner': 'ml-team',
    'depends_on_past': False,
    'email_on_failure': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'ml_training_pipeline',
    default_args=default_args,
    schedule_interval='@daily',
    start_date=datetime(2024, 1, 1),
    catchup=False
)

# Tasks
extract_data = PythonOperator(
    task_id='extract_data',
    python_callable=extract_training_data,
    dag=dag
)

validate_data = PythonOperator(
    task_id='validate_data',
    python_callable=validate_data_quality,
    dag=dag
)

train_model = PythonOperator(
    task_id='train_model',
    python_callable=train_and_log_model,
    dag=dag
)

evaluate_model = PythonOperator(
    task_id='evaluate_model',
    python_callable=evaluate_model_performance,
    dag=dag
)

deploy_model = PythonOperator(
    task_id='deploy_model',
    python_callable=deploy_to_production,
    dag=dag
)

# Dependencies
extract_data >> validate_data >> train_model >> evaluate_model >> deploy_model
```

### Kubeflow Pipeline
```python
from kfp import dsl
from kfp.components import create_component_from_func

@create_component_from_func
def preprocess_data(input_path: str, output_path: str):
    import pandas as pd
    df = pd.read_csv(input_path)
    # Preprocessing logic
    df.to_parquet(output_path)

@create_component_from_func
def train_model(data_path: str, model_path: str, hyperparameters: dict):
    import joblib
    from sklearn.ensemble import RandomForestClassifier
    # Training logic
    model = RandomForestClassifier(**hyperparameters)
    joblib.dump(model, model_path)

@dsl.pipeline(
    name='ML Training Pipeline',
    description='End-to-end ML training pipeline'
)
def ml_pipeline(input_data: str, hyperparameters: dict):
    preprocess_op = preprocess_data(input_data, '/tmp/processed.parquet')

    train_op = train_model(
        preprocess_op.output,
        '/tmp/model.joblib',
        hyperparameters
    )

    # Add GPU resources
    train_op.set_gpu_limit(1)
    train_op.set_memory_limit('8Gi')
```

## CI/CD for ML

```yaml
# .github/workflows/ml-pipeline.yml
name: ML Pipeline

on:
  push:
    paths:
      - 'src/**'
      - 'data/**'
  schedule:
    - cron: '0 0 * * 0'  # Weekly retraining

jobs:
  data-validation:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Validate data
        run: |
          python -m pytest tests/data_validation/
          dvc pull
          great_expectations checkpoint run data_quality

  train:
    needs: data-validation
    runs-on: [self-hosted, gpu]
    steps:
      - uses: actions/checkout@v3
      - name: Train model
        run: |
          python train.py --config configs/production.yaml
          mlflow run . -P epochs=100

  evaluate:
    needs: train
    runs-on: ubuntu-latest
    steps:
      - name: Evaluate model
        run: |
          python evaluate.py --model-version ${{ github.sha }}
          python check_performance_regression.py

  deploy:
    needs: evaluate
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - name: Deploy to staging
        run: |
          kubectl apply -f k8s/staging/
          python smoke_test.py --env staging

      - name: Deploy to production
        run: |
          kubectl apply -f k8s/production/
          python smoke_test.py --env production
```

## Data Version Control

```bash
# Initialize DVC
dvc init
dvc remote add -d storage s3://my-bucket/dvc-storage

# Track data files
dvc add data/training.csv
git add data/training.csv.dvc data/.gitignore
git commit -m "Add training data"

# Push data
dvc push

# Create pipeline
dvc run -n preprocess \
    -d src/preprocess.py -d data/raw.csv \
    -o data/processed.csv \
    python src/preprocess.py

dvc run -n train \
    -d src/train.py -d data/processed.csv \
    -o models/model.pkl \
    -M metrics.json \
    python src/train.py

# Reproduce pipeline
dvc repro

# Compare experiments
dvc exp run --set-param train.lr=0.001
dvc exp show
dvc exp diff
```

## Commands
- `/omgops:pipeline` - Pipeline management
- `/omgops:registry` - Model registry
- `/omgops:monitor` - System monitoring
- `/omgml:status` - Project status

## Best Practices

1. Version everything (code, data, models)
2. Automate training pipelines
3. Implement quality gates (see the sketch below)
4. Track all experiments
5. Use feature stores for consistency
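
A sketch of practice 3, in the spirit of the `check_performance_regression.py` step in the CI workflow above; the registry name, the shared `evaluate` helper from the Model Registry section, the higher-is-better assumption, and the 0.01 tolerance are all illustrative:

```python
import mlflow

def passes_quality_gate(candidate_metrics: dict, model_name: str,
                        test_data, tolerance: float = 0.01) -> bool:
    """Block promotion if the candidate regresses against current production."""
    production = mlflow.pyfunc.load_model(f"models:/{model_name}/Production")
    production_metrics = evaluate(production, test_data)  # same helper as above
    for metric, current_value in production_metrics.items():
        # Assumes higher is better for every tracked metric
        if candidate_metrics.get(metric, 0.0) < current_value - tolerance:
            print(f"Gate failed: {metric} regressed")
            return False
    return True
```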