omgkit 2.20.0 → 2.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +125 -10
  2. package/package.json +1 -1
  3. package/plugin/agents/ai-architect-agent.md +282 -0
  4. package/plugin/agents/data-scientist-agent.md +221 -0
  5. package/plugin/agents/experiment-analyst-agent.md +318 -0
  6. package/plugin/agents/ml-engineer-agent.md +165 -0
  7. package/plugin/agents/mlops-engineer-agent.md +324 -0
  8. package/plugin/agents/model-optimizer-agent.md +287 -0
  9. package/plugin/agents/production-engineer-agent.md +360 -0
  10. package/plugin/agents/research-scientist-agent.md +274 -0
  11. package/plugin/commands/omgdata/augment.md +86 -0
  12. package/plugin/commands/omgdata/collect.md +81 -0
  13. package/plugin/commands/omgdata/label.md +83 -0
  14. package/plugin/commands/omgdata/split.md +83 -0
  15. package/plugin/commands/omgdata/validate.md +76 -0
  16. package/plugin/commands/omgdata/version.md +85 -0
  17. package/plugin/commands/omgdeploy/ab.md +94 -0
  18. package/plugin/commands/omgdeploy/cloud.md +89 -0
  19. package/plugin/commands/omgdeploy/edge.md +93 -0
  20. package/plugin/commands/omgdeploy/package.md +91 -0
  21. package/plugin/commands/omgdeploy/serve.md +92 -0
  22. package/plugin/commands/omgfeature/embed.md +93 -0
  23. package/plugin/commands/omgfeature/extract.md +93 -0
  24. package/plugin/commands/omgfeature/select.md +85 -0
  25. package/plugin/commands/omgfeature/store.md +97 -0
  26. package/plugin/commands/omgml/init.md +60 -0
  27. package/plugin/commands/omgml/status.md +82 -0
  28. package/plugin/commands/omgops/drift.md +87 -0
  29. package/plugin/commands/omgops/monitor.md +99 -0
  30. package/plugin/commands/omgops/pipeline.md +102 -0
  31. package/plugin/commands/omgops/registry.md +109 -0
  32. package/plugin/commands/omgops/retrain.md +91 -0
  33. package/plugin/commands/omgoptim/distill.md +90 -0
  34. package/plugin/commands/omgoptim/profile.md +92 -0
  35. package/plugin/commands/omgoptim/prune.md +81 -0
  36. package/plugin/commands/omgoptim/quantize.md +83 -0
  37. package/plugin/commands/omgtrain/baseline.md +78 -0
  38. package/plugin/commands/omgtrain/compare.md +99 -0
  39. package/plugin/commands/omgtrain/evaluate.md +85 -0
  40. package/plugin/commands/omgtrain/train.md +81 -0
  41. package/plugin/commands/omgtrain/tune.md +89 -0
  42. package/plugin/registry.yaml +252 -2
  43. package/plugin/skills/ml-systems/SKILL.md +65 -0
  44. package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
  45. package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
  46. package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
  47. package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
  48. package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
  49. package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
  50. package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
  51. package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
  52. package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
  53. package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
  54. package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
  55. package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
  56. package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
  57. package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
  58. package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
  59. package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
  60. package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
  61. package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
  62. package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
  63. package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
  64. package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
  65. package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
  66. package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
  67. package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
  68. package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
  69. package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
  70. package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
  71. package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
  72. package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
  73. package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
@@ -0,0 +1,103 @@
1
+ ---
2
+ name: ml-systems-fundamentals
3
+ description: Core ML systems concepts including ML lifecycle, system architecture, requirements, and design principles for production ML.
4
+ ---
5
+
6
+ # ML Systems Fundamentals
7
+
8
+ Foundation concepts for building production ML systems.
9
+
10
+ ## ML System Architecture
11
+
12
+ ```
13
+ ┌─────────────────────────────────────────────────────────────┐
14
+ │ ML SYSTEM ARCHITECTURE │
15
+ ├─────────────────────────────────────────────────────────────┤
16
+ │ │
17
+ │ DATA LAYER │
18
+ │ ├── Data Collection ├── Data Storage │
19
+ │ ├── Data Processing └── Feature Store │
20
+ │ │
21
+ │ MODEL LAYER │
22
+ │ ├── Training Pipeline ├── Experiment Tracking │
23
+ │ ├── Model Registry └── Evaluation │
24
+ │ │
25
+ │ SERVING LAYER │
26
+ │ ├── Model Serving ├── Feature Serving │
27
+ │ ├── Prediction Cache └── Load Balancing │
28
+ │ │
29
+ │ MONITORING LAYER │
30
+ │ ├── Data Monitoring ├── Model Monitoring │
31
+ │ ├── System Metrics └── Alerting │
32
+ │ │
33
+ └─────────────────────────────────────────────────────────────┘
34
+ ```
35
+
36
+ ## ML Lifecycle
37
+
38
+ 1. **Problem Definition** - Business goal → ML task
39
+ 2. **Data Collection** - Gather relevant data
40
+ 3. **Data Processing** - Clean, transform, validate
41
+ 4. **Feature Engineering** - Create informative features
42
+ 5. **Model Development** - Train, tune, evaluate
43
+ 6. **Deployment** - Serve predictions
44
+ 7. **Monitoring** - Track performance
45
+ 8. **Iteration** - Improve based on feedback
46
+
47
+ ## System Requirements
48
+
49
+ ### Reliability
50
+ - Handle failures gracefully
51
+ - Maintain prediction quality
52
+ - Provide consistent latency
53
+
54
+ ### Scalability
55
+ - Handle growing data
56
+ - Support more requests
57
+ - Enable parallel training
58
+
59
+ ### Maintainability
60
+ - Easy to update models
61
+ - Clear documentation
62
+ - Reproducible experiments
63
+
64
+ ### Adaptability
65
+ - Respond to data changes
66
+ - Support new features
67
+ - Enable quick iterations
68
+
69
+ ## Design Principles
70
+
71
+ ```python
72
+ # 1. Start Simple
73
+ baseline = LogisticRegression()
74
+ baseline.fit(X_train, y_train)
75
+ print(f"Baseline: {baseline.score(X_test, y_test)}")
76
+
77
+ # 2. Data Quality > Model Complexity
78
+ def validate_data(df):
79
+ assert df.isnull().sum().sum() == 0
80
+ assert df.duplicated().sum() == 0
81
+ return True
82
+
83
+ # 3. Version Everything
84
+ import mlflow
85
+ mlflow.log_param("model_version", "1.0.0")
86
+ mlflow.log_artifact("data/processed/")
87
+
88
+ # 4. Monitor Continuously
89
+ def check_drift(reference, current):
90
+ return ks_2samp(reference, current).pvalue < 0.05
91
+ ```
92
+
93
+ ## Commands
94
+ - `/omgml:init` - Initialize ML project
95
+ - `/omgml:status` - Project status
96
+
97
+ ## Best Practices
98
+
99
+ 1. Define clear success metrics
100
+ 2. Establish baselines early
101
+ 3. Invest in data quality
102
+ 4. Automate everything possible
103
+ 5. Monitor production models
@@ -0,0 +1,162 @@
1
+ ---
2
+ name: ml-workflow
3
+ description: ML development workflow covering experiment design, baseline establishment, iterative improvement, and experiment tracking best practices.
4
+ ---
5
+
6
+ # ML Workflow
7
+
8
+ Systematic approach to ML model development.
9
+
10
+ ## Development Lifecycle
11
+
12
+ ```
13
+ ┌─────────────────────────────────────────────────────────────┐
14
+ │ ML DEVELOPMENT WORKFLOW │
15
+ ├─────────────────────────────────────────────────────────────┤
16
+ │ │
17
+ │ 1. PROBLEM 2. BASELINE 3. EXPERIMENT │
18
+ │ SETUP MODEL ITERATE │
19
+ │ ↓ ↓ ↓ │
20
+ │ Define metrics Simple model Hypothesis │
21
+ │ Success criteria Benchmark Test ideas │
22
+ │ Constraints Comparison Track results │
23
+ │ │
24
+ │ 4. EVALUATE 5. VALIDATE 6. DEPLOY │
25
+ │ ↓ ↓ ↓ │
26
+ │ Full metrics Production Ship to prod │
27
+ │ Error analysis validation Monitor │
28
+ │ Fairness A/B test Iterate │
29
+ │ │
30
+ └─────────────────────────────────────────────────────────────┘
31
+ ```
32
+
33
+ ## Experiment Design
34
+
35
+ ```python
36
+ import mlflow
37
+ from dataclasses import dataclass
38
+
39
+ @dataclass
40
+ class Experiment:
41
+ name: str
42
+ hypothesis: str
43
+ metrics: list
44
+ success_criteria: dict
45
+
46
+ experiment = Experiment(
47
+ name="feature_engineering_v2",
48
+ hypothesis="Adding temporal features improves prediction",
49
+ metrics=["accuracy", "f1", "latency_ms"],
50
+ success_criteria={"f1": 0.85, "latency_ms": 50}
51
+ )
52
+
53
+ # Track experiment
54
+ mlflow.set_experiment(experiment.name)
55
+ with mlflow.start_run():
56
+ mlflow.log_param("hypothesis", experiment.hypothesis)
57
+ # ... training code ...
58
+ mlflow.log_metrics(results)
59
+ ```
60
+
61
+ ## Baseline Models
62
+
63
+ ```python
64
+ from sklearn.dummy import DummyClassifier
65
+ from sklearn.linear_model import LogisticRegression
66
+ from sklearn.ensemble import RandomForestClassifier
67
+
68
+ baselines = {
69
+ "majority": DummyClassifier(strategy="most_frequent"),
70
+ "logistic": LogisticRegression(),
71
+ "random_forest": RandomForestClassifier(n_estimators=100)
72
+ }
73
+
74
+ results = {}
75
+ for name, model in baselines.items():
76
+ model.fit(X_train, y_train)
77
+ y_pred = model.predict(X_test)
78
+ results[name] = {
79
+ "accuracy": accuracy_score(y_test, y_pred),
80
+ "f1": f1_score(y_test, y_pred, average="macro")
81
+ }
82
+
83
+ # Best baseline
84
+ best = max(results.items(), key=lambda x: x[1]["f1"])
85
+ print(f"Best baseline: {best[0]} with F1={best[1]['f1']:.3f}")
86
+ ```
87
+
88
+ ## Experiment Tracking
89
+
90
+ ```python
91
+ import mlflow
92
+ import mlflow.sklearn
93
+
94
+ # Start experiment
95
+ mlflow.set_tracking_uri("http://mlflow.example.com")
96
+ mlflow.set_experiment("churn_prediction")
97
+
98
+ with mlflow.start_run(run_name="xgboost_v3"):
99
+ # Log parameters
100
+ mlflow.log_params({
101
+ "model_type": "xgboost",
102
+ "max_depth": 6,
103
+ "learning_rate": 0.1
104
+ })
105
+
106
+ # Train model
107
+ model = train_model(X_train, y_train, params)
108
+
109
+ # Log metrics
110
+ mlflow.log_metrics({
111
+ "train_accuracy": train_acc,
112
+ "val_accuracy": val_acc,
113
+ "f1_score": f1
114
+ })
115
+
116
+ # Log model
117
+ mlflow.sklearn.log_model(model, "model")
118
+
119
+ # Log artifacts
120
+ mlflow.log_artifact("feature_importance.png")
121
+ ```
122
+
123
+ ## Iterative Improvement
124
+
125
+ ```python
126
+ class ExperimentIterator:
127
+ def __init__(self, baseline_metrics):
128
+ self.baseline = baseline_metrics
129
+ self.experiments = []
130
+
131
+ def run_experiment(self, name, model_fn, hypothesis):
132
+ with mlflow.start_run(run_name=name):
133
+ mlflow.log_param("hypothesis", hypothesis)
134
+ model, metrics = model_fn()
135
+ mlflow.log_metrics(metrics)
136
+
137
+ improvement = {k: metrics[k] - self.baseline[k]
138
+ for k in metrics}
139
+ mlflow.log_metrics({f"{k}_improvement": v
140
+ for k, v in improvement.items()})
141
+
142
+ self.experiments.append({
143
+ "name": name,
144
+ "hypothesis": hypothesis,
145
+ "metrics": metrics,
146
+ "improvement": improvement
147
+ })
148
+
149
+ return model, metrics
150
+ ```
151
+
152
+ ## Commands
153
+ - `/omgml:init` - Initialize project
154
+ - `/omgtrain:baseline` - Train baselines
155
+
156
+ ## Best Practices
157
+
158
+ 1. Always start with a baseline
159
+ 2. Change one thing at a time
160
+ 3. Track all experiments
161
+ 4. Document hypotheses
162
+ 5. Validate before deploying
@@ -0,0 +1,386 @@
1
+ ---
2
+ name: mlops
3
+ description: MLOps practices including CI/CD for ML, experiment tracking, model monitoring, pipeline orchestration, and production ML operations.
4
+ ---
5
+
6
+ # MLOps
7
+
8
+ Production ML operations and automation.
9
+
10
+ ## MLOps Maturity Model
11
+
12
+ ```
13
+ ┌─────────────────────────────────────────────────────────────┐
14
+ │ MLOPS MATURITY LEVELS │
15
+ ├─────────────────────────────────────────────────────────────┤
16
+ │ │
17
+ │ LEVEL 0 LEVEL 1 LEVEL 2 │
18
+ │ Manual ML Pipeline CI/CD for ML │
19
+ │ ─────── ────────── ────────── │
20
+ │ Notebooks Automated Automated │
21
+ │ Manual deploy training retraining │
22
+ │ No monitoring Basic pipeline Full automation │
23
+ │ │
24
+ │ Components: │
25
+ │ ├── Version Control (Git, DVC) │
26
+ │ ├── Experiment Tracking (MLflow, W&B) │
27
+ │ ├── Feature Store (Feast, Tecton) │
28
+ │ ├── Model Registry (MLflow, Sagemaker) │
29
+ │ ├── Orchestration (Airflow, Kubeflow) │
30
+ │ └── Monitoring (Prometheus, Evidently) │
31
+ │ │
32
+ └─────────────────────────────────────────────────────────────┘
33
+ ```
34
+
35
+ ## Experiment Tracking
36
+
37
+ ### MLflow Integration
38
+ ```python
39
+ import mlflow
40
+ from mlflow.tracking import MlflowClient
41
+
42
+ # Set tracking server
43
+ mlflow.set_tracking_uri("http://mlflow.example.com:5000")
44
+ mlflow.set_experiment("churn_prediction")
45
+
46
+ # Start run with context manager
47
+ with mlflow.start_run(run_name="xgboost_v2") as run:
48
+ # Log parameters
49
+ mlflow.log_params({
50
+ "model_type": "xgboost",
51
+ "learning_rate": 0.1,
52
+ "max_depth": 6,
53
+ "n_estimators": 100
54
+ })
55
+
56
+ # Train model
57
+ model = train_model(params)
58
+
59
+ # Log metrics
60
+ mlflow.log_metrics({
61
+ "accuracy": 0.92,
62
+ "f1_score": 0.89,
63
+ "auc_roc": 0.95
64
+ })
65
+
66
+ # Log artifacts
67
+ mlflow.log_artifact("feature_importance.png")
68
+ mlflow.log_artifact("confusion_matrix.png")
69
+
70
+ # Log model
71
+ mlflow.sklearn.log_model(
72
+ model,
73
+ "model",
74
+ registered_model_name="churn_predictor"
75
+ )
76
+
77
+ # Log custom metrics over time
78
+ for epoch in range(100):
79
+ mlflow.log_metric("loss", train_loss, step=epoch)
80
+
81
+ # Compare runs
82
+ client = MlflowClient()
83
+ runs = client.search_runs(
84
+ experiment_ids=["1"],
85
+ filter_string="metrics.f1_score > 0.85",
86
+ order_by=["metrics.f1_score DESC"]
87
+ )
88
+ ```
89
+
90
+ ### Weights & Biases
91
+ ```python
92
+ import wandb
93
+
94
+ wandb.init(
95
+ project="ml-project",
96
+ config={
97
+ "learning_rate": 0.001,
98
+ "architecture": "ResNet50",
99
+ "epochs": 100
100
+ }
101
+ )
102
+
103
+ # Log metrics
104
+ for epoch in range(100):
105
+ wandb.log({
106
+ "epoch": epoch,
107
+ "loss": train_loss,
108
+ "val_loss": val_loss,
109
+ "accuracy": accuracy
110
+ })
111
+
112
+ # Log images
113
+ wandb.log({"examples": [wandb.Image(img, caption=label) for img, label in samples]})
114
+
115
+ # Log model
116
+ wandb.save("model.pt")
117
+
118
+ # Hyperparameter sweeps
119
+ sweep_config = {
120
+ "method": "bayes",
121
+ "metric": {"name": "val_loss", "goal": "minimize"},
122
+ "parameters": {
123
+ "learning_rate": {"min": 0.0001, "max": 0.1},
124
+ "batch_size": {"values": [16, 32, 64]}
125
+ }
126
+ }
127
+ sweep_id = wandb.sweep(sweep_config)
128
+ wandb.agent(sweep_id, train_function, count=50)
129
+ ```
130
+
131
+ ## Model Registry
132
+
133
+ ```python
134
+ from mlflow.tracking import MlflowClient
135
+
136
+ client = MlflowClient()
137
+
138
+ # Register model
139
+ model_uri = f"runs:/{run_id}/model"
140
+ result = mlflow.register_model(model_uri, "production_model")
141
+
142
+ # Transition stages
143
+ client.transition_model_version_stage(
144
+ name="production_model",
145
+ version=result.version,
146
+ stage="Staging"
147
+ )
148
+
149
+ # Add description and tags
150
+ client.update_model_version(
151
+ name="production_model",
152
+ version=result.version,
153
+ description="XGBoost model trained on Q4 data"
154
+ )
155
+
156
+ client.set_model_version_tag(
157
+ name="production_model",
158
+ version=result.version,
159
+ key="validation_status",
160
+ value="passed"
161
+ )
162
+
163
+ # Load production model
164
+ model = mlflow.pyfunc.load_model("models:/production_model/Production")
165
+
166
+ # Compare versions
167
+ def compare_model_versions(model_name, version_a, version_b, test_data):
168
+ model_a = mlflow.pyfunc.load_model(f"models:/{model_name}/{version_a}")
169
+ model_b = mlflow.pyfunc.load_model(f"models:/{model_name}/{version_b}")
170
+
171
+ metrics_a = evaluate(model_a, test_data)
172
+ metrics_b = evaluate(model_b, test_data)
173
+
174
+ return {
175
+ "version_a": {"version": version_a, **metrics_a},
176
+ "version_b": {"version": version_b, **metrics_b}
177
+ }
178
+ ```
179
+
180
+ ## Pipeline Orchestration
181
+
182
+ ### Airflow DAG
183
+ ```python
184
+ from airflow import DAG
185
+ from airflow.operators.python import PythonOperator
186
+ from airflow.sensors.filesystem import FileSensor
187
+ from datetime import datetime, timedelta
188
+
189
+ default_args = {
190
+ 'owner': 'ml-team',
191
+ 'depends_on_past': False,
192
+ 'email_on_failure': True,
193
+ 'retries': 3,
194
+ 'retry_delay': timedelta(minutes=5)
195
+ }
196
+
197
+ dag = DAG(
198
+ 'ml_training_pipeline',
199
+ default_args=default_args,
200
+ schedule_interval='@daily',
201
+ start_date=datetime(2024, 1, 1),
202
+ catchup=False
203
+ )
204
+
205
+ # Tasks
206
+ extract_data = PythonOperator(
207
+ task_id='extract_data',
208
+ python_callable=extract_training_data,
209
+ dag=dag
210
+ )
211
+
212
+ validate_data = PythonOperator(
213
+ task_id='validate_data',
214
+ python_callable=validate_data_quality,
215
+ dag=dag
216
+ )
217
+
218
+ train_model = PythonOperator(
219
+ task_id='train_model',
220
+ python_callable=train_and_log_model,
221
+ dag=dag
222
+ )
223
+
224
+ evaluate_model = PythonOperator(
225
+ task_id='evaluate_model',
226
+ python_callable=evaluate_model_performance,
227
+ dag=dag
228
+ )
229
+
230
+ deploy_model = PythonOperator(
231
+ task_id='deploy_model',
232
+ python_callable=deploy_to_production,
233
+ dag=dag
234
+ )
235
+
236
+ # Dependencies
237
+ extract_data >> validate_data >> train_model >> evaluate_model >> deploy_model
238
+ ```
239
+
240
+ ### Kubeflow Pipeline
241
+ ```python
242
+ from kfp import dsl
243
+ from kfp.components import create_component_from_func
244
+
245
+ @create_component_from_func
246
+ def preprocess_data(input_path: str, output_path: str):
247
+ import pandas as pd
248
+ df = pd.read_csv(input_path)
249
+ # Preprocessing logic
250
+ df.to_parquet(output_path)
251
+
252
+ @create_component_from_func
253
+ def train_model(data_path: str, model_path: str, hyperparameters: dict):
254
+ import joblib
255
+ from sklearn.ensemble import RandomForestClassifier
256
+ # Training logic
257
+ model = RandomForestClassifier(**hyperparameters)
258
+ joblib.dump(model, model_path)
259
+
260
+ @dsl.pipeline(
261
+ name='ML Training Pipeline',
262
+ description='End-to-end ML training pipeline'
263
+ )
264
+ def ml_pipeline(input_data: str, hyperparameters: dict):
265
+ preprocess_op = preprocess_data(input_data, '/tmp/processed.parquet')
266
+
267
+ train_op = train_model(
268
+ preprocess_op.output,
269
+ '/tmp/model.joblib',
270
+ hyperparameters
271
+ )
272
+
273
+ # Add GPU resources
274
+ train_op.set_gpu_limit(1)
275
+ train_op.set_memory_limit('8Gi')
276
+ ```
277
+
278
+ ## CI/CD for ML
279
+
280
+ ```yaml
281
+ # .github/workflows/ml-pipeline.yml
282
+ name: ML Pipeline
283
+
284
+ on:
285
+ push:
286
+ paths:
287
+ - 'src/**'
288
+ - 'data/**'
289
+ schedule:
290
+ - cron: '0 0 * * 0' # Weekly retraining
291
+
292
+ jobs:
293
+ data-validation:
294
+ runs-on: ubuntu-latest
295
+ steps:
296
+ - uses: actions/checkout@v3
297
+ - name: Validate data
298
+ run: |
299
+ python -m pytest tests/data_validation/
300
+ dvc pull
301
+ great_expectations checkpoint run data_quality
302
+
303
+ train:
304
+ needs: data-validation
305
+ runs-on: [self-hosted, gpu]
306
+ steps:
307
+ - uses: actions/checkout@v3
308
+ - name: Train model
309
+ run: |
310
+ python train.py --config configs/production.yaml
311
+ mlflow run . -P epochs=100
312
+
313
+ evaluate:
314
+ needs: train
315
+ runs-on: ubuntu-latest
316
+ steps:
317
+ - name: Evaluate model
318
+ run: |
319
+ python evaluate.py --model-version ${{ github.sha }}
320
+ python check_performance_regression.py
321
+
322
+ deploy:
323
+ needs: evaluate
324
+ if: github.ref == 'refs/heads/main'
325
+ runs-on: ubuntu-latest
326
+ steps:
327
+ - name: Deploy to staging
328
+ run: |
329
+ kubectl apply -f k8s/staging/
330
+ python smoke_test.py --env staging
331
+
332
+ - name: Deploy to production
333
+ run: |
334
+ kubectl apply -f k8s/production/
335
+ python smoke_test.py --env production
336
+ ```
337
+
338
+ ## Data Version Control
339
+
340
+ ```bash
341
+ # Initialize DVC
342
+ dvc init
343
+ dvc remote add -d storage s3://my-bucket/dvc-storage
344
+
345
+ # Track data files
346
+ dvc add data/training.csv
347
+ git add data/training.csv.dvc data/.gitignore
348
+ git commit -m "Add training data"
349
+
350
+ # Push data
351
+ dvc push
352
+
353
+ # Create pipeline
354
+ dvc run -n preprocess \
355
+ -d src/preprocess.py -d data/raw.csv \
356
+ -o data/processed.csv \
357
+ python src/preprocess.py
358
+
359
+ dvc run -n train \
360
+ -d src/train.py -d data/processed.csv \
361
+ -o models/model.pkl \
362
+ -M metrics.json \
363
+ python src/train.py
364
+
365
+ # Reproduce pipeline
366
+ dvc repro
367
+
368
+ # Compare experiments
369
+ dvc exp run --set-param train.lr=0.001
370
+ dvc exp show
371
+ dvc exp diff
372
+ ```
373
+
374
+ ## Commands
375
+ - `/omgops:pipeline` - Pipeline management
376
+ - `/omgops:registry` - Model registry
377
+ - `/omgops:monitor` - System monitoring
378
+ - `/omgml:status` - Project status
379
+
380
+ ## Best Practices
381
+
382
+ 1. Version everything (code, data, models)
383
+ 2. Automate training pipelines
384
+ 3. Implement quality gates
385
+ 4. Track all experiments
386
+ 5. Use feature stores for consistency