omgkit 2.19.3 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +537 -338
- package/package.json +2 -2
- package/plugin/agents/ai-architect-agent.md +282 -0
- package/plugin/agents/data-scientist-agent.md +221 -0
- package/plugin/agents/experiment-analyst-agent.md +318 -0
- package/plugin/agents/ml-engineer-agent.md +165 -0
- package/plugin/agents/mlops-engineer-agent.md +324 -0
- package/plugin/agents/model-optimizer-agent.md +287 -0
- package/plugin/agents/production-engineer-agent.md +360 -0
- package/plugin/agents/research-scientist-agent.md +274 -0
- package/plugin/commands/omgdata/augment.md +86 -0
- package/plugin/commands/omgdata/collect.md +81 -0
- package/plugin/commands/omgdata/label.md +83 -0
- package/plugin/commands/omgdata/split.md +83 -0
- package/plugin/commands/omgdata/validate.md +76 -0
- package/plugin/commands/omgdata/version.md +85 -0
- package/plugin/commands/omgdeploy/ab.md +94 -0
- package/plugin/commands/omgdeploy/cloud.md +89 -0
- package/plugin/commands/omgdeploy/edge.md +93 -0
- package/plugin/commands/omgdeploy/package.md +91 -0
- package/plugin/commands/omgdeploy/serve.md +92 -0
- package/plugin/commands/omgfeature/embed.md +93 -0
- package/plugin/commands/omgfeature/extract.md +93 -0
- package/plugin/commands/omgfeature/select.md +85 -0
- package/plugin/commands/omgfeature/store.md +97 -0
- package/plugin/commands/omgml/init.md +60 -0
- package/plugin/commands/omgml/status.md +82 -0
- package/plugin/commands/omgops/drift.md +87 -0
- package/plugin/commands/omgops/monitor.md +99 -0
- package/plugin/commands/omgops/pipeline.md +102 -0
- package/plugin/commands/omgops/registry.md +109 -0
- package/plugin/commands/omgops/retrain.md +91 -0
- package/plugin/commands/omgoptim/distill.md +90 -0
- package/plugin/commands/omgoptim/profile.md +92 -0
- package/plugin/commands/omgoptim/prune.md +81 -0
- package/plugin/commands/omgoptim/quantize.md +83 -0
- package/plugin/commands/omgtrain/baseline.md +78 -0
- package/plugin/commands/omgtrain/compare.md +99 -0
- package/plugin/commands/omgtrain/evaluate.md +85 -0
- package/plugin/commands/omgtrain/train.md +81 -0
- package/plugin/commands/omgtrain/tune.md +89 -0
- package/plugin/registry.yaml +252 -2
- package/plugin/skills/ml-systems/SKILL.md +65 -0
- package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
- package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
- package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
- package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
- package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
- package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
- package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
- package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
- package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
- package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
- package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
- package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
- package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
- package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
- package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
- package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
- package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
- package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
- package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
- package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
- package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
- package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
- package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
- package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
- package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
- package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
- package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
- package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
- package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
- package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: model-deployment
|
|
3
|
+
description: Model deployment strategies including serving infrastructure, containerization, model packaging, versioning, and production deployment patterns.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Model Deployment
|
|
7
|
+
|
|
8
|
+
Deploying ML models to production.
|
|
9
|
+
|
|
10
|
+
## Deployment Architecture
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
14
|
+
│ ML DEPLOYMENT PATTERNS │
|
|
15
|
+
├─────────────────────────────────────────────────────────────┤
|
|
16
|
+
│ │
|
|
17
|
+
│ BATCH INFERENCE REAL-TIME STREAMING │
|
|
18
|
+
│ ─────────────── ───────── ───────── │
|
|
19
|
+
│ Spark/Airflow REST/gRPC Kafka/Flink │
|
|
20
|
+
│ High throughput Low latency Continuous │
|
|
21
|
+
│ Scheduled runs On-demand Event-driven │
|
|
22
|
+
│ │
|
|
23
|
+
│ EMBEDDED EDGE SERVERLESS │
|
|
24
|
+
│ ──────── ──── ────────── │
|
|
25
|
+
│ Mobile SDK IoT devices AWS Lambda │
|
|
26
|
+
│ On-device Local inference Auto-scaling │
|
|
27
|
+
│ Offline capable Bandwidth limited Pay per request │
|
|
28
|
+
│ │
|
|
29
|
+
└─────────────────────────────────────────────────────────────┘
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Model Serving Frameworks
|
|
33
|
+
|
|
34
|
+
### TorchServe
|
|
35
|
+
```python
|
|
36
|
+
# Handler for TorchServe
|
|
37
|
+
from ts.torch_handler.base_handler import BaseHandler
|
|
38
|
+
import torch
|
|
39
|
+
|
|
40
|
+
class ModelHandler(BaseHandler):
|
|
41
|
+
def initialize(self, context):
|
|
42
|
+
self.manifest = context.manifest
|
|
43
|
+
model_dir = context.system_properties.get("model_dir")
|
|
44
|
+
self.model = torch.jit.load(f"{model_dir}/model.pt")
|
|
45
|
+
self.model.eval()
|
|
46
|
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
47
|
+
self.model.to(self.device)
|
|
48
|
+
|
|
49
|
+
def preprocess(self, data):
|
|
50
|
+
inputs = []
|
|
51
|
+
for row in data:
|
|
52
|
+
input_data = row.get("data") or row.get("body")
|
|
53
|
+
inputs.append(torch.tensor(input_data))
|
|
54
|
+
return torch.stack(inputs).to(self.device)
|
|
55
|
+
|
|
56
|
+
def inference(self, data):
|
|
57
|
+
with torch.no_grad():
|
|
58
|
+
return self.model(data)
|
|
59
|
+
|
|
60
|
+
def postprocess(self, inference_output):
|
|
61
|
+
return inference_output.tolist()
|
|
62
|
+
|
|
63
|
+
# Package model
|
|
64
|
+
# torch-model-archiver --model-name model --version 1.0 \
|
|
65
|
+
# --serialized-file model.pt --handler handler.py
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### TensorFlow Serving
|
|
69
|
+
```python
|
|
70
|
+
import tensorflow as tf
|
|
71
|
+
|
|
72
|
+
# Save model in SavedModel format
|
|
73
|
+
tf.saved_model.save(model, "saved_model/1")
|
|
74
|
+
|
|
75
|
+
# Serve with Docker
|
|
76
|
+
# docker run -p 8501:8501 \
|
|
77
|
+
# -v /path/to/saved_model:/models/model \
|
|
78
|
+
# -e MODEL_NAME=model \
|
|
79
|
+
# tensorflow/serving
|
|
80
|
+
|
|
81
|
+
# Client request
|
|
82
|
+
import requests
|
|
83
|
+
import json
|
|
84
|
+
|
|
85
|
+
data = {"instances": [[1.0, 2.0, 3.0]]}
|
|
86
|
+
response = requests.post(
|
|
87
|
+
"http://localhost:8501/v1/models/model:predict",
|
|
88
|
+
json=data
|
|
89
|
+
)
|
|
90
|
+
predictions = response.json()["predictions"]
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Triton Inference Server
|
|
94
|
+
```python
|
|
95
|
+
# Model repository structure
|
|
96
|
+
# models/
|
|
97
|
+
# model_name/
|
|
98
|
+
# config.pbtxt
|
|
99
|
+
# 1/
|
|
100
|
+
# model.onnx
|
|
101
|
+
|
|
102
|
+
# config.pbtxt
|
|
103
|
+
"""
|
|
104
|
+
name: "my_model"
|
|
105
|
+
platform: "onnxruntime_onnx"
|
|
106
|
+
max_batch_size: 64
|
|
107
|
+
input [
|
|
108
|
+
{
|
|
109
|
+
name: "input"
|
|
110
|
+
data_type: TYPE_FP32
|
|
111
|
+
dims: [ 784 ]
|
|
112
|
+
}
|
|
113
|
+
]
|
|
114
|
+
output [
|
|
115
|
+
{
|
|
116
|
+
name: "output"
|
|
117
|
+
data_type: TYPE_FP32
|
|
118
|
+
dims: [ 10 ]
|
|
119
|
+
}
|
|
120
|
+
]
|
|
121
|
+
instance_group [
|
|
122
|
+
{ count: 2, kind: KIND_GPU }
|
|
123
|
+
]
|
|
124
|
+
dynamic_batching {
|
|
125
|
+
preferred_batch_size: [ 16, 32 ]
|
|
126
|
+
max_queue_delay_microseconds: 100
|
|
127
|
+
}
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
# Python client
|
|
131
|
+
import tritonclient.grpc as grpcclient
|
|
132
|
+
|
|
133
|
+
client = grpcclient.InferenceServerClient("localhost:8001")
|
|
134
|
+
inputs = [grpcclient.InferInput("input", [1, 784], "FP32")]
|
|
135
|
+
inputs[0].set_data_from_numpy(input_data)
|
|
136
|
+
outputs = [grpcclient.InferRequestedOutput("output")]
|
|
137
|
+
result = client.infer("my_model", inputs, outputs=outputs)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Containerization
|
|
141
|
+
|
|
142
|
+
### Docker for ML
|
|
143
|
+
```dockerfile
|
|
144
|
+
# Multi-stage build for production
|
|
145
|
+
FROM python:3.10-slim as builder
|
|
146
|
+
|
|
147
|
+
WORKDIR /app
|
|
148
|
+
COPY requirements.txt .
|
|
149
|
+
RUN pip install --user --no-cache-dir -r requirements.txt
|
|
150
|
+
|
|
151
|
+
FROM python:3.10-slim
|
|
152
|
+
|
|
153
|
+
# Non-root user for security
|
|
154
|
+
RUN useradd -m -u 1000 appuser
|
|
155
|
+
USER appuser
|
|
156
|
+
|
|
157
|
+
WORKDIR /app
|
|
158
|
+
COPY --from=builder /root/.local /home/appuser/.local
|
|
159
|
+
COPY --chown=appuser:appuser . .
|
|
160
|
+
|
|
161
|
+
ENV PATH=/home/appuser/.local/bin:$PATH
|
|
162
|
+
ENV MODEL_PATH=/app/models/model.pt
|
|
163
|
+
|
|
164
|
+
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
|
|
165
|
+
CMD curl -f http://localhost:8000/health || exit 1
|
|
166
|
+
|
|
167
|
+
EXPOSE 8000
|
|
168
|
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Kubernetes Deployment
|
|
172
|
+
```yaml
|
|
173
|
+
apiVersion: apps/v1
|
|
174
|
+
kind: Deployment
|
|
175
|
+
metadata:
|
|
176
|
+
name: ml-model
|
|
177
|
+
spec:
|
|
178
|
+
replicas: 3
|
|
179
|
+
selector:
|
|
180
|
+
matchLabels:
|
|
181
|
+
app: ml-model
|
|
182
|
+
template:
|
|
183
|
+
metadata:
|
|
184
|
+
labels:
|
|
185
|
+
app: ml-model
|
|
186
|
+
spec:
|
|
187
|
+
containers:
|
|
188
|
+
- name: model
|
|
189
|
+
image: ml-model:v1.0
|
|
190
|
+
resources:
|
|
191
|
+
requests:
|
|
192
|
+
memory: "2Gi"
|
|
193
|
+
cpu: "1"
|
|
194
|
+
nvidia.com/gpu: 1
|
|
195
|
+
limits:
|
|
196
|
+
memory: "4Gi"
|
|
197
|
+
cpu: "2"
|
|
198
|
+
nvidia.com/gpu: 1
|
|
199
|
+
ports:
|
|
200
|
+
- containerPort: 8000
|
|
201
|
+
livenessProbe:
|
|
202
|
+
httpGet:
|
|
203
|
+
path: /health
|
|
204
|
+
port: 8000
|
|
205
|
+
initialDelaySeconds: 30
|
|
206
|
+
periodSeconds: 10
|
|
207
|
+
readinessProbe:
|
|
208
|
+
httpGet:
|
|
209
|
+
path: /ready
|
|
210
|
+
port: 8000
|
|
211
|
+
initialDelaySeconds: 5
|
|
212
|
+
periodSeconds: 5
|
|
213
|
+
env:
|
|
214
|
+
- name: MODEL_VERSION
|
|
215
|
+
value: "1.0"
|
|
216
|
+
---
|
|
217
|
+
apiVersion: v1
|
|
218
|
+
kind: Service
|
|
219
|
+
metadata:
|
|
220
|
+
name: ml-model-service
|
|
221
|
+
spec:
|
|
222
|
+
selector:
|
|
223
|
+
app: ml-model
|
|
224
|
+
ports:
|
|
225
|
+
- port: 80
|
|
226
|
+
targetPort: 8000
|
|
227
|
+
type: LoadBalancer
|
|
228
|
+
---
|
|
229
|
+
apiVersion: autoscaling/v2
|
|
230
|
+
kind: HorizontalPodAutoscaler
|
|
231
|
+
metadata:
|
|
232
|
+
name: ml-model-hpa
|
|
233
|
+
spec:
|
|
234
|
+
scaleTargetRef:
|
|
235
|
+
apiVersion: apps/v1
|
|
236
|
+
kind: Deployment
|
|
237
|
+
name: ml-model
|
|
238
|
+
minReplicas: 2
|
|
239
|
+
maxReplicas: 10
|
|
240
|
+
metrics:
|
|
241
|
+
- type: Resource
|
|
242
|
+
resource:
|
|
243
|
+
name: cpu
|
|
244
|
+
target:
|
|
245
|
+
type: Utilization
|
|
246
|
+
averageUtilization: 70
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
## FastAPI Model Server
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
from fastapi import FastAPI, HTTPException
|
|
253
|
+
from pydantic import BaseModel
|
|
254
|
+
import torch
|
|
255
|
+
import numpy as np
|
|
256
|
+
|
|
257
|
+
app = FastAPI(title="ML Model API", version="1.0")
|
|
258
|
+
|
|
259
|
+
class PredictionRequest(BaseModel):
|
|
260
|
+
features: list[float]
|
|
261
|
+
|
|
262
|
+
class PredictionResponse(BaseModel):
|
|
263
|
+
prediction: int
|
|
264
|
+
confidence: float
|
|
265
|
+
model_version: str
|
|
266
|
+
|
|
267
|
+
# Load model on startup
|
|
268
|
+
@app.on_event("startup")
|
|
269
|
+
async def load_model():
|
|
270
|
+
global model
|
|
271
|
+
model = torch.jit.load("model.pt")
|
|
272
|
+
model.eval()
|
|
273
|
+
|
|
274
|
+
@app.get("/health")
|
|
275
|
+
async def health():
|
|
276
|
+
return {"status": "healthy"}
|
|
277
|
+
|
|
278
|
+
@app.get("/ready")
|
|
279
|
+
async def ready():
|
|
280
|
+
if model is None:
|
|
281
|
+
raise HTTPException(status_code=503, detail="Model not loaded")
|
|
282
|
+
return {"status": "ready"}
|
|
283
|
+
|
|
284
|
+
@app.post("/predict", response_model=PredictionResponse)
|
|
285
|
+
async def predict(request: PredictionRequest):
|
|
286
|
+
try:
|
|
287
|
+
input_tensor = torch.tensor([request.features])
|
|
288
|
+
with torch.no_grad():
|
|
289
|
+
output = model(input_tensor)
|
|
290
|
+
probs = torch.softmax(output, dim=1)
|
|
291
|
+
prediction = output.argmax(dim=1).item()
|
|
292
|
+
confidence = probs[0][prediction].item()
|
|
293
|
+
|
|
294
|
+
return PredictionResponse(
|
|
295
|
+
prediction=prediction,
|
|
296
|
+
confidence=confidence,
|
|
297
|
+
model_version="1.0"
|
|
298
|
+
)
|
|
299
|
+
except Exception as e:
|
|
300
|
+
raise HTTPException(status_code=500, detail=str(e))
|
|
301
|
+
|
|
302
|
+
@app.post("/batch_predict")
|
|
303
|
+
async def batch_predict(requests: list[PredictionRequest]):
|
|
304
|
+
inputs = torch.tensor([r.features for r in requests])
|
|
305
|
+
with torch.no_grad():
|
|
306
|
+
outputs = model(inputs)
|
|
307
|
+
return {"predictions": outputs.argmax(dim=1).tolist()}
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
## Model Versioning
|
|
311
|
+
|
|
312
|
+
```python
|
|
313
|
+
import mlflow
|
|
314
|
+
|
|
315
|
+
# Register model version
|
|
316
|
+
with mlflow.start_run():
|
|
317
|
+
mlflow.sklearn.log_model(model, "model", registered_model_name="production_model")
|
|
318
|
+
|
|
319
|
+
# Transition to production
|
|
320
|
+
client = mlflow.tracking.MlflowClient()
|
|
321
|
+
client.transition_model_version_stage(
|
|
322
|
+
name="production_model",
|
|
323
|
+
version=3,
|
|
324
|
+
stage="Production"
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
# Load production model
|
|
328
|
+
model = mlflow.pyfunc.load_model("models:/production_model/Production")
|
|
329
|
+
|
|
330
|
+
# Canary deployment
|
|
331
|
+
def route_request(request, canary_percentage=10):
|
|
332
|
+
import random
|
|
333
|
+
if random.random() < canary_percentage / 100:
|
|
334
|
+
return canary_model.predict(request)
|
|
335
|
+
return production_model.predict(request)
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
## Commands
|
|
339
|
+
- `/omgdeploy:package` - Package model
|
|
340
|
+
- `/omgdeploy:serve` - Serve model
|
|
341
|
+
- `/omgdeploy:cloud` - Cloud deployment
|
|
342
|
+
- `/omgops:registry` - Model registry
|
|
343
|
+
|
|
344
|
+
## Best Practices
|
|
345
|
+
|
|
346
|
+
1. Use liveness (`/health`) and readiness (`/ready`) probes
|
|
347
|
+
2. Implement graceful shutdown
|
|
348
|
+
3. Version models explicitly
|
|
349
|
+
4. Monitor inference latency
|
|
350
|
+
5. Use canary deployments for safety
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: model-development
|
|
3
|
+
description: Model development practices including model selection, training pipelines, hyperparameter tuning, evaluation, and model registration.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Model Development
|
|
7
|
+
|
|
8
|
+
Building and training ML models effectively.
|
|
9
|
+
|
|
10
|
+
## Model Selection
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from sklearn.model_selection import cross_val_score
|
|
14
|
+
|
|
15
|
+
models = {
|
|
16
|
+
"logistic": LogisticRegression(),
|
|
17
|
+
"random_forest": RandomForestClassifier(),
|
|
18
|
+
"xgboost": XGBClassifier(),
|
|
19
|
+
"lightgbm": LGBMClassifier(),
|
|
20
|
+
"catboost": CatBoostClassifier(verbose=False)
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
results = {}
|
|
24
|
+
for name, model in models.items():
|
|
25
|
+
scores = cross_val_score(model, X, y, cv=5, scoring="f1_macro")
|
|
26
|
+
results[name] = {
|
|
27
|
+
"mean": scores.mean(),
|
|
28
|
+
"std": scores.std()
|
|
29
|
+
}
|
|
30
|
+
print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Training Pipeline
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
import torch
|
|
37
|
+
import torch.nn as nn
|
|
38
|
+
from torch.utils.data import DataLoader
|
|
39
|
+
|
|
40
|
+
class TrainingPipeline:
|
|
41
|
+
def __init__(self, model, optimizer, criterion, device):
|
|
42
|
+
self.model = model.to(device)
|
|
43
|
+
self.optimizer = optimizer
|
|
44
|
+
self.criterion = criterion
|
|
45
|
+
self.device = device
|
|
46
|
+
|
|
47
|
+
def train_epoch(self, dataloader):
|
|
48
|
+
self.model.train()
|
|
49
|
+
total_loss = 0
|
|
50
|
+
for batch in dataloader:
|
|
51
|
+
x, y = batch[0].to(self.device), batch[1].to(self.device)
|
|
52
|
+
self.optimizer.zero_grad()
|
|
53
|
+
output = self.model(x)
|
|
54
|
+
loss = self.criterion(output, y)
|
|
55
|
+
loss.backward()
|
|
56
|
+
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
|
|
57
|
+
self.optimizer.step()
|
|
58
|
+
total_loss += loss.item()
|
|
59
|
+
return total_loss / len(dataloader)
|
|
60
|
+
|
|
61
|
+
def evaluate(self, dataloader):
|
|
62
|
+
self.model.eval()
|
|
63
|
+
predictions, targets = [], []
|
|
64
|
+
with torch.no_grad():
|
|
65
|
+
for batch in dataloader:
|
|
66
|
+
x, y = batch[0].to(self.device), batch[1].to(self.device)
|
|
67
|
+
output = self.model(x)
|
|
68
|
+
predictions.extend(output.argmax(dim=1).cpu().numpy())
|
|
69
|
+
targets.extend(y.cpu().numpy())
|
|
70
|
+
return accuracy_score(targets, predictions)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Hyperparameter Tuning
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import optuna
|
|
77
|
+
|
|
78
|
+
def objective(trial):
|
|
79
|
+
params = {
|
|
80
|
+
"learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
|
|
81
|
+
"max_depth": trial.suggest_int("max_depth", 3, 10),
|
|
82
|
+
"n_estimators": trial.suggest_int("n_estimators", 50, 500),
|
|
83
|
+
"min_child_weight": trial.suggest_int("min_child_weight", 1, 10)
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
model = XGBClassifier(**params, use_label_encoder=False, eval_metric="logloss")
|
|
87
|
+
scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1_macro")
|
|
88
|
+
|
|
89
|
+
return scores.mean()
|
|
90
|
+
|
|
91
|
+
study = optuna.create_study(direction="maximize")
|
|
92
|
+
study.optimize(objective, n_trials=100)
|
|
93
|
+
|
|
94
|
+
print(f"Best params: {study.best_params}")
|
|
95
|
+
print(f"Best F1: {study.best_value:.3f}")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Model Evaluation
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from sklearn.metrics import classification_report, confusion_matrix
|
|
102
|
+
|
|
103
|
+
def comprehensive_evaluation(model, X_test, y_test):
|
|
104
|
+
y_pred = model.predict(X_test)
|
|
105
|
+
y_prob = model.predict_proba(X_test)[:, 1]
|
|
106
|
+
|
|
107
|
+
# Classification metrics
|
|
108
|
+
print(classification_report(y_test, y_pred))
|
|
109
|
+
|
|
110
|
+
# Confusion matrix
|
|
111
|
+
cm = confusion_matrix(y_test, y_pred)
|
|
112
|
+
print(f"Confusion Matrix:\n{cm}")
|
|
113
|
+
|
|
114
|
+
# ROC-AUC
|
|
115
|
+
roc_auc = roc_auc_score(y_test, y_prob)
|
|
116
|
+
print(f"ROC-AUC: {roc_auc:.3f}")
|
|
117
|
+
|
|
118
|
+
# Precision-Recall AUC
|
|
119
|
+
pr_auc = average_precision_score(y_test, y_prob)
|
|
120
|
+
print(f"PR-AUC: {pr_auc:.3f}")
|
|
121
|
+
|
|
122
|
+
return {
|
|
123
|
+
"classification_report": classification_report(y_test, y_pred, output_dict=True),
|
|
124
|
+
"confusion_matrix": cm,
|
|
125
|
+
"roc_auc": roc_auc,
|
|
126
|
+
"pr_auc": pr_auc
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Model Registry
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
import mlflow.sklearn
|
|
134
|
+
|
|
135
|
+
# Register model
|
|
136
|
+
with mlflow.start_run():
|
|
137
|
+
mlflow.sklearn.log_model(
|
|
138
|
+
model,
|
|
139
|
+
"model",
|
|
140
|
+
registered_model_name="churn_predictor"
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
# Load registered model
|
|
144
|
+
model = mlflow.pyfunc.load_model(
|
|
145
|
+
model_uri="models:/churn_predictor/Production"
|
|
146
|
+
)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Commands
|
|
150
|
+
- `/omgtrain:train` - Train model
|
|
151
|
+
- `/omgtrain:tune` - Hyperparameter tuning
|
|
152
|
+
- `/omgtrain:evaluate` - Evaluate model
|
|
153
|
+
|
|
154
|
+
## Best Practices
|
|
155
|
+
|
|
156
|
+
1. Use cross-validation
|
|
157
|
+
2. Tune hyperparameters systematically
|
|
158
|
+
3. Evaluate on multiple metrics
|
|
159
|
+
4. Check for overfitting
|
|
160
|
+
5. Register successful models
|