omgkit 2.19.3 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +537 -338
- package/package.json +2 -2
- package/plugin/agents/ai-architect-agent.md +282 -0
- package/plugin/agents/data-scientist-agent.md +221 -0
- package/plugin/agents/experiment-analyst-agent.md +318 -0
- package/plugin/agents/ml-engineer-agent.md +165 -0
- package/plugin/agents/mlops-engineer-agent.md +324 -0
- package/plugin/agents/model-optimizer-agent.md +287 -0
- package/plugin/agents/production-engineer-agent.md +360 -0
- package/plugin/agents/research-scientist-agent.md +274 -0
- package/plugin/commands/omgdata/augment.md +86 -0
- package/plugin/commands/omgdata/collect.md +81 -0
- package/plugin/commands/omgdata/label.md +83 -0
- package/plugin/commands/omgdata/split.md +83 -0
- package/plugin/commands/omgdata/validate.md +76 -0
- package/plugin/commands/omgdata/version.md +85 -0
- package/plugin/commands/omgdeploy/ab.md +94 -0
- package/plugin/commands/omgdeploy/cloud.md +89 -0
- package/plugin/commands/omgdeploy/edge.md +93 -0
- package/plugin/commands/omgdeploy/package.md +91 -0
- package/plugin/commands/omgdeploy/serve.md +92 -0
- package/plugin/commands/omgfeature/embed.md +93 -0
- package/plugin/commands/omgfeature/extract.md +93 -0
- package/plugin/commands/omgfeature/select.md +85 -0
- package/plugin/commands/omgfeature/store.md +97 -0
- package/plugin/commands/omgml/init.md +60 -0
- package/plugin/commands/omgml/status.md +82 -0
- package/plugin/commands/omgops/drift.md +87 -0
- package/plugin/commands/omgops/monitor.md +99 -0
- package/plugin/commands/omgops/pipeline.md +102 -0
- package/plugin/commands/omgops/registry.md +109 -0
- package/plugin/commands/omgops/retrain.md +91 -0
- package/plugin/commands/omgoptim/distill.md +90 -0
- package/plugin/commands/omgoptim/profile.md +92 -0
- package/plugin/commands/omgoptim/prune.md +81 -0
- package/plugin/commands/omgoptim/quantize.md +83 -0
- package/plugin/commands/omgtrain/baseline.md +78 -0
- package/plugin/commands/omgtrain/compare.md +99 -0
- package/plugin/commands/omgtrain/evaluate.md +85 -0
- package/plugin/commands/omgtrain/train.md +81 -0
- package/plugin/commands/omgtrain/tune.md +89 -0
- package/plugin/registry.yaml +252 -2
- package/plugin/skills/ml-systems/SKILL.md +65 -0
- package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
- package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
- package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
- package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
- package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
- package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
- package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
- package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
- package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
- package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
- package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
- package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
- package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
- package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
- package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
- package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
- package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
- package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
- package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
- package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
- package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
- package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
- package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
- package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
- package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
- package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
- package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
- package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
- package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
- package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0

@@ -0,0 +1,339 @@
---
name: model-optimization
description: Model optimization techniques including hyperparameter tuning, architecture search, training optimization, and performance profiling for ML systems.
---

# Model Optimization

Techniques for optimizing ML model performance.

## Hyperparameter Optimization

### Optuna Integration
```python
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

def objective(trial):
    # Suggest hyperparameters (Optuna names match the dict keys so that
    # study.best_params reads cleanly)
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64, 128]),
        'num_layers': trial.suggest_int('num_layers', 2, 8),
        'hidden_dim': trial.suggest_int('hidden_dim', 64, 512, step=64),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
        'optimizer': trial.suggest_categorical('optimizer', ['adam', 'sgd', 'adamw']),
        'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
    }

    # build_model, train_epoch, and validate are project-specific helpers
    model = build_model(params)

    for epoch in range(max_epochs):
        train_loss = train_epoch(model, train_loader)
        val_loss = validate(model, val_loader)

        # Report intermediate values so the pruner can stop bad trials early
        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return val_loss

# Create study with pruning
study = optuna.create_study(
    direction='minimize',
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10)
)

study.optimize(objective, n_trials=100, n_jobs=4)

print(f"Best params: {study.best_params}")
print(f"Best value: {study.best_value:.4f}")
```
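
After a study finishes, it is often worth checking which hyperparameters actually mattered before tuning further; a short usage sketch with Optuna's built-in importance evaluator:

```python
import optuna

# Rank hyperparameters by their estimated influence on the objective
importances = optuna.importance.get_param_importances(study)
for name, score in importances.items():
    print(f"{name}: {score:.3f}")
```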

### Hyperparameter Search Strategies
```python
# Grid Search: exhaustively evaluates every combination
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.3]
}

# `model` is any scikit-learn estimator (e.g., a gradient boosting classifier)
grid = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid.fit(X_train, y_train)

# Random Search: samples from distributions, usually far cheaper than grid
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

param_distributions = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 15),
    'learning_rate': uniform(0.001, 0.5)
}

random_search = RandomizedSearchCV(
    model, param_distributions, n_iter=100, cv=5, random_state=42
)

# Bayesian Optimization with scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

search_spaces = {
    'learning_rate': Real(1e-5, 1e-1, prior='log-uniform'),
    'num_layers': Integer(2, 10),
    'activation': Categorical(['relu', 'gelu', 'swish'])
}

bayes_search = BayesSearchCV(
    model, search_spaces, n_iter=50, cv=5, random_state=42
)
```
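
All three searches follow the scikit-learn estimator API, so fitting and reading results is uniform; a minimal usage sketch, assuming `X_train`, `y_train`, and `X_test` are already defined:

```python
random_search.fit(X_train, y_train)
print(random_search.best_params_)
print(f"best CV score: {random_search.best_score_:.4f}")

# With the default refit=True, the best model is ready for prediction
predictions = random_search.best_estimator_.predict(X_test)
```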

## Training Optimization

### Learning Rate Scheduling
```python
import math

import torch.optim.lr_scheduler as lr_scheduler

# Warmup + Cosine Annealing
class WarmupCosineScheduler:
    def __init__(self, optimizer, warmup_steps, total_steps):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.current_step = 0
        self.base_lr = optimizer.param_groups[0]['lr']

    def step(self):
        self.current_step += 1
        if self.current_step < self.warmup_steps:
            # Linear warmup from 0 to base_lr
            lr = self.base_lr * self.current_step / self.warmup_steps
        else:
            # Cosine decay from base_lr to 0 over the remaining steps
            progress = (self.current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
            lr = self.base_lr * 0.5 * (1 + math.cos(math.pi * progress))

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

# One Cycle Policy
scheduler = lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    total_steps=total_steps,
    pct_start=0.3,
    anneal_strategy='cos'
)

# Reduce on Plateau
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)
```
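
Where `scheduler.step()` is called matters: OneCycleLR (and the warmup/cosine class above) expects one step per optimizer update, while ReduceLROnPlateau expects one step per epoch, fed a validation metric. A minimal sketch of both call sites, assuming the helpers used in this section:

```python
# Per-batch schedulers (OneCycleLR, WarmupCosineScheduler)
for inputs, targets in train_loader:
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    optimizer.step()
    scheduler.step()

# Per-epoch, metric-driven scheduler (ReduceLROnPlateau)
for epoch in range(epochs):
    val_loss = validate(model, val_loader)
    scheduler.step(val_loss)
```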

### Mixed Precision Training
```python
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for epoch in range(epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()

        # Run the forward pass in float16 where it is numerically safe
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        # Scale the loss to avoid float16 gradient underflow
        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to true gradients
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
```
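
On hardware with bfloat16 support, the same loop can use the device-agnostic `torch.autocast` context with `dtype=torch.bfloat16`; bf16 keeps float32's exponent range, so no `GradScaler` is needed. A minimal sketch under that assumption:

```python
import torch

for inputs, targets in train_loader:
    optimizer.zero_grad()
    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        outputs = model(inputs)
        loss = criterion(outputs, targets)
    loss.backward()  # no loss scaling required for bf16
    optimizer.step()
```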

### Gradient Accumulation
```python
# Simulates an effective batch size of accumulation_steps * loader batch size
accumulation_steps = 4
optimizer.zero_grad()

for i, (inputs, targets) in enumerate(train_loader):
    outputs = model(inputs)
    # Scale the loss so accumulated gradients average rather than sum
    loss = criterion(outputs, targets) / accumulation_steps
    loss.backward()

    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```
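
Accumulation composes naturally with the mixed-precision scaler from the previous section; a hedged sketch, including a final flush for datasets whose length is not a multiple of `accumulation_steps`:

```python
optimizer.zero_grad()
for i, (inputs, targets) in enumerate(train_loader):
    with autocast():
        loss = criterion(model(inputs), targets) / accumulation_steps
    scaler.scale(loss).backward()

    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

# Flush gradients left over from an incomplete final group
if (i + 1) % accumulation_steps != 0:
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()
```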

## Architecture Optimization

### Neural Architecture Search
```python
import torch.nn as nn

import nni
from nni.nas.pytorch import mutables  # legacy NNI NAS API

class SearchableBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        # The search algorithm picks one of these candidate ops per block
        self.op_choice = mutables.LayerChoice([
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.Conv2d(in_channels, out_channels, 5, padding=2),
            DepthSeparableConv(in_channels, out_channels),  # user-defined module
            nn.Identity() if in_channels == out_channels else nn.Conv2d(in_channels, out_channels, 1)
        ])

    def forward(self, x):
        return self.op_choice(x)

# AutoML with Ray Tune
from ray import tune
from ray.tune.schedulers import ASHAScheduler

def train_model(config):
    # build_model, train, and validate are project-specific helpers
    model = build_model(config)
    for epoch in range(config['epochs']):
        train_loss = train(model)
        val_acc = validate(model)
        tune.report(loss=train_loss, accuracy=val_acc)

analysis = tune.run(
    train_model,
    config={
        "lr": tune.loguniform(1e-5, 1e-1),
        "layers": tune.choice([2, 4, 6, 8]),
        "hidden": tune.choice([128, 256, 512]),
        "epochs": 10,  # fixed training budget, not part of the search
    },
    scheduler=ASHAScheduler(metric="accuracy", mode="max"),
    num_samples=100
)
```
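
Once the run completes, the returned analysis object exposes the winning configuration; a brief usage sketch:

```python
best_config = analysis.get_best_config(metric="accuracy", mode="max")
print(f"Best config: {best_config}")
```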

## Memory Optimization

### Gradient Checkpointing
```python
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CheckpointedModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerBlock(d_model=512) for _ in range(24)  # user-defined block
        ])

    def forward(self, x):
        # Activations are discarded after each layer and recomputed during
        # backward, trading extra compute for lower peak memory
        for layer in self.layers:
            x = checkpoint(layer, x, use_reentrant=False)
        return x
```
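
For a plain stack of layers, `torch.utils.checkpoint.checkpoint_sequential` achieves the same trade with less boilerplate; a minimal sketch (assuming a recent PyTorch that accepts `use_reentrant`):

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

model = nn.Sequential(*[nn.Linear(512, 512) for _ in range(24)])
x = torch.randn(8, 512, requires_grad=True)

# Split the stack into 4 segments; activations are kept only at boundaries
out = checkpoint_sequential(model, 4, x, use_reentrant=False)
```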

### Memory-Efficient Attention
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# Flash Attention (via xformers or native)
from xformers.ops import memory_efficient_attention

class EfficientAttention(nn.Module):
    def forward(self, q, k, v, mask=None):
        # Fused kernel; never materializes the full attention matrix
        return memory_efficient_attention(q, k, v, attn_bias=mask)

# Sliding Window Attention
class SlidingWindowAttention(nn.Module):
    def __init__(self, window_size=256):
        super().__init__()
        self.window_size = window_size

    def forward(self, q, k, v):
        seq_len = q.size(1)
        outputs = []

        # Each query chunk attends only to a local window of keys/values
        for i in range(0, seq_len, self.window_size):
            start = max(0, i - self.window_size // 2)
            end = min(seq_len, i + self.window_size)

            q_chunk = q[:, i:min(i + self.window_size, seq_len)]
            k_chunk = k[:, start:end]
            v_chunk = v[:, start:end]

            attn = torch.matmul(q_chunk, k_chunk.transpose(-2, -1))
            attn = F.softmax(attn / math.sqrt(q.size(-1)), dim=-1)
            outputs.append(torch.matmul(attn, v_chunk))

        return torch.cat(outputs, dim=1)
```
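
Since PyTorch 2.0, `torch.nn.functional.scaled_dot_product_attention` dispatches to a fused, FlashAttention-style kernel when one is available, with no extra dependency; a sketch:

```python
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim)
q = torch.randn(2, 8, 1024, 64, device='cuda', dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)

# Selects the most efficient backend supported on this device
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
```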

## Performance Profiling

```python
import torch.profiler as profiler

with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,
    ],
    schedule=profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=profiler.tensorboard_trace_handler('./logs'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    for step, batch in enumerate(train_loader):
        if step >= 5:
            break
        train_step(model, batch)
        prof.step()

# Print summary
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```
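
Besides the TensorBoard handler, the collected trace can be exported for the Chrome/Perfetto trace viewer; usage sketch:

```python
# Open the resulting file at chrome://tracing or https://ui.perfetto.dev
prof.export_chrome_trace("trace.json")
```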

## Distributed Training

```python
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler

def setup(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def train_ddp(rank, world_size):
    setup(rank, world_size)

    model = Model().to(rank)
    model = DDP(model, device_ids=[rank])

    # Each rank sees a disjoint shard of the dataset
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, sampler=sampler, batch_size=32)

    for epoch in range(epochs):
        sampler.set_epoch(epoch)  # reshuffle shards each epoch
        for batch in loader:
            train_step(model, batch)

    dist.destroy_process_group()

# Launch with torchrun
# torchrun --nproc_per_node=4 train.py
```
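
Under `torchrun`, each worker receives its rank and the world size as environment variables, so the entry point typically reads them rather than spawning processes itself; a hedged sketch of that glue:

```python
import os

if __name__ == "__main__":
    # torchrun sets these for every worker process
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    train_ddp(local_rank, world_size)
```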

## Commands
- `/omgtrain:tune` - Hyperparameter tuning
- `/omgoptim:profile` - Profile model
- `/omgtrain:train` - Train with optimizations

## Best Practices

1. Profile before optimizing
2. Use mixed precision by default
3. Start with proven architectures
4. Tune the learning rate first (see the range-test sketch below)
5. Use distributed training for scale
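
For point 4, a learning-rate range test (in the spirit of Leslie Smith's method) is a cheap way to choose a starting value: sweep the LR geometrically over one pass and watch where the loss is lowest. A minimal sketch, assuming `model`, `criterion`, `optimizer`, and `train_loader` from the sections above:

```python
import math

lrs, losses = [], []
lr, max_lr = 1e-7, 1.0
# Multiplicative step so the sweep covers [lr, max_lr] in one epoch
gamma = (max_lr / lr) ** (1 / max(len(train_loader) - 1, 1))

for inputs, targets in train_loader:
    for group in optimizer.param_groups:
        group['lr'] = lr
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    optimizer.step()

    lrs.append(lr)
    losses.append(loss.item())
    if not math.isfinite(losses[-1]) or losses[-1] > 4 * min(losses):
        break  # loss diverged; end the sweep
    lr *= gamma

# Heuristic: start roughly an order of magnitude below the loss minimum
best_lr = lrs[losses.index(min(losses))]
print(f"suggested starting lr ~ {best_lr / 10:.2e}")
```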