omgkit 2.20.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +125 -10
  2. package/package.json +1 -1
  3. package/plugin/agents/ai-architect-agent.md +282 -0
  4. package/plugin/agents/data-scientist-agent.md +221 -0
  5. package/plugin/agents/experiment-analyst-agent.md +318 -0
  6. package/plugin/agents/ml-engineer-agent.md +165 -0
  7. package/plugin/agents/mlops-engineer-agent.md +324 -0
  8. package/plugin/agents/model-optimizer-agent.md +287 -0
  9. package/plugin/agents/production-engineer-agent.md +360 -0
  10. package/plugin/agents/research-scientist-agent.md +274 -0
  11. package/plugin/commands/omgdata/augment.md +86 -0
  12. package/plugin/commands/omgdata/collect.md +81 -0
  13. package/plugin/commands/omgdata/label.md +83 -0
  14. package/plugin/commands/omgdata/split.md +83 -0
  15. package/plugin/commands/omgdata/validate.md +76 -0
  16. package/plugin/commands/omgdata/version.md +85 -0
  17. package/plugin/commands/omgdeploy/ab.md +94 -0
  18. package/plugin/commands/omgdeploy/cloud.md +89 -0
  19. package/plugin/commands/omgdeploy/edge.md +93 -0
  20. package/plugin/commands/omgdeploy/package.md +91 -0
  21. package/plugin/commands/omgdeploy/serve.md +92 -0
  22. package/plugin/commands/omgfeature/embed.md +93 -0
  23. package/plugin/commands/omgfeature/extract.md +93 -0
  24. package/plugin/commands/omgfeature/select.md +85 -0
  25. package/plugin/commands/omgfeature/store.md +97 -0
  26. package/plugin/commands/omgml/init.md +60 -0
  27. package/plugin/commands/omgml/status.md +82 -0
  28. package/plugin/commands/omgops/drift.md +87 -0
  29. package/plugin/commands/omgops/monitor.md +99 -0
  30. package/plugin/commands/omgops/pipeline.md +102 -0
  31. package/plugin/commands/omgops/registry.md +109 -0
  32. package/plugin/commands/omgops/retrain.md +91 -0
  33. package/plugin/commands/omgoptim/distill.md +90 -0
  34. package/plugin/commands/omgoptim/profile.md +92 -0
  35. package/plugin/commands/omgoptim/prune.md +81 -0
  36. package/plugin/commands/omgoptim/quantize.md +83 -0
  37. package/plugin/commands/omgtrain/baseline.md +78 -0
  38. package/plugin/commands/omgtrain/compare.md +99 -0
  39. package/plugin/commands/omgtrain/evaluate.md +85 -0
  40. package/plugin/commands/omgtrain/train.md +81 -0
  41. package/plugin/commands/omgtrain/tune.md +89 -0
  42. package/plugin/registry.yaml +252 -2
  43. package/plugin/skills/ml-systems/SKILL.md +65 -0
  44. package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
  45. package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
  46. package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
  47. package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
  48. package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
  49. package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
  50. package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
  51. package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
  52. package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
  53. package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
  54. package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
  55. package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
  56. package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
  57. package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
  58. package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
  59. package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
  60. package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
  61. package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
  62. package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
  63. package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
  64. package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
  65. package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
  66. package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
  67. package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
  68. package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
  69. package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
  70. package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
  71. package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
  72. package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
  73. package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
package/plugin/skills/ml-systems/edge-deployment/SKILL.md
@@ -0,0 +1,366 @@
---
name: edge-deployment
description: Edge deployment strategies including mobile optimization, embedded systems, TFLite, Core ML, and resource-constrained inference.
---

# Edge Deployment

Deploying ML models to edge devices.

## Edge Deployment Landscape

```
┌─────────────────────────────────────────────────────────────┐
│                   EDGE DEPLOYMENT TARGETS                   │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  MOBILE             EMBEDDED           IOT/SENSORS          │
│  ──────             ────────           ───────────          │
│  iOS (Core ML)      Raspberry Pi       Arduino/ESP32        │
│  Android (TFLite)   NVIDIA Jetson      Microcontrollers     │
│  React Native       Google Coral       FPGA boards          │
│                                                             │
│  CONSTRAINTS:                                               │
│  ├── Memory: 256MB - 8GB                                    │
│  ├── Compute: CPU/GPU/NPU                                   │
│  ├── Power: Battery/USB/Wall                                │
│  ├── Connectivity: Always/Sometimes/Never                   │
│  └── Latency: 1ms - 100ms                                   │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```
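
The constraints above usually decide feasibility before any framework choice does. A minimal pre-flight check; the helper and all thresholds here are illustrative, not part of the plugin:

```python
def fits_device(model_size_mb, activation_mem_mb, device_mem_mb,
                measured_latency_ms, latency_budget_ms):
    """Go/no-go check of a model against a device's memory and latency budget."""
    checks = {
        'fits_in_memory': model_size_mb + activation_mem_mb <= device_mem_mb,
        'meets_latency': measured_latency_ms <= latency_budget_ms,
    }
    return all(checks.values()), checks

# e.g. a 12 MB int8 model with ~40 MB of activations on a 256 MB board
ok, report = fits_device(12, 40, 256, measured_latency_ms=35, latency_budget_ms=100)
```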

## TensorFlow Lite

### Model Conversion
```python
import numpy as np
import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_saved_model('saved_model/')

# Basic conversion (no optimization)
tflite_model = converter.convert()

# With default optimizations
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Option A: float16 quantization
converter.target_spec.supported_types = [tf.float16]

# Option B: full integer quantization, calibrated on representative inputs
def representative_dataset():
    for data in calibration_data:  # a few hundred representative samples
        yield [data.astype(np.float32)]

converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

# Convert with the options set above and save
with open('model.tflite', 'wb') as f:
    f.write(converter.convert())
```

### TFLite Inference
```python
import numpy as np
import tensorflow as tf

# Load model
interpreter = tf.lite.Interpreter(model_path='model.tflite')
interpreter.allocate_tensors()

# Get input/output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Inference
def predict(input_data):
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    return interpreter.get_tensor(output_details[0]['index'])

# With a delegate (here the Coral Edge TPU; GPU/NNAPI delegates load the same way)
delegate = tf.lite.experimental.load_delegate('libedgetpu.so.1')
interpreter = tf.lite.Interpreter(
    model_path='model_edgetpu.tflite',
    experimental_delegates=[delegate]
)
```

## Core ML (iOS)

### PyTorch to Core ML
```python
import coremltools as ct
import torch

# Export to Core ML
model.eval()
example_input = torch.rand(1, 3, 224, 224)
traced_model = torch.jit.trace(model, example_input)

mlmodel = ct.convert(
    traced_model,
    inputs=[ct.TensorType(shape=example_input.shape, name="image")],
    outputs=[ct.TensorType(name="predictions")],
    compute_precision=ct.precision.FLOAT16,
    minimum_deployment_target=ct.target.iOS15
)

# Add metadata
mlmodel.author = "Your Name"
mlmodel.short_description = "Image classifier"
mlmodel.input_description["image"] = "Input image"
mlmodel.output_description["predictions"] = "Class probabilities"

mlmodel.save("Model.mlpackage")
```
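
Before wiring the package into an app, it is worth sanity-checking it against the PyTorch output. A minimal sketch (coremltools prediction runs on macOS only; the input/output names follow the conversion above):

```python
import coremltools as ct
import numpy as np
import torch

mlmodel = ct.models.MLModel("Model.mlpackage")
x = np.random.rand(1, 3, 224, 224).astype(np.float32)

coreml_out = mlmodel.predict({"image": x})["predictions"]
torch_out = model(torch.from_numpy(x)).detach().numpy()
print(np.abs(coreml_out - torch_out).max())  # expect small fp16-level error
```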

### Swift Integration
```swift
import CoreML
import Vision

class ModelInference {
    let model: VNCoreMLModel

    init() throws {
        let config = MLModelConfiguration()
        config.computeUnits = .all  // Allow CPU, GPU, and Neural Engine
        let mlModel = try MyModel(configuration: config)
        self.model = try VNCoreMLModel(for: mlModel.model)
    }

    func predict(image: CGImage, completion: @escaping ([String: Double]) -> Void) {
        let request = VNCoreMLRequest(model: model) { request, error in
            guard let results = request.results as? [VNClassificationObservation] else { return }
            let predictions = Dictionary(
                uniqueKeysWithValues: results.prefix(5).map { ($0.identifier, Double($0.confidence)) }
            )
            completion(predictions)
        }

        let handler = VNImageRequestHandler(cgImage: image)
        try? handler.perform([request])
    }
}
```

## NVIDIA Jetson

### TensorRT Optimization
```python
import tensorrt as trt

def build_engine(onnx_path, engine_path, precision='fp16'):
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX, surfacing parser errors instead of failing silently
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError(f'Failed to parse {onnx_path}')

    # Build config (TensorRT 8.x-era API)
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1GB

    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        # EntropyCalibrator is a user-defined IInt8EntropyCalibrator2 subclass
        config.int8_calibrator = EntropyCalibrator(calibration_data)

    # Build engine
    engine = builder.build_engine(network, config)

    with open(engine_path, 'wb') as f:
        f.write(engine.serialize())

    return engine

# Inference with TensorRT
class TRTInference:
    def __init__(self, engine_path):
        logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, 'rb') as f:
            self.engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

    def infer(self, input_data):
        # Allocate device buffers and run inference;
        # see the pycuda sketch after this block
        pass
```
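
One way to fill in `infer`, assuming a single-input/single-output engine and the same TensorRT 8-era bindings API as above; the pycuda buffer handling is a sketch, not a production implementation:

```python
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.driver as cuda

def infer(self, input_data):
    inp = np.ascontiguousarray(input_data, dtype=np.float32)
    out = np.empty(tuple(self.engine.get_binding_shape(1)), dtype=np.float32)

    d_inp = cuda.mem_alloc(inp.nbytes)
    d_out = cuda.mem_alloc(out.nbytes)
    cuda.memcpy_htod(d_inp, inp)
    self.context.execute_v2(bindings=[int(d_inp), int(d_out)])
    cuda.memcpy_dtoh(out, d_out)
    return out
```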

### DeepStream Pipeline
```python
# DeepStream config for video inference
# config_infer_primary.txt
"""
[property]
gpu-id=0
# 1/255: normalize pixel values to [0,1]
net-scale-factor=0.0039215697906911373
model-file=resnet18.onnx
model-engine-file=resnet18.engine
labelfile-path=labels.txt
batch-size=4
# network-mode: 0=FP32, 1=INT8, 2=FP16
network-mode=2
num-detected-classes=80
interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
symmetric-padding=1
"""

# Python pipeline
import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst

Gst.init(None)
pipeline = Gst.parse_launch("""
    filesrc location=video.mp4 !
    decodebin !
    nvvideoconvert !
    nvinfer config-file-path=config.txt !
    nvdsosd !
    nvegltransform !
    nveglglessink
""")
pipeline.set_state(Gst.State.PLAYING)
```

## Microcontroller Deployment

### TensorFlow Lite Micro
```cpp
#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "model_data.h"  // flatbuffer exported as a C array, e.g. via `xxd -i model.tflite`

// Allocate tensor arena (sized empirically; grow it if AllocateTensors() fails)
constexpr int kTensorArenaSize = 10 * 1024;
uint8_t tensor_arena[kTensorArenaSize];

void setup() {
  // Set up model
  const tflite::Model* model = tflite::GetModel(model_data);

  // Set up resolver
  tflite::AllOpsResolver resolver;

  // Build interpreter
  tflite::MicroInterpreter interpreter(
      model, resolver, tensor_arena, kTensorArenaSize
  );
  interpreter.AllocateTensors();

  // Get input tensor
  TfLiteTensor* input = interpreter.input(0);

  // Fill input and invoke
  // input->data.f[0] = sensor_value;
  interpreter.Invoke();

  // Get output
  TfLiteTensor* output = interpreter.output(0);
  float prediction = output->data.f[0];
}
```

## Model Optimization for Edge

```python
import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic

def optimize_for_edge(model, target_size_mb=10, target_latency_ms=50):
    """Optimize model for edge deployment.

    get_size_mb, prune_model, create_smaller_model, distill,
    measure_latency, and evaluate are project-specific helpers.
    """
    optimizations = []

    # 1. Quantization (dynamic quantization chiefly affects Linear/RNN layers)
    quantized = quantize_dynamic(model, {nn.Linear, nn.Conv2d}, dtype=torch.qint8)
    if get_size_mb(quantized) <= target_size_mb:
        optimizations.append(('quantization', quantized))

    # 2. Pruning
    pruned = prune_model(model, amount=0.5)
    if get_size_mb(pruned) <= target_size_mb:
        optimizations.append(('pruning', pruned))

    # 3. Knowledge distillation
    student = create_smaller_model(model)
    distilled = distill(teacher=model, student=student)
    if get_size_mb(distilled) <= target_size_mb:
        optimizations.append(('distillation', distilled))

    # Evaluate each candidate against the latency budget
    results = []
    for name, opt_model in optimizations:
        latency = measure_latency(opt_model)
        accuracy = evaluate(opt_model)
        if latency <= target_latency_ms:
            results.append({
                'method': name,
                'latency': latency,
                'accuracy': accuracy,
                'size_mb': get_size_mb(opt_model)
            })

    return sorted(results, key=lambda x: x['accuracy'], reverse=True)
```

## Offline Inference

```python
import time
import requests

class OfflineInferenceManager:
    def __init__(self, model_path, cache_dir='./cache'):
        self.model = load_model(model_path)  # framework-specific loader
        self.cache_dir = cache_dir
        self.pending_queue = []

    def predict(self, input_data, priority='normal'):
        """Run inference locally."""
        return self.model(input_data)

    def predict_with_fallback(self, input_data, cloud_endpoint=None):
        """Try cloud first, fall back to local."""
        try:
            if self._is_online() and cloud_endpoint:
                return self._cloud_predict(input_data, cloud_endpoint)
        except Exception:
            pass  # any network failure falls through to local inference

        return self.predict(input_data)

    def queue_for_sync(self, input_data, result):
        """Queue predictions for later sync."""
        self.pending_queue.append({
            'input': input_data,
            'result': result,
            'timestamp': time.time()
        })

    def sync_when_online(self, endpoint):
        """Sync queued predictions when connectivity is restored."""
        while self.pending_queue and self._is_online():
            item = self.pending_queue.pop(0)
            requests.post(endpoint, json=item)

    def _cloud_predict(self, input_data, endpoint):
        resp = requests.post(endpoint, json={'input': input_data}, timeout=5)
        resp.raise_for_status()
        return resp.json()

    def _is_online(self):
        # Cheap connectivity probe; swap in a platform-appropriate check
        try:
            requests.head('https://connectivitycheck.gstatic.com', timeout=1)
            return True
        except requests.RequestException:
            return False
```
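
Typical use, with a hypothetical endpoint and input: prefer the cloud when reachable, run locally otherwise, and flush queued results once connectivity returns:

```python
mgr = OfflineInferenceManager('model.tflite')
result = mgr.predict_with_fallback(
    sample, cloud_endpoint='https://api.example.com/predict')  # sample: your input
mgr.queue_for_sync(sample, result)
mgr.sync_when_online('https://api.example.com/results')
```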

## Commands
- `/omgdeploy:edge` - Edge deployment
- `/omgoptim:quantize` - Quantization
- `/omgdeploy:package` - Package for target

## Best Practices

1. Profile on the target device early
2. Use hardware-specific frameworks
3. Quantize to int8 when possible
4. Implement offline fallbacks
5. Monitor battery and thermal impact
package/plugin/skills/ml-systems/efficient-ai/SKILL.md
@@ -0,0 +1,316 @@
---
name: efficient-ai
description: Efficient AI techniques including model compression, quantization, pruning, knowledge distillation, and hardware-aware optimization for production systems.
---

# Efficient AI

Techniques for building resource-efficient ML systems.

## Model Compression Overview

```
┌─────────────────────────────────────────────────────────────┐
│                MODEL COMPRESSION TECHNIQUES                 │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  QUANTIZATION       PRUNING           DISTILLATION          │
│  ─────────────      ──────────        ────────────          │
│  FP32 → INT8        Remove weights    Teacher→Student       │
│  2-4x smaller       50-90% sparse     10-100x smaller       │
│  1.5-3x faster      2-4x faster       Same accuracy         │
│                                                             │
│  ARCHITECTURE       LOW-RANK          NEURAL ARCH           │
│  ─────────────      ──────────        ────────────          │
│  MobileNet          Matrix decomp     AutoML search         │
│  EfficientNet       LoRA adapters     Hardware-aware        │
│  Depth-separable    Rank reduction    Latency targets       │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```
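
The size figures in the table follow directly from bit width; a back-of-envelope check (the parameter count is illustrative, roughly ResNet-50-sized):

```python
def weight_size_mb(num_params, bits_per_param):
    return num_params * bits_per_param / 8 / 1024 / 1024

n = 25_000_000
print(weight_size_mb(n, 32))  # ≈ 95.4 MB in FP32
print(weight_size_mb(n, 8))   # ≈ 23.8 MB in INT8 -- the "2-4x smaller" row
```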

## Quantization

### Post-Training Quantization
```python
import torch
from torch.quantization import quantize_dynamic

# Dynamic quantization (weights only; activations are quantized on the fly)
model_dynamic = quantize_dynamic(
    model,
    {torch.nn.Linear, torch.nn.LSTM},
    dtype=torch.qint8
)

# Static quantization (weights + activations)
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model_prepared = torch.quantization.prepare(model)

# Calibrate with representative data
with torch.no_grad():
    for batch in calibration_loader:
        model_prepared(batch)

model_static = torch.quantization.convert(model_prepared)
```

### Quantization-Aware Training
```python
import torch.nn as nn
import torch.quantization as quant

class QuantizedModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = quant.QuantStub()
        self.dequant = quant.DeQuantStub()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        x = self.quant(x)
        x = self.layers(x)
        x = self.dequant(x)
        return x

# Enable QAT (the model must be in train mode)
model.qconfig = quant.get_default_qat_qconfig('fbgemm')
model = quant.prepare_qat(model.train())

# Train normally (fake-quant ops simulate int8 during training)
for epoch in range(epochs):
    train(model, train_loader)

# Convert to a quantized model for inference
model = quant.convert(model.eval())
```

## Pruning

### Magnitude Pruning
```python
import torch.nn.utils.prune as prune

# Unstructured pruning (individual weights)
prune.l1_unstructured(model.layer1, name='weight', amount=0.3)

# Structured pruning (entire channels)
prune.ln_structured(
    model.conv1, name='weight', amount=0.5,
    n=2, dim=0  # Prune 50% of output channels by L2 norm
)

# Global pruning (across layers)
parameters_to_prune = [
    (model.layer1, 'weight'),
    (model.layer2, 'weight'),
]
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.4
)

# Make pruning permanent
for module, name in parameters_to_prune:
    prune.remove(module, name)
```

### Iterative Pruning with Fine-tuning
```python
import torch.nn as nn
import torch.nn.utils.prune as prune

def iterative_pruning(model, train_loader, target_sparsity=0.9, fine_tune_epochs=2):
    cumulative_targets = [0.5, 0.75, target_sparsity]
    pruned_so_far = 0.0

    for target in cumulative_targets:
        # l1_unstructured treats `amount` as a fraction of the weights still
        # unpruned, so convert each cumulative target to a per-round amount
        amount = (target - pruned_so_far) / (1 - pruned_so_far)
        for name, module in model.named_modules():
            if isinstance(module, nn.Linear):
                prune.l1_unstructured(module, 'weight', amount=amount)
        pruned_so_far = target

        # Fine-tune to recover accuracy
        for epoch in range(fine_tune_epochs):
            train_epoch(model, train_loader)

        # Measure sparsity on the masked weights (weight_orig is never zeroed)
        linears = [m for m in model.modules() if isinstance(m, nn.Linear)]
        total_zeros = sum(float((m.weight == 0).sum()) for m in linears)
        total_params = sum(m.weight.numel() for m in linears)
        print(f"Sparsity: {total_zeros / total_params:.2%}")

    return model
```

## Knowledge Distillation

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistillationLoss(nn.Module):
    def __init__(self, temperature=4.0, alpha=0.5):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.ce_loss = nn.CrossEntropyLoss()
        self.kl_loss = nn.KLDivLoss(reduction='batchmean')

    def forward(self, student_logits, teacher_logits, labels):
        # Hard label loss
        hard_loss = self.ce_loss(student_logits, labels)

        # Soft label loss (distillation); T^2 rescales the gradient magnitude
        soft_student = F.log_softmax(student_logits / self.temperature, dim=1)
        soft_teacher = F.softmax(teacher_logits / self.temperature, dim=1)
        soft_loss = self.kl_loss(soft_student, soft_teacher) * (self.temperature ** 2)

        return self.alpha * hard_loss + (1 - self.alpha) * soft_loss

# Training loop
distill_loss = DistillationLoss()
teacher.eval()
for x, y in train_loader:
    with torch.no_grad():
        teacher_logits = teacher(x)
    student_logits = student(x)
    loss = distill_loss(student_logits, teacher_logits, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```

## Efficient Architectures

### Depthwise-Separable Convolutions
```python
import torch.nn as nn

class DepthSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super().__init__()
        self.depthwise = nn.Conv2d(
            in_channels, in_channels, kernel_size,
            padding=kernel_size // 2, groups=in_channels, bias=False
        )
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1, bias=False)

    def forward(self, x):
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x

# Compare params: Regular 3x3 conv with C_in=64, C_out=128 (weights only)
# Regular:  64 * 128 * 3 * 3 = 73,728 params
# DepthSep: 64 * 3 * 3 + 64 * 128 = 576 + 8,192 = 8,768 params (8.4x fewer)
```
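
A quick sanity check of the parameter arithmetic in the comment above, reusing the class just defined:

```python
import torch.nn as nn

regular = nn.Conv2d(64, 128, 3, padding=1, bias=False)
separable = DepthSeparableConv(64, 128)

print(sum(p.numel() for p in regular.parameters()))    # 73728
print(sum(p.numel() for p in separable.parameters()))  # 8768
```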

### MobileNet Inverted Residual Block
```python
class InvertedResidual(nn.Module):
    def __init__(self, in_ch, out_ch, stride, expand_ratio):
        super().__init__()
        hidden_dim = in_ch * expand_ratio
        self.use_residual = stride == 1 and in_ch == out_ch

        self.conv = nn.Sequential(
            # Expand
            nn.Conv2d(in_ch, hidden_dim, 1, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Depthwise
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Project (linear bottleneck: no activation after projection)
            nn.Conv2d(hidden_dim, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch),
        )

    def forward(self, x):
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)
```

## Low-Rank Factorization

```python
import torch.nn as nn

class LowRankLinear(nn.Module):
    def __init__(self, in_features, out_features, rank):
        super().__init__()
        # Factor one (in x out) weight matrix into (in x rank)(rank x out)
        self.A = nn.Linear(in_features, rank, bias=False)
        self.B = nn.Linear(rank, out_features, bias=True)

    def forward(self, x):
        return self.B(self.A(x))

# LoRA-style adaptation
class LoRALayer(nn.Module):
    def __init__(self, original_layer, rank=8, alpha=16):
        super().__init__()
        self.original = original_layer
        self.lora_A = nn.Linear(original_layer.in_features, rank, bias=False)
        self.lora_B = nn.Linear(rank, original_layer.out_features, bias=False)
        self.scaling = alpha / rank

        nn.init.kaiming_uniform_(self.lora_A.weight)
        nn.init.zeros_(self.lora_B.weight)  # adapter starts as a no-op

    def forward(self, x):
        return self.original(x) + self.scaling * self.lora_B(self.lora_A(x))
```
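
Hypothetical usage of `LoRALayer`: freeze a pretrained projection and train only the low-rank adapters (the 768-wide layer is an arbitrary example):

```python
import torch
import torch.nn as nn

base = nn.Linear(768, 768)
for p in base.parameters():
    p.requires_grad = False  # pretrained weights stay frozen

adapted = LoRALayer(base, rank=8, alpha=16)
out = adapted(torch.randn(4, 768))  # only lora_A / lora_B receive gradients
```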

## Efficiency Metrics

```python
import time

import torch

def measure_efficiency(model, input_shape, device='cuda'):
    model = model.to(device)
    model.eval()

    # Model size
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    size_mb = (param_size + buffer_size) / 1024 / 1024

    # FLOPs (using thop)
    from thop import profile
    dummy_input = torch.randn(1, *input_shape).to(device)
    flops, params = profile(model, inputs=(dummy_input,))

    # Latency (synchronize so GPU kernels are included in the timing)
    warmup, iterations = 10, 100
    with torch.no_grad():
        for _ in range(warmup):
            model(dummy_input)

        torch.cuda.synchronize()
        start = time.time()
        for _ in range(iterations):
            model(dummy_input)
        torch.cuda.synchronize()
    latency_ms = (time.time() - start) / iterations * 1000

    return {
        "size_mb": size_mb,
        "params": params,
        "flops": flops,
        "latency_ms": latency_ms,
        "throughput": 1000 / latency_ms
    }
```
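
Example call (assumes a CUDA device and the optional `thop` dependency):

```python
stats = measure_efficiency(model, input_shape=(3, 224, 224))
print(f"{stats['size_mb']:.1f} MB, {stats['flops'] / 1e9:.2f} GFLOPs, "
      f"{stats['latency_ms']:.2f} ms/iter")
```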

## Commands
- `/omgoptim:quantize` - Apply quantization
- `/omgoptim:prune` - Apply pruning
- `/omgoptim:distill` - Knowledge distillation
- `/omgoptim:profile` - Profile efficiency

## Best Practices

1. Start with the largest model that works
2. Quantize first (it usually costs little accuracy)
3. Prune iteratively with fine-tuning
4. Use distillation for maximum compression
5. Profile on target hardware