omgkit 2.20.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -10
- package/package.json +1 -1
- package/plugin/agents/ai-architect-agent.md +282 -0
- package/plugin/agents/data-scientist-agent.md +221 -0
- package/plugin/agents/experiment-analyst-agent.md +318 -0
- package/plugin/agents/ml-engineer-agent.md +165 -0
- package/plugin/agents/mlops-engineer-agent.md +324 -0
- package/plugin/agents/model-optimizer-agent.md +287 -0
- package/plugin/agents/production-engineer-agent.md +360 -0
- package/plugin/agents/research-scientist-agent.md +274 -0
- package/plugin/commands/omgdata/augment.md +86 -0
- package/plugin/commands/omgdata/collect.md +81 -0
- package/plugin/commands/omgdata/label.md +83 -0
- package/plugin/commands/omgdata/split.md +83 -0
- package/plugin/commands/omgdata/validate.md +76 -0
- package/plugin/commands/omgdata/version.md +85 -0
- package/plugin/commands/omgdeploy/ab.md +94 -0
- package/plugin/commands/omgdeploy/cloud.md +89 -0
- package/plugin/commands/omgdeploy/edge.md +93 -0
- package/plugin/commands/omgdeploy/package.md +91 -0
- package/plugin/commands/omgdeploy/serve.md +92 -0
- package/plugin/commands/omgfeature/embed.md +93 -0
- package/plugin/commands/omgfeature/extract.md +93 -0
- package/plugin/commands/omgfeature/select.md +85 -0
- package/plugin/commands/omgfeature/store.md +97 -0
- package/plugin/commands/omgml/init.md +60 -0
- package/plugin/commands/omgml/status.md +82 -0
- package/plugin/commands/omgops/drift.md +87 -0
- package/plugin/commands/omgops/monitor.md +99 -0
- package/plugin/commands/omgops/pipeline.md +102 -0
- package/plugin/commands/omgops/registry.md +109 -0
- package/plugin/commands/omgops/retrain.md +91 -0
- package/plugin/commands/omgoptim/distill.md +90 -0
- package/plugin/commands/omgoptim/profile.md +92 -0
- package/plugin/commands/omgoptim/prune.md +81 -0
- package/plugin/commands/omgoptim/quantize.md +83 -0
- package/plugin/commands/omgtrain/baseline.md +78 -0
- package/plugin/commands/omgtrain/compare.md +99 -0
- package/plugin/commands/omgtrain/evaluate.md +85 -0
- package/plugin/commands/omgtrain/train.md +81 -0
- package/plugin/commands/omgtrain/tune.md +89 -0
- package/plugin/registry.yaml +252 -2
- package/plugin/skills/ml-systems/SKILL.md +65 -0
- package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
- package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
- package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
- package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
- package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
- package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
- package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
- package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
- package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
- package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
- package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
- package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
- package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
- package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
- package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
- package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
- package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
- package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
- package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
- package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
- package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
- package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
- package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
- package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
- package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
- package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
- package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
- package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
- package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
- package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0

@@ -0,0 +1,366 @@
---
name: edge-deployment
description: Edge deployment strategies including mobile optimization, embedded systems, TFLite, Core ML, and resource-constrained inference.
---

# Edge Deployment

Deploying ML models to edge devices.

## Edge Deployment Landscape

```
┌──────────────────────────────────────────────────────────────┐
│                   EDGE DEPLOYMENT TARGETS                    │
├──────────────────────────────────────────────────────────────┤
│                                                              │
│  MOBILE             EMBEDDED            IOT/SENSORS          │
│  ──────             ────────            ───────────          │
│  iOS (Core ML)      Raspberry Pi        Arduino/ESP32        │
│  Android (TFLite)   NVIDIA Jetson       Microcontrollers     │
│  React Native       Google Coral        FPGA boards          │
│                                                              │
│  CONSTRAINTS:                                                │
│  ├── Memory: 256MB - 8GB                                     │
│  ├── Compute: CPU/GPU/NPU                                    │
│  ├── Power: Battery/USB/Wall                                 │
│  ├── Connectivity: Always/Sometimes/Never                    │
│  └── Latency: 1ms - 100ms                                    │
│                                                              │
└──────────────────────────────────────────────────────────────┘
```
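
The constraints above drive most deployment decisions. As a minimal illustration (not part of the plugin), a target profile can be captured as data and checked before picking a conversion path; the `EdgeTarget` class and the numbers in the example are hypothetical.

```python
from dataclasses import dataclass

@dataclass
class EdgeTarget:
    name: str
    memory_mb: int          # RAM available for model + runtime
    max_latency_ms: float   # per-inference latency budget
    has_accelerator: bool   # NPU / GPU / Edge TPU present

def fits(model_size_mb: float, measured_latency_ms: float, target: EdgeTarget) -> bool:
    """Check a candidate model against a target's memory and latency budget."""
    return model_size_mb <= target.memory_mb and measured_latency_ms <= target.max_latency_ms

# Example: a battery-powered camera with 512MB of RAM and a 50ms budget
camera = EdgeTarget(name="smart-camera", memory_mb=512, max_latency_ms=50.0, has_accelerator=False)
print(fits(model_size_mb=12.5, measured_latency_ms=31.0, target=camera))  # True
```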

## TensorFlow Lite

### Model Conversion
```python
import numpy as np
import tensorflow as tf

# Basic conversion
converter = tf.lite.TFLiteConverter.from_saved_model('saved_model/')
tflite_model = converter.convert()

# With optimizations
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Float16 quantization
converter.target_spec.supported_types = [tf.float16]

# Full integer quantization (calibration_data: an iterable of sample inputs)
def representative_dataset():
    for data in calibration_data:
        yield [data.astype(np.float32)]

converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

# Save model
with open('model.tflite', 'wb') as f:
    f.write(converter.convert())
```

### TFLite Inference
```python
import numpy as np
import tensorflow as tf

# Load model
interpreter = tf.lite.Interpreter(model_path='model.tflite')
interpreter.allocate_tensors()

# Get input/output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Inference
def predict(input_data):
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    return interpreter.get_tensor(output_details[0]['index'])

# With a delegate (Edge TPU acceleration via libedgetpu)
delegate = tf.lite.experimental.load_delegate('libedgetpu.so.1')
interpreter = tf.lite.Interpreter(
    model_path='model_edgetpu.tflite',
    experimental_delegates=[delegate]
)
```

## Core ML (iOS)

### PyTorch to Core ML
```python
import coremltools as ct
import torch

# Export to Core ML (model: a trained torch.nn.Module defined elsewhere)
model.eval()
example_input = torch.rand(1, 3, 224, 224)
traced_model = torch.jit.trace(model, example_input)

mlmodel = ct.convert(
    traced_model,
    inputs=[ct.TensorType(shape=example_input.shape, name="image")],
    outputs=[ct.TensorType(name="predictions")],
    compute_precision=ct.precision.FLOAT16,
    minimum_deployment_target=ct.target.iOS15
)

# Add metadata
mlmodel.author = "Your Name"
mlmodel.short_description = "Image classifier"
mlmodel.input_description["image"] = "Input image"
mlmodel.output_description["predictions"] = "Class probabilities"

mlmodel.save("Model.mlpackage")
```

### Swift Integration
```swift
import CoreML
import Vision

class ModelInference {
    let model: VNCoreMLModel

    init() throws {
        let config = MLModelConfiguration()
        config.computeUnits = .all  // Use Neural Engine when available
        let mlModel = try MyModel(configuration: config)
        self.model = try VNCoreMLModel(for: mlModel.model)
    }

    func predict(image: CGImage, completion: @escaping ([String: Double]) -> Void) {
        let request = VNCoreMLRequest(model: model) { request, error in
            guard let results = request.results as? [VNClassificationObservation] else { return }
            let predictions = Dictionary(
                uniqueKeysWithValues: results.prefix(5).map { ($0.identifier, Double($0.confidence)) }
            )
            completion(predictions)
        }

        let handler = VNImageRequestHandler(cgImage: image)
        try? handler.perform([request])
    }
}
```

## NVIDIA Jetson

### TensorRT Optimization
```python
import tensorrt as trt

# Uses the older TensorRT builder API (max_workspace_size / build_engine),
# available through TensorRT 8.x.
def build_engine(onnx_path, engine_path, precision='fp16'):
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX
    with open(onnx_path, 'rb') as f:
        parser.parse(f.read())

    # Build config
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1GB

    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        # EntropyCalibrator: a user-supplied trt.IInt8EntropyCalibrator2 implementation
        config.int8_calibrator = EntropyCalibrator(calibration_data)

    # Build engine
    engine = builder.build_engine(network, config)

    with open(engine_path, 'wb') as f:
        f.write(engine.serialize())

    return engine

# Inference with TensorRT
class TRTInference:
    def __init__(self, engine_path):
        logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, 'rb') as f:
            self.engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

    def infer(self, input_data):
        # Allocate buffers and run inference (see the sketch below)
        pass
```
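
The `infer` method above is left as a stub. A minimal sketch of the missing step, written as a free function over the engine and context created above, assuming the TensorRT 7/8 binding APIs, a single input and a single output binding, an FP32 output, and `pycuda` for device memory (all of these are assumptions, not plugin code):

```python
import numpy as np
import pycuda.autoinit  # creates a CUDA context on import
import pycuda.driver as cuda

def trt_infer(engine, context, input_data):
    """Copy input to the GPU, execute the engine, copy the result back."""
    output_shape = tuple(engine.get_binding_shape(1))  # assumes binding 1 is the output
    output = np.empty(output_shape, dtype=np.float32)

    d_input = cuda.mem_alloc(input_data.nbytes)
    d_output = cuda.mem_alloc(output.nbytes)

    cuda.memcpy_htod(d_input, np.ascontiguousarray(input_data, dtype=np.float32))
    context.execute_v2(bindings=[int(d_input), int(d_output)])
    cuda.memcpy_dtoh(output, d_output)
    return output
```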

### DeepStream Pipeline
```python
# DeepStream config for video inference
# config_infer_primary.txt
"""
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-file=resnet18.onnx
model-engine-file=resnet18.engine
labelfile-path=labels.txt
batch-size=4
network-mode=2 # FP16
num-detected-classes=80
interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
symmetric-padding=1
"""

# Python pipeline (simplified; a full DeepStream pipeline also batches frames
# through nvstreammux before nvinfer)
import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst

Gst.init(None)
pipeline = Gst.parse_launch("""
    filesrc location=video.mp4 !
    decodebin !
    nvvideoconvert !
    nvinfer config-file-path=config_infer_primary.txt !
    nvdsosd !
    nvegltransform !
    nveglglessink
""")
pipeline.set_state(Gst.State.PLAYING)
```

## Microcontroller Deployment

### TensorFlow Lite Micro
```cpp
#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "model_data.h"

// Allocate tensor arena
constexpr int kTensorArenaSize = 10 * 1024;
uint8_t tensor_arena[kTensorArenaSize];

void setup() {
  // Set up model
  const tflite::Model* model = tflite::GetModel(model_data);

  // Set up resolver
  tflite::AllOpsResolver resolver;

  // Build interpreter
  tflite::MicroInterpreter interpreter(
      model, resolver, tensor_arena, kTensorArenaSize
  );
  interpreter.AllocateTensors();

  // Get input tensor
  TfLiteTensor* input = interpreter.input(0);

  // Fill input and invoke
  // input->data.f[0] = sensor_value;
  interpreter.Invoke();

  // Get output
  TfLiteTensor* output = interpreter.output(0);
  float prediction = output->data.f[0];
}
```

## Model Optimization for Edge

```python
import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic

# Helpers (get_size_mb, prune_model, create_smaller_model, distill,
# measure_latency, evaluate) are assumed to be defined elsewhere in the project.
def optimize_for_edge(model, target_size_mb=10, target_latency_ms=50):
    """Optimize model for edge deployment."""
    optimizations = []

    # 1. Quantization (dynamic quantization covers Linear/recurrent layers)
    quantized = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
    if get_size_mb(quantized) <= target_size_mb:
        optimizations.append(('quantization', quantized))

    # 2. Pruning
    pruned = prune_model(model, amount=0.5)
    if get_size_mb(pruned) <= target_size_mb:
        optimizations.append(('pruning', pruned))

    # 3. Knowledge distillation
    student = create_smaller_model(model)
    distilled = distill(teacher=model, student=student)
    if get_size_mb(distilled) <= target_size_mb:
        optimizations.append(('distillation', distilled))

    # Evaluate each
    results = []
    for name, opt_model in optimizations:
        latency = measure_latency(opt_model)
        accuracy = evaluate(opt_model)
        if latency <= target_latency_ms:
            results.append({
                'method': name,
                'latency': latency,
                'accuracy': accuracy,
                'size_mb': get_size_mb(opt_model)
            })

    return sorted(results, key=lambda x: x['accuracy'], reverse=True)
```

## Offline Inference

```python
import time

import requests

# load_model, _is_online, and _cloud_predict are assumed to be implemented
# elsewhere (model loading, connectivity check, and the remote call).
class OfflineInferenceManager:
    def __init__(self, model_path, cache_dir='./cache'):
        self.model = load_model(model_path)
        self.cache_dir = cache_dir
        self.pending_queue = []

    def predict(self, input_data, priority='normal'):
        """Run inference locally."""
        return self.model(input_data)

    def predict_with_fallback(self, input_data, cloud_endpoint=None):
        """Try cloud first, fall back to local."""
        try:
            if self._is_online() and cloud_endpoint:
                return self._cloud_predict(input_data, cloud_endpoint)
        except Exception:
            pass

        return self.predict(input_data)

    def queue_for_sync(self, input_data, result):
        """Queue predictions for later sync."""
        self.pending_queue.append({
            'input': input_data,
            'result': result,
            'timestamp': time.time()
        })

    def sync_when_online(self, endpoint):
        """Sync queued predictions when connectivity is restored."""
        while self.pending_queue and self._is_online():
            item = self.pending_queue.pop(0)
            requests.post(endpoint, json=item)
```
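
A short usage sketch of the fallback-and-sync flow above; `input_data`, the endpoint URLs, and the helper implementations are placeholders.

```python
manager = OfflineInferenceManager('model.tflite')

# Prefer the cloud model when reachable, otherwise run locally
result = manager.predict_with_fallback(input_data, cloud_endpoint='https://api.example.com/predict')

# Record the prediction and push it upstream once connectivity returns
manager.queue_for_sync(input_data, result)
manager.sync_when_online('https://api.example.com/sync')
```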

## Commands
- `/omgdeploy:edge` - Edge deployment
- `/omgoptim:quantize` - Quantization
- `/omgdeploy:package` - Package for target

## Best Practices

1. Profile on target device early
2. Use hardware-specific frameworks
3. Quantize to int8 when possible
4. Implement offline fallbacks
5. Monitor battery and thermal impact

@@ -0,0 +1,316 @@
---
name: efficient-ai
description: Efficient AI techniques including model compression, quantization, pruning, knowledge distillation, and hardware-aware optimization for production systems.
---

# Efficient AI

Techniques for building resource-efficient ML systems.

## Model Compression Overview

```
┌──────────────────────────────────────────────────────────────┐
│                 MODEL COMPRESSION TECHNIQUES                 │
├──────────────────────────────────────────────────────────────┤
│                                                              │
│  QUANTIZATION        PRUNING             DISTILLATION        │
│  ─────────────       ──────────          ────────────        │
│  FP32 → INT8         Remove weights      Teacher→Student     │
│  2-4x smaller        50-90% sparse       10-100x smaller     │
│  1.5-3x faster       2-4x faster         Same accuracy       │
│                                                              │
│  ARCHITECTURE        LOW-RANK            NEURAL ARCH         │
│  ─────────────       ──────────          ────────────        │
│  MobileNet           Matrix decomp       AutoML search       │
│  EfficientNet        LoRA adapters       Hardware-aware      │
│  Depth-separable     Rank reduction      Latency targets     │
│                                                              │
└──────────────────────────────────────────────────────────────┘
```
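
The "smaller" figures above mostly follow from per-parameter storage cost: FP32 weights take 4 bytes, FP16 takes 2, and INT8 takes 1. A quick back-of-the-envelope helper (the 25M-parameter example is illustrative, and activations and runtime overhead are ignored):

```python
def weight_size_mb(num_params: int, bits_per_param: int) -> float:
    """Approximate weight storage for a given precision."""
    return num_params * bits_per_param / 8 / 1024 / 1024

# Example: a 25M-parameter model
for precision, bits in [("FP32", 32), ("FP16", 16), ("INT8", 8)]:
    print(f"{precision}: {weight_size_mb(25_000_000, bits):.1f} MB")
# FP32: 95.4 MB, FP16: 47.7 MB, INT8: 23.8 MB -> the 2-4x reduction above
```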

## Quantization

### Post-Training Quantization
```python
import torch
from torch.quantization import quantize_dynamic

# Dynamic quantization (weights only)
model_dynamic = quantize_dynamic(
    model,
    {torch.nn.Linear, torch.nn.LSTM},
    dtype=torch.qint8
)

# Static quantization (weights + activations)
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model_prepared = torch.quantization.prepare(model)

# Calibrate with representative data
with torch.no_grad():
    for batch in calibration_loader:
        model_prepared(batch)

model_static = torch.quantization.convert(model_prepared)
```

### Quantization-Aware Training
```python
import torch.nn as nn
import torch.quantization as quant

class QuantizedModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = quant.QuantStub()
        self.dequant = quant.DeQuantStub()
        self.layers = nn.Sequential(
            nn.Linear(784, 256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        x = self.quant(x)
        x = self.layers(x)
        x = self.dequant(x)
        return x

# Enable QAT
model = QuantizedModel()
model.qconfig = quant.get_default_qat_qconfig('fbgemm')
model = quant.prepare_qat(model)

# Train normally (train is the project's training helper)
for epoch in range(epochs):
    train(model, train_loader)

# Convert to quantized
model = quant.convert(model)
```

## Pruning

### Magnitude Pruning
```python
import torch.nn.utils.prune as prune

# Unstructured pruning (individual weights)
prune.l1_unstructured(model.layer1, name='weight', amount=0.3)

# Structured pruning (entire channels)
prune.ln_structured(
    model.conv1, name='weight', amount=0.5,
    n=2, dim=0  # Prune 50% of output channels
)

# Global pruning (across layers)
parameters_to_prune = [
    (model.layer1, 'weight'),
    (model.layer2, 'weight'),
]
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.4
)

# Make pruning permanent
for module, name in parameters_to_prune:
    prune.remove(module, name)
```

### Iterative Pruning with Fine-tuning
```python
import torch.nn as nn
import torch.nn.utils.prune as prune

def iterative_pruning(model, train_loader, target_sparsity=0.9, fine_tune_epochs=2):
    # Each round prunes a fraction of the weights that remain after earlier
    # rounds, so effective sparsity compounds across rounds.
    sparsity_schedule = [0.5, 0.75, 0.9]

    for target in sparsity_schedule:
        # Prune
        for name, module in model.named_modules():
            if isinstance(module, nn.Linear):
                prune.l1_unstructured(module, 'weight', amount=target)

        # Fine-tune (train_epoch is the project's training helper)
        for epoch in range(fine_tune_epochs):
            train_epoch(model, train_loader)

        # Measure sparsity on the effective (masked) weights
        weights = [m.weight for m in model.modules() if isinstance(m, nn.Linear)]
        total_zeros = sum((w == 0).sum().item() for w in weights)
        total_params = sum(w.numel() for w in weights)
        current_sparsity = total_zeros / total_params
        print(f"Sparsity: {current_sparsity:.2%}")

    return model
```

## Knowledge Distillation

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistillationLoss(nn.Module):
    def __init__(self, temperature=4.0, alpha=0.5):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.ce_loss = nn.CrossEntropyLoss()
        self.kl_loss = nn.KLDivLoss(reduction='batchmean')

    def forward(self, student_logits, teacher_logits, labels):
        # Hard label loss
        hard_loss = self.ce_loss(student_logits, labels)

        # Soft label loss (distillation)
        soft_student = F.log_softmax(student_logits / self.temperature, dim=1)
        soft_teacher = F.softmax(teacher_logits / self.temperature, dim=1)
        soft_loss = self.kl_loss(soft_student, soft_teacher) * (self.temperature ** 2)

        return self.alpha * hard_loss + (1 - self.alpha) * soft_loss

# Training loop (teacher, student, optimizer, train_loader defined elsewhere)
distill_loss = DistillationLoss()
teacher.eval()
for batch in train_loader:
    x, y = batch
    with torch.no_grad():
        teacher_logits = teacher(x)
    student_logits = student(x)
    loss = distill_loss(student_logits, teacher_logits, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```

## Efficient Architectures

### Depth-Separable Convolutions
```python
import torch.nn as nn

class DepthSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super().__init__()
        self.depthwise = nn.Conv2d(
            in_channels, in_channels, kernel_size,
            padding=kernel_size // 2, groups=in_channels
        )
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x

# Compare params: Regular 3x3 conv with C_in=64, C_out=128
# Regular:  64 * 128 * 3 * 3 = 73,728 params
# DepthSep: 64 * 3 * 3 + 64 * 128 = 576 + 8,192 = 8,768 params (8.4x fewer)
```

### MobileNet Inverted Residual Block
```python
import torch.nn as nn

class InvertedResidual(nn.Module):
    def __init__(self, in_ch, out_ch, stride, expand_ratio):
        super().__init__()
        hidden_dim = in_ch * expand_ratio
        self.use_residual = stride == 1 and in_ch == out_ch

        self.conv = nn.Sequential(
            # Expand
            nn.Conv2d(in_ch, hidden_dim, 1, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Depthwise
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # Project
            nn.Conv2d(hidden_dim, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch),
        )

    def forward(self, x):
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)
```

## Low-Rank Factorization

```python
import torch.nn as nn

class LowRankLinear(nn.Module):
    def __init__(self, in_features, out_features, rank):
        super().__init__()
        self.A = nn.Linear(in_features, rank, bias=False)
        self.B = nn.Linear(rank, out_features, bias=True)

    def forward(self, x):
        return self.B(self.A(x))

# LoRA-style adaptation
class LoRALayer(nn.Module):
    def __init__(self, original_layer, rank=8, alpha=16):
        super().__init__()
        self.original = original_layer
        self.lora_A = nn.Linear(original_layer.in_features, rank, bias=False)
        self.lora_B = nn.Linear(rank, original_layer.out_features, bias=False)
        self.scaling = alpha / rank

        nn.init.kaiming_uniform_(self.lora_A.weight)
        nn.init.zeros_(self.lora_B.weight)

    def forward(self, x):
        return self.original(x) + self.scaling * self.lora_B(self.lora_A(x))
```
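
A short usage sketch of the adapter above: wrap an existing linear layer, freeze the base weights, and train only the low-rank matrices. The layer sizes and learning rate here are arbitrary.

```python
import torch
import torch.nn as nn

base = nn.Linear(512, 512)          # pretrained layer being adapted
adapted = LoRALayer(base, rank=8, alpha=16)

# Freeze the original weights; only lora_A / lora_B receive gradients
for p in adapted.original.parameters():
    p.requires_grad = False

trainable = [p for p in adapted.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable, lr=1e-3)

out = adapted(torch.randn(4, 512))  # forward pass behaves like the original layer
```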

## Efficiency Metrics

```python
import torch

def measure_efficiency(model, input_shape, device='cuda'):
    import time

    model = model.to(device)
    model.eval()

    # Model size
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    size_mb = (param_size + buffer_size) / 1024 / 1024

    # FLOPs (using thop)
    from thop import profile
    dummy_input = torch.randn(1, *input_shape).to(device)
    flops, params = profile(model, inputs=(dummy_input,))

    # Latency
    warmup = 10
    iterations = 100

    with torch.no_grad():
        for _ in range(warmup):
            model(dummy_input)

        torch.cuda.synchronize()
        start = time.time()
        for _ in range(iterations):
            model(dummy_input)
        torch.cuda.synchronize()
    latency_ms = (time.time() - start) / iterations * 1000

    return {
        "size_mb": size_mb,
        "params": params,
        "flops": flops,
        "latency_ms": latency_ms,
        "throughput": 1000 / latency_ms
    }
```

## Commands
- `/omgoptim:quantize` - Apply quantization
- `/omgoptim:prune` - Apply pruning
- `/omgoptim:distill` - Knowledge distillation
- `/omgoptim:profile` - Profile efficiency

## Best Practices

1. Start with the largest model that works
2. Quantize first (usually near-zero accuracy cost)
3. Prune iteratively with fine-tuning
4. Use distillation for maximum compression
5. Profile on target hardware