omgkit 2.19.3 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +537 -338
  2. package/package.json +2 -2
  3. package/plugin/agents/ai-architect-agent.md +282 -0
  4. package/plugin/agents/data-scientist-agent.md +221 -0
  5. package/plugin/agents/experiment-analyst-agent.md +318 -0
  6. package/plugin/agents/ml-engineer-agent.md +165 -0
  7. package/plugin/agents/mlops-engineer-agent.md +324 -0
  8. package/plugin/agents/model-optimizer-agent.md +287 -0
  9. package/plugin/agents/production-engineer-agent.md +360 -0
  10. package/plugin/agents/research-scientist-agent.md +274 -0
  11. package/plugin/commands/omgdata/augment.md +86 -0
  12. package/plugin/commands/omgdata/collect.md +81 -0
  13. package/plugin/commands/omgdata/label.md +83 -0
  14. package/plugin/commands/omgdata/split.md +83 -0
  15. package/plugin/commands/omgdata/validate.md +76 -0
  16. package/plugin/commands/omgdata/version.md +85 -0
  17. package/plugin/commands/omgdeploy/ab.md +94 -0
  18. package/plugin/commands/omgdeploy/cloud.md +89 -0
  19. package/plugin/commands/omgdeploy/edge.md +93 -0
  20. package/plugin/commands/omgdeploy/package.md +91 -0
  21. package/plugin/commands/omgdeploy/serve.md +92 -0
  22. package/plugin/commands/omgfeature/embed.md +93 -0
  23. package/plugin/commands/omgfeature/extract.md +93 -0
  24. package/plugin/commands/omgfeature/select.md +85 -0
  25. package/plugin/commands/omgfeature/store.md +97 -0
  26. package/plugin/commands/omgml/init.md +60 -0
  27. package/plugin/commands/omgml/status.md +82 -0
  28. package/plugin/commands/omgops/drift.md +87 -0
  29. package/plugin/commands/omgops/monitor.md +99 -0
  30. package/plugin/commands/omgops/pipeline.md +102 -0
  31. package/plugin/commands/omgops/registry.md +109 -0
  32. package/plugin/commands/omgops/retrain.md +91 -0
  33. package/plugin/commands/omgoptim/distill.md +90 -0
  34. package/plugin/commands/omgoptim/profile.md +92 -0
  35. package/plugin/commands/omgoptim/prune.md +81 -0
  36. package/plugin/commands/omgoptim/quantize.md +83 -0
  37. package/plugin/commands/omgtrain/baseline.md +78 -0
  38. package/plugin/commands/omgtrain/compare.md +99 -0
  39. package/plugin/commands/omgtrain/evaluate.md +85 -0
  40. package/plugin/commands/omgtrain/train.md +81 -0
  41. package/plugin/commands/omgtrain/tune.md +89 -0
  42. package/plugin/registry.yaml +252 -2
  43. package/plugin/skills/ml-systems/SKILL.md +65 -0
  44. package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
  45. package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
  46. package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
  47. package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
  48. package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
  49. package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
  50. package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
  51. package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
  52. package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
  53. package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
  54. package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
  55. package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
  56. package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
  57. package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
  58. package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
  59. package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
  60. package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
  61. package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
  62. package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
  63. package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
  64. package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
  65. package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
  66. package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
  67. package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
  68. package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
  69. package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
  70. package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
  71. package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
  72. package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
  73. package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
package/plugin/skills/ml-systems/feature-engineering/SKILL.md
@@ -0,0 +1,151 @@
---
name: feature-engineering
description: Feature engineering techniques including feature extraction, transformation, selection, and feature store management for ML systems.
---

# Feature Engineering

Creating informative features for ML models.

## Feature Types

### Numerical Features
```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures

# Scaling
scaler = StandardScaler()        # mean=0, std=1
robust_scaler = RobustScaler()   # median/IQR-based, robust to outliers

# Log transform (for right-skewed data)
df['log_income'] = np.log1p(df['income'])

# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Binning
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 55, 100],
                         labels=['youth', 'young_adult', 'middle', 'senior'])
```

### Categorical Features
```python
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encoding (sparse_output replaces the deprecated sparse arg in sklearn >= 1.2)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = ohe.fit_transform(df[['category']])

# Target encoding with additive smoothing toward the global mean
def target_encode(df, col, target, smoothing=10):
    global_mean = df[target].mean()
    agg = df.groupby(col)[target].agg(['mean', 'count'])
    smooth = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
    return df[col].map(smooth)

# Hash encoding (for high-cardinality categories)
from sklearn.feature_extraction import FeatureHasher
hasher = FeatureHasher(n_features=100, input_type='string')
hashed = hasher.transform([[v] for v in df['category']])
```

### Text Features
```python
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
text_features = tfidf.fit_transform(df['text'])

# Embeddings
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text'].tolist())

# Text statistics
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()
df['avg_word_length'] = df['text'].str.split().apply(lambda x: np.mean([len(w) for w in x]))
```

### Temporal Features
```python
# Datetime components
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Cyclical encoding (so hour 23 sits next to hour 0)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Lag features
df['lag_1'] = df['value'].shift(1)
df['lag_7'] = df['value'].shift(7)
df['rolling_mean_7'] = df['value'].rolling(window=7).mean()
```

## Feature Selection

```python
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Filter method
selector = SelectKBest(mutual_info_classif, k=50)
X_selected = selector.fit_transform(X, y)

# Embedded method (tree importance)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=feature_names)  # feature_names: column names

# Wrapper method: Recursive Feature Elimination
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=20)
X_rfe = rfe.fit_transform(X, y)
```

## Feature Store

```python
# Feast example (assumes the Feast >= 0.26 Field/schema API; older
# releases used Feature(...) and entities=["user_id"] instead)
from datetime import timedelta
from feast import Entity, FeatureStore, FeatureView, Field, FileSource
from feast.types import Float32

user = Entity(name="user_id", join_keys=["user_id"])

# Define feature view
user_features = FeatureView(
    name="user_features",
    entities=[user],
    schema=[
        Field(name="total_purchases", dtype=Float32),
        Field(name="avg_order_value", dtype=Float32),
    ],
    ttl=timedelta(days=1),
    source=FileSource(path="data/user_features.parquet",
                      timestamp_field="event_timestamp")
)

store = FeatureStore(repo_path=".")

# Get point-in-time-correct features for training
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=["user_features:total_purchases"]
).to_df()

# Get features for online inference
online_features = store.get_online_features(
    entity_rows=[{"user_id": 123}],
    features=["user_features:total_purchases"]
)
```

## Commands
- `/omgfeature:extract` - Extract features
- `/omgfeature:select` - Select features
- `/omgfeature:store` - Feature store ops

## Best Practices

1. Start with simple features
2. Use domain knowledge
3. Validate feature distributions
4. Document feature definitions
5. Monitor feature drift (see the PSI sketch below)
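
As a starting point for practice 5, here is a minimal sketch of the Population Stability Index (PSI), a common per-feature drift score; the bin count, the clipping epsilon, and the thresholds in the final comment are illustrative choices, not part of this plugin:

```python
import numpy as np

def psi(expected, actual, bins=10):
    """PSI between a reference (training) sample and a live sample of one feature."""
    edges = np.unique(np.quantile(expected, np.linspace(0, 1, bins + 1)))
    edges[0], edges[-1] = -np.inf, np.inf      # cover values outside the training range
    e = np.histogram(expected, edges)[0] / len(expected)
    a = np.histogram(actual, edges)[0] / len(actual)
    e, a = np.clip(e, 1e-6, None), np.clip(a, 1e-6, None)  # avoid log(0)
    return float(np.sum((a - e) * np.log(a / e)))

# Common rule of thumb: < 0.1 stable, 0.1-0.25 moderate shift, > 0.25 investigate
```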
package/plugin/skills/ml-systems/ml-frameworks/SKILL.md
@@ -0,0 +1,187 @@
---
name: ml-frameworks
description: ML framework best practices for PyTorch, TensorFlow, scikit-learn, and modern ML libraries including training patterns and optimization.
---

# ML Frameworks

Best practices for popular ML frameworks.

## PyTorch

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)
        self.transform = transform

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = self.X[idx]
        if self.transform:
            x = self.transform(x)
        return x, self.y[idx]

# Model Definition
class Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.net(x)

# Training Loop (train_loader: a DataLoader over a CustomDataset)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net(100, 256, 10).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
scaler = torch.cuda.amp.GradScaler()  # mixed precision (torch.amp in newer releases)

for epoch in range(100):
    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast():
            output = model(x)
            loss = F.cross_entropy(output, y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()
```

## TensorFlow/Keras

```python
import tensorflow as tf
from tensorflow import keras

# Model Definition
model = keras.Sequential([
    keras.Input(shape=(100,)),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

model.compile(
    optimizer=keras.optimizers.AdamW(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Callbacks
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5),
    keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True),
    keras.callbacks.TensorBoard(log_dir='./logs')
]

# Training
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks
)
```

## Scikit-learn

```python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

# Preprocessing Pipeline
numeric_features = ['age', 'income']
categorical_features = ['city', 'occupation']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Full Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])

# Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_, grid_search.best_score_)
```

## Hugging Face Transformers

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()
```
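
With `evaluation_strategy="epoch"` the Trainer evaluates every epoch, but it only reports loss unless you supply a metrics function. A minimal sketch (the metric choices here are illustrative, not part of this plugin):

```python
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    # Trainer passes (logits, labels) for each evaluation pass
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds)}

# Pass it in: Trainer(..., compute_metrics=compute_metrics)
```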

## Commands
- `/omgtrain:train` - Train model

## Best Practices

1. Use mixed precision training
2. Implement proper data loading
3. Use learning rate scheduling
4. Enable gradient clipping (see the sketch below)
5. Save checkpoints regularly
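
The PyTorch loop above does not yet show practices 4 and 5. A minimal sketch of both, meant to slot into that loop and assuming the `model`, `optimizer`, and `scaler` defined there (the `max_norm` value and checkpoint path are illustrative):

```python
scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # unscale grads so the clip threshold is meaningful under AMP
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)
scaler.update()

# Periodic checkpoint with everything needed to resume training
torch.save({
    'epoch': epoch,
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scaler': scaler.state_dict(),
}, f"checkpoint_epoch{epoch}.pt")
```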
package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md
@@ -0,0 +1,371 @@
---
name: ml-serving-optimization
description: ML serving optimization techniques including batching, caching, model compilation, and latency reduction for production ML systems.
---

# ML Serving Optimization

Optimizing ML model inference for production.

## Inference Pipeline

```
┌─────────────────────────────────────────────────────────────┐
│                INFERENCE OPTIMIZATION STACK                 │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  REQUEST       PREPROCESSING   INFERENCE     POSTPROCESS    │
│  ────────      ─────────────   ─────────     ───────────    │
│  Batching      Vectorization   Compiled      Response       │
│  Queuing       Caching         Quantized     Caching        │
│  Load balance  Async I/O       Parallelized  Streaming      │
│                                                             │
│  LATENCY BREAKDOWN (typical):                               │
│  ├── Network: 1-5ms                                         │
│  ├── Preprocessing: 2-10ms                                  │
│  ├── Model inference: 5-100ms                               │
│  └── Postprocessing: 1-5ms                                  │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```

## Dynamic Batching

```python
import asyncio
from collections import deque

import torch

class DynamicBatcher:
    def __init__(self, model, max_batch_size=32, max_wait_ms=10):
        self.model = model
        self.max_batch_size = max_batch_size
        self.max_wait_ms = max_wait_ms
        self.queue = deque()
        self.lock = asyncio.Lock()

    async def predict(self, input_data):
        future = asyncio.get_running_loop().create_future()
        async with self.lock:
            self.queue.append((input_data, future))
            if len(self.queue) >= self.max_batch_size:
                await self._process_batch()

        # Wait for batch_loop (or the size-triggered flush above) to fulfil the future
        return await asyncio.wait_for(future, timeout=self.max_wait_ms / 1000 + 1)

    async def _process_batch(self):
        if not self.queue:
            return

        batch_items = []
        while self.queue and len(batch_items) < self.max_batch_size:
            batch_items.append(self.queue.popleft())

        inputs = torch.stack([item[0] for item in batch_items])
        with torch.no_grad():
            outputs = self.model(inputs)

        for i, (_, future) in enumerate(batch_items):
            future.set_result(outputs[i])

    async def batch_loop(self):
        while True:
            await asyncio.sleep(self.max_wait_ms / 1000)
            async with self.lock:
                if self.queue:
                    await self._process_batch()
```
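
One subtlety: nothing drains a partial batch unless `batch_loop` is actually scheduled. A minimal wiring sketch (the input shape is an arbitrary example, and `model` is assumed to accept a stacked batch tensor):

```python
async def main():
    batcher = DynamicBatcher(model, max_batch_size=32, max_wait_ms=10)
    flusher = asyncio.create_task(batcher.batch_loop())  # background flusher
    result = await batcher.predict(torch.randn(3, 224, 224))
    flusher.cancel()

asyncio.run(main())
```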

## Model Compilation

### TorchScript
```python
import torch

# Tracing (records one execution path using example inputs)
traced_model = torch.jit.trace(model, example_input)
traced_model.save("model_traced.pt")

# Scripting (compiles Python directly, preserving data-dependent control flow
# that tracing would freeze)
@torch.jit.script
def postprocess(output: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    if output.max() > threshold:
        return output
    return torch.zeros_like(output)

# Optimize for inference
traced_model = torch.jit.optimize_for_inference(traced_model)
```

### torch.compile (PyTorch 2.0+)
```python
import torch

# Default compilation (TorchInductor backend)
compiled_model = torch.compile(model)

# With options
compiled_model = torch.compile(
    model,
    mode="reduce-overhead",  # or "max-autotune" for longer compiles, faster kernels
    fullgraph=True,          # error out instead of falling back on graph breaks
    dynamic=False            # specialize on static input shapes
)
```

### TensorRT
```python
import torch
import torch_tensorrt

# Compile to TensorRT with a dynamic batch dimension
trt_model = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input(
        min_shape=[1, 3, 224, 224],
        opt_shape=[8, 3, 224, 224],
        max_shape=[32, 3, 224, 224],
        dtype=torch.float16
    )],
    enabled_precisions={torch.float16},
    workspace_size=1 << 30
)

# Save and load
torch.jit.save(trt_model, "model_trt.ts")
loaded = torch.jit.load("model_trt.ts")
```

### ONNX Runtime
```python
import torch
import onnxruntime as ort

# Export to ONNX
torch.onnx.export(
    model, example_input, "model.onnx",
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={'input': {0: 'batch_size'}}
)

# Create optimized session
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.intra_op_num_threads = 4

# GPU execution, with CPU fallback
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession("model.onnx", session_options, providers=providers)

# Inference
outputs = session.run(None, {'input': input_data.numpy()})
```

## Caching Strategies

```python
import hashlib
import pickle

import redis
import torch

class InferenceCache:
    """Exact-match cache: hash the raw input bytes, store pickled outputs in Redis."""
    def __init__(self, redis_client, ttl=3600):
        self.redis = redis_client
        self.ttl = ttl

    def _hash_input(self, input_data):
        return hashlib.sha256(input_data.tobytes()).hexdigest()

    def get(self, input_data):
        key = self._hash_input(input_data)
        cached = self.redis.get(key)
        if cached:
            return pickle.loads(cached)
        return None

    def set(self, input_data, output):
        key = self._hash_input(input_data)
        self.redis.setex(key, self.ttl, pickle.dumps(output))

# Semantic cache: reuse outputs for inputs whose embeddings are near-duplicates
class EmbeddingCache:
    def __init__(self, threshold=0.95):
        self.embeddings = []
        self.outputs = []
        self.threshold = threshold

    def find_similar(self, embedding):
        if not self.embeddings:
            return None

        similarities = torch.cosine_similarity(
            embedding.unsqueeze(0),
            torch.stack(self.embeddings)
        )
        max_sim, idx = similarities.max(0)

        if max_sim > self.threshold:
            return self.outputs[idx.item()]  # .item(): a tensor cannot index a list
        return None

    def add(self, embedding, output):
        self.embeddings.append(embedding)
        self.outputs.append(output)
```
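
A cache-aside wiring sketch for the exact-match cache (assumes a Redis server on localhost and a `model` that takes a single NumPy-backed input; both are illustrative):

```python
cache = InferenceCache(redis.Redis(host="localhost", port=6379), ttl=3600)

def cached_predict(input_array):
    hit = cache.get(input_array)
    if hit is not None:
        return hit                      # served from Redis
    with torch.no_grad():
        out = model(torch.from_numpy(input_array)).numpy()
    cache.set(input_array, out)         # populate for next time
    return out
```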

## Async Inference

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

import torch

class AsyncInferenceService:
    """Run blocking model calls in a thread pool so the event loop stays responsive."""
    def __init__(self, model, num_workers=4):
        self.model = model
        self.executor = ThreadPoolExecutor(max_workers=num_workers)

    def _sync_predict(self, input_data):
        with torch.no_grad():
            return self.model(input_data)

    async def predict(self, input_data):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            self._sync_predict,
            input_data
        )

    async def predict_batch(self, inputs):
        tasks = [self.predict(inp) for inp in inputs]
        return await asyncio.gather(*tasks)

# CUDA streams for parallel inference on one GPU
class StreamedInference:
    def __init__(self, model, num_streams=4):
        self.model = model
        self.streams = [torch.cuda.Stream() for _ in range(num_streams)]

    def predict_parallel(self, inputs):
        outputs = []
        for i, inp in enumerate(inputs):
            stream = self.streams[i % len(self.streams)]
            with torch.cuda.stream(stream):
                outputs.append(self.model(inp))

        torch.cuda.synchronize()  # wait for all streams before reading results
        return outputs
```

## Latency Profiling

```python
import time
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class LatencyStats:
    mean: float
    p50: float
    p95: float
    p99: float
    max: float

class LatencyProfiler:
    def __init__(self):
        self.timings: Dict[str, List[float]] = {}

    @contextmanager
    def measure(self, name: str):
        start = time.perf_counter()
        yield
        elapsed = (time.perf_counter() - start) * 1000  # ms
        if name not in self.timings:
            self.timings[name] = []
        self.timings[name].append(elapsed)

    def stats(self, name: str) -> LatencyStats:
        times = sorted(self.timings[name])
        n = len(times)
        return LatencyStats(
            mean=sum(times) / n,
            p50=times[n // 2],
            p95=times[int(n * 0.95)],
            p99=times[int(n * 0.99)],
            max=times[-1]
        )

    def report(self):
        for name in self.timings:
            s = self.stats(name)
            print(f"{name}: mean={s.mean:.2f}ms p95={s.p95:.2f}ms p99={s.p99:.2f}ms")

# Usage
profiler = LatencyProfiler()

with profiler.measure("preprocess"):
    preprocessed = preprocess(data)

with profiler.measure("inference"):
    output = model(preprocessed)

with profiler.measure("postprocess"):
    result = postprocess(output)

profiler.report()
```

## KV Cache for Transformers

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class KVCacheAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x, past_kv=None):
        B, T, C = x.shape
        qkv = self.qkv(x).reshape(B, T, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(2)                           # each (B, T, H, D)
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # (B, H, T, D)

        if past_kv is not None:
            past_k, past_v = past_kv
            k = torch.cat([past_k, k], dim=2)  # append along the sequence axis
            v = torch.cat([past_v, v], dim=2)

        # Attention computation
        attn = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn = F.softmax(attn, dim=-1)
        out = torch.matmul(attn, v)                       # (B, H, T, D)
        out = out.transpose(1, 2).reshape(B, T, C)

        return self.out(out), (k, v)
```
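
The cache pays off during autoregressive decoding: after the prompt pass, each step feeds only the newest token and reuses the cached keys/values. A greedy-decoding sketch, assuming a hypothetical `model` that maps token ids to logits through layers like `KVCacheAttention` and returns the updated cache:

```python
@torch.no_grad()
def generate(model, tokens, steps):
    past_kv = None
    x = tokens                                        # (B, T) prompt ids
    for _ in range(steps):
        logits, past_kv = model(x, past_kv=past_kv)   # cache grows by len(x)
        next_tok = logits[:, -1].argmax(-1, keepdim=True)
        tokens = torch.cat([tokens, next_tok], dim=1)
        x = next_tok                                  # from now on: one token per step
    return tokens
```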

## Commands
- `/omgoptim:profile` - Profile latency
- `/omgdeploy:serve` - Optimized serving
- `/omgoptim:quantize` - Model quantization

## Best Practices

1. Measure before optimizing
2. Use dynamic batching for throughput
3. Compile models for production
4. Cache repeated computations
5. Profile end-to-end latency