omgkit 2.19.3 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +537 -338
  2. package/package.json +2 -2
  3. package/plugin/agents/ai-architect-agent.md +282 -0
  4. package/plugin/agents/data-scientist-agent.md +221 -0
  5. package/plugin/agents/experiment-analyst-agent.md +318 -0
  6. package/plugin/agents/ml-engineer-agent.md +165 -0
  7. package/plugin/agents/mlops-engineer-agent.md +324 -0
  8. package/plugin/agents/model-optimizer-agent.md +287 -0
  9. package/plugin/agents/production-engineer-agent.md +360 -0
  10. package/plugin/agents/research-scientist-agent.md +274 -0
  11. package/plugin/commands/omgdata/augment.md +86 -0
  12. package/plugin/commands/omgdata/collect.md +81 -0
  13. package/plugin/commands/omgdata/label.md +83 -0
  14. package/plugin/commands/omgdata/split.md +83 -0
  15. package/plugin/commands/omgdata/validate.md +76 -0
  16. package/plugin/commands/omgdata/version.md +85 -0
  17. package/plugin/commands/omgdeploy/ab.md +94 -0
  18. package/plugin/commands/omgdeploy/cloud.md +89 -0
  19. package/plugin/commands/omgdeploy/edge.md +93 -0
  20. package/plugin/commands/omgdeploy/package.md +91 -0
  21. package/plugin/commands/omgdeploy/serve.md +92 -0
  22. package/plugin/commands/omgfeature/embed.md +93 -0
  23. package/plugin/commands/omgfeature/extract.md +93 -0
  24. package/plugin/commands/omgfeature/select.md +85 -0
  25. package/plugin/commands/omgfeature/store.md +97 -0
  26. package/plugin/commands/omgml/init.md +60 -0
  27. package/plugin/commands/omgml/status.md +82 -0
  28. package/plugin/commands/omgops/drift.md +87 -0
  29. package/plugin/commands/omgops/monitor.md +99 -0
  30. package/plugin/commands/omgops/pipeline.md +102 -0
  31. package/plugin/commands/omgops/registry.md +109 -0
  32. package/plugin/commands/omgops/retrain.md +91 -0
  33. package/plugin/commands/omgoptim/distill.md +90 -0
  34. package/plugin/commands/omgoptim/profile.md +92 -0
  35. package/plugin/commands/omgoptim/prune.md +81 -0
  36. package/plugin/commands/omgoptim/quantize.md +83 -0
  37. package/plugin/commands/omgtrain/baseline.md +78 -0
  38. package/plugin/commands/omgtrain/compare.md +99 -0
  39. package/plugin/commands/omgtrain/evaluate.md +85 -0
  40. package/plugin/commands/omgtrain/train.md +81 -0
  41. package/plugin/commands/omgtrain/tune.md +89 -0
  42. package/plugin/registry.yaml +252 -2
  43. package/plugin/skills/ml-systems/SKILL.md +65 -0
  44. package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
  45. package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
  46. package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
  47. package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
  48. package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
  49. package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
  50. package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
  51. package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
  52. package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
  53. package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
  54. package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
  55. package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
  56. package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
  57. package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
  58. package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
  59. package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
  60. package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
  61. package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
  62. package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
  63. package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
  64. package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
  65. package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
  66. package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
  67. package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
  68. package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
  69. package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
  70. package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
  71. package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
  72. package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
  73. package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
package/plugin/skills/ml-systems/feature-engineering/SKILL.md
@@ -0,0 +1,151 @@
---
name: feature-engineering
description: Feature engineering techniques including feature extraction, transformation, selection, and feature store management for ML systems.
---

# Feature Engineering

Creating informative features for ML models.

## Feature Types

### Numerical Features
```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures

# Scaling
scaler = StandardScaler()        # mean=0, std=1
robust_scaler = RobustScaler()   # median/IQR-based, robust to outliers

# Log transform (for right-skewed data)
df['log_income'] = np.log1p(df['income'])

# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Binning
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 55, 100],
                         labels=['youth', 'young_adult', 'middle', 'senior'])
```

### Categorical Features
```python
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encoding (sparse_output replaces the deprecated sparse arg in sklearn >= 1.2)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = ohe.fit_transform(df[['category']])

# Target encoding with additive smoothing toward the global mean
def target_encode(df, col, target, smoothing=10):
    global_mean = df[target].mean()
    agg = df.groupby(col)[target].agg(['mean', 'count'])
    smooth = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
    return df[col].map(smooth)

# Hash encoding (for high-cardinality categories)
from sklearn.feature_extraction import FeatureHasher
hasher = FeatureHasher(n_features=100, input_type='string')
hashed = hasher.transform([[v] for v in df['category']])
```

### Text Features
```python
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
text_features = tfidf.fit_transform(df['text'])

# Embeddings
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text'].tolist())

# Text statistics
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()
df['avg_word_length'] = df['text'].str.split().apply(lambda x: np.mean([len(w) for w in x]))
```

### Temporal Features
```python
# Datetime components
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Cyclical encoding (so hour 23 sits next to hour 0)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Lag features
df['lag_1'] = df['value'].shift(1)
df['lag_7'] = df['value'].shift(7)
df['rolling_mean_7'] = df['value'].rolling(window=7).mean()
```

## Feature Selection

```python
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Filter method
selector = SelectKBest(mutual_info_classif, k=50)
X_selected = selector.fit_transform(X, y)

# Embedded method (tree importance)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=feature_names)  # feature_names: column names

# Wrapper method: Recursive Feature Elimination
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=20)
X_rfe = rfe.fit_transform(X, y)
```

## Feature Store

```python
# Feast example (assumes the Feast >= 0.26 Field/schema API; older
# releases used Feature(...) and entities=["user_id"] instead)
from datetime import timedelta
from feast import Entity, FeatureStore, FeatureView, Field, FileSource
from feast.types import Float32

user = Entity(name="user_id", join_keys=["user_id"])

# Define feature view
user_features = FeatureView(
    name="user_features",
    entities=[user],
    schema=[
        Field(name="total_purchases", dtype=Float32),
        Field(name="avg_order_value", dtype=Float32),
    ],
    ttl=timedelta(days=1),
    source=FileSource(path="data/user_features.parquet",
                      timestamp_field="event_timestamp")
)

store = FeatureStore(repo_path=".")

# Get point-in-time-correct features for training
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=["user_features:total_purchases"]
).to_df()

# Get features for online inference
online_features = store.get_online_features(
    entity_rows=[{"user_id": 123}],
    features=["user_features:total_purchases"]
)
```

## Commands
- `/omgfeature:extract` - Extract features
- `/omgfeature:select` - Select features
- `/omgfeature:store` - Feature store ops

## Best Practices

1. Start with simple features
2. Use domain knowledge
3. Validate feature distributions
4. Document feature definitions
5. Monitor feature drift (see the PSI sketch below)
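
As a starting point for practice 5, here is a minimal sketch of the Population Stability Index (PSI), a common per-feature drift score; the bin count, the clipping epsilon, and the thresholds in the final comment are illustrative choices, not part of this plugin:

```python
import numpy as np

def psi(expected, actual, bins=10):
    """PSI between a reference (training) sample and a live sample of one feature."""
    edges = np.unique(np.quantile(expected, np.linspace(0, 1, bins + 1)))
    edges[0], edges[-1] = -np.inf, np.inf      # cover values outside the training range
    e = np.histogram(expected, edges)[0] / len(expected)
    a = np.histogram(actual, edges)[0] / len(actual)
    e, a = np.clip(e, 1e-6, None), np.clip(a, 1e-6, None)  # avoid log(0)
    return float(np.sum((a - e) * np.log(a / e)))

# Common rule of thumb: < 0.1 stable, 0.1-0.25 moderate shift, > 0.25 investigate
```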
package/plugin/skills/ml-systems/ml-frameworks/SKILL.md
@@ -0,0 +1,187 @@
---
name: ml-frameworks
description: ML framework best practices for PyTorch, TensorFlow, scikit-learn, and modern ML libraries including training patterns and optimization.
---

# ML Frameworks

Best practices for popular ML frameworks.

## PyTorch

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)
        self.transform = transform

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = self.X[idx]
        if self.transform:
            x = self.transform(x)
        return x, self.y[idx]

# Model Definition
class Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.net(x)

# Training Loop (train_loader: a DataLoader over a CustomDataset)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net(100, 256, 10).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
scaler = torch.cuda.amp.GradScaler()  # mixed precision (torch.amp in newer releases)

for epoch in range(100):
    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast():
            output = model(x)
            loss = F.cross_entropy(output, y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()
```

## TensorFlow/Keras

```python
import tensorflow as tf
from tensorflow import keras

# Model Definition
model = keras.Sequential([
    keras.Input(shape=(100,)),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

model.compile(
    optimizer=keras.optimizers.AdamW(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Callbacks
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5),
    keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True),
    keras.callbacks.TensorBoard(log_dir='./logs')
]

# Training
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks
)
```

## Scikit-learn

```python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

# Preprocessing Pipeline
numeric_features = ['age', 'income']
categorical_features = ['city', 'occupation']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Full Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier())
])

# Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_, grid_search.best_score_)
```

## Hugging Face Transformers

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()
```
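
With `evaluation_strategy="epoch"` the Trainer evaluates every epoch, but it only reports loss unless you supply a metrics function. A minimal sketch (the metric choices here are illustrative, not part of this plugin):

```python
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    # Trainer passes (logits, labels) for each evaluation pass
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds)}

# Pass it in: Trainer(..., compute_metrics=compute_metrics)
```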

## Commands
- `/omgtrain:train` - Train model

## Best Practices

1. Use mixed precision training
2. Implement proper data loading
3. Use learning rate scheduling
4. Enable gradient clipping (see the sketch below)
5. Save checkpoints regularly
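
The PyTorch loop above does not yet show practices 4 and 5. A minimal sketch of both, meant to slot into that loop and assuming the `model`, `optimizer`, and `scaler` defined there (the `max_norm` value and checkpoint path are illustrative):

```python
scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # unscale grads so the clip threshold is meaningful under AMP
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)
scaler.update()

# Periodic checkpoint with everything needed to resume training
torch.save({
    'epoch': epoch,
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scaler': scaler.state_dict(),
}, f"checkpoint_epoch{epoch}.pt")
```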
package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md
@@ -0,0 +1,371 @@
---
name: ml-serving-optimization
description: ML serving optimization techniques including batching, caching, model compilation, and latency reduction for production ML systems.
---

# ML Serving Optimization

Optimizing ML model inference for production.

## Inference Pipeline

```
┌─────────────────────────────────────────────────────────────┐
│                INFERENCE OPTIMIZATION STACK                 │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  REQUEST       PREPROCESSING   INFERENCE     POSTPROCESS    │
│  ────────      ─────────────   ─────────     ───────────    │
│  Batching      Vectorization   Compiled      Response       │
│  Queuing       Caching         Quantized     Caching        │
│  Load balance  Async I/O       Parallelized  Streaming      │
│                                                             │
│  LATENCY BREAKDOWN (typical):                               │
│  ├── Network: 1-5ms                                         │
│  ├── Preprocessing: 2-10ms                                  │
│  ├── Model inference: 5-100ms                               │
│  └── Postprocessing: 1-5ms                                  │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```

## Dynamic Batching

```python
import asyncio
from collections import deque

import torch

class DynamicBatcher:
    def __init__(self, model, max_batch_size=32, max_wait_ms=10):
        self.model = model
        self.max_batch_size = max_batch_size
        self.max_wait_ms = max_wait_ms
        self.queue = deque()
        self.lock = asyncio.Lock()

    async def predict(self, input_data):
        future = asyncio.get_running_loop().create_future()
        async with self.lock:
            self.queue.append((input_data, future))
            if len(self.queue) >= self.max_batch_size:
                await self._process_batch()

        # Wait for batch_loop (or the size-triggered flush above) to fulfil the future
        return await asyncio.wait_for(future, timeout=self.max_wait_ms / 1000 + 1)

    async def _process_batch(self):
        if not self.queue:
            return

        batch_items = []
        while self.queue and len(batch_items) < self.max_batch_size:
            batch_items.append(self.queue.popleft())

        inputs = torch.stack([item[0] for item in batch_items])
        with torch.no_grad():
            outputs = self.model(inputs)

        for i, (_, future) in enumerate(batch_items):
            future.set_result(outputs[i])

    async def batch_loop(self):
        while True:
            await asyncio.sleep(self.max_wait_ms / 1000)
            async with self.lock:
                if self.queue:
                    await self._process_batch()
```
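
One subtlety: nothing drains a partial batch unless `batch_loop` is actually scheduled. A minimal wiring sketch (the input shape is an arbitrary example, and `model` is assumed to accept a stacked batch tensor):

```python
async def main():
    batcher = DynamicBatcher(model, max_batch_size=32, max_wait_ms=10)
    flusher = asyncio.create_task(batcher.batch_loop())  # background flusher
    result = await batcher.predict(torch.randn(3, 224, 224))
    flusher.cancel()

asyncio.run(main())
```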

## Model Compilation

### TorchScript
```python
import torch

# Tracing (records one execution path using example inputs)
traced_model = torch.jit.trace(model, example_input)
traced_model.save("model_traced.pt")

# Scripting (compiles Python directly, preserving data-dependent control flow
# that tracing would freeze)
@torch.jit.script
def postprocess(output: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    if output.max() > threshold:
        return output
    return torch.zeros_like(output)

# Optimize for inference
traced_model = torch.jit.optimize_for_inference(traced_model)
```

### torch.compile (PyTorch 2.0+)
```python
import torch

# Default compilation (TorchInductor backend)
compiled_model = torch.compile(model)

# With options
compiled_model = torch.compile(
    model,
    mode="reduce-overhead",  # or "max-autotune" for longer compiles, faster kernels
    fullgraph=True,          # error out instead of falling back on graph breaks
    dynamic=False            # specialize on static input shapes
)
```

### TensorRT
```python
import torch
import torch_tensorrt

# Compile to TensorRT with a dynamic batch dimension
trt_model = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input(
        min_shape=[1, 3, 224, 224],
        opt_shape=[8, 3, 224, 224],
        max_shape=[32, 3, 224, 224],
        dtype=torch.float16
    )],
    enabled_precisions={torch.float16},
    workspace_size=1 << 30
)

# Save and load
torch.jit.save(trt_model, "model_trt.ts")
loaded = torch.jit.load("model_trt.ts")
```

### ONNX Runtime
```python
import torch
import onnxruntime as ort

# Export to ONNX
torch.onnx.export(
    model, example_input, "model.onnx",
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={'input': {0: 'batch_size'}}
)

# Create optimized session
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.intra_op_num_threads = 4

# GPU execution, with CPU fallback
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession("model.onnx", session_options, providers=providers)

# Inference
outputs = session.run(None, {'input': input_data.numpy()})
```

## Caching Strategies

```python
import hashlib
import pickle

import redis
import torch

class InferenceCache:
    """Exact-match cache: hash the raw input bytes, store pickled outputs in Redis."""
    def __init__(self, redis_client, ttl=3600):
        self.redis = redis_client
        self.ttl = ttl

    def _hash_input(self, input_data):
        return hashlib.sha256(input_data.tobytes()).hexdigest()

    def get(self, input_data):
        key = self._hash_input(input_data)
        cached = self.redis.get(key)
        if cached:
            return pickle.loads(cached)
        return None

    def set(self, input_data, output):
        key = self._hash_input(input_data)
        self.redis.setex(key, self.ttl, pickle.dumps(output))

# Semantic cache: reuse outputs for inputs whose embeddings are near-duplicates
class EmbeddingCache:
    def __init__(self, threshold=0.95):
        self.embeddings = []
        self.outputs = []
        self.threshold = threshold

    def find_similar(self, embedding):
        if not self.embeddings:
            return None

        similarities = torch.cosine_similarity(
            embedding.unsqueeze(0),
            torch.stack(self.embeddings)
        )
        max_sim, idx = similarities.max(0)

        if max_sim > self.threshold:
            return self.outputs[idx.item()]  # .item(): a tensor cannot index a list
        return None

    def add(self, embedding, output):
        self.embeddings.append(embedding)
        self.outputs.append(output)
```
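
A cache-aside wiring sketch for the exact-match cache (assumes a Redis server on localhost and a `model` that takes a single NumPy-backed input; both are illustrative):

```python
cache = InferenceCache(redis.Redis(host="localhost", port=6379), ttl=3600)

def cached_predict(input_array):
    hit = cache.get(input_array)
    if hit is not None:
        return hit                      # served from Redis
    with torch.no_grad():
        out = model(torch.from_numpy(input_array)).numpy()
    cache.set(input_array, out)         # populate for next time
    return out
```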

## Async Inference

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

import torch

class AsyncInferenceService:
    """Run blocking model calls in a thread pool so the event loop stays responsive."""
    def __init__(self, model, num_workers=4):
        self.model = model
        self.executor = ThreadPoolExecutor(max_workers=num_workers)

    def _sync_predict(self, input_data):
        with torch.no_grad():
            return self.model(input_data)

    async def predict(self, input_data):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            self._sync_predict,
            input_data
        )

    async def predict_batch(self, inputs):
        tasks = [self.predict(inp) for inp in inputs]
        return await asyncio.gather(*tasks)

# CUDA streams for parallel inference on one GPU
class StreamedInference:
    def __init__(self, model, num_streams=4):
        self.model = model
        self.streams = [torch.cuda.Stream() for _ in range(num_streams)]

    def predict_parallel(self, inputs):
        outputs = []
        for i, inp in enumerate(inputs):
            stream = self.streams[i % len(self.streams)]
            with torch.cuda.stream(stream):
                outputs.append(self.model(inp))

        torch.cuda.synchronize()  # wait for all streams before reading results
        return outputs
```

## Latency Profiling

```python
import time
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class LatencyStats:
    mean: float
    p50: float
    p95: float
    p99: float
    max: float

class LatencyProfiler:
    def __init__(self):
        self.timings: Dict[str, List[float]] = {}

    @contextmanager
    def measure(self, name: str):
        start = time.perf_counter()
        yield
        elapsed = (time.perf_counter() - start) * 1000  # ms
        if name not in self.timings:
            self.timings[name] = []
        self.timings[name].append(elapsed)

    def stats(self, name: str) -> LatencyStats:
        times = sorted(self.timings[name])
        n = len(times)
        return LatencyStats(
            mean=sum(times) / n,
            p50=times[n // 2],
            p95=times[int(n * 0.95)],
            p99=times[int(n * 0.99)],
            max=times[-1]
        )

    def report(self):
        for name in self.timings:
            s = self.stats(name)
            print(f"{name}: mean={s.mean:.2f}ms p95={s.p95:.2f}ms p99={s.p99:.2f}ms")

# Usage
profiler = LatencyProfiler()

with profiler.measure("preprocess"):
    preprocessed = preprocess(data)

with profiler.measure("inference"):
    output = model(preprocessed)

with profiler.measure("postprocess"):
    result = postprocess(output)

profiler.report()
```

## KV Cache for Transformers

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class KVCacheAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x, past_kv=None):
        B, T, C = x.shape
        qkv = self.qkv(x).reshape(B, T, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(2)                           # each (B, T, H, D)
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # (B, H, T, D)

        if past_kv is not None:
            past_k, past_v = past_kv
            k = torch.cat([past_k, k], dim=2)  # append along the sequence axis
            v = torch.cat([past_v, v], dim=2)

        # Attention computation
        attn = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn = F.softmax(attn, dim=-1)
        out = torch.matmul(attn, v)                       # (B, H, T, D)
        out = out.transpose(1, 2).reshape(B, T, C)

        return self.out(out), (k, v)
```
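
The cache pays off during autoregressive decoding: after the prompt pass, each step feeds only the newest token and reuses the cached keys/values. A greedy-decoding sketch, assuming a hypothetical `model` that maps token ids to logits through layers like `KVCacheAttention` and returns the updated cache:

```python
@torch.no_grad()
def generate(model, tokens, steps):
    past_kv = None
    x = tokens                                        # (B, T) prompt ids
    for _ in range(steps):
        logits, past_kv = model(x, past_kv=past_kv)   # cache grows by len(x)
        next_tok = logits[:, -1].argmax(-1, keepdim=True)
        tokens = torch.cat([tokens, next_tok], dim=1)
        x = next_tok                                  # from now on: one token per step
    return tokens
```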

## Commands
- `/omgoptim:profile` - Profile latency
- `/omgdeploy:serve` - Optimized serving
- `/omgoptim:quantize` - Model quantization

## Best Practices

1. Measure before optimizing
2. Use dynamic batching for throughput
3. Compile models for production
4. Cache repeated computations
5. Profile end-to-end latency