omgkit 2.13.0 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -10
- package/package.json +2 -2
- package/plugin/agents/api-designer.md +5 -0
- package/plugin/agents/architect.md +8 -0
- package/plugin/agents/brainstormer.md +4 -0
- package/plugin/agents/cicd-manager.md +6 -0
- package/plugin/agents/code-reviewer.md +6 -0
- package/plugin/agents/copywriter.md +2 -0
- package/plugin/agents/data-engineer.md +255 -0
- package/plugin/agents/database-admin.md +10 -0
- package/plugin/agents/debugger.md +10 -0
- package/plugin/agents/devsecops.md +314 -0
- package/plugin/agents/docs-manager.md +4 -0
- package/plugin/agents/domain-decomposer.md +181 -0
- package/plugin/agents/embedded-systems.md +397 -0
- package/plugin/agents/fullstack-developer.md +12 -0
- package/plugin/agents/game-systems-designer.md +375 -0
- package/plugin/agents/git-manager.md +10 -0
- package/plugin/agents/journal-writer.md +2 -0
- package/plugin/agents/ml-engineer.md +284 -0
- package/plugin/agents/observability-engineer.md +353 -0
- package/plugin/agents/oracle.md +9 -0
- package/plugin/agents/performance-engineer.md +290 -0
- package/plugin/agents/pipeline-architect.md +6 -0
- package/plugin/agents/planner.md +12 -0
- package/plugin/agents/platform-engineer.md +325 -0
- package/plugin/agents/project-manager.md +3 -0
- package/plugin/agents/researcher.md +5 -0
- package/plugin/agents/scientific-computing.md +426 -0
- package/plugin/agents/scout.md +3 -0
- package/plugin/agents/security-auditor.md +7 -0
- package/plugin/agents/sprint-master.md +17 -0
- package/plugin/agents/tester.md +10 -0
- package/plugin/agents/ui-ux-designer.md +12 -0
- package/plugin/agents/vulnerability-scanner.md +6 -0
- package/plugin/commands/data/pipeline.md +47 -0
- package/plugin/commands/data/quality.md +49 -0
- package/plugin/commands/domain/analyze.md +34 -0
- package/plugin/commands/domain/map.md +41 -0
- package/plugin/commands/game/balance.md +56 -0
- package/plugin/commands/game/optimize.md +62 -0
- package/plugin/commands/iot/provision.md +58 -0
- package/plugin/commands/ml/evaluate.md +47 -0
- package/plugin/commands/ml/train.md +48 -0
- package/plugin/commands/perf/benchmark.md +54 -0
- package/plugin/commands/perf/profile.md +49 -0
- package/plugin/commands/platform/blueprint.md +56 -0
- package/plugin/commands/security/audit.md +54 -0
- package/plugin/commands/security/scan.md +55 -0
- package/plugin/commands/sre/dashboard.md +53 -0
- package/plugin/registry.yaml +711 -0
- package/plugin/skills/ai-ml/experiment-tracking/SKILL.md +338 -0
- package/plugin/skills/ai-ml/feature-stores/SKILL.md +340 -0
- package/plugin/skills/ai-ml/llm-ops/SKILL.md +454 -0
- package/plugin/skills/ai-ml/ml-pipelines/SKILL.md +390 -0
- package/plugin/skills/ai-ml/model-monitoring/SKILL.md +398 -0
- package/plugin/skills/ai-ml/model-serving/SKILL.md +386 -0
- package/plugin/skills/event-driven/cqrs-patterns/SKILL.md +348 -0
- package/plugin/skills/event-driven/event-sourcing/SKILL.md +334 -0
- package/plugin/skills/event-driven/kafka-deep/SKILL.md +252 -0
- package/plugin/skills/event-driven/saga-orchestration/SKILL.md +335 -0
- package/plugin/skills/event-driven/schema-registry/SKILL.md +328 -0
- package/plugin/skills/event-driven/stream-processing/SKILL.md +313 -0
- package/plugin/skills/game/game-audio/SKILL.md +446 -0
- package/plugin/skills/game/game-networking/SKILL.md +490 -0
- package/plugin/skills/game/godot-patterns/SKILL.md +413 -0
- package/plugin/skills/game/shader-programming/SKILL.md +492 -0
- package/plugin/skills/game/unity-patterns/SKILL.md +488 -0
- package/plugin/skills/iot/device-provisioning/SKILL.md +405 -0
- package/plugin/skills/iot/edge-computing/SKILL.md +369 -0
- package/plugin/skills/iot/industrial-protocols/SKILL.md +438 -0
- package/plugin/skills/iot/mqtt-deep/SKILL.md +418 -0
- package/plugin/skills/iot/ota-updates/SKILL.md +426 -0
- package/plugin/skills/microservices/api-gateway-patterns/SKILL.md +201 -0
- package/plugin/skills/microservices/circuit-breaker-patterns/SKILL.md +246 -0
- package/plugin/skills/microservices/contract-testing/SKILL.md +284 -0
- package/plugin/skills/microservices/distributed-tracing/SKILL.md +246 -0
- package/plugin/skills/microservices/service-discovery/SKILL.md +304 -0
- package/plugin/skills/microservices/service-mesh/SKILL.md +181 -0
- package/plugin/skills/mobile-advanced/mobile-ci-cd/SKILL.md +407 -0
- package/plugin/skills/mobile-advanced/mobile-security/SKILL.md +403 -0
- package/plugin/skills/mobile-advanced/offline-first/SKILL.md +473 -0
- package/plugin/skills/mobile-advanced/push-notifications/SKILL.md +494 -0
- package/plugin/skills/mobile-advanced/react-native-deep/SKILL.md +374 -0
- package/plugin/skills/simulation/numerical-methods/SKILL.md +434 -0
- package/plugin/skills/simulation/parallel-computing/SKILL.md +382 -0
- package/plugin/skills/simulation/physics-engines/SKILL.md +377 -0
- package/plugin/skills/simulation/validation-verification/SKILL.md +479 -0
- package/plugin/skills/simulation/visualization-scientific/SKILL.md +365 -0
- package/plugin/workflows/ai-engineering/agent-development.md +3 -3
- package/plugin/workflows/ai-engineering/fine-tuning.md +3 -3
- package/plugin/workflows/ai-engineering/model-evaluation.md +3 -3
- package/plugin/workflows/ai-engineering/prompt-engineering.md +2 -2
- package/plugin/workflows/ai-engineering/rag-development.md +4 -4
- package/plugin/workflows/ai-ml/data-pipeline.md +188 -0
- package/plugin/workflows/ai-ml/experiment-cycle.md +203 -0
- package/plugin/workflows/ai-ml/feature-engineering.md +208 -0
- package/plugin/workflows/ai-ml/model-deployment.md +199 -0
- package/plugin/workflows/ai-ml/monitoring-setup.md +227 -0
- package/plugin/workflows/api/api-design.md +1 -1
- package/plugin/workflows/api/api-testing.md +2 -2
- package/plugin/workflows/content/technical-docs.md +1 -1
- package/plugin/workflows/database/migration.md +1 -1
- package/plugin/workflows/database/optimization.md +1 -1
- package/plugin/workflows/database/schema-design.md +3 -3
- package/plugin/workflows/development/bug-fix.md +3 -3
- package/plugin/workflows/development/code-review.md +2 -1
- package/plugin/workflows/development/feature.md +3 -3
- package/plugin/workflows/development/refactor.md +2 -2
- package/plugin/workflows/event-driven/consumer-groups.md +190 -0
- package/plugin/workflows/event-driven/event-storming.md +172 -0
- package/plugin/workflows/event-driven/replay-testing.md +186 -0
- package/plugin/workflows/event-driven/saga-implementation.md +206 -0
- package/plugin/workflows/event-driven/schema-evolution.md +173 -0
- package/plugin/workflows/fullstack/authentication.md +4 -4
- package/plugin/workflows/fullstack/full-feature.md +4 -4
- package/plugin/workflows/game-dev/content-pipeline.md +218 -0
- package/plugin/workflows/game-dev/platform-submission.md +263 -0
- package/plugin/workflows/game-dev/playtesting.md +237 -0
- package/plugin/workflows/game-dev/prototype-to-production.md +205 -0
- package/plugin/workflows/microservices/contract-first.md +151 -0
- package/plugin/workflows/microservices/distributed-tracing.md +166 -0
- package/plugin/workflows/microservices/domain-decomposition.md +123 -0
- package/plugin/workflows/microservices/integration-testing.md +149 -0
- package/plugin/workflows/microservices/service-mesh-setup.md +153 -0
- package/plugin/workflows/microservices/service-scaffolding.md +151 -0
- package/plugin/workflows/omega/1000x-innovation.md +2 -2
- package/plugin/workflows/omega/100x-architecture.md +2 -2
- package/plugin/workflows/omega/10x-improvement.md +2 -2
- package/plugin/workflows/quality/performance-optimization.md +2 -2
- package/plugin/workflows/research/best-practices.md +1 -1
- package/plugin/workflows/research/technology-research.md +1 -1
- package/plugin/workflows/security/penetration-testing.md +3 -3
- package/plugin/workflows/security/security-audit.md +3 -3
- package/plugin/workflows/sprint/sprint-execution.md +2 -2
- package/plugin/workflows/sprint/sprint-retrospective.md +1 -1
- package/plugin/workflows/sprint/sprint-setup.md +1 -1

package/plugin/skills/ai-ml/experiment-tracking/SKILL.md
@@ -0,0 +1,338 @@

# Experiment Tracking

MLflow, Weights & Biases, experiment versioning, hyperparameter tracking, and model registry integration.

## Overview

Experiment tracking provides systematic recording and comparison of ML experiments, including parameters, metrics, artifacts, and model versions.

## Core Concepts

### What to Track
- **Parameters**: Hyperparameters, configurations
- **Metrics**: Loss, accuracy, custom metrics
- **Artifacts**: Models, plots, datasets
- **Code**: Git commit, environment
- **Metadata**: Tags, notes, lineage

### Tracking Hierarchy
```
Project
└── Experiment (logical grouping)
    └── Run (single training execution)
        ├── Parameters
        ├── Metrics
        ├── Artifacts
        └── Tags
```

## MLflow Implementation

### Basic Tracking
```python
import mlflow

# Set tracking URI
mlflow.set_tracking_uri("http://mlflow-server:5000")

# Set experiment
mlflow.set_experiment("recommendation-model")

# Start run
with mlflow.start_run(run_name="baseline-v1") as run:
    # Log parameters
    mlflow.log_params({
        "learning_rate": 0.001,
        "batch_size": 32,
        "epochs": 100,
        "optimizer": "adam",
        "model_type": "transformer"
    })

    # Training loop
    for epoch in range(100):
        train_loss, val_loss = train_epoch(model, data)

        # Log metrics
        mlflow.log_metrics({
            "train_loss": train_loss,
            "val_loss": val_loss,
            "epoch": epoch
        }, step=epoch)

    # Log model
    mlflow.pytorch.log_model(model, "model")

    # Log artifacts
    mlflow.log_artifact("confusion_matrix.png")
    mlflow.log_artifact("feature_importance.json")

    # Set tags
    mlflow.set_tags({
        "team": "ml-platform",
        "version": "1.0.0",
        "stage": "development"
    })
```

### Model Registry
```python
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Register model from the run above
model_uri = f"runs:/{run.info.run_id}/model"
model_version = mlflow.register_model(model_uri, "RecommendationModel")

# Transition to staging
client.transition_model_version_stage(
    name="RecommendationModel",
    version=model_version.version,
    stage="Staging"
)

# Add description
client.update_model_version(
    name="RecommendationModel",
    version=model_version.version,
    description="Baseline transformer model with 85% accuracy"
)

# Load model by stage
model = mlflow.pyfunc.load_model(
    model_uri="models:/RecommendationModel/Production"
)
```

### Autologging
```python
# Enable autologging for your framework (pick the one you use)
mlflow.pytorch.autolog()
mlflow.sklearn.autolog()
mlflow.tensorflow.autolog()

# All parameters, metrics, and models logged automatically
with mlflow.start_run():
    model.fit(X_train, y_train)
    # Automatically logs: parameters, metrics, model, feature importance
```

## Weights & Biases

### Basic Tracking
```python
import wandb

# Initialize
wandb.init(
    project="recommendation-model",
    name="baseline-v1",
    config={
        "learning_rate": 0.001,
        "batch_size": 32,
        "epochs": 100,
        "architecture": "transformer"
    }
)

# Training loop
for epoch in range(100):
    train_loss, val_loss = train_epoch(model, data)

    # Log metrics
    wandb.log({
        "train_loss": train_loss,
        "val_loss": val_loss,
        "epoch": epoch
    })

# Log artifacts
wandb.log({"confusion_matrix": wandb.Image("confusion_matrix.png")})
model_artifact = wandb.Artifact("model", type="model")
model_artifact.add_file("model.pt")  # path to the serialized model (illustrative)
wandb.log_artifact(model_artifact)

# Finish
wandb.finish()
```

### Hyperparameter Sweeps
```python
# Define sweep configuration
sweep_config = {
    "method": "bayes",
    "metric": {"name": "val_loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"min": 0.0001, "max": 0.1, "distribution": "log_uniform_values"},
        "batch_size": {"values": [16, 32, 64, 128]},
        "hidden_size": {"min": 64, "max": 512, "distribution": "int_uniform"},
        "dropout": {"min": 0.1, "max": 0.5}
    }
}

# Create sweep
sweep_id = wandb.sweep(sweep_config, project="recommendation-model")

# Define training function
def train():
    wandb.init()
    config = wandb.config

    model = create_model(
        hidden_size=config.hidden_size,
        dropout=config.dropout
    )

    optimizer = Adam(model.parameters(), lr=config.learning_rate)

    for epoch in range(100):
        loss = train_epoch(model, config.batch_size)
        wandb.log({"val_loss": loss})

# Run sweep
wandb.agent(sweep_id, train, count=50)
```

### Tables and Visualizations
```python
# Create table for predictions
columns = ["image", "prediction", "ground_truth", "confidence"]
table = wandb.Table(columns=columns)

for img, pred, gt, conf in predictions:
    table.add_data(wandb.Image(img), pred, gt, conf)

wandb.log({"predictions": table})

# Custom charts
wandb.log({
    "pr_curve": wandb.plot.pr_curve(y_true, y_scores),
    "roc_curve": wandb.plot.roc_curve(y_true, y_scores),
    "confusion_matrix": wandb.plot.confusion_matrix(
        y_true=y_true,
        preds=y_pred,
        class_names=class_names
    )
})
```

## Experiment Comparison

### MLflow Query API
```python
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Search runs
runs = client.search_runs(
    experiment_ids=["1"],
    filter_string="metrics.val_accuracy > 0.8 AND params.model_type = 'transformer'",
    order_by=["metrics.val_accuracy DESC"],
    max_results=10
)

# Compare runs
for run in runs:
    print(f"Run: {run.info.run_id}")
    print(f"  Accuracy: {run.data.metrics['val_accuracy']}")
    print(f"  Learning Rate: {run.data.params['learning_rate']}")
```

### Reproducibility
```python
import random
import sys

import mlflow
import numpy as np
import torch

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

with mlflow.start_run():
    seed = 42
    set_seed(seed)

    # Log environment
    mlflow.log_params({
        "seed": seed,
        "python_version": sys.version,
        "torch_version": torch.__version__,
        "cuda_version": torch.version.cuda
    })

    # Log git commit (get_git_commit is a project helper, e.g. `git rev-parse HEAD`)
    mlflow.set_tag("git_commit", get_git_commit())

    # Log requirements
    mlflow.log_artifact("requirements.txt")
```

## Best Practices

1. **Consistent Naming**: Use clear experiment/run names (see the sketch below)
2. **Version Control**: Link to git commits
3. **Environment Logging**: Capture dependencies
4. **Artifact Organization**: Structure artifacts logically
5. **Tagging Strategy**: Use tags for filtering
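
A minimal sketch of how the naming and tagging practices might be enforced together: one helper that derives the run name and applies a standard tag set. The `start_tracked_run` helper and the specific tag keys are project conventions, not MLflow API:

```python
import mlflow

# Illustrative convention: "<model_type>-v<version>" run names plus a fixed
# tag set, so runs stay filterable by team and stage across the project.
def start_tracked_run(model_type: str, version: str, team: str, stage: str):
    run = mlflow.start_run(run_name=f"{model_type}-v{version}")
    mlflow.set_tags({"team": team, "stage": stage, "model_type": model_type})
    return run

with start_tracked_run("transformer", "1.2.0", "ml-platform", "development"):
    mlflow.log_param("learning_rate", 0.001)
```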

## CI/CD Integration

### GitHub Actions
```yaml
name: ML Experiment
on:
  push:
    paths:
      - 'models/**'
      - 'experiments/**'

jobs:
  train:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Train model
        env:
          MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_URI }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        run: |
          pip install -r requirements.txt
          python train.py --experiment-name "ci-${GITHUB_SHA}"
```

## Anti-Patterns

- Not logging all hyperparameters
- Missing environment information
- Inconsistent metric names
- Not tagging experiments properly
- Ignoring failed runs (see the query sketch below)
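
Failed runs are data too: they mark bad hyperparameter regions and flaky infrastructure. A minimal sketch of surfacing them with the MLflow client, assuming MLflow's documented `attributes.*` filter syntax (the experiment ID is illustrative):

```python
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Review recent failed runs instead of silently ignoring them
failed_runs = client.search_runs(
    experiment_ids=["1"],                          # illustrative experiment ID
    filter_string="attributes.status = 'FAILED'",
    order_by=["attributes.start_time DESC"],
    max_results=20,
)
for run in failed_runs:
    print(run.info.run_id, run.data.params)
```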

## When to Use

- Model development and iteration
- Hyperparameter optimization
- Team collaboration on ML
- Reproducibility requirements
- Model governance

## When NOT to Use

- Simple one-off scripts
- No iteration expected
- Strict air-gapped environments

package/plugin/skills/ai-ml/feature-stores/SKILL.md
@@ -0,0 +1,340 @@

# Feature Stores

Feast, Tecton, feature engineering pipelines, online/offline feature serving, and feature versioning.

## Overview

Feature stores provide centralized management of ML features, enabling consistent feature computation, storage, and serving across training and inference.

## Core Concepts

### Feature Store Components
- **Feature Registry**: Metadata and definitions
- **Offline Store**: Historical features for training
- **Online Store**: Low-latency features for inference
- **Feature Transformation**: Computation pipelines
- **Feature Serving**: API for retrieving features

### Feature Types
```
Point-in-Time Features:
- User age at transaction time
- Account balance when order placed

Aggregated Features:
- 7-day rolling average spend
- Count of logins last 30 days

Real-time Features:
- Current session duration
- Items in cart right now
```
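
Point-in-time features are the subtle case: each training row must only see feature values as of that row's timestamp. A minimal pandas sketch of the idea (column names are illustrative); `merge_asof` picks, for each label row, the latest feature row at or before the event:

```python
import pandas as pd

# Label events: what we want to predict, stamped with when they happened
labels = pd.DataFrame({
    "customer_id": ["C001", "C001"],
    "event_timestamp": pd.to_datetime(["2024-01-10", "2024-01-20"]),
    "churned": [0, 1],
}).sort_values("event_timestamp")

# Feature snapshots: values as they were known over time
features = pd.DataFrame({
    "customer_id": ["C001", "C001"],
    "event_timestamp": pd.to_datetime(["2024-01-05", "2024-01-15"]),
    "total_transactions": [10, 14],
}).sort_values("event_timestamp")

# For each label row, join the latest feature row at or before the event —
# never a later one, which would leak future information into training
training_df = pd.merge_asof(
    labels, features,
    on="event_timestamp", by="customer_id", direction="backward",
)
```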

## Feast Implementation

### Feature Definitions
```python
from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource, ValueType
from feast.types import Float32, Int64, String

# Define entity
customer = Entity(
    name="customer_id",
    value_type=ValueType.STRING,
    description="Unique customer identifier"
)

# Define data source
customer_stats_source = FileSource(
    path="s3://bucket/customer_stats.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp"
)

# Define feature view (newer Feast API: Field in schema, Entity objects in entities)
customer_stats_fv = FeatureView(
    name="customer_stats",
    entities=[customer],
    ttl=timedelta(days=1),
    schema=[
        Field(name="total_transactions", dtype=Int64),
        Field(name="avg_transaction_amount", dtype=Float32),
        Field(name="days_since_last_purchase", dtype=Int64),
        Field(name="customer_segment", dtype=String),
    ],
    online=True,
    source=customer_stats_source,
    tags={"team": "ml-platform", "owner": "data-team"}
)
```

### Feature Registration
```python
from datetime import datetime

from feast import FeatureStore

# Initialize store
store = FeatureStore(repo_path="feature_repo/")

# Apply definitions
store.apply([customer, customer_stats_fv])

# Materialize to online store
store.materialize_incremental(end_date=datetime.now())
```

### Training Data Retrieval
```python
from datetime import datetime

from feast import FeatureStore
import pandas as pd

store = FeatureStore(repo_path="feature_repo/")

# Entity dataframe with timestamps
entity_df = pd.DataFrame({
    "customer_id": ["C001", "C002", "C003"],
    "event_timestamp": [
        datetime(2024, 1, 15),
        datetime(2024, 1, 16),
        datetime(2024, 1, 17)
    ]
})

# Get historical features (point-in-time correct)
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "customer_stats:total_transactions",
        "customer_stats:avg_transaction_amount",
        "customer_stats:days_since_last_purchase",
        "customer_stats:customer_segment"
    ]
).to_df()
```

### Online Feature Retrieval
```python
# Get features for inference
feature_vector = store.get_online_features(
    features=[
        "customer_stats:total_transactions",
        "customer_stats:avg_transaction_amount",
        "customer_stats:customer_segment"
    ],
    entity_rows=[
        {"customer_id": "C001"},
        {"customer_id": "C002"}
    ]
).to_dict()
```

## On-Demand Features

### Real-time Transformations
```python
from feast import on_demand_feature_view, Field
from feast.types import Float32, Int64
import pandas as pd

@on_demand_feature_view(
    sources=[customer_stats_fv],
    schema=[
        Field(name="transaction_velocity", dtype=Float32),
        Field(name="is_high_value", dtype=Int64)
    ]
)
def customer_derived_features(inputs: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame()
    df["transaction_velocity"] = (
        inputs["total_transactions"] /
        (inputs["days_since_last_purchase"] + 1)
    )
    df["is_high_value"] = (inputs["avg_transaction_amount"] > 100).astype(int)
    return df
```

## Stream Features

### Kafka Integration
```python
from datetime import timedelta

from feast import FeatureView, Field, KafkaSource
from feast.data_format import JsonFormat
from feast.types import Int64

# Kafka source for real-time features
click_stream_source = KafkaSource(
    name="click_stream",
    kafka_bootstrap_servers="kafka:9092",
    topic="user_clicks",
    timestamp_field="event_timestamp",
    message_format=JsonFormat(
        schema_json="""
        {
            "type": "record",
            "name": "click",
            "fields": [
                {"name": "user_id", "type": "string"},
                {"name": "page_id", "type": "string"},
                {"name": "event_timestamp", "type": "long"}
            ]
        }
        """
    )
)

# Stream feature view
click_features = FeatureView(
    name="user_click_features",
    entities=[user],  # a `user` Entity, defined like `customer` above
    ttl=timedelta(minutes=30),
    schema=[
        Field(name="click_count_30m", dtype=Int64),
        Field(name="unique_pages_30m", dtype=Int64)
    ],
    source=click_stream_source,
    online=True
)
```

## Feature Engineering Pipelines

### Batch Pipeline (Spark)
```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

def compute_customer_features(spark: SparkSession, date: str):
    transactions = spark.read.parquet(f"s3://data/transactions/date={date}")

    # Aggregate per customer over the day's partition
    features = transactions.groupBy("customer_id").agg(
        F.count("*").alias("total_transactions"),
        F.avg("amount").alias("avg_transaction_amount"),
        F.sum("amount").alias("total_spend"),
        F.max("timestamp").alias("last_transaction"),
        F.countDistinct("merchant_id").alias("unique_merchants")
    )

    # Compute derived features
    features = features.withColumn(
        "days_since_last_purchase",
        F.datediff(F.current_date(), F.col("last_transaction"))
    )

    # Write to feature store
    features.write.mode("overwrite").parquet(
        f"s3://feature-store/customer_stats/date={date}"
    )

    return features
```

### Streaming Pipeline (Flink)
```python
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

def create_streaming_features():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(env)

    # Read from Kafka
    t_env.execute_sql("""
        CREATE TABLE clicks (
            user_id STRING,
            page_id STRING,
            event_time TIMESTAMP(3),
            WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'user_clicks',
            'properties.bootstrap.servers' = 'kafka:9092',
            'format' = 'json'
        )
    """)

    # Windowed aggregation
    t_env.execute_sql("""
        CREATE TABLE click_features (
            user_id STRING,
            window_start TIMESTAMP(3),
            click_count BIGINT,
            unique_pages BIGINT,
            PRIMARY KEY (user_id) NOT ENFORCED
        ) WITH (
            'connector' = 'upsert-kafka',
            'topic' = 'click_features',
            'properties.bootstrap.servers' = 'kafka:9092',
            'key.format' = 'json',
            'value.format' = 'json'
        )
    """)

    t_env.execute_sql("""
        INSERT INTO click_features
        SELECT
            user_id,
            TUMBLE_START(event_time, INTERVAL '30' MINUTE) as window_start,
            COUNT(*) as click_count,
            COUNT(DISTINCT page_id) as unique_pages
        FROM clicks
        GROUP BY user_id, TUMBLE(event_time, INTERVAL '30' MINUTE)
    """)
```

## Best Practices

1. **Point-in-Time Correctness**: Prevent data leakage
2. **Feature Documentation**: Clear descriptions
3. **Version Control**: Track feature definitions
4. **Monitoring**: Feature freshness and quality
5. **Naming Conventions**: Consistent feature names (see the sketch below)
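
Most of these practices reduce to one habit: train and serve through the same feature definitions. A minimal sketch of the pattern using the Feast store from earlier (the function names and feature list are illustrative):

```python
import pandas as pd
from feast import FeatureStore

store = FeatureStore(repo_path="feature_repo/")

# One shared feature list: the single source of truth for both paths
CUSTOMER_FEATURES = [
    "customer_stats:total_transactions",
    "customer_stats:avg_transaction_amount",
]

def training_features(entity_df: pd.DataFrame) -> pd.DataFrame:
    # Offline path: point-in-time joins against historical data
    return store.get_historical_features(
        entity_df=entity_df, features=CUSTOMER_FEATURES
    ).to_df()

def serving_features(customer_id: str) -> dict:
    # Online path: same definitions, served from the low-latency store
    return store.get_online_features(
        features=CUSTOMER_FEATURES,
        entity_rows=[{"customer_id": customer_id}],
    ).to_dict()
```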

## Feature Quality Monitoring

```python
import great_expectations as gx

def validate_features(feature_df):
    # Ephemeral context; use a persistent DataContext in production
    context = gx.get_context()

    # Wrap the feature dataframe in a validator
    validator = context.sources.pandas_default.read_dataframe(feature_df)

    # Add expectations
    validator.expect_column_values_to_not_be_null("customer_id")
    validator.expect_column_values_to_be_between("avg_transaction_amount", 0, 10000)
    validator.expect_column_values_to_be_in_set(
        "customer_segment", ["bronze", "silver", "gold"]
    )

    results = validator.validate()
    return results.success
```

## Anti-Patterns

- Training/serving skew
- Missing point-in-time correctness
- Feature definitions in notebooks only
- No feature documentation
- Ignoring feature freshness (see the check below)
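
A stale feature degrades a model long before anything errors out. A minimal freshness check, assuming feature rows carry an `event_timestamp` column in naive UTC (the one-day budget is illustrative):

```python
from datetime import datetime, timedelta

import pandas as pd

def check_freshness(feature_df: pd.DataFrame,
                    max_age: timedelta = timedelta(days=1)) -> bool:
    # Newest feature row vs. now: fail if the gap exceeds the freshness budget
    latest = pd.to_datetime(feature_df["event_timestamp"]).max()
    return (datetime.utcnow() - latest) <= max_age
```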

## When to Use

- Multiple models share features
- Need training/serving consistency
- Feature computation is expensive
- Team collaboration on features
- Regulatory requirements

## When NOT to Use

- Single simple model
- Features change frequently
- No reuse expected
- Small team, simple pipeline