agentic-team-templates 0.3.0
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
package/templates/ml-ai/.cursorrules/deployment.md
@@ -0,0 +1,601 @@
# Model Deployment

Guidelines for deploying machine learning models to production, including serving patterns, scaling strategies, and infrastructure configuration.

## Deployment Patterns

### Real-Time Inference

For low-latency, synchronous predictions:

```yaml
# kserve/inference-service.yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: fraud-detector
  annotations:
    serving.kserve.io/deploymentMode: Serverless
spec:
  predictor:
    model:
      modelFormat:
        name: mlflow
      storageUri: s3://models/fraud-detector/v1
      resources:
        limits:
          cpu: "2"
          memory: 4Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "1"
          memory: 2Gi
    minReplicas: 2
    maxReplicas: 10
    scaleTarget: 100
    scaleMetric: concurrency
```
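
Once applied, the service can be smoke-tested over KServe's V1 inference protocol. Host resolution depends on the ingress setup, so the URL lookup and feature names below are placeholders:

```bash
# Resolve the external URL reported by KServe (assumes kubectl access)
SERVICE_URL=$(kubectl get inferenceservice fraud-detector -o jsonpath='{.status.url}')

# V1 predict call with a placeholder feature payload
curl -s -H "Content-Type: application/json" \
  "${SERVICE_URL}/v1/models/fraud-detector:predict" \
  -d '{"instances": [{"amount": 120.5, "merchant_risk": 0.3}]}'
```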

### Batch Inference

For high-throughput, asynchronous predictions:

```python
from datetime import datetime, timedelta

import mlflow
import pandas as pd
import pyarrow.dataset as ds
from prefect import flow, task
from prefect.tasks import task_input_hash

@task(
    cache_key_fn=task_input_hash,
    cache_expiration=timedelta(hours=1),
    retries=3,
    retry_delay_seconds=60,
)
def run_batch_inference(
    data_path: str,
    model_uri: str,
    output_path: str,
    batch_size: int = 10000,
) -> str:
    """Run batch inference on a dataset."""

    model = mlflow.pyfunc.load_model(model_uri)

    # Process in record batches to manage memory
    # (pandas.read_parquet has no chunksize argument, so stream via pyarrow)
    dataset = ds.dataset(data_path, format="parquet")

    results = []
    for batch in dataset.to_batches(batch_size=batch_size):
        chunk = batch.to_pandas()
        predictions = model.predict(chunk)
        chunk["prediction"] = predictions
        chunk["model_version"] = model_uri.split("/")[-1]
        chunk["inference_timestamp"] = datetime.utcnow()
        results.append(chunk)

    # Write results
    output_df = pd.concat(results)
    output_df.to_parquet(output_path, index=False)

    return output_path

@flow(name="daily-batch-inference")
def batch_inference_pipeline(date: str):
    """Daily batch inference pipeline."""
    data_path = f"s3://data/features/{date}/"
    model_uri = "models:/fraud-detector/Production"
    output_path = f"s3://predictions/{date}/predictions.parquet"

    run_batch_inference(data_path, model_uri, output_path)
```
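
To run this flow daily, one option (a sketch assuming Prefect 2.x, where `Flow.serve` accepts a cron schedule) is:

```python
if __name__ == "__main__":
    # Create and poll a scheduled deployment (Prefect 2.x+). In practice the
    # flow would default `date` to "yesterday" rather than a fixed value.
    batch_inference_pipeline.serve(
        name="daily-batch-inference",
        cron="0 6 * * *",
        parameters={"date": "2024-01-01"},  # placeholder; see comment above
    )
```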

### Streaming Inference

For real-time event processing:

```python
import json
import logging
from datetime import datetime

import mlflow
import pandas as pd
from kafka import KafkaConsumer, KafkaProducer

logger = logging.getLogger(__name__)

class StreamingPredictor:
    """Process predictions from a Kafka stream."""

    def __init__(self, model_uri: str, input_topic: str, output_topic: str):
        self.model = mlflow.pyfunc.load_model(model_uri)
        self.model_version = model_uri.split("/")[-1]
        self.consumer = KafkaConsumer(
            input_topic,
            bootstrap_servers=["kafka:9092"],
            value_deserializer=lambda m: json.loads(m.decode("utf-8")),
            group_id="inference-group",
            auto_offset_reset="latest",
        )
        self.producer = KafkaProducer(
            bootstrap_servers=["kafka:9092"],
            value_serializer=lambda m: json.dumps(m).encode("utf-8"),
        )
        self.output_topic = output_topic

    def run(self):
        """Process messages continuously."""
        for message in self.consumer:
            try:
                features = message.value
                prediction = self.model.predict(pd.DataFrame([features]))[0]

                result = {
                    "input": features,
                    "prediction": float(prediction),
                    "timestamp": datetime.utcnow().isoformat(),
                    "model_version": self.model_version,
                }

                self.producer.send(self.output_topic, result)

            except Exception as e:
                logger.error(f"Prediction failed: {e}", extra={"input": message.value})
```
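
A minimal entry point; the topic names and registry URI are illustrative:

```python
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    predictor = StreamingPredictor(
        model_uri="models:/fraud-detector/Production",
        input_topic="transactions",
        output_topic="fraud-scores",
    )
    predictor.run()  # blocks, consuming messages until interrupted
```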

## Custom Predictors

### KServe Custom Predictor

```python
import os

import numpy as np
import pandas as pd
import torch
import yaml
from kserve import Model, ModelServer
from kserve.errors import ModelMissingError

# FeatureTransformer is a project-specific preprocessing helper,
# assumed importable alongside this module.

class CustomPredictor(Model):
    """Custom KServe predictor with preprocessing."""

    def __init__(self, name: str):
        super().__init__(name)
        self.model = None
        self.transformer = None
        self.ready = False

    def load(self) -> bool:
        """Load model and artifacts."""
        model_path = os.environ.get("MODEL_PATH", "/mnt/models")

        # Load model
        self.model = torch.jit.load(f"{model_path}/model.pt")
        self.model.eval()

        # Load preprocessing
        self.transformer = FeatureTransformer.load(f"{model_path}/transformer.pkl")

        # Load config
        with open(f"{model_path}/config.yaml") as f:
            self.config = yaml.safe_load(f)

        self.ready = True
        return self.ready

    def preprocess(self, inputs: dict, headers: dict = None) -> torch.Tensor:
        """Preprocess input data."""
        df = pd.DataFrame(inputs["instances"])

        # Validate
        validated = self.validate_input(df)

        # Transform
        features = self.transformer.transform(validated)

        return torch.tensor(features.values, dtype=torch.float32)

    def predict(self, inputs: torch.Tensor, headers: dict = None) -> np.ndarray:
        """Run inference."""
        if not self.ready:
            raise ModelMissingError(self.name)

        with torch.no_grad():
            logits = self.model(inputs)
            probabilities = torch.sigmoid(logits).numpy()

        return probabilities

    def postprocess(self, outputs: np.ndarray, headers: dict = None) -> dict:
        """Postprocess predictions."""
        return {
            "predictions": outputs.tolist(),
            "model_version": os.environ.get("MODEL_VERSION", "unknown"),
            "threshold": self.config.get("threshold", 0.5),
        }

    def validate_input(self, df: pd.DataFrame) -> pd.DataFrame:
        """Validate input against schema."""
        required_cols = self.config["required_features"]
        missing = set(required_cols) - set(df.columns)
        if missing:
            raise ValueError(f"Missing required features: {missing}")

        return df[required_cols]

if __name__ == "__main__":
    model = CustomPredictor("fraud-detector")
    model.load()  # load artifacts before serving
    ModelServer().start([model])
```
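
KServe's ModelServer listens on port 8080 by default, so the predictor can be exercised locally before packaging; the feature names are placeholders:

```bash
curl -s -H "Content-Type: application/json" \
  http://localhost:8080/v1/models/fraud-detector:predict \
  -d '{"instances": [{"amount": 120.5, "merchant_risk": 0.3}]}'
```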

### FastAPI Serving

```python
import logging
import os
import time
import uuid

import mlflow
import pandas as pd
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)

# Module-level configuration (values are deployment-specific)
THRESHOLD = float(os.environ.get("THRESHOLD", "0.5"))
MODEL_VERSION = os.environ.get("MODEL_VERSION", "unknown")

app = FastAPI(title="ML Model API", version="1.0.0")

model = None
transformer = None

class PredictionRequest(BaseModel):
    features: dict[str, float]
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))

class PredictionResponse(BaseModel):
    prediction: float
    probability: float
    model_version: str
    request_id: str
    latency_ms: float

# Load model at startup
@app.on_event("startup")
async def load_model():
    global model, transformer
    model = mlflow.pyfunc.load_model("models:/fraud-detector/Production")
    transformer = FeatureTransformer.load("transformer.pkl")  # project-specific helper

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    start_time = time.time()

    try:
        # Transform
        df = pd.DataFrame([request.features])
        features = transformer.transform(df)

        # Predict
        probability = model.predict(features)[0]
        prediction = int(probability >= THRESHOLD)

        latency_ms = (time.time() - start_time) * 1000

        return PredictionResponse(
            prediction=prediction,
            probability=float(probability),
            model_version=MODEL_VERSION,
            request_id=request.request_id,
            latency_ms=latency_ms,
        )

    except Exception as e:
        logger.error(f"Prediction failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health():
    return {"status": "healthy", "model_loaded": model is not None}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)
```
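
With the server running locally, the endpoints can be exercised directly; the feature names are placeholders:

```bash
curl -s http://localhost:8080/health

curl -s -H "Content-Type: application/json" \
  http://localhost:8080/predict \
  -d '{"features": {"amount": 120.5, "merchant_risk": 0.3}}'
```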

## Containerization

### Dockerfile

```dockerfile
# Multi-stage build for smaller image
FROM python:3.11-slim AS builder

WORKDIR /app

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip wheel --no-cache-dir --wheel-dir /app/wheels -r requirements.txt

# Production image
FROM python:3.11-slim

WORKDIR /app

# Install runtime dependencies only (curl is needed by the HEALTHCHECK below;
# the slim base image does not ship it)
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy wheels and install
COPY --from=builder /app/wheels /wheels
RUN pip install --no-cache-dir /wheels/*

# Copy application code
COPY src/ ./src/
COPY configs/ ./configs/

# Non-root user for security
RUN useradd -m -u 1000 appuser
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8080/health || exit 1

EXPOSE 8080

CMD ["python", "-m", "src.serve"]
```
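
A typical build-and-verify loop for this image (the tag and container name are arbitrary):

```bash
docker build -t model-server:latest .
docker run --rm -d -p 8080:8080 --name model-server model-server:latest
curl -f http://localhost:8080/health
```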

### GPU Support

```dockerfile
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04

# Install Python
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch built against CUDA 12.1
RUN pip3 install --no-cache-dir torch==2.1.0 --index-url https://download.pytorch.org/whl/cu121

# ... rest of Dockerfile
```
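
Running a GPU image requires the NVIDIA Container Toolkit on the host; the tag is arbitrary:

```bash
docker run --rm --gpus all -p 8080:8080 model-server:gpu
```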

## Scaling Strategies

### Horizontal Pod Autoscaler

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: model-server-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: model-server
  minReplicas: 2
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Pods
      pods:
        metric:
          name: requests_per_second
        target:
          type: AverageValue
          averageValue: "100"
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 100
          periodSeconds: 15
```
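
Note that the `requests_per_second` Pods metric only resolves if a custom metrics adapter (e.g. prometheus-adapter) is installed in the cluster. Applying and observing the autoscaler:

```bash
kubectl apply -f model-server-hpa.yaml
kubectl get hpa model-server-hpa --watch   # current vs. target metrics, replica count
```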

### GPU Scheduling

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-inference
spec:
  containers:
    - name: model-server
      image: model-server:latest
      resources:
        limits:
          nvidia.com/gpu: 1
          memory: "16Gi"
        requests:
          nvidia.com/gpu: 1
          memory: "8Gi"
  nodeSelector:
    accelerator: nvidia-tesla-t4
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
```

## Model Loading Patterns

### Lazy Loading

```python
import mlflow

class LazyModel:
    """Load model on first request."""

    def __init__(self, model_uri: str):
        self.model_uri = model_uri
        self._model = None

    @property
    def model(self):
        if self._model is None:
            self._model = mlflow.pyfunc.load_model(self.model_uri)
        return self._model

    def predict(self, features):
        return self.model.predict(features)
```
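
The property above is not thread-safe: two concurrent first requests can both trigger an expensive load. A minimal sketch of a lock-guarded variant, assuming the same MLflow URI scheme:

```python
import threading

import mlflow

class ThreadSafeLazyModel:
    """Lazy loading with double-checked locking."""

    def __init__(self, model_uri: str):
        self.model_uri = model_uri
        self._model = None
        self._lock = threading.Lock()

    @property
    def model(self):
        if self._model is None:
            with self._lock:
                if self._model is None:  # re-check under the lock
                    self._model = mlflow.pyfunc.load_model(self.model_uri)
        return self._model

    def predict(self, features):
        return self.model.predict(features)
```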

### Model Caching

```python
from functools import lru_cache
from typing import Any

import mlflow
import numpy as np

@lru_cache(maxsize=3)
def load_model(model_uri: str):
    """Cache loaded models."""
    return mlflow.pyfunc.load_model(model_uri)

class ModelManager:
    """Manage multiple model versions."""

    def __init__(self):
        self.models: dict[str, Any] = {}
        self.default_version = "production"

    def load_version(self, version: str) -> None:
        """Load a specific model version."""
        model_uri = f"models:/fraud-detector/{version}"
        self.models[version] = mlflow.pyfunc.load_model(model_uri)

    def predict(self, features, version: str | None = None) -> np.ndarray:
        """Predict using specified or default version."""
        version = version or self.default_version

        if version not in self.models:
            self.load_version(version)

        return self.models[version].predict(features)
```
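
Example usage, assuming a `fraud-detector` model in the MLflow registry and placeholder feature names:

```python
import pandas as pd

manager = ModelManager()
features = pd.DataFrame([{"amount": 120.5, "merchant_risk": 0.3}])

scores = manager.predict(features)                  # default ("production") stage
scores_v3 = manager.predict(features, version="3")  # pin an explicit version
```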

## A/B Testing

### Traffic Splitting

```yaml
# Istio VirtualService for traffic splitting
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: model-routing
spec:
  hosts:
    - model-service
  http:
    - match:
        - headers:
            x-model-version:
              exact: "v2"
      route:
        - destination:
            host: model-service-v2
    - route:
        - destination:
            host: model-service-v1
          weight: 90
        - destination:
            host: model-service-v2
          weight: 10
```
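
With this rule applied, the 90/10 weighting governs default traffic, while the `x-model-version` header pins a caller to v2. The hostname below assumes in-mesh resolution and a placeholder payload:

```bash
# Pin this request to v2 regardless of the 90/10 split
curl -s -H "x-model-version: v2" \
  -H "Content-Type: application/json" \
  http://model-service/predict \
  -d '{"features": {"amount": 120.5, "merchant_risk": 0.3}}'
```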

### Shadow Deployment

```python
import asyncio
import logging

import numpy as np

logger = logging.getLogger(__name__)

class ShadowPredictor:
    """Run predictions against a shadow model for comparison."""

    def __init__(self, primary_model, shadow_model):
        self.primary = primary_model
        self.shadow = shadow_model

    async def predict(self, features):
        # Run primary prediction (blocking)
        primary_result = self.primary.predict(features)

        # Run shadow prediction (non-blocking)
        asyncio.create_task(self._shadow_predict(features, primary_result))

        return primary_result

    async def _shadow_predict(self, features, primary_result):
        """Compare shadow predictions asynchronously."""
        try:
            shadow_result = self.shadow.predict(features)

            # Log comparison
            logger.info(
                "shadow_comparison",
                extra={
                    "primary": primary_result,
                    "shadow": shadow_result,
                    "match": np.allclose(primary_result, shadow_result, rtol=0.01),
                },
            )
        except Exception as e:
            logger.error(f"Shadow prediction failed: {e}")
```
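
Wiring the shadow predictor into a service, assuming both stages exist in the MLflow registry:

```python
import mlflow

primary = mlflow.pyfunc.load_model("models:/fraud-detector/Production")
shadow = mlflow.pyfunc.load_model("models:/fraud-detector/Staging")

predictor = ShadowPredictor(primary, shadow)
# From an async request handler:
#     result = await predictor.predict(features)
```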

## Rollback Procedures

```python
from mlflow.tracking import MlflowClient

class ModelDeployer:
    """Manage model deployments with rollback capability."""

    def __init__(self, client: MlflowClient):
        self.client = client

    def deploy(self, model_name: str, version: str) -> None:
        """Deploy a model version to production."""
        # Record current production version for rollback
        current_prod = self.client.get_latest_versions(model_name, stages=["Production"])
        if current_prod:
            self._record_rollback_version(model_name, current_prod[0].version)

        # Transition to production
        self.client.transition_model_version_stage(
            name=model_name,
            version=version,
            stage="Production",
        )

    def rollback(self, model_name: str) -> str:
        """Roll back to the previous production version."""
        previous_version = self._get_rollback_version(model_name)

        if not previous_version:
            raise ValueError("No rollback version available")

        # Archive current
        current_prod = self.client.get_latest_versions(model_name, stages=["Production"])
        for v in current_prod:
            self.client.transition_model_version_stage(
                name=model_name,
                version=v.version,
                stage="Archived",
            )

        # Restore previous
        self.client.transition_model_version_stage(
            name=model_name,
            version=previous_version,
            stage="Production",
        )

        return previous_version
```
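
`_record_rollback_version` and `_get_rollback_version` are referenced but not defined above. One possible sketch, written here as standalone helpers the class methods could delegate to, stores the outgoing version as a registered-model tag (the tag key is an arbitrary choice):

```python
from mlflow.tracking import MlflowClient

ROLLBACK_TAG = "rollback_version"  # arbitrary tag key

def record_rollback_version(client: MlflowClient, model_name: str, version: str) -> None:
    """Persist the outgoing production version as a registered-model tag."""
    client.set_registered_model_tag(model_name, ROLLBACK_TAG, version)

def get_rollback_version(client: MlflowClient, model_name: str) -> str | None:
    """Read back the recorded rollback version, if any."""
    tags = client.get_registered_model(model_name).tags
    return tags.get(ROLLBACK_TAG)
```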

## Best Practices

### Pre-Deployment Checklist

- [ ] Model validated on holdout test set
- [ ] Inference latency meets SLA (see the smoke-test sketch after this list)
- [ ] Memory footprint acceptable
- [ ] Load testing completed
- [ ] Rollback procedure documented
- [ ] Monitoring configured
- [ ] Feature transformer included
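
The latency item can be automated as a pre-deployment smoke test against the FastAPI service above; the 100 ms budget, URL, and payload are assumptions to adapt:

```python
import time

import httpx

def test_latency_sla():
    """p95 of 20 sequential requests must stay under the SLA budget."""
    payload = {"features": {"amount": 120.5, "merchant_risk": 0.3}}
    latencies = []
    with httpx.Client(base_url="http://localhost:8080") as client:
        for _ in range(20):
            start = time.perf_counter()
            response = client.post("/predict", json=payload)
            latencies.append((time.perf_counter() - start) * 1000)
            assert response.status_code == 200
    latencies.sort()
    p95 = latencies[int(len(latencies) * 0.95) - 1]
    assert p95 < 100, f"p95 latency {p95:.1f} ms exceeds 100 ms budget"
```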

### Post-Deployment Checklist

- [ ] Health checks passing
- [ ] Predictions flowing to monitoring
- [ ] Alerts configured
- [ ] A/B test metrics tracking
- [ ] Shadow comparison (if applicable)
- [ ] Documentation updated