claude-code-pack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -0
- package/assets/statusline-command.sh +116 -0
- package/bin/claude-pack.mjs +48 -0
- package/claude-pack.config.json +37 -0
- package/package.json +31 -0
- package/skills/cloud-devops/SKILL.md +235 -0
- package/skills/fastapi/SKILL.md +436 -0
- package/skills/fastapi/references/dependencies.md +142 -0
- package/skills/fastapi/references/other-tools.md +76 -0
- package/skills/fastapi/references/streaming.md +105 -0
- package/skills/senior-ml-engineer/SKILL.md +304 -0
- package/skills/senior-ml-engineer/references/llm_integration_guide.md +317 -0
- package/skills/senior-ml-engineer/references/mlops_production_patterns.md +265 -0
- package/skills/senior-ml-engineer/references/rag_system_architecture.md +371 -0
- package/skills/senior-ml-engineer/scripts/ml_monitoring_suite.py +100 -0
- package/skills/senior-ml-engineer/scripts/model_deployment_pipeline.py +100 -0
- package/skills/senior-ml-engineer/scripts/rag_system_builder.py +100 -0
- package/skills/technical-writer/technical-writer/SKILL.md +351 -0
- package/src/install.mjs +391 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Streaming
|
|
2
|
+
|
|
3
|
+
## Stream JSON Lines
|
|
4
|
+
|
|
5
|
+
To stream JSON Lines, declare the return type and use `yield` to return the data.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
@app.get("/items/stream")
|
|
9
|
+
async def stream_items() -> AsyncIterable[Item]:
|
|
10
|
+
for item in items:
|
|
11
|
+
yield item
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Server-Sent Events (SSE)
|
|
15
|
+
|
|
16
|
+
To stream Server-Sent Events, use `response_class=EventSourceResponse` and `yield` items from the endpoint.
|
|
17
|
+
|
|
18
|
+
Plain objects are automatically JSON-serialized as `data:` fields. Declare the return type so that the serialization is done by Pydantic:
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from collections.abc import AsyncIterable
|
|
22
|
+
|
|
23
|
+
from fastapi import FastAPI
|
|
24
|
+
from fastapi.sse import EventSourceResponse
|
|
25
|
+
from pydantic import BaseModel
|
|
26
|
+
|
|
27
|
+
app = FastAPI()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Item(BaseModel):
|
|
31
|
+
name: str
|
|
32
|
+
price: float
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@app.get("/items/stream", response_class=EventSourceResponse)
|
|
36
|
+
async def stream_items() -> AsyncIterable[Item]:
|
|
37
|
+
yield Item(name="Plumbus", price=32.99)
|
|
38
|
+
yield Item(name="Portal Gun", price=999.99)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
For full control over SSE fields (`event`, `id`, `retry`, `comment`), yield `ServerSentEvent` instances:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from collections.abc import AsyncIterable
|
|
45
|
+
|
|
46
|
+
from fastapi import FastAPI
|
|
47
|
+
from fastapi.sse import EventSourceResponse, ServerSentEvent
|
|
48
|
+
|
|
49
|
+
app = FastAPI()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@app.get("/events", response_class=EventSourceResponse)
|
|
53
|
+
async def stream_events() -> AsyncIterable[ServerSentEvent]:
|
|
54
|
+
yield ServerSentEvent(data={"status": "started"}, event="status", id="1")
|
|
55
|
+
yield ServerSentEvent(data={"progress": 50}, event="progress", id="2")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Use `raw_data` instead of `data` to send pre-formatted strings without JSON encoding:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
yield ServerSentEvent(raw_data="plain text line", event="log")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Stream bytes
|
|
65
|
+
|
|
66
|
+
To stream bytes, declare a `response_class=` of `StreamingResponse` or a sub-class, and use `yield` to return the data.
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from fastapi import FastAPI
|
|
70
|
+
from fastapi.responses import StreamingResponse
|
|
71
|
+
from app.utils import read_image
|
|
72
|
+
|
|
73
|
+
app = FastAPI()
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class PNGStreamingResponse(StreamingResponse):
|
|
77
|
+
media_type = "image/png"
|
|
78
|
+
|
|
79
|
+
@app.get("/image", response_class=PNGStreamingResponse)
|
|
80
|
+
def stream_image_no_async_no_annotation():
|
|
81
|
+
with read_image() as image_file:
|
|
82
|
+
yield from image_file
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Prefer this over returning a `StreamingResponse` directly:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
# DO NOT DO THIS
|
|
89
|
+
|
|
90
|
+
import anyio
|
|
91
|
+
from fastapi import FastAPI
|
|
92
|
+
from fastapi.responses import StreamingResponse
|
|
93
|
+
from app.utils import read_image
|
|
94
|
+
|
|
95
|
+
app = FastAPI()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class PNGStreamingResponse(StreamingResponse):
|
|
99
|
+
media_type = "image/png"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@app.get("/")
|
|
103
|
+
async def main():
|
|
104
|
+
return PNGStreamingResponse(read_image())
|
|
105
|
+
```
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: "senior-ml-engineer"
|
|
3
|
+
description: ML engineering skill for productionizing models, building MLOps pipelines, and integrating LLMs. Covers model deployment, feature stores, drift monitoring, RAG systems, and cost optimization. Use when the user asks about deploying ML models to production, setting up MLOps infrastructure (MLflow, Kubeflow, Kubernetes, Docker), monitoring model performance or drift, building RAG pipelines, or integrating LLM APIs with retry logic and cost controls. Focused on production and operational concerns rather than model research or initial training.
|
|
4
|
+
triggers:
|
|
5
|
+
- MLOps pipeline
|
|
6
|
+
- model deployment
|
|
7
|
+
- feature store
|
|
8
|
+
- model monitoring
|
|
9
|
+
- drift detection
|
|
10
|
+
- RAG system
|
|
11
|
+
- LLM integration
|
|
12
|
+
- model serving
|
|
13
|
+
- A/B testing ML
|
|
14
|
+
- automated retraining
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
# Senior ML Engineer
|
|
18
|
+
|
|
19
|
+
Production ML engineering patterns for model deployment, MLOps infrastructure, and LLM integration.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Table of Contents
|
|
24
|
+
|
|
25
|
+
- [Model Deployment Workflow](#model-deployment-workflow)
|
|
26
|
+
- [MLOps Pipeline Setup](#mlops-pipeline-setup)
|
|
27
|
+
- [LLM Integration Workflow](#llm-integration-workflow)
|
|
28
|
+
- [RAG System Implementation](#rag-system-implementation)
|
|
29
|
+
- [Model Monitoring](#model-monitoring)
|
|
30
|
+
- [Reference Documentation](#reference-documentation)
|
|
31
|
+
- [Tools](#tools)
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Model Deployment Workflow
|
|
36
|
+
|
|
37
|
+
Deploy a trained model to production with monitoring:
|
|
38
|
+
|
|
39
|
+
1. Export model to standardized format (ONNX, TorchScript, SavedModel)
|
|
40
|
+
2. Package model with dependencies in Docker container
|
|
41
|
+
3. Deploy to staging environment
|
|
42
|
+
4. Run integration tests against staging
|
|
43
|
+
5. Deploy canary (5% traffic) to production
|
|
44
|
+
6. Monitor latency and error rates for 1 hour
|
|
45
|
+
7. Promote to full production if metrics pass
|
|
46
|
+
8. **Validation:** p95 latency < 100ms, error rate < 0.1%
|
|
47
|
+
|
|
48
|
+
### Container Template
|
|
49
|
+
|
|
50
|
+
```dockerfile
|
|
51
|
+
FROM python:3.11-slim
|
|
52
|
+
|
|
53
|
+
COPY requirements.txt .
|
|
54
|
+
RUN pip install --no-cache-dir -r requirements.txt
|
|
55
|
+
|
|
56
|
+
COPY model/ /app/model/
|
|
57
|
+
COPY src/ /app/src/
|
|
58
|
+
|
|
59
|
+
HEALTHCHECK CMD curl -f http://localhost:8080/health || exit 1
|
|
60
|
+
|
|
61
|
+
EXPOSE 8080
|
|
62
|
+
CMD ["uvicorn", "src.server:app", "--host", "0.0.0.0", "--port", "8080"]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Serving Options
|
|
66
|
+
|
|
67
|
+
| Option | Latency | Throughput | Use Case |
|
|
68
|
+
|--------|---------|------------|----------|
|
|
69
|
+
| FastAPI + Uvicorn | Low | Medium | REST APIs, small models |
|
|
70
|
+
| Triton Inference Server | Very Low | Very High | GPU inference, batching |
|
|
71
|
+
| TensorFlow Serving | Low | High | TensorFlow models |
|
|
72
|
+
| TorchServe | Low | High | PyTorch models |
|
|
73
|
+
| Ray Serve | Medium | High | Complex pipelines, multi-model |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## MLOps Pipeline Setup
|
|
78
|
+
|
|
79
|
+
Establish automated training and deployment:
|
|
80
|
+
|
|
81
|
+
1. Configure feature store (Feast, Tecton) for training data
|
|
82
|
+
2. Set up experiment tracking (MLflow, Weights & Biases)
|
|
83
|
+
3. Create training pipeline with hyperparameter logging
|
|
84
|
+
4. Register model in model registry with version metadata
|
|
85
|
+
5. Configure staging deployment triggered by registry events
|
|
86
|
+
6. Set up A/B testing infrastructure for model comparison
|
|
87
|
+
7. Enable drift monitoring with alerting
|
|
88
|
+
8. **Validation:** New models automatically evaluated against baseline
|
|
89
|
+
|
|
90
|
+
### Feature Store Pattern
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from feast import Entity, Feature, FeatureView, FileSource
|
|
94
|
+
|
|
95
|
+
user = Entity(name="user_id", value_type=ValueType.INT64)
|
|
96
|
+
|
|
97
|
+
user_features = FeatureView(
|
|
98
|
+
name="user_features",
|
|
99
|
+
entities=["user_id"],
|
|
100
|
+
ttl=timedelta(days=1),
|
|
101
|
+
features=[
|
|
102
|
+
Feature(name="purchase_count_30d", dtype=ValueType.INT64),
|
|
103
|
+
Feature(name="avg_order_value", dtype=ValueType.FLOAT),
|
|
104
|
+
],
|
|
105
|
+
online=True,
|
|
106
|
+
source=FileSource(path="data/user_features.parquet"),
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Retraining Triggers
|
|
111
|
+
|
|
112
|
+
| Trigger | Detection | Action |
|
|
113
|
+
|---------|-----------|--------|
|
|
114
|
+
| Scheduled | Cron (weekly/monthly) | Full retrain |
|
|
115
|
+
| Performance drop | Accuracy < threshold | Immediate retrain |
|
|
116
|
+
| Data drift | PSI > 0.2 | Evaluate, then retrain |
|
|
117
|
+
| New data volume | New samples exceed a configured count | Incremental update |
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## LLM Integration Workflow
|
|
122
|
+
|
|
123
|
+
Integrate LLM APIs into production applications:
|
|
124
|
+
|
|
125
|
+
1. Create provider abstraction layer for vendor flexibility
|
|
126
|
+
2. Implement retry logic with exponential backoff
|
|
127
|
+
3. Configure fallback to secondary provider
|
|
128
|
+
4. Set up token counting and context truncation
|
|
129
|
+
5. Add response caching for repeated queries
|
|
130
|
+
6. Implement cost tracking per request
|
|
131
|
+
7. Add structured output validation with Pydantic
|
|
132
|
+
8. **Validation:** Response parses correctly, cost within budget
|
|
133
|
+
|
|
134
|
+
### Provider Abstraction
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from abc import ABC, abstractmethod
|
|
138
|
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
139
|
+
|
|
140
|
+
class LLMProvider(ABC):
|
|
141
|
+
@abstractmethod
|
|
142
|
+
def complete(self, prompt: str, **kwargs) -> str:
|
|
143
|
+
pass
|
|
144
|
+
|
|
145
|
+
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
|
|
146
|
+
def call_llm_with_retry(provider: LLMProvider, prompt: str) -> str:
|
|
147
|
+
return provider.complete(prompt)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Cost Management
|
|
151
|
+
|
|
152
|
+
| Provider | Input Cost (per 1K tokens) | Output Cost (per 1K tokens) |
|
|
153
|
+
|----------|------------|-------------|
|
|
154
|
+
| GPT-4 | $0.03/1K | $0.06/1K |
|
|
155
|
+
| GPT-3.5 | $0.0005/1K | $0.0015/1K |
|
|
156
|
+
| Claude 3 Opus | $0.015/1K | $0.075/1K |
|
|
157
|
+
| Claude 3 Haiku | $0.00025/1K | $0.00125/1K |
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## RAG System Implementation
|
|
162
|
+
|
|
163
|
+
Build retrieval-augmented generation pipeline:
|
|
164
|
+
|
|
165
|
+
1. Choose vector database (Pinecone, Qdrant, Weaviate)
|
|
166
|
+
2. Select embedding model based on quality/cost tradeoff
|
|
167
|
+
3. Implement document chunking strategy
|
|
168
|
+
4. Create ingestion pipeline with metadata extraction
|
|
169
|
+
5. Build retrieval with query embedding
|
|
170
|
+
6. Add reranking for relevance improvement
|
|
171
|
+
7. Format context and send to LLM
|
|
172
|
+
8. **Validation:** Response cites the retrieved context and contains no hallucinated claims
|
|
173
|
+
|
|
174
|
+
### Vector Database Selection
|
|
175
|
+
|
|
176
|
+
| Database | Hosting | Scale | Latency | Best For |
|
|
177
|
+
|----------|---------|-------|---------|----------|
|
|
178
|
+
| Pinecone | Managed | High | Low | Production, managed |
|
|
179
|
+
| Qdrant | Both | High | Very Low | Performance-critical |
|
|
180
|
+
| Weaviate | Both | High | Low | Hybrid search |
|
|
181
|
+
| Chroma | Self-hosted | Medium | Low | Prototyping |
|
|
182
|
+
| pgvector | Self-hosted | Medium | Medium | Existing Postgres |
|
|
183
|
+
|
|
184
|
+
### Chunking Strategies
|
|
185
|
+
|
|
186
|
+
| Strategy | Chunk Size | Overlap | Best For |
|
|
187
|
+
|----------|------------|---------|----------|
|
|
188
|
+
| Fixed | 500-1000 tokens | 50-100 tokens | General text |
|
|
189
|
+
| Sentence | 3-5 sentences | 1 sentence | Structured text |
|
|
190
|
+
| Semantic | Variable | Based on meaning | Research papers |
|
|
191
|
+
| Recursive | Hierarchical | Parent-child | Long documents |
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Model Monitoring
|
|
196
|
+
|
|
197
|
+
Monitor production models for drift and degradation:
|
|
198
|
+
|
|
199
|
+
1. Set up latency tracking (p50, p95, p99)
|
|
200
|
+
2. Configure error rate alerting
|
|
201
|
+
3. Implement input data drift detection
|
|
202
|
+
4. Track prediction distribution shifts
|
|
203
|
+
5. Log ground truth when available
|
|
204
|
+
6. Compare model versions with A/B metrics
|
|
205
|
+
7. Set up automated retraining triggers
|
|
206
|
+
8. **Validation:** Alerts fire before user-visible degradation
|
|
207
|
+
|
|
208
|
+
### Drift Detection
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from scipy.stats import ks_2samp
|
|
212
|
+
|
|
213
|
+
def detect_drift(reference, current, threshold=0.05):
|
|
214
|
+
statistic, p_value = ks_2samp(reference, current)
|
|
215
|
+
return {
|
|
216
|
+
"drift_detected": p_value < threshold,
|
|
217
|
+
"ks_statistic": statistic,
|
|
218
|
+
"p_value": p_value
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Alert Thresholds
|
|
223
|
+
|
|
224
|
+
| Metric | Warning | Critical |
|
|
225
|
+
|--------|---------|----------|
|
|
226
|
+
| p95 latency | > 100ms | > 200ms |
|
|
227
|
+
| Error rate | > 0.1% | > 1% |
|
|
228
|
+
| PSI (drift) | > 0.1 | > 0.2 |
|
|
229
|
+
| Accuracy drop | > 2% | > 5% |
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## Reference Documentation
|
|
234
|
+
|
|
235
|
+
### MLOps Production Patterns
|
|
236
|
+
|
|
237
|
+
`references/mlops_production_patterns.md` contains:
|
|
238
|
+
|
|
239
|
+
- Model deployment pipeline with Kubernetes manifests
|
|
240
|
+
- Feature store architecture with Feast examples
|
|
241
|
+
- Model monitoring with drift detection code
|
|
242
|
+
- A/B testing infrastructure with traffic splitting
|
|
243
|
+
- Automated retraining pipeline with MLflow
|
|
244
|
+
|
|
245
|
+
### LLM Integration Guide
|
|
246
|
+
|
|
247
|
+
`references/llm_integration_guide.md` contains:
|
|
248
|
+
|
|
249
|
+
- Provider abstraction layer pattern
|
|
250
|
+
- Retry and fallback strategies with tenacity
|
|
251
|
+
- Prompt engineering templates (few-shot, CoT)
|
|
252
|
+
- Token optimization with tiktoken
|
|
253
|
+
- Cost calculation and tracking
|
|
254
|
+
|
|
255
|
+
### RAG System Architecture
|
|
256
|
+
|
|
257
|
+
`references/rag_system_architecture.md` contains:
|
|
258
|
+
|
|
259
|
+
- RAG pipeline implementation with code
|
|
260
|
+
- Vector database comparison and integration
|
|
261
|
+
- Chunking strategies (fixed, semantic, recursive)
|
|
262
|
+
- Embedding model selection guide
|
|
263
|
+
- Hybrid search and reranking patterns
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## Tools
|
|
268
|
+
|
|
269
|
+
### Model Deployment Pipeline
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
python scripts/model_deployment_pipeline.py --model model.pkl --target staging
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Generates deployment artifacts: Dockerfile, Kubernetes manifests, health checks.
|
|
276
|
+
|
|
277
|
+
### RAG System Builder
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
python scripts/rag_system_builder.py --config rag_config.yaml --analyze
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Scaffolds RAG pipeline with vector store integration and retrieval logic.
|
|
284
|
+
|
|
285
|
+
### ML Monitoring Suite
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
python scripts/ml_monitoring_suite.py --config monitoring.yaml --deploy
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
Sets up drift detection, alerting, and performance dashboards.
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## Tech Stack
|
|
296
|
+
|
|
297
|
+
| Category | Tools |
|
|
298
|
+
|----------|-------|
|
|
299
|
+
| ML Frameworks | PyTorch, TensorFlow, Scikit-learn, XGBoost |
|
|
300
|
+
| LLM Frameworks | LangChain, LlamaIndex, DSPy |
|
|
301
|
+
| MLOps | MLflow, Weights & Biases, Kubeflow |
|
|
302
|
+
| Data | Spark, Airflow, dbt, Kafka |
|
|
303
|
+
| Deployment | Docker, Kubernetes, Triton |
|
|
304
|
+
| Databases | PostgreSQL, BigQuery, Pinecone, Redis |
|