claude-code-pack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,317 @@
1
+ # LLM Integration Guide
2
+
3
+ Production patterns for integrating Large Language Models into applications.
4
+
5
+ ---
6
+
7
+ ## Table of Contents
8
+
9
+ - [API Integration Patterns](#api-integration-patterns)
10
+ - [Prompt Engineering](#prompt-engineering)
11
+ - [Token Optimization](#token-optimization)
12
+ - [Cost Management](#cost-management)
13
+ - [Error Handling](#error-handling)
14
+
15
+ ---
16
+
17
+ ## API Integration Patterns
18
+
19
+ ### Provider Abstraction Layer
20
+
21
+ ```python
22
+ from abc import ABC, abstractmethod
23
+ from typing import List, Dict, Any
24
+
25
class LLMProvider(ABC):
    """Abstract base class for LLM providers.

    Concrete providers must implement BOTH ``complete`` (single-prompt
    completion) and ``chat`` (multi-message conversation); an ABC with an
    unimplemented abstract method cannot be instantiated.
    """

    @abstractmethod
    def complete(self, prompt: str, **kwargs) -> str:
        """Return the model's completion for a raw text prompt."""

    @abstractmethod
    def chat(self, messages: List[Dict], **kwargs) -> str:
        """Return the model's reply for a list of role/content messages."""

class OpenAIProvider(LLMProvider):
    """Provider backed by the OpenAI API.

    NOTE(review): relies on an ``OpenAI`` client class imported elsewhere
    in the surrounding file — confirm the import is present.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.client = OpenAI(api_key=api_key)
        self.model = model

    def complete(self, prompt: str, **kwargs) -> str:
        """Call the legacy completions endpoint and return the text."""
        response = self.client.completions.create(
            model=self.model,
            prompt=prompt,
            **kwargs
        )
        return response.choices[0].text

    def chat(self, messages: List[Dict], **kwargs) -> str:
        """Call the chat completions endpoint and return the reply text.

        Bug fix: the original class omitted this abstract method, so
        ``OpenAIProvider`` raised TypeError on instantiation.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            **kwargs
        )
        return response.choices[0].message.content

class AnthropicProvider(LLMProvider):
    """Provider backed by the Anthropic API.

    NOTE(review): relies on an ``Anthropic`` client class imported
    elsewhere in the surrounding file — confirm the import is present.
    """

    def __init__(self, api_key: str, model: str = "claude-3-opus"):
        self.client = Anthropic(api_key=api_key)
        self.model = model

    def complete(self, prompt: str, **kwargs) -> str:
        """Adapt a bare prompt to the Messages API.

        Bug fix: the original class omitted this abstract method, so
        ``AnthropicProvider`` raised TypeError on instantiation.
        """
        return self.chat([{"role": "user", "content": prompt}], **kwargs)

    def chat(self, messages: List[Dict], **kwargs) -> str:
        """Call the Messages API and return the first content block's text."""
        response = self.client.messages.create(
            model=self.model,
            messages=messages,
            **kwargs
        )
        return response.content[0].text
61
+ ```
62
+
63
+ ### Retry and Fallback Strategy
64
+
65
+ ```python
66
+ import time
67
+ from tenacity import retry, stop_after_attempt, wait_exponential
68
+
69
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=10)
)
def call_llm_with_retry(provider: LLMProvider, prompt: str) -> str:
    """Call LLM with exponential backoff retry."""
    # tenacity re-invokes the call up to 3 times, sleeping ~1s, 2s, 4s
    # (capped at 10s) between attempts; the final failure is re-raised.
    return provider.complete(prompt)

def call_with_fallback(
    primary: LLMProvider,
    fallback: LLMProvider,
    prompt: str
) -> str:
    """Try primary provider, fall back on failure."""
    # Each provider gets its own full retry budget, so a total outage costs
    # up to 6 attempts before the fallback's own failure propagates.
    try:
        return call_llm_with_retry(primary, prompt)
    except Exception as e:
        # NOTE(review): assumes a module-level `logger` is configured
        # elsewhere in the file — confirm before reusing this snippet.
        logger.warning(f"Primary provider failed: {e}, using fallback")
        return call_llm_with_retry(fallback, prompt)
88
+ ```
89
+
90
+ ---
91
+
92
+ ## Prompt Engineering
93
+
94
+ ### Prompt Templates
95
+
96
+ | Pattern | Use Case | Structure |
97
+ |---------|----------|-----------|
98
+ | Zero-shot | Simple tasks | Task description + input |
99
+ | Few-shot | Complex tasks | Examples + task + input |
100
+ | Chain-of-thought | Reasoning | "Think step by step" + task |
101
+ | Role-based | Specialized output | System role + task |
102
+
103
+ ### Few-Shot Template
104
+
105
+ ```python
106
FEW_SHOT_TEMPLATE = """
You are a sentiment classifier. Classify the sentiment as positive, negative, or neutral.

Examples:
Input: "This product is amazing, I love it!"
Output: positive

Input: "Terrible experience, waste of money."
Output: negative

Input: "The product arrived on time."
Output: neutral

Now classify:
Input: "{user_input}"
Output:"""

def classify_sentiment(text: str, provider: LLMProvider) -> str:
    """Classify *text* as positive/negative/neutral via a few-shot prompt.

    temperature=0 keeps the label deterministic and max_tokens=10 bounds
    the completion to a single label; the raw completion is normalized to
    a lowercase, whitespace-stripped string before being returned.
    """
    rendered = FEW_SHOT_TEMPLATE.format(user_input=text)
    raw_label = provider.complete(rendered, max_tokens=10, temperature=0)
    return raw_label.strip().lower()
127
+ ```
128
+
129
+ ### System Prompts for Consistency
130
+
131
+ ```python
132
SYSTEM_PROMPT = """You are a helpful assistant that answers questions about our product.

Guidelines:
- Be concise and direct
- Use bullet points for lists
- If unsure, say "I don't have that information"
- Never make up information
- Keep responses under 200 words

Product context:
{product_context}
"""

def create_chat_messages(user_query: str, context: str) -> List[Dict]:
    """Build the two-message chat payload: system guidelines plus user query.

    The system message embeds *context* into SYSTEM_PROMPT so every
    conversation shares the same guardrails.
    """
    system_message = {
        "role": "system",
        "content": SYSTEM_PROMPT.format(product_context=context),
    }
    user_message = {"role": "user", "content": user_query}
    return [system_message, user_message]
150
+ ```
151
+
152
+ ---
153
+
154
+ ## Token Optimization
155
+
156
+ ### Token Counting
157
+
158
+ ```python
159
+ import tiktoken
160
+
161
def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Count tokens for a given text and model."""
    # encoding_for_model maps the model name to its tokenizer encoding.
    return len(tiktoken.encoding_for_model(model).encode(text))

def truncate_to_token_limit(text: str, max_tokens: int, model: str = "gpt-4") -> str:
    """Truncate text to fit within token limit.

    Returns the original string untouched when it already fits; otherwise
    decodes only the first *max_tokens* tokens back into text.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        return encoding.decode(tokens[:max_tokens])
    return text
175
+ ```
176
+
177
+ ### Context Window Management
178
+
179
+ | Model | Context Window | Effective Limit |
180
+ |-------|----------------|-----------------|
181
+ | GPT-4 | 8,192 | ~6,000 (leave room for response) |
182
+ | GPT-4-32k | 32,768 | ~28,000 |
183
+ | Claude 3 | 200,000 | ~180,000 |
184
+ | Llama 3 | 8,192 | ~6,000 |
185
+
186
+ ### Chunking Strategy
187
+
188
+ ```python
189
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks.

    Each chunk is at most *chunk_size* characters and shares its first
    *overlap* characters with the tail of the previous chunk.

    Bug fixes vs. the original:
    - stop once a chunk reaches the end of the text (the original stepped
      back by *overlap* after the final chunk and emitted a duplicate
      tail chunk);
    - reject overlap >= chunk_size, which previously made `start` stall
      or walk backwards, looping forever.

    Raises:
        ValueError: if chunk_size <= 0, overlap < 0, or
            overlap >= chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            break  # last chunk consumed the remainder; don't step back
        start = end - overlap
    return chunks
201
+ ```
202
+
203
+ ---
204
+
205
+ ## Cost Management
206
+
207
+ ### Cost Calculation
208
+
209
+ | Provider | Input Cost | Output Cost | Example (1K tokens) |
210
+ |----------|------------|-------------|---------------------|
211
+ | GPT-4 | $0.03/1K | $0.06/1K | $0.09 |
212
+ | GPT-3.5 | $0.0005/1K | $0.0015/1K | $0.002 |
213
+ | Claude 3 Opus | $0.015/1K | $0.075/1K | $0.09 |
214
+ | Claude 3 Haiku | $0.00025/1K | $0.00125/1K | $0.0015 |
215
+
216
+ ### Cost Tracking
217
+
218
+ ```python
219
+ from dataclasses import dataclass
220
+ from typing import Optional
221
+
222
@dataclass
class LLMUsage:
    """Token usage and computed cost for a single LLM call."""

    input_tokens: int
    output_tokens: int
    model: str
    cost: float

# Per-1K-token prices in USD, keyed by model name. Module-level so the
# table is not rebuilt on every call.
_PRICING = {
    "gpt-4": {"input": 0.03, "output": 0.06},
    "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
    "claude-3-opus": {"input": 0.015, "output": 0.075},
}

# Conservative fallback applied to models missing from _PRICING.
_DEFAULT_PRICES = {"input": 0.01, "output": 0.03}

def calculate_cost(
    input_tokens: int,
    output_tokens: int,
    model: str
) -> float:
    """Calculate cost based on token usage.

    Unknown models deliberately fall back to _DEFAULT_PRICES instead of
    raising, so cost estimates stay usable when new models appear.
    """
    prices = _PRICING.get(model, _DEFAULT_PRICES)
    input_cost = (input_tokens / 1000) * prices["input"]
    output_cost = (output_tokens / 1000) * prices["output"]
    return input_cost + output_cost
247
+ ```
248
+
249
+ ### Cost Optimization Strategies
250
+
251
+ 1. **Use smaller models for simple tasks** - GPT-3.5 for classification, GPT-4 for reasoning
252
+ 2. **Cache common responses** - Store results for repeated queries
253
+ 3. **Batch requests** - Combine multiple items in single prompt
254
+ 4. **Truncate context** - Only include relevant information
255
+ 5. **Set max_tokens limit** - Prevent runaway responses
256
+
257
+ ---
258
+
259
+ ## Error Handling
260
+
261
+ ### Common Error Types
262
+
263
+ | Error | Cause | Handling |
264
+ |-------|-------|----------|
265
+ | RateLimitError | Too many requests | Exponential backoff |
266
+ | InvalidRequestError | Bad input | Validate before sending |
267
+ | AuthenticationError | Invalid API key | Check credentials |
268
+ | ServiceUnavailable | Provider down | Fallback to alternative |
269
+ | ContextLengthExceeded | Input too long | Truncate or chunk |
270
+
271
+ ### Error Handling Pattern
272
+
273
+ ```python
274
+ from openai import RateLimitError, APIError
275
+
276
def safe_llm_call(provider: LLMProvider, prompt: str, max_retries: int = 3) -> str:
    """Safely call LLM with comprehensive error handling.

    Retries on rate limits (exponential backoff: 1s, 2s, 4s, ...) and on
    5xx server errors (fixed 1s pause); any other APIError is re-raised
    immediately.

    Raises:
        RuntimeError: after *max_retries* failed attempts, chained to the
            last underlying error so the root cause is not lost.
            (RuntimeError subclasses Exception, so callers catching the
            original bare Exception still work.)
    """
    last_error = None  # keep the most recent failure for `raise ... from`
    for attempt in range(max_retries):
        try:
            return provider.complete(prompt)

        except RateLimitError as e:
            last_error = e
            wait_time = 2 ** attempt
            # NOTE(review): assumes a module-level `logger` exists.
            logger.warning(f"Rate limited, waiting {wait_time}s")
            time.sleep(wait_time)

        except APIError as e:
            if e.status_code >= 500:
                last_error = e
                logger.warning(f"Server error: {e}, retrying...")
                time.sleep(1)
            else:
                # Client-side errors (4xx) won't improve on retry.
                raise

    raise RuntimeError(f"Failed after {max_retries} attempts") from last_error
295
+ ```
296
+
297
+ ### Response Validation
298
+
299
+ ```python
300
+ import json
301
+ from pydantic import BaseModel, ValidationError
302
+
303
class StructuredResponse(BaseModel):
    # Pydantic schema the LLM's JSON output must satisfy.
    answer: str
    confidence: float
    sources: List[str]

def parse_structured_response(response: str) -> StructuredResponse:
    """Parse and validate LLM JSON response.

    Raises:
        ValueError: if *response* is not valid JSON, or if the JSON does
            not match the StructuredResponse schema (missing fields or
            wrong types) — both failure modes are folded into ValueError
            so callers have a single exception type to handle.
    """
    try:
        data = json.loads(response)
        return StructuredResponse(**data)
    except json.JSONDecodeError:
        # Model emitted non-JSON text (e.g. prose wrapped around the object).
        raise ValueError("Response is not valid JSON")
    except ValidationError as e:
        # JSON parsed, but fields are missing or of the wrong type.
        raise ValueError(f"Response validation failed: {e}")
317
+ ```
@@ -0,0 +1,265 @@
1
+ # MLOps Production Patterns
2
+
3
+ Production ML infrastructure patterns for model deployment, monitoring, and lifecycle management.
4
+
5
+ ---
6
+
7
+ ## Table of Contents
8
+
9
+ - [Model Deployment Pipeline](#model-deployment-pipeline)
10
+ - [Feature Store Architecture](#feature-store-architecture)
11
+ - [Model Monitoring](#model-monitoring)
12
+ - [A/B Testing Infrastructure](#ab-testing-infrastructure)
13
+ - [Automated Retraining](#automated-retraining)
14
+
15
+ ---
16
+
17
+ ## Model Deployment Pipeline
18
+
19
+ ### Deployment Workflow
20
+
21
+ 1. Export trained model to standardized format (ONNX, TorchScript, SavedModel)
22
+ 2. Package model with dependencies in Docker container
23
+ 3. Deploy to staging environment
24
+ 4. Run integration tests against staging
25
+ 5. Deploy canary (5% traffic) to production
26
+ 6. Monitor latency and error rates for 1 hour
27
+ 7. Promote to full production if metrics pass
28
+ 8. **Validation:** p95 latency < 100ms, error rate < 0.1%
29
+
30
+ ### Container Structure
31
+
32
+ ```dockerfile
33
FROM python:3.11-slim

# Install dependencies first so this layer is cached across code-only changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy model artifacts
COPY model/ /app/model/
COPY src/ /app/src/

# Health check endpoint.
# Bug fix: python:3.11-slim does not ship curl, so the original
# `curl -f ...` check always failed and marked the container unhealthy;
# probe with the Python stdlib instead (urlopen raises on HTTP errors).
HEALTHCHECK CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1

EXPOSE 8080
CMD ["uvicorn", "src.server:app", "--host", "0.0.0.0", "--port", "8080"]
48
+ ```
49
+
50
+ ### Model Serving Options
51
+
52
+ | Option | Latency | Throughput | Use Case |
53
+ |--------|---------|------------|----------|
54
+ | FastAPI + Uvicorn | Low | Medium | REST APIs, small models |
55
+ | Triton Inference Server | Very Low | Very High | GPU inference, batching |
56
+ | TensorFlow Serving | Low | High | TensorFlow models |
57
+ | TorchServe | Low | High | PyTorch models |
58
+ | Ray Serve | Medium | High | Complex pipelines, multi-model |
59
+
60
+ ### Kubernetes Deployment
61
+
62
+ ```yaml
63
apiVersion: apps/v1
kind: Deployment
metadata:
  name: model-serving
spec:
  replicas: 3
  selector:
    matchLabels:
      app: model-serving
  template:
    # Bug fix: the pod template must carry labels matching
    # spec.selector.matchLabels, otherwise the API server rejects the
    # Deployment ("selector does not match template labels").
    metadata:
      labels:
        app: model-serving
    spec:
      containers:
      - name: model
        image: model:v1.0.0
        resources:
          requests:
            memory: "2Gi"
            cpu: "1"
          limits:
            memory: "4Gi"
            cpu: "2"
        # Pod only receives traffic once /health answers successfully.
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 5
90
+ ```
91
+
92
+ ---
93
+
94
+ ## Feature Store Architecture
95
+
96
+ ### Feature Store Components
97
+
98
+ | Component | Purpose | Tools |
99
+ |-----------|---------|-------|
100
+ | Offline Store | Training data, batch features | BigQuery, Snowflake, S3 |
101
+ | Online Store | Low-latency serving | Redis, DynamoDB, Feast |
102
+ | Feature Registry | Metadata, lineage | Feast, Tecton, Hopsworks |
103
+ | Transformation | Feature engineering | Spark, Flink, dbt |
104
+
105
+ ### Feature Pipeline Workflow
106
+
107
+ 1. Define feature schema in registry
108
+ 2. Implement transformation logic (SQL or Python)
109
+ 3. Backfill historical features to offline store
110
+ 4. Schedule incremental updates
111
+ 5. Materialize to online store for serving
112
+ 6. Monitor feature freshness and quality
113
+ 7. **Validation:** Feature values within expected ranges, no nulls in required fields
114
+
115
+ ### Feature Definition Example
116
+
117
+ ```python
118
# Bug fix: the original snippet used `ValueType` and `timedelta` without
# importing them; both imports are added so the example runs as written.
from datetime import timedelta

from feast import Entity, Feature, FeatureView, FileSource, ValueType

# Entity: the join key used to look up features for a given user.
user = Entity(name="user_id", value_type=ValueType.INT64)

# FeatureView: a named group of user-level features; ttl bounds staleness.
user_features = FeatureView(
    name="user_features",
    entities=["user_id"],
    ttl=timedelta(days=1),  # values older than one day are considered stale
    features=[
        Feature(name="purchase_count_30d", dtype=ValueType.INT64),
        Feature(name="avg_order_value", dtype=ValueType.FLOAT),
        Feature(name="days_since_last_purchase", dtype=ValueType.INT64),
    ],
    online=True,  # materialize into the online store for low-latency serving
    source=FileSource(path="data/user_features.parquet"),
)
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Model Monitoring
139
+
140
+ ### Monitoring Dimensions
141
+
142
+ | Dimension | Metrics | Alert Threshold |
143
+ |-----------|---------|-----------------|
144
+ | Latency | p50, p95, p99 | p95 > 100ms |
145
+ | Throughput | requests/sec | < 80% baseline |
146
+ | Errors | error rate, 5xx count | > 0.1% |
147
+ | Data Drift | PSI, KS statistic | PSI > 0.2 |
148
+ | Model Drift | accuracy, AUC decay | > 5% drop |
149
+
150
+ ### Data Drift Detection
151
+
152
+ ```python
153
+ from scipy.stats import ks_2samp
154
+ import numpy as np
155
+
156
def detect_drift(reference: np.ndarray, current: np.ndarray, threshold: float = 0.05) -> dict:
    """Detect distribution drift using the two-sample Kolmogorov-Smirnov test.

    Args:
        reference: Sample drawn from the baseline/training distribution.
        current: Sample drawn from live traffic.
        threshold: Significance level; a p-value below it flags drift.

    Returns:
        dict with the drift flag, KS statistic, p-value, and the
        threshold used — all plain Python types (bool/float), not numpy
        scalars, so the result is directly JSON-serializable.

    Note: the original annotated the arrays as ``np.array``, which is the
    constructor function, not a type; ``np.ndarray`` is the correct type.
    """
    statistic, p_value = ks_2samp(reference, current)

    return {
        "drift_detected": bool(p_value < threshold),
        "ks_statistic": float(statistic),
        "p_value": float(p_value),
        "threshold": threshold,
    }
168
+ ```
169
+
170
+ ### Monitoring Dashboard Metrics
171
+
172
+ **Infrastructure:**
173
+ - Request latency (p50, p95, p99)
174
+ - Requests per second
175
+ - Error rate by type
176
+ - CPU/memory utilization
177
+ - GPU utilization (if applicable)
178
+
179
+ **Model Performance:**
180
+ - Prediction distribution
181
+ - Feature value distributions
182
+ - Model output confidence
183
+ - Ground truth vs predictions (when available)
184
+
185
+ ---
186
+
187
+ ## A/B Testing Infrastructure
188
+
189
+ ### Experiment Workflow
190
+
191
+ 1. Define experiment hypothesis and success metrics
192
+ 2. Calculate required sample size for statistical power
193
+ 3. Configure traffic split (control vs treatment)
194
+ 4. Deploy treatment model alongside control
195
+ 5. Route traffic based on user/session hash
196
+ 6. Collect metrics for both variants
197
+ 7. Run statistical significance test
198
+ 8. **Validation:** p-value < 0.05, minimum sample size reached
199
+
200
+ ### Traffic Splitting
201
+
202
+ ```python
203
+ import hashlib
204
+
205
def get_variant(user_id: str, experiment: str, control_pct: float = 0.5) -> str:
    """Deterministic traffic splitting based on user ID.

    Hashing "user:experiment" gives each user a stable bucket per
    experiment, so the same user always sees the same variant while
    different experiments split independently. MD5 is used only as a
    uniform hash here, not for security.
    """
    digest = hashlib.md5(f"{user_id}:{experiment}".encode()).hexdigest()
    bucket = (int(digest, 16) % 100) / 100.0
    if bucket < control_pct:
        return "control"
    return "treatment"
212
+ ```
213
+
214
+ ### Metrics Collection
215
+
216
+ | Metric Type | Examples | Collection Method |
217
+ |-------------|----------|-------------------|
218
+ | Primary | Conversion rate, revenue | Event logging |
219
+ | Secondary | Latency, engagement | Request logs |
220
+ | Guardrail | Error rate, crashes | Monitoring system |
221
+
222
+ ---
223
+
224
+ ## Automated Retraining
225
+
226
+ ### Retraining Triggers
227
+
228
+ | Trigger | Detection Method | Action |
229
+ |---------|------------------|--------|
230
+ | Scheduled | Cron (weekly/monthly) | Full retrain |
231
+ | Performance drop | Accuracy < threshold | Immediate retrain |
232
+ | Data drift | PSI > 0.2 | Evaluate, then retrain |
233
+ | New data volume | X new samples | Incremental update |
234
+
235
+ ### Retraining Pipeline
236
+
237
+ 1. Trigger detection (schedule, drift, performance)
238
+ 2. Fetch latest training data from feature store
239
+ 3. Run training job with hyperparameter config
240
+ 4. Evaluate model on holdout set
241
+ 5. Compare against production model
242
+ 6. If improved: register new model version
243
+ 7. Deploy to staging for validation
244
+ 8. Promote to production via canary
245
+ 9. **Validation:** New model outperforms baseline on key metrics
246
+
247
+ ### MLflow Model Registry Integration
248
+
249
+ ```python
250
+ import mlflow
251
+
252
def register_model(model, metrics: dict, model_name: str):
    """Register trained model with MLflow.

    Logs *metrics* and the fitted sklearn *model* under a new MLflow run,
    then registers that run's model artifact in the Model Registry under
    *model_name* (MLflow creates a new version if the name exists).
    """
    with mlflow.start_run():
        # Log metrics
        for name, value in metrics.items():
            mlflow.log_metric(name, value)

        # Log model
        # assumes *model* is an sklearn estimator — sklearn flavor is used
        mlflow.sklearn.log_model(model, "model")

        # Register in model registry
        # Must stay inside the `with` block: active_run() is None after it exits.
        model_uri = f"runs:/{mlflow.active_run().info.run_id}/model"
        mlflow.register_model(model_uri, model_name)
265
+ ```