omgkit 2.13.0 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -10
- package/package.json +2 -2
- package/plugin/agents/api-designer.md +5 -0
- package/plugin/agents/architect.md +8 -0
- package/plugin/agents/brainstormer.md +4 -0
- package/plugin/agents/cicd-manager.md +6 -0
- package/plugin/agents/code-reviewer.md +6 -0
- package/plugin/agents/copywriter.md +2 -0
- package/plugin/agents/data-engineer.md +255 -0
- package/plugin/agents/database-admin.md +10 -0
- package/plugin/agents/debugger.md +10 -0
- package/plugin/agents/devsecops.md +314 -0
- package/plugin/agents/docs-manager.md +4 -0
- package/plugin/agents/domain-decomposer.md +181 -0
- package/plugin/agents/embedded-systems.md +397 -0
- package/plugin/agents/fullstack-developer.md +12 -0
- package/plugin/agents/game-systems-designer.md +375 -0
- package/plugin/agents/git-manager.md +10 -0
- package/plugin/agents/journal-writer.md +2 -0
- package/plugin/agents/ml-engineer.md +284 -0
- package/plugin/agents/observability-engineer.md +353 -0
- package/plugin/agents/oracle.md +9 -0
- package/plugin/agents/performance-engineer.md +290 -0
- package/plugin/agents/pipeline-architect.md +6 -0
- package/plugin/agents/planner.md +12 -0
- package/plugin/agents/platform-engineer.md +325 -0
- package/plugin/agents/project-manager.md +3 -0
- package/plugin/agents/researcher.md +5 -0
- package/plugin/agents/scientific-computing.md +426 -0
- package/plugin/agents/scout.md +3 -0
- package/plugin/agents/security-auditor.md +7 -0
- package/plugin/agents/sprint-master.md +17 -0
- package/plugin/agents/tester.md +10 -0
- package/plugin/agents/ui-ux-designer.md +12 -0
- package/plugin/agents/vulnerability-scanner.md +6 -0
- package/plugin/commands/data/pipeline.md +47 -0
- package/plugin/commands/data/quality.md +49 -0
- package/plugin/commands/domain/analyze.md +34 -0
- package/plugin/commands/domain/map.md +41 -0
- package/plugin/commands/game/balance.md +56 -0
- package/plugin/commands/game/optimize.md +62 -0
- package/plugin/commands/iot/provision.md +58 -0
- package/plugin/commands/ml/evaluate.md +47 -0
- package/plugin/commands/ml/train.md +48 -0
- package/plugin/commands/perf/benchmark.md +54 -0
- package/plugin/commands/perf/profile.md +49 -0
- package/plugin/commands/platform/blueprint.md +56 -0
- package/plugin/commands/security/audit.md +54 -0
- package/plugin/commands/security/scan.md +55 -0
- package/plugin/commands/sre/dashboard.md +53 -0
- package/plugin/registry.yaml +711 -0
- package/plugin/skills/ai-ml/experiment-tracking/SKILL.md +338 -0
- package/plugin/skills/ai-ml/feature-stores/SKILL.md +340 -0
- package/plugin/skills/ai-ml/llm-ops/SKILL.md +454 -0
- package/plugin/skills/ai-ml/ml-pipelines/SKILL.md +390 -0
- package/plugin/skills/ai-ml/model-monitoring/SKILL.md +398 -0
- package/plugin/skills/ai-ml/model-serving/SKILL.md +386 -0
- package/plugin/skills/event-driven/cqrs-patterns/SKILL.md +348 -0
- package/plugin/skills/event-driven/event-sourcing/SKILL.md +334 -0
- package/plugin/skills/event-driven/kafka-deep/SKILL.md +252 -0
- package/plugin/skills/event-driven/saga-orchestration/SKILL.md +335 -0
- package/plugin/skills/event-driven/schema-registry/SKILL.md +328 -0
- package/plugin/skills/event-driven/stream-processing/SKILL.md +313 -0
- package/plugin/skills/game/game-audio/SKILL.md +446 -0
- package/plugin/skills/game/game-networking/SKILL.md +490 -0
- package/plugin/skills/game/godot-patterns/SKILL.md +413 -0
- package/plugin/skills/game/shader-programming/SKILL.md +492 -0
- package/plugin/skills/game/unity-patterns/SKILL.md +488 -0
- package/plugin/skills/iot/device-provisioning/SKILL.md +405 -0
- package/plugin/skills/iot/edge-computing/SKILL.md +369 -0
- package/plugin/skills/iot/industrial-protocols/SKILL.md +438 -0
- package/plugin/skills/iot/mqtt-deep/SKILL.md +418 -0
- package/plugin/skills/iot/ota-updates/SKILL.md +426 -0
- package/plugin/skills/microservices/api-gateway-patterns/SKILL.md +201 -0
- package/plugin/skills/microservices/circuit-breaker-patterns/SKILL.md +246 -0
- package/plugin/skills/microservices/contract-testing/SKILL.md +284 -0
- package/plugin/skills/microservices/distributed-tracing/SKILL.md +246 -0
- package/plugin/skills/microservices/service-discovery/SKILL.md +304 -0
- package/plugin/skills/microservices/service-mesh/SKILL.md +181 -0
- package/plugin/skills/mobile-advanced/mobile-ci-cd/SKILL.md +407 -0
- package/plugin/skills/mobile-advanced/mobile-security/SKILL.md +403 -0
- package/plugin/skills/mobile-advanced/offline-first/SKILL.md +473 -0
- package/plugin/skills/mobile-advanced/push-notifications/SKILL.md +494 -0
- package/plugin/skills/mobile-advanced/react-native-deep/SKILL.md +374 -0
- package/plugin/skills/simulation/numerical-methods/SKILL.md +434 -0
- package/plugin/skills/simulation/parallel-computing/SKILL.md +382 -0
- package/plugin/skills/simulation/physics-engines/SKILL.md +377 -0
- package/plugin/skills/simulation/validation-verification/SKILL.md +479 -0
- package/plugin/skills/simulation/visualization-scientific/SKILL.md +365 -0
- package/plugin/workflows/ai-engineering/agent-development.md +3 -3
- package/plugin/workflows/ai-engineering/fine-tuning.md +3 -3
- package/plugin/workflows/ai-engineering/model-evaluation.md +3 -3
- package/plugin/workflows/ai-engineering/prompt-engineering.md +2 -2
- package/plugin/workflows/ai-engineering/rag-development.md +4 -4
- package/plugin/workflows/ai-ml/data-pipeline.md +188 -0
- package/plugin/workflows/ai-ml/experiment-cycle.md +203 -0
- package/plugin/workflows/ai-ml/feature-engineering.md +208 -0
- package/plugin/workflows/ai-ml/model-deployment.md +199 -0
- package/plugin/workflows/ai-ml/monitoring-setup.md +227 -0
- package/plugin/workflows/api/api-design.md +1 -1
- package/plugin/workflows/api/api-testing.md +2 -2
- package/plugin/workflows/content/technical-docs.md +1 -1
- package/plugin/workflows/database/migration.md +1 -1
- package/plugin/workflows/database/optimization.md +1 -1
- package/plugin/workflows/database/schema-design.md +3 -3
- package/plugin/workflows/development/bug-fix.md +3 -3
- package/plugin/workflows/development/code-review.md +2 -1
- package/plugin/workflows/development/feature.md +3 -3
- package/plugin/workflows/development/refactor.md +2 -2
- package/plugin/workflows/event-driven/consumer-groups.md +190 -0
- package/plugin/workflows/event-driven/event-storming.md +172 -0
- package/plugin/workflows/event-driven/replay-testing.md +186 -0
- package/plugin/workflows/event-driven/saga-implementation.md +206 -0
- package/plugin/workflows/event-driven/schema-evolution.md +173 -0
- package/plugin/workflows/fullstack/authentication.md +4 -4
- package/plugin/workflows/fullstack/full-feature.md +4 -4
- package/plugin/workflows/game-dev/content-pipeline.md +218 -0
- package/plugin/workflows/game-dev/platform-submission.md +263 -0
- package/plugin/workflows/game-dev/playtesting.md +237 -0
- package/plugin/workflows/game-dev/prototype-to-production.md +205 -0
- package/plugin/workflows/microservices/contract-first.md +151 -0
- package/plugin/workflows/microservices/distributed-tracing.md +166 -0
- package/plugin/workflows/microservices/domain-decomposition.md +123 -0
- package/plugin/workflows/microservices/integration-testing.md +149 -0
- package/plugin/workflows/microservices/service-mesh-setup.md +153 -0
- package/plugin/workflows/microservices/service-scaffolding.md +151 -0
- package/plugin/workflows/omega/1000x-innovation.md +2 -2
- package/plugin/workflows/omega/100x-architecture.md +2 -2
- package/plugin/workflows/omega/10x-improvement.md +2 -2
- package/plugin/workflows/quality/performance-optimization.md +2 -2
- package/plugin/workflows/research/best-practices.md +1 -1
- package/plugin/workflows/research/technology-research.md +1 -1
- package/plugin/workflows/security/penetration-testing.md +3 -3
- package/plugin/workflows/security/security-audit.md +3 -3
- package/plugin/workflows/sprint/sprint-execution.md +2 -2
- package/plugin/workflows/sprint/sprint-retrospective.md +1 -1
- package/plugin/workflows/sprint/sprint-setup.md +1 -1

package/plugin/skills/ai-ml/llm-ops/SKILL.md
@@ -0,0 +1,454 @@
# LLMOps

LLM deployment, prompt management, RAG pipelines, fine-tuning workflows, and LLM evaluation frameworks.

## Overview

LLMOps extends MLOps practices for Large Language Models, addressing unique challenges like prompt engineering, context management, and evaluation of generative outputs.

## Core Concepts

### LLM Lifecycle
- **Prompt Engineering**: Design and iterate prompts
- **Fine-tuning**: Adapt models to specific tasks
- **Deployment**: Serve models at scale
- **Evaluation**: Measure quality and safety
- **Monitoring**: Track performance and costs

### Key Challenges
- Non-deterministic outputs
- Context window limitations
- Cost management
- Latency optimization
- Safety and alignment

## Prompt Management

### Prompt Registry
```python
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from datetime import datetime
import hashlib

@dataclass
class PromptTemplate:
    name: str
    version: str
    template: str
    variables: List[str]
    model: str
    temperature: float = 0.7
    max_tokens: int = 1000
    metadata: Dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)

    @property
    def hash(self) -> str:
        # Fingerprint the fields that change model behavior
        content = f"{self.template}{self.model}{self.temperature}"
        return hashlib.sha256(content.encode()).hexdigest()[:12]

class PromptRegistry:
    def __init__(self, storage_backend):
        self.storage = storage_backend

    def register(self, prompt: PromptTemplate) -> str:
        # Refuse to reuse a version number for different content
        existing = self.get(prompt.name, prompt.version)
        if existing and existing.hash != prompt.hash:
            raise ValueError(f"Version {prompt.version} exists with different content")

        self.storage.save(prompt)
        return prompt.hash

    def get(self, name: str, version: str = "latest") -> Optional[PromptTemplate]:
        return self.storage.load(name, version)

    def list_versions(self, name: str) -> List[str]:
        return self.storage.list_versions(name)

    def render(self, name: str, version: str, variables: Dict) -> str:
        prompt = self.get(name, version)
        if prompt is None:
            raise KeyError(f"Prompt {name}@{version} not found")
        return prompt.template.format(**variables)

# Usage
registry = PromptRegistry(storage)  # any backend with save/load/list_versions

prompt = PromptTemplate(
    name="customer_support",
    version="1.2.0",
    template="""You are a helpful customer support agent for {company_name}.

Customer query: {query}

Respond helpfully and professionally. If you don't know the answer, say so.""",
    variables=["company_name", "query"],
    model="gpt-4",
    temperature=0.3
)

registry.register(prompt)
```
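
Call sites then pin an exact version and fill in the variables at render time; the company name and query here are illustrative:

```python
# Pinning "1.2.0" rather than "latest" keeps production behavior reproducible
text = registry.render(
    "customer_support", "1.2.0",
    {"company_name": "Acme", "query": "Where is my order?"},
)
```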

### A/B Testing Prompts
```python
# Continues from the registry example above (hashlib, datetime, typing imports)

class PromptExperiment:
    def __init__(
        self,
        name: str,
        variants: Dict[str, PromptTemplate],
        weights: Optional[Dict[str, float]] = None
    ):
        self.name = name
        self.variants = variants
        self.weights = weights or {k: 1.0 / len(variants) for k in variants}

    def select_variant(self, user_id: str) -> tuple[str, PromptTemplate]:
        # Deterministic selection based on user_id, so a given user
        # always sees the same variant
        hash_val = int(hashlib.md5(f"{self.name}:{user_id}".encode()).hexdigest(), 16)
        rand_val = (hash_val % 1000) / 1000

        cumulative = 0.0
        for variant_name, weight in self.weights.items():
            cumulative += weight
            if rand_val < cumulative:
                return variant_name, self.variants[variant_name]

        # Guard against floating-point rounding in the cumulative weights
        return list(self.variants.items())[-1]

class ExperimentTracker:
    def __init__(self, db):
        self.db = db

    def log_experiment(
        self,
        experiment_name: str,
        variant: str,
        user_id: str,
        input_data: dict,
        output: str,
        metrics: dict
    ):
        self.db.insert({
            "experiment": experiment_name,
            "variant": variant,
            "user_id": user_id,
            "input": input_data,
            "output": output,
            "metrics": metrics,
            "timestamp": datetime.now()
        })

    def get_variant_metrics(self, experiment_name: str) -> Dict:
        # Aggregate metrics per variant (MongoDB-style pipeline)
        return self.db.aggregate([
            {"$match": {"experiment": experiment_name}},
            {"$group": {
                "_id": "$variant",
                "count": {"$sum": 1},
                "avg_latency": {"$avg": "$metrics.latency"},
                "avg_quality": {"$avg": "$metrics.quality_score"}
            }}
        ])
```
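
A minimal sketch of wiring the two classes together; `prompt_v1`, `prompt_v2`, and `db` are stand-ins for registered `PromptTemplate`s and any store exposing `insert`/`aggregate`:

```python
experiment = PromptExperiment(
    name="support_tone",
    variants={"control": prompt_v1, "friendly": prompt_v2},
    weights={"control": 0.8, "friendly": 0.2},  # 80/20 traffic split
)

variant_name, template = experiment.select_variant(user_id="user-123")
# ... call the model with the selected template, then:
tracker = ExperimentTracker(db)
tracker.log_experiment(
    "support_tone", variant_name, "user-123",
    input_data={"query": "Where is my order?"},
    output="...",
    metrics={"latency": 0.9, "quality_score": 4.2},
)
```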

## RAG Pipeline

### Document Processing
```python
import os
from datetime import datetime
from typing import List, Optional

import openai
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

class RAGPipeline:
    def __init__(self, index_name: str):
        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

        pinecone.init(api_key=os.environ["PINECONE_API_KEY"])
        self.vectorstore = Pinecone.from_existing_index(
            index_name,
            self.embeddings
        )

    def ingest_documents(self, documents: List[Document]):
        # Split documents into overlapping chunks
        chunks = self.text_splitter.split_documents(documents)

        # Add metadata for provenance and deduplication
        for i, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = f"{chunk.metadata['source']}_{i}"
            chunk.metadata["ingested_at"] = datetime.now().isoformat()

        # Embed and store
        self.vectorstore.add_documents(chunks)

    def retrieve(
        self,
        query: str,
        k: int = 5,
        filter: Optional[dict] = None
    ) -> List[Document]:
        return self.vectorstore.similarity_search(
            query,
            k=k,
            filter=filter
        )

    def generate(
        self,
        query: str,
        context_docs: List[Document],
        model: str = "gpt-4"
    ) -> str:
        context = "\n\n".join([doc.page_content for doc in context_docs])

        prompt = f"""Answer the question based on the context below.

Context:
{context}

Question: {query}

Answer:"""

        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        return response.choices[0].message.content
```
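
End to end, ingestion and querying then look like the following sketch; the index name is illustrative, and `OPENAI_API_KEY`/`PINECONE_API_KEY` must be set in the environment:

```python
pipeline = RAGPipeline(index_name="support-kb")
pipeline.ingest_documents(docs)  # docs: List[Document], each with a "source" in metadata

question = "How do I reset my password?"
hits = pipeline.retrieve(question, k=5)
answer = pipeline.generate(question, hits)
```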

### Hybrid Search
```python
import numpy as np
from rank_bm25 import BM25Okapi

# Continues from the RAG pipeline example above (List, Document imports)

class HybridRetriever:
    def __init__(self, vectorstore, documents: List[Document]):
        self.vectorstore = vectorstore
        self.documents = documents
        # Index chunks by id so fused results can be mapped back to documents
        self.doc_map = {doc.metadata["chunk_id"]: doc for doc in documents}

        # BM25 for keyword search
        tokenized = [doc.page_content.split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized)

    def retrieve(
        self,
        query: str,
        k: int = 10,
        alpha: float = 0.5  # Weight for vector search
    ) -> List[Document]:
        # Vector search
        vector_results = self.vectorstore.similarity_search_with_score(query, k=k*2)

        # BM25 search
        tokenized_query = query.split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        bm25_top_k = np.argsort(bm25_scores)[-k*2:][::-1]

        # Combine scores with weighted RRF (Reciprocal Rank Fusion)
        scores = {}
        for rank, (doc, score) in enumerate(vector_results):
            doc_id = doc.metadata["chunk_id"]
            scores[doc_id] = scores.get(doc_id, 0) + alpha / (rank + 60)

        for rank, idx in enumerate(bm25_top_k):
            doc_id = self.documents[idx].metadata["chunk_id"]
            scores[doc_id] = scores.get(doc_id, 0) + (1 - alpha) / (rank + 60)

        # Sort and return top k
        sorted_ids = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)[:k]
        return [self.doc_map[doc_id] for doc_id in sorted_ids]
```
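
The fusion step is a weighted variant of Reciprocal Rank Fusion (RRF). With zero-based vector rank $r_v(d)$ and BM25 rank $r_b(d)$ as in the code above, each candidate's fused score is

$$\mathrm{score}(d) = \frac{\alpha}{r_v(d) + 60} + \frac{1 - \alpha}{r_b(d) + 60}$$

The constant 60 is the damping value conventionally used with RRF; it keeps a single top rank in either list from dominating the fused ordering.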

## LLM Evaluation

### Evaluation Framework
```python
import inspect
import json
from dataclasses import dataclass
from typing import Callable, Dict, List

import numpy as np
import openai

@dataclass
class EvalResult:
    score: float
    reasoning: str
    metadata: dict

class LLMEvaluator:
    def __init__(self, judge_model: str = "gpt-4"):
        self.judge_model = judge_model

    def evaluate_relevance(
        self,
        query: str,
        response: str,
        context: str
    ) -> EvalResult:
        prompt = f"""Rate the relevance of the response to the query on a scale of 1-5.

Query: {query}
Context: {context}
Response: {response}

Provide your rating and reasoning in JSON format:
{{"score": <1-5>, "reasoning": "<explanation>"}}"""

        result = openai.ChatCompletion.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        # Assumes the judge returns bare JSON
        parsed = json.loads(result.choices[0].message.content)
        return EvalResult(
            score=parsed["score"] / 5,  # Normalize to 0-1
            reasoning=parsed["reasoning"],
            metadata={"query": query}
        )

    def evaluate_faithfulness(
        self,
        response: str,
        context: str
    ) -> EvalResult:
        prompt = f"""Evaluate if the response is faithful to the context (no hallucinations).

Context: {context}
Response: {response}

Rate faithfulness 1-5 and explain any hallucinations:
{{"score": <1-5>, "reasoning": "<explanation>", "hallucinations": ["<list of hallucinated claims>"]}}"""

        result = openai.ChatCompletion.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        parsed = json.loads(result.choices[0].message.content)
        return EvalResult(
            score=parsed["score"] / 5,
            reasoning=parsed["reasoning"],
            metadata={"hallucinations": parsed.get("hallucinations", [])}
        )

class EvalPipeline:
    def __init__(self, evaluators: List[Callable]):
        self.evaluators = evaluators

    def run(self, test_cases: List[dict]) -> Dict:
        results = []
        for case in test_cases:
            case_results = {}
            for evaluator in self.evaluators:
                # Pass only the fields each evaluator's signature accepts
                params = inspect.signature(evaluator).parameters
                kwargs = {k: v for k, v in case.items() if k in params}
                case_results[evaluator.__name__] = evaluator(**kwargs)
            results.append(case_results)

        return {
            "individual": results,
            "aggregate": self._aggregate(results)
        }

    def _aggregate(self, results: List[dict]) -> dict:
        metrics = {}
        for metric in results[0].keys():
            scores = [r[metric].score for r in results]
            metrics[metric] = {
                "mean": np.mean(scores),
                "std": np.std(scores),
                "min": min(scores),
                "max": max(scores)
            }
        return metrics
```
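
Running both judges over a small test set; each case carries the union of fields the evaluators need, and `run` hands each evaluator only the fields its signature accepts:

```python
judge = LLMEvaluator()
eval_pipeline = EvalPipeline([judge.evaluate_relevance, judge.evaluate_faithfulness])

report = eval_pipeline.run([{
    "query": "What is the refund window?",
    "context": "Refunds are accepted within 30 days of purchase.",
    "response": "You can request a refund within 30 days.",
}])
print(report["aggregate"])  # mean/std/min/max per metric
```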

## Cost Management

### Token Tracking
```python
from functools import wraps

class TokenTracker:
    def __init__(self):
        self.usage = {}

    def track(self, model: str, prompt_tokens: int, completion_tokens: int):
        if model not in self.usage:
            self.usage[model] = {"prompt": 0, "completion": 0}

        self.usage[model]["prompt"] += prompt_tokens
        self.usage[model]["completion"] += completion_tokens

    def estimate_cost(self) -> float:
        # Illustrative USD prices per 1K tokens; check current provider pricing
        pricing = {
            "gpt-4": {"prompt": 0.03, "completion": 0.06},
            "gpt-4-turbo": {"prompt": 0.01, "completion": 0.03},
            "gpt-3.5-turbo": {"prompt": 0.0005, "completion": 0.0015}
        }

        total = 0.0
        for model, usage in self.usage.items():
            if model in pricing:
                total += (usage["prompt"] / 1000) * pricing[model]["prompt"]
                total += (usage["completion"] / 1000) * pricing[model]["completion"]
        return total

def track_tokens(tracker: TokenTracker):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            # OpenAI responses report token counts on the usage object
            tracker.track(
                model=kwargs.get("model", "gpt-4"),
                prompt_tokens=result.usage.prompt_tokens,
                completion_tokens=result.usage.completion_tokens
            )
            return result
        return wrapper
    return decorator
```
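
A sketch of the decorator in use, reusing the `openai` import from the examples above; the model and message are illustrative:

```python
tracker = TokenTracker()

@track_tokens(tracker)
def chat(**kwargs):
    return openai.ChatCompletion.create(**kwargs)

chat(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello"}])
print(f"Estimated spend so far: ${tracker.estimate_cost():.4f}")
```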

## Best Practices

1. **Version Prompts**: Track all prompt changes
2. **Cache Responses**: Reduce costs and latency (see the sketch below)
3. **Structured Outputs**: Use JSON mode when possible
4. **Fallback Models**: Have cheaper alternatives
5. **Rate Limiting**: Protect against cost spikes
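
A minimal in-memory sketch for item 2, assuming exact-match caching over (model, messages, temperature) is acceptable; a production cache would add TTLs and a shared store such as Redis:

```python
import hashlib
import json

class ResponseCache:
    def __init__(self):
        self._store = {}

    def _key(self, model: str, messages: list, temperature: float) -> str:
        # Canonical JSON so logically equal requests hash identically
        raw = json.dumps(
            {"model": model, "messages": messages, "temperature": temperature},
            sort_keys=True,
        )
        return hashlib.sha256(raw.encode()).hexdigest()

    def get_or_call(self, call, *, model, messages, temperature=0.0):
        # e.g. cache.get_or_call(openai.ChatCompletion.create, model=..., messages=...)
        key = self._key(model, messages, temperature)
        if key not in self._store:
            self._store[key] = call(model=model, messages=messages,
                                    temperature=temperature)
        return self._store[key]
```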

## Anti-Patterns

- Hardcoded prompts in code
- No evaluation pipeline
- Ignoring token costs
- Missing safety filters
- No prompt versioning

## When to Use

- Production LLM applications
- Multiple prompt iterations
- Team collaboration on prompts
- Cost-sensitive deployments
- RAG systems at scale

## When NOT to Use

- Simple one-off queries
- Prototyping phase
- No iteration expected
- Single developer project