ateschh-kit 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.claude/settings.local.json +4 -1
  2. package/CHANGELOG.md +15 -0
  3. package/CLAUDE.md +16 -16
  4. package/package.json +1 -2
  5. package/skills/build/SKILL.md +642 -0
  6. package/skills/cloudflare-workers-expert/SKILL.md +89 -0
  7. package/skills/docker-expert/SKILL.md +413 -0
  8. package/skills/electron-development/SKILL.md +856 -0
  9. package/skills/expo-api-routes/SKILL.md +368 -0
  10. package/skills/expo-deployment/SKILL.md +73 -0
  11. package/skills/fastapi-pro/SKILL.md +190 -0
  12. package/skills/flutter-expert/SKILL.md +197 -0
  13. package/skills/llm-app-patterns/SKILL.md +763 -0
  14. package/skills/nextjs-app-router-patterns/SKILL.md +36 -0
  15. package/skills/nextjs-best-practices/SKILL.md +208 -0
  16. package/skills/nodejs-backend-patterns/SKILL.md +38 -0
  17. package/skills/postgres-best-practices/SKILL.md +59 -0
  18. package/skills/prisma-expert/SKILL.md +361 -0
  19. package/skills/prompt-engineering/SKILL.md +177 -0
  20. package/skills/rag-implementation/SKILL.md +196 -0
  21. package/skills/react-best-practices/SKILL.md +127 -0
  22. package/skills/react-native-architecture/SKILL.md +36 -0
  23. package/skills/shadcn/SKILL.md +250 -0
  24. package/skills/supabase-automation/SKILL.md +240 -0
  25. package/skills/tailwind-design-system/SKILL.md +36 -0
  26. package/skills/typescript-expert/SKILL.md +426 -0
  27. package/skills/vercel-deployment/SKILL.md +80 -0
  28. /package/{workflows → .claude/commands}/_TEMPLATE.md +0 -0
  29. /package/{workflows → .claude/commands}/brainstorm.md +0 -0
  30. /package/{workflows → .claude/commands}/build.md +0 -0
  31. /package/{workflows → .claude/commands}/deploy.md +0 -0
  32. /package/{workflows → .claude/commands}/design.md +0 -0
  33. /package/{workflows → .claude/commands}/finish.md +0 -0
  34. /package/{workflows → .claude/commands}/map-codebase.md +0 -0
  35. /package/{workflows → .claude/commands}/new-project.md +0 -0
  36. /package/{workflows → .claude/commands}/next.md +0 -0
  37. /package/{workflows → .claude/commands}/quick.md +0 -0
  38. /package/{workflows → .claude/commands}/requirements.md +0 -0
  39. /package/{workflows → .claude/commands}/resume.md +0 -0
  40. /package/{workflows → .claude/commands}/save.md +0 -0
  41. /package/{workflows → .claude/commands}/settings.md +0 -0
  42. /package/{workflows → .claude/commands}/status.md +0 -0
  43. /package/{workflows → .claude/commands}/test.md +0 -0
@@ -0,0 +1,763 @@
1
+ ---
2
+ name: llm-app-patterns
3
+ description: "Production-ready patterns for building LLM applications. Covers RAG pipelines, agent architectures, prompt IDEs, and LLMOps monitoring. Use when designing AI applications, implementing RAG, buildin..."
4
+ risk: unknown
5
+ source: community
6
+ date_added: "2026-02-27"
7
+ ---
8
+
9
+ # 🤖 LLM Application Patterns
10
+
11
+ > Production-ready patterns for building LLM applications, inspired by [Dify](https://github.com/langgenius/dify) and industry best practices.
12
+
13
+ ## When to Use This Skill
14
+
15
+ Use this skill when:
16
+
17
+ - Designing LLM-powered applications
18
+ - Implementing RAG (Retrieval-Augmented Generation)
19
+ - Building AI agents with tools
20
+ - Setting up LLMOps monitoring
21
+ - Choosing between agent architectures
22
+
23
+ ---
24
+
25
+ ## 1. RAG Pipeline Architecture
26
+
27
+ ### Overview
28
+
29
+ RAG (Retrieval-Augmented Generation) grounds LLM responses in your data.
30
+
31
+ ```
32
+ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
33
+ │ Ingest │────▶│ Retrieve │────▶│ Generate │
34
+ │ Documents │ │ Context │ │ Response │
35
+ └─────────────┘ └─────────────┘ └─────────────┘
36
+ │ │ │
37
+ ▼ ▼ ▼
38
+ ┌─────────┐ ┌───────────┐ ┌───────────┐
39
+ │ Chunking│ │ Vector │ │ LLM │
40
+ │Embedding│ │ Search │ │ + Context│
41
+ └─────────┘ └───────────┘ └───────────┘
42
+ ```
43
+
44
+ ### 1.1 Document Ingestion
45
+
46
+ ```python
47
+ # Chunking strategies
48
+ class ChunkingStrategy:
49
+ # Fixed-size chunks (simple but may break context)
50
+ FIXED_SIZE = "fixed_size" # e.g., 512 tokens
51
+
52
+ # Semantic chunking (preserves meaning)
53
+ SEMANTIC = "semantic" # Split on paragraphs/sections
54
+
55
+ # Recursive splitting (tries multiple separators)
56
+ RECURSIVE = "recursive" # ["\n\n", "\n", " ", ""]
57
+
58
+ # Document-aware (respects structure)
59
+ DOCUMENT_AWARE = "document_aware" # Headers, lists, etc.
60
+
61
+ # Recommended settings
62
+ CHUNK_CONFIG = {
63
+ "chunk_size": 512, # tokens
64
+ "chunk_overlap": 50, # token overlap between chunks
65
+ "separators": ["\n\n", "\n", ". ", " "],
66
+ }
67
+ ```
68
+
69
+ ### 1.2 Embedding & Storage
70
+
71
+ ```python
72
+ # Vector database selection
73
+ VECTOR_DB_OPTIONS = {
74
+ "pinecone": {
75
+ "use_case": "Production, managed service",
76
+ "scale": "Billions of vectors",
77
+ "features": ["Hybrid search", "Metadata filtering"]
78
+ },
79
+ "weaviate": {
80
+ "use_case": "Self-hosted, multi-modal",
81
+ "scale": "Millions of vectors",
82
+ "features": ["GraphQL API", "Modules"]
83
+ },
84
+ "chromadb": {
85
+ "use_case": "Development, prototyping",
86
+ "scale": "Thousands of vectors",
87
+ "features": ["Simple API", "In-memory option"]
88
+ },
89
+ "pgvector": {
90
+ "use_case": "Existing Postgres infrastructure",
91
+ "scale": "Millions of vectors",
92
+ "features": ["SQL integration", "ACID compliance"]
93
+ }
94
+ }
95
+
96
+ # Embedding model selection
97
+ EMBEDDING_MODELS = {
98
+ "openai/text-embedding-3-small": {
99
+ "dimensions": 1536,
100
+ "cost": "$0.02/1M tokens",
101
+ "quality": "Good for most use cases"
102
+ },
103
+ "openai/text-embedding-3-large": {
104
+ "dimensions": 3072,
105
+ "cost": "$0.13/1M tokens",
106
+ "quality": "Best for complex queries"
107
+ },
108
+ "local/bge-large": {
109
+ "dimensions": 1024,
110
+ "cost": "Free (compute only)",
111
+ "quality": "Comparable to OpenAI small"
112
+ }
113
+ }
114
+ ```
115
+
116
+ ### 1.3 Retrieval Strategies
117
+
118
+ ```python
119
+ # Basic semantic search
120
+ def semantic_search(query: str, top_k: int = 5):
121
+ query_embedding = embed(query)
122
+ results = vector_db.similarity_search(
123
+ query_embedding,
124
+ top_k=top_k
125
+ )
126
+ return results
127
+
128
+ # Hybrid search (semantic + keyword)
129
+ def hybrid_search(query: str, top_k: int = 5, alpha: float = 0.5):
130
+ """
131
+ alpha=1.0: Pure semantic
132
+ alpha=0.0: Pure keyword (BM25)
133
+ alpha=0.5: Balanced
134
+ """
135
+ semantic_results = vector_db.similarity_search(query)
136
+ keyword_results = bm25_search(query)
137
+
138
+ # Reciprocal Rank Fusion
139
+ return rrf_merge(semantic_results, keyword_results, alpha)
140
+
141
+ # Multi-query retrieval
142
+ def multi_query_retrieval(query: str):
143
+ """Generate multiple query variations for better recall"""
144
+ queries = llm.generate_query_variations(query, n=3)
145
+ all_results = []
146
+ for q in queries:
147
+ all_results.extend(semantic_search(q))
148
+ return deduplicate(all_results)
149
+
150
+ # Contextual compression
151
+ def compressed_retrieval(query: str):
152
+ """Retrieve then compress to relevant parts only"""
153
+ docs = semantic_search(query, top_k=10)
154
+ compressed = llm.extract_relevant_parts(docs, query)
155
+ return compressed
156
+ ```
157
+
158
+ ### 1.4 Generation with Context
159
+
160
+ ```python
161
+ RAG_PROMPT_TEMPLATE = """
162
+ Answer the user's question based ONLY on the following context.
163
+ If the context doesn't contain enough information, say "I don't have enough information to answer that."
164
+
165
+ Context:
166
+ {context}
167
+
168
+ Question: {question}
169
+
170
+ Answer:"""
171
+
172
+ def generate_with_rag(question: str):
173
+ # Retrieve
174
+ context_docs = hybrid_search(question, top_k=5)
175
+ context = "\n\n".join([doc.content for doc in context_docs])
176
+
177
+ # Generate
178
+ prompt = RAG_PROMPT_TEMPLATE.format(
179
+ context=context,
180
+ question=question
181
+ )
182
+
183
+ response = llm.generate(prompt)
184
+
185
+ # Return with citations
186
+ return {
187
+ "answer": response,
188
+ "sources": [doc.metadata for doc in context_docs]
189
+ }
190
+ ```
191
+
192
+ ---
193
+
194
+ ## 2. Agent Architectures
195
+
196
+ ### 2.1 ReAct Pattern (Reasoning + Acting)
197
+
198
+ ```
199
+ Thought: I need to search for information about X
200
+ Action: search("X")
201
+ Observation: [search results]
202
+ Thought: Based on the results, I should...
203
+ Action: calculate(...)
204
+ Observation: [calculation result]
205
+ Thought: I now have enough information
206
+ Action: final_answer("The answer is...")
207
+ ```
208
+
209
+ ```python
210
+ REACT_PROMPT = """
211
+ You are an AI assistant that can use tools to answer questions.
212
+
213
+ Available tools:
214
+ {tools_description}
215
+
216
+ Use this format:
217
+ Thought: [your reasoning about what to do next]
218
+ Action: [tool_name(arguments)]
219
+ Observation: [tool result - this will be filled in]
220
+ ... (repeat Thought/Action/Observation as needed)
221
+ Thought: I have enough information to answer
222
+ Final Answer: [your final response]
223
+
224
+ Question: {question}
225
+ """
226
+
227
+ class ReActAgent:
228
+ def __init__(self, tools: list, llm):
229
+ self.tools = {t.name: t for t in tools}
230
+ self.llm = llm
231
+ self.max_iterations = 10
232
+
233
+ def run(self, question: str) -> str:
234
+ prompt = REACT_PROMPT.format(
235
+ tools_description=self._format_tools(),
236
+ question=question
237
+ )
238
+
239
+ for _ in range(self.max_iterations):
240
+ response = self.llm.generate(prompt)
241
+
242
+ if "Final Answer:" in response:
243
+ return self._extract_final_answer(response)
244
+
245
+ action = self._parse_action(response)
246
+ observation = self._execute_tool(action)
247
+ prompt += f"\nObservation: {observation}\n"
248
+
249
+ return "Max iterations reached"
250
+ ```
251
+
252
+ ### 2.2 Function Calling Pattern
253
+
254
+ ```python
255
+ # Define tools as functions with schemas
256
+ TOOLS = [
257
+ {
258
+ "name": "search_web",
259
+ "description": "Search the web for current information",
260
+ "parameters": {
261
+ "type": "object",
262
+ "properties": {
263
+ "query": {
264
+ "type": "string",
265
+ "description": "Search query"
266
+ }
267
+ },
268
+ "required": ["query"]
269
+ }
270
+ },
271
+ {
272
+ "name": "calculate",
273
+ "description": "Perform mathematical calculations",
274
+ "parameters": {
275
+ "type": "object",
276
+ "properties": {
277
+ "expression": {
278
+ "type": "string",
279
+ "description": "Math expression to evaluate"
280
+ }
281
+ },
282
+ "required": ["expression"]
283
+ }
284
+ }
285
+ ]
286
+
287
+ class FunctionCallingAgent:
288
+ def run(self, question: str) -> str:
289
+ messages = [{"role": "user", "content": question}]
290
+
291
+ while True:
292
+ response = self.llm.chat(
293
+ messages=messages,
294
+ tools=TOOLS,
295
+ tool_choice="auto"
296
+ )
297
+
298
+ if response.tool_calls:
299
+ for tool_call in response.tool_calls:
300
+ result = self._execute_tool(
301
+ tool_call.name,
302
+ tool_call.arguments
303
+ )
304
+ messages.append({
305
+ "role": "tool",
306
+ "tool_call_id": tool_call.id,
307
+ "content": str(result)
308
+ })
309
+ else:
310
+ return response.content
311
+ ```
312
+
313
+ ### 2.3 Plan-and-Execute Pattern
314
+
315
+ ```python
316
+ class PlanAndExecuteAgent:
317
+ """
318
+ 1. Create a plan (list of steps)
319
+ 2. Execute each step
320
+ 3. Replan if needed
321
+ """
322
+
323
+ def run(self, task: str) -> str:
324
+ # Planning phase
325
+ plan = self.planner.create_plan(task)
326
+ # Returns: ["Step 1: ...", "Step 2: ...", ...]
327
+
328
+ results = []
329
+ for step in plan:
330
+ # Execute each step
331
+ result = self.executor.execute(step, context=results)
332
+ results.append(result)
333
+
334
+ # Check if replan needed
335
+ if self._needs_replan(task, results):
336
+ new_plan = self.planner.replan(
337
+ task,
338
+ completed=results,
339
+ remaining=plan[len(results):]
340
+ )
341
+ plan = new_plan
342
+
343
+ # Synthesize final answer
344
+ return self.synthesizer.summarize(task, results)
345
+ ```
346
+
347
+ ### 2.4 Multi-Agent Collaboration
348
+
349
+ ```python
350
+ class AgentTeam:
351
+ """
352
+ Specialized agents collaborating on complex tasks
353
+ """
354
+
355
+ def __init__(self):
356
+ self.agents = {
357
+ "researcher": ResearchAgent(),
358
+ "analyst": AnalystAgent(),
359
+ "writer": WriterAgent(),
360
+ "critic": CriticAgent()
361
+ }
362
+ self.coordinator = CoordinatorAgent()
363
+
364
+ def solve(self, task: str) -> str:
365
+ # Coordinator assigns subtasks
366
+ assignments = self.coordinator.decompose(task)
367
+
368
+ results = {}
369
+ for assignment in assignments:
370
+ agent = self.agents[assignment.agent]
371
+ result = agent.execute(
372
+ assignment.subtask,
373
+ context=results
374
+ )
375
+ results[assignment.id] = result
376
+
377
+ # Critic reviews
378
+ critique = self.agents["critic"].review(results)
379
+
380
+ if critique.needs_revision:
381
+ # Iterate with feedback
382
+ return self.solve_with_feedback(task, results, critique)
383
+
384
+ return self.coordinator.synthesize(results)
385
+ ```
386
+
387
+ ---
388
+
389
+ ## 3. Prompt IDE Patterns
390
+
391
+ ### 3.1 Prompt Templates with Variables
392
+
393
+ ```python
394
+ class PromptTemplate:
395
+ def __init__(self, template: str, variables: list[str]):
396
+ self.template = template
397
+ self.variables = variables
398
+
399
+ def format(self, **kwargs) -> str:
400
+ # Validate all variables provided
401
+ missing = set(self.variables) - set(kwargs.keys())
402
+ if missing:
403
+ raise ValueError(f"Missing variables: {missing}")
404
+
405
+ return self.template.format(**kwargs)
406
+
407
+ def with_examples(self, examples: list[dict]) -> str:
408
+ """Add few-shot examples"""
409
+ example_text = "\n\n".join([
410
+ f"Input: {ex['input']}\nOutput: {ex['output']}"
411
+ for ex in examples
412
+ ])
413
+ return f"{example_text}\n\n{self.template}"
414
+
415
+ # Usage
416
+ summarizer = PromptTemplate(
417
+ template="Summarize the following text in {style} style:\n\n{text}",
418
+ variables=["style", "text"]
419
+ )
420
+
421
+ prompt = summarizer.format(
422
+ style="professional",
423
+ text="Long article content..."
424
+ )
425
+ ```
426
+
427
+ ### 3.2 Prompt Versioning & A/B Testing
428
+
429
+ ```python
430
+ class PromptRegistry:
431
+ def __init__(self, db):
432
+ self.db = db
433
+
434
+ def register(self, name: str, template: str, version: str):
435
+ """Store prompt with version"""
436
+ self.db.save({
437
+ "name": name,
438
+ "template": template,
439
+ "version": version,
440
+ "created_at": datetime.now(),
441
+ "metrics": {}
442
+ })
443
+
444
+ def get(self, name: str, version: str = "latest") -> str:
445
+ """Retrieve specific version"""
446
+ return self.db.get(name, version)
447
+
448
+ def ab_test(self, name: str, user_id: str) -> str:
449
+ """Return variant based on user bucket"""
450
+ variants = self.db.get_all_versions(name)
451
+ bucket = hash(user_id) % len(variants)
452
+ return variants[bucket]
453
+
454
+ def record_outcome(self, prompt_id: str, outcome: dict):
455
+ """Track prompt performance"""
456
+ self.db.update_metrics(prompt_id, outcome)
457
+ ```
458
+
459
+ ### 3.3 Prompt Chaining
460
+
461
+ ```python
462
+ class PromptChain:
463
+ """
464
+ Chain prompts together, passing output as input to next
465
+ """
466
+
467
+ def __init__(self, steps: list[dict]):
468
+ self.steps = steps
469
+
470
+ def run(self, initial_input: str) -> dict:
471
+ context = {"input": initial_input}
472
+ results = []
473
+
474
+ for step in self.steps:
475
+ prompt = step["prompt"].format(**context)
476
+ output = llm.generate(prompt)
477
+
478
+ # Parse output if needed
479
+ if step.get("parser"):
480
+ output = step["parser"](output)
481
+
482
+ context[step["output_key"]] = output
483
+ results.append({
484
+ "step": step["name"],
485
+ "output": output
486
+ })
487
+
488
+ return {
489
+ "final_output": context[self.steps[-1]["output_key"]],
490
+ "intermediate_results": results
491
+ }
492
+
493
+ # Example: Research → Analyze → Summarize
494
+ chain = PromptChain([
495
+ {
496
+ "name": "research",
497
+ "prompt": "Research the topic: {input}",
498
+ "output_key": "research"
499
+ },
500
+ {
501
+ "name": "analyze",
502
+ "prompt": "Analyze these findings:\n{research}",
503
+ "output_key": "analysis"
504
+ },
505
+ {
506
+ "name": "summarize",
507
+ "prompt": "Summarize this analysis in 3 bullet points:\n{analysis}",
508
+ "output_key": "summary"
509
+ }
510
+ ])
511
+ ```
512
+
513
+ ---
514
+
515
+ ## 4. LLMOps & Observability
516
+
517
+ ### 4.1 Metrics to Track
518
+
519
+ ```python
520
+ LLM_METRICS = {
521
+ # Performance
522
+ "latency_p50": "50th percentile response time",
523
+ "latency_p99": "99th percentile response time",
524
+ "tokens_per_second": "Generation speed",
525
+
526
+ # Quality
527
+ "user_satisfaction": "Thumbs up/down ratio",
528
+ "task_completion": "% tasks completed successfully",
529
+ "hallucination_rate": "% responses with factual errors",
530
+
531
+ # Cost
532
+ "cost_per_request": "Average $ per API call",
533
+ "tokens_per_request": "Average tokens used",
534
+ "cache_hit_rate": "% requests served from cache",
535
+
536
+ # Reliability
537
+ "error_rate": "% failed requests",
538
+ "timeout_rate": "% requests that timed out",
539
+ "retry_rate": "% requests needing retry"
540
+ }
541
+ ```
542
+
543
+ ### 4.2 Logging & Tracing
544
+
545
+ ```python
546
+ import logging
547
+ from opentelemetry import trace
548
+
549
+ tracer = trace.get_tracer(__name__)
550
+
551
+ class LLMLogger:
552
+ def log_request(self, request_id: str, data: dict):
553
+ """Log LLM request for debugging and analysis"""
554
+ log_entry = {
555
+ "request_id": request_id,
556
+ "timestamp": datetime.now().isoformat(),
557
+ "model": data["model"],
558
+ "prompt": data["prompt"][:500], # Truncate for storage
559
+ "prompt_tokens": data["prompt_tokens"],
560
+ "temperature": data.get("temperature", 1.0),
561
+ "user_id": data.get("user_id"),
562
+ }
563
+ logging.info(f"LLM_REQUEST: {json.dumps(log_entry)}")
564
+
565
+ def log_response(self, request_id: str, data: dict):
566
+ """Log LLM response"""
567
+ log_entry = {
568
+ "request_id": request_id,
569
+ "completion_tokens": data["completion_tokens"],
570
+ "total_tokens": data["total_tokens"],
571
+ "latency_ms": data["latency_ms"],
572
+ "finish_reason": data["finish_reason"],
573
+ "cost_usd": self._calculate_cost(data),
574
+ }
575
+ logging.info(f"LLM_RESPONSE: {json.dumps(log_entry)}")
576
+
577
+ # Distributed tracing
578
+ @tracer.start_as_current_span("llm_call")
579
+ def call_llm(prompt: str) -> str:
580
+ span = trace.get_current_span()
581
+ span.set_attribute("prompt.length", len(prompt))
582
+
583
+ response = llm.generate(prompt)
584
+
585
+ span.set_attribute("response.length", len(response))
586
+ span.set_attribute("tokens.total", response.usage.total_tokens)
587
+
588
+ return response.content
589
+ ```
590
+
591
+ ### 4.3 Evaluation Framework
592
+
593
+ ```python
594
+ class LLMEvaluator:
595
+ """
596
+ Evaluate LLM outputs for quality
597
+ """
598
+
599
+ def evaluate_response(self,
600
+ question: str,
601
+ response: str,
602
+ ground_truth: str = None) -> dict:
603
+ scores = {}
604
+
605
+ # Relevance: Does it answer the question?
606
+ scores["relevance"] = self._score_relevance(question, response)
607
+
608
+ # Coherence: Is it well-structured?
609
+ scores["coherence"] = self._score_coherence(response)
610
+
611
+ # Groundedness: Is it based on provided context?
612
+ scores["groundedness"] = self._score_groundedness(response)
613
+
614
+ # Accuracy: Does it match ground truth?
615
+ if ground_truth:
616
+ scores["accuracy"] = self._score_accuracy(response, ground_truth)
617
+
618
+ # Harmfulness: Is it safe?
619
+ scores["safety"] = self._score_safety(response)
620
+
621
+ return scores
622
+
623
+ def run_benchmark(self, test_cases: list[dict]) -> dict:
624
+ """Run evaluation on test set"""
625
+ results = []
626
+ for case in test_cases:
627
+ response = llm.generate(case["prompt"])
628
+ scores = self.evaluate_response(
629
+ question=case["prompt"],
630
+ response=response,
631
+ ground_truth=case.get("expected")
632
+ )
633
+ results.append(scores)
634
+
635
+ return self._aggregate_scores(results)
636
+ ```
637
+
638
+ ---
639
+
640
+ ## 5. Production Patterns
641
+
642
+ ### 5.1 Caching Strategy
643
+
644
+ ```python
645
+ import hashlib
646
+ from functools import lru_cache
647
+
648
+ class LLMCache:
649
+ def __init__(self, redis_client, ttl_seconds=3600):
650
+ self.redis = redis_client
651
+ self.ttl = ttl_seconds
652
+
653
+ def _cache_key(self, prompt: str, model: str, **kwargs) -> str:
654
+ """Generate deterministic cache key"""
655
+ content = f"{model}:{prompt}:{json.dumps(kwargs, sort_keys=True)}"
656
+ return hashlib.sha256(content.encode()).hexdigest()
657
+
658
+ def get_or_generate(self, prompt: str, model: str, **kwargs) -> str:
659
+ key = self._cache_key(prompt, model, **kwargs)
660
+
661
+ # Check cache
662
+ cached = self.redis.get(key)
663
+ if cached:
664
+ return cached.decode()
665
+
666
+ # Generate
667
+ response = llm.generate(prompt, model=model, **kwargs)
668
+
669
+ # Cache (only cache deterministic outputs)
670
+ if kwargs.get("temperature", 1.0) == 0:
671
+ self.redis.setex(key, self.ttl, response)
672
+
673
+ return response
674
+ ```
675
+
676
+ ### 5.2 Rate Limiting & Retry
677
+
678
+ ```python
679
+ import time
680
+ from tenacity import retry, wait_exponential, stop_after_attempt
681
+
682
+ class RateLimiter:
683
+ def __init__(self, requests_per_minute: int):
684
+ self.rpm = requests_per_minute
685
+ self.timestamps = []
686
+
687
+ def acquire(self):
688
+ """Wait if rate limit would be exceeded"""
689
+ now = time.time()
690
+
691
+ # Remove old timestamps
692
+ self.timestamps = [t for t in self.timestamps if now - t < 60]
693
+
694
+ if len(self.timestamps) >= self.rpm:
695
+ sleep_time = 60 - (now - self.timestamps[0])
696
+ time.sleep(sleep_time)
697
+
698
+ self.timestamps.append(time.time())
699
+
700
+ # Retry with exponential backoff
701
+ @retry(
702
+ wait=wait_exponential(multiplier=1, min=4, max=60),
703
+ stop=stop_after_attempt(5)
704
+ )
705
+ def call_llm_with_retry(prompt: str) -> str:
706
+ try:
707
+ return llm.generate(prompt)
708
+ except RateLimitError:
709
+ raise # Will trigger retry
710
+ except APIError as e:
711
+ if e.status_code >= 500:
712
+ raise # Retry server errors
713
+ raise # Don't retry client errors
714
+ ```
715
+
716
+ ### 5.3 Fallback Strategy
717
+
718
+ ```python
719
+ class LLMWithFallback:
720
+ def __init__(self, primary: str, fallbacks: list[str]):
721
+ self.primary = primary
722
+ self.fallbacks = fallbacks
723
+
724
+ def generate(self, prompt: str, **kwargs) -> str:
725
+ models = [self.primary] + self.fallbacks
726
+
727
+ for model in models:
728
+ try:
729
+ return llm.generate(prompt, model=model, **kwargs)
730
+ except (RateLimitError, APIError) as e:
731
+ logging.warning(f"Model {model} failed: {e}")
732
+ continue
733
+
734
+ raise AllModelsFailedError("All models exhausted")
735
+
736
+ # Usage
737
+ llm_client = LLMWithFallback(
738
+ primary="gpt-4-turbo",
739
+ fallbacks=["gpt-3.5-turbo", "claude-3-sonnet"]
740
+ )
741
+ ```
742
+
743
+ ---
744
+
745
+ ## Architecture Decision Matrix
746
+
747
+ | Pattern | Use When | Complexity | Cost |
748
+ | :------------------- | :--------------- | :--------- | :-------- |
749
+ | **Simple RAG** | FAQ, docs search | Low | Low |
750
+ | **Hybrid RAG** | Mixed queries | Medium | Medium |
751
+ | **ReAct Agent** | Multi-step tasks | Medium | Medium |
752
+ | **Function Calling** | Structured tools | Low | Low |
753
+ | **Plan-Execute** | Complex tasks | High | High |
754
+ | **Multi-Agent** | Research tasks | Very High | Very High |
755
+
756
+ ---
757
+
758
+ ## Resources
759
+
760
+ - [Dify Platform](https://github.com/langgenius/dify)
761
+ - [LangChain Docs](https://python.langchain.com/)
762
+ - [LlamaIndex](https://www.llamaindex.ai/)
763
+ - [Anthropic Cookbook](https://github.com/anthropics/anthropic-cookbook)