tribunal-kit 3.0.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. package/.agent/ARCHITECTURE.md +99 -99
  2. package/.agent/GEMINI.md +52 -52
  3. package/.agent/agents/accessibility-reviewer.md +187 -220
  4. package/.agent/agents/ai-code-reviewer.md +199 -233
  5. package/.agent/agents/backend-specialist.md +215 -238
  6. package/.agent/agents/code-archaeologist.md +161 -181
  7. package/.agent/agents/database-architect.md +184 -207
  8. package/.agent/agents/debugger.md +191 -218
  9. package/.agent/agents/dependency-reviewer.md +103 -136
  10. package/.agent/agents/devops-engineer.md +218 -238
  11. package/.agent/agents/documentation-writer.md +201 -221
  12. package/.agent/agents/explorer-agent.md +160 -180
  13. package/.agent/agents/frontend-reviewer.md +160 -194
  14. package/.agent/agents/frontend-specialist.md +248 -237
  15. package/.agent/agents/game-developer.md +48 -52
  16. package/.agent/agents/logic-reviewer.md +116 -149
  17. package/.agent/agents/mobile-developer.md +200 -223
  18. package/.agent/agents/mobile-reviewer.md +162 -195
  19. package/.agent/agents/orchestrator.md +181 -211
  20. package/.agent/agents/penetration-tester.md +157 -174
  21. package/.agent/agents/performance-optimizer.md +183 -203
  22. package/.agent/agents/performance-reviewer.md +178 -211
  23. package/.agent/agents/precedence-reviewer.md +213 -0
  24. package/.agent/agents/product-manager.md +142 -162
  25. package/.agent/agents/product-owner.md +6 -25
  26. package/.agent/agents/project-planner.md +142 -162
  27. package/.agent/agents/qa-automation-engineer.md +225 -242
  28. package/.agent/agents/security-auditor.md +174 -194
  29. package/.agent/agents/seo-specialist.md +193 -213
  30. package/.agent/agents/sql-reviewer.md +161 -194
  31. package/.agent/agents/supervisor-agent.md +184 -203
  32. package/.agent/agents/swarm-worker-contracts.md +17 -17
  33. package/.agent/agents/swarm-worker-registry.md +46 -46
  34. package/.agent/agents/test-coverage-reviewer.md +160 -193
  35. package/.agent/agents/test-engineer.md +0 -21
  36. package/.agent/agents/type-safety-reviewer.md +175 -208
  37. package/.agent/patterns/generator.md +9 -9
  38. package/.agent/patterns/inversion.md +12 -12
  39. package/.agent/patterns/pipeline.md +9 -9
  40. package/.agent/patterns/reviewer.md +13 -13
  41. package/.agent/patterns/tool-wrapper.md +9 -9
  42. package/.agent/rules/GEMINI.md +63 -63
  43. package/.agent/scripts/append_flow.js +72 -0
  44. package/.agent/scripts/case_law_manager.py +525 -0
  45. package/.agent/scripts/compress_skills.py +167 -0
  46. package/.agent/scripts/consolidate_skills.py +173 -0
  47. package/.agent/scripts/deep_compress.py +202 -0
  48. package/.agent/scripts/minify_context.py +80 -0
  49. package/.agent/scripts/security_scan.py +1 -1
  50. package/.agent/scripts/skill_evolution.py +563 -0
  51. package/.agent/scripts/strip_tribunal.py +41 -0
  52. package/.agent/skills/agent-organizer/SKILL.md +100 -126
  53. package/.agent/skills/agentic-patterns/SKILL.md +0 -70
  54. package/.agent/skills/ai-prompt-injection-defense/SKILL.md +134 -160
  55. package/.agent/skills/api-patterns/SKILL.md +123 -215
  56. package/.agent/skills/api-security-auditor/SKILL.md +143 -177
  57. package/.agent/skills/app-builder/SKILL.md +334 -50
  58. package/.agent/skills/app-builder/templates/SKILL.md +13 -15
  59. package/.agent/skills/app-builder/templates/astro-static/TEMPLATE.md +16 -16
  60. package/.agent/skills/app-builder/templates/chrome-extension/TEMPLATE.md +22 -22
  61. package/.agent/skills/app-builder/templates/cli-tool/TEMPLATE.md +18 -18
  62. package/.agent/skills/app-builder/templates/electron-desktop/TEMPLATE.md +20 -20
  63. package/.agent/skills/app-builder/templates/express-api/TEMPLATE.md +17 -17
  64. package/.agent/skills/app-builder/templates/flutter-app/TEMPLATE.md +18 -18
  65. package/.agent/skills/app-builder/templates/monorepo-turborepo/TEMPLATE.md +21 -21
  66. package/.agent/skills/app-builder/templates/nextjs-fullstack/TEMPLATE.md +19 -19
  67. package/.agent/skills/app-builder/templates/nextjs-saas/TEMPLATE.md +26 -26
  68. package/.agent/skills/app-builder/templates/nextjs-static/TEMPLATE.md +26 -26
  69. package/.agent/skills/app-builder/templates/nuxt-app/TEMPLATE.md +19 -19
  70. package/.agent/skills/app-builder/templates/python-fastapi/TEMPLATE.md +18 -18
  71. package/.agent/skills/app-builder/templates/react-native-app/TEMPLATE.md +20 -20
  72. package/.agent/skills/appflow-wireframe/SKILL.md +95 -121
  73. package/.agent/skills/architecture/SKILL.md +169 -331
  74. package/.agent/skills/authentication-best-practices/SKILL.md +139 -173
  75. package/.agent/skills/bash-linux/SKILL.md +129 -154
  76. package/.agent/skills/behavioral-modes/SKILL.md +8 -69
  77. package/.agent/skills/brainstorming/SKILL.md +436 -104
  78. package/.agent/skills/building-native-ui/SKILL.md +152 -174
  79. package/.agent/skills/clean-code/SKILL.md +331 -360
  80. package/.agent/skills/code-review-checklist/SKILL.md +0 -62
  81. package/.agent/skills/config-validator/SKILL.md +115 -141
  82. package/.agent/skills/csharp-developer/SKILL.md +468 -528
  83. package/.agent/skills/database-design/SKILL.md +104 -369
  84. package/.agent/skills/deployment-procedures/SKILL.md +119 -145
  85. package/.agent/skills/devops-engineer/SKILL.md +295 -332
  86. package/.agent/skills/devops-incident-responder/SKILL.md +87 -113
  87. package/.agent/skills/doc.md +5 -5
  88. package/.agent/skills/documentation-templates/SKILL.md +27 -63
  89. package/.agent/skills/edge-computing/SKILL.md +131 -157
  90. package/.agent/skills/extract-design-system/SKILL.md +108 -134
  91. package/.agent/skills/framer-motion-expert/SKILL.md +111 -855
  92. package/.agent/skills/frontend-design/SKILL.md +151 -499
  93. package/.agent/skills/game-design-expert/SKILL.md +79 -105
  94. package/.agent/skills/game-engineering-expert/SKILL.md +96 -122
  95. package/.agent/skills/geo-fundamentals/SKILL.md +97 -124
  96. package/.agent/skills/github-operations/SKILL.md +279 -314
  97. package/.agent/skills/gsap-expert/SKILL.md +119 -826
  98. package/.agent/skills/i18n-localization/SKILL.md +113 -138
  99. package/.agent/skills/intelligent-routing/SKILL.md +167 -127
  100. package/.agent/skills/lint-and-validate/SKILL.md +16 -52
  101. package/.agent/skills/llm-engineering/SKILL.md +344 -357
  102. package/.agent/skills/local-first/SKILL.md +128 -154
  103. package/.agent/skills/mcp-builder/SKILL.md +92 -118
  104. package/.agent/skills/mobile-design/SKILL.md +213 -219
  105. package/.agent/skills/motion-engineering/SKILL.md +184 -0
  106. package/.agent/skills/nextjs-react-expert/SKILL.md +99 -698
  107. package/.agent/skills/nodejs-best-practices/SKILL.md +498 -559
  108. package/.agent/skills/observability/SKILL.md +293 -330
  109. package/.agent/skills/parallel-agents/SKILL.md +96 -122
  110. package/.agent/skills/performance-profiling/SKILL.md +217 -254
  111. package/.agent/skills/plan-writing/SKILL.md +92 -118
  112. package/.agent/skills/platform-engineer/SKILL.md +97 -123
  113. package/.agent/skills/playwright-best-practices/SKILL.md +137 -162
  114. package/.agent/skills/powershell-windows/SKILL.md +112 -146
  115. package/.agent/skills/project-idioms/SKILL.md +87 -0
  116. package/.agent/skills/python-patterns/SKILL.md +15 -35
  117. package/.agent/skills/python-pro/SKILL.md +148 -754
  118. package/.agent/skills/react-specialist/SKILL.md +123 -827
  119. package/.agent/skills/readme-builder/SKILL.md +23 -85
  120. package/.agent/skills/realtime-patterns/SKILL.md +269 -304
  121. package/.agent/skills/red-team-tactics/SKILL.md +18 -51
  122. package/.agent/skills/rust-pro/SKILL.md +623 -701
  123. package/.agent/skills/seo-fundamentals/SKILL.md +129 -154
  124. package/.agent/skills/server-management/SKILL.md +164 -190
  125. package/.agent/skills/shadcn-ui-expert/SKILL.md +181 -206
  126. package/.agent/skills/skill-creator/SKILL.md +24 -56
  127. package/.agent/skills/sql-pro/SKILL.md +579 -633
  128. package/.agent/skills/supabase-postgres-best-practices/SKILL.md +35 -66
  129. package/.agent/skills/swiftui-expert/SKILL.md +151 -176
  130. package/.agent/skills/systematic-debugging/SKILL.md +92 -118
  131. package/.agent/skills/tailwind-patterns/SKILL.md +516 -576
  132. package/.agent/skills/tdd-workflow/SKILL.md +111 -137
  133. package/.agent/skills/test-result-analyzer/SKILL.md +33 -73
  134. package/.agent/skills/testing-patterns/SKILL.md +512 -573
  135. package/.agent/skills/trend-researcher/SKILL.md +30 -71
  136. package/.agent/skills/ui-ux-pro-max/SKILL.md +8 -41
  137. package/.agent/skills/ui-ux-researcher/SKILL.md +51 -91
  138. package/.agent/skills/vue-expert/SKILL.md +127 -866
  139. package/.agent/skills/vulnerability-scanner/SKILL.md +354 -269
  140. package/.agent/skills/web-accessibility-auditor/SKILL.md +168 -193
  141. package/.agent/skills/web-design-guidelines/SKILL.md +25 -61
  142. package/.agent/skills/webapp-testing/SKILL.md +119 -145
  143. package/.agent/skills/whimsy-injector/SKILL.md +58 -132
  144. package/.agent/skills/workflow-optimizer/SKILL.md +28 -68
  145. package/.agent/workflows/api-tester.md +151 -151
  146. package/.agent/workflows/audit.md +127 -138
  147. package/.agent/workflows/brainstorm.md +110 -110
  148. package/.agent/workflows/changelog.md +112 -112
  149. package/.agent/workflows/create.md +124 -124
  150. package/.agent/workflows/debug.md +165 -189
  151. package/.agent/workflows/deploy.md +180 -189
  152. package/.agent/workflows/enhance.md +128 -151
  153. package/.agent/workflows/fix.md +114 -135
  154. package/.agent/workflows/generate.md +13 -4
  155. package/.agent/workflows/migrate.md +160 -160
  156. package/.agent/workflows/orchestrate.md +168 -168
  157. package/.agent/workflows/performance-benchmarker.md +114 -123
  158. package/.agent/workflows/plan.md +173 -173
  159. package/.agent/workflows/preview.md +80 -80
  160. package/.agent/workflows/refactor.md +161 -183
  161. package/.agent/workflows/review-ai.md +101 -129
  162. package/.agent/workflows/review.md +116 -116
  163. package/.agent/workflows/session.md +94 -94
  164. package/.agent/workflows/status.md +79 -79
  165. package/.agent/workflows/strengthen-skills.md +138 -139
  166. package/.agent/workflows/swarm.md +179 -179
  167. package/.agent/workflows/test.md +189 -211
  168. package/.agent/workflows/tribunal-backend.md +94 -113
  169. package/.agent/workflows/tribunal-database.md +95 -115
  170. package/.agent/workflows/tribunal-frontend.md +96 -118
  171. package/.agent/workflows/tribunal-full.md +93 -133
  172. package/.agent/workflows/tribunal-mobile.md +95 -119
  173. package/.agent/workflows/tribunal-performance.md +110 -133
  174. package/.agent/workflows/ui-ux-pro-max.md +122 -143
  175. package/README.md +30 -1
  176. package/bin/tribunal-kit.js +175 -12
  177. package/package.json +25 -4
  178. package/.agent/skills/api-patterns/api-style.md +0 -42
  179. package/.agent/skills/api-patterns/auth.md +0 -24
  180. package/.agent/skills/api-patterns/documentation.md +0 -26
  181. package/.agent/skills/api-patterns/graphql.md +0 -41
  182. package/.agent/skills/api-patterns/rate-limiting.md +0 -31
  183. package/.agent/skills/api-patterns/response.md +0 -37
  184. package/.agent/skills/api-patterns/rest.md +0 -40
  185. package/.agent/skills/api-patterns/security-testing.md +0 -122
  186. package/.agent/skills/api-patterns/trpc.md +0 -41
  187. package/.agent/skills/api-patterns/versioning.md +0 -22
  188. package/.agent/skills/app-builder/agent-coordination.md +0 -71
  189. package/.agent/skills/app-builder/feature-building.md +0 -53
  190. package/.agent/skills/app-builder/project-detection.md +0 -34
  191. package/.agent/skills/app-builder/scaffolding.md +0 -118
  192. package/.agent/skills/app-builder/tech-stack.md +0 -40
  193. package/.agent/skills/architecture/context-discovery.md +0 -43
  194. package/.agent/skills/architecture/examples.md +0 -94
  195. package/.agent/skills/architecture/pattern-selection.md +0 -68
  196. package/.agent/skills/architecture/patterns-reference.md +0 -50
  197. package/.agent/skills/architecture/trade-off-analysis.md +0 -77
  198. package/.agent/skills/brainstorming/dynamic-questioning.md +0 -360
  199. package/.agent/skills/database-design/database-selection.md +0 -43
  200. package/.agent/skills/database-design/indexing.md +0 -39
  201. package/.agent/skills/database-design/migrations.md +0 -48
  202. package/.agent/skills/database-design/optimization.md +0 -36
  203. package/.agent/skills/database-design/orm-selection.md +0 -30
  204. package/.agent/skills/database-design/schema-design.md +0 -56
  205. package/.agent/skills/frontend-design/animation-guide.md +0 -331
  206. package/.agent/skills/frontend-design/color-system.md +0 -329
  207. package/.agent/skills/frontend-design/decision-trees.md +0 -418
  208. package/.agent/skills/frontend-design/motion-graphics.md +0 -306
  209. package/.agent/skills/frontend-design/typography-system.md +0 -363
  210. package/.agent/skills/frontend-design/ux-psychology.md +0 -1116
  211. package/.agent/skills/frontend-design/visual-effects.md +0 -383
  212. package/.agent/skills/intelligent-routing/router-manifest.md +0 -65
  213. package/.agent/skills/mobile-design/decision-trees.md +0 -516
  214. package/.agent/skills/mobile-design/mobile-backend.md +0 -491
  215. package/.agent/skills/mobile-design/mobile-color-system.md +0 -420
  216. package/.agent/skills/mobile-design/mobile-debugging.md +0 -122
  217. package/.agent/skills/mobile-design/mobile-design-thinking.md +0 -357
  218. package/.agent/skills/mobile-design/mobile-navigation.md +0 -458
  219. package/.agent/skills/mobile-design/mobile-performance.md +0 -767
  220. package/.agent/skills/mobile-design/mobile-testing.md +0 -356
  221. package/.agent/skills/mobile-design/mobile-typography.md +0 -433
  222. package/.agent/skills/mobile-design/platform-android.md +0 -666
  223. package/.agent/skills/mobile-design/platform-ios.md +0 -561
  224. package/.agent/skills/mobile-design/touch-psychology.md +0 -537
  225. package/.agent/skills/nextjs-react-expert/1-async-eliminating-waterfalls.md +0 -312
  226. package/.agent/skills/nextjs-react-expert/2-bundle-bundle-size-optimization.md +0 -240
  227. package/.agent/skills/nextjs-react-expert/3-server-server-side-performance.md +0 -490
  228. package/.agent/skills/nextjs-react-expert/4-client-client-side-data-fetching.md +0 -264
  229. package/.agent/skills/nextjs-react-expert/5-rerender-re-render-optimization.md +0 -581
  230. package/.agent/skills/nextjs-react-expert/6-rendering-rendering-performance.md +0 -432
  231. package/.agent/skills/nextjs-react-expert/7-js-javascript-performance.md +0 -684
  232. package/.agent/skills/nextjs-react-expert/8-advanced-advanced-patterns.md +0 -150
  233. package/.agent/skills/vulnerability-scanner/checklists.md +0 -121
@@ -1,357 +1,344 @@
1
- ---
2
- name: llm-engineering
3
- description: LLM engineering mastery for production AI systems. Prompt engineering, RAG pipeline design, vector store selection, embedding strategies, chunking, reranking, structured output, function calling, streaming, evals, guard-rails, cost optimization, and LLMOps. Use when building AI features, chat interfaces, semantic search, or any system calling an LLM API.
4
- allowed-tools: Read, Write, Edit, Glob, Grep
5
- version: 2.0.0
6
- last-updated: 2026-04-01
7
- applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
8
- ---
9
-
10
- # LLM Engineering — Production AI Systems Mastery
11
-
12
- > An LLM without guardrails is a liability generator.
13
- > Every prompt is a contract. Every response is untrusted. Every token costs money.
14
-
15
- ---
16
-
17
- ## Model Selection
18
-
19
- ```
20
- Model │ Use Case │ Cost Tier
21
- ─────────────────────────┼───────────────────────────────────────┼──────────
22
- GPT-4o │ Complex reasoning, code generation │ $$$
23
- GPT-4o-mini │ Classification, summaries, chat │ $
24
- Claude 3.7 Sonnet │ Long documents, analysis, code │ $$$
25
- Claude 3.5 Haiku │ Fast responses, simple tasks │ $
26
- Gemini 2.5 Pro │ Large context, multimodal, code │ $$$
27
- Gemini 2.5 Flash │ High throughput, cost-efficient │ $
28
- Llama 3.3 70B (open) │ Self-hosted, data privacy │ Free*
29
- Mistral Large │ European data residency, code │ $$
30
-
31
- * = compute costs only
32
-
33
- Selection rules:
34
- 1. Start with the cheapest model that works
35
- 2. Upgrade only when eval scores require it
36
- 3. Use large models for complex reasoning, small models for classification
37
- 4. Fine-tune ONLY after prompt engineering and RAG are exhausted
38
- ```
39
-
40
- ---
41
-
42
- ## Prompt Engineering
43
-
44
- ### System Prompt Design
45
-
46
- ```typescript
47
- const SYSTEM_PROMPT = `You are a customer support agent for Acme Corp.
48
-
49
- ## Rules
50
- 1. Answer ONLY questions about Acme products and services.
51
- 2. If you don't know the answer, say "I'll connect you with a specialist."
52
- 3. Never discuss competitors.
53
- 4. Never make up product features or pricing.
54
- 5. Keep responses under 200 words.
55
-
56
- ## Response Format
57
- - Use bullet points for lists
58
- - Include product links when relevant
59
- - End with a follow-up question
60
-
61
- ## Context
62
- Current date: ${new Date().toISOString().split("T")[0]}
63
- User plan: {{user_plan}}
64
- `;
65
-
66
- // ❌ HALLUCINATION TRAP: System prompts are NOT secrets
67
- // Users can extract system prompts with jailbreak techniques
68
- // Never put API keys, internal URLs, or secrets in system prompts
69
- ```
70
-
71
- ### Structured Output (JSON Mode)
72
-
73
- ```typescript
74
- import { z } from "zod";
75
- import OpenAI from "openai";
76
-
77
- const SentimentSchema = z.object({
78
- sentiment: z.enum(["positive", "negative", "neutral"]),
79
- confidence: z.number().min(0).max(1),
80
- reasoning: z.string(),
81
- topics: z.array(z.string()),
82
- });
83
-
84
- type Sentiment = z.infer<typeof SentimentSchema>;
85
-
86
- async function analyzeSentiment(text: string): Promise<Sentiment> {
87
- const response = await openai.chat.completions.create({
88
- model: "gpt-4o-mini",
89
- response_format: { type: "json_object" },
90
- messages: [
91
- {
92
- role: "system",
93
- content: `Analyze the sentiment of the given text.
94
- Respond with JSON matching this schema:
95
- {
96
- "sentiment": "positive" | "negative" | "neutral",
97
- "confidence": 0-1,
98
- "reasoning": "brief explanation",
99
- "topics": ["topic1", "topic2"]
100
- }`,
101
- },
102
- { role: "user", content: text },
103
- ],
104
- });
105
-
106
- const raw = JSON.parse(response.choices[0].message.content ?? "{}");
107
- return SentimentSchema.parse(raw); // Zod validates the LLM response
108
- }
109
-
110
- // ❌ HALLUCINATION TRAP: Always validate LLM JSON output with Zod/schema
111
- // LLMs produce malformed JSON, wrong types, missing fields
112
- // const result = JSON.parse(response); // trust blindly
113
- // const result = Schema.parse(JSON.parse(response)); // validate
114
- ```
115
-
116
- ### Function Calling / Tool Use
117
-
118
- ```typescript
119
- const tools: OpenAI.ChatCompletionTool[] = [
120
- {
121
- type: "function",
122
- function: {
123
- name: "search_products",
124
- description: "Search products by name, category, or price range",
125
- parameters: {
126
- type: "object",
127
- properties: {
128
- query: { type: "string", description: "Search query" },
129
- category: { type: "string", enum: ["electronics", "clothing", "home"] },
130
- max_price: { type: "number", description: "Maximum price in USD" },
131
- },
132
- required: ["query"],
133
- },
134
- },
135
- },
136
- {
137
- type: "function",
138
- function: {
139
- name: "get_order_status",
140
- description: "Get the status of an order by order ID",
141
- parameters: {
142
- type: "object",
143
- properties: {
144
- order_id: { type: "string", description: "The order ID (e.g., ORD-12345)" },
145
- },
146
- required: ["order_id"],
147
- },
148
- },
149
- },
150
- ];
151
-
152
- // Tool execution loop
153
- async function chatWithTools(userMessage: string) {
154
- const messages: OpenAI.ChatCompletionMessageParam[] = [
155
- { role: "system", content: SYSTEM_PROMPT },
156
- { role: "user", content: userMessage },
157
- ];
158
-
159
- let response = await openai.chat.completions.create({
160
- model: "gpt-4o-mini",
161
- messages,
162
- tools,
163
- });
164
-
165
- // Process tool calls
166
- while (response.choices[0].finish_reason === "tool_calls") {
167
- const toolCalls = response.choices[0].message.tool_calls ?? [];
168
- messages.push(response.choices[0].message);
169
-
170
- for (const call of toolCalls) {
171
- const args = JSON.parse(call.function.arguments);
172
- const result = await executeFunction(call.function.name, args);
173
- messages.push({
174
- role: "tool",
175
- tool_call_id: call.id,
176
- content: JSON.stringify(result),
177
- });
178
- }
179
-
180
- response = await openai.chat.completions.create({
181
- model: "gpt-4o-mini",
182
- messages,
183
- tools,
184
- });
185
- }
186
-
187
- return response.choices[0].message.content;
188
- }
189
- ```
190
-
191
- ---
192
-
193
- ## RAG (Retrieval-Augmented Generation)
194
-
195
- ### Pipeline
196
-
197
- ```
198
- User Query
199
-
200
- [1] Embed query → vector
201
-
202
- [2] Search vector DB → top K chunks
203
-
204
- [3] (Optional) Rerank results → top N
205
-
206
- [4] Build prompt: system + context chunks + query
207
-
208
- [5] LLM generates answer with citations
209
-
210
- [6] Validate response (hallucination check)
211
- ```
212
-
213
- ### Chunking Strategy
214
-
215
- ```typescript
216
- // ❌ BAD: Arbitrary character splitting
217
- const chunks = text.match(/.{1,1000}/g); // breaks mid-sentence, mid-word
218
-
219
- // ✅ GOOD: Semantic chunking with overlap
220
- function chunkDocument(text: string, options: ChunkOptions = {}): Chunk[] {
221
- const {
222
- maxTokens = 512, // chunk size
223
- overlapTokens = 50, // overlap between chunks
224
- separator = "\n\n", // split on paragraph boundaries first
225
- } = options;
226
-
227
- const paragraphs = text.split(separator);
228
- const chunks: Chunk[] = [];
229
- let current = "";
230
-
231
- for (const para of paragraphs) {
232
- if (tokenCount(current + para) > maxTokens && current) {
233
- chunks.push({ text: current.trim(), tokens: tokenCount(current) });
234
- // Keep overlap from previous chunk
235
- const words = current.split(" ");
236
- current = words.slice(-overlapTokens).join(" ") + separator + para;
237
- } else {
238
- current += separator + para;
239
- }
240
- }
241
- if (current.trim()) chunks.push({ text: current.trim(), tokens: tokenCount(current) });
242
-
243
- return chunks;
244
- }
245
-
246
- // Chunk size guidelines:
247
- // 256-512 tokens → precise retrieval (Q&A, support)
248
- // 512-1024 tokens → balanced (general RAG)
249
- // 1024-2048 tokens broad context (summarization)
250
- ```
251
-
252
- ### Vector Store Selection
253
-
254
- ```
255
- pgvector (PostgreSQL) → Already using Postgres, <10M vectors, simple
256
- Pinecone → Managed, serverless, easy scaling
257
- Weaviate → Hybrid search (vector + keyword), multi-model
258
- Qdrant → High performance, Rust-based, self-hostable
259
- Chroma → Local development, prototyping
260
- Milvus → Enterprise scale, GPU acceleration
261
-
262
- // HALLUCINATION TRAP: Vector search is NOT keyword search
263
- // "Apple CEO" might not find "Tim Cook runs Apple Inc."
264
- // Use HYBRID search (vector + BM25 keyword) for production
265
- ```
266
-
267
- ---
268
-
269
- ## Streaming
270
-
271
- ```typescript
272
- // Server-Sent Events for AI token streaming
273
- app.get("/api/chat", async (req, res) => {
274
- res.setHeader("Content-Type", "text/event-stream");
275
- res.setHeader("Cache-Control", "no-cache");
276
- res.setHeader("Connection", "keep-alive");
277
-
278
- const stream = await openai.chat.completions.create({
279
- model: "gpt-4o-mini",
280
- messages: [{ role: "user", content: req.query.message as string }],
281
- stream: true,
282
- });
283
-
284
- for await (const chunk of stream) {
285
- const content = chunk.choices[0]?.delta?.content;
286
- if (content) {
287
- res.write(`data: ${JSON.stringify({ content })}\n\n`);
288
- }
289
- }
290
-
291
- res.write("data: [DONE]\n\n");
292
- res.end();
293
- });
294
-
295
- // Client-side consumption
296
- const eventSource = new EventSource(`/api/chat?message=${encodeURIComponent(msg)}`);
297
- eventSource.onmessage = (event) => {
298
- if (event.data === "[DONE]") { eventSource.close(); return; }
299
- const { content } = JSON.parse(event.data);
300
- appendToChat(content);
301
- };
302
- ```
303
-
304
- ---
305
-
306
- ## Cost Optimization
307
-
308
- ```
309
- 1. Prompt caching → Cache system prompts (OpenAI, Anthropic support this)
310
- 2. Output token limiting → Set max_tokens to prevent runaway responses
311
- 3. Tiered models → Use cheap models for classification, expensive for reasoning
312
- 4. Batch processing → Use batch APIs for offline processing (50% discount)
313
- 5. Chunked context → Send only relevant chunks, not entire documents
314
- 6. Response streaming → Stream to reduce TTFT (time to first token)
315
- 7. Structured output → Shorter JSON responses vs verbose prose
316
-
317
- // Cost estimation:
318
- // GPT-4o: ~$2.50/1M input, ~$10/1M output
319
- // GPT-4o-mini: ~$0.15/1M input, ~$0.60/1M output
320
- // 1M tokens ≈ 750,000 words ≈ 3,000 pages
321
- ```
322
-
323
- ---
324
-
325
- ## 🤖 LLM-Specific Traps
326
-
327
- 1. **Trusting LLM JSON Output:** Always validate with Zod/schema. LLMs produce malformed JSON.
328
- 2. **Secrets in System Prompts:** System prompts can be extracted. Never include API keys or internal URLs.
329
- 3. **Fixed Character Chunking:** Splitting at 1000 chars breaks sentences. Use semantic/paragraph chunking.
330
- 4. **Vector-Only Search:** Pure vector search misses exact matches. Use hybrid search for production.
331
- 5. **No Token Limits:** Without `max_tokens`, models can generate 4000+ token responses. Set limits.
332
- 6. **Single Model for Everything:** Use tiered models — cheap for simple tasks, expensive for reasoning.
333
- 7. **No Eval Suite:** Deploying AI without evaluations is deploying untested code. Build evals.
334
- 8. **Prompt Injection Blindness:** User input can override system instructions. Always sanitize and delimit.
335
- 9. **Infinite Tool Loops:** Tool-calling agents can loop forever. Set max iterations (3-5).
336
- 10. **No Rate Limiting:** API calls without rate limiting = surprise $10,000 bill. Set spend limits.
337
-
338
- ---
339
-
340
- ## 🏛️ Tribunal Integration
341
-
342
- **Slash command: `/review-ai`**
343
-
344
- ### ✅ Pre-Flight Self-Audit
345
-
346
- ```
347
- ✅ Am I validating all LLM responses with a schema?
348
- ✅ Are there no secrets in system prompts?
349
- ✅ Is user input delimited from system instructions?
350
- ✅ Did I set max_tokens on all completions?
351
- ✅ Is there rate limiting and cost monitoring?
352
- ✅ Am I using the cheapest model that works?
353
- ✅ Is chunking semantic (not fixed-character)?
354
- ✅ Is search hybrid (vector + keyword)?
355
- ✅ Do tool-calling loops have a max iteration limit?
356
- ✅ Did I build evaluation tests for AI quality?
357
- ```
1
+ ---
2
+ name: llm-engineering
3
+ description: LLM engineering mastery for production AI systems. Prompt engineering, RAG pipeline design, vector store selection, embedding strategies, chunking, reranking, structured output, function calling, streaming, evals, guard-rails, cost optimization, and LLMOps. Use when building AI features, chat interfaces, semantic search, or any system calling an LLM API.
4
+ allowed-tools: Read, Write, Edit, Glob, Grep
5
+ version: 3.2.0
6
+ last-updated: 2026-04-07
7
+ applies-to-model: gemini-3-1-pro, claude-3-7-sonnet
8
+ ---
9
+
10
+ # LLM Engineering — Production AI Systems Mastery
11
+
12
+ ---
13
+
14
+ ## Model Selection
15
+
16
+ ```
17
+ Model │ Use Case │ Cost Tier
18
+ ─────────────────────────┼───────────────────────────────────────┼──────────
19
+ GPT-4o │ Complex reasoning, vision, code │ $$$
20
+ GPT-4o-mini │ Classification, summaries, chat │ $
21
+ o3-mini │ Deep reasoning, math, code review │ $$
22
+ Claude 3.7 Sonnet │ Long documents, analysis, code │ $$$
23
+ Claude 3.5 Haiku │ Fast responses, simple tasks │ $
24
+ Gemini 3.1 Pro (High) │ Large context, multimodal, code │ $$$
25
+ Gemini 3.0 Flash │ High throughput, cost-efficient │ $
26
+ Llama 3.3 70B (open) │ Self-hosted, data privacy │ Free*
27
+ Mistral Large 2 │ European data residency, code │ $$
28
+
29
+ * = compute costs only
30
+
31
+ Selection rules:
32
+ 1. Start with the cheapest model that passes your evals
33
+ 2. Upgrade only when eval scores require it
34
+ 3. Use large models for complex reasoning, small for classification/routing
35
+ 4. Fine-tune ONLY after prompt engineering and RAG are exhausted
36
+ 5. HALLUCINATION TRAP: Model names change frequently — always verify current names
37
+ from provider docs before hardcoding (e.g. "gpt-4o" vs "gpt-4o-2024-11-20")
38
+ ```
39
+
40
+ ---
41
+
42
+ ## Prompt Engineering
43
+
44
+ ### System Prompt Design
45
+
46
+ ```typescript
47
+ const SYSTEM_PROMPT = `You are a customer support agent for Acme Corp.
48
+
49
+ ## Rules
50
+ 1. Answer ONLY questions about Acme products and services.
51
+ 2. If you don't know the answer, say "I'll connect you with a specialist."
52
+ 3. Never discuss competitors.
53
+ 4. Never make up product features or pricing.
54
+ 5. Keep responses under 200 words.
55
+
56
+ ## Response Format
57
+ - Use bullet points for lists
58
+ - Include product links when relevant
59
+ - End with a follow-up question
60
+
61
+ ## Context
62
+ Current date: ${new Date().toISOString().split("T")[0]}
63
+ User plan: {{user_plan}}
64
+ `;
65
+
66
+ // ❌ HALLUCINATION TRAP: System prompts are NOT secrets
67
+ // Users can extract system prompts with jailbreak techniques
68
+ // Never put API keys, internal URLs, or secrets in system prompts
69
+ ```
70
+
71
+ ### Structured Output (JSON Mode)
72
+
73
+ ```typescript
74
+ import { z } from "zod";
75
+ import OpenAI from "openai";
76
+
77
+ const SentimentSchema = z.object({
78
+ sentiment: z.enum(["positive", "negative", "neutral"]),
79
+ confidence: z.number().min(0).max(1),
80
+ reasoning: z.string(),
81
+ topics: z.array(z.string()),
82
+ });
83
+
84
+ // OpenAI — json_schema mode (strict = true enforces schema exactly)
85
+ async function analyzeSentiment(text: string) {
86
+ const response = await openai.chat.completions.create({
87
+ model: "gpt-4o-mini",
88
+ response_format: {
89
+ type: "json_schema",
90
+ json_schema: {
91
+ name: "sentiment_analysis",
92
+ strict: true,
93
+ schema: {
94
+ type: "object",
95
+ properties: {
96
+ sentiment: { type: "string", enum: ["positive", "negative", "neutral"] },
97
+ confidence: { type: "number" },
98
+ reasoning: { type: "string" },
99
+ topics: { type: "array", items: { type: "string" } },
100
+ },
101
+ required: ["sentiment", "confidence", "reasoning", "topics"],
102
+ additionalProperties: false, // required for strict mode
103
+ },
104
+ },
105
+ },
106
+ messages: [{ role: "system", content: "Analyze sentiment." }, { role: "user", content: text }],
107
+ });
108
+ const raw = JSON.parse(response.choices[0].message.content ?? "{}");
109
+ return SentimentSchema.parse(raw); // always validate with Zod even in strict mode
110
+ }
111
+
112
+ // Gemini — response_mime_type + response_schema
113
+ import { GoogleGenerativeAI, SchemaType } from "@google/generative-ai";
114
const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
// The response schema lists the same four fields as SentimentSchema. `reasoning`
// must be declared and required here too, otherwise Gemini may omit it and the
// output fails validation.
const model = genAI.getGenerativeModel({
  model: "gemini-2.0-flash",
  generationConfig: {
    responseMimeType: "application/json",
    responseSchema: {
      type: SchemaType.OBJECT,
      properties: {
        sentiment: { type: SchemaType.STRING, enum: ["positive", "negative", "neutral"] },
        confidence: { type: SchemaType.NUMBER },
        reasoning: { type: SchemaType.STRING },
        topics: { type: SchemaType.ARRAY, items: { type: SchemaType.STRING } },
      },
      required: ["sentiment", "confidence", "reasoning", "topics"],
    },
  },
});
130
+
131
+ // ❌ HALLUCINATION TRAP: Always validate LLM JSON output with Zod/schema
132
+ // LLMs can produce malformed JSON, wrong types, or missing fields; even strict mode can return a refusal or truncated output
133
+ // ❌ const result = JSON.parse(response); // trust blindly
134
+ // ✅ const result = Schema.parse(JSON.parse(response)); // validate always
135
+ ```
136
+
137
+ ### Function Calling / Tool Use
138
+
139
+ ```typescript
140
// Product catalogue search; only `query` is mandatory.
const searchProductsTool: OpenAI.ChatCompletionTool = {
  type: "function",
  function: {
    name: "search_products",
    description: "Search products by name, category, or price range",
    parameters: {
      type: "object",
      properties: {
        query: { type: "string", description: "Search query" },
        category: { type: "string", enum: ["electronics", "clothing", "home"] },
        max_price: { type: "number", description: "Maximum price in USD" },
      },
      required: ["query"],
    },
  },
};

// Order lookup keyed by the order ID.
const getOrderStatusTool: OpenAI.ChatCompletionTool = {
  type: "function",
  function: {
    name: "get_order_status",
    description: "Get the status of an order by order ID",
    parameters: {
      type: "object",
      properties: {
        order_id: { type: "string", description: "The order ID (e.g., ORD-12345)" },
      },
      required: ["order_id"],
    },
  },
};

/** Tool definitions offered to the model: product search first, then order status. */
const tools: OpenAI.ChatCompletionTool[] = [searchProductsTool, getOrderStatusTool];
172
+
173
+ // Tool execution loop
174
/**
 * Parses the model-written arguments of one tool call and executes the tool.
 * Malformed argument JSON is returned as an `{ error }` payload so the model
 * can correct itself, instead of aborting the whole conversation.
 */
async function runToolCall(call: OpenAI.ChatCompletionMessageToolCall) {
  let args;
  try {
    args = JSON.parse(call.function.arguments);
  } catch {
    return { error: `Invalid JSON arguments for ${call.function.name}` };
  }
  return executeFunction(call.function.name, args);
}

/**
 * Sends `userMessage` to the model and keeps executing the tools it requests
 * until it produces a final answer.
 *
 * @param userMessage - The end user's message.
 * @param maxToolRounds - Maximum number of tool-call round trips before giving
 *   up. Bounds cost and latency if the model keeps requesting tools.
 * @returns The content of the model's final message (may be `null`).
 * @throws Error when the model still requests tools after `maxToolRounds` rounds.
 */
async function chatWithTools(userMessage: string, maxToolRounds = 5) {
  const messages: OpenAI.ChatCompletionMessageParam[] = [
    { role: "system", content: SYSTEM_PROMPT },
    { role: "user", content: userMessage },
  ];

  let response = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    messages,
    tools,
  });

  // Process tool calls
  for (let round = 0; response.choices[0].finish_reason === "tool_calls"; round++) {
    if (round >= maxToolRounds) {
      throw new Error(`Tool loop exceeded ${maxToolRounds} rounds without a final answer`);
    }

    const assistantMessage = response.choices[0].message;
    // The assistant message carrying the tool calls must precede the tool results.
    messages.push(assistantMessage);

    // Each requested call gets one tool message with the matching tool_call_id.
    for (const call of assistantMessage.tool_calls ?? []) {
      const result = await runToolCall(call);
      messages.push({
        role: "tool",
        tool_call_id: call.id,
        content: JSON.stringify(result),
      });
    }

    response = await openai.chat.completions.create({
      model: "gpt-4o-mini",
      messages,
      tools,
    });
  }

  return response.choices[0].message.content;
}
210
+ ```
211
+
212
+ ---
213
+
214
+ ## RAG (Retrieval-Augmented Generation)
215
+
216
+ ### Pipeline
217
+
218
+ ```
219
+ User Query
220
+
221
+ [1] Embed query → vector
222
+
223
+ [2] Search vector DB → top K chunks
224
+
225
+ [3] (Optional) Rerank results → top N
226
+
227
+ [4] Build prompt: system + context chunks + query
228
+
229
+ [5] LLM generates answer with citations
230
+
231
+ [6] Validate response (hallucination check)
232
+ ```
233
+
234
+ ### Chunking Strategy
235
+
236
+ ```typescript
237
+ // ❌ BAD: Arbitrary character splitting
238
+ const chunks = text.match(/.{1,1000}/g); // breaks mid-sentence, mid-word
239
+
240
+ // ✅ GOOD: Semantic chunking with overlap
241
/**
 * Splits `text` into chunks along `separator` boundaries. Paragraphs are packed
 * into the current chunk until adding the next one would exceed `maxTokens`;
 * the next chunk is then seeded with the tail of the previous one.
 *
 * Notes:
 * - Overlap is taken as the last `overlapTokens` space-separated words of the
 *   previous chunk, i.e. it approximates tokens by words.
 * - A single paragraph larger than `maxTokens` is not split further and is
 *   emitted as one oversized chunk.
 *
 * @param text - Document text to split.
 * @param options - Optional `maxTokens`, `overlapTokens` and `separator` overrides.
 * @returns Chunks in document order, each with its trimmed text and token count.
 */
function chunkDocument(text: string, options: ChunkOptions = {}): Chunk[] {
  const {
    maxTokens = 512, // chunk size
    overlapTokens = 50, // overlap between chunks
    separator = "\n\n", // split on paragraph boundaries first
  } = options;

  const chunks: Chunk[] = [];
  let current = "";

  // Emit the accumulated text; the token count is measured on the same trimmed
  // string that is stored, so `text` and `tokens` always agree.
  const flush = (): void => {
    const body = current.trim();
    if (body) chunks.push({ text: body, tokens: tokenCount(body) });
  };

  for (const para of text.split(separator)) {
    // Runs of separators produce empty paragraphs; they carry no content.
    if (!para.trim()) continue;

    // Measure exactly what the chunk would become, separator included, and
    // avoid a leading separator on the very first paragraph.
    const candidate = current ? current + separator + para : para;

    if (current && tokenCount(candidate) > maxTokens) {
      flush();
      // Keep overlap from previous chunk. slice(-0) returns the whole array,
      // so zero overlap needs its own branch.
      const tail = overlapTokens > 0 ? current.split(" ").slice(-overlapTokens).join(" ") : "";
      current = tail ? tail + separator + para : para;
    } else {
      current = candidate;
    }
  }
  flush();

  return chunks;
}
266
+
267
+ // Chunk size guidelines:
268
+ // 256-512 tokens → precise retrieval (Q&A, support)
269
+ // 512-1024 tokens → balanced (general RAG)
270
+ // 1024-2048 tokens → broad context (summarization)
271
+ ```
272
+
273
+ ### Vector Store Selection
274
+
275
+ ```
276
+ pgvector (PostgreSQL) → Already using Postgres, <10M vectors, simple
277
+ Pinecone → Managed, serverless, easy scaling
278
+ Weaviate → Hybrid search (vector + keyword), multi-model
279
+ Qdrant → High performance, Rust-based, self-hostable
280
+ Chroma → Local development, prototyping
281
+ Milvus → Enterprise scale, GPU acceleration
282
+
283
+ // ❌ HALLUCINATION TRAP: Vector search is NOT keyword search
284
+ // "Apple CEO" might not find "Tim Cook runs Apple Inc."
285
+ // Use HYBRID search (vector + BM25 keyword) for production
286
+ ```
287
+
288
+ ---
289
+
290
+ ## Streaming
291
+
292
+ ```typescript
293
+ // Server-Sent Events for AI token streaming
294
// GET /api/chat?message=... — streams the completion as Server-Sent Events.
// Each token batch is sent as `data: {"content": "..."}`; the stream ends with
// `data: [DONE]`. A failure is reported as a named `stream_error` event before
// the terminator.
app.get("/api/chat", async (req, res) => {
  // The query value may be missing or an array; reject anything but a non-empty string.
  const message = req.query.message;
  if (typeof message !== "string" || message.trim() === "") {
    res.status(400).json({ error: "Query parameter 'message' is required" });
    return;
  }

  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache");
  res.setHeader("Connection", "keep-alive");

  // Cancel the upstream request when the client disconnects, so tokens are not
  // generated (and billed) for a reader that is gone.
  const abort = new AbortController();
  res.on("close", () => abort.abort());

  try {
    const stream = await openai.chat.completions.create(
      {
        model: "gpt-4o-mini",
        messages: [{ role: "user", content: message }],
        stream: true,
      },
      { signal: abort.signal },
    );

    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content;
      if (content) {
        res.write(`data: ${JSON.stringify({ content })}\n\n`);
      }
    }
  } catch (err: unknown) {
    // An abort caused by the client leaving is expected; anything else is a real failure.
    if (!abort.signal.aborted) {
      console.error("Chat stream failed", err);
      res.write(`event: stream_error\ndata: ${JSON.stringify({ error: "stream_failed" })}\n\n`);
    }
  } finally {
    // Always terminate the stream so a connected client stops listening.
    if (!abort.signal.aborted) {
      res.write("data: [DONE]\n\n");
    }
    res.end();
  }
});
315
+
316
+ // Client-side consumption
317
// Opens the SSE stream for one prompt and appends each received token batch to the chat.
const eventSource = new EventSource(`/api/chat?message=${encodeURIComponent(msg)}`);
eventSource.onmessage = (event) => {
  if (event.data === "[DONE]") { eventSource.close(); return; }
  const { content } = JSON.parse(event.data);
  // Ignore payloads that carry no text rather than appending `undefined`.
  if (typeof content === "string") appendToChat(content);
};
// EventSource reconnects automatically after an error, which would re-send the
// prompt and start a second completion. Close the stream instead.
eventSource.onerror = () => {
  eventSource.close();
};
323
+ ```
324
+
325
+ ---
326
+
327
+ ## Cost Optimization
328
+
329
+ ```
330
+ 1. Prompt caching → Cache system prompts (OpenAI, Anthropic support this)
331
+ 2. Output token limiting → Set max_tokens to prevent runaway responses
332
+ 3. Tiered models → Use cheap models for classification, expensive for reasoning
333
+ 4. Batch processing → Use batch APIs for offline processing (50% discount)
334
+ 5. Chunked context → Send only relevant chunks, not entire documents
335
+ 6. Response streaming → Stream to reduce TTFT (time to first token); improves perceived latency, not token cost
336
+ 7. Structured output → Shorter JSON responses vs verbose prose
337
+
338
+ // Cost estimation:
339
+ // GPT-4o: ~$2.50/1M input, ~$10/1M output
340
+ // GPT-4o-mini: ~$0.15/1M input, ~$0.60/1M output
341
+ // 1M tokens ≈ 750,000 words ≈ 3,000 pages
342
+ ```
343
+
344
+ ---