tribunal-kit 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. package/.agent/ARCHITECTURE.md +99 -99
  2. package/.agent/GEMINI.md +52 -52
  3. package/.agent/agents/accessibility-reviewer.md +187 -220
  4. package/.agent/agents/ai-code-reviewer.md +199 -233
  5. package/.agent/agents/backend-specialist.md +215 -238
  6. package/.agent/agents/code-archaeologist.md +161 -181
  7. package/.agent/agents/database-architect.md +184 -207
  8. package/.agent/agents/debugger.md +191 -218
  9. package/.agent/agents/dependency-reviewer.md +103 -136
  10. package/.agent/agents/devops-engineer.md +218 -238
  11. package/.agent/agents/documentation-writer.md +201 -221
  12. package/.agent/agents/explorer-agent.md +160 -180
  13. package/.agent/agents/frontend-reviewer.md +160 -194
  14. package/.agent/agents/frontend-specialist.md +248 -237
  15. package/.agent/agents/game-developer.md +48 -52
  16. package/.agent/agents/logic-reviewer.md +116 -149
  17. package/.agent/agents/mobile-developer.md +200 -223
  18. package/.agent/agents/mobile-reviewer.md +162 -195
  19. package/.agent/agents/orchestrator.md +181 -211
  20. package/.agent/agents/penetration-tester.md +157 -174
  21. package/.agent/agents/performance-optimizer.md +183 -203
  22. package/.agent/agents/performance-reviewer.md +178 -211
  23. package/.agent/agents/product-manager.md +142 -162
  24. package/.agent/agents/product-owner.md +6 -25
  25. package/.agent/agents/project-planner.md +142 -162
  26. package/.agent/agents/qa-automation-engineer.md +225 -242
  27. package/.agent/agents/security-auditor.md +174 -194
  28. package/.agent/agents/seo-specialist.md +193 -213
  29. package/.agent/agents/sql-reviewer.md +161 -194
  30. package/.agent/agents/supervisor-agent.md +184 -203
  31. package/.agent/agents/swarm-worker-contracts.md +17 -17
  32. package/.agent/agents/swarm-worker-registry.md +46 -46
  33. package/.agent/agents/test-coverage-reviewer.md +160 -193
  34. package/.agent/agents/test-engineer.md +0 -21
  35. package/.agent/agents/type-safety-reviewer.md +175 -208
  36. package/.agent/patterns/generator.md +9 -9
  37. package/.agent/patterns/inversion.md +12 -12
  38. package/.agent/patterns/pipeline.md +9 -9
  39. package/.agent/patterns/reviewer.md +13 -13
  40. package/.agent/patterns/tool-wrapper.md +9 -9
  41. package/.agent/rules/GEMINI.md +63 -63
  42. package/.agent/scripts/compress_skills.py +167 -0
  43. package/.agent/scripts/consolidate_skills.py +173 -0
  44. package/.agent/scripts/deep_compress.py +202 -0
  45. package/.agent/scripts/minify_context.py +80 -0
  46. package/.agent/scripts/security_scan.py +1 -1
  47. package/.agent/scripts/strip_tribunal.py +41 -0
  48. package/.agent/skills/agent-organizer/SKILL.md +92 -126
  49. package/.agent/skills/agentic-patterns/SKILL.md +0 -70
  50. package/.agent/skills/ai-prompt-injection-defense/SKILL.md +126 -160
  51. package/.agent/skills/api-patterns/SKILL.md +123 -215
  52. package/.agent/skills/api-security-auditor/SKILL.md +143 -177
  53. package/.agent/skills/app-builder/SKILL.md +326 -50
  54. package/.agent/skills/app-builder/templates/SKILL.md +13 -15
  55. package/.agent/skills/app-builder/templates/astro-static/TEMPLATE.md +16 -16
  56. package/.agent/skills/app-builder/templates/chrome-extension/TEMPLATE.md +22 -22
  57. package/.agent/skills/app-builder/templates/cli-tool/TEMPLATE.md +18 -18
  58. package/.agent/skills/app-builder/templates/electron-desktop/TEMPLATE.md +20 -20
  59. package/.agent/skills/app-builder/templates/express-api/TEMPLATE.md +17 -17
  60. package/.agent/skills/app-builder/templates/flutter-app/TEMPLATE.md +18 -18
  61. package/.agent/skills/app-builder/templates/monorepo-turborepo/TEMPLATE.md +21 -21
  62. package/.agent/skills/app-builder/templates/nextjs-fullstack/TEMPLATE.md +19 -19
  63. package/.agent/skills/app-builder/templates/nextjs-saas/TEMPLATE.md +26 -26
  64. package/.agent/skills/app-builder/templates/nextjs-static/TEMPLATE.md +26 -26
  65. package/.agent/skills/app-builder/templates/nuxt-app/TEMPLATE.md +19 -19
  66. package/.agent/skills/app-builder/templates/python-fastapi/TEMPLATE.md +18 -18
  67. package/.agent/skills/app-builder/templates/react-native-app/TEMPLATE.md +20 -20
  68. package/.agent/skills/appflow-wireframe/SKILL.md +87 -121
  69. package/.agent/skills/architecture/SKILL.md +82 -252
  70. package/.agent/skills/authentication-best-practices/SKILL.md +139 -173
  71. package/.agent/skills/bash-linux/SKILL.md +120 -154
  72. package/.agent/skills/behavioral-modes/SKILL.md +8 -69
  73. package/.agent/skills/brainstorming/SKILL.md +428 -104
  74. package/.agent/skills/building-native-ui/SKILL.md +143 -174
  75. package/.agent/skills/clean-code/SKILL.md +323 -360
  76. package/.agent/skills/code-review-checklist/SKILL.md +0 -62
  77. package/.agent/skills/config-validator/SKILL.md +107 -141
  78. package/.agent/skills/csharp-developer/SKILL.md +468 -528
  79. package/.agent/skills/database-design/SKILL.md +104 -369
  80. package/.agent/skills/deployment-procedures/SKILL.md +111 -145
  81. package/.agent/skills/devops-engineer/SKILL.md +295 -332
  82. package/.agent/skills/devops-incident-responder/SKILL.md +79 -113
  83. package/.agent/skills/doc.md +5 -5
  84. package/.agent/skills/documentation-templates/SKILL.md +19 -63
  85. package/.agent/skills/edge-computing/SKILL.md +123 -157
  86. package/.agent/skills/extract-design-system/SKILL.md +100 -134
  87. package/.agent/skills/framer-motion-expert/SKILL.md +111 -855
  88. package/.agent/skills/frontend-design/SKILL.md +151 -499
  89. package/.agent/skills/game-design-expert/SKILL.md +71 -105
  90. package/.agent/skills/game-engineering-expert/SKILL.md +88 -122
  91. package/.agent/skills/geo-fundamentals/SKILL.md +89 -124
  92. package/.agent/skills/github-operations/SKILL.md +279 -314
  93. package/.agent/skills/gsap-expert/SKILL.md +119 -826
  94. package/.agent/skills/i18n-localization/SKILL.md +104 -138
  95. package/.agent/skills/intelligent-routing/SKILL.md +159 -127
  96. package/.agent/skills/lint-and-validate/SKILL.md +8 -52
  97. package/.agent/skills/llm-engineering/SKILL.md +344 -357
  98. package/.agent/skills/local-first/SKILL.md +120 -154
  99. package/.agent/skills/mcp-builder/SKILL.md +84 -118
  100. package/.agent/skills/mobile-design/SKILL.md +213 -219
  101. package/.agent/skills/motion-engineering/SKILL.md +184 -0
  102. package/.agent/skills/nextjs-react-expert/SKILL.md +99 -698
  103. package/.agent/skills/nodejs-best-practices/SKILL.md +498 -559
  104. package/.agent/skills/observability/SKILL.md +293 -330
  105. package/.agent/skills/parallel-agents/SKILL.md +88 -122
  106. package/.agent/skills/performance-profiling/SKILL.md +217 -254
  107. package/.agent/skills/plan-writing/SKILL.md +84 -118
  108. package/.agent/skills/platform-engineer/SKILL.md +89 -123
  109. package/.agent/skills/playwright-best-practices/SKILL.md +128 -162
  110. package/.agent/skills/powershell-windows/SKILL.md +112 -146
  111. package/.agent/skills/python-patterns/SKILL.md +7 -35
  112. package/.agent/skills/python-pro/SKILL.md +148 -754
  113. package/.agent/skills/react-specialist/SKILL.md +123 -827
  114. package/.agent/skills/readme-builder/SKILL.md +15 -85
  115. package/.agent/skills/realtime-patterns/SKILL.md +269 -304
  116. package/.agent/skills/red-team-tactics/SKILL.md +10 -51
  117. package/.agent/skills/rust-pro/SKILL.md +623 -701
  118. package/.agent/skills/seo-fundamentals/SKILL.md +120 -154
  119. package/.agent/skills/server-management/SKILL.md +156 -190
  120. package/.agent/skills/shadcn-ui-expert/SKILL.md +172 -206
  121. package/.agent/skills/skill-creator/SKILL.md +18 -58
  122. package/.agent/skills/sql-pro/SKILL.md +579 -633
  123. package/.agent/skills/supabase-postgres-best-practices/SKILL.md +28 -68
  124. package/.agent/skills/swiftui-expert/SKILL.md +142 -176
  125. package/.agent/skills/systematic-debugging/SKILL.md +84 -118
  126. package/.agent/skills/tailwind-patterns/SKILL.md +516 -576
  127. package/.agent/skills/tdd-workflow/SKILL.md +103 -137
  128. package/.agent/skills/test-result-analyzer/SKILL.md +33 -73
  129. package/.agent/skills/testing-patterns/SKILL.md +512 -573
  130. package/.agent/skills/trend-researcher/SKILL.md +30 -71
  131. package/.agent/skills/ui-ux-pro-max/SKILL.md +0 -41
  132. package/.agent/skills/ui-ux-researcher/SKILL.md +51 -91
  133. package/.agent/skills/vue-expert/SKILL.md +127 -866
  134. package/.agent/skills/vulnerability-scanner/SKILL.md +354 -269
  135. package/.agent/skills/web-accessibility-auditor/SKILL.md +159 -193
  136. package/.agent/skills/web-design-guidelines/SKILL.md +17 -61
  137. package/.agent/skills/webapp-testing/SKILL.md +111 -145
  138. package/.agent/skills/whimsy-injector/SKILL.md +58 -132
  139. package/.agent/skills/workflow-optimizer/SKILL.md +28 -68
  140. package/.agent/workflows/api-tester.md +151 -151
  141. package/.agent/workflows/audit.md +127 -138
  142. package/.agent/workflows/brainstorm.md +110 -110
  143. package/.agent/workflows/changelog.md +112 -112
  144. package/.agent/workflows/create.md +124 -124
  145. package/.agent/workflows/debug.md +165 -189
  146. package/.agent/workflows/deploy.md +180 -189
  147. package/.agent/workflows/enhance.md +128 -151
  148. package/.agent/workflows/fix.md +114 -135
  149. package/.agent/workflows/generate.md +12 -4
  150. package/.agent/workflows/migrate.md +160 -160
  151. package/.agent/workflows/orchestrate.md +168 -168
  152. package/.agent/workflows/performance-benchmarker.md +114 -123
  153. package/.agent/workflows/plan.md +173 -173
  154. package/.agent/workflows/preview.md +80 -80
  155. package/.agent/workflows/refactor.md +161 -183
  156. package/.agent/workflows/review-ai.md +101 -129
  157. package/.agent/workflows/review.md +116 -116
  158. package/.agent/workflows/session.md +94 -94
  159. package/.agent/workflows/status.md +79 -79
  160. package/.agent/workflows/strengthen-skills.md +138 -139
  161. package/.agent/workflows/swarm.md +179 -179
  162. package/.agent/workflows/test.md +189 -211
  163. package/.agent/workflows/tribunal-backend.md +93 -113
  164. package/.agent/workflows/tribunal-database.md +94 -115
  165. package/.agent/workflows/tribunal-frontend.md +95 -118
  166. package/.agent/workflows/tribunal-full.md +92 -133
  167. package/.agent/workflows/tribunal-mobile.md +94 -119
  168. package/.agent/workflows/tribunal-performance.md +109 -133
  169. package/.agent/workflows/ui-ux-pro-max.md +122 -143
  170. package/package.json +1 -1
  171. package/.agent/skills/api-patterns/api-style.md +0 -42
  172. package/.agent/skills/api-patterns/auth.md +0 -24
  173. package/.agent/skills/api-patterns/documentation.md +0 -26
  174. package/.agent/skills/api-patterns/graphql.md +0 -41
  175. package/.agent/skills/api-patterns/rate-limiting.md +0 -31
  176. package/.agent/skills/api-patterns/response.md +0 -37
  177. package/.agent/skills/api-patterns/rest.md +0 -40
  178. package/.agent/skills/api-patterns/security-testing.md +0 -122
  179. package/.agent/skills/api-patterns/trpc.md +0 -41
  180. package/.agent/skills/api-patterns/versioning.md +0 -22
  181. package/.agent/skills/app-builder/agent-coordination.md +0 -71
  182. package/.agent/skills/app-builder/feature-building.md +0 -53
  183. package/.agent/skills/app-builder/project-detection.md +0 -34
  184. package/.agent/skills/app-builder/scaffolding.md +0 -118
  185. package/.agent/skills/app-builder/tech-stack.md +0 -40
  186. package/.agent/skills/architecture/context-discovery.md +0 -43
  187. package/.agent/skills/architecture/examples.md +0 -94
  188. package/.agent/skills/architecture/pattern-selection.md +0 -68
  189. package/.agent/skills/architecture/patterns-reference.md +0 -50
  190. package/.agent/skills/architecture/trade-off-analysis.md +0 -77
  191. package/.agent/skills/brainstorming/dynamic-questioning.md +0 -360
  192. package/.agent/skills/database-design/database-selection.md +0 -43
  193. package/.agent/skills/database-design/indexing.md +0 -39
  194. package/.agent/skills/database-design/migrations.md +0 -48
  195. package/.agent/skills/database-design/optimization.md +0 -36
  196. package/.agent/skills/database-design/orm-selection.md +0 -30
  197. package/.agent/skills/database-design/schema-design.md +0 -56
  198. package/.agent/skills/frontend-design/animation-guide.md +0 -331
  199. package/.agent/skills/frontend-design/color-system.md +0 -329
  200. package/.agent/skills/frontend-design/decision-trees.md +0 -418
  201. package/.agent/skills/frontend-design/motion-graphics.md +0 -306
  202. package/.agent/skills/frontend-design/typography-system.md +0 -363
  203. package/.agent/skills/frontend-design/ux-psychology.md +0 -1116
  204. package/.agent/skills/frontend-design/visual-effects.md +0 -383
  205. package/.agent/skills/intelligent-routing/router-manifest.md +0 -65
  206. package/.agent/skills/mobile-design/decision-trees.md +0 -516
  207. package/.agent/skills/mobile-design/mobile-backend.md +0 -491
  208. package/.agent/skills/mobile-design/mobile-color-system.md +0 -420
  209. package/.agent/skills/mobile-design/mobile-debugging.md +0 -122
  210. package/.agent/skills/mobile-design/mobile-design-thinking.md +0 -357
  211. package/.agent/skills/mobile-design/mobile-navigation.md +0 -458
  212. package/.agent/skills/mobile-design/mobile-performance.md +0 -767
  213. package/.agent/skills/mobile-design/mobile-testing.md +0 -356
  214. package/.agent/skills/mobile-design/mobile-typography.md +0 -433
  215. package/.agent/skills/mobile-design/platform-android.md +0 -666
  216. package/.agent/skills/mobile-design/platform-ios.md +0 -561
  217. package/.agent/skills/mobile-design/touch-psychology.md +0 -537
  218. package/.agent/skills/nextjs-react-expert/1-async-eliminating-waterfalls.md +0 -312
  219. package/.agent/skills/nextjs-react-expert/2-bundle-bundle-size-optimization.md +0 -240
  220. package/.agent/skills/nextjs-react-expert/3-server-server-side-performance.md +0 -490
  221. package/.agent/skills/nextjs-react-expert/4-client-client-side-data-fetching.md +0 -264
  222. package/.agent/skills/nextjs-react-expert/5-rerender-re-render-optimization.md +0 -581
  223. package/.agent/skills/nextjs-react-expert/6-rendering-rendering-performance.md +0 -432
  224. package/.agent/skills/nextjs-react-expert/7-js-javascript-performance.md +0 -684
  225. package/.agent/skills/nextjs-react-expert/8-advanced-advanced-patterns.md +0 -150
  226. package/.agent/skills/vulnerability-scanner/checklists.md +0 -121
@@ -1,330 +1,293 @@
1
- ---
2
- name: observability
3
- description: Production observability mastery. Structured logging (Pino/Winston), OpenTelemetry tracing, metrics (Prometheus/Grafana), SLIs/SLOs/error budgets, distributed tracing, alerting design, health checks, and AI observability. Use when setting up monitoring, debugging production issues, or designing observable distributed systems.
4
- allowed-tools: Read, Write, Edit, Glob, Grep
5
- version: 2.0.0
6
- last-updated: 2026-04-01
7
- applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
8
- ---
9
-
10
- # Observability — Production Monitoring Mastery
11
-
12
- > You can't fix what you can't see. You can't see what you don't measure.
13
- > Every request gets a trace. Every error gets structured context. Every SLO has an error budget.
14
-
15
- ---
16
-
17
- ## The Three Pillars
18
-
19
- ```
20
- Logs → WHAT happened (structured events)
21
- Traces → WHERE it happened (request flow across services)
22
- Metrics → HOW MUCH is happening (counters, histograms, gauges)
23
-
24
- All three are needed. Logs alone are not observability.
25
- ```
26
-
27
- ---
28
-
29
- ## Structured Logging
30
-
31
- ```typescript
32
- import pino from "pino";
33
-
34
- // ✅ Structured JSON logging
35
- const logger = pino({
36
- level: process.env.LOG_LEVEL ?? "info",
37
- timestamp: pino.stdTimeFunctions.isoTime,
38
- ...(process.env.NODE_ENV === "development" && {
39
- transport: { target: "pino-pretty" },
40
- }),
41
- });
42
-
43
- // ✅ GOOD: Structured with context
44
- logger.info({ userId: user.id, action: "login", ip: req.ip }, "User logged in");
45
- logger.error({ err, orderId: order.id, paymentGateway: "stripe" }, "Payment failed");
46
- logger.warn({ queueDepth: 1500, threshold: 1000 }, "Queue depth exceeding threshold");
47
-
48
- // ❌ BAD: Unstructured string logging
49
- console.log("User " + user.id + " logged in from " + req.ip);
50
- console.log("Error: " + error.message);
51
-
52
- // HALLUCINATION TRAP: console.log is NOT production logging
53
- // - No severity levels (info/warn/error)
54
- // - No structured fields (can't search/filter)
55
- // - No timestamps in ISO format
56
- // - Can't be collected by log aggregators
57
- // Use Pino (Node.js) or structlog (Python) for production
58
- ```
59
-
60
- ### Log Levels
61
-
62
- ```
63
- fatal → App is crashing, immediate attention required
64
- error → Operation failed, needs investigation
65
- warn → Something unexpected, but app continues
66
- info → Business events (user login, order placed, deploy)
67
- debug → Technical details (query timing, cache hit/miss)
68
- trace → Verbose debugging (only in development)
69
-
70
- Rules:
71
- - Production default: info
72
- - Never log PII (names, emails, SSNs) at any level
73
- - Never log secrets (tokens, passwords, API keys)
74
- - Log request IDs for correlation
75
- - Log durations for performance tracking
76
- ```
77
-
78
- ### Request Context / Correlation
79
-
80
- ```typescript
81
- import { AsyncLocalStorage } from "node:async_hooks";
82
-
83
- const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>();
84
-
85
- // Middleware: set context per request
86
- app.use((req, res, next) => {
87
- const requestId = req.headers["x-request-id"]?.toString() ?? crypto.randomUUID();
88
- res.setHeader("x-request-id", requestId);
89
- requestContext.run({ requestId, userId: req.user?.id }, next);
90
- });
91
-
92
- // Child logger with context
93
- function getLogger() {
94
- const ctx = requestContext.getStore();
95
- return logger.child({
96
- requestId: ctx?.requestId,
97
- userId: ctx?.userId,
98
- });
99
- }
100
-
101
- // Every log from this request includes requestId and userId
102
- const log = getLogger();
103
- log.info("Processing order"); // { requestId: "abc-123", userId: "42", msg: "Processing order" }
104
- ```
105
-
106
- ---
107
-
108
- ## Distributed Tracing (OpenTelemetry)
109
-
110
- ```typescript
111
- import { NodeSDK } from "@opentelemetry/sdk-node";
112
- import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
113
- import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
114
-
115
- // Initialize OpenTelemetry
116
- const sdk = new NodeSDK({
117
- traceExporter: new OTLPTraceExporter({
118
- url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://localhost:4318/v1/traces",
119
- }),
120
- instrumentations: [
121
- getNodeAutoInstrumentations({
122
- "@opentelemetry/instrumentation-http": { enabled: true },
123
- "@opentelemetry/instrumentation-express": { enabled: true },
124
- "@opentelemetry/instrumentation-pg": { enabled: true },
125
- "@opentelemetry/instrumentation-redis": { enabled: true },
126
- }),
127
- ],
128
- });
129
-
130
- sdk.start();
131
-
132
- // Manual span for custom business logic
133
- import { trace } from "@opentelemetry/api";
134
-
135
- const tracer = trace.getTracer("order-service");
136
-
137
- async function processOrder(order: Order) {
138
- return tracer.startActiveSpan("processOrder", async (span) => {
139
- try {
140
- span.setAttribute("order.id", order.id);
141
- span.setAttribute("order.total", order.total);
142
- span.setAttribute("order.items.count", order.items.length);
143
-
144
- const result = await executeOrder(order);
145
- span.setStatus({ code: SpanStatusCode.OK });
146
- return result;
147
- } catch (error) {
148
- span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
149
- span.recordException(error);
150
- throw error;
151
- } finally {
152
- span.end();
153
- }
154
- });
155
- }
156
- ```
157
-
158
- ---
159
-
160
- ## Metrics
161
-
162
- ```typescript
163
- import { metrics } from "@opentelemetry/api";
164
-
165
- const meter = metrics.getMeter("api-server");
166
-
167
- // Counter — things that only go up
168
- const requestCounter = meter.createCounter("http.requests.total", {
169
- description: "Total HTTP requests",
170
- });
171
-
172
- // Histogram — request durations
173
- const requestDuration = meter.createHistogram("http.request.duration_ms", {
174
- description: "HTTP request duration in milliseconds",
175
- unit: "ms",
176
- });
177
-
178
- // Gauge — current values
179
- const activeConnections = meter.createUpDownCounter("db.connections.active", {
180
- description: "Active database connections",
181
- });
182
-
183
- // Middleware to record metrics
184
- app.use((req, res, next) => {
185
- const start = performance.now();
186
- res.on("finish", () => {
187
- const duration = performance.now() - start;
188
- requestCounter.add(1, {
189
- method: req.method,
190
- path: req.route?.path ?? req.path,
191
- status: res.statusCode.toString(),
192
- });
193
- requestDuration.record(duration, {
194
- method: req.method,
195
- status: res.statusCode.toString(),
196
- });
197
- });
198
- next();
199
- });
200
- ```
201
-
202
- ### Key Metrics to Track
203
-
204
- ```
205
- RED method (for services):
206
- Rate → requests per second
207
- Errors → error rate (4xx, 5xx)
208
- Duration → latency percentiles (P50, P95, P99)
209
-
210
- USE method (for resources):
211
- Utilization → CPU %, memory %, disk %
212
- Saturation → queue depth, thread pool saturation
213
- Errors → disk failures, OOM kills
214
-
215
- Business metrics:
216
- - Sign-ups per hour
217
- - Orders processed per minute
218
- - Revenue per day
219
- - API calls per customer
220
- ```
221
-
222
- ---
223
-
224
- ## SLIs, SLOs & Error Budgets
225
-
226
- ```
227
- SLI (Service Level Indicator) → What you measure
228
- "99.2% of requests complete in <500ms"
229
-
230
- SLO (Service Level Objective) → Your target
231
- "99.9% of requests should complete in <500ms"
232
-
233
- SLA (Service Level Agreement) → Your contract (with penalties)
234
- "99.95% uptime or we refund 10%"
235
-
236
- Error Budget = 100% - SLO
237
- SLO: 99.9% → Error budget: 0.1% → 43 min downtime/month
238
- SLO: 99.5% → Error budget: 0.5% → 3.6 hours downtime/month
239
-
240
- Rules:
241
- - Burn error budget too fast → freeze deployments
242
- - Error budget remaining → ship features faster
243
- - Don't set SLOs you can't measure
244
- - SLOs should be slightly below actual performance
245
- ```
246
-
247
- ---
248
-
249
- ## Health Checks
250
-
251
- ```typescript
252
- // Liveness: Is the process running?
253
- app.get("/health/live", (req, res) => {
254
- res.status(200).json({ status: "ok" });
255
- });
256
-
257
- // Readiness: Can it accept traffic?
258
- app.get("/health/ready", async (req, res) => {
259
- try {
260
- await db.raw("SELECT 1"); // database check
261
- await redis.ping(); // cache check
262
- res.status(200).json({
263
- status: "ready",
264
- checks: { database: "ok", cache: "ok" },
265
- });
266
- } catch (error) {
267
- res.status(503).json({
268
- status: "not ready",
269
- checks: { database: error.message },
270
- });
271
- }
272
- });
273
-
274
- // ❌ HALLUCINATION TRAP: Liveness ≠ Readiness
275
- // Liveness fails → container restarts (only for unrecoverable states)
276
- // Readiness fails → stop sending traffic (temporary — DB down, etc.)
277
- // Making liveness check the DB → DB outage restarts all containers → cascade failure
278
- ```
279
-
280
- ---
281
-
282
- ## Alerting
283
-
284
- ```
285
- Alert design rules:
286
- 1. Alert on SYMPTOMS, not causes (high latency, not "CPU is 80%")
287
- 2. Every alert must have a runbook link
288
- 3. Every alert must be ACTIONABLE — if you can't do anything, it's a notification
289
- 4. Use severity levels:
290
- - Critical → page on-call (customer-facing outage)
291
- - Warning → Slack notification (degraded, not broken)
292
- - Info → dashboard only (awareness)
293
- 5. Avoid alert fatigue — fewer, meaningful alerts beat many noisy ones
294
- ```
295
-
296
- ---
297
-
298
- ## 🤖 LLM-Specific Traps
299
-
300
- 1. **`console.log` in Production:** Use structured logging (Pino/Winston). `console.log` can't be searched or filtered.
301
- 2. **Logging PII:** Never log emails, names, passwords, or tokens. Use redaction.
302
- 3. **Liveness Checking Dependencies:** Liveness probes must NOT check DB/Redis. Only readiness probes check dependencies.
303
- 4. **Alerting on Causes:** "CPU is 80%" is not actionable. Alert on "P95 latency > 1s" instead.
304
- 5. **Missing Request IDs:** Without correlation IDs, debugging distributed systems is impossible.
305
- 6. **Percentiles vs Averages:** Average latency hides outliers. Track P50, P95, P99.
306
- 7. **No Error Budgets:** Without SLOs and error budgets, "availability" is subjective.
307
- 8. **Metrics Without Labels:** `requests_total` without `method`, `path`, `status` labels is useless.
308
- 9. **Tracing Without Sampling:** 100% trace collection is expensive. Use head-based or tail-based sampling.
309
- 10. **Log Levels in Code:** Hardcoded `logger.debug()` everywhere. Use configurable log levels via env.
310
-
311
- ---
312
-
313
- ## 🏛️ Tribunal Integration
314
-
315
- **Slash command: `/tribunal-backend`**
316
-
317
- ### ✅ Pre-Flight Self-Audit
318
-
319
- ```
320
- ✅ Am I using structured logging (not console.log)?
321
- ✅ Do all logs include requestId for correlation?
322
- ✅ Am I NOT logging PII or secrets?
323
- ✅ Are liveness and readiness checks separate?
324
- ✅ Is OpenTelemetry tracing configured?
325
- ✅ Am I tracking RED metrics (Rate, Errors, Duration)?
326
- ✅ Are SLOs defined with error budgets?
327
- ✅ Do alerts have runbook links?
328
- ✅ Am I alerting on symptoms (not causes)?
329
- ✅ Are log levels configurable via environment variable?
330
- ```
1
+ ---
2
+ name: observability
3
+ description: Production observability mastery. Structured logging (Pino/Winston), OpenTelemetry tracing, metrics (Prometheus/Grafana), SLIs/SLOs/error budgets, distributed tracing, alerting design, health checks, and AI observability. Use when setting up monitoring, debugging production issues, or designing observable distributed systems.
4
+ allowed-tools: Read, Write, Edit, Glob, Grep
5
+ version: 2.0.0
6
+ last-updated: 2026-04-01
7
+ applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
8
+ ---
9
+
10
+ # Observability — Production Monitoring Mastery
11
+
12
+ ---
13
+
14
+ ## The Three Pillars
15
+
16
+ ```
17
+ Logs → WHAT happened (structured events)
18
+ Traces → WHERE it happened (request flow across services)
19
+ Metrics → HOW MUCH is happening (counters, histograms, gauges)
20
+
21
+ All three are needed. Logs alone are not observability.
22
+ ```
23
+
24
+ ---
25
+
26
+ ## Structured Logging
27
+
28
+ ```typescript
29
+ import pino from "pino";
30
+
31
+ // ✅ Structured JSON logging
32
+ const logger = pino({
33
+ level: process.env.LOG_LEVEL ?? "info",
34
+ timestamp: pino.stdTimeFunctions.isoTime,
35
+ ...(process.env.NODE_ENV === "development" && {
36
+ transport: { target: "pino-pretty" },
37
+ }),
38
+ });
39
+
40
+ // ✅ GOOD: Structured with context
41
+ logger.info({ userId: user.id, action: "login", ip: req.ip }, "User logged in");
42
+ logger.error({ err, orderId: order.id, paymentGateway: "stripe" }, "Payment failed");
43
+ logger.warn({ queueDepth: 1500, threshold: 1000 }, "Queue depth exceeding threshold");
44
+
45
+ // ❌ BAD: Unstructured string logging
46
+ console.log("User " + user.id + " logged in from " + req.ip);
47
+ console.log("Error: " + error.message);
48
+
49
+ // HALLUCINATION TRAP: console.log is NOT production logging
50
+ // - No severity levels (info/warn/error)
51
+ // - No structured fields (can't search/filter)
52
+ // - No timestamps in ISO format
53
+ // - Can't be collected by log aggregators
54
+ // Use Pino (Node.js) or structlog (Python) for production
55
+ ```
56
+
57
+ ### Log Levels
58
+
59
+ ```
60
+ fatal → App is crashing, immediate attention required
61
+ error → Operation failed, needs investigation
62
+ warn → Something unexpected, but app continues
63
+ info → Business events (user login, order placed, deploy)
64
+ debug → Technical details (query timing, cache hit/miss)
65
+ trace → Verbose debugging (only in development)
66
+
67
+ Rules:
68
+ - Production default: info
69
+ - Never log PII (names, emails, SSNs) at any level
70
+ - Never log secrets (tokens, passwords, API keys)
71
+ - Log request IDs for correlation
72
+ - Log durations for performance tracking
73
+ ```
74
+
75
+ ### Request Context / Correlation
76
+
77
+ ```typescript
78
+ import { AsyncLocalStorage } from "node:async_hooks";
79
+
80
+ const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>();
81
+
82
+ // Middleware: set context per request
83
+ app.use((req, res, next) => {
84
+ const requestId = req.headers["x-request-id"]?.toString() ?? crypto.randomUUID();
85
+ res.setHeader("x-request-id", requestId);
86
+ requestContext.run({ requestId, userId: req.user?.id }, next);
87
+ });
88
+
89
+ // Child logger with context
90
+ function getLogger() {
91
+ const ctx = requestContext.getStore();
92
+ return logger.child({
93
+ requestId: ctx?.requestId,
94
+ userId: ctx?.userId,
95
+ });
96
+ }
97
+
98
+ // Every log from this request includes requestId and userId
99
+ const log = getLogger();
100
+ log.info("Processing order"); // { requestId: "abc-123", userId: "42", msg: "Processing order" }
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Distributed Tracing (OpenTelemetry)
106
+
107
+ ```typescript
108
+ import { NodeSDK } from "@opentelemetry/sdk-node";
109
+ import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
110
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
111
+
112
+ // Initialize OpenTelemetry
113
+ const sdk = new NodeSDK({
114
+ traceExporter: new OTLPTraceExporter({
115
+ url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://localhost:4318/v1/traces",
116
+ }),
117
+ instrumentations: [
118
+ getNodeAutoInstrumentations({
119
+ "@opentelemetry/instrumentation-http": { enabled: true },
120
+ "@opentelemetry/instrumentation-express": { enabled: true },
121
+ "@opentelemetry/instrumentation-pg": { enabled: true },
122
+ "@opentelemetry/instrumentation-redis": { enabled: true },
123
+ }),
124
+ ],
125
+ });
126
+
127
+ sdk.start();
128
+
129
+ // Manual span for custom business logic
130
+ import { trace } from "@opentelemetry/api";
131
+
132
+ const tracer = trace.getTracer("order-service");
133
+
134
+ async function processOrder(order: Order) {
135
+ return tracer.startActiveSpan("processOrder", async (span) => {
136
+ try {
137
+ span.setAttribute("order.id", order.id);
138
+ span.setAttribute("order.total", order.total);
139
+ span.setAttribute("order.items.count", order.items.length);
140
+
141
+ const result = await executeOrder(order);
142
+ span.setStatus({ code: SpanStatusCode.OK });
143
+ return result;
144
+ } catch (error) {
145
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
146
+ span.recordException(error);
147
+ throw error;
148
+ } finally {
149
+ span.end();
150
+ }
151
+ });
152
+ }
153
+ ```
154
+
155
+ ---
156
+
157
+ ## Metrics
158
+
159
+ ```typescript
160
+ import { metrics } from "@opentelemetry/api";
161
+
162
+ const meter = metrics.getMeter("api-server");
163
+
164
+ // Counter — things that only go up
165
+ const requestCounter = meter.createCounter("http.requests.total", {
166
+ description: "Total HTTP requests",
167
+ });
168
+
169
+ // Histogram — request durations
170
+ const requestDuration = meter.createHistogram("http.request.duration_ms", {
171
+ description: "HTTP request duration in milliseconds",
172
+ unit: "ms",
173
+ });
174
+
175
+ // Gauge — current values
176
+ const activeConnections = meter.createUpDownCounter("db.connections.active", {
177
+ description: "Active database connections",
178
+ });
179
+
180
+ // Middleware to record metrics
181
+ app.use((req, res, next) => {
182
+ const start = performance.now();
183
+ res.on("finish", () => {
184
+ const duration = performance.now() - start;
185
+ requestCounter.add(1, {
186
+ method: req.method,
187
+ path: req.route?.path ?? req.path,
188
+ status: res.statusCode.toString(),
189
+ });
190
+ requestDuration.record(duration, {
191
+ method: req.method,
192
+ status: res.statusCode.toString(),
193
+ });
194
+ });
195
+ next();
196
+ });
197
+ ```
198
+
199
+ ### Key Metrics to Track
200
+
201
+ ```
202
+ RED method (for services):
203
+ Rate → requests per second
204
+ Errors → error rate (4xx, 5xx)
205
+ Duration → latency percentiles (P50, P95, P99)
206
+
207
+ USE method (for resources):
208
+ Utilization → CPU %, memory %, disk %
209
+ Saturation → queue depth, thread pool saturation
210
+ Errors → disk failures, OOM kills
211
+
212
+ Business metrics:
213
+ - Sign-ups per hour
214
+ - Orders processed per minute
215
+ - Revenue per day
216
+ - API calls per customer
217
+ ```
218
+
219
+ ---
220
+
221
+ ## SLIs, SLOs & Error Budgets
222
+
223
+ ```
224
+ SLI (Service Level Indicator) → What you measure
225
+ "99.2% of requests complete in <500ms"
226
+
227
+ SLO (Service Level Objective) → Your target
228
+ "99.9% of requests should complete in <500ms"
229
+
230
+ SLA (Service Level Agreement) → Your contract (with penalties)
231
+ "99.95% uptime or we refund 10%"
232
+
233
+ Error Budget = 100% - SLO
234
+ SLO: 99.9% → Error budget: 0.1% → 43 min downtime/month
235
+ SLO: 99.5% → Error budget: 0.5% → 3.6 hours downtime/month
236
+
237
+ Rules:
238
+ - Burn error budget too fast → freeze deployments
239
+ - Error budget remaining → ship features faster
240
+ - Don't set SLOs you can't measure
241
+ - SLOs should be slightly below actual performance
242
+ ```
243
+
244
+ ---
245
+
246
+ ## Health Checks
247
+
248
+ ```typescript
249
+ // Liveness: Is the process running?
250
+ app.get("/health/live", (req, res) => {
251
+ res.status(200).json({ status: "ok" });
252
+ });
253
+
254
+ // Readiness: Can it accept traffic?
255
+ app.get("/health/ready", async (req, res) => {
256
+ try {
257
+ await db.raw("SELECT 1"); // database check
258
+ await redis.ping(); // cache check
259
+ res.status(200).json({
260
+ status: "ready",
261
+ checks: { database: "ok", cache: "ok" },
262
+ });
263
+ } catch (error) {
264
+ res.status(503).json({
265
+ status: "not ready",
266
+ checks: { database: error.message },
267
+ });
268
+ }
269
+ });
270
+
271
+ // ❌ HALLUCINATION TRAP: Liveness ≠ Readiness
272
+ // Liveness fails → container restarts (only for unrecoverable states)
273
+ // Readiness fails → stop sending traffic (temporary — DB down, etc.)
274
+ // Making liveness check the DB → DB outage restarts all containers → cascade failure
275
+ ```
276
+
277
+ ---
278
+
279
+ ## Alerting
280
+
281
+ ```
282
+ Alert design rules:
283
+ 1. Alert on SYMPTOMS, not causes (high latency, not "CPU is 80%")
284
+ 2. Every alert must have a runbook link
285
+ 3. Every alert must be ACTIONABLE — if you can't do anything, it's a notification
286
+ 4. Use severity levels:
287
+ - Critical → page on-call (customer-facing outage)
288
+ - Warning → Slack notification (degraded, not broken)
289
+ - Info → dashboard only (awareness)
290
+ 5. Avoid alert fatigue — fewer, meaningful alerts beat many noisy ones
291
+ ```
292
+
293
+ ---