tribunal-kit 3.0.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. package/.agent/ARCHITECTURE.md +99 -99
  2. package/.agent/GEMINI.md +52 -52
  3. package/.agent/agents/accessibility-reviewer.md +187 -220
  4. package/.agent/agents/ai-code-reviewer.md +199 -233
  5. package/.agent/agents/backend-specialist.md +215 -238
  6. package/.agent/agents/code-archaeologist.md +161 -181
  7. package/.agent/agents/database-architect.md +184 -207
  8. package/.agent/agents/debugger.md +191 -218
  9. package/.agent/agents/dependency-reviewer.md +103 -136
  10. package/.agent/agents/devops-engineer.md +218 -238
  11. package/.agent/agents/documentation-writer.md +201 -221
  12. package/.agent/agents/explorer-agent.md +160 -180
  13. package/.agent/agents/frontend-reviewer.md +160 -194
  14. package/.agent/agents/frontend-specialist.md +248 -237
  15. package/.agent/agents/game-developer.md +48 -52
  16. package/.agent/agents/logic-reviewer.md +116 -149
  17. package/.agent/agents/mobile-developer.md +200 -223
  18. package/.agent/agents/mobile-reviewer.md +162 -195
  19. package/.agent/agents/orchestrator.md +181 -211
  20. package/.agent/agents/penetration-tester.md +157 -174
  21. package/.agent/agents/performance-optimizer.md +183 -203
  22. package/.agent/agents/performance-reviewer.md +178 -211
  23. package/.agent/agents/precedence-reviewer.md +213 -0
  24. package/.agent/agents/product-manager.md +142 -162
  25. package/.agent/agents/product-owner.md +6 -25
  26. package/.agent/agents/project-planner.md +142 -162
  27. package/.agent/agents/qa-automation-engineer.md +225 -242
  28. package/.agent/agents/security-auditor.md +174 -194
  29. package/.agent/agents/seo-specialist.md +193 -213
  30. package/.agent/agents/sql-reviewer.md +161 -194
  31. package/.agent/agents/supervisor-agent.md +184 -203
  32. package/.agent/agents/swarm-worker-contracts.md +17 -17
  33. package/.agent/agents/swarm-worker-registry.md +46 -46
  34. package/.agent/agents/test-coverage-reviewer.md +160 -193
  35. package/.agent/agents/test-engineer.md +0 -21
  36. package/.agent/agents/type-safety-reviewer.md +175 -208
  37. package/.agent/patterns/generator.md +9 -9
  38. package/.agent/patterns/inversion.md +12 -12
  39. package/.agent/patterns/pipeline.md +9 -9
  40. package/.agent/patterns/reviewer.md +13 -13
  41. package/.agent/patterns/tool-wrapper.md +9 -9
  42. package/.agent/rules/GEMINI.md +63 -63
  43. package/.agent/scripts/append_flow.js +72 -0
  44. package/.agent/scripts/case_law_manager.py +525 -0
  45. package/.agent/scripts/compress_skills.py +167 -0
  46. package/.agent/scripts/consolidate_skills.py +173 -0
  47. package/.agent/scripts/deep_compress.py +202 -0
  48. package/.agent/scripts/minify_context.py +80 -0
  49. package/.agent/scripts/security_scan.py +1 -1
  50. package/.agent/scripts/skill_evolution.py +563 -0
  51. package/.agent/scripts/strip_tribunal.py +41 -0
  52. package/.agent/skills/agent-organizer/SKILL.md +100 -126
  53. package/.agent/skills/agentic-patterns/SKILL.md +0 -70
  54. package/.agent/skills/ai-prompt-injection-defense/SKILL.md +134 -160
  55. package/.agent/skills/api-patterns/SKILL.md +123 -215
  56. package/.agent/skills/api-security-auditor/SKILL.md +143 -177
  57. package/.agent/skills/app-builder/SKILL.md +334 -50
  58. package/.agent/skills/app-builder/templates/SKILL.md +13 -15
  59. package/.agent/skills/app-builder/templates/astro-static/TEMPLATE.md +16 -16
  60. package/.agent/skills/app-builder/templates/chrome-extension/TEMPLATE.md +22 -22
  61. package/.agent/skills/app-builder/templates/cli-tool/TEMPLATE.md +18 -18
  62. package/.agent/skills/app-builder/templates/electron-desktop/TEMPLATE.md +20 -20
  63. package/.agent/skills/app-builder/templates/express-api/TEMPLATE.md +17 -17
  64. package/.agent/skills/app-builder/templates/flutter-app/TEMPLATE.md +18 -18
  65. package/.agent/skills/app-builder/templates/monorepo-turborepo/TEMPLATE.md +21 -21
  66. package/.agent/skills/app-builder/templates/nextjs-fullstack/TEMPLATE.md +19 -19
  67. package/.agent/skills/app-builder/templates/nextjs-saas/TEMPLATE.md +26 -26
  68. package/.agent/skills/app-builder/templates/nextjs-static/TEMPLATE.md +26 -26
  69. package/.agent/skills/app-builder/templates/nuxt-app/TEMPLATE.md +19 -19
  70. package/.agent/skills/app-builder/templates/python-fastapi/TEMPLATE.md +18 -18
  71. package/.agent/skills/app-builder/templates/react-native-app/TEMPLATE.md +20 -20
  72. package/.agent/skills/appflow-wireframe/SKILL.md +95 -121
  73. package/.agent/skills/architecture/SKILL.md +169 -331
  74. package/.agent/skills/authentication-best-practices/SKILL.md +139 -173
  75. package/.agent/skills/bash-linux/SKILL.md +129 -154
  76. package/.agent/skills/behavioral-modes/SKILL.md +8 -69
  77. package/.agent/skills/brainstorming/SKILL.md +436 -104
  78. package/.agent/skills/building-native-ui/SKILL.md +152 -174
  79. package/.agent/skills/clean-code/SKILL.md +331 -360
  80. package/.agent/skills/code-review-checklist/SKILL.md +0 -62
  81. package/.agent/skills/config-validator/SKILL.md +115 -141
  82. package/.agent/skills/csharp-developer/SKILL.md +468 -528
  83. package/.agent/skills/database-design/SKILL.md +104 -369
  84. package/.agent/skills/deployment-procedures/SKILL.md +119 -145
  85. package/.agent/skills/devops-engineer/SKILL.md +295 -332
  86. package/.agent/skills/devops-incident-responder/SKILL.md +87 -113
  87. package/.agent/skills/doc.md +5 -5
  88. package/.agent/skills/documentation-templates/SKILL.md +27 -63
  89. package/.agent/skills/edge-computing/SKILL.md +131 -157
  90. package/.agent/skills/extract-design-system/SKILL.md +108 -134
  91. package/.agent/skills/framer-motion-expert/SKILL.md +111 -855
  92. package/.agent/skills/frontend-design/SKILL.md +151 -499
  93. package/.agent/skills/game-design-expert/SKILL.md +79 -105
  94. package/.agent/skills/game-engineering-expert/SKILL.md +96 -122
  95. package/.agent/skills/geo-fundamentals/SKILL.md +97 -124
  96. package/.agent/skills/github-operations/SKILL.md +279 -314
  97. package/.agent/skills/gsap-expert/SKILL.md +119 -826
  98. package/.agent/skills/i18n-localization/SKILL.md +113 -138
  99. package/.agent/skills/intelligent-routing/SKILL.md +167 -127
  100. package/.agent/skills/lint-and-validate/SKILL.md +16 -52
  101. package/.agent/skills/llm-engineering/SKILL.md +344 -357
  102. package/.agent/skills/local-first/SKILL.md +128 -154
  103. package/.agent/skills/mcp-builder/SKILL.md +92 -118
  104. package/.agent/skills/mobile-design/SKILL.md +213 -219
  105. package/.agent/skills/motion-engineering/SKILL.md +184 -0
  106. package/.agent/skills/nextjs-react-expert/SKILL.md +99 -698
  107. package/.agent/skills/nodejs-best-practices/SKILL.md +498 -559
  108. package/.agent/skills/observability/SKILL.md +293 -330
  109. package/.agent/skills/parallel-agents/SKILL.md +96 -122
  110. package/.agent/skills/performance-profiling/SKILL.md +217 -254
  111. package/.agent/skills/plan-writing/SKILL.md +92 -118
  112. package/.agent/skills/platform-engineer/SKILL.md +97 -123
  113. package/.agent/skills/playwright-best-practices/SKILL.md +137 -162
  114. package/.agent/skills/powershell-windows/SKILL.md +112 -146
  115. package/.agent/skills/project-idioms/SKILL.md +87 -0
  116. package/.agent/skills/python-patterns/SKILL.md +15 -35
  117. package/.agent/skills/python-pro/SKILL.md +148 -754
  118. package/.agent/skills/react-specialist/SKILL.md +123 -827
  119. package/.agent/skills/readme-builder/SKILL.md +23 -85
  120. package/.agent/skills/realtime-patterns/SKILL.md +269 -304
  121. package/.agent/skills/red-team-tactics/SKILL.md +18 -51
  122. package/.agent/skills/rust-pro/SKILL.md +623 -701
  123. package/.agent/skills/seo-fundamentals/SKILL.md +129 -154
  124. package/.agent/skills/server-management/SKILL.md +164 -190
  125. package/.agent/skills/shadcn-ui-expert/SKILL.md +181 -206
  126. package/.agent/skills/skill-creator/SKILL.md +24 -56
  127. package/.agent/skills/sql-pro/SKILL.md +579 -633
  128. package/.agent/skills/supabase-postgres-best-practices/SKILL.md +35 -66
  129. package/.agent/skills/swiftui-expert/SKILL.md +151 -176
  130. package/.agent/skills/systematic-debugging/SKILL.md +92 -118
  131. package/.agent/skills/tailwind-patterns/SKILL.md +516 -576
  132. package/.agent/skills/tdd-workflow/SKILL.md +111 -137
  133. package/.agent/skills/test-result-analyzer/SKILL.md +33 -73
  134. package/.agent/skills/testing-patterns/SKILL.md +512 -573
  135. package/.agent/skills/trend-researcher/SKILL.md +30 -71
  136. package/.agent/skills/ui-ux-pro-max/SKILL.md +8 -41
  137. package/.agent/skills/ui-ux-researcher/SKILL.md +51 -91
  138. package/.agent/skills/vue-expert/SKILL.md +127 -866
  139. package/.agent/skills/vulnerability-scanner/SKILL.md +354 -269
  140. package/.agent/skills/web-accessibility-auditor/SKILL.md +168 -193
  141. package/.agent/skills/web-design-guidelines/SKILL.md +25 -61
  142. package/.agent/skills/webapp-testing/SKILL.md +119 -145
  143. package/.agent/skills/whimsy-injector/SKILL.md +58 -132
  144. package/.agent/skills/workflow-optimizer/SKILL.md +28 -68
  145. package/.agent/workflows/api-tester.md +151 -151
  146. package/.agent/workflows/audit.md +127 -138
  147. package/.agent/workflows/brainstorm.md +110 -110
  148. package/.agent/workflows/changelog.md +112 -112
  149. package/.agent/workflows/create.md +124 -124
  150. package/.agent/workflows/debug.md +165 -189
  151. package/.agent/workflows/deploy.md +180 -189
  152. package/.agent/workflows/enhance.md +128 -151
  153. package/.agent/workflows/fix.md +114 -135
  154. package/.agent/workflows/generate.md +13 -4
  155. package/.agent/workflows/migrate.md +160 -160
  156. package/.agent/workflows/orchestrate.md +168 -168
  157. package/.agent/workflows/performance-benchmarker.md +114 -123
  158. package/.agent/workflows/plan.md +173 -173
  159. package/.agent/workflows/preview.md +80 -80
  160. package/.agent/workflows/refactor.md +161 -183
  161. package/.agent/workflows/review-ai.md +101 -129
  162. package/.agent/workflows/review.md +116 -116
  163. package/.agent/workflows/session.md +94 -94
  164. package/.agent/workflows/status.md +79 -79
  165. package/.agent/workflows/strengthen-skills.md +138 -139
  166. package/.agent/workflows/swarm.md +179 -179
  167. package/.agent/workflows/test.md +189 -211
  168. package/.agent/workflows/tribunal-backend.md +94 -113
  169. package/.agent/workflows/tribunal-database.md +95 -115
  170. package/.agent/workflows/tribunal-frontend.md +96 -118
  171. package/.agent/workflows/tribunal-full.md +93 -133
  172. package/.agent/workflows/tribunal-mobile.md +95 -119
  173. package/.agent/workflows/tribunal-performance.md +110 -133
  174. package/.agent/workflows/ui-ux-pro-max.md +122 -143
  175. package/README.md +30 -1
  176. package/bin/tribunal-kit.js +175 -12
  177. package/package.json +25 -4
  178. package/.agent/skills/api-patterns/api-style.md +0 -42
  179. package/.agent/skills/api-patterns/auth.md +0 -24
  180. package/.agent/skills/api-patterns/documentation.md +0 -26
  181. package/.agent/skills/api-patterns/graphql.md +0 -41
  182. package/.agent/skills/api-patterns/rate-limiting.md +0 -31
  183. package/.agent/skills/api-patterns/response.md +0 -37
  184. package/.agent/skills/api-patterns/rest.md +0 -40
  185. package/.agent/skills/api-patterns/security-testing.md +0 -122
  186. package/.agent/skills/api-patterns/trpc.md +0 -41
  187. package/.agent/skills/api-patterns/versioning.md +0 -22
  188. package/.agent/skills/app-builder/agent-coordination.md +0 -71
  189. package/.agent/skills/app-builder/feature-building.md +0 -53
  190. package/.agent/skills/app-builder/project-detection.md +0 -34
  191. package/.agent/skills/app-builder/scaffolding.md +0 -118
  192. package/.agent/skills/app-builder/tech-stack.md +0 -40
  193. package/.agent/skills/architecture/context-discovery.md +0 -43
  194. package/.agent/skills/architecture/examples.md +0 -94
  195. package/.agent/skills/architecture/pattern-selection.md +0 -68
  196. package/.agent/skills/architecture/patterns-reference.md +0 -50
  197. package/.agent/skills/architecture/trade-off-analysis.md +0 -77
  198. package/.agent/skills/brainstorming/dynamic-questioning.md +0 -360
  199. package/.agent/skills/database-design/database-selection.md +0 -43
  200. package/.agent/skills/database-design/indexing.md +0 -39
  201. package/.agent/skills/database-design/migrations.md +0 -48
  202. package/.agent/skills/database-design/optimization.md +0 -36
  203. package/.agent/skills/database-design/orm-selection.md +0 -30
  204. package/.agent/skills/database-design/schema-design.md +0 -56
  205. package/.agent/skills/frontend-design/animation-guide.md +0 -331
  206. package/.agent/skills/frontend-design/color-system.md +0 -329
  207. package/.agent/skills/frontend-design/decision-trees.md +0 -418
  208. package/.agent/skills/frontend-design/motion-graphics.md +0 -306
  209. package/.agent/skills/frontend-design/typography-system.md +0 -363
  210. package/.agent/skills/frontend-design/ux-psychology.md +0 -1116
  211. package/.agent/skills/frontend-design/visual-effects.md +0 -383
  212. package/.agent/skills/intelligent-routing/router-manifest.md +0 -65
  213. package/.agent/skills/mobile-design/decision-trees.md +0 -516
  214. package/.agent/skills/mobile-design/mobile-backend.md +0 -491
  215. package/.agent/skills/mobile-design/mobile-color-system.md +0 -420
  216. package/.agent/skills/mobile-design/mobile-debugging.md +0 -122
  217. package/.agent/skills/mobile-design/mobile-design-thinking.md +0 -357
  218. package/.agent/skills/mobile-design/mobile-navigation.md +0 -458
  219. package/.agent/skills/mobile-design/mobile-performance.md +0 -767
  220. package/.agent/skills/mobile-design/mobile-testing.md +0 -356
  221. package/.agent/skills/mobile-design/mobile-typography.md +0 -433
  222. package/.agent/skills/mobile-design/platform-android.md +0 -666
  223. package/.agent/skills/mobile-design/platform-ios.md +0 -561
  224. package/.agent/skills/mobile-design/touch-psychology.md +0 -537
  225. package/.agent/skills/nextjs-react-expert/1-async-eliminating-waterfalls.md +0 -312
  226. package/.agent/skills/nextjs-react-expert/2-bundle-bundle-size-optimization.md +0 -240
  227. package/.agent/skills/nextjs-react-expert/3-server-server-side-performance.md +0 -490
  228. package/.agent/skills/nextjs-react-expert/4-client-client-side-data-fetching.md +0 -264
  229. package/.agent/skills/nextjs-react-expert/5-rerender-re-render-optimization.md +0 -581
  230. package/.agent/skills/nextjs-react-expert/6-rendering-rendering-performance.md +0 -432
  231. package/.agent/skills/nextjs-react-expert/7-js-javascript-performance.md +0 -684
  232. package/.agent/skills/nextjs-react-expert/8-advanced-advanced-patterns.md +0 -150
  233. package/.agent/skills/vulnerability-scanner/checklists.md +0 -121
@@ -1,330 +1,293 @@
1
- ---
2
- name: observability
3
- description: Production observability mastery. Structured logging (Pino/Winston), OpenTelemetry tracing, metrics (Prometheus/Grafana), SLIs/SLOs/error budgets, distributed tracing, alerting design, health checks, and AI observability. Use when setting up monitoring, debugging production issues, or designing observable distributed systems.
4
- allowed-tools: Read, Write, Edit, Glob, Grep
5
- version: 2.0.0
6
- last-updated: 2026-04-01
7
- applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
8
- ---
9
-
10
- # Observability — Production Monitoring Mastery
11
-
12
- > You can't fix what you can't see. You can't see what you don't measure.
13
- > Every request gets a trace. Every error gets structured context. Every SLO has an error budget.
14
-
15
- ---
16
-
17
- ## The Three Pillars
18
-
19
- ```
20
- Logs → WHAT happened (structured events)
21
- Traces → WHERE it happened (request flow across services)
22
- Metrics → HOW MUCH is happening (counters, histograms, gauges)
23
-
24
- All three are needed. Logs alone are not observability.
25
- ```
26
-
27
- ---
28
-
29
- ## Structured Logging
30
-
31
- ```typescript
32
- import pino from "pino";
33
-
34
- // ✅ Structured JSON logging
35
- const logger = pino({
36
- level: process.env.LOG_LEVEL ?? "info",
37
- timestamp: pino.stdTimeFunctions.isoTime,
38
- ...(process.env.NODE_ENV === "development" && {
39
- transport: { target: "pino-pretty" },
40
- }),
41
- });
42
-
43
- // ✅ GOOD: Structured with context
44
- logger.info({ userId: user.id, action: "login", ip: req.ip }, "User logged in");
45
- logger.error({ err, orderId: order.id, paymentGateway: "stripe" }, "Payment failed");
46
- logger.warn({ queueDepth: 1500, threshold: 1000 }, "Queue depth exceeding threshold");
47
-
48
- // ❌ BAD: Unstructured string logging
49
- console.log("User " + user.id + " logged in from " + req.ip);
50
- console.log("Error: " + error.message);
51
-
52
- // HALLUCINATION TRAP: console.log is NOT production logging
53
- // - No severity levels (info/warn/error)
54
- // - No structured fields (can't search/filter)
55
- // - No timestamps in ISO format
56
- // - Can't be collected by log aggregators
57
- // Use Pino (Node.js) or structlog (Python) for production
58
- ```
59
-
60
- ### Log Levels
61
-
62
- ```
63
- fatal → App is crashing, immediate attention required
64
- error → Operation failed, needs investigation
65
- warn → Something unexpected, but app continues
66
- info → Business events (user login, order placed, deploy)
67
- debug → Technical details (query timing, cache hit/miss)
68
- trace → Verbose debugging (only in development)
69
-
70
- Rules:
71
- - Production default: info
72
- - Never log PII (names, emails, SSNs) at any level
73
- - Never log secrets (tokens, passwords, API keys)
74
- - Log request IDs for correlation
75
- - Log durations for performance tracking
76
- ```
77
-
78
- ### Request Context / Correlation
79
-
80
- ```typescript
81
- import { AsyncLocalStorage } from "node:async_hooks";
82
-
83
- const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>();
84
-
85
- // Middleware: set context per request
86
- app.use((req, res, next) => {
87
- const requestId = req.headers["x-request-id"]?.toString() ?? crypto.randomUUID();
88
- res.setHeader("x-request-id", requestId);
89
- requestContext.run({ requestId, userId: req.user?.id }, next);
90
- });
91
-
92
- // Child logger with context
93
- function getLogger() {
94
- const ctx = requestContext.getStore();
95
- return logger.child({
96
- requestId: ctx?.requestId,
97
- userId: ctx?.userId,
98
- });
99
- }
100
-
101
- // Every log from this request includes requestId and userId
102
- const log = getLogger();
103
- log.info("Processing order"); // { requestId: "abc-123", userId: "42", msg: "Processing order" }
104
- ```
105
-
106
- ---
107
-
108
- ## Distributed Tracing (OpenTelemetry)
109
-
110
- ```typescript
111
- import { NodeSDK } from "@opentelemetry/sdk-node";
112
- import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
113
- import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
114
-
115
- // Initialize OpenTelemetry
116
- const sdk = new NodeSDK({
117
- traceExporter: new OTLPTraceExporter({
118
- url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://localhost:4318/v1/traces",
119
- }),
120
- instrumentations: [
121
- getNodeAutoInstrumentations({
122
- "@opentelemetry/instrumentation-http": { enabled: true },
123
- "@opentelemetry/instrumentation-express": { enabled: true },
124
- "@opentelemetry/instrumentation-pg": { enabled: true },
125
- "@opentelemetry/instrumentation-redis": { enabled: true },
126
- }),
127
- ],
128
- });
129
-
130
- sdk.start();
131
-
132
- // Manual span for custom business logic
133
- import { trace } from "@opentelemetry/api";
134
-
135
- const tracer = trace.getTracer("order-service");
136
-
137
- async function processOrder(order: Order) {
138
- return tracer.startActiveSpan("processOrder", async (span) => {
139
- try {
140
- span.setAttribute("order.id", order.id);
141
- span.setAttribute("order.total", order.total);
142
- span.setAttribute("order.items.count", order.items.length);
143
-
144
- const result = await executeOrder(order);
145
- span.setStatus({ code: SpanStatusCode.OK });
146
- return result;
147
- } catch (error) {
148
- span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
149
- span.recordException(error);
150
- throw error;
151
- } finally {
152
- span.end();
153
- }
154
- });
155
- }
156
- ```
157
-
158
- ---
159
-
160
- ## Metrics
161
-
162
- ```typescript
163
- import { metrics } from "@opentelemetry/api";
164
-
165
- const meter = metrics.getMeter("api-server");
166
-
167
- // Counter — things that only go up
168
- const requestCounter = meter.createCounter("http.requests.total", {
169
- description: "Total HTTP requests",
170
- });
171
-
172
- // Histogram — request durations
173
- const requestDuration = meter.createHistogram("http.request.duration_ms", {
174
- description: "HTTP request duration in milliseconds",
175
- unit: "ms",
176
- });
177
-
178
- // Gauge — current values
179
- const activeConnections = meter.createUpDownCounter("db.connections.active", {
180
- description: "Active database connections",
181
- });
182
-
183
- // Middleware to record metrics
184
- app.use((req, res, next) => {
185
- const start = performance.now();
186
- res.on("finish", () => {
187
- const duration = performance.now() - start;
188
- requestCounter.add(1, {
189
- method: req.method,
190
- path: req.route?.path ?? req.path,
191
- status: res.statusCode.toString(),
192
- });
193
- requestDuration.record(duration, {
194
- method: req.method,
195
- status: res.statusCode.toString(),
196
- });
197
- });
198
- next();
199
- });
200
- ```
201
-
202
- ### Key Metrics to Track
203
-
204
- ```
205
- RED method (for services):
206
- Rate → requests per second
207
- Errors → error rate (4xx, 5xx)
208
- Duration → latency percentiles (P50, P95, P99)
209
-
210
- USE method (for resources):
211
- Utilization → CPU %, memory %, disk %
212
- Saturation → queue depth, thread pool saturation
213
- Errors → disk failures, OOM kills
214
-
215
- Business metrics:
216
- - Sign-ups per hour
217
- - Orders processed per minute
218
- - Revenue per day
219
- - API calls per customer
220
- ```
221
-
222
- ---
223
-
224
- ## SLIs, SLOs & Error Budgets
225
-
226
- ```
227
- SLI (Service Level Indicator) → What you measure
228
- "99.2% of requests complete in <500ms"
229
-
230
- SLO (Service Level Objective) → Your target
231
- "99.9% of requests should complete in <500ms"
232
-
233
- SLA (Service Level Agreement) → Your contract (with penalties)
234
- "99.95% uptime or we refund 10%"
235
-
236
- Error Budget = 100% - SLO
237
- SLO: 99.9% → Error budget: 0.1% → 43 min downtime/month
238
- SLO: 99.5% → Error budget: 0.5% → 3.6 hours downtime/month
239
-
240
- Rules:
241
- - Burn error budget too fast → freeze deployments
242
- - Error budget remaining → ship features faster
243
- - Don't set SLOs you can't measure
244
- - SLOs should be slightly below actual performance
245
- ```
246
-
247
- ---
248
-
249
- ## Health Checks
250
-
251
- ```typescript
252
- // Liveness: Is the process running?
253
- app.get("/health/live", (req, res) => {
254
- res.status(200).json({ status: "ok" });
255
- });
256
-
257
- // Readiness: Can it accept traffic?
258
- app.get("/health/ready", async (req, res) => {
259
- try {
260
- await db.raw("SELECT 1"); // database check
261
- await redis.ping(); // cache check
262
- res.status(200).json({
263
- status: "ready",
264
- checks: { database: "ok", cache: "ok" },
265
- });
266
- } catch (error) {
267
- res.status(503).json({
268
- status: "not ready",
269
- checks: { database: error.message },
270
- });
271
- }
272
- });
273
-
274
- // HALLUCINATION TRAP: Liveness ≠ Readiness
275
- // Liveness fails → container restarts (only for unrecoverable states)
276
- // Readiness fails → stop sending traffic (temporary — DB down, etc.)
277
- // Making liveness check the DB → DB outage restarts all containers → cascade failure
278
- ```
279
-
280
- ---
281
-
282
- ## Alerting
283
-
284
- ```
285
- Alert design rules:
286
- 1. Alert on SYMPTOMS, not causes (high latency, not "CPU is 80%")
287
- 2. Every alert must have a runbook link
288
- 3. Every alert must be ACTIONABLE — if you can't do anything, it's a notification
289
- 4. Use severity levels:
290
- - Critical → page on-call (customer-facing outage)
291
- - Warning → Slack notification (degraded, not broken)
292
- - Info → dashboard only (awareness)
293
- 5. Avoid alert fatigue — fewer, meaningful alerts beat many noisy ones
294
- ```
295
-
296
- ---
297
-
298
- ## 🤖 LLM-Specific Traps
299
-
300
- 1. **`console.log` in Production:** Use structured logging (Pino/Winston). `console.log` can't be searched or filtered.
301
- 2. **Logging PII:** Never log emails, names, passwords, or tokens. Use redaction.
302
- 3. **Liveness Checking Dependencies:** Liveness probes must NOT check DB/Redis. Only readiness probes check dependencies.
303
- 4. **Alerting on Causes:** "CPU is 80%" is not actionable. Alert on "P95 latency > 1s" instead.
304
- 5. **Missing Request IDs:** Without correlation IDs, debugging distributed systems is impossible.
305
- 6. **Percentiles vs Averages:** Average latency hides outliers. Track P50, P95, P99.
306
- 7. **No Error Budgets:** Without SLOs and error budgets, "availability" is subjective.
307
- 8. **Metrics Without Labels:** `requests_total` without `method`, `path`, `status` labels is useless.
308
- 9. **Tracing Without Sampling:** 100% trace collection is expensive. Use head-based or tail-based sampling.
309
- 10. **Log Levels in Code:** Hardcoded `logger.debug()` everywhere. Use configurable log levels via env.
310
-
311
- ---
312
-
313
- ## 🏛️ Tribunal Integration
314
-
315
- **Slash command: `/tribunal-backend`**
316
-
317
- ### ✅ Pre-Flight Self-Audit
318
-
319
- ```
320
- ✅ Am I using structured logging (not console.log)?
321
- ✅ Do all logs include requestId for correlation?
322
- ✅ Am I NOT logging PII or secrets?
323
- ✅ Are liveness and readiness checks separate?
324
- ✅ Is OpenTelemetry tracing configured?
325
- ✅ Am I tracking RED metrics (Rate, Errors, Duration)?
326
- ✅ Are SLOs defined with error budgets?
327
- ✅ Do alerts have runbook links?
328
- ✅ Am I alerting on symptoms (not causes)?
329
- ✅ Are log levels configurable via environment variable?
330
- ```
1
+ ---
2
+ name: observability
3
+ description: Production observability mastery. Structured logging (Pino/Winston), OpenTelemetry tracing, metrics (Prometheus/Grafana), SLIs/SLOs/error budgets, distributed tracing, alerting design, health checks, and AI observability. Use when setting up monitoring, debugging production issues, or designing observable distributed systems.
4
+ allowed-tools: Read, Write, Edit, Glob, Grep
5
+ version: 2.0.0
6
+ last-updated: 2026-04-01
7
+ applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
8
+ ---
9
+
10
+ # Observability — Production Monitoring Mastery
11
+
12
+ ---
13
+
14
+ ## The Three Pillars
15
+
16
+ ```
17
+ Logs → WHAT happened (structured events)
18
+ Traces → WHERE it happened (request flow across services)
19
+ Metrics → HOW MUCH is happening (counters, histograms, gauges)
20
+
21
+ All three are needed. Logs alone are not observability.
22
+ ```
23
+
24
+ ---
25
+
26
+ ## Structured Logging
27
+
28
+ ```typescript
29
+ import pino from "pino";
30
+
31
+ // ✅ Structured JSON logging
32
+ const logger = pino({
33
+ level: process.env.LOG_LEVEL ?? "info",
34
+ timestamp: pino.stdTimeFunctions.isoTime,
35
+ ...(process.env.NODE_ENV === "development" && {
36
+ transport: { target: "pino-pretty" },
37
+ }),
38
+ });
39
+
40
+ // ✅ GOOD: Structured with context
41
+ logger.info({ userId: user.id, action: "login", ip: req.ip }, "User logged in");
42
+ logger.error({ err, orderId: order.id, paymentGateway: "stripe" }, "Payment failed");
43
+ logger.warn({ queueDepth: 1500, threshold: 1000 }, "Queue depth exceeding threshold");
44
+
45
+ // ❌ BAD: Unstructured string logging
46
+ console.log("User " + user.id + " logged in from " + req.ip);
47
+ console.log("Error: " + error.message);
48
+
49
+ // HALLUCINATION TRAP: console.log is NOT production logging
50
+ // - No severity levels (info/warn/error)
51
+ // - No structured fields (can't search/filter)
52
+ // - No timestamps in ISO format
53
+ // - Can't be collected by log aggregators
54
+ // Use Pino (Node.js) or structlog (Python) for production
55
+ ```
56
+
57
+ ### Log Levels
58
+
59
+ ```
60
+ fatal → App is crashing, immediate attention required
61
+ error → Operation failed, needs investigation
62
+ warn → Something unexpected, but app continues
63
+ info → Business events (user login, order placed, deploy)
64
+ debug → Technical details (query timing, cache hit/miss)
65
+ trace → Verbose debugging (only in development)
66
+
67
+ Rules:
68
+ - Production default: info
69
+ - Never log PII (names, emails, SSNs) at any level
70
+ - Never log secrets (tokens, passwords, API keys)
71
+ - Log request IDs for correlation
72
+ - Log durations for performance tracking
73
+ ```
74
+
75
+ ### Request Context / Correlation
76
+
77
+ ```typescript
78
+ import { AsyncLocalStorage } from "node:async_hooks";
79
+
80
+ const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>();
81
+
82
+ // Middleware: set context per request
83
+ app.use((req, res, next) => {
84
+ const requestId = req.headers["x-request-id"]?.toString() ?? crypto.randomUUID();
85
+ res.setHeader("x-request-id", requestId);
86
+ requestContext.run({ requestId, userId: req.user?.id }, next);
87
+ });
88
+
89
+ // Child logger with context
90
+ function getLogger() {
91
+ const ctx = requestContext.getStore();
92
+ return logger.child({
93
+ requestId: ctx?.requestId,
94
+ userId: ctx?.userId,
95
+ });
96
+ }
97
+
98
+ // Every log from this request includes requestId and userId
99
+ const log = getLogger();
100
+ log.info("Processing order"); // { requestId: "abc-123", userId: "42", msg: "Processing order" }
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Distributed Tracing (OpenTelemetry)
106
+
107
+ ```typescript
108
+ import { NodeSDK } from "@opentelemetry/sdk-node";
109
+ import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
110
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
111
+
112
+ // Initialize OpenTelemetry
113
+ const sdk = new NodeSDK({
114
+ traceExporter: new OTLPTraceExporter({
115
+ url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://localhost:4318/v1/traces",
116
+ }),
117
+ instrumentations: [
118
+ getNodeAutoInstrumentations({
119
+ "@opentelemetry/instrumentation-http": { enabled: true },
120
+ "@opentelemetry/instrumentation-express": { enabled: true },
121
+ "@opentelemetry/instrumentation-pg": { enabled: true },
122
+ "@opentelemetry/instrumentation-redis": { enabled: true },
123
+ }),
124
+ ],
125
+ });
126
+
127
+ sdk.start();
128
+
129
+ // Manual span for custom business logic
130
+ import { trace } from "@opentelemetry/api";
131
+
132
+ const tracer = trace.getTracer("order-service");
133
+
134
+ async function processOrder(order: Order) {
135
+ return tracer.startActiveSpan("processOrder", async (span) => {
136
+ try {
137
+ span.setAttribute("order.id", order.id);
138
+ span.setAttribute("order.total", order.total);
139
+ span.setAttribute("order.items.count", order.items.length);
140
+
141
+ const result = await executeOrder(order);
142
+ span.setStatus({ code: SpanStatusCode.OK });
143
+ return result;
144
+ } catch (error) {
145
+ span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
146
+ span.recordException(error);
147
+ throw error;
148
+ } finally {
149
+ span.end();
150
+ }
151
+ });
152
+ }
153
+ ```
154
+
155
+ ---
156
+
157
+ ## Metrics
158
+
159
+ ```typescript
160
+ import { metrics } from "@opentelemetry/api";
161
+
162
+ const meter = metrics.getMeter("api-server");
163
+
164
+ // Counter — things that only go up
165
+ const requestCounter = meter.createCounter("http.requests.total", {
166
+ description: "Total HTTP requests",
167
+ });
168
+
169
+ // Histogram — request durations
170
+ const requestDuration = meter.createHistogram("http.request.duration_ms", {
171
+ description: "HTTP request duration in milliseconds",
172
+ unit: "ms",
173
+ });
174
+
175
+ // Gauge — current values
176
+ const activeConnections = meter.createUpDownCounter("db.connections.active", {
177
+ description: "Active database connections",
178
+ });
179
+
180
+ // Middleware to record metrics
181
+ app.use((req, res, next) => {
182
+ const start = performance.now();
183
+ res.on("finish", () => {
184
+ const duration = performance.now() - start;
185
+ requestCounter.add(1, {
186
+ method: req.method,
187
+ path: req.route?.path ?? req.path,
188
+ status: res.statusCode.toString(),
189
+ });
190
+ requestDuration.record(duration, {
191
+ method: req.method,
192
+ status: res.statusCode.toString(),
193
+ });
194
+ });
195
+ next();
196
+ });
197
+ ```
198
+
199
+ ### Key Metrics to Track
200
+
201
+ ```
202
+ RED method (for services):
203
+ Rate → requests per second
204
+ Errors → error rate (4xx, 5xx)
205
+ Duration → latency percentiles (P50, P95, P99)
206
+
207
+ USE method (for resources):
208
+ Utilization → CPU %, memory %, disk %
209
+ Saturation → queue depth, thread pool saturation
210
+ Errors → disk failures, OOM kills
211
+
212
+ Business metrics:
213
+ - Sign-ups per hour
214
+ - Orders processed per minute
215
+ - Revenue per day
216
+ - API calls per customer
217
+ ```
218
+
219
+ ---
220
+
221
+ ## SLIs, SLOs & Error Budgets
222
+
223
+ ```
224
+ SLI (Service Level Indicator) → What you measure
225
+ "99.2% of requests complete in <500ms"
226
+
227
+ SLO (Service Level Objective) → Your target
228
+ "99.9% of requests should complete in <500ms"
229
+
230
+ SLA (Service Level Agreement) → Your contract (with penalties)
231
+ "99.95% uptime or we refund 10%"
232
+
233
+ Error Budget = 100% - SLO
234
+ SLO: 99.9% → Error budget: 0.1% → 43 min downtime/month
235
+ SLO: 99.5% → Error budget: 0.5% → 3.6 hours downtime/month
236
+
237
+ Rules:
238
+ - Burn error budget too fast → freeze deployments
239
+ - Error budget remaining → ship features faster
240
+ - Don't set SLOs you can't measure
241
+ - SLOs should be slightly below actual performance
242
+ ```
243
+
244
+ ---
245
+
246
+ ## Health Checks
247
+
248
+ ```typescript
249
+ // Liveness: Is the process running?
250
+ app.get("/health/live", (req, res) => {
251
+ res.status(200).json({ status: "ok" });
252
+ });
253
+
254
+ // Readiness: Can it accept traffic?
255
+ app.get("/health/ready", async (req, res) => {
256
+ try {
257
+ await db.raw("SELECT 1"); // database check
258
+ await redis.ping(); // cache check
259
+ res.status(200).json({
260
+ status: "ready",
261
+ checks: { database: "ok", cache: "ok" },
262
+ });
263
+ } catch (error) {
264
+ res.status(503).json({
265
+ status: "not ready",
266
+ checks: { database: error.message },
267
+ });
268
+ }
269
+ });
270
+
271
+ // ❌ HALLUCINATION TRAP: Liveness ≠ Readiness
272
+ // Liveness fails → container restarts (only for unrecoverable states)
273
+ // Readiness fails → stop sending traffic (temporary — DB down, etc.)
274
+ // Making liveness check the DB → DB outage restarts all containers → cascade failure
275
+ ```
276
+
277
+ ---
278
+
279
+ ## Alerting
280
+
281
+ ```
282
+ Alert design rules:
283
+ 1. Alert on SYMPTOMS, not causes (high latency, not "CPU is 80%")
284
+ 2. Every alert must have a runbook link
285
+ 3. Every alert must be ACTIONABLE — if you can't do anything, it's a notification
286
+ 4. Use severity levels:
287
+ - Critical → page on-call (customer-facing outage)
288
+ - Warning → Slack notification (degraded, not broken)
289
+ - Info → dashboard only (awareness)
290
+ 5. Avoid alert fatigue — fewer, meaningful alerts beat many noisy ones
291
+ ```
292
+
293
+ ---