npm - tribunal-kit - Versions diffs - 3.0.0 → 3.1.0 - Mend

tribunal-kit 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (226)

package/.agent/ARCHITECTURE.md +99 -99
package/.agent/GEMINI.md +52 -52
package/.agent/agents/accessibility-reviewer.md +187 -220
package/.agent/agents/ai-code-reviewer.md +199 -233
package/.agent/agents/backend-specialist.md +215 -238
package/.agent/agents/code-archaeologist.md +161 -181
package/.agent/agents/database-architect.md +184 -207
package/.agent/agents/debugger.md +191 -218
package/.agent/agents/dependency-reviewer.md +103 -136
package/.agent/agents/devops-engineer.md +218 -238
package/.agent/agents/documentation-writer.md +201 -221
package/.agent/agents/explorer-agent.md +160 -180
package/.agent/agents/frontend-reviewer.md +160 -194
package/.agent/agents/frontend-specialist.md +248 -237
package/.agent/agents/game-developer.md +48 -52
package/.agent/agents/logic-reviewer.md +116 -149
package/.agent/agents/mobile-developer.md +200 -223
package/.agent/agents/mobile-reviewer.md +162 -195
package/.agent/agents/orchestrator.md +181 -211
package/.agent/agents/penetration-tester.md +157 -174
package/.agent/agents/performance-optimizer.md +183 -203
package/.agent/agents/performance-reviewer.md +178 -211
package/.agent/agents/product-manager.md +142 -162
package/.agent/agents/product-owner.md +6 -25
package/.agent/agents/project-planner.md +142 -162
package/.agent/agents/qa-automation-engineer.md +225 -242
package/.agent/agents/security-auditor.md +174 -194
package/.agent/agents/seo-specialist.md +193 -213
package/.agent/agents/sql-reviewer.md +161 -194
package/.agent/agents/supervisor-agent.md +184 -203
package/.agent/agents/swarm-worker-contracts.md +17 -17
package/.agent/agents/swarm-worker-registry.md +46 -46
package/.agent/agents/test-coverage-reviewer.md +160 -193
package/.agent/agents/test-engineer.md +0 -21
package/.agent/agents/type-safety-reviewer.md +175 -208
package/.agent/patterns/generator.md +9 -9
package/.agent/patterns/inversion.md +12 -12
package/.agent/patterns/pipeline.md +9 -9
package/.agent/patterns/reviewer.md +13 -13
package/.agent/patterns/tool-wrapper.md +9 -9
package/.agent/rules/GEMINI.md +63 -63
package/.agent/scripts/compress_skills.py +167 -0
package/.agent/scripts/consolidate_skills.py +173 -0
package/.agent/scripts/deep_compress.py +202 -0
package/.agent/scripts/minify_context.py +80 -0
package/.agent/scripts/security_scan.py +1 -1
package/.agent/scripts/strip_tribunal.py +41 -0
package/.agent/skills/agent-organizer/SKILL.md +92 -126
package/.agent/skills/agentic-patterns/SKILL.md +0 -70
package/.agent/skills/ai-prompt-injection-defense/SKILL.md +126 -160
package/.agent/skills/api-patterns/SKILL.md +123 -215
package/.agent/skills/api-security-auditor/SKILL.md +143 -177
package/.agent/skills/app-builder/SKILL.md +326 -50
package/.agent/skills/app-builder/templates/SKILL.md +13 -15
package/.agent/skills/app-builder/templates/astro-static/TEMPLATE.md +16 -16
package/.agent/skills/app-builder/templates/chrome-extension/TEMPLATE.md +22 -22
package/.agent/skills/app-builder/templates/cli-tool/TEMPLATE.md +18 -18
package/.agent/skills/app-builder/templates/electron-desktop/TEMPLATE.md +20 -20
package/.agent/skills/app-builder/templates/express-api/TEMPLATE.md +17 -17
package/.agent/skills/app-builder/templates/flutter-app/TEMPLATE.md +18 -18
package/.agent/skills/app-builder/templates/monorepo-turborepo/TEMPLATE.md +21 -21
package/.agent/skills/app-builder/templates/nextjs-fullstack/TEMPLATE.md +19 -19
package/.agent/skills/app-builder/templates/nextjs-saas/TEMPLATE.md +26 -26
package/.agent/skills/app-builder/templates/nextjs-static/TEMPLATE.md +26 -26
package/.agent/skills/app-builder/templates/nuxt-app/TEMPLATE.md +19 -19
package/.agent/skills/app-builder/templates/python-fastapi/TEMPLATE.md +18 -18
package/.agent/skills/app-builder/templates/react-native-app/TEMPLATE.md +20 -20
package/.agent/skills/appflow-wireframe/SKILL.md +87 -121
package/.agent/skills/architecture/SKILL.md +82 -252
package/.agent/skills/authentication-best-practices/SKILL.md +139 -173
package/.agent/skills/bash-linux/SKILL.md +120 -154
package/.agent/skills/behavioral-modes/SKILL.md +8 -69
package/.agent/skills/brainstorming/SKILL.md +428 -104
package/.agent/skills/building-native-ui/SKILL.md +143 -174
package/.agent/skills/clean-code/SKILL.md +323 -360
package/.agent/skills/code-review-checklist/SKILL.md +0 -62
package/.agent/skills/config-validator/SKILL.md +107 -141
package/.agent/skills/csharp-developer/SKILL.md +468 -528
package/.agent/skills/database-design/SKILL.md +104 -369
package/.agent/skills/deployment-procedures/SKILL.md +111 -145
package/.agent/skills/devops-engineer/SKILL.md +295 -332
package/.agent/skills/devops-incident-responder/SKILL.md +79 -113
package/.agent/skills/doc.md +5 -5
package/.agent/skills/documentation-templates/SKILL.md +19 -63
package/.agent/skills/edge-computing/SKILL.md +123 -157
package/.agent/skills/extract-design-system/SKILL.md +100 -134
package/.agent/skills/framer-motion-expert/SKILL.md +111 -855
package/.agent/skills/frontend-design/SKILL.md +151 -499
package/.agent/skills/game-design-expert/SKILL.md +71 -105
package/.agent/skills/game-engineering-expert/SKILL.md +88 -122
package/.agent/skills/geo-fundamentals/SKILL.md +89 -124
package/.agent/skills/github-operations/SKILL.md +279 -314
package/.agent/skills/gsap-expert/SKILL.md +119 -826
package/.agent/skills/i18n-localization/SKILL.md +104 -138
package/.agent/skills/intelligent-routing/SKILL.md +159 -127
package/.agent/skills/lint-and-validate/SKILL.md +8 -52
package/.agent/skills/llm-engineering/SKILL.md +344 -357
package/.agent/skills/local-first/SKILL.md +120 -154
package/.agent/skills/mcp-builder/SKILL.md +84 -118
package/.agent/skills/mobile-design/SKILL.md +213 -219
package/.agent/skills/motion-engineering/SKILL.md +184 -0
package/.agent/skills/nextjs-react-expert/SKILL.md +99 -698
package/.agent/skills/nodejs-best-practices/SKILL.md +498 -559
package/.agent/skills/observability/SKILL.md +293 -330
package/.agent/skills/parallel-agents/SKILL.md +88 -122
package/.agent/skills/performance-profiling/SKILL.md +217 -254
package/.agent/skills/plan-writing/SKILL.md +84 -118
package/.agent/skills/platform-engineer/SKILL.md +89 -123
package/.agent/skills/playwright-best-practices/SKILL.md +128 -162
package/.agent/skills/powershell-windows/SKILL.md +112 -146
package/.agent/skills/python-patterns/SKILL.md +7 -35
package/.agent/skills/python-pro/SKILL.md +148 -754
package/.agent/skills/react-specialist/SKILL.md +123 -827
package/.agent/skills/readme-builder/SKILL.md +15 -85
package/.agent/skills/realtime-patterns/SKILL.md +269 -304
package/.agent/skills/red-team-tactics/SKILL.md +10 -51
package/.agent/skills/rust-pro/SKILL.md +623 -701
package/.agent/skills/seo-fundamentals/SKILL.md +120 -154
package/.agent/skills/server-management/SKILL.md +156 -190
package/.agent/skills/shadcn-ui-expert/SKILL.md +172 -206
package/.agent/skills/skill-creator/SKILL.md +18 -58
package/.agent/skills/sql-pro/SKILL.md +579 -633
package/.agent/skills/supabase-postgres-best-practices/SKILL.md +28 -68
package/.agent/skills/swiftui-expert/SKILL.md +142 -176
package/.agent/skills/systematic-debugging/SKILL.md +84 -118
package/.agent/skills/tailwind-patterns/SKILL.md +516 -576
package/.agent/skills/tdd-workflow/SKILL.md +103 -137
package/.agent/skills/test-result-analyzer/SKILL.md +33 -73
package/.agent/skills/testing-patterns/SKILL.md +512 -573
package/.agent/skills/trend-researcher/SKILL.md +30 -71
package/.agent/skills/ui-ux-pro-max/SKILL.md +0 -41
package/.agent/skills/ui-ux-researcher/SKILL.md +51 -91
package/.agent/skills/vue-expert/SKILL.md +127 -866
package/.agent/skills/vulnerability-scanner/SKILL.md +354 -269
package/.agent/skills/web-accessibility-auditor/SKILL.md +159 -193
package/.agent/skills/web-design-guidelines/SKILL.md +17 -61
package/.agent/skills/webapp-testing/SKILL.md +111 -145
package/.agent/skills/whimsy-injector/SKILL.md +58 -132
package/.agent/skills/workflow-optimizer/SKILL.md +28 -68
package/.agent/workflows/api-tester.md +151 -151
package/.agent/workflows/audit.md +127 -138
package/.agent/workflows/brainstorm.md +110 -110
package/.agent/workflows/changelog.md +112 -112
package/.agent/workflows/create.md +124 -124
package/.agent/workflows/debug.md +165 -189
package/.agent/workflows/deploy.md +180 -189
package/.agent/workflows/enhance.md +128 -151
package/.agent/workflows/fix.md +114 -135
package/.agent/workflows/generate.md +12 -4
package/.agent/workflows/migrate.md +160 -160
package/.agent/workflows/orchestrate.md +168 -168
package/.agent/workflows/performance-benchmarker.md +114 -123
package/.agent/workflows/plan.md +173 -173
package/.agent/workflows/preview.md +80 -80
package/.agent/workflows/refactor.md +161 -183
package/.agent/workflows/review-ai.md +101 -129
package/.agent/workflows/review.md +116 -116
package/.agent/workflows/session.md +94 -94
package/.agent/workflows/status.md +79 -79
package/.agent/workflows/strengthen-skills.md +138 -139
package/.agent/workflows/swarm.md +179 -179
package/.agent/workflows/test.md +189 -211
package/.agent/workflows/tribunal-backend.md +93 -113
package/.agent/workflows/tribunal-database.md +94 -115
package/.agent/workflows/tribunal-frontend.md +95 -118
package/.agent/workflows/tribunal-full.md +92 -133
package/.agent/workflows/tribunal-mobile.md +94 -119
package/.agent/workflows/tribunal-performance.md +109 -133
package/.agent/workflows/ui-ux-pro-max.md +122 -143
package/package.json +1 -1
package/.agent/skills/api-patterns/api-style.md +0 -42
package/.agent/skills/api-patterns/auth.md +0 -24
package/.agent/skills/api-patterns/documentation.md +0 -26
package/.agent/skills/api-patterns/graphql.md +0 -41
package/.agent/skills/api-patterns/rate-limiting.md +0 -31
package/.agent/skills/api-patterns/response.md +0 -37
package/.agent/skills/api-patterns/rest.md +0 -40
package/.agent/skills/api-patterns/security-testing.md +0 -122
package/.agent/skills/api-patterns/trpc.md +0 -41
package/.agent/skills/api-patterns/versioning.md +0 -22
package/.agent/skills/app-builder/agent-coordination.md +0 -71
package/.agent/skills/app-builder/feature-building.md +0 -53
package/.agent/skills/app-builder/project-detection.md +0 -34
package/.agent/skills/app-builder/scaffolding.md +0 -118
package/.agent/skills/app-builder/tech-stack.md +0 -40
package/.agent/skills/architecture/context-discovery.md +0 -43
package/.agent/skills/architecture/examples.md +0 -94
package/.agent/skills/architecture/pattern-selection.md +0 -68
package/.agent/skills/architecture/patterns-reference.md +0 -50
package/.agent/skills/architecture/trade-off-analysis.md +0 -77
package/.agent/skills/brainstorming/dynamic-questioning.md +0 -360
package/.agent/skills/database-design/database-selection.md +0 -43
package/.agent/skills/database-design/indexing.md +0 -39
package/.agent/skills/database-design/migrations.md +0 -48
package/.agent/skills/database-design/optimization.md +0 -36
package/.agent/skills/database-design/orm-selection.md +0 -30
package/.agent/skills/database-design/schema-design.md +0 -56
package/.agent/skills/frontend-design/animation-guide.md +0 -331
package/.agent/skills/frontend-design/color-system.md +0 -329
package/.agent/skills/frontend-design/decision-trees.md +0 -418
package/.agent/skills/frontend-design/motion-graphics.md +0 -306
package/.agent/skills/frontend-design/typography-system.md +0 -363
package/.agent/skills/frontend-design/ux-psychology.md +0 -1116
package/.agent/skills/frontend-design/visual-effects.md +0 -383
package/.agent/skills/intelligent-routing/router-manifest.md +0 -65
package/.agent/skills/mobile-design/decision-trees.md +0 -516
package/.agent/skills/mobile-design/mobile-backend.md +0 -491
package/.agent/skills/mobile-design/mobile-color-system.md +0 -420
package/.agent/skills/mobile-design/mobile-debugging.md +0 -122
package/.agent/skills/mobile-design/mobile-design-thinking.md +0 -357
package/.agent/skills/mobile-design/mobile-navigation.md +0 -458
package/.agent/skills/mobile-design/mobile-performance.md +0 -767
package/.agent/skills/mobile-design/mobile-testing.md +0 -356
package/.agent/skills/mobile-design/mobile-typography.md +0 -433
package/.agent/skills/mobile-design/platform-android.md +0 -666
package/.agent/skills/mobile-design/platform-ios.md +0 -561
package/.agent/skills/mobile-design/touch-psychology.md +0 -537
package/.agent/skills/nextjs-react-expert/1-async-eliminating-waterfalls.md +0 -312
package/.agent/skills/nextjs-react-expert/2-bundle-bundle-size-optimization.md +0 -240
package/.agent/skills/nextjs-react-expert/3-server-server-side-performance.md +0 -490
package/.agent/skills/nextjs-react-expert/4-client-client-side-data-fetching.md +0 -264
package/.agent/skills/nextjs-react-expert/5-rerender-re-render-optimization.md +0 -581
package/.agent/skills/nextjs-react-expert/6-rendering-rendering-performance.md +0 -432
package/.agent/skills/nextjs-react-expert/7-js-javascript-performance.md +0 -684
package/.agent/skills/nextjs-react-expert/8-advanced-advanced-patterns.md +0 -150
package/.agent/skills/vulnerability-scanner/checklists.md +0 -121

package/.agent/skills/observability/SKILL.md CHANGED

@@ -1,330 +1,293 @@
----
-name: observability
-description: Production observability mastery. Structured logging (Pino/Winston), OpenTelemetry tracing, metrics (Prometheus/Grafana), SLIs/SLOs/error budgets, distributed tracing, alerting design, health checks, and AI observability. Use when setting up monitoring, debugging production issues, or designing observable distributed systems.
-allowed-tools: Read, Write, Edit, Glob, Grep
-version: 2.0.0
-last-updated: 2026-04-01
-applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
----
-# Observability — Production Monitoring Mastery
-> You can't fix what you can't see. You can't see what you don't measure.
-> Every request gets a trace. Every error gets structured context. Every SLO has an error budget.
----
-## The Three Pillars
-```
-Logs    → WHAT happened (structured events)
-Traces  → WHERE it happened (request flow across services)
-Metrics → HOW MUCH is happening (counters, histograms, gauges)
-All three are needed. Logs alone are not observability.
-```
----
-## Structured Logging
-```typescript
-import pino from "pino";
-// ✅ Structured JSON logging
-const logger = pino({
-  level: process.env.LOG_LEVEL ?? "info",
-  timestamp: pino.stdTimeFunctions.isoTime,
-  ...(process.env.NODE_ENV === "development" && {
-    transport: { target: "pino-pretty" },
-  }),
-});
-// ✅ GOOD: Structured with context
-logger.info({ userId: user.id, action: "login", ip: req.ip }, "User logged in");
-logger.error({ err, orderId: order.id, paymentGateway: "stripe" }, "Payment failed");
-logger.warn({ queueDepth: 1500, threshold: 1000 }, "Queue depth exceeding threshold");
-// ❌ BAD: Unstructured string logging
-console.log("User " + user.id + " logged in from " + req.ip);
-console.log("Error: " + error.message);
-// ❌ HALLUCINATION TRAP: console.log is NOT production logging
-// - No severity levels (info/warn/error)
-// - No structured fields (can't search/filter)
-// - No timestamps in ISO format
-// - Can't be collected by log aggregators
-// ✅ Use Pino (Node.js) or structlog (Python) for production
-```
-### Log Levels
-```
-fatal → App is crashing, immediate attention required
-error → Operation failed, needs investigation
-warn  → Something unexpected, but app continues
-info  → Business events (user login, order placed, deploy)
-debug → Technical details (query timing, cache hit/miss)
-trace → Verbose debugging (only in development)
-Rules:
-- Production default: info
-- Never log PII (names, emails, SSNs) at any level
-- Never log secrets (tokens, passwords, API keys)
-- Log request IDs for correlation
-- Log durations for performance tracking
-```
-### Request Context / Correlation
-```typescript
-import { AsyncLocalStorage } from "node:async_hooks";
-const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>();
-// Middleware: set context per request
-app.use((req, res, next) => {
-  const requestId = req.headers["x-request-id"]?.toString() ?? crypto.randomUUID();
-  res.setHeader("x-request-id", requestId);
-  requestContext.run({ requestId, userId: req.user?.id }, next);
-});
-// Child logger with context
-function getLogger() {
-  const ctx = requestContext.getStore();
-  return logger.child({
-    requestId: ctx?.requestId,
-    userId: ctx?.userId,
-  });
-}
-// Every log from this request includes requestId and userId
-const log = getLogger();
-log.info("Processing order");  // { requestId: "abc-123", userId: "42", msg: "Processing order" }
-```
----
-## Distributed Tracing (OpenTelemetry)
-```typescript
-import { NodeSDK } from "@opentelemetry/sdk-node";
-import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
-import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
-// Initialize OpenTelemetry
-const sdk = new NodeSDK({
-  traceExporter: new OTLPTraceExporter({
-    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://localhost:4318/v1/traces",
-  }),
-  instrumentations: [
-    getNodeAutoInstrumentations({
-      "@opentelemetry/instrumentation-http": { enabled: true },
-      "@opentelemetry/instrumentation-express": { enabled: true },
-      "@opentelemetry/instrumentation-pg": { enabled: true },
-      "@opentelemetry/instrumentation-redis": { enabled: true },
-    }),
-  ],
-});
-sdk.start();
-// Manual span for custom business logic
-import { trace } from "@opentelemetry/api";
-const tracer = trace.getTracer("order-service");
-async function processOrder(order: Order) {
-  return tracer.startActiveSpan("processOrder", async (span) => {
-    try {
-      span.setAttribute("order.id", order.id);
-      span.setAttribute("order.total", order.total);
-      span.setAttribute("order.items.count", order.items.length);
-      const result = await executeOrder(order);
-      span.setStatus({ code: SpanStatusCode.OK });
-      return result;
-    } catch (error) {
-      span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
-      span.recordException(error);
-      throw error;
-    } finally {
-      span.end();
-    }
-  });
-}
-```
----
-## Metrics
-```typescript
-import { metrics } from "@opentelemetry/api";
-const meter = metrics.getMeter("api-server");
-// Counter — things that only go up
-const requestCounter = meter.createCounter("http.requests.total", {
-  description: "Total HTTP requests",
-});
-// Histogram — request durations
-const requestDuration = meter.createHistogram("http.request.duration_ms", {
-  description: "HTTP request duration in milliseconds",
-  unit: "ms",
-});
-// Gauge — current values
-const activeConnections = meter.createUpDownCounter("db.connections.active", {
-  description: "Active database connections",
-});
-// Middleware to record metrics
-app.use((req, res, next) => {
-  const start = performance.now();
-  res.on("finish", () => {
-    const duration = performance.now() - start;
-    requestCounter.add(1, {
-      method: req.method,
-      path: req.route?.path ?? req.path,
-      status: res.statusCode.toString(),
-    });
-    requestDuration.record(duration, {
-      method: req.method,
-      status: res.statusCode.toString(),
-    });
-  });
-  next();
-});
-```
-### Key Metrics to Track
-```
-RED method (for services):
-  Rate     → requests per second
-  Errors   → error rate (4xx, 5xx)
-  Duration → latency percentiles (P50, P95, P99)
-USE method (for resources):
-  Utilization → CPU %, memory %, disk %
-  Saturation  → queue depth, thread pool saturation
-  Errors      → disk failures, OOM kills
-Business metrics:
-  - Sign-ups per hour
-  - Orders processed per minute
-  - Revenue per day
-  - API calls per customer
-```
----
-## SLIs, SLOs & Error Budgets
-```
-SLI (Service Level Indicator) → What you measure
-  "99.2% of requests complete in <500ms"
-SLO (Service Level Objective) → Your target
-  "99.9% of requests should complete in <500ms"
-SLA (Service Level Agreement) → Your contract (with penalties)
-  "99.95% uptime or we refund 10%"
-Error Budget = 100% - SLO
-  SLO: 99.9% → Error budget: 0.1% → 43 min downtime/month
-  SLO: 99.5% → Error budget: 0.5% → 3.6 hours downtime/month
-Rules:
-- Burn error budget too fast → freeze deployments
-- Error budget remaining → ship features faster
-- Don't set SLOs you can't measure
-- SLOs should be slightly below actual performance
-```
----
-## Health Checks
-```typescript
-// Liveness: Is the process running?
-app.get("/health/live", (req, res) => {
-  res.status(200).json({ status: "ok" });
-});
-// Readiness: Can it accept traffic?
-app.get("/health/ready", async (req, res) => {
-  try {
-    await db.raw("SELECT 1");           // database check
-    await redis.ping();                  // cache check
-    res.status(200).json({
-      status: "ready",
-      checks: { database: "ok", cache: "ok" },
-    });
-  } catch (error) {
-    res.status(503).json({
-      status: "not ready",
-      checks: { database: error.message },
-    });
-  }
-});
-// ❌ HALLUCINATION TRAP: Liveness ≠ Readiness
-// Liveness fails → container restarts (only for unrecoverable states)
-// Readiness fails → stop sending traffic (temporary — DB down, etc.)
-// Making liveness check the DB → DB outage restarts all containers → cascade failure
-```
----
-## Alerting
-```
-Alert design rules:
-1. Alert on SYMPTOMS, not causes (high latency, not "CPU is 80%")
-2. Every alert must have a runbook link
-3. Every alert must be ACTIONABLE — if you can't do anything, it's a notification
-4. Use severity levels:
-   - Critical → page on-call (customer-facing outage)
-   - Warning  → Slack notification (degraded, not broken)
-   - Info     → dashboard only (awareness)
-5. Avoid alert fatigue — fewer, meaningful alerts beat many noisy ones
-```
----
-## 🤖 LLM-Specific Traps
-1. **`console.log` in Production:** Use structured logging (Pino/Winston). `console.log` can't be searched or filtered.
-2. **Logging PII:** Never log emails, names, passwords, or tokens. Use redaction.
-3. **Liveness Checking Dependencies:** Liveness probes must NOT check DB/Redis. Only readiness probes check dependencies.
-4. **Alerting on Causes:** "CPU is 80%" is not actionable. Alert on "P95 latency > 1s" instead.
-5. **Missing Request IDs:** Without correlation IDs, debugging distributed systems is impossible.
-6. **Percentiles vs Averages:** Average latency hides outliers. Track P50, P95, P99.
-7. **No Error Budgets:** Without SLOs and error budgets, "availability" is subjective.
-8. **Metrics Without Labels:** `requests_total` without `method`, `path`, `status` labels is useless.
-9. **Tracing Without Sampling:** 100% trace collection is expensive. Use head-based or tail-based sampling.
-10. **Log Levels in Code:** Hardcoded `logger.debug()` everywhere. Use configurable log levels via env.
----
-## 🏛️ Tribunal Integration
-**Slash command: `/tribunal-backend`**
-### ✅ Pre-Flight Self-Audit
-```
-✅ Am I using structured logging (not console.log)?
-✅ Do all logs include requestId for correlation?
-✅ Am I NOT logging PII or secrets?
-✅ Are liveness and readiness checks separate?
-✅ Is OpenTelemetry tracing configured?
-✅ Am I tracking RED metrics (Rate, Errors, Duration)?
-✅ Are SLOs defined with error budgets?
-✅ Do alerts have runbook links?
-✅ Am I alerting on symptoms (not causes)?
-✅ Are log levels configurable via environment variable?
-```
+---
+name: observability
+description: Production observability mastery. Structured logging (Pino/Winston), OpenTelemetry tracing, metrics (Prometheus/Grafana), SLIs/SLOs/error budgets, distributed tracing, alerting design, health checks, and AI observability. Use when setting up monitoring, debugging production issues, or designing observable distributed systems.
+allowed-tools: Read, Write, Edit, Glob, Grep
+version: 2.0.0
+last-updated: 2026-04-01
+applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
+---
+# Observability — Production Monitoring Mastery
+---
+## The Three Pillars
+```
+Logs    → WHAT happened (structured events)
+Traces  → WHERE it happened (request flow across services)
+Metrics → HOW MUCH is happening (counters, histograms, gauges)
+All three are needed. Logs alone are not observability.
+```
+---
+## Structured Logging
+```typescript
+import pino from "pino";
+// ✅ Structured JSON logging
+const logger = pino({
+  level: process.env.LOG_LEVEL ?? "info",
+  timestamp: pino.stdTimeFunctions.isoTime,
+  ...(process.env.NODE_ENV === "development" && {
+    transport: { target: "pino-pretty" },
+  }),
+});
+// ✅ GOOD: Structured with context
+logger.info({ userId: user.id, action: "login", ip: req.ip }, "User logged in");
+logger.error({ err, orderId: order.id, paymentGateway: "stripe" }, "Payment failed");
+logger.warn({ queueDepth: 1500, threshold: 1000 }, "Queue depth exceeding threshold");
+// ❌ BAD: Unstructured string logging
+console.log("User " + user.id + " logged in from " + req.ip);
+console.log("Error: " + error.message);
+// ❌ HALLUCINATION TRAP: console.log is NOT production logging
+// - No severity levels (info/warn/error)
+// - No structured fields (can't search/filter)
+// - No timestamps in ISO format
+// - Can't be collected by log aggregators
+// ✅ Use Pino (Node.js) or structlog (Python) for production
+```
+### Log Levels
+```
+fatal → App is crashing, immediate attention required
+error → Operation failed, needs investigation
+warn  → Something unexpected, but app continues
+info  → Business events (user login, order placed, deploy)
+debug → Technical details (query timing, cache hit/miss)
+trace → Verbose debugging (only in development)
+Rules:
+- Production default: info
+- Never log PII (names, emails, SSNs) at any level
+- Never log secrets (tokens, passwords, API keys)
+- Log request IDs for correlation
+- Log durations for performance tracking
+```
+### Request Context / Correlation
+```typescript
+import { AsyncLocalStorage } from "node:async_hooks";
+const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>();
+// Middleware: set context per request
+app.use((req, res, next) => {
+  const requestId = req.headers["x-request-id"]?.toString() ?? crypto.randomUUID();
+  res.setHeader("x-request-id", requestId);
+  requestContext.run({ requestId, userId: req.user?.id }, next);
+});
+// Child logger with context
+function getLogger() {
+  const ctx = requestContext.getStore();
+  return logger.child({
+    requestId: ctx?.requestId,
+    userId: ctx?.userId,
+  });
+}
+// Every log from this request includes requestId and userId
+const log = getLogger();
+log.info("Processing order");  // { requestId: "abc-123", userId: "42", msg: "Processing order" }
+```
+---
+## Distributed Tracing (OpenTelemetry)
+```typescript
+import { NodeSDK } from "@opentelemetry/sdk-node";
+import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
+import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
+// Initialize OpenTelemetry
+const sdk = new NodeSDK({
+  traceExporter: new OTLPTraceExporter({
+    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://localhost:4318/v1/traces",
+  }),
+  instrumentations: [
+    getNodeAutoInstrumentations({
+      "@opentelemetry/instrumentation-http": { enabled: true },
+      "@opentelemetry/instrumentation-express": { enabled: true },
+      "@opentelemetry/instrumentation-pg": { enabled: true },
+      "@opentelemetry/instrumentation-redis": { enabled: true },
+    }),
+  ],
+});
+sdk.start();
+// Manual span for custom business logic
+import { trace } from "@opentelemetry/api";
+const tracer = trace.getTracer("order-service");
+async function processOrder(order: Order) {
+  return tracer.startActiveSpan("processOrder", async (span) => {
+    try {
+      span.setAttribute("order.id", order.id);
+      span.setAttribute("order.total", order.total);
+      span.setAttribute("order.items.count", order.items.length);
+      const result = await executeOrder(order);
+      span.setStatus({ code: SpanStatusCode.OK });
+      return result;
+    } catch (error) {
+      span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
+      span.recordException(error);
+      throw error;
+    } finally {
+      span.end();
+    }
+  });
+}
+```
+---
+## Metrics
+```typescript
+import { metrics } from "@opentelemetry/api";
+const meter = metrics.getMeter("api-server");
+// Counter — things that only go up
+const requestCounter = meter.createCounter("http.requests.total", {
+  description: "Total HTTP requests",
+});
+// Histogram — request durations
+const requestDuration = meter.createHistogram("http.request.duration_ms", {
+  description: "HTTP request duration in milliseconds",
+  unit: "ms",
+});
+// Gauge — current values
+const activeConnections = meter.createUpDownCounter("db.connections.active", {
+  description: "Active database connections",
+});
+// Middleware to record metrics
+app.use((req, res, next) => {
+  const start = performance.now();
+  res.on("finish", () => {
+    const duration = performance.now() - start;
+    requestCounter.add(1, {
+      method: req.method,
+      path: req.route?.path ?? req.path,
+      status: res.statusCode.toString(),
+    });
+    requestDuration.record(duration, {
+      method: req.method,
+      status: res.statusCode.toString(),
+    });
+  });
+  next();
+});
+```
+### Key Metrics to Track
+```
+RED method (for services):
+  Rate     → requests per second
+  Errors   → error rate (4xx, 5xx)
+  Duration → latency percentiles (P50, P95, P99)
+USE method (for resources):
+  Utilization → CPU %, memory %, disk %
+  Saturation  → queue depth, thread pool saturation
+  Errors      → disk failures, OOM kills
+Business metrics:
+  - Sign-ups per hour
+  - Orders processed per minute
+  - Revenue per day
+  - API calls per customer
+```
+---
+## SLIs, SLOs & Error Budgets
+```
+SLI (Service Level Indicator) → What you measure
+  "99.2% of requests complete in <500ms"
+SLO (Service Level Objective) → Your target
+  "99.9% of requests should complete in <500ms"
+SLA (Service Level Agreement) → Your contract (with penalties)
+  "99.95% uptime or we refund 10%"
+Error Budget = 100% - SLO
+  SLO: 99.9% → Error budget: 0.1% → 43 min downtime/month
+  SLO: 99.5% → Error budget: 0.5% → 3.6 hours downtime/month
+Rules:
+- Burn error budget too fast → freeze deployments
+- Error budget remaining → ship features faster
+- Don't set SLOs you can't measure
+- SLOs should be slightly below actual performance
+```
+---
+## Health Checks
+```typescript
+// Liveness: Is the process running?
+app.get("/health/live", (req, res) => {
+  res.status(200).json({ status: "ok" });
+});
+// Readiness: Can it accept traffic?
+app.get("/health/ready", async (req, res) => {
+  try {
+    await db.raw("SELECT 1");           // database check
+    await redis.ping();                  // cache check
+    res.status(200).json({
+      status: "ready",
+      checks: { database: "ok", cache: "ok" },
+    });
+  } catch (error) {
+    res.status(503).json({
+      status: "not ready",
+      checks: { database: error.message },
+    });
+  }
+});
+// ❌ HALLUCINATION TRAP: Liveness ≠ Readiness
+// Liveness fails → container restarts (only for unrecoverable states)
+// Readiness fails → stop sending traffic (temporary — DB down, etc.)
+// Making liveness check the DB → DB outage restarts all containers → cascade failure
+```
+---
+## Alerting
+```
+Alert design rules:
+1. Alert on SYMPTOMS, not causes (high latency, not "CPU is 80%")
+2. Every alert must have a runbook link
+3. Every alert must be ACTIONABLE — if you can't do anything, it's a notification
+4. Use severity levels:
+   - Critical → page on-call (customer-facing outage)
+   - Warning  → Slack notification (degraded, not broken)
+   - Info     → dashboard only (awareness)
+5. Avoid alert fatigue — fewer, meaningful alerts beat many noisy ones
+```
+---