tribunal-kit 2.4.6 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/ARCHITECTURE.md +99 -99
- package/.agent/GEMINI.md +52 -52
- package/.agent/agents/accessibility-reviewer.md +139 -86
- package/.agent/agents/ai-code-reviewer.md +160 -90
- package/.agent/agents/backend-specialist.md +164 -127
- package/.agent/agents/code-archaeologist.md +115 -73
- package/.agent/agents/database-architect.md +130 -110
- package/.agent/agents/debugger.md +137 -97
- package/.agent/agents/dependency-reviewer.md +78 -30
- package/.agent/agents/devops-engineer.md +161 -118
- package/.agent/agents/documentation-writer.md +151 -87
- package/.agent/agents/explorer-agent.md +117 -99
- package/.agent/agents/frontend-reviewer.md +127 -47
- package/.agent/agents/frontend-specialist.md +169 -109
- package/.agent/agents/game-developer.md +28 -164
- package/.agent/agents/logic-reviewer.md +87 -49
- package/.agent/agents/mobile-developer.md +151 -103
- package/.agent/agents/mobile-reviewer.md +133 -50
- package/.agent/agents/orchestrator.md +121 -110
- package/.agent/agents/penetration-tester.md +103 -77
- package/.agent/agents/performance-optimizer.md +136 -92
- package/.agent/agents/performance-reviewer.md +139 -69
- package/.agent/agents/product-manager.md +104 -70
- package/.agent/agents/product-owner.md +6 -25
- package/.agent/agents/project-planner.md +95 -95
- package/.agent/agents/qa-automation-engineer.md +174 -87
- package/.agent/agents/security-auditor.md +133 -129
- package/.agent/agents/seo-specialist.md +160 -99
- package/.agent/agents/sql-reviewer.md +132 -44
- package/.agent/agents/supervisor-agent.md +137 -109
- package/.agent/agents/swarm-worker-contracts.md +17 -17
- package/.agent/agents/swarm-worker-registry.md +46 -46
- package/.agent/agents/test-coverage-reviewer.md +132 -53
- package/.agent/agents/test-engineer.md +0 -21
- package/.agent/agents/type-safety-reviewer.md +143 -33
- package/.agent/patterns/generator.md +9 -9
- package/.agent/patterns/inversion.md +12 -12
- package/.agent/patterns/pipeline.md +9 -9
- package/.agent/patterns/reviewer.md +13 -13
- package/.agent/patterns/tool-wrapper.md +9 -9
- package/.agent/rules/GEMINI.md +63 -63
- package/.agent/scripts/__pycache__/auto_preview.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/bundle_analyzer.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/checklist.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/dependency_analyzer.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/security_scan.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/session_manager.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/skill_integrator.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/swarm_dispatcher.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/test_runner.cpython-311.pyc +0 -0
- package/.agent/scripts/__pycache__/verify_all.cpython-311.pyc +0 -0
- package/.agent/scripts/compress_skills.py +167 -0
- package/.agent/scripts/consolidate_skills.py +173 -0
- package/.agent/scripts/deep_compress.py +202 -0
- package/.agent/scripts/minify_context.py +80 -0
- package/.agent/scripts/security_scan.py +1 -1
- package/.agent/scripts/strip_tribunal.py +41 -0
- package/.agent/skills/agent-organizer/SKILL.md +60 -100
- package/.agent/skills/agentic-patterns/SKILL.md +0 -70
- package/.agent/skills/ai-prompt-injection-defense/SKILL.md +108 -53
- package/.agent/skills/api-patterns/SKILL.md +197 -257
- package/.agent/skills/api-security-auditor/SKILL.md +125 -57
- package/.agent/skills/app-builder/SKILL.md +326 -50
- package/.agent/skills/app-builder/templates/SKILL.md +13 -15
- package/.agent/skills/app-builder/templates/astro-static/TEMPLATE.md +16 -16
- package/.agent/skills/app-builder/templates/chrome-extension/TEMPLATE.md +22 -22
- package/.agent/skills/app-builder/templates/cli-tool/TEMPLATE.md +18 -18
- package/.agent/skills/app-builder/templates/electron-desktop/TEMPLATE.md +20 -20
- package/.agent/skills/app-builder/templates/express-api/TEMPLATE.md +17 -17
- package/.agent/skills/app-builder/templates/flutter-app/TEMPLATE.md +18 -18
- package/.agent/skills/app-builder/templates/monorepo-turborepo/TEMPLATE.md +21 -21
- package/.agent/skills/app-builder/templates/nextjs-fullstack/TEMPLATE.md +19 -19
- package/.agent/skills/app-builder/templates/nextjs-saas/TEMPLATE.md +26 -26
- package/.agent/skills/app-builder/templates/nextjs-static/TEMPLATE.md +26 -26
- package/.agent/skills/app-builder/templates/nuxt-app/TEMPLATE.md +19 -19
- package/.agent/skills/app-builder/templates/python-fastapi/TEMPLATE.md +18 -18
- package/.agent/skills/app-builder/templates/react-native-app/TEMPLATE.md +20 -20
- package/.agent/skills/appflow-wireframe/SKILL.md +71 -98
- package/.agent/skills/architecture/SKILL.md +161 -200
- package/.agent/skills/authentication-best-practices/SKILL.md +121 -54
- package/.agent/skills/bash-linux/SKILL.md +71 -166
- package/.agent/skills/behavioral-modes/SKILL.md +8 -69
- package/.agent/skills/brainstorming/SKILL.md +345 -127
- package/.agent/skills/building-native-ui/SKILL.md +125 -57
- package/.agent/skills/clean-code/SKILL.md +266 -149
- package/.agent/skills/code-review-checklist/SKILL.md +0 -62
- package/.agent/skills/config-validator/SKILL.md +73 -131
- package/.agent/skills/csharp-developer/SKILL.md +434 -73
- package/.agent/skills/database-design/SKILL.md +190 -275
- package/.agent/skills/deployment-procedures/SKILL.md +81 -158
- package/.agent/skills/devops-engineer/SKILL.md +255 -94
- package/.agent/skills/devops-incident-responder/SKILL.md +50 -69
- package/.agent/skills/doc.md +5 -5
- package/.agent/skills/documentation-templates/SKILL.md +19 -63
- package/.agent/skills/edge-computing/SKILL.md +75 -165
- package/.agent/skills/extract-design-system/SKILL.md +84 -58
- package/.agent/skills/framer-motion-expert/SKILL.md +195 -0
- package/.agent/skills/frontend-design/SKILL.md +151 -499
- package/.agent/skills/game-design-expert/SKILL.md +71 -0
- package/.agent/skills/game-engineering-expert/SKILL.md +88 -0
- package/.agent/skills/geo-fundamentals/SKILL.md +52 -178
- package/.agent/skills/github-operations/SKILL.md +197 -272
- package/.agent/skills/gsap-expert/SKILL.md +194 -0
- package/.agent/skills/i18n-localization/SKILL.md +60 -172
- package/.agent/skills/intelligent-routing/SKILL.md +123 -103
- package/.agent/skills/lint-and-validate/SKILL.md +8 -52
- package/.agent/skills/llm-engineering/SKILL.md +281 -195
- package/.agent/skills/local-first/SKILL.md +76 -159
- package/.agent/skills/mcp-builder/SKILL.md +48 -188
- package/.agent/skills/mobile-design/SKILL.md +213 -219
- package/.agent/skills/motion-engineering/SKILL.md +184 -0
- package/.agent/skills/nextjs-react-expert/SKILL.md +184 -203
- package/.agent/skills/nodejs-best-practices/SKILL.md +403 -185
- package/.agent/skills/observability/SKILL.md +211 -203
- package/.agent/skills/parallel-agents/SKILL.md +53 -146
- package/.agent/skills/performance-profiling/SKILL.md +171 -151
- package/.agent/skills/plan-writing/SKILL.md +49 -153
- package/.agent/skills/platform-engineer/SKILL.md +57 -103
- package/.agent/skills/playwright-best-practices/SKILL.md +110 -63
- package/.agent/skills/powershell-windows/SKILL.md +61 -179
- package/.agent/skills/python-patterns/SKILL.md +7 -35
- package/.agent/skills/python-pro/SKILL.md +273 -114
- package/.agent/skills/react-specialist/SKILL.md +227 -108
- package/.agent/skills/readme-builder/SKILL.md +15 -85
- package/.agent/skills/realtime-patterns/SKILL.md +216 -243
- package/.agent/skills/red-team-tactics/SKILL.md +10 -51
- package/.agent/skills/rust-pro/SKILL.md +525 -142
- package/.agent/skills/seo-fundamentals/SKILL.md +92 -153
- package/.agent/skills/server-management/SKILL.md +110 -166
- package/.agent/skills/shadcn-ui-expert/SKILL.md +154 -55
- package/.agent/skills/skill-creator/SKILL.md +18 -58
- package/.agent/skills/sql-pro/SKILL.md +543 -68
- package/.agent/skills/supabase-postgres-best-practices/SKILL.md +28 -68
- package/.agent/skills/swiftui-expert/SKILL.md +124 -57
- package/.agent/skills/systematic-debugging/SKILL.md +49 -151
- package/.agent/skills/tailwind-patterns/SKILL.md +433 -149
- package/.agent/skills/tdd-workflow/SKILL.md +63 -169
- package/.agent/skills/test-result-analyzer/SKILL.md +33 -73
- package/.agent/skills/testing-patterns/SKILL.md +437 -130
- package/.agent/skills/trend-researcher/SKILL.md +30 -71
- package/.agent/skills/ui-ux-pro-max/SKILL.md +0 -41
- package/.agent/skills/ui-ux-researcher/SKILL.md +51 -91
- package/.agent/skills/vue-expert/SKILL.md +225 -119
- package/.agent/skills/vulnerability-scanner/SKILL.md +264 -226
- package/.agent/skills/web-accessibility-auditor/SKILL.md +141 -58
- package/.agent/skills/web-design-guidelines/SKILL.md +17 -61
- package/.agent/skills/webapp-testing/SKILL.md +71 -196
- package/.agent/skills/whimsy-injector/SKILL.md +58 -132
- package/.agent/skills/workflow-optimizer/SKILL.md +28 -68
- package/.agent/workflows/api-tester.md +96 -224
- package/.agent/workflows/audit.md +81 -122
- package/.agent/workflows/brainstorm.md +69 -105
- package/.agent/workflows/changelog.md +65 -97
- package/.agent/workflows/create.md +73 -88
- package/.agent/workflows/debug.md +80 -111
- package/.agent/workflows/deploy.md +119 -92
- package/.agent/workflows/enhance.md +80 -91
- package/.agent/workflows/fix.md +68 -97
- package/.agent/workflows/generate.md +165 -164
- package/.agent/workflows/migrate.md +106 -109
- package/.agent/workflows/orchestrate.md +103 -86
- package/.agent/workflows/performance-benchmarker.md +77 -268
- package/.agent/workflows/plan.md +120 -98
- package/.agent/workflows/preview.md +39 -96
- package/.agent/workflows/refactor.md +105 -97
- package/.agent/workflows/review-ai.md +63 -102
- package/.agent/workflows/review.md +71 -110
- package/.agent/workflows/session.md +53 -113
- package/.agent/workflows/status.md +42 -88
- package/.agent/workflows/strengthen-skills.md +90 -51
- package/.agent/workflows/swarm.md +114 -129
- package/.agent/workflows/test.md +125 -102
- package/.agent/workflows/tribunal-backend.md +60 -78
- package/.agent/workflows/tribunal-database.md +62 -100
- package/.agent/workflows/tribunal-frontend.md +62 -82
- package/.agent/workflows/tribunal-full.md +56 -100
- package/.agent/workflows/tribunal-mobile.md +65 -94
- package/.agent/workflows/tribunal-performance.md +62 -105
- package/.agent/workflows/ui-ux-pro-max.md +72 -121
- package/README.md +11 -15
- package/package.json +1 -1
- package/.agent/skills/api-patterns/api-style.md +0 -42
- package/.agent/skills/api-patterns/auth.md +0 -24
- package/.agent/skills/api-patterns/documentation.md +0 -26
- package/.agent/skills/api-patterns/graphql.md +0 -41
- package/.agent/skills/api-patterns/rate-limiting.md +0 -31
- package/.agent/skills/api-patterns/response.md +0 -37
- package/.agent/skills/api-patterns/rest.md +0 -40
- package/.agent/skills/api-patterns/security-testing.md +0 -122
- package/.agent/skills/api-patterns/trpc.md +0 -41
- package/.agent/skills/api-patterns/versioning.md +0 -22
- package/.agent/skills/app-builder/agent-coordination.md +0 -71
- package/.agent/skills/app-builder/feature-building.md +0 -53
- package/.agent/skills/app-builder/project-detection.md +0 -34
- package/.agent/skills/app-builder/scaffolding.md +0 -118
- package/.agent/skills/app-builder/tech-stack.md +0 -40
- package/.agent/skills/architecture/context-discovery.md +0 -43
- package/.agent/skills/architecture/examples.md +0 -94
- package/.agent/skills/architecture/pattern-selection.md +0 -68
- package/.agent/skills/architecture/patterns-reference.md +0 -50
- package/.agent/skills/architecture/trade-off-analysis.md +0 -77
- package/.agent/skills/brainstorming/dynamic-questioning.md +0 -360
- package/.agent/skills/database-design/database-selection.md +0 -43
- package/.agent/skills/database-design/indexing.md +0 -39
- package/.agent/skills/database-design/migrations.md +0 -48
- package/.agent/skills/database-design/optimization.md +0 -36
- package/.agent/skills/database-design/orm-selection.md +0 -30
- package/.agent/skills/database-design/schema-design.md +0 -56
- package/.agent/skills/dotnet-core-expert/SKILL.md +0 -103
- package/.agent/skills/framer-motion-animations/SKILL.md +0 -74
- package/.agent/skills/frontend-design/animation-guide.md +0 -331
- package/.agent/skills/frontend-design/color-system.md +0 -329
- package/.agent/skills/frontend-design/decision-trees.md +0 -418
- package/.agent/skills/frontend-design/motion-graphics.md +0 -306
- package/.agent/skills/frontend-design/typography-system.md +0 -363
- package/.agent/skills/frontend-design/ux-psychology.md +0 -1116
- package/.agent/skills/frontend-design/visual-effects.md +0 -383
- package/.agent/skills/game-development/2d-games/SKILL.md +0 -119
- package/.agent/skills/game-development/3d-games/SKILL.md +0 -135
- package/.agent/skills/game-development/SKILL.md +0 -236
- package/.agent/skills/game-development/game-art/SKILL.md +0 -185
- package/.agent/skills/game-development/game-audio/SKILL.md +0 -190
- package/.agent/skills/game-development/game-design/SKILL.md +0 -129
- package/.agent/skills/game-development/mobile-games/SKILL.md +0 -108
- package/.agent/skills/game-development/multiplayer/SKILL.md +0 -132
- package/.agent/skills/game-development/pc-games/SKILL.md +0 -144
- package/.agent/skills/game-development/vr-ar/SKILL.md +0 -123
- package/.agent/skills/game-development/web-games/SKILL.md +0 -150
- package/.agent/skills/intelligent-routing/router-manifest.md +0 -65
- package/.agent/skills/mobile-design/decision-trees.md +0 -516
- package/.agent/skills/mobile-design/mobile-backend.md +0 -491
- package/.agent/skills/mobile-design/mobile-color-system.md +0 -420
- package/.agent/skills/mobile-design/mobile-debugging.md +0 -122
- package/.agent/skills/mobile-design/mobile-design-thinking.md +0 -357
- package/.agent/skills/mobile-design/mobile-navigation.md +0 -458
- package/.agent/skills/mobile-design/mobile-performance.md +0 -767
- package/.agent/skills/mobile-design/mobile-testing.md +0 -356
- package/.agent/skills/mobile-design/mobile-typography.md +0 -433
- package/.agent/skills/mobile-design/platform-android.md +0 -666
- package/.agent/skills/mobile-design/platform-ios.md +0 -561
- package/.agent/skills/mobile-design/touch-psychology.md +0 -537
- package/.agent/skills/nextjs-react-expert/1-async-eliminating-waterfalls.md +0 -312
- package/.agent/skills/nextjs-react-expert/2-bundle-bundle-size-optimization.md +0 -240
- package/.agent/skills/nextjs-react-expert/3-server-server-side-performance.md +0 -490
- package/.agent/skills/nextjs-react-expert/4-client-client-side-data-fetching.md +0 -264
- package/.agent/skills/nextjs-react-expert/5-rerender-re-render-optimization.md +0 -581
- package/.agent/skills/nextjs-react-expert/6-rendering-rendering-performance.md +0 -432
- package/.agent/skills/nextjs-react-expert/7-js-javascript-performance.md +0 -684
- package/.agent/skills/nextjs-react-expert/8-advanced-advanced-patterns.md +0 -150
- package/.agent/skills/vulnerability-scanner/checklists.md +0 -121
|
@@ -1,285 +1,293 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: observability
|
|
3
|
-
description: Production observability
|
|
3
|
+
description: Production observability mastery. Structured logging (Pino/Winston), OpenTelemetry tracing, metrics (Prometheus/Grafana), SLIs/SLOs/error budgets, distributed tracing, alerting design, health checks, and AI observability. Use when setting up monitoring, debugging production issues, or designing observable distributed systems.
|
|
4
4
|
allowed-tools: Read, Write, Edit, Glob, Grep
|
|
5
|
-
version:
|
|
6
|
-
last-updated: 2026-
|
|
5
|
+
version: 2.0.0
|
|
6
|
+
last-updated: 2026-04-01
|
|
7
7
|
applies-to-model: gemini-2.5-pro, claude-3-7-sonnet
|
|
8
8
|
---
|
|
9
9
|
|
|
10
|
-
# Observability
|
|
11
|
-
|
|
12
|
-
> Monitoring tells you when something is broken.
|
|
13
|
-
> Observability tells you why.
|
|
10
|
+
# Observability — Production Monitoring Mastery
|
|
14
11
|
|
|
15
12
|
---
|
|
16
13
|
|
|
17
14
|
## The Three Pillars
|
|
18
15
|
|
|
19
16
|
```
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
LOGS → Discrete events with context
|
|
24
|
-
"What exactly happened at 14:23:07?"
|
|
17
|
+
Logs → WHAT happened (structured events)
|
|
18
|
+
Traces → WHERE it happened (request flow across services)
|
|
19
|
+
Metrics → HOW MUCH is happening (counters, histograms, gauges)
|
|
25
20
|
|
|
26
|
-
|
|
27
|
-
"What is our error rate over the last hour?"
|
|
21
|
+
All three are needed. Logs alone are not observability.
|
|
28
22
|
```
|
|
29
23
|
|
|
30
|
-
Use all three. They answer different questions. None replaces the others.
|
|
31
|
-
|
|
32
24
|
---
|
|
33
25
|
|
|
34
|
-
##
|
|
35
|
-
|
|
36
|
-
OpenTelemetry (OTel) is the vendor-neutral standard for instrumentation. Use it and you can swap backends (Jaeger, Grafana Tempo, Honeycomb, Datadog) without changing application code.
|
|
26
|
+
## Structured Logging
|
|
37
27
|
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
import { NodeSDK } from '@opentelemetry/sdk-node';
|
|
41
|
-
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
|
|
42
|
-
import { Resource } from '@opentelemetry/resources';
|
|
43
|
-
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
|
|
28
|
+
```typescript
|
|
29
|
+
import pino from "pino";
|
|
44
30
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
|
|
31
|
+
// ✅ Structured JSON logging
|
|
32
|
+
const logger = pino({
|
|
33
|
+
level: process.env.LOG_LEVEL ?? "info",
|
|
34
|
+
timestamp: pino.stdTimeFunctions.isoTime,
|
|
35
|
+
...(process.env.NODE_ENV === "development" && {
|
|
36
|
+
transport: { target: "pino-pretty" },
|
|
52
37
|
}),
|
|
53
38
|
});
|
|
54
39
|
|
|
55
|
-
|
|
56
|
-
|
|
40
|
+
// ✅ GOOD: Structured with context
|
|
41
|
+
logger.info({ userId: user.id, action: "login", ip: req.ip }, "User logged in");
|
|
42
|
+
logger.error({ err, orderId: order.id, paymentGateway: "stripe" }, "Payment failed");
|
|
43
|
+
logger.warn({ queueDepth: 1500, threshold: 1000 }, "Queue depth exceeding threshold");
|
|
44
|
+
|
|
45
|
+
// ❌ BAD: Unstructured string logging
|
|
46
|
+
console.log("User " + user.id + " logged in from " + req.ip);
|
|
47
|
+
console.log("Error: " + error.message);
|
|
48
|
+
|
|
49
|
+
// ❌ HALLUCINATION TRAP: console.log is NOT production logging
|
|
50
|
+
// - No severity levels (info/warn/error)
|
|
51
|
+
// - No structured fields (can't search/filter)
|
|
52
|
+
// - No timestamps in ISO format
|
|
53
|
+
// - Can't be collected by log aggregators
|
|
54
|
+
// ✅ Use Pino (Node.js) or structlog (Python) for production
|
|
57
55
|
```
|
|
58
56
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
## Distributed Tracing
|
|
57
|
+
### Log Levels
|
|
62
58
|
|
|
63
|
-
|
|
59
|
+
```
|
|
60
|
+
fatal → App is crashing, immediate attention required
|
|
61
|
+
error → Operation failed, needs investigation
|
|
62
|
+
warn → Something unexpected, but app continues
|
|
63
|
+
info → Business events (user login, order placed, deploy)
|
|
64
|
+
debug → Technical details (query timing, cache hit/miss)
|
|
65
|
+
trace → Verbose debugging (only in development)
|
|
66
|
+
|
|
67
|
+
Rules:
|
|
68
|
+
- Production default: info
|
|
69
|
+
- Never log PII (names, emails, SSNs) at any level
|
|
70
|
+
- Never log secrets (tokens, passwords, API keys)
|
|
71
|
+
- Log request IDs for correlation
|
|
72
|
+
- Log durations for performance tracking
|
|
73
|
+
```
|
|
64
74
|
|
|
65
|
-
|
|
66
|
-
import { trace, context, SpanStatusCode } from '@opentelemetry/api';
|
|
75
|
+
### Request Context / Correlation
|
|
67
76
|
|
|
68
|
-
|
|
77
|
+
```typescript
|
|
78
|
+
import { AsyncLocalStorage } from "node:async_hooks";
|
|
69
79
|
|
|
70
|
-
|
|
71
|
-
return tracer.startActiveSpan('payment.process', async (span) => {
|
|
72
|
-
try {
|
|
73
|
-
// Add business context to the span
|
|
74
|
-
span.setAttributes({
|
|
75
|
-
'order.id': orderId,
|
|
76
|
-
'payment.amount': amount,
|
|
77
|
-
'payment.currency': 'USD',
|
|
78
|
-
});
|
|
80
|
+
const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>();
|
|
79
81
|
|
|
80
|
-
|
|
82
|
+
// Middleware: set context per request
|
|
83
|
+
app.use((req, res, next) => {
|
|
84
|
+
const requestId = req.headers["x-request-id"]?.toString() ?? crypto.randomUUID();
|
|
85
|
+
res.setHeader("x-request-id", requestId);
|
|
86
|
+
requestContext.run({ requestId, userId: req.user?.id }, next);
|
|
87
|
+
});
|
|
81
88
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
throw err;
|
|
89
|
-
} finally {
|
|
90
|
-
span.end();
|
|
91
|
-
}
|
|
89
|
+
// Child logger with context
|
|
90
|
+
function getLogger() {
|
|
91
|
+
const ctx = requestContext.getStore();
|
|
92
|
+
return logger.child({
|
|
93
|
+
requestId: ctx?.requestId,
|
|
94
|
+
userId: ctx?.userId,
|
|
92
95
|
});
|
|
93
96
|
}
|
|
97
|
+
|
|
98
|
+
// Every log from this request includes requestId and userId
|
|
99
|
+
const log = getLogger();
|
|
100
|
+
log.info("Processing order"); // { requestId: "abc-123", userId: "42", msg: "Processing order" }
|
|
94
101
|
```
|
|
95
102
|
|
|
96
103
|
---
|
|
97
104
|
|
|
98
|
-
##
|
|
99
|
-
|
|
100
|
-
Logs must be machine-parseable:
|
|
105
|
+
## Distributed Tracing (OpenTelemetry)
|
|
101
106
|
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
|
|
107
|
+
```typescript
|
|
108
|
+
import { NodeSDK } from "@opentelemetry/sdk-node";
|
|
109
|
+
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
|
|
110
|
+
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
105
111
|
|
|
106
|
-
//
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
// Initialize OpenTelemetry
|
|
113
|
+
const sdk = new NodeSDK({
|
|
114
|
+
traceExporter: new OTLPTraceExporter({
|
|
115
|
+
url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://localhost:4318/v1/traces",
|
|
116
|
+
}),
|
|
117
|
+
instrumentations: [
|
|
118
|
+
getNodeAutoInstrumentations({
|
|
119
|
+
"@opentelemetry/instrumentation-http": { enabled: true },
|
|
120
|
+
"@opentelemetry/instrumentation-express": { enabled: true },
|
|
121
|
+
"@opentelemetry/instrumentation-pg": { enabled: true },
|
|
122
|
+
"@opentelemetry/instrumentation-redis": { enabled: true },
|
|
123
|
+
}),
|
|
124
|
+
],
|
|
114
125
|
});
|
|
115
|
-
```
|
|
116
126
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
| Always | Never |
|
|
120
|
-
|---|---|
|
|
121
|
-
| Request ID / trace ID | Passwords or password hashes |
|
|
122
|
-
| User ID (not PII) | Credit card numbers |
|
|
123
|
-
| Error type + message | API keys or tokens |
|
|
124
|
-
| Duration (ms) | Full request bodies (may contain PII) |
|
|
125
|
-
| HTTP status code | |
|
|
126
|
-
|
|
127
|
-
---
|
|
128
|
-
|
|
129
|
-
## Metrics: What to Measure
|
|
130
|
-
|
|
131
|
-
The four golden signals (Google SRE):
|
|
127
|
+
sdk.start();
|
|
132
128
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
Track p50, p95, p99 — not just average
|
|
136
|
-
Average hides the worst-case user experience
|
|
129
|
+
// Manual span for custom business logic
|
|
130
|
+
import { trace, SpanStatusCode } from "@opentelemetry/api";
|
|
137
131
|
|
|
138
|
-
|
|
139
|
-
requests/sec, messages/sec, bytes/sec
|
|
132
|
+
const tracer = trace.getTracer("order-service");
|
|
140
133
|
|
|
141
|
-
|
|
142
|
-
|
|
134
|
+
async function processOrder(order: Order) {
|
|
135
|
+
return tracer.startActiveSpan("processOrder", async (span) => {
|
|
136
|
+
try {
|
|
137
|
+
span.setAttribute("order.id", order.id);
|
|
138
|
+
span.setAttribute("order.total", order.total);
|
|
139
|
+
span.setAttribute("order.items.count", order.items.length);
|
|
143
140
|
|
|
144
|
-
|
|
145
|
-
|
|
141
|
+
const result = await executeOrder(order);
|
|
142
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
143
|
+
return result;
|
|
144
|
+
} catch (error) {
|
|
145
|
+
span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
|
|
146
|
+
span.recordException(error);
|
|
147
|
+
throw error;
|
|
148
|
+
} finally {
|
|
149
|
+
span.end();
|
|
150
|
+
}
|
|
151
|
+
});
|
|
152
|
+
}
|
|
146
153
|
```
|
|
147
154
|
|
|
148
155
|
---
|
|
149
156
|
|
|
150
|
-
##
|
|
157
|
+
## Metrics
|
|
151
158
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
```
|
|
155
|
-
SLI (Service Level Indicator) — a specific, measurable signal:
|
|
156
|
-
"HTTP 200 responses as % of all responses to /api/checkout"
|
|
159
|
+
```typescript
|
|
160
|
+
import { metrics } from "@opentelemetry/api";
|
|
157
161
|
|
|
158
|
-
|
|
159
|
-
"99.9% of checkout requests succeed over a 30-day window"
|
|
162
|
+
const meter = metrics.getMeter("api-server");
|
|
160
163
|
|
|
161
|
-
|
|
162
|
-
|
|
164
|
+
// Counter — things that only go up
|
|
165
|
+
const requestCounter = meter.createCounter("http.requests.total", {
|
|
166
|
+
description: "Total HTTP requests",
|
|
167
|
+
});
|
|
163
168
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
169
|
+
// Histogram — request durations
|
|
170
|
+
const requestDuration = meter.createHistogram("http.request.duration_ms", {
|
|
171
|
+
description: "HTTP request duration in milliseconds",
|
|
172
|
+
unit: "ms",
|
|
173
|
+
});
|
|
168
174
|
|
|
169
|
-
|
|
175
|
+
// UpDownCounter — current values that can go up or down (gauge-like)
|
|
176
|
+
const activeConnections = meter.createUpDownCounter("db.connections.active", {
|
|
177
|
+
description: "Active database connections",
|
|
178
|
+
});
|
|
170
179
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
// Eval scores (from async evaluation pipeline)
|
|
188
|
-
eval_faithfulness: 0.92, // Did output match sources?
|
|
189
|
-
eval_relevance: 0.88, // Did output answer the question?
|
|
180
|
+
// Middleware to record metrics
|
|
181
|
+
app.use((req, res, next) => {
|
|
182
|
+
const start = performance.now();
|
|
183
|
+
res.on("finish", () => {
|
|
184
|
+
const duration = performance.now() - start;
|
|
185
|
+
requestCounter.add(1, {
|
|
186
|
+
method: req.method,
|
|
187
|
+
path: req.route?.path ?? req.path,
|
|
188
|
+
status: res.statusCode.toString(),
|
|
189
|
+
});
|
|
190
|
+
requestDuration.record(duration, {
|
|
191
|
+
method: req.method,
|
|
192
|
+
status: res.statusCode.toString(),
|
|
193
|
+
});
|
|
194
|
+
});
|
|
195
|
+
next();
|
|
190
196
|
});
|
|
191
197
|
```
|
|
192
198
|
|
|
193
|
-
###
|
|
199
|
+
### Key Metrics to Track
|
|
194
200
|
|
|
195
201
|
```
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
202
|
+
RED method (for services):
|
|
203
|
+
Rate → requests per second
|
|
204
|
+
Errors → error rate (4xx, 5xx)
|
|
205
|
+
Duration → latency percentiles (P50, P95, P99)
|
|
206
|
+
|
|
207
|
+
USE method (for resources):
|
|
208
|
+
Utilization → CPU %, memory %, disk %
|
|
209
|
+
Saturation → queue depth, thread pool saturation
|
|
210
|
+
Errors → disk failures, OOM kills
|
|
211
|
+
|
|
212
|
+
Business metrics:
|
|
213
|
+
- Sign-ups per hour
|
|
214
|
+
- Orders processed per minute
|
|
215
|
+
- Revenue per day
|
|
216
|
+
- API calls per customer
|
|
200
217
|
```
|
|
201
218
|
|
|
202
219
|
---
|
|
203
220
|
|
|
204
|
-
##
|
|
221
|
+
## SLIs, SLOs & Error Budgets
|
|
205
222
|
|
|
206
|
-
When this skill produces a recommendation or design decision, structure your output as:
|
|
207
|
-
|
|
208
|
-
```
|
|
209
|
-
━━━ Observability Recommendation ━━━━━━━━━━━━━━━━
|
|
210
|
-
Decision: [what was chosen / proposed]
|
|
211
|
-
Rationale: [why — one concise line]
|
|
212
|
-
Trade-offs: [what is consciously accepted]
|
|
213
|
-
Next action: [concrete next step for the user]
|
|
214
|
-
─────────────────────────────────────────────────
|
|
215
|
-
Pre-Flight: ✅ All checks passed
|
|
216
|
-
or ❌ [blocking item that must be resolved first]
|
|
217
223
|
```
|
|
224
|
+
SLI (Service Level Indicator) → What you measure
|
|
225
|
+
"99.2% of requests complete in <500ms"
|
|
218
226
|
|
|
227
|
+
SLO (Service Level Objective) → Your target
|
|
228
|
+
"99.9% of requests should complete in <500ms"
|
|
219
229
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
## 🏛️ Tribunal Integration (Anti-Hallucination)
|
|
223
|
-
|
|
224
|
-
**Slash command: `/tribunal-backend`**
|
|
225
|
-
**Active reviewers: `logic` · `security` · `performance`**
|
|
226
|
-
|
|
227
|
-
### ❌ Forbidden AI Tropes in Observability
|
|
228
|
-
|
|
229
|
-
1. **Logging sensitive data** — never log request bodies wholesale — they contain passwords, tokens, PII. Log only specific, safe fields.
|
|
230
|
-
2. **Tracking averages only** — `avg(latency)` hides the 1% of users who get 10x worse experience. Always use percentiles (p95, p99).
|
|
231
|
-
3. **100% SLO targets** — `99.999%` SLOs are wrong for most services. They consume all error budget instantly and paralyze product velocity.
|
|
232
|
-
4. **Inventing OTel packages** — only use `@opentelemetry/{sdk-node,api,exporter-*}` from the official `@opentelemetry` npm org.
|
|
230
|
+
SLA (Service Level Agreement) → Your contract (with penalties)
|
|
231
|
+
"99.95% uptime or we refund 10%"
|
|
233
232
|
|
|
234
|
-
|
|
233
|
+
Error Budget = 100% - SLO
|
|
234
|
+
SLO: 99.9% → Error budget: 0.1% → 43 min downtime/month
|
|
235
|
+
SLO: 99.5% → Error budget: 0.5% → 3.6 hours downtime/month
|
|
235
236
|
|
|
237
|
+
Rules:
|
|
238
|
+
- Burn error budget too fast → freeze deployments
|
|
239
|
+
- Error budget remaining → ship features faster
|
|
240
|
+
- Don't set SLOs you can't measure
|
|
241
|
+
- SLOs should be slightly below actual performance
|
|
236
242
|
```
|
|
237
|
-
✅ Are logs structured JSON (not string-interpolated messages)?
|
|
238
|
-
✅ Is no PII or credential data being logged?
|
|
239
|
-
✅ Are latency measurements tracking percentiles (p95/p99), not just averages?
|
|
240
|
-
✅ Does every async operation have a trace span with error recording?
|
|
241
|
-
✅ Are AI calls instrumented with token count + cost + latency tracking?
|
|
242
|
-
✅ Is there an SLO defined with an explicit error budget policy?
|
|
243
|
-
```
|
|
244
|
-
|
|
245
243
|
|
|
246
244
|
---
|
|
247
245
|
|
|
248
|
-
##
|
|
249
|
-
|
|
250
|
-
AI coding assistants often fall into specific bad habits when dealing with this domain. These are strictly forbidden:
|
|
251
|
-
|
|
252
|
-
1. **Over-engineering:** Proposing complex abstractions or distributed systems when a simpler approach suffices.
|
|
253
|
-
2. **Hallucinated Libraries/Methods:** Using non-existent methods or packages. Always `// VERIFY` or check `package.json` / `requirements.txt`.
|
|
254
|
-
3. **Skipping Edge Cases:** Writing the "happy path" and ignoring error handling, timeouts, or data validation.
|
|
255
|
-
4. **Context Amnesia:** Forgetting the user's constraints and offering generic advice instead of tailored solutions.
|
|
256
|
-
5. **Silent Degradation:** Catching and suppressing errors without logging or re-raising.
|
|
246
|
+
## Health Checks
|
|
257
247
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
248
|
+
```typescript
|
|
249
|
+
// Liveness: Is the process running?
|
|
250
|
+
app.get("/health/live", (req, res) => {
|
|
251
|
+
res.status(200).json({ status: "ok" });
|
|
252
|
+
});
|
|
261
253
|
|
|
262
|
-
|
|
263
|
-
|
|
254
|
+
// Readiness: Can it accept traffic?
|
|
255
|
+
app.get("/health/ready", async (req, res) => {
|
|
256
|
+
try {
|
|
257
|
+
await db.raw("SELECT 1"); // database check
|
|
258
|
+
await redis.ping(); // cache check
|
|
259
|
+
res.status(200).json({
|
|
260
|
+
status: "ready",
|
|
261
|
+
checks: { database: "ok", cache: "ok" },
|
|
262
|
+
});
|
|
263
|
+
} catch (error) {
|
|
264
|
+
res.status(503).json({
|
|
265
|
+
status: "not ready",
|
|
266
|
+
checks: { database: error.message },
|
|
267
|
+
});
|
|
268
|
+
}
|
|
269
|
+
});
|
|
264
270
|
|
|
265
|
-
|
|
271
|
+
// ❌ HALLUCINATION TRAP: Liveness ≠ Readiness
|
|
272
|
+
// Liveness fails → container restarts (only for unrecoverable states)
|
|
273
|
+
// Readiness fails → stop sending traffic (temporary — DB down, etc.)
|
|
274
|
+
// Making liveness check the DB → DB outage restarts all containers → cascade failure
|
|
275
|
+
```
|
|
266
276
|
|
|
267
|
-
|
|
268
|
-
2. **Silent Degradation:** Catching and suppressing errors without logging or handling.
|
|
269
|
-
3. **Context Amnesia:** Forgetting the user's constraints and offering generic advice instead of tailored solutions.
|
|
277
|
+
---
|
|
270
278
|
|
|
271
|
-
|
|
279
|
+
## Alerting
|
|
272
280
|
|
|
273
|
-
Review these questions before confirming output:
|
|
274
281
|
```
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
282
|
+
Alert design rules:
|
|
283
|
+
1. Alert on SYMPTOMS, not causes (high latency, not "CPU is 80%")
|
|
284
|
+
2. Every alert must have a runbook link
|
|
285
|
+
3. Every alert must be ACTIONABLE — if you can't do anything, it's a notification
|
|
286
|
+
4. Use severity levels:
|
|
287
|
+
- Critical → page on-call (customer-facing outage)
|
|
288
|
+
- Warning → Slack notification (degraded, not broken)
|
|
289
|
+
- Info → dashboard only (awareness)
|
|
290
|
+
5. Avoid alert fatigue — fewer, meaningful alerts beat many noisy ones
|
|
279
291
|
```
|
|
280
292
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
**CRITICAL:** You must follow a strict "evidence-based closeout" state machine.
|
|
284
|
-
- ❌ **Forbidden:** Declaring a task complete because the output "looks correct."
|
|
285
|
-
- ✅ **Required:** You are explicitly forbidden from finalizing any task without providing **concrete evidence** (terminal output, passing tests, compile success, or equivalent proof) that your output works as intended.
|
|
293
|
+
---
|