@grainulation/silo 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ {
2
+ "name": "Frontend Engineering",
3
+ "description": "Performance budgets, Core Web Vitals targets, accessibility (WCAG 2.1), state management patterns, bundle optimization, and SSR vs CSR tradeoffs.",
4
+ "version": "1.0.0",
5
+ "claims": [
6
+ {
7
+ "id": "fe-001",
8
+ "type": "constraint",
9
+ "topic": "Core Web Vitals thresholds",
10
+ "content": "Google Core Web Vitals targets for good UX: LCP (Largest Contentful Paint) under 2.5s, INP (Interaction to Next Paint) under 200ms, CLS (Cumulative Layout Shift) under 0.1. These directly affect search ranking. Measure with field data (CrUX, RUM) not just lab data (Lighthouse), since field p75 is what Google uses.",
11
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
12
+ "evidence": "documented",
13
+ "status": "active",
14
+ "phase_added": "define",
15
+ "timestamp": "2025-01-01T00:00:00.000Z",
16
+ "conflicts_with": [],
17
+ "resolved_by": null,
18
+ "tags": ["frontend", "performance", "core-web-vitals", "seo"]
19
+ },
20
+ {
21
+ "id": "fe-002",
22
+ "type": "recommendation",
23
+ "topic": "JavaScript bundle size budget",
24
+ "content": "Initial JavaScript bundle should be under 150 KB compressed (gzip) for mobile-first applications. Total page weight including images should be under 1.5 MB. Every 100 KB of JavaScript adds approximately 350ms parse/compile time on median mobile devices. Use bundle analyzer (webpack-bundle-analyzer, source-map-explorer) in CI to enforce budgets.",
25
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
26
+ "evidence": "tested",
27
+ "status": "active",
28
+ "phase_added": "define",
29
+ "timestamp": "2025-01-01T00:00:00.000Z",
30
+ "conflicts_with": [],
31
+ "resolved_by": null,
32
+ "tags": ["frontend", "performance", "bundle-size", "mobile"]
33
+ },
34
+ {
35
+ "id": "fe-003",
36
+ "type": "constraint",
37
+ "topic": "WCAG 2.1 AA compliance",
38
+ "content": "WCAG 2.1 Level AA is the legal or de facto standard in the US (ADA), the EU (EAA 2025), and Ontario, Canada (AODA). Key requirements: color contrast ratio 4.5:1 for normal text and 3:1 for large text, all interactive elements keyboard-accessible, form inputs have visible labels, images have alt text, focus indicators visible, no content reliant solely on color.",
39
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
40
+ "evidence": "documented",
41
+ "status": "active",
42
+ "phase_added": "define",
43
+ "timestamp": "2025-01-01T00:00:00.000Z",
44
+ "conflicts_with": [],
45
+ "resolved_by": null,
46
+ "tags": ["frontend", "accessibility", "wcag", "compliance"]
47
+ },
48
+ {
49
+ "id": "fe-004",
50
+ "type": "factual",
51
+ "topic": "SSR vs CSR tradeoffs",
52
+ "content": "Server-Side Rendering (SSR) provides faster First Contentful Paint and better SEO but increases server load and Time to Interactive (hydration cost). Client-Side Rendering (CSR) has faster subsequent navigations but a blank page until JS loads. Use SSR/SSG for content-heavy, SEO-critical pages; CSR for authenticated app-like interfaces behind login.",
53
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
54
+ "evidence": "documented",
55
+ "status": "active",
56
+ "phase_added": "define",
57
+ "timestamp": "2025-01-01T00:00:00.000Z",
58
+ "conflicts_with": [],
59
+ "resolved_by": null,
60
+ "tags": ["frontend", "ssr", "csr", "architecture"]
61
+ },
62
+ {
63
+ "id": "fe-005",
64
+ "type": "risk",
65
+ "topic": "third-party script performance",
66
+ "content": "Third-party scripts (analytics, ads, chat widgets) are the leading cause of performance regression. A single poorly-written third-party script can add 500ms-2s to page load. Load all third-party scripts with async or defer. Set performance budgets that include third-party weight. Use a tag manager with server-side option to control loading.",
67
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
68
+ "evidence": "production",
69
+ "status": "active",
70
+ "phase_added": "define",
71
+ "timestamp": "2025-01-01T00:00:00.000Z",
72
+ "conflicts_with": [],
73
+ "resolved_by": null,
74
+ "tags": ["frontend", "performance", "third-party", "scripts"]
75
+ },
76
+ {
77
+ "id": "fe-006",
78
+ "type": "recommendation",
79
+ "topic": "image optimization pipeline",
80
+ "content": "Serve images in WebP or AVIF format (30-50% smaller than JPEG). Use srcset with 2-4 size variants for responsive images. Lazy-load images below the fold with loading='lazy'. Set explicit width and height attributes to prevent CLS. Automate optimization in the build pipeline or use an image CDN (Cloudinary, imgix, Cloudflare Images).",
81
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
82
+ "evidence": "tested",
83
+ "status": "active",
84
+ "phase_added": "define",
85
+ "timestamp": "2025-01-01T00:00:00.000Z",
86
+ "conflicts_with": [],
87
+ "resolved_by": null,
88
+ "tags": ["frontend", "images", "performance", "webp"]
89
+ },
90
+ {
91
+ "id": "fe-007",
92
+ "type": "recommendation",
93
+ "topic": "state management selection criteria",
94
+ "content": "Use local component state (useState/signals) for UI-only state. Use URL state (query params, path) for shareable/bookmarkable state. Use server state libraries (TanStack Query, SWR) for API data with caching. Use global state (Zustand, Redux, or context) only for truly cross-cutting client state (auth, theme, feature flags). Most applications over-use global state.",
95
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
96
+ "evidence": "documented",
97
+ "status": "active",
98
+ "phase_added": "define",
99
+ "timestamp": "2025-01-01T00:00:00.000Z",
100
+ "conflicts_with": [],
101
+ "resolved_by": null,
102
+ "tags": ["frontend", "state-management", "react", "architecture"]
103
+ },
104
+ {
105
+ "id": "fe-008",
106
+ "type": "risk",
107
+ "topic": "layout shift from web fonts",
108
+ "content": "Custom web fonts cause either FOIT (Flash of Invisible Text) or FOUT (Flash of Unstyled Text), both contributing to CLS. Mitigate with font-display: swap, preload critical fonts with <link rel='preload'>, subset fonts to used character ranges (Latin subset is ~30 KB vs 200 KB for full Unicode), and use size-adjust to match fallback metrics.",
109
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
110
+ "evidence": "tested",
111
+ "status": "active",
112
+ "phase_added": "define",
113
+ "timestamp": "2025-01-01T00:00:00.000Z",
114
+ "conflicts_with": [],
115
+ "resolved_by": null,
116
+ "tags": ["frontend", "fonts", "cls", "performance"]
117
+ },
118
+ {
119
+ "id": "fe-009",
120
+ "type": "factual",
121
+ "topic": "code splitting impact",
122
+ "content": "Route-based code splitting typically reduces initial bundle size by 40-60%. Dynamic import() splits a module into a separate chunk loaded on demand. For React, use React.lazy() with Suspense boundaries. Split at route boundaries first, then heavy component boundaries (charts, editors, maps). Avoid splitting components under 30 KB as the HTTP overhead negates the benefit.",
123
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
124
+ "evidence": "tested",
125
+ "status": "active",
126
+ "phase_added": "define",
127
+ "timestamp": "2025-01-01T00:00:00.000Z",
128
+ "conflicts_with": [],
129
+ "resolved_by": null,
130
+ "tags": ["frontend", "code-splitting", "bundle-size", "lazy-loading"]
131
+ },
132
+ {
133
+ "id": "fe-010",
134
+ "type": "constraint",
135
+ "topic": "keyboard navigation requirements",
136
+ "content": "Every interactive element must be reachable and operable via keyboard alone. Tab order must follow visual reading order. Focus must be visible (minimum 2px outline, 3:1 contrast ratio against adjacent colors). Custom components (dropdowns, modals, tabs) must implement ARIA roles and keyboard patterns from WAI-ARIA Authoring Practices. Test by unplugging the mouse.",
137
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
138
+ "evidence": "documented",
139
+ "status": "active",
140
+ "phase_added": "define",
141
+ "timestamp": "2025-01-01T00:00:00.000Z",
142
+ "conflicts_with": [],
143
+ "resolved_by": null,
144
+ "tags": ["frontend", "accessibility", "keyboard", "aria"]
145
+ },
146
+ {
147
+ "id": "fe-011",
148
+ "type": "estimate",
149
+ "topic": "mobile performance gap",
150
+ "content": "Median mobile devices (Moto G Power class) process JavaScript 3-5x slower than desktop. A page that loads in 1.5s on desktop takes 4-6s on median mobile hardware over 4G. Always test on throttled connections (Slow 3G: 400 Kbps, 400ms RTT) and CPU throttled 4x in DevTools. Real user monitoring shows median mobile LCP is 2x desktop LCP.",
151
+ "source": { "origin": "industry", "artifact": null, "connector": null },
152
+ "evidence": "web",
153
+ "status": "active",
154
+ "phase_added": "define",
155
+ "timestamp": "2025-01-01T00:00:00.000Z",
156
+ "conflicts_with": [],
157
+ "resolved_by": null,
158
+ "tags": ["frontend", "mobile", "performance", "testing"]
159
+ },
160
+ {
161
+ "id": "fe-012",
162
+ "type": "recommendation",
163
+ "topic": "error boundaries and fallbacks",
164
+ "content": "Wrap major UI sections in error boundaries that catch render errors and display a fallback UI instead of a white screen. Log caught errors to your observability stack. Provide a retry mechanism in the fallback. Without error boundaries, a single failing component crashes the entire page. Test error boundaries by deliberately throwing in development.",
165
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
166
+ "evidence": "documented",
167
+ "status": "active",
168
+ "phase_added": "define",
169
+ "timestamp": "2025-01-01T00:00:00.000Z",
170
+ "conflicts_with": [],
171
+ "resolved_by": null,
172
+ "tags": ["frontend", "error-handling", "resilience", "react"]
173
+ }
174
+ ]
175
+ }
@@ -0,0 +1,147 @@
1
+ {
2
+ "name": "Migration Research Templates",
3
+ "description": "Common research patterns for technology migrations -- database migrations, cloud migrations, language/framework migrations. Seed claims capture recurring risks and constraints.",
4
+ "version": "1.0.0",
5
+ "claims": [
6
+ {
7
+ "id": "mig-001",
8
+ "type": "constraint",
9
+ "topic": "migration rollback safety",
10
+ "content": "Production migrations must be reversible. Every schema change, data transform, or infrastructure swap needs a tested rollback procedure with a maximum rollback time SLA.",
11
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
12
+ "evidence": "documented",
13
+ "status": "active",
14
+ "phase_added": "define",
15
+ "timestamp": "2025-01-01T00:00:00.000Z",
16
+ "conflicts_with": [],
17
+ "resolved_by": null,
18
+ "tags": ["migration", "rollback", "safety"]
19
+ },
20
+ {
21
+ "id": "mig-002",
22
+ "type": "risk",
23
+ "topic": "dual-write consistency risk",
24
+ "content": "Dual-write periods introduce data consistency risks. During cutover, writes must go to both old and new systems, and a reconciliation process must detect drift.",
25
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
26
+ "evidence": "documented",
27
+ "status": "active",
28
+ "phase_added": "define",
29
+ "timestamp": "2025-01-01T00:00:00.000Z",
30
+ "conflicts_with": [],
31
+ "resolved_by": null,
32
+ "tags": ["migration", "dual-write", "consistency"]
33
+ },
34
+ {
35
+ "id": "mig-003",
36
+ "type": "risk",
37
+ "topic": "migration window overrun",
38
+ "content": "Data migrations that run longer than the maintenance window force a choice between extended downtime and incremental migration. Estimate data volume transfer rates early.",
39
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
40
+ "evidence": "documented",
41
+ "status": "active",
42
+ "phase_added": "define",
43
+ "timestamp": "2025-01-01T00:00:00.000Z",
44
+ "conflicts_with": [],
45
+ "resolved_by": null,
46
+ "tags": ["migration", "downtime", "planning"]
47
+ },
48
+ {
49
+ "id": "mig-004",
50
+ "type": "constraint",
51
+ "topic": "minimum viable migration scope",
52
+ "content": "Feature parity with the legacy system is rarely achievable on day one. Define a 'minimum viable migration' scope that covers the critical path and defer edge cases.",
53
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
54
+ "evidence": "stated",
55
+ "status": "active",
56
+ "phase_added": "define",
57
+ "timestamp": "2025-01-01T00:00:00.000Z",
58
+ "conflicts_with": [],
59
+ "resolved_by": null,
60
+ "tags": ["migration", "scope", "mvp"]
61
+ },
62
+ {
63
+ "id": "mig-005",
64
+ "type": "recommendation",
65
+ "topic": "strangler fig migration pattern",
66
+ "content": "Use the strangler fig pattern for large migrations: route traffic incrementally to the new system while the old system continues to serve unmodified paths.",
67
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
68
+ "evidence": "documented",
69
+ "status": "active",
70
+ "phase_added": "define",
71
+ "timestamp": "2025-01-01T00:00:00.000Z",
72
+ "conflicts_with": [],
73
+ "resolved_by": null,
74
+ "tags": ["migration", "strangler-fig", "pattern"]
75
+ },
76
+ {
77
+ "id": "mig-006",
78
+ "type": "risk",
79
+ "topic": "character encoding corruption",
80
+ "content": "Character encoding mismatches (UTF-8 vs Latin-1, collation differences) are the most common silent data corruption source in database migrations. Test with production-representative data.",
81
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
82
+ "evidence": "documented",
83
+ "status": "active",
84
+ "phase_added": "define",
85
+ "timestamp": "2025-01-01T00:00:00.000Z",
86
+ "conflicts_with": [],
87
+ "resolved_by": null,
88
+ "tags": ["migration", "database", "encoding"]
89
+ },
90
+ {
91
+ "id": "mig-007",
92
+ "type": "estimate",
93
+ "topic": "cloud migration timeline",
94
+ "content": "Cloud-to-cloud migrations typically take 2-4x longer than initial estimates due to undocumented dependencies, permission differences, and networking surprises.",
95
+ "source": { "origin": "industry", "artifact": null, "connector": null },
96
+ "evidence": "web",
97
+ "status": "active",
98
+ "phase_added": "define",
99
+ "timestamp": "2025-01-01T00:00:00.000Z",
100
+ "conflicts_with": [],
101
+ "resolved_by": null,
102
+ "tags": ["migration", "cloud", "estimation"]
103
+ },
104
+ {
105
+ "id": "mig-008",
106
+ "type": "constraint",
107
+ "topic": "API versioning during migration",
108
+ "content": "API contract changes during migration require versioned endpoints. Clients must be able to continue using the old API for a deprecation period (minimum 3 months is standard).",
109
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
110
+ "evidence": "documented",
111
+ "status": "active",
112
+ "phase_added": "define",
113
+ "timestamp": "2025-01-01T00:00:00.000Z",
114
+ "conflicts_with": [],
115
+ "resolved_by": null,
116
+ "tags": ["migration", "api", "versioning"]
117
+ },
118
+ {
119
+ "id": "mig-009",
120
+ "type": "recommendation",
121
+ "topic": "shadow traffic validation",
122
+ "content": "Run shadow traffic (replay production reads against the new system without serving responses) for at least 2 weeks before cutover. Compare results for correctness and latency.",
123
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
124
+ "evidence": "documented",
125
+ "status": "active",
126
+ "phase_added": "define",
127
+ "timestamp": "2025-01-01T00:00:00.000Z",
128
+ "conflicts_with": [],
129
+ "resolved_by": null,
130
+ "tags": ["migration", "shadow-traffic", "validation"]
131
+ },
132
+ {
133
+ "id": "mig-010",
134
+ "type": "risk",
135
+ "topic": "legacy knowledge decay",
136
+ "content": "Team knowledge of the legacy system degrades over time. If migration stalls, institutional knowledge may be lost before it completes. Document as you migrate, not after.",
137
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
138
+ "evidence": "stated",
139
+ "status": "active",
140
+ "phase_added": "define",
141
+ "timestamp": "2025-01-01T00:00:00.000Z",
142
+ "conflicts_with": [],
143
+ "resolved_by": null,
144
+ "tags": ["migration", "knowledge", "documentation"]
145
+ }
146
+ ]
147
+ }
@@ -0,0 +1,175 @@
1
+ {
2
+ "name": "Observability and Incident Response",
3
+ "description": "Structured logging, metrics, distributed tracing, SLO definitions, alerting hygiene, and incident response patterns for production systems.",
4
+ "version": "1.0.0",
5
+ "claims": [
6
+ {
7
+ "id": "obs-001",
8
+ "type": "constraint",
9
+ "topic": "structured logging format",
10
+ "content": "All application logs must be structured JSON with mandatory fields: timestamp (ISO 8601), level (debug/info/warn/error/fatal), message, service, and trace_id. Unstructured string logs break log aggregation queries and make correlation impossible at scale.",
11
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
12
+ "evidence": "production",
13
+ "status": "active",
14
+ "phase_added": "define",
15
+ "timestamp": "2025-01-01T00:00:00.000Z",
16
+ "conflicts_with": [],
17
+ "resolved_by": null,
18
+ "tags": ["observability", "logging", "structured-logging"]
19
+ },
20
+ {
21
+ "id": "obs-002",
22
+ "type": "recommendation",
23
+ "topic": "RED method for services",
24
+ "content": "Every service should expose the RED metrics: Rate (requests per second), Errors (failed requests per second), and Duration (latency histogram). These three metrics cover 80% of service-level debugging. Use histograms with buckets at 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s.",
25
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
26
+ "evidence": "documented",
27
+ "status": "active",
28
+ "phase_added": "define",
29
+ "timestamp": "2025-01-01T00:00:00.000Z",
30
+ "conflicts_with": [],
31
+ "resolved_by": null,
32
+ "tags": ["observability", "metrics", "red-method", "monitoring"]
33
+ },
34
+ {
35
+ "id": "obs-003",
36
+ "type": "factual",
37
+ "topic": "SLO error budget calculation",
38
+ "content": "An SLO of 99.9% availability allows 43.2 minutes of downtime per 30-day window (0.1% error budget). An SLO of 99.95% allows 21.6 minutes. Error budget = total time * (1 - SLO target). When budget is exhausted, freeze feature work and focus on reliability.",
39
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
40
+ "evidence": "documented",
41
+ "status": "active",
42
+ "phase_added": "define",
43
+ "timestamp": "2025-01-01T00:00:00.000Z",
44
+ "conflicts_with": [],
45
+ "resolved_by": null,
46
+ "tags": ["observability", "slo", "error-budget", "reliability"]
47
+ },
48
+ {
49
+ "id": "obs-004",
50
+ "type": "risk",
51
+ "topic": "alert fatigue",
52
+ "content": "Alert fatigue occurs when on-call engineers receive more than 2-3 actionable alerts per shift. Non-actionable alerts must be demoted to dashboards or deleted. Every alert should have a runbook link, a clear threshold, and an expected action. Pages without runbooks erode trust in the alerting system.",
53
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
54
+ "evidence": "production",
55
+ "status": "active",
56
+ "phase_added": "define",
57
+ "timestamp": "2025-01-01T00:00:00.000Z",
58
+ "conflicts_with": [],
59
+ "resolved_by": null,
60
+ "tags": ["observability", "alerting", "on-call", "alert-fatigue"]
61
+ },
62
+ {
63
+ "id": "obs-005",
64
+ "type": "recommendation",
65
+ "topic": "distributed trace propagation",
66
+ "content": "Propagate W3C Trace Context headers (traceparent, tracestate) across all service boundaries including HTTP, gRPC, and message queues. Without trace propagation, latency debugging in distributed systems requires manual log correlation which takes 10-50x longer.",
67
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
68
+ "evidence": "documented",
69
+ "status": "active",
70
+ "phase_added": "define",
71
+ "timestamp": "2025-01-01T00:00:00.000Z",
72
+ "conflicts_with": [],
73
+ "resolved_by": null,
74
+ "tags": ["observability", "tracing", "distributed-systems", "opentelemetry"]
75
+ },
76
+ {
77
+ "id": "obs-006",
78
+ "type": "constraint",
79
+ "topic": "no PII in logs",
80
+ "content": "Logs must never contain PII or credentials (emails, names, IP addresses, session tokens, passwords). Use hashed or tokenized identifiers for correlation. Accidental PII in logs creates GDPR/CCPA compliance risk and makes log retention policies complex. Enforce with automated log scrubbing or allowlist-based field serialization.",
81
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
82
+ "evidence": "documented",
83
+ "status": "active",
84
+ "phase_added": "define",
85
+ "timestamp": "2025-01-01T00:00:00.000Z",
86
+ "conflicts_with": [],
87
+ "resolved_by": null,
88
+ "tags": ["observability", "logging", "pii", "compliance"]
89
+ },
90
+ {
91
+ "id": "obs-007",
92
+ "type": "recommendation",
93
+ "topic": "USE method for infrastructure",
94
+ "content": "For infrastructure resources (CPU, memory, disk, network), track the USE metrics: Utilization (% of capacity used), Saturation (queue depth or wait time), and Errors (error count). Alert when utilization exceeds 80% sustained over 5 minutes or saturation is non-zero.",
95
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
96
+ "evidence": "documented",
97
+ "status": "active",
98
+ "phase_added": "define",
99
+ "timestamp": "2025-01-01T00:00:00.000Z",
100
+ "conflicts_with": [],
101
+ "resolved_by": null,
102
+ "tags": ["observability", "metrics", "use-method", "infrastructure"]
103
+ },
104
+ {
105
+ "id": "obs-008",
106
+ "type": "factual",
107
+ "topic": "log retention cost model",
108
+ "content": "Log storage costs scale linearly with volume and retention. At 100 GB/day ingestion, 30-day retention costs roughly $3,000-6,000/month on major cloud log platforms (Datadog, Splunk, CloudWatch). Reduce costs by sampling debug logs (keep 10%), archiving to cold storage after 7 days, and dropping health check logs.",
109
+ "source": { "origin": "industry", "artifact": null, "connector": null },
110
+ "evidence": "web",
111
+ "status": "active",
112
+ "phase_added": "define",
113
+ "timestamp": "2025-01-01T00:00:00.000Z",
114
+ "conflicts_with": [],
115
+ "resolved_by": null,
116
+ "tags": ["observability", "logging", "cost", "retention"]
117
+ },
118
+ {
119
+ "id": "obs-009",
120
+ "type": "recommendation",
121
+ "topic": "incident severity definitions",
122
+ "content": "Define incident severity levels with quantitative criteria: SEV1 (total service outage or data loss, all hands, 15-min response), SEV2 (degraded for >50% of users, team response, 30-min), SEV3 (degraded for <50% or non-critical feature down, next business day), SEV4 (cosmetic or minor, backlog). Ambiguous severity wastes escalation time.",
123
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
124
+ "evidence": "documented",
125
+ "status": "active",
126
+ "phase_added": "define",
127
+ "timestamp": "2025-01-01T00:00:00.000Z",
128
+ "conflicts_with": [],
129
+ "resolved_by": null,
130
+ "tags": ["observability", "incident-response", "severity", "on-call"]
131
+ },
132
+ {
133
+ "id": "obs-010",
134
+ "type": "risk",
135
+ "topic": "cardinality explosion in metrics",
136
+ "content": "High-cardinality labels (user IDs, request IDs, full URLs) on metrics cause cardinality explosion, which degrades query performance and inflates storage costs multiplicatively, because the number of time series can grow as the product of the distinct values of each label. As a rule of thumb, keep total active time series per Prometheus server under roughly 10 million. Use logs or traces for high-cardinality data, not metrics.",
137
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
138
+ "evidence": "production",
139
+ "status": "active",
140
+ "phase_added": "define",
141
+ "timestamp": "2025-01-01T00:00:00.000Z",
142
+ "conflicts_with": [],
143
+ "resolved_by": null,
144
+ "tags": ["observability", "metrics", "cardinality", "prometheus"]
145
+ },
146
+ {
147
+ "id": "obs-011",
148
+ "type": "recommendation",
149
+ "topic": "postmortem blamelessness",
150
+ "content": "Postmortems must be blameless and focus on systemic causes, not individual actions. Required sections: timeline, impact (users affected, duration, revenue impact), root cause, contributing factors, action items with owners and due dates. Publish postmortems within 48 hours of incident resolution.",
151
+ "source": { "origin": "best-practice", "artifact": null, "connector": null },
152
+ "evidence": "documented",
153
+ "status": "active",
154
+ "phase_added": "define",
155
+ "timestamp": "2025-01-01T00:00:00.000Z",
156
+ "conflicts_with": [],
157
+ "resolved_by": null,
158
+ "tags": ["observability", "incident-response", "postmortem", "culture"]
159
+ },
160
+ {
161
+ "id": "obs-012",
162
+ "type": "estimate",
163
+ "topic": "OpenTelemetry migration effort",
164
+ "content": "Migrating from vendor-specific instrumentation (Datadog agent, New Relic APM) to OpenTelemetry SDK typically takes 2-4 weeks per service for auto-instrumentation, plus 1-2 additional weeks for custom spans and metrics migration. The payoff is vendor portability and 30-50% reduction in long-term instrumentation costs.",
165
+ "source": { "origin": "industry", "artifact": null, "connector": null },
166
+ "evidence": "web",
167
+ "status": "active",
168
+ "phase_added": "define",
169
+ "timestamp": "2025-01-01T00:00:00.000Z",
170
+ "conflicts_with": [],
171
+ "resolved_by": null,
172
+ "tags": ["observability", "opentelemetry", "migration", "cost"]
173
+ }
174
+ ]
175
+ }