@grainulation/silo 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,173 @@
1
+ {
2
+ "name": "Hackathon: Sprint Boost",
3
+ "description": "Business-aligned hackathon category pack. Teams tackle real backlog items or OKR-driven research questions. Judging weights: 40% business impact, 30% research rigor, 30% creativity. Rewards sprints that ship real value.",
4
+ "version": "1.0.0",
5
+ "judging": {
6
+ "weights": {
7
+ "impact": 0.4,
8
+ "rigor": 0.3,
9
+ "creativity": 0.3
10
+ },
11
+ "auto_score_dimensions": {
12
+ "impact": {
13
+ "constraint_satisfaction": 0.4,
14
+ "evidence_tier": 0.3,
15
+ "corroboration": 0.3
16
+ },
17
+ "rigor": {
18
+ "evidence_tier": 0.35,
19
+ "challenge_depth": 0.3,
20
+ "health": 0.2,
21
+ "corroboration": 0.15
22
+ },
23
+ "creativity": {
24
+ "type_diversity": 0.35,
25
+ "recommendation_ratio": 0.35,
26
+ "cross_topic_breadth": 0.3
27
+ }
28
+ },
29
+ "human_score_dimensions": [
30
+ "decision_clarity",
31
+ "stakeholder_awareness",
32
+ "actionability"
33
+ ],
34
+ "final_blend": {
35
+ "auto": 0.5,
36
+ "human": 0.5
37
+ }
38
+ },
39
+ "claims": [
40
+ {
41
+ "id": "hsb-001",
42
+ "type": "constraint",
43
+ "topic": "sprint boost format",
44
+ "content": "Sprint Boost is a business-aligned hackathon format. Each team picks a real task from their backlog or a strategic OKR key result, runs a wheat research sprint on it, and submits the compiled brief. The output is a sprint deliverable, not a side project.",
45
+ "source": {
46
+ "origin": "stakeholder",
47
+ "artifact": null,
48
+ "connector": null
49
+ },
50
+ "evidence": "stated",
51
+ "status": "active",
52
+ "phase_added": "define",
53
+ "timestamp": "2026-03-21T00:00:00.000Z",
54
+ "conflicts_with": [],
55
+ "resolved_by": null,
56
+ "tags": ["hackathon", "sprint-boost", "business-aligned", "format"]
57
+ },
58
+ {
59
+ "id": "hsb-002",
60
+ "type": "constraint",
61
+ "topic": "impact scoring",
62
+ "content": "Business impact is the dominant judging dimension at 40%. Scored across: (1) Does the research address a real business constraint? (2) Are findings backed by evidence strong enough to act on? (3) Are claims corroborated by external sources? A sprint that solves a real problem with weak evidence still outscores a polished sprint on a hypothetical problem.",
63
+ "source": {
64
+ "origin": "best-practice",
65
+ "artifact": null,
66
+ "connector": null
67
+ },
68
+ "evidence": "documented",
69
+ "status": "active",
70
+ "phase_added": "define",
71
+ "timestamp": "2026-03-21T00:00:00.000Z",
72
+ "conflicts_with": [],
73
+ "resolved_by": null,
74
+ "tags": ["hackathon", "sprint-boost", "impact", "scoring"]
75
+ },
76
+ {
77
+ "id": "hsb-003",
78
+ "type": "constraint",
79
+ "topic": "rigor scoring",
80
+ "content": "Research rigor is weighted at 30%. Scored across: (1) Evidence tier distribution — higher tiers (tested, production) score exponentially better. (2) Challenge depth — did the team stress-test their own findings? (3) Health — clean compilation with few warnings. (4) Corroboration — external witnesses back up claims.",
81
+ "source": {
82
+ "origin": "best-practice",
83
+ "artifact": null,
84
+ "connector": null
85
+ },
86
+ "evidence": "documented",
87
+ "status": "active",
88
+ "phase_added": "define",
89
+ "timestamp": "2026-03-21T00:00:00.000Z",
90
+ "conflicts_with": [],
91
+ "resolved_by": null,
92
+ "tags": ["hackathon", "sprint-boost", "rigor", "scoring"]
93
+ },
94
+ {
95
+ "id": "hsb-004",
96
+ "type": "constraint",
97
+ "topic": "creativity scoring",
98
+ "content": "Creativity is weighted at 30%. Scored across: (1) Type diversity — using multiple claim types shows multi-angle thinking. (2) Recommendation ratio — novel, actionable recommendations signal creative problem-solving. (3) Cross-topic breadth — drawing from diverse domains demonstrates creative synthesis.",
99
+ "source": {
100
+ "origin": "best-practice",
101
+ "artifact": null,
102
+ "connector": null
103
+ },
104
+ "evidence": "documented",
105
+ "status": "active",
106
+ "phase_added": "define",
107
+ "timestamp": "2026-03-21T00:00:00.000Z",
108
+ "conflicts_with": [],
109
+ "resolved_by": null,
110
+ "tags": ["hackathon", "sprint-boost", "creativity", "scoring"]
111
+ },
112
+ {
113
+ "id": "hsb-005",
114
+ "type": "recommendation",
115
+ "topic": "challenge tracks",
116
+ "content": "Sprint Boost should offer challenge tracks tied to real OKRs: (1) 'Accelerate' — pick a backlog item and deliver research that unblocks it. (2) 'Rethink' — same task, novel approach. (3) 'Cross-pollinate' — research a problem from another team's backlog. Track 2 prevents Sprint Boost from becoming a crunch sprint.",
117
+ "source": { "origin": "research", "artifact": null, "connector": null },
118
+ "evidence": "web",
119
+ "status": "active",
120
+ "phase_added": "research",
121
+ "timestamp": "2026-03-21T00:00:00.000Z",
122
+ "conflicts_with": [],
123
+ "resolved_by": null,
124
+ "tags": ["hackathon", "sprint-boost", "tracks", "okr"]
125
+ },
126
+ {
127
+ "id": "hsb-006",
128
+ "type": "risk",
129
+ "topic": "crunch sprint risk",
130
+ "content": "Risk: If Sprint Boost only rewards shipping backlog items, it becomes unpaid overtime disguised as a hackathon. The creativity weight (30%) and the 'Rethink' track are guardrails. Human judges should penalize submissions that are pure task execution with no research insight.",
131
+ "source": { "origin": "research", "artifact": null, "connector": null },
132
+ "evidence": "web",
133
+ "status": "active",
134
+ "phase_added": "research",
135
+ "timestamp": "2026-03-21T00:00:00.000Z",
136
+ "conflicts_with": [],
137
+ "resolved_by": null,
138
+ "tags": ["hackathon", "sprint-boost", "risk", "crunch"]
139
+ },
140
+ {
141
+ "id": "hsb-007",
142
+ "type": "recommendation",
143
+ "topic": "human judge criteria",
144
+ "content": "Human judges for Sprint Boost score on: (1) Decision clarity — could a VP act on this brief? (2) Stakeholder awareness — are constraints from real decision-makers? (3) Actionability — are next steps concrete with owners and timelines? These map 1:1 to the three scoring dimensions (impact, rigor, creativity).",
145
+ "source": { "origin": "research", "artifact": null, "connector": null },
146
+ "evidence": "stated",
147
+ "status": "active",
148
+ "phase_added": "research",
149
+ "timestamp": "2026-03-21T00:00:00.000Z",
150
+ "conflicts_with": [],
151
+ "resolved_by": null,
152
+ "tags": ["hackathon", "sprint-boost", "human-judging", "criteria"]
153
+ },
154
+ {
155
+ "id": "hsb-008",
156
+ "type": "factual",
157
+ "topic": "OKR hackathon precedent",
158
+ "content": "The OKR Hackathon is an established corporate format (documented by Devpost) where teams compete to complete OKR goals within a hackathon timebox. Intel ties innovation hackathons directly to strategic OKRs. This validates Sprint Boost: real work, hackathon energy, research-backed output.",
159
+ "source": {
160
+ "origin": "research",
161
+ "artifact": "https://info.devpost.com/blog/8-types-of-internal-hackathons",
162
+ "connector": null
163
+ },
164
+ "evidence": "web",
165
+ "status": "active",
166
+ "phase_added": "research",
167
+ "timestamp": "2026-03-21T00:00:00.000Z",
168
+ "conflicts_with": [],
169
+ "resolved_by": null,
170
+ "tags": ["hackathon", "sprint-boost", "okr", "precedent"]
171
+ }
172
+ ]
173
+ }
@@ -0,0 +1,219 @@
1
+ {
2
+ "name": "Incident Postmortem",
3
+ "description": "Structured framework for blameless incident review sprints. Covers timeline reconstruction, root cause analysis, contributing factors, action items, and organizational learning. Aligned with SRE best practices.",
4
+ "version": "1.0.0",
5
+ "claims": [
6
+ {
7
+ "id": "inc-001",
8
+ "type": "constraint",
9
+ "topic": "blameless postmortem culture",
10
+ "content": "Postmortems must be blameless. Focus on systemic causes, not individual mistakes. Language matters: 'the deploy pipeline lacked rollback automation' not 'engineer X deployed without testing'. Blame drives hiding, which prevents learning.",
11
+ "source": {
12
+ "origin": "best-practice",
13
+ "artifact": "Google SRE Book, Chapter 15",
14
+ "connector": null
15
+ },
16
+ "evidence": "documented",
17
+ "status": "active",
18
+ "phase_added": "define",
19
+ "timestamp": "2026-03-21T00:00:00.000Z",
20
+ "conflicts_with": [],
21
+ "resolved_by": null,
22
+ "tags": ["postmortem", "blameless", "culture", "sre"]
23
+ },
24
+ {
25
+ "id": "inc-002",
26
+ "type": "constraint",
27
+ "topic": "incident timeline",
28
+ "content": "Every postmortem must include a precise timeline: (1) First customer impact (not first alert). (2) Detection time. (3) Escalation and response milestones. (4) Mitigation applied. (5) Full resolution. (6) Verification of recovery. Use UTC timestamps. Each entry is a factual claim with 'documented' evidence from logs and metrics.",
29
+ "source": {
30
+ "origin": "best-practice",
31
+ "artifact": "Google SRE Book, Chapter 15",
32
+ "connector": null
33
+ },
34
+ "evidence": "documented",
35
+ "status": "active",
36
+ "phase_added": "define",
37
+ "timestamp": "2026-03-21T00:00:00.000Z",
38
+ "conflicts_with": [],
39
+ "resolved_by": null,
40
+ "tags": ["postmortem", "timeline", "documentation", "sre"]
41
+ },
42
+ {
43
+ "id": "inc-003",
44
+ "type": "constraint",
45
+ "topic": "severity classification",
46
+ "content": "Classify incident severity: SEV1 — widespread outage, revenue-impacting, all-hands response. SEV2 — partial outage, degraded experience for subset of users. SEV3 — minor issue, workaround available, limited impact. SEV4 — cosmetic or edge case, no user impact. Postmortem required for SEV1 and SEV2. SEV3 optional.",
47
+ "source": {
48
+ "origin": "best-practice",
49
+ "artifact": null,
50
+ "connector": null
51
+ },
52
+ "evidence": "documented",
53
+ "status": "active",
54
+ "phase_added": "define",
55
+ "timestamp": "2026-03-21T00:00:00.000Z",
56
+ "conflicts_with": [],
57
+ "resolved_by": null,
58
+ "tags": ["postmortem", "severity", "classification", "sre"]
59
+ },
60
+ {
61
+ "id": "inc-004",
62
+ "type": "constraint",
63
+ "topic": "root cause analysis",
64
+ "content": "Use the '5 Whys' technique or fault tree analysis to identify root causes. Most incidents have multiple contributing causes. Distinguish: (1) Triggering cause — the immediate action that started the incident. (2) Root cause — the systemic weakness that allowed the trigger to cause impact. (3) Contributing factors — conditions that worsened severity or delayed response.",
65
+ "source": {
66
+ "origin": "best-practice",
67
+ "artifact": null,
68
+ "connector": null
69
+ },
70
+ "evidence": "documented",
71
+ "status": "active",
72
+ "phase_added": "define",
73
+ "timestamp": "2026-03-21T00:00:00.000Z",
74
+ "conflicts_with": [],
75
+ "resolved_by": null,
76
+ "tags": ["postmortem", "root-cause", "5-whys", "fault-tree"]
77
+ },
78
+ {
79
+ "id": "inc-005",
80
+ "type": "constraint",
81
+ "topic": "action items",
82
+ "content": "Every postmortem must produce prioritized action items: (1) Each action has an owner and deadline. (2) Actions are typed: 'mitigate' (prevent recurrence), 'detect' (catch it faster), 'respond' (improve response). (3) Track completion — unfinished action items from past postmortems are a leading indicator of future incidents.",
83
+ "source": {
84
+ "origin": "best-practice",
85
+ "artifact": "Google SRE Book, Chapter 15",
86
+ "connector": null
87
+ },
88
+ "evidence": "documented",
89
+ "status": "active",
90
+ "phase_added": "define",
91
+ "timestamp": "2026-03-21T00:00:00.000Z",
92
+ "conflicts_with": [],
93
+ "resolved_by": null,
94
+ "tags": ["postmortem", "action-items", "tracking", "sre"]
95
+ },
96
+ {
97
+ "id": "inc-006",
98
+ "type": "risk",
99
+ "topic": "action item decay",
100
+ "content": "The most common postmortem failure: action items are written but never completed. Studies show 40-60% of postmortem action items go unfinished. Counter: link action items to sprint tickets, review completion in weekly standups, escalate overdue items, and track completion rate as a team metric.",
101
+ "source": { "origin": "research", "artifact": null, "connector": null },
102
+ "evidence": "web",
103
+ "status": "active",
104
+ "phase_added": "research",
105
+ "timestamp": "2026-03-21T00:00:00.000Z",
106
+ "conflicts_with": [],
107
+ "resolved_by": null,
108
+ "tags": ["postmortem", "action-items", "decay", "risk"]
109
+ },
110
+ {
111
+ "id": "inc-007",
112
+ "type": "factual",
113
+ "topic": "impact quantification",
114
+ "content": "Quantify incident impact across dimensions: (1) Duration (time to detect + time to mitigate + time to resolve). (2) User impact (affected users / total users). (3) Revenue impact (lost transactions, SLA credits owed). (4) Reputation impact (social media mentions, support tickets). (5) Engineering cost (person-hours spent responding).",
115
+ "source": {
116
+ "origin": "best-practice",
117
+ "artifact": null,
118
+ "connector": null
119
+ },
120
+ "evidence": "documented",
121
+ "status": "active",
122
+ "phase_added": "define",
123
+ "timestamp": "2026-03-21T00:00:00.000Z",
124
+ "conflicts_with": [],
125
+ "resolved_by": null,
126
+ "tags": ["postmortem", "impact", "metrics", "quantification"]
127
+ },
128
+ {
129
+ "id": "inc-008",
130
+ "type": "recommendation",
131
+ "topic": "wheat-powered postmortem workflow",
132
+ "content": "Map wheat sprint to postmortem: (1) Sprint question = 'What caused incident X and how do we prevent recurrence?' (2) Factual claims = timeline events backed by logs. (3) Constraint claims = SLA requirements and detection targets. (4) Risk claims = contributing factors and systemic weaknesses. (5) Recommendation claims = action items with owners. (6) The compiled brief IS the postmortem document.",
133
+ "source": {
134
+ "origin": "best-practice",
135
+ "artifact": null,
136
+ "connector": null
137
+ },
138
+ "evidence": "stated",
139
+ "status": "active",
140
+ "phase_added": "define",
141
+ "timestamp": "2026-03-21T00:00:00.000Z",
142
+ "conflicts_with": [],
143
+ "resolved_by": null,
144
+ "tags": ["postmortem", "wheat", "workflow", "recommendation"]
145
+ },
146
+ {
147
+ "id": "inc-009",
148
+ "type": "constraint",
149
+ "topic": "postmortem timeline requirement",
150
+ "content": "Postmortem sprint must begin within 48 hours of incident resolution for SEV1, within 1 week for SEV2. Memory degrades rapidly — the timeline becomes unreliable after 72 hours, so capture raw timeline notes within that window even when the SEV2 sprint starts later. The postmortem review meeting should happen within 5 business days of the sprint.",
151
+ "source": {
152
+ "origin": "best-practice",
153
+ "artifact": "Google SRE Book, Chapter 15",
154
+ "connector": null
155
+ },
156
+ "evidence": "documented",
157
+ "status": "active",
158
+ "phase_added": "define",
159
+ "timestamp": "2026-03-21T00:00:00.000Z",
160
+ "conflicts_with": [],
161
+ "resolved_by": null,
162
+ "tags": ["postmortem", "timeline", "sre", "process"]
163
+ },
164
+ {
165
+ "id": "inc-010",
166
+ "type": "factual",
167
+ "topic": "detection and response metrics",
168
+ "content": "Key incident metrics: MTTD (mean time to detect) — from first impact to first alert. MTTR (mean time to respond) — from alert to first responder action. MTTM (mean time to mitigate) — from first response until customer impact stops. MTTResolve (mean time to resolve) — from mitigation until the root cause is fixed. Track trends across incidents, not just individual values.",
169
+ "source": {
170
+ "origin": "best-practice",
171
+ "artifact": null,
172
+ "connector": null
173
+ },
174
+ "evidence": "documented",
175
+ "status": "active",
176
+ "phase_added": "define",
177
+ "timestamp": "2026-03-21T00:00:00.000Z",
178
+ "conflicts_with": [],
179
+ "resolved_by": null,
180
+ "tags": ["postmortem", "mttd", "mttr", "metrics", "sre"]
181
+ },
182
+ {
183
+ "id": "inc-011",
184
+ "type": "risk",
185
+ "topic": "recurrence patterns",
186
+ "content": "If the same category of incident recurs 3+ times, the postmortems are failing. Common reasons: (1) Action items address symptoms not root causes. (2) Systemic issues span team boundaries with no single owner. (3) Technical debt prioritized below feature work. Escalate recurring incident categories to engineering leadership.",
187
+ "source": {
188
+ "origin": "best-practice",
189
+ "artifact": null,
190
+ "connector": null
191
+ },
192
+ "evidence": "documented",
193
+ "status": "active",
194
+ "phase_added": "research",
195
+ "timestamp": "2026-03-21T00:00:00.000Z",
196
+ "conflicts_with": [],
197
+ "resolved_by": null,
198
+ "tags": ["postmortem", "recurrence", "systemic", "risk"]
199
+ },
200
+ {
201
+ "id": "inc-012",
202
+ "type": "recommendation",
203
+ "topic": "learning dissemination",
204
+ "content": "Postmortem value is maximized when learnings spread beyond the incident team. Recommended: (1) Publish postmortem to internal wiki within 1 week. (2) Present at engineering all-hands for SEV1. (3) Add to onboarding reading list for the affected service. (4) Use wheat /witness to link related incidents across teams — the knowledge graph surfaces patterns humans miss.",
205
+ "source": {
206
+ "origin": "best-practice",
207
+ "artifact": null,
208
+ "connector": null
209
+ },
210
+ "evidence": "stated",
211
+ "status": "active",
212
+ "phase_added": "research",
213
+ "timestamp": "2026-03-21T00:00:00.000Z",
214
+ "conflicts_with": [],
215
+ "resolved_by": null,
216
+ "tags": ["postmortem", "learning", "knowledge-sharing", "enterprise"]
217
+ }
218
+ ]
219
+ }
@@ -8,7 +8,11 @@
8
8
  "type": "constraint",
9
9
  "topic": "migration rollback safety",
10
10
  "content": "Production migrations must be reversible. Every schema change, data transform, or infrastructure swap needs a tested rollback procedure with a maximum rollback time SLA.",
11
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
11
+ "source": {
12
+ "origin": "best-practice",
13
+ "artifact": null,
14
+ "connector": null
15
+ },
12
16
  "evidence": "documented",
13
17
  "status": "active",
14
18
  "phase_added": "define",
@@ -22,7 +26,11 @@
22
26
  "type": "risk",
23
27
  "topic": "dual-write consistency risk",
24
28
  "content": "Dual-write periods introduce data consistency risks. During cutover, writes must go to both old and new systems, and a reconciliation process must detect drift.",
25
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
29
+ "source": {
30
+ "origin": "best-practice",
31
+ "artifact": null,
32
+ "connector": null
33
+ },
26
34
  "evidence": "documented",
27
35
  "status": "active",
28
36
  "phase_added": "define",
@@ -36,7 +44,11 @@
36
44
  "type": "risk",
37
45
  "topic": "migration window overrun",
38
46
  "content": "Data migrations that run longer than the maintenance window force a choice between extended downtime and incremental migration. Estimate data volume transfer rates early.",
39
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
47
+ "source": {
48
+ "origin": "best-practice",
49
+ "artifact": null,
50
+ "connector": null
51
+ },
40
52
  "evidence": "documented",
41
53
  "status": "active",
42
54
  "phase_added": "define",
@@ -50,7 +62,11 @@
50
62
  "type": "constraint",
51
63
  "topic": "minimum viable migration scope",
52
64
  "content": "Feature parity with the legacy system is rarely achievable on day one. Define a 'minimum viable migration' scope that covers the critical path and defer edge cases.",
53
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
65
+ "source": {
66
+ "origin": "best-practice",
67
+ "artifact": null,
68
+ "connector": null
69
+ },
54
70
  "evidence": "stated",
55
71
  "status": "active",
56
72
  "phase_added": "define",
@@ -64,7 +80,11 @@
64
80
  "type": "recommendation",
65
81
  "topic": "strangler fig migration pattern",
66
82
  "content": "Use the strangler fig pattern for large migrations: route traffic incrementally to the new system while the old system continues to serve unmodified paths.",
67
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
83
+ "source": {
84
+ "origin": "best-practice",
85
+ "artifact": null,
86
+ "connector": null
87
+ },
68
88
  "evidence": "documented",
69
89
  "status": "active",
70
90
  "phase_added": "define",
@@ -78,7 +98,11 @@
78
98
  "type": "risk",
79
99
  "topic": "character encoding corruption",
80
100
  "content": "Character encoding mismatches (UTF-8 vs Latin-1, collation differences) are the most common silent data corruption source in database migrations. Test with production-representative data.",
81
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
101
+ "source": {
102
+ "origin": "best-practice",
103
+ "artifact": null,
104
+ "connector": null
105
+ },
82
106
  "evidence": "documented",
83
107
  "status": "active",
84
108
  "phase_added": "define",
@@ -106,7 +130,11 @@
106
130
  "type": "constraint",
107
131
  "topic": "API versioning during migration",
108
132
  "content": "API contract changes during migration require versioned endpoints. Clients must be able to continue using the old API for a deprecation period (minimum 3 months is standard).",
109
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
133
+ "source": {
134
+ "origin": "best-practice",
135
+ "artifact": null,
136
+ "connector": null
137
+ },
110
138
  "evidence": "documented",
111
139
  "status": "active",
112
140
  "phase_added": "define",
@@ -120,7 +148,11 @@
120
148
  "type": "recommendation",
121
149
  "topic": "shadow traffic validation",
122
150
  "content": "Run shadow traffic (replay production reads against the new system without serving responses) for at least 2 weeks before cutover. Compare results for correctness and latency.",
123
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
151
+ "source": {
152
+ "origin": "best-practice",
153
+ "artifact": null,
154
+ "connector": null
155
+ },
124
156
  "evidence": "documented",
125
157
  "status": "active",
126
158
  "phase_added": "define",
@@ -134,7 +166,11 @@
134
166
  "type": "risk",
135
167
  "topic": "legacy knowledge decay",
136
168
  "content": "Team knowledge of the legacy system degrades over time. If migration stalls, institutional knowledge may be lost before it completes. Document as you migrate, not after.",
137
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
169
+ "source": {
170
+ "origin": "best-practice",
171
+ "artifact": null,
172
+ "connector": null
173
+ },
138
174
  "evidence": "stated",
139
175
  "status": "active",
140
176
  "phase_added": "define",
@@ -8,7 +8,11 @@
8
8
  "type": "constraint",
9
9
  "topic": "structured logging format",
10
10
  "content": "All application logs must be structured JSON with mandatory fields: timestamp (ISO 8601), level (debug/info/warn/error/fatal), message, service, and trace_id. Unstructured string logs break log aggregation queries and make correlation impossible at scale.",
11
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
11
+ "source": {
12
+ "origin": "best-practice",
13
+ "artifact": null,
14
+ "connector": null
15
+ },
12
16
  "evidence": "production",
13
17
  "status": "active",
14
18
  "phase_added": "define",
@@ -22,7 +26,11 @@
22
26
  "type": "recommendation",
23
27
  "topic": "RED method for services",
24
28
  "content": "Every service should expose the RED metrics: Rate (requests per second), Errors (failed requests per second), and Duration (latency histogram). These three metrics cover 80% of service-level debugging. Use histograms with buckets at 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s.",
25
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
29
+ "source": {
30
+ "origin": "best-practice",
31
+ "artifact": null,
32
+ "connector": null
33
+ },
26
34
  "evidence": "documented",
27
35
  "status": "active",
28
36
  "phase_added": "define",
@@ -36,7 +44,11 @@
36
44
  "type": "factual",
37
45
  "topic": "SLO error budget calculation",
38
46
  "content": "An SLO of 99.9% availability allows 43.2 minutes of downtime per 30-day window (0.1% error budget). An SLO of 99.95% allows 21.6 minutes. Error budget = total time * (1 - SLO target). When budget is exhausted, freeze feature work and focus on reliability.",
39
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
47
+ "source": {
48
+ "origin": "best-practice",
49
+ "artifact": null,
50
+ "connector": null
51
+ },
40
52
  "evidence": "documented",
41
53
  "status": "active",
42
54
  "phase_added": "define",
@@ -50,7 +62,11 @@
50
62
  "type": "risk",
51
63
  "topic": "alert fatigue",
52
64
  "content": "Alert fatigue occurs when on-call engineers receive more than 2-3 actionable alerts per shift. Non-actionable alerts must be demoted to dashboards or deleted. Every alert should have a runbook link, a clear threshold, and an expected action. Pages without runbooks erode trust in the alerting system.",
53
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
65
+ "source": {
66
+ "origin": "best-practice",
67
+ "artifact": null,
68
+ "connector": null
69
+ },
54
70
  "evidence": "production",
55
71
  "status": "active",
56
72
  "phase_added": "define",
@@ -64,21 +80,34 @@
64
80
  "type": "recommendation",
65
81
  "topic": "distributed trace propagation",
66
82
  "content": "Propagate W3C Trace Context headers (traceparent, tracestate) across all service boundaries including HTTP, gRPC, and message queues. Without trace propagation, latency debugging in distributed systems requires manual log correlation which takes 10-50x longer.",
67
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
83
+ "source": {
84
+ "origin": "best-practice",
85
+ "artifact": null,
86
+ "connector": null
87
+ },
68
88
  "evidence": "documented",
69
89
  "status": "active",
70
90
  "phase_added": "define",
71
91
  "timestamp": "2025-01-01T00:00:00.000Z",
72
92
  "conflicts_with": [],
73
93
  "resolved_by": null,
74
- "tags": ["observability", "tracing", "distributed-systems", "opentelemetry"]
94
+ "tags": [
95
+ "observability",
96
+ "tracing",
97
+ "distributed-systems",
98
+ "opentelemetry"
99
+ ]
75
100
  },
76
101
  {
77
102
  "id": "obs-006",
78
103
  "type": "constraint",
79
104
  "topic": "no PII in logs",
80
105
  "content": "Logs must never contain PII (emails, names, IP addresses, session tokens, passwords). Use hashed or tokenized identifiers for correlation. Accidental PII in logs creates GDPR/CCPA compliance risk and makes log retention policies complex. Enforce with automated log scrubbing or allowlist-based field serialization.",
81
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
106
+ "source": {
107
+ "origin": "best-practice",
108
+ "artifact": null,
109
+ "connector": null
110
+ },
82
111
  "evidence": "documented",
83
112
  "status": "active",
84
113
  "phase_added": "define",
@@ -92,7 +121,11 @@
92
121
  "type": "recommendation",
93
122
  "topic": "USE method for infrastructure",
94
123
  "content": "For infrastructure resources (CPU, memory, disk, network), track the USE metrics: Utilization (% of capacity used), Saturation (queue depth or wait time), and Errors (error count). Alert when utilization exceeds 80% sustained over 5 minutes or saturation is non-zero.",
95
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
124
+ "source": {
125
+ "origin": "best-practice",
126
+ "artifact": null,
127
+ "connector": null
128
+ },
96
129
  "evidence": "documented",
97
130
  "status": "active",
98
131
  "phase_added": "define",
@@ -120,7 +153,11 @@
120
153
  "type": "recommendation",
121
154
  "topic": "incident severity definitions",
122
155
  "content": "Define incident severity levels with quantitative criteria: SEV1 (total service outage or data loss, all hands, 15-min response), SEV2 (degraded for >50% of users, team response, 30-min), SEV3 (degraded for <50% or non-critical feature down, next business day), SEV4 (cosmetic or minor, backlog). Ambiguous severity wastes escalation time.",
123
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
156
+ "source": {
157
+ "origin": "best-practice",
158
+ "artifact": null,
159
+ "connector": null
160
+ },
124
161
  "evidence": "documented",
125
162
  "status": "active",
126
163
  "phase_added": "define",
@@ -134,7 +171,11 @@
134
171
  "type": "risk",
135
172
  "topic": "cardinality explosion in metrics",
136
173
  "content": "High-cardinality labels (user IDs, request IDs, full URLs) on metrics cause cardinality explosion, which degrades query performance and inflates storage costs exponentially. Prometheus recommends keeping total time series under 10 million. Use logs or traces for high-cardinality data, not metrics.",
137
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
174
+ "source": {
175
+ "origin": "best-practice",
176
+ "artifact": null,
177
+ "connector": null
178
+ },
138
179
  "evidence": "production",
139
180
  "status": "active",
140
181
  "phase_added": "define",
@@ -148,7 +189,11 @@
148
189
  "type": "recommendation",
149
190
  "topic": "postmortem blamelessness",
150
191
  "content": "Postmortems must be blameless and focus on systemic causes, not individual actions. Required sections: timeline, impact (users affected, duration, revenue impact), root cause, contributing factors, action items with owners and due dates. Publish postmortems within 48 hours of incident resolution.",
151
- "source": { "origin": "best-practice", "artifact": null, "connector": null },
192
+ "source": {
193
+ "origin": "best-practice",
194
+ "artifact": null,
195
+ "connector": null
196
+ },
152
197
  "evidence": "documented",
153
198
  "status": "active",
154
199
  "phase_added": "define",
@@ -172,4 +217,4 @@
172
217
  "tags": ["observability", "opentelemetry", "migration", "cost"]
173
218
  }
174
219
  ]
175
- }
220
+ }