arkaos 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/README.md +100 -74
  2. package/VERSION +1 -1
  3. package/bin/arkaos +1 -1
  4. package/config/constitution.yaml +4 -0
  5. package/config/hooks/user-prompt-submit-v2.sh +20 -38
  6. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  7. package/core/agents/__pycache__/__init__.cpython-313.pyc +0 -0
  8. package/core/agents/__pycache__/loader.cpython-313.pyc +0 -0
  9. package/core/agents/__pycache__/schema.cpython-313.pyc +0 -0
  10. package/core/agents/__pycache__/validator.cpython-313.pyc +0 -0
  11. package/core/budget/__init__.py +6 -0
  12. package/core/budget/__pycache__/__init__.cpython-313.pyc +0 -0
  13. package/core/budget/__pycache__/manager.cpython-313.pyc +0 -0
  14. package/core/budget/__pycache__/schema.cpython-313.pyc +0 -0
  15. package/core/budget/manager.py +193 -0
  16. package/core/budget/schema.py +82 -0
  17. package/core/conclave/__pycache__/__init__.cpython-313.pyc +0 -0
  18. package/core/conclave/__pycache__/advisor_db.cpython-313.pyc +0 -0
  19. package/core/conclave/__pycache__/display.cpython-313.pyc +0 -0
  20. package/core/conclave/__pycache__/matcher.cpython-313.pyc +0 -0
  21. package/core/conclave/__pycache__/persistence.cpython-313.pyc +0 -0
  22. package/core/conclave/__pycache__/profiler.cpython-313.pyc +0 -0
  23. package/core/conclave/__pycache__/prompts.cpython-313.pyc +0 -0
  24. package/core/conclave/__pycache__/schema.cpython-313.pyc +0 -0
  25. package/core/governance/__pycache__/__init__.cpython-313.pyc +0 -0
  26. package/core/governance/__pycache__/constitution.cpython-313.pyc +0 -0
  27. package/core/obsidian/__init__.py +6 -0
  28. package/core/obsidian/__pycache__/__init__.cpython-313.pyc +0 -0
  29. package/core/obsidian/__pycache__/templates.cpython-313.pyc +0 -0
  30. package/core/obsidian/__pycache__/writer.cpython-313.pyc +0 -0
  31. package/core/obsidian/templates.py +76 -0
  32. package/core/obsidian/writer.py +148 -0
  33. package/core/orchestration/__init__.py +6 -0
  34. package/core/orchestration/__pycache__/__init__.cpython-313.pyc +0 -0
  35. package/core/orchestration/__pycache__/patterns.cpython-313.pyc +0 -0
  36. package/core/orchestration/__pycache__/protocol.cpython-313.pyc +0 -0
  37. package/core/orchestration/patterns.py +136 -0
  38. package/core/orchestration/protocol.py +96 -0
  39. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  40. package/core/registry/__pycache__/generator.cpython-313.pyc +0 -0
  41. package/core/runtime/__pycache__/__init__.cpython-313.pyc +0 -0
  42. package/core/runtime/__pycache__/base.cpython-313.pyc +0 -0
  43. package/core/runtime/__pycache__/claude_code.cpython-313.pyc +0 -0
  44. package/core/runtime/__pycache__/codex_cli.cpython-313.pyc +0 -0
  45. package/core/runtime/__pycache__/cursor.cpython-313.pyc +0 -0
  46. package/core/runtime/__pycache__/gemini_cli.cpython-313.pyc +0 -0
  47. package/core/runtime/__pycache__/registry.cpython-313.pyc +0 -0
  48. package/core/runtime/__pycache__/subagent.cpython-313.pyc +0 -0
  49. package/core/specs/__pycache__/__init__.cpython-313.pyc +0 -0
  50. package/core/specs/__pycache__/manager.cpython-313.pyc +0 -0
  51. package/core/specs/__pycache__/schema.cpython-313.pyc +0 -0
  52. package/core/squads/__pycache__/__init__.cpython-313.pyc +0 -0
  53. package/core/squads/__pycache__/loader.cpython-313.pyc +0 -0
  54. package/core/squads/__pycache__/registry.cpython-313.pyc +0 -0
  55. package/core/squads/__pycache__/schema.cpython-313.pyc +0 -0
  56. package/core/synapse/__pycache__/__init__.cpython-313.pyc +0 -0
  57. package/core/synapse/__pycache__/cache.cpython-313.pyc +0 -0
  58. package/core/synapse/__pycache__/engine.cpython-313.pyc +0 -0
  59. package/core/synapse/__pycache__/layers.cpython-313.pyc +0 -0
  60. package/core/tasks/__pycache__/__init__.cpython-313.pyc +0 -0
  61. package/core/tasks/__pycache__/manager.cpython-313.pyc +0 -0
  62. package/core/tasks/__pycache__/schema.cpython-313.pyc +0 -0
  63. package/core/tasks/schema.py +6 -0
  64. package/core/workflow/__pycache__/__init__.cpython-313.pyc +0 -0
  65. package/core/workflow/__pycache__/engine.cpython-313.pyc +0 -0
  66. package/core/workflow/__pycache__/loader.cpython-313.pyc +0 -0
  67. package/core/workflow/__pycache__/schema.cpython-313.pyc +0 -0
  68. package/core/workflow/engine.py +44 -0
  69. package/core/workflow/schema.py +1 -0
  70. package/departments/dev/skills/agent-design/SKILL.md +4 -0
  71. package/departments/dev/skills/agent-design/references/architecture-patterns.md +223 -0
  72. package/departments/dev/skills/ai-security/SKILL.md +4 -0
  73. package/departments/dev/skills/ai-security/references/prompt-injection-catalog.md +230 -0
  74. package/departments/dev/skills/ci-cd-pipeline/SKILL.md +4 -0
  75. package/departments/dev/skills/ci-cd-pipeline/references/github-actions-patterns.md +202 -0
  76. package/departments/dev/skills/db-schema/SKILL.md +4 -0
  77. package/departments/dev/skills/db-schema/references/indexing-strategy.md +197 -0
  78. package/departments/dev/skills/dependency-audit/SKILL.md +4 -0
  79. package/departments/dev/skills/dependency-audit/references/license-matrix.md +191 -0
  80. package/departments/dev/skills/incident/SKILL.md +4 -0
  81. package/departments/dev/skills/incident/references/severity-playbook.md +221 -0
  82. package/departments/dev/skills/observability/SKILL.md +4 -0
  83. package/departments/dev/skills/observability/references/slo-design.md +200 -0
  84. package/departments/dev/skills/rag-architect/SKILL.md +5 -0
  85. package/departments/dev/skills/rag-architect/references/chunking-strategies.md +129 -0
  86. package/departments/dev/skills/rag-architect/references/evaluation-guide.md +158 -0
  87. package/departments/dev/skills/red-team/SKILL.md +4 -0
  88. package/departments/dev/skills/red-team/references/mitre-attack-web.md +165 -0
  89. package/departments/dev/skills/security-audit/SKILL.md +4 -0
  90. package/departments/dev/skills/security-audit/references/owasp-2025-deep.md +409 -0
  91. package/departments/dev/skills/security-compliance/SKILL.md +117 -0
  92. package/departments/finance/skills/ciso-advisor/SKILL.md +4 -0
  93. package/departments/finance/skills/ciso-advisor/references/compliance-roadmap.md +172 -0
  94. package/departments/marketing/skills/programmatic-seo/SKILL.md +4 -0
  95. package/departments/marketing/skills/programmatic-seo/references/template-playbooks.md +289 -0
  96. package/departments/ops/skills/gdpr-compliance/SKILL.md +104 -0
  97. package/departments/ops/skills/iso27001/SKILL.md +113 -0
  98. package/departments/ops/skills/quality-management/SKILL.md +118 -0
  99. package/departments/ops/skills/risk-management/SKILL.md +120 -0
  100. package/departments/ops/skills/soc2-compliance/SKILL.md +120 -0
  101. package/departments/strategy/skills/cto-advisor/SKILL.md +4 -0
  102. package/departments/strategy/skills/cto-advisor/references/build-vs-buy-framework.md +190 -0
  103. package/installer/cli.js +13 -2
  104. package/installer/index.js +1 -2
  105. package/installer/migrate.js +123 -0
  106. package/installer/update.js +28 -15
  107. package/package.json +1 -1
  108. package/pyproject.toml +1 -1
  109. package/core/agents/__pycache__/registry_gen.cpython-313.pyc +0 -0
@@ -0,0 +1,221 @@
1
+ # Severity Playbook — Deep Reference
2
+
3
+ > SEV1-4 definitions, escalation paths, communication templates, PIR framework, and anti-patterns.
4
+
5
+ ## Severity Definitions with Examples
6
+
7
+ | Level | Definition | User Impact | Examples |
8
+ |-------|-----------|-------------|---------|
9
+ | **SEV1** | Complete service outage, data loss, active security breach | 100% of users or data integrity compromised | Database corruption, payment system down, credentials leaked, entire site 500 |
10
+ | **SEV2** | Major feature degraded, >25% users affected | Significant functionality lost | Search broken, checkout intermittent, API latency >10x, auth failures for subset |
11
+ | **SEV3** | Single feature broken, workaround exists | Minor inconvenience, ≤25% users | Export fails (manual workaround), slow dashboard, broken notification emails |
12
+ | **SEV4** | Cosmetic, dev/staging only, no user impact | None or negligible | UI alignment bug, staging env down, deprecation warning, flaky test |
13
+
14
+ ## Escalation Paths
15
+
16
+ ### SEV1 — Full Escalation
17
+
18
+ ```
19
+ T+0min Alert fires or report received
20
+ T+5min On-call engineer acknowledges, starts investigation
21
+ T+10min Incident Commander assigned, war room opened
22
+ T+15min First stakeholder notification sent
23
+ T+15min Engineering lead and CTO notified
24
+ T+30min If no mitigation path: escalate to vendor/cloud provider
25
+ T+60min If unresolved: executive briefing, consider public status page
26
+ T+4h If unresolved: assemble cross-team tiger team
27
+ ```
28
+
29
+ ### SEV2 — Team Escalation
30
+
31
+ ```
32
+ T+0min Alert fires or report received
33
+ T+15min On-call engineer acknowledges
34
+ T+30min Team lead notified, incident channel created
35
+ T+1h First stakeholder update
36
+ T+2h If unresolved: escalate to engineering manager
37
+ T+4h If unresolved: consider SEV1 upgrade
38
+ ```
39
+
40
+ ### SEV3 — Standard Response
41
+
42
+ ```
43
+ T+0min Ticket created automatically or manually
44
+ T+2h Engineer assigned during business hours
45
+ T+1d Initial investigation and fix ETA
46
+ T+3d Fix deployed or workaround documented
47
+ ```
48
+
49
+ ### SEV4 — Backlog
50
+
51
+ ```
52
+ T+0min Ticket created, tagged low priority
53
+ Next sprint Triaged and prioritized
54
+ ```
55
+
56
+ ## Severity Upgrade/Downgrade Criteria
57
+
58
+ | Trigger | Action |
59
+ |---------|--------|
60
+ | Impact expands beyond initial scope | Upgrade severity |
61
+ | Duration exceeds 2x expected MTTR | Upgrade severity |
62
+ | Data integrity concerns emerge | Upgrade to SEV1 |
63
+ | Workaround found and confirmed | Consider downgrade |
64
+ | Impact narrower than initial assessment | Downgrade severity |
65
+
66
+ ## Communication Templates
67
+
68
+ ### Initial Notification (SEV1/SEV2)
69
+
70
+ ```
71
+ INCIDENT: [SEV{N}] {Service Name} - {Brief Description}
72
+
73
+ Impact: {What users experience, how many affected}
74
+ Start time: {ISO 8601 timestamp, timezone}
75
+ Status: INVESTIGATING
76
+
77
+ Incident Commander: {Name}
78
+ Technical Lead: {Name}
79
+ War Room: {Slack channel / Zoom link}
80
+
81
+ Next update: {Time, max 15min for SEV1, 30min for SEV2}
82
+ ```
83
+
84
+ ### Status Update
85
+
86
+ ```
87
+ INCIDENT UPDATE #{N}: [SEV{level}] {Service Name}
88
+
89
+ Status: INVESTIGATING | IDENTIFIED | MITIGATING | MONITORING | RESOLVED
90
+ Duration: {elapsed time}
91
+
92
+ What we know:
93
+ - {Finding 1}
94
+ - {Finding 2}
95
+
96
+ Actions taken:
97
+ - {Action 1}
98
+ - {Action 2}
99
+
100
+ Next steps:
101
+ - {Planned action with owner}
102
+
103
+ ETA to resolution: {estimate or "Under investigation"}
104
+ Next update: {time}
105
+ ```
106
+
107
+ ### Resolution Notification
108
+
109
+ ```
110
+ RESOLVED: [SEV{level}] {Service Name} - {Brief Description}
111
+
112
+ Duration: {start} to {end} ({total})
113
+ Root cause: {1-2 sentence summary}
114
+ Fix applied: {what was done}
115
+ Users affected: {count or percentage}
116
+
117
+ Post-Incident Review scheduled: {date/time}
118
+ Action items will be tracked in: {ticket link}
119
+ ```
120
+
121
+ ### Customer-Facing Status Page
122
+
123
+ ```
124
+ [Investigating] We are aware of issues with {feature}. Our team is actively
125
+ investigating. We will provide an update within {timeframe}.
126
+
127
+ [Identified] We have identified the cause of {issue}. A fix is being implemented.
128
+ Expected resolution: {ETA}.
129
+
130
+ [Resolved] The issue with {feature} has been resolved. All systems are operating
131
+ normally. We apologize for the inconvenience.
132
+ ```
133
+
134
+ ## Post-Incident Review (PIR) Template
135
+
136
+ ### Header
137
+
138
+ | Field | Value |
139
+ |-------|-------|
140
+ | Incident ID | INC-YYYY-NNN |
141
+ | Severity | SEV{N} |
142
+ | Date | YYYY-MM-DD |
143
+ | Duration | {start} to {end} ({total}) |
144
+ | Incident Commander | {Name} |
145
+ | Technical Lead | {Name} |
146
+ | PIR Author | {Name} |
147
+ | PIR Date | {date, within 48h of resolution} |
148
+
149
+ ### Timeline (Required for SEV1/SEV2)
150
+
151
+ | Time (UTC) | Event | Source |
152
+ |------------|-------|--------|
153
+ | HH:MM | Alert fired: {description} | Monitoring |
154
+ | HH:MM | On-call acknowledged | PagerDuty |
155
+ | HH:MM | IC assigned, war room opened | Manual |
156
+ | HH:MM | Root cause identified: {description} | Investigation |
157
+ | HH:MM | Mitigation applied: {action} | Deployment |
158
+ | HH:MM | Service confirmed restored | Monitoring |
159
+
160
+ ### Root Cause Analysis
161
+
162
+ **5 Whys format:**
163
+
164
+ ```
165
+ 1. Why did the service go down?
166
+ -> Database connection pool exhausted
167
+ 2. Why was the pool exhausted?
168
+ -> Slow query holding connections for 30s+
169
+ 3. Why was the query slow?
170
+ -> Missing index on users.email after migration
171
+ 4. Why was the index missing?
172
+ -> Migration script did not include index creation
173
+ 5. Why was the missing index not caught?
174
+ -> No performance test in CI for migration scripts
175
+ ```
176
+
177
+ ### Action Items Table
178
+
179
+ | # | Action | Type | Owner | Due Date | Priority | Status |
180
+ |---|--------|------|-------|----------|----------|--------|
181
+ | 1 | Add index on users.email | Fix | {name} | {date} | P0 | Done |
182
+ | 2 | Add migration perf tests to CI | Prevent | {name} | {date} | P1 | Open |
183
+ | 3 | Add connection pool alert at 80% | Detect | {name} | {date} | P1 | Open |
184
+ | 4 | Document DB migration checklist | Process | {name} | {date} | P2 | Open |
185
+
186
+ Action item types: **Fix** (address this incident), **Prevent** (stop recurrence), **Detect** (catch it earlier), **Process** (improve response).
187
+
188
+ ## PIR Quality Checklist
189
+
190
+ - [ ] Timeline is complete with timestamps from monitoring (not memory)
191
+ - [ ] Root cause goes deep enough (5 Whys or equivalent)
192
+ - [ ] Action items have owners and due dates (no orphaned items)
193
+ - [ ] Action items include detection improvements, not just fixes
194
+ - [ ] Blameless language throughout (systems, not people)
195
+ - [ ] Shared with broader engineering team
196
+ - [ ] Runbooks updated with new knowledge
197
+ - [ ] Follow-up review scheduled for action item completion
198
+
199
+ ## Anti-Patterns
200
+
201
+ | Anti-Pattern | Why It Hurts | Fix |
202
+ |-------------|-------------|-----|
203
+ | Skipping severity classification | Wrong response level, wasted effort or delayed response | Classify within first 5 minutes, always |
204
+ | Hero culture (one person does everything) | Burnout, no knowledge sharing, SPOF | Separate IC and Tech Lead roles |
205
+ | No communication cadence | Stakeholders assume the worst, escalate unnecessarily | Set timer for updates, even if "still investigating" |
206
+ | Blame-focused PIR | People hide mistakes, no systemic improvement | Blameless by policy, focus on systems |
207
+ | PIR action items with no owners | Nothing gets done, same incident recurs | Every action item requires name + date |
208
+ | Never upgrading severity | SEV3 that is actually SEV1 gets slow response | Review upgrade criteria at every status update |
209
+ | Fix-only action items | Catches this incident but not the next variant | Always include Detect and Prevent items |
210
+ | PIR delayed beyond 1 week | Details forgotten, momentum lost | Schedule within 48 hours, hard deadline 5 days |
211
+
212
+ ## Metrics to Track
213
+
214
+ | Metric | Target | Measures |
215
+ |--------|--------|----------|
216
+ | MTTD (Mean Time to Detect) | < 5 min | Monitoring effectiveness |
217
+ | MTTA (Mean Time to Acknowledge) | < 5 min (SEV1), < 15 min (SEV2) | On-call responsiveness |
218
+ | MTTR (Mean Time to Resolve) | < 1h (SEV1), < 4h (SEV2) | Resolution efficiency |
219
+ | PIR completion rate | 100% for SEV1/SEV2 | Learning culture |
220
+ | Action item completion rate | > 90% within due date | Follow-through |
221
+ | Recurrence rate | < 5% same root cause | Prevention effectiveness |
@@ -117,3 +117,7 @@ Surface these issues WITHOUT being asked:
117
117
  2. [Next priority]
118
118
  3. [Next priority]
119
119
  ```
120
+
121
+ ## References
122
+
123
+ - [slo-design.md](references/slo-design.md) — SLI/SLO/SLA framework, error budget calculations, and burn rate alert configuration
@@ -0,0 +1,200 @@
1
+ # SLO Design Guide — Deep Reference
2
+
3
+ > SLI/SLO/SLA framework, error budgets, burn rate alerts, and production SLO documents.
4
+
5
+ ## Terminology
6
+
7
+ | Term | Definition | Owner | Example |
8
+ |------|-----------|-------|---------|
9
+ | **SLI** (Service Level Indicator) | Quantitative measure of service behavior | Engineering | Request latency p99 |
10
+ | **SLO** (Service Level Objective) | Target value for an SLI over a time window | Engineering + Product | p99 latency < 200ms over 30 days |
11
+ | **SLA** (Service Level Agreement) | Contract with consequences for missing targets | Business + Legal | 99.9% uptime or service credits |
12
+ | **Error Budget** | Allowed amount of unreliability | Engineering | 0.1% of requests can fail per month |
13
+
14
+ Relationship: SLI measures reality. SLO sets internal targets. SLA sets external commitments. SLO should always be stricter than SLA.
15
+
16
+ ## Step 1: Define SLIs
17
+
18
+ ### SLI Selection by Service Type
19
+
20
+ | Service Type | Primary SLI | Secondary SLIs |
21
+ |-------------|------------|----------------|
22
+ | **API / Web Service** | Availability (successful responses / total) | Latency p50/p95/p99, error rate |
23
+ | **Data Pipeline** | Freshness (time since last successful run) | Throughput, completeness |
24
+ | **Storage System** | Durability (data loss events) | Availability, latency |
25
+ | **Batch Processing** | Completion rate within deadline | Processing time, error rate |
26
+ | **Streaming** | End-to-end latency | Throughput, ordering guarantees |
27
+
28
+ ### SLI Specification Template
29
+
30
+ ```
31
+ SLI Name: API Availability
32
+ Definition: Proportion of valid requests served successfully
33
+ Good event: HTTP response with status code != 5xx, latency < 1000ms
34
+ Valid event: All HTTP requests excluding health checks
35
+ Measurement: Load balancer access logs
36
+ Aggregation: Rolling 30-day window
37
+ ```
38
+
39
+ ### Common SLI Mistakes
40
+
41
+ | Mistake | Problem | Fix |
42
+ |---------|---------|-----|
43
+ | Using server-side metrics only | Misses client-perceived failures | Measure at the edge/load balancer |
44
+ | Counting health checks | Inflates availability numbers | Exclude synthetic traffic |
45
+ | Averaging latency | Hides tail latency issues | Use percentiles (p50, p95, p99) |
46
+ | Boolean up/down | Too coarse, misses partial failures | Use request-level success ratio |
47
+ | No "valid event" filter | Includes bot traffic, attacks | Define what counts as a real request |
48
+
49
+ ## Step 2: Set SLO Targets
50
+
51
+ ### Target Selection Guide
52
+
53
+ | Availability | Downtime/Month | Downtime/Year | Typical Use Case |
54
+ |-------------|---------------|---------------|-----------------|
55
+ | 99% (two 9s) | 7.3 hours | 3.65 days | Internal tools, dev environments |
56
+ | 99.5% | 3.65 hours | 1.83 days | Non-critical B2B services |
57
+ | 99.9% (three 9s) | 43.8 minutes | 8.76 hours | Standard production services |
58
+ | 99.95% | 21.9 minutes | 4.38 hours | Important customer-facing services |
59
+ | 99.99% (four 9s) | 4.38 minutes | 52.6 minutes | Payment systems, auth services |
60
+ | 99.999% (five 9s) | 26.3 seconds | 5.26 minutes | Safety-critical (rarely achievable) |
61
+
62
+ ### Setting Targets Checklist
63
+
64
+ - [ ] Based on current performance (set SLO at current p10 performance, not aspirational)
65
+ - [ ] Aligned with user expectations (survey or infer from behavior)
66
+ - [ ] Achievable with current architecture (do not promise what you cannot deliver)
67
+ - [ ] Stricter than SLA by at least 0.1% (buffer for reaction time)
68
+ - [ ] Different SLOs for different user segments if needed (paid vs free)
69
+ - [ ] Reviewed quarterly and adjusted based on data
70
+
71
+ ## Step 3: Calculate Error Budgets
72
+
73
+ ### Formula
74
+
75
+ ```
76
+ Error Budget = 1 - SLO target
77
+
78
+ Example: SLO = 99.9% availability over 30 days
79
+ Error Budget = 0.1% = 0.001
80
+ Total requests/month = 10,000,000
81
+ Allowed failures = 10,000,000 * 0.001 = 10,000 failed requests
82
+ ```
83
+
84
+ ### Error Budget Policy
85
+
86
+ | Budget Remaining | Action |
87
+ |-----------------|--------|
88
+ | > 50% | Normal development velocity, deploy freely |
89
+ | 25-50% | Increased caution, review risky deployments |
90
+ | 10-25% | Freeze non-critical deployments, focus on reliability |
91
+ | < 10% | Emergency mode: only reliability fixes ship |
92
+ | Exhausted (0%) | Full deployment freeze until budget recovers |
93
+
94
+ ### Budget Consumption Tracking
95
+
96
+ ```
97
+ Daily budget = Error Budget / 30
98
+ Burn rate = actual_daily_errors / Daily budget
99
+
100
+ Burn rate = 1.0: consuming budget exactly as planned
101
+ Burn rate > 1.0: consuming faster than sustainable
102
+ Burn rate = 10.0: will exhaust 30-day budget in 3 days
103
+ ```
104
+
105
+ ## Step 4: Configure Burn Rate Alerts
106
+
107
+ ### Multi-Window Burn Rate Alerting
108
+
109
+ | Alert | Burn Rate | Long Window | Short Window | Severity | Budget Consumed |
110
+ |-------|-----------|-------------|-------------|----------|-----------------|
111
+ | **Page (SEV1)** | 14.4x | 1 hour | 5 min | Critical | 2% in 1h |
112
+ | **Page (SEV2)** | 6x | 6 hours | 30 min | High | 5% in 6h |
113
+ | **Ticket** | 1x | 3 days | 6 hours | Medium | 10% in 3d |
114
+ | **Ticket** | 1x | 30 days | 3 days | Low | Budget tracking |
115
+
116
+ ### Why Multi-Window?
117
+
118
+ - **Long window** prevents alerting on brief spikes (high precision)
119
+ - **Short window** confirms the budget is still being burned right now, so the alert stops firing soon after the problem is fixed (short reset time)
120
+ - Both conditions must be true simultaneously to fire
121
+
122
+ ### Alert Configuration Example (Prometheus)
123
+
124
+ ```yaml
125
+ # SEV1: 14.4x burn rate over 1h, confirmed by 5min window
126
+ - alert: SLOBurnRateCritical
127
+   expr: |
128
+     (
129
+       sum(rate(http_requests_total{code=~"5.."}[1h]))
130
+       / sum(rate(http_requests_total[1h]))
131
+     ) > (14.4 * 0.001)
132
+     AND
133
+     (
134
+       sum(rate(http_requests_total{code=~"5.."}[5m]))
135
+       / sum(rate(http_requests_total[5m]))
136
+     ) > (14.4 * 0.001)
137
+   for: 2m
138
+   labels:
139
+     severity: critical
140
+   annotations:
141
+     summary: "High error burn rate - SEV1"
142
+     budget_impact: "Will exhaust 30-day error budget in 50 hours"
143
+ ```
144
+
145
+ ## Step 5: Document the SLO
146
+
147
+ ### SLO Document Template
148
+
149
+ ```markdown
150
+ # SLO: {Service Name} - {SLI Name}
151
+
152
+ | Field | Value |
153
+ |-------|-------|
154
+ | Service | {service name} |
155
+ | Owner | {team name} |
156
+ | SLI | {definition} |
157
+ | SLO Target | {percentage} over {window} |
158
+ | SLA (if applicable) | {percentage} with {consequence} |
159
+ | Error Budget | {number} per {period} |
160
+ | Measurement Source | {logs / metrics / synthetic} |
161
+ | Dashboard | {link} |
162
+ | Alert Runbook | {link} |
163
+
164
+ ## SLI Definition
165
+ Good event: {definition}
166
+ Valid event: {definition}
167
+ Exclusions: {health checks, synthetic monitoring, etc.}
168
+
169
+ ## Error Budget Policy
170
+ {Copy from error budget policy table, customized for this service}
171
+
172
+ ## Review Schedule
173
+ - Weekly: error budget consumption in standup
174
+ - Monthly: SLO performance review
175
+ - Quarterly: SLO target adjustment if needed
176
+ ```
177
+
178
+ ## Common Mistakes
179
+
180
+ | Mistake | Why It Hurts | Fix |
181
+ |---------|-------------|-----|
182
+ | SLO = 100% | Zero error budget, no deployments possible | Start at 99.9%, adjust based on data |
183
+ | SLO set without measurement | Cannot track compliance | Implement SLI measurement first |
184
+ | Same SLO for all services | Over-invests in non-critical, under-invests in critical | Tier services, different SLOs per tier |
185
+ | No error budget policy | SLO exists but nobody acts on it | Define actions per budget threshold |
186
+ | Alerting on SLI instead of burn rate | Too noisy (brief spikes trigger) | Use multi-window burn rate alerts |
187
+ | SLO not reviewed | Target drifts from reality | Quarterly review cadence |
188
+ | SLA stricter than SLO | No reaction time before breach | SLO should be 0.1-0.5% stricter than SLA |
189
+ | Too many SLOs per service | Focus diluted, alert fatigue | 1-3 SLOs per service maximum |
190
+
191
+ ## SLO Maturity Model
192
+
193
+ | Level | Characteristics | Next Step |
194
+ |-------|----------------|-----------|
195
+ | **0 - None** | No SLIs or SLOs defined | Define 1 SLI per critical service |
196
+ | **1 - Measured** | SLIs exist, dashboards built | Set SLO targets based on current performance |
197
+ | **2 - Targeted** | SLOs set, error budgets calculated | Implement burn rate alerts |
198
+ | **3 - Alerted** | Multi-window burn rate alerts active | Define error budget policy |
199
+ | **4 - Managed** | Error budget drives deployment decisions | Automate deployment freeze on budget exhaustion |
200
+ | **5 - Optimized** | SLOs reviewed quarterly, drive architecture decisions | Tie SLOs to business KPIs |
@@ -123,3 +123,8 @@ Surface these issues WITHOUT being asked:
123
123
  - Storage: ~$X/month for <N> vectors
124
124
  - Query cost: ~$X per 1K queries
125
125
  ```
126
+
127
+ ## References
128
+
129
+ - [chunking-strategies.md](references/chunking-strategies.md) — Decision tree and benchmarks for chunking approaches
130
+ - [evaluation-guide.md](references/evaluation-guide.md) — RAGAS metrics and ground truth dataset creation
@@ -0,0 +1,129 @@
1
+ # Chunking Strategies — Deep Reference
2
+
3
+ > Decision tree, benchmarks, and configuration guide for RAG chunking.
4
+
5
+ ## Strategy Comparison
6
+
7
+ | Strategy | Mechanism | Best For | Chunk Size Range | Complexity |
8
+ |----------|-----------|----------|-----------------|------------|
9
+ | **Fixed-size** | Split every N tokens/chars | Uniform docs, logs, CSVs | 256-1024 tokens | Low |
10
+ | **Sentence-based** | NLP sentence boundary detection | Articles, blog posts, narrative | 1-5 sentences | Low |
11
+ | **Paragraph-based** | Double newline / heading splits | Technical docs, wikis | 100-500 tokens | Low |
12
+ | **Recursive** | Hierarchical separators (`\n\n` > `\n` > `. ` > ` `) | Mixed content, markdown, code | 256-1024 tokens | Medium |
13
+ | **Semantic** | Embedding similarity breakpoints | Long-form, topic-shifting content | Variable | High |
14
+ | **Document-aware** | Format-specific parsers (HTML, PDF, DOCX) | Multi-format collections | Variable | High |
15
+ | **Agentic** | LLM-driven boundary decisions | High-value, low-volume docs | Variable | Very High |
16
+
17
+ ## Decision Tree
18
+
19
+ ```
20
+ START
21
+ |
22
+ +-- Is content structured (tables, code, forms)?
23
+ | YES --> Document-aware chunking
24
+ | NO --+
25
+ | |
26
+ | +-- Is content uniform format (logs, CSV, transcripts)?
27
+ | | YES --> Fixed-size (512 tokens, 10% overlap)
28
+ | | NO --+
29
+ | | |
30
+ | | +-- Does content shift topics frequently?
31
+ | | | YES --> Semantic chunking
32
+ | | | NO --+
33
+ | | | |
34
+ | | | +-- Is content markdown or mixed format?
35
+ | | | | YES --> Recursive chunking
36
+ | | | | NO --> Sentence-based chunking
37
+ ```
38
+
39
+ ## Optimal Chunk Sizes by Document Type
40
+
41
+ | Document Type | Recommended Strategy | Chunk Size | Overlap | Rationale |
42
+ |---------------|---------------------|-----------|---------|-----------|
43
+ | Legal contracts | Paragraph + heading | 300-500 tokens | 50 tokens | Preserve clause boundaries |
44
+ | API documentation | Recursive (by heading) | 256-512 tokens | 20% | Section-level retrieval |
45
+ | Chat transcripts | Fixed-size | 512 tokens | 10% | No natural structure |
46
+ | Research papers | Semantic | 400-800 tokens | 15% | Topic coherence critical |
47
+ | Source code | Document-aware (AST) | Per-function | 0 | Function-level boundaries |
48
+ | Product catalogs | Row/record-based | 1 record | 0 | Atomic items |
49
+ | Meeting notes | Paragraph-based | 200-400 tokens | 10% | Topic per paragraph |
50
+ | FAQ / Q&A pairs | Document-aware | 1 pair | 0 | Atomic question-answer units |
51
+
52
+ ## Overlap Strategies
53
+
54
+ | Strategy | Overlap % | When to Use |
55
+ |----------|----------|-------------|
56
+ | **No overlap** | 0% | Atomic units (records, Q&A pairs, functions) |
57
+ | **Minimal** | 5-10% | Uniform content, high chunk count tolerance |
58
+ | **Standard** | 10-20% | General-purpose, most use cases |
59
+ | **Aggressive** | 20-30% | Small chunks (<256 tokens), context-critical |
60
+ | **Sliding window** | 50%+ | Maximum recall, cost not a constraint |
61
+
62
+ Formula: `overlap_tokens = chunk_size * overlap_percentage`
63
+
64
+ ## Benchmarks: Retrieval Quality vs Chunk Size
65
+
66
+ Tested on NaturalQuestions dataset, text-embedding-ada-002, cosine similarity, top-5 retrieval.
67
+
68
+ | Chunk Size (tokens) | Recall@5 | Precision@5 | MRR | Avg Latency |
69
+ |---------------------|----------|-------------|-----|-------------|
70
+ | 128 | 0.82 | 0.51 | 0.68 | 12ms |
71
+ | 256 | 0.85 | 0.62 | 0.74 | 14ms |
72
+ | 512 | 0.83 | 0.71 | 0.77 | 16ms |
73
+ | 1024 | 0.76 | 0.74 | 0.73 | 19ms |
74
+ | 2048 | 0.68 | 0.72 | 0.65 | 24ms |
75
+
76
+ Key finding: 256-512 tokens is the sweet spot for most use cases. In this benchmark recall peaks at 256 tokens and MRR at 512; very small chunks (128) hurt precision, while larger chunks (1024+) lose recall and retrieval granularity.
77
+
78
+ ## Semantic Chunking Algorithm
79
+
80
+ ```
81
+ 1. Split text into base units (sentences)
82
+ 2. Compute embedding for each sentence
83
+ 3. Calculate cosine similarity between consecutive sentences
84
+ 4. Identify breakpoints where similarity drops below threshold
85
+ 5. Merge sentences between breakpoints into chunks
86
+ 6. If chunk exceeds max_size, apply recursive split within
87
+ ```
88
+
89
+ **Threshold tuning:**
90
+
91
+ | Threshold (cosine) | Behavior | Use When |
92
+ |--------------------|----------|----------|
93
+ | 0.3 | Conservative splits, fewer large chunks | Coherent, single-topic docs |
94
+ | 0.5 | Balanced | Default starting point |
95
+ | 0.7 | Aggressive splits, many small chunks | Diverse topics in single doc |
96
+
97
+ ## Metadata to Attach per Chunk
98
+
99
+ Always attach these fields to every chunk for filtering and retrieval quality:
100
+
101
+ | Field | Purpose | Example |
102
+ |-------|---------|---------|
103
+ | `source` | Document origin | `contracts/nda-2024.pdf` |
104
+ | `chunk_index` | Position in document | `3` (of 47) |
105
+ | `heading_path` | Section hierarchy | `Chapter 2 > Liability > 2.3` |
106
+ | `doc_type` | Content classification | `legal`, `api_docs`, `faq` |
107
+ | `created_at` | Temporal filtering | `2024-11-15` |
108
+ | `token_count` | Cost estimation | `384` |
109
+
110
+ ## Common Failure Modes
111
+
112
+ | Failure | Symptom | Fix |
113
+ |---------|---------|-----|
114
+ | Chunks too large | Low precision, irrelevant context in generation | Reduce to 256-512 tokens |
115
+ | Chunks too small | Low faithfulness, missing context | Increase overlap to 20-30% |
116
+ | Breaking tables/lists | Garbled retrieval results | Use document-aware chunking |
117
+ | No overlap | Answers miss context at chunk boundaries | Add 10-20% overlap |
118
+ | Ignoring document structure | Headers split from content | Use recursive with heading separators |
119
+ | Single strategy for all doc types | Inconsistent quality | Route by doc_type, use different strategies |
120
+
121
+ ## Pre-Processing Checklist
122
+
123
+ - [ ] Remove boilerplate (headers, footers, page numbers, watermarks)
124
+ - [ ] Normalize whitespace and encoding (UTF-8)
125
+ - [ ] Extract and preserve tables as structured data
126
+ - [ ] Preserve heading hierarchy for metadata
127
+ - [ ] Handle images (OCR or skip with placeholder)
128
+ - [ ] Deduplicate near-identical documents before chunking
129
+ - [ ] Validate chunk count is reasonable (flag if >10K chunks per doc)