agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,567 @@
|
|
|
1
|
+
# Toil Reduction
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for identifying, measuring, and eliminating operational toil.
|
|
4
|
+
|
|
5
|
+
## Core Principles
|
|
6
|
+
|
|
7
|
+
1. **Identify Systematically** - Measure toil before trying to reduce it
|
|
8
|
+
2. **Automate Ruthlessly** - If you do it more than twice, automate it
|
|
9
|
+
3. **Eliminate, Don't Optimize** - Sometimes the best automation is not doing the task
|
|
10
|
+
4. **Invest Continuously** - Dedicate time specifically for toil reduction
|
|
11
|
+
|
|
12
|
+
## Understanding Toil
|
|
13
|
+
|
|
14
|
+
### Definition of Toil
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
toil_characteristics:
|
|
18
|
+
manual:
|
|
19
|
+
description: "Requires human intervention"
|
|
20
|
+
example: "SSH to server to check status"
|
|
21
|
+
|
|
22
|
+
repetitive:
|
|
23
|
+
description: "Done over and over"
|
|
24
|
+
example: "Weekly certificate rotation"
|
|
25
|
+
|
|
26
|
+
automatable:
|
|
27
|
+
description: "Could be done by a machine"
|
|
28
|
+
example: "Restarting a service on alert"
|
|
29
|
+
|
|
30
|
+
tactical:
|
|
31
|
+
description: "Reactive, not strategic"
|
|
32
|
+
example: "Responding to disk space alerts"
|
|
33
|
+
|
|
34
|
+
no_enduring_value:
|
|
35
|
+
description: "Doesn't improve the system"
|
|
36
|
+
example: "Manual deploys that don't improve process"
|
|
37
|
+
|
|
38
|
+
scales_linearly:
|
|
39
|
+
description: "Grows with service growth"
|
|
40
|
+
example: "Provisioning new user accounts"
|
|
41
|
+
|
|
42
|
+
not_toil:
|
|
43
|
+
- "Coding new automation"
|
|
44
|
+
- "Designing systems"
|
|
45
|
+
- "Writing documentation"
|
|
46
|
+
- "On-call work (though it can become toil)"
|
|
47
|
+
- "Postmortem analysis"
|
|
48
|
+
- "Architecture reviews"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Toil vs Engineering Work
|
|
52
|
+
|
|
53
|
+
```yaml
|
|
54
|
+
comparison:
|
|
55
|
+
toil:
|
|
56
|
+
definition: "Running the service"
|
|
57
|
+
value: "Keeps lights on, temporary"
|
|
58
|
+
scaling: "Grows with service size"
|
|
59
|
+
outcome: "Maintains status quo"
|
|
60
|
+
examples:
|
|
61
|
+
- "Manual deployments"
|
|
62
|
+
- "Ticket queue processing"
|
|
63
|
+
- "Capacity reports"
|
|
64
|
+
- "Alert response"
|
|
65
|
+
|
|
66
|
+
engineering:
|
|
67
|
+
definition: "Improving the service"
|
|
68
|
+
value: "Permanent improvements"
|
|
69
|
+
scaling: "Reduces future work"
|
|
70
|
+
outcome: "Makes service better"
|
|
71
|
+
examples:
|
|
72
|
+
- "Automating deployments"
|
|
73
|
+
- "Building self-service tools"
|
|
74
|
+
- "Improving monitoring"
|
|
75
|
+
- "Writing runbooks"
|
|
76
|
+
|
|
77
|
+
target_balance:
|
|
78
|
+
toil: "< 50% of SRE time"
|
|
79
|
+
engineering: "> 50% of SRE time"
|
|
80
|
+
rationale: "Too much toil prevents improvement"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Measuring Toil
|
|
84
|
+
|
|
85
|
+
### Toil Tracking
|
|
86
|
+
|
|
87
|
+
```yaml
|
|
88
|
+
tracking_methods:
|
|
89
|
+
time_tracking:
|
|
90
|
+
description: "Log time spent on tasks"
|
|
91
|
+
tools:
|
|
92
|
+
- "Jira work logs"
|
|
93
|
+
- "Toggl/Clockify"
|
|
94
|
+
- "Custom spreadsheet"
|
|
95
|
+
categories:
|
|
96
|
+
- "Incident response"
|
|
97
|
+
- "Deployments"
|
|
98
|
+
- "Access requests"
|
|
99
|
+
- "Capacity management"
|
|
100
|
+
- "Other operational tasks"
|
|
101
|
+
|
|
102
|
+
ticket_analysis:
|
|
103
|
+
description: "Analyze support ticket patterns"
|
|
104
|
+
metrics:
|
|
105
|
+
- "Tickets per week by type"
|
|
106
|
+
- "Time per ticket"
|
|
107
|
+
- "Recurring ticket themes"
|
|
108
|
+
|
|
109
|
+
survey:
|
|
110
|
+
description: "Ask team about toil burden"
|
|
111
|
+
frequency: "Monthly or quarterly"
|
|
112
|
+
questions:
|
|
113
|
+
- "Top 3 most tedious tasks"
|
|
114
|
+
- "Tasks you wish were automated"
|
|
115
|
+
- "Estimated toil percentage"
|
|
116
|
+
|
|
117
|
+
toil_inventory:
|
|
118
|
+
template: |
|
|
119
|
+
| Task | Frequency | Duration | Automatable? | Priority |
|
|
120
|
+
|------|-----------|----------|--------------|----------|
|
|
121
|
+
| Manual deploys | 10x/week | 30 min | Yes | High |
|
|
122
|
+
| User provisioning | 5x/week | 15 min | Yes | Medium |
|
|
123
|
+
| Alert investigation | 20x/week | 15 min | Partial | High |
|
|
124
|
+
| Capacity report | 1x/week | 2 hours | Yes | Medium |
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Toil Metrics
|
|
128
|
+
|
|
129
|
+
```yaml
|
|
130
|
+
key_metrics:
|
|
131
|
+
toil_percentage:
|
|
132
|
+
formula: "toil_hours / total_work_hours * 100"
|
|
133
|
+
target: "< 50%"
|
|
134
|
+
measurement: "Weekly or monthly"
|
|
135
|
+
|
|
136
|
+
toil_per_incident:
|
|
137
|
+
formula: "incident_toil_hours / incident_count"
|
|
138
|
+
target: "Decreasing trend"
|
|
139
|
+
indicates: "Incident response efficiency"
|
|
140
|
+
|
|
141
|
+
automation_rate:
|
|
142
|
+
formula: "automated_tasks / total_tasks"
|
|
143
|
+
target: "Increasing trend"
|
|
144
|
+
indicates: "Progress on automation"
|
|
145
|
+
|
|
146
|
+
time_to_automate:
|
|
147
|
+
formula: "date_automated - date_task_identified"
|
|
148
|
+
target: "< 30 days for high-priority"
|
|
149
|
+
indicates: "Automation velocity"
|
|
150
|
+
|
|
151
|
+
tracking_dashboard:
|
|
152
|
+
visualizations:
|
|
153
|
+
- "Toil trend over time"
|
|
154
|
+
- "Toil by category"
|
|
155
|
+
- "Toil per team member"
|
|
156
|
+
- "Automation backlog size"
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Identifying Toil
|
|
160
|
+
|
|
161
|
+
### Common Sources of Toil
|
|
162
|
+
|
|
163
|
+
```yaml
|
|
164
|
+
deployment_toil:
|
|
165
|
+
symptoms:
|
|
166
|
+
- "Manual deployment steps"
|
|
167
|
+
- "Deployment-related pages"
|
|
168
|
+
- "Post-deploy verification"
|
|
169
|
+
examples:
|
|
170
|
+
- "SSH to servers to deploy"
|
|
171
|
+
- "Manually running database migrations"
|
|
172
|
+
- "Verifying each deployment manually"
|
|
173
|
+
solutions:
|
|
174
|
+
- "CI/CD pipelines"
|
|
175
|
+
- "Automated rollbacks"
|
|
176
|
+
- "Deployment verification tests"
|
|
177
|
+
|
|
178
|
+
access_management_toil:
|
|
179
|
+
symptoms:
|
|
180
|
+
- "Manual account creation"
|
|
181
|
+
- "Permission requests via tickets"
|
|
182
|
+
- "Access audits"
|
|
183
|
+
examples:
|
|
184
|
+
- "Creating cloud IAM users manually"
|
|
185
|
+
- "Adding SSH keys to servers"
|
|
186
|
+
- "Granting database access"
|
|
187
|
+
solutions:
|
|
188
|
+
- "Self-service access portal"
|
|
189
|
+
- "RBAC with group management"
|
|
190
|
+
- "Automated access reviews"
|
|
191
|
+
|
|
192
|
+
capacity_toil:
|
|
193
|
+
symptoms:
|
|
194
|
+
- "Manual scaling"
|
|
195
|
+
- "Capacity planning spreadsheets"
|
|
196
|
+
- "Alert-driven scaling"
|
|
197
|
+
examples:
|
|
198
|
+
- "Resizing instances manually"
|
|
199
|
+
- "Adding nodes before expected load"
|
|
200
|
+
- "Responding to disk space alerts"
|
|
201
|
+
solutions:
|
|
202
|
+
- "Autoscaling"
|
|
203
|
+
- "Automated capacity reporting"
|
|
204
|
+
- "Predictive scaling"
|
|
205
|
+
|
|
206
|
+
incident_response_toil:
|
|
207
|
+
symptoms:
|
|
208
|
+
- "Manual remediation steps"
|
|
209
|
+
- "Repeated troubleshooting"
|
|
210
|
+
- "Alert fatigue"
|
|
211
|
+
examples:
|
|
212
|
+
- "Restarting services manually"
|
|
213
|
+
- "Same investigation for same alerts"
|
|
214
|
+
- "Too many non-actionable alerts"
|
|
215
|
+
solutions:
|
|
216
|
+
- "Auto-remediation"
|
|
217
|
+
- "Better alerts"
|
|
218
|
+
- "Self-healing systems"
|
|
219
|
+
|
|
220
|
+
maintenance_toil:
|
|
221
|
+
symptoms:
|
|
222
|
+
- "Manual updates"
|
|
223
|
+
- "Certificate management"
|
|
224
|
+
- "Secret rotation"
|
|
225
|
+
examples:
|
|
226
|
+
- "Patching servers one by one"
|
|
227
|
+
- "Renewing certificates manually"
|
|
228
|
+
- "Rotating passwords"
|
|
229
|
+
solutions:
|
|
230
|
+
- "Automated patching"
|
|
231
|
+
- "Auto-renewing certificates"
|
|
232
|
+
- "Secret management systems"
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Toil Discovery Methods
|
|
236
|
+
|
|
237
|
+
```yaml
|
|
238
|
+
discovery_techniques:
|
|
239
|
+
retrospective:
|
|
240
|
+
frequency: "Weekly"
|
|
241
|
+
questions:
|
|
242
|
+
- "What took the most time this week?"
|
|
243
|
+
- "What felt repetitive?"
|
|
244
|
+
- "What would you automate?"
|
|
245
|
+
|
|
246
|
+
shadowing:
|
|
247
|
+
description: "Observe team members working"
|
|
248
|
+
purpose: "Find unrecognized toil"
|
|
249
|
+
outcome: "List of automation candidates"
|
|
250
|
+
|
|
251
|
+
ticket_mining:
|
|
252
|
+
process:
|
|
253
|
+
- "Export last 3 months of tickets"
|
|
254
|
+
- "Categorize by type"
|
|
255
|
+
- "Identify patterns"
|
|
256
|
+
- "Calculate time per category"
|
|
257
|
+
|
|
258
|
+
on_call_analysis:
|
|
259
|
+
data:
|
|
260
|
+
- "Alert frequency by type"
|
|
261
|
+
- "Actions taken per alert"
|
|
262
|
+
- "Time to resolve"
|
|
263
|
+
patterns:
|
|
264
|
+
- "Same alerts recurring"
|
|
265
|
+
- "Same remediation steps"
|
|
266
|
+
- "Alerts during off-hours"
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## Eliminating Toil
|
|
270
|
+
|
|
271
|
+
### Elimination Strategies
|
|
272
|
+
|
|
273
|
+
```yaml
|
|
274
|
+
eliminate:
|
|
275
|
+
description: "Remove the need entirely"
|
|
276
|
+
examples:
|
|
277
|
+
- "Remove unused service instead of maintaining it"
|
|
278
|
+
- "Simplify architecture to remove failure modes"
|
|
279
|
+
- "Delegate to managed service"
|
|
280
|
+
when_to_use: "When task has questionable value"
|
|
281
|
+
|
|
282
|
+
automate:
|
|
283
|
+
description: "Build software to do the task"
|
|
284
|
+
examples:
|
|
285
|
+
- "CI/CD for deployments"
|
|
286
|
+
- "Auto-scaling for capacity"
|
|
287
|
+
- "Auto-remediation for common issues"
|
|
288
|
+
when_to_use: "When task is valuable but repetitive"
|
|
289
|
+
|
|
290
|
+
self_service:
|
|
291
|
+
description: "Enable users to do it themselves"
|
|
292
|
+
examples:
|
|
293
|
+
- "Self-service access requests"
|
|
294
|
+
- "Self-service environment provisioning"
|
|
295
|
+
- "Documentation for common questions"
|
|
296
|
+
when_to_use: "When SRE shouldn't be in the critical path"
|
|
297
|
+
|
|
298
|
+
optimize:
|
|
299
|
+
description: "Make the task faster"
|
|
300
|
+
examples:
|
|
301
|
+
- "Better tooling"
|
|
302
|
+
- "Streamlined processes"
|
|
303
|
+
- "Parallel execution"
|
|
304
|
+
when_to_use: "When elimination/automation not feasible"
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
### Prioritization Framework
|
|
308
|
+
|
|
309
|
+
```yaml
|
|
310
|
+
prioritization_matrix:
|
|
311
|
+
high_priority:
|
|
312
|
+
criteria:
|
|
313
|
+
- "High frequency (daily+)"
|
|
314
|
+
- "High time cost (> 1 hour)"
|
|
315
|
+
- "Clearly automatable"
|
|
316
|
+
- "Affects multiple people"
|
|
317
|
+
examples:
|
|
318
|
+
- "Manual deployments"
|
|
319
|
+
- "Recurring alerts needing same fix"
|
|
320
|
+
|
|
321
|
+
medium_priority:
|
|
322
|
+
criteria:
|
|
323
|
+
- "Medium frequency (weekly)"
|
|
324
|
+
- "Medium time cost (30-60 min)"
|
|
325
|
+
- "Partially automatable"
|
|
326
|
+
examples:
|
|
327
|
+
- "Access requests"
|
|
328
|
+
- "Capacity adjustments"
|
|
329
|
+
|
|
330
|
+
low_priority:
|
|
331
|
+
criteria:
|
|
332
|
+
- "Low frequency (monthly or less often)"
|
|
333
|
+
- "Low time cost (< 30 min)"
|
|
334
|
+
- "Complex to automate"
|
|
335
|
+
examples:
|
|
336
|
+
- "Quarterly audits"
|
|
337
|
+
- "One-off requests"
|
|
338
|
+
|
|
339
|
+
roi_calculation:
|
|
340
|
+
formula: |
|
|
341
|
+
ROI = (time_saved_per_occurrence * frequency * duration) / automation_effort
|
|
342
|
+
|
|
343
|
+
example:
|
|
344
|
+
task: "Manual deployment"
|
|
345
|
+
time_saved_per_occurrence: "30 minutes"
|
|
346
|
+
frequency: "10 per week"
|
|
347
|
+
duration: "52 weeks"
|
|
348
|
+
total_toil: "260 hours/year"
|
|
349
|
+
automation_effort: "40 hours"
|
|
350
|
+
roi: "6.5x in first year"
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
### Automation Best Practices
|
|
354
|
+
|
|
355
|
+
```yaml
|
|
356
|
+
automation_principles:
|
|
357
|
+
start_simple:
|
|
358
|
+
description: "Automate the 80% case first"
|
|
359
|
+
example: "Automate happy path, handle edge cases manually initially"
|
|
360
|
+
|
|
361
|
+
iterate:
|
|
362
|
+
description: "Improve automation over time"
|
|
363
|
+
approach: "Ship basic automation, add features based on real usage"
|
|
364
|
+
|
|
365
|
+
document:
|
|
366
|
+
description: "Explain what automation does"
|
|
367
|
+
purpose: "Others can debug, maintain, improve"
|
|
368
|
+
|
|
369
|
+
monitor:
|
|
370
|
+
description: "Track automation success/failure"
|
|
371
|
+
metrics:
|
|
372
|
+
- "Success rate"
|
|
373
|
+
- "Time saved"
|
|
374
|
+
- "Failure modes"
|
|
375
|
+
|
|
376
|
+
maintain:
|
|
377
|
+
description: "Keep automation working"
|
|
378
|
+
consideration: "Automation has maintenance cost too"
|
|
379
|
+
|
|
380
|
+
self_healing_patterns:
|
|
381
|
+
auto_restart:
|
|
382
|
+
trigger: "Process crashed"
|
|
383
|
+
action: "Restart process"
|
|
384
|
+
tools: "Kubernetes, systemd"
|
|
385
|
+
|
|
386
|
+
auto_scale:
|
|
387
|
+
trigger: "High load"
|
|
388
|
+
action: "Add capacity"
|
|
389
|
+
tools: "HPA, cloud autoscaling"
|
|
390
|
+
|
|
391
|
+
auto_failover:
|
|
392
|
+
trigger: "Primary failure"
|
|
393
|
+
action: "Promote secondary"
|
|
394
|
+
tools: "Patroni, RDS Multi-AZ"
|
|
395
|
+
|
|
396
|
+
auto_remediate:
|
|
397
|
+
trigger: "Known error condition"
|
|
398
|
+
action: "Apply known fix"
|
|
399
|
+
tools: "Custom scripts, Rundeck"
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
## Self-Service
|
|
403
|
+
|
|
404
|
+
### Building Self-Service
|
|
405
|
+
|
|
406
|
+
```yaml
|
|
407
|
+
self_service_principles:
|
|
408
|
+
reduce_friction:
|
|
409
|
+
goal: "Make it easier to self-serve than to ask"
|
|
410
|
+
methods:
|
|
411
|
+
- "Good documentation"
|
|
412
|
+
- "Intuitive interfaces"
|
|
413
|
+
- "Sensible defaults"
|
|
414
|
+
|
|
415
|
+
guardrails:
|
|
416
|
+
goal: "Prevent mistakes without blocking"
|
|
417
|
+
methods:
|
|
418
|
+
- "Validation"
|
|
419
|
+
- "Quotas and limits"
|
|
420
|
+
- "Approval for high-risk actions"
|
|
421
|
+
|
|
422
|
+
transparency:
|
|
423
|
+
goal: "Users understand what's happening"
|
|
424
|
+
methods:
|
|
425
|
+
- "Clear status updates"
|
|
426
|
+
- "Audit logs"
|
|
427
|
+
- "Error messages that help"
|
|
428
|
+
|
|
429
|
+
self_service_examples:
|
|
430
|
+
environment_provisioning:
|
|
431
|
+
before: "File ticket, wait 2 days"
|
|
432
|
+
after: "Click button, get environment in 10 minutes"
|
|
433
|
+
implementation:
|
|
434
|
+
- "Terraform modules"
|
|
435
|
+
- "GitOps workflow"
|
|
436
|
+
- "Web portal"
|
|
437
|
+
|
|
438
|
+
access_management:
|
|
439
|
+
before: "File ticket, get manual approval"
|
|
440
|
+
after: "Request in portal, automatic approval for standard access"
|
|
441
|
+
implementation:
|
|
442
|
+
- "Access catalog"
|
|
443
|
+
- "Automated approval rules"
|
|
444
|
+
- "Time-bound access"
|
|
445
|
+
|
|
446
|
+
database_requests:
|
|
447
|
+
before: "DBA runs queries on request"
|
|
448
|
+
after: "Self-service query tool with safety limits"
|
|
449
|
+
implementation:
|
|
450
|
+
- "Read replica access"
|
|
451
|
+
- "Query timeout limits"
|
|
452
|
+
- "Audit logging"
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
### Internal Developer Platform
|
|
456
|
+
|
|
457
|
+
```yaml
|
|
458
|
+
idp_components:
|
|
459
|
+
service_catalog:
|
|
460
|
+
purpose: "Discover available services"
|
|
461
|
+
features:
|
|
462
|
+
- "Service documentation"
|
|
463
|
+
- "Ownership information"
|
|
464
|
+
- "Dependency mapping"
|
|
465
|
+
tools: "Backstage, Port"
|
|
466
|
+
|
|
467
|
+
self_service_portal:
|
|
468
|
+
purpose: "Request resources"
|
|
469
|
+
features:
|
|
470
|
+
- "Environment provisioning"
|
|
471
|
+
- "Access requests"
|
|
472
|
+
- "Resource creation"
|
|
473
|
+
tools: "Backstage scaffolder, custom portals"
|
|
474
|
+
|
|
475
|
+
golden_paths:
|
|
476
|
+
purpose: "Recommended ways to do things"
|
|
477
|
+
features:
|
|
478
|
+
- "Project templates"
|
|
479
|
+
- "Best practice defaults"
|
|
480
|
+
- "Integrated tooling"
|
|
481
|
+
benefit: "Reduce decisions, improve consistency"
|
|
482
|
+
```
|
|
483
|
+
|
|
484
|
+
## Sustaining Toil Reduction
|
|
485
|
+
|
|
486
|
+
### Ongoing Process
|
|
487
|
+
|
|
488
|
+
```yaml
|
|
489
|
+
continuous_improvement:
|
|
490
|
+
weekly:
|
|
491
|
+
- "Review toil from past week"
|
|
492
|
+
- "Identify new automation opportunities"
|
|
493
|
+
- "Progress on existing automation"
|
|
494
|
+
|
|
495
|
+
monthly:
|
|
496
|
+
- "Toil metrics review"
|
|
497
|
+
- "Prioritize automation backlog"
|
|
498
|
+
- "Celebrate wins"
|
|
499
|
+
|
|
500
|
+
quarterly:
|
|
501
|
+
- "Comprehensive toil assessment"
|
|
502
|
+
- "Set reduction targets"
|
|
503
|
+
- "Allocate engineering time"
|
|
504
|
+
|
|
505
|
+
dedicated_time:
|
|
506
|
+
approach: "Reserve time specifically for toil reduction"
|
|
507
|
+
options:
|
|
508
|
+
rotation: "One person focused on automation each sprint"
|
|
509
|
+
percentage: "20% of sprint capacity for automation"
|
|
510
|
+
hack_week: "Quarterly week focused on toil reduction"
|
|
511
|
+
|
|
512
|
+
protection: "Don't let interrupt work consume automation time"
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
### Cultural Aspects
|
|
516
|
+
|
|
517
|
+
```yaml
|
|
518
|
+
cultural_practices:
|
|
519
|
+
celebrate_automation:
|
|
520
|
+
- "Announce when automation ships"
|
|
521
|
+
- "Track time saved"
|
|
522
|
+
- "Recognize contributors"
|
|
523
|
+
|
|
524
|
+
make_toil_visible:
|
|
525
|
+
- "Dashboard showing toil metrics"
|
|
526
|
+
- "Regular reports to leadership"
|
|
527
|
+
- "Include in team reviews"
|
|
528
|
+
|
|
529
|
+
empower_team:
|
|
530
|
+
- "Anyone can propose automation"
|
|
531
|
+
- "Time allocated for experimentation"
|
|
532
|
+
- "Safe to try and fail"
|
|
533
|
+
|
|
534
|
+
learn_from_others:
|
|
535
|
+
- "Share automation across teams"
|
|
536
|
+
- "Internal automation showcase"
|
|
537
|
+
- "Reuse common patterns"
|
|
538
|
+
```
|
|
539
|
+
|
|
540
|
+
## Common Pitfalls
|
|
541
|
+
|
|
542
|
+
```yaml
|
|
543
|
+
pitfall_over_engineering:
|
|
544
|
+
problem: "Spending more time automating a task than the task itself would ever take"
|
|
545
|
+
example: "Spending a week automating a task that happens once a year"
|
|
546
|
+
solution: "Calculate ROI before starting"
|
|
547
|
+
|
|
548
|
+
pitfall_automation_debt:
|
|
549
|
+
problem: "Automation breaks and no one fixes it"
|
|
550
|
+
example: "CI pipeline that's always red"
|
|
551
|
+
solution: "Maintain automation like production code"
|
|
552
|
+
|
|
553
|
+
pitfall_false_automation:
|
|
554
|
+
problem: "Automation that still needs manual steps"
|
|
555
|
+
example: "Script that requires SSH and manual verification"
|
|
556
|
+
solution: "Full automation or don't call it automated"
|
|
557
|
+
|
|
558
|
+
pitfall_local_optimization:
|
|
559
|
+
problem: "Automate symptoms instead of fixing causes"
|
|
560
|
+
example: "Auto-restart instead of fixing memory leak"
|
|
561
|
+
solution: "Ask why the toil exists"
|
|
562
|
+
|
|
563
|
+
pitfall_resistance:
|
|
564
|
+
problem: "Team resistant to changing processes"
|
|
565
|
+
example: "'We've always done it this way'"
|
|
566
|
+
solution: "Demonstrate value, involve team in solutions"
|
|
567
|
+
```
|