omgkit 2.22.11 → 2.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -8
- package/package.json +2 -2
- package/plugin/registry.yaml +3 -3
- package/plugin/skills/devops/dora-metrics/SKILL.md +852 -0
- package/plugin/skills/devops/feature-flags/SKILL.md +559 -0
- package/plugin/skills/methodology/stacked-diffs/SKILL.md +568 -0
- package/plugin/skills/testing/chaos-engineering/SKILL.md +732 -0
|
@@ -0,0 +1,852 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: DORA Metrics and DevOps Performance
|
|
3
|
+
description: The agent implements DORA metrics tracking for measuring and improving software delivery performance. Use when establishing engineering metrics, benchmarking teams, or driving DevOps transformation.
|
|
4
|
+
category: devops
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# DORA Metrics and DevOps Performance
|
|
8
|
+
|
|
9
|
+
## Purpose
|
|
10
|
+
|
|
11
|
+
DORA (DevOps Research and Assessment) metrics are the industry standard for measuring software delivery performance. Google's research across thousands of organizations identified four key metrics that predict:
|
|
12
|
+
|
|
13
|
+
- **Organizational performance** (profitability, market share)
|
|
14
|
+
- **Non-commercial performance** (quality, customer satisfaction)
|
|
15
|
+
- **Team well-being** and reduced burnout
|
|
16
|
+
|
|
17
|
+
Elite performers who meet reliability targets are **2.3x more likely** to use trunk-based development and continuous delivery practices.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
| Metric | What It Measures | Elite Benchmark |
|
|
22
|
+
|--------|------------------|-----------------|
|
|
23
|
+
| Deployment Frequency | How often code reaches production | Multiple times per day |
|
|
24
|
+
| Lead Time for Changes | Time from commit to production | Less than 1 hour |
|
|
25
|
+
| Change Failure Rate | Percentage of deployments causing failures | 0-15% |
|
|
26
|
+
| Time to Restore Service | Recovery time from incidents | Less than 1 hour |
|
|
27
|
+
|
|
28
|
+
## The Four Key Metrics
|
|
29
|
+
|
|
30
|
+
### 1. Deployment Frequency
|
|
31
|
+
|
|
32
|
+
**Definition:** How often your organization deploys code to production.
|
|
33
|
+
|
|
34
|
+
```typescript
|
|
35
|
+
// Deployment frequency calculation

/** One deployment event fed into the deployment-frequency calculation. */
interface DeploymentData {
  /** Time of the deployment (not read by `calculateDeploymentFrequency`). */
  timestamp: Date;
  /** Target environment; only entries equal to `'production'` are counted. */
  environment: string;
  /** Service name (not read by `calculateDeploymentFrequency`). */
  service: string;
  /** Only deployments with `success === true` are counted. */
  success: boolean;
}

/**
 * Classifies how often successful production deployments happen.
 *
 * The function does not look at `timestamp`: every entry in `deployments`
 * is assumed to already fall inside the reporting window, so the caller is
 * responsible for pre-filtering by date.
 *
 * @param deployments Deployment events for the reporting window.
 * @param periodDays  Length of the reporting window in days (default 30).
 * @returns The average number of successful production deployments per day
 *          and its tier: `'elite'` (>= 1/day), `'high'` (>= 1/week),
 *          `'medium'` (>= 1/30 days) or `'low'`.
 * @throws RangeError if `periodDays` is not a positive, finite number.
 */
function calculateDeploymentFrequency(
  deployments: DeploymentData[],
  periodDays: number = 30
): { frequency: string; deploymentsPerDay: number } {
  // A zero or negative window would yield Infinity/NaN and a bogus tier.
  if (!Number.isFinite(periodDays) || periodDays <= 0) {
    throw new RangeError(`periodDays must be a positive number, got ${periodDays}`);
  }

  const productionDeployments = deployments.filter(
    d => d.environment === 'production' && d.success
  );

  const deploymentsPerDay = productionDeployments.length / periodDays;

  let frequency: string;
  if (deploymentsPerDay >= 1) {
    frequency = 'elite'; // Multiple times per day or daily
  } else if (deploymentsPerDay >= 1 / 7) {
    frequency = 'high'; // Weekly to daily
  } else if (deploymentsPerDay >= 1 / 30) {
    frequency = 'medium'; // Monthly to weekly
  } else {
    frequency = 'low'; // Less than monthly
  }

  return { frequency, deploymentsPerDay };
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### 2. Lead Time for Changes
|
|
69
|
+
|
|
70
|
+
**Definition:** Time from code commit to code running in production.
|
|
71
|
+
|
|
72
|
+
```typescript
|
|
73
|
+
// Lead time calculation

/** One change, with the two timestamps needed to measure its lead time. */
interface ChangeData {
  /** Start of the lead-time interval. */
  commitTimestamp: Date;
  /** End of the lead-time interval. */
  deployTimestamp: Date;
  /** Commit identifier (not read by `calculateLeadTime`). */
  commitSha: string;
  /** Associated pull request, if any (not read by `calculateLeadTime`). */
  prNumber?: number;
}

/**
 * Computes the median and 90th-percentile lead time, in hours, and the tier
 * implied by the median.
 *
 * - The median of an even-sized sample is the mean of the two middle values.
 * - The 90th percentile uses the nearest-rank method, so it is always an
 *   observed value and is the maximum only when the sample has fewer than
 *   ten entries.
 * - With no changes there is nothing to measure: both figures are `NaN` and
 *   the tier is `'low'`, so missing data is never reported as good.
 *
 * @param changes Changes deployed during the reporting window.
 * @returns `medianHours`, `p90Hours` and `performance`: `'elite'` (< 1 h),
 *          `'high'` (< 24 h), `'medium'` (< 168 h) or `'low'`.
 */
function calculateLeadTime(changes: ChangeData[]): {
  medianHours: number;
  p90Hours: number;
  performance: string;
} {
  const MS_PER_HOUR = 1000 * 60 * 60;

  const leadTimes = changes
    .map(c => (c.deployTimestamp.getTime() - c.commitTimestamp.getTime()) / MS_PER_HOUR)
    .sort((a, b) => a - b);

  const count = leadTimes.length;
  if (count === 0) {
    return { medianHours: NaN, p90Hours: NaN, performance: 'low' };
  }

  const mid = Math.floor(count / 2);
  const median =
    count % 2 === 1 ? leadTimes[mid] : (leadTimes[mid - 1] + leadTimes[mid]) / 2;

  // Nearest rank: ceil(0.9 * n), in integer arithmetic to avoid float drift.
  const p90 = leadTimes[Math.ceil((count * 9) / 10) - 1];

  let performance: string;
  if (median < 1) {
    performance = 'elite'; // Less than 1 hour
  } else if (median < 24) {
    performance = 'high'; // Less than 1 day
  } else if (median < 168) {
    performance = 'medium'; // Less than 1 week
  } else {
    performance = 'low'; // 1 week or more
  }

  return { medianHours: median, p90Hours: p90, performance };
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### 3. Change Failure Rate
|
|
111
|
+
|
|
112
|
+
**Definition:** Percentage of deployments that result in degraded service requiring remediation.
|
|
113
|
+
|
|
114
|
+
```typescript
|
|
115
|
+
// Change failure rate calculation

/** Outcome of one deployment, as needed for the change-failure-rate metric. */
interface DeploymentOutcome {
  /** Deployment identifier (not read by `calculateChangeFailureRate`). */
  deploymentId: string;
  /** Time of the deployment (not read by `calculateChangeFailureRate`). */
  timestamp: Date;
  /** Pipeline result (not read by `calculateChangeFailureRate`). */
  success: boolean;
  /** Counts the deployment as a failure when true. */
  causedIncident: boolean;
  /** Counts the deployment as a failure when true. */
  requiredRollback: boolean;
  /** Counts the deployment as a failure when true. */
  requiredHotfix: boolean;
}

/**
 * Computes the percentage of deployments that needed remediation.
 *
 * A deployment is a failure when it caused an incident, required a rollback
 * or required a hotfix; each deployment is counted at most once.
 *
 * With no deployments the rate is undefined: `rate` is `NaN` and the tier is
 * `'low'`, so missing data is never reported as good.
 *
 * @param deployments Deployments made during the reporting window.
 * @returns `rate` as a percentage (0-100) and `performance`: `'elite'`
 *          (<= 15), `'high'` (<= 30), `'medium'` (<= 45) or `'low'`.
 */
function calculateChangeFailureRate(deployments: DeploymentOutcome[]): {
  rate: number;
  performance: string;
} {
  const total = deployments.length;
  if (total === 0) {
    return { rate: NaN, performance: 'low' };
  }

  const failures = deployments.filter(
    d => d.causedIncident || d.requiredRollback || d.requiredHotfix
  ).length;

  // Multiply before dividing: (7 / 50) * 100 is 14.000000000000002 in
  // floating point, whereas (7 * 100) / 50 is exactly 14.
  const rate = (failures * 100) / total;

  let performance: string;
  if (rate <= 15) {
    performance = 'elite'; // 0-15%
  } else if (rate <= 30) {
    performance = 'high'; // 16-30%
  } else if (rate <= 45) {
    performance = 'medium'; // 31-45%
  } else {
    performance = 'low'; // 46%+
  }

  return { rate, performance };
}
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### 4. Time to Restore Service (MTTR)
|
|
152
|
+
|
|
153
|
+
**Definition:** How long it takes to restore service when an incident occurs.
|
|
154
|
+
|
|
155
|
+
```typescript
|
|
156
|
+
// MTTR calculation

/** One resolved incident. */
interface Incident {
  /** Incident identifier (not read by `calculateMTTR`). */
  id: string;
  /** Start of the restoration interval. */
  startTime: Date;
  /** End of the restoration interval. */
  resolvedTime: Date;
  /** Severity (not read by `calculateMTTR`). */
  severity: 'critical' | 'major' | 'minor';
  /** Service name used as the grouping key for `byService`. */
  service: string;
}

/**
 * Computes the median time to restore service, overall and per service.
 *
 * The median of an even-sized sample is the mean of its two middle values.
 * With no incidents there is nothing to measure: `medianHours` is `NaN`,
 * `byService` is empty and the tier is `'low'`.
 *
 * @param incidents Incidents resolved during the reporting window.
 * @returns `medianHours`, the tier implied by it (`'elite'` < 1 h, `'high'`
 *          < 24 h, `'medium'` < 168 h, otherwise `'low'`), and `byService`,
 *          mapping each service name to its own median in hours.
 */
function calculateMTTR(incidents: Incident[]): {
  medianHours: number;
  performance: string;
  byService: Record<string, number>;
} {
  const MS_PER_HOUR = 1000 * 60 * 60;

  // Median of a non-empty sample; copies before sorting so inputs are untouched.
  const medianOf = (values: readonly number[]): number => {
    const sorted = [...values].sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
  };

  if (incidents.length === 0) {
    return { medianHours: NaN, performance: 'low', byService: {} };
  }

  // Compute each duration once and bucket it by service in the same pass.
  const allHours: number[] = [];
  const hoursByService = new Map<string, number[]>();
  for (const incident of incidents) {
    const hours =
      (incident.resolvedTime.getTime() - incident.startTime.getTime()) / MS_PER_HOUR;
    allHours.push(hours);

    const bucket = hoursByService.get(incident.service);
    if (bucket) {
      bucket.push(hours);
    } else {
      hoursByService.set(incident.service, [hours]);
    }
  }

  const median = medianOf(allHours);

  let performance: string;
  if (median < 1) {
    performance = 'elite'; // Less than 1 hour
  } else if (median < 24) {
    performance = 'high'; // Less than 1 day
  } else if (median < 168) {
    performance = 'medium'; // Less than 1 week
  } else {
    performance = 'low'; // 1 week or more
  }

  const serviceMedians: Record<string, number> = Object.fromEntries(
    [...hoursByService].map(([service, hours]): [string, number] => [service, medianOf(hours)])
  );

  return { medianHours: median, performance, byService: serviceMedians };
}
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Performance Levels (2024 Benchmarks)
|
|
207
|
+
|
|
208
|
+
| Level | Deploy Freq | Lead Time | Change Failure | MTTR |
|
|
209
|
+
|-------|-------------|-----------|----------------|------|
|
|
210
|
+
| **Elite** | Multiple/day | < 1 hour | 0-15% | < 1 hour |
|
|
211
|
+
| **High** | Daily-Weekly | 1 day - 1 week | 16-30% | < 1 day |
|
|
212
|
+
| **Medium** | Weekly-Monthly | 1 week - 1 month | 16-30% | < 1 day |
|
|
213
|
+
| **Low** | Monthly+ | 1-6 months | 16-30% | < 1 week |
|
|
214
|
+
|
|
215
|
+
**Key Insight (2024 DORA Report):** Elite performers are **2.3x more likely** to meet reliability targets when using trunk-based development.
|
|
216
|
+
|
|
217
|
+
## Measurement Implementation
|
|
218
|
+
|
|
219
|
+
### GitHub Actions DORA Workflow
|
|
220
|
+
|
|
221
|
+
```yaml
|
|
222
|
+
# .github/workflows/dora-metrics.yml
# Computes deployment frequency and lead time for the last 30 days and
# uploads a Markdown report as a build artifact.
name: DORA Metrics Collection

on:
  schedule:
    - cron: '0 0 * * 0'  # Weekly on Sunday
  workflow_dispatch:

# Least privilege: the job only reads workflow runs, pull requests and code.
permissions:
  actions: read
  contents: read
  pull-requests: read

jobs:
  collect-metrics:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Collect Deployment Data
        id: deployments
        uses: actions/github-script@v7
        with:
          script: |
            const PERIOD_DAYS = 30;
            const thirtyDaysAgo = new Date();
            thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - PERIOD_DAYS);

            // Get workflow runs (deployments). Paginate: a single request
            // returns one page only (30 runs by default), which would cap the
            // deployment count and therefore the reported frequency.
            const runs = await github.paginate(github.rest.actions.listWorkflowRuns, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              workflow_id: 'deploy.yml',
              created: `>=${thirtyDaysAgo.toISOString()}`,
              status: 'completed',
              per_page: 100
            });

            const deployments = runs.filter(r => r.conclusion === 'success');

            // Calculate deployment frequency
            const deploymentsPerDay = deployments.length / PERIOD_DAYS;

            return {
              count: deployments.length,
              perDay: deploymentsPerDay.toFixed(2),
              frequency: deploymentsPerDay >= 1 ? 'elite' :
                         deploymentsPerDay >= 1 / 7 ? 'high' :
                         deploymentsPerDay >= 1 / 30 ? 'medium' : 'low'
            };

      - name: Collect Lead Time Data
        id: lead-time
        uses: actions/github-script@v7
        with:
          script: |
            const thirtyDaysAgo = new Date();
            thirtyDaysAgo.setDate(thirtyDaysAgo.getDate() - 30);

            // Get merged PRs. Only the 100 most recently updated closed PRs
            // are inspected; lead time here is PR creation to merge.
            const { data: prs } = await github.rest.pulls.list({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'closed',
              sort: 'updated',
              direction: 'desc',
              per_page: 100
            });

            const mergedPRs = prs.filter(pr =>
              pr.merged_at &&
              new Date(pr.merged_at) > thirtyDaysAgo
            );

            const leadTimes = mergedPRs.map(pr => {
              const created = new Date(pr.created_at);
              const merged = new Date(pr.merged_at);
              return (merged - created) / (1000 * 60 * 60); // hours
            });

            leadTimes.sort((a, b) => a - b);
            // Falls back to 0 when no PR was merged in the window.
            const median = leadTimes[Math.floor(leadTimes.length / 2)] || 0;

            return {
              medianHours: median.toFixed(1),
              performance: median < 1 ? 'elite' :
                          median < 24 ? 'high' :
                          median < 168 ? 'medium' : 'low'
            };

      - name: Generate Report
        run: |
          cat << EOF > dora-report.md
          # DORA Metrics Report
          **Period:** Last 30 days
          **Generated:** $(date -u +"%Y-%m-%d %H:%M:%S UTC")

          ## Metrics Summary

          | Metric | Value | Performance |
          |--------|-------|-------------|
          | Deployment Frequency | ${{ fromJson(steps.deployments.outputs.result).perDay }}/day | ${{ fromJson(steps.deployments.outputs.result).frequency }} |
          | Lead Time for Changes | ${{ fromJson(steps.lead-time.outputs.result).medianHours }} hours | ${{ fromJson(steps.lead-time.outputs.result).performance }} |

          ## Recommendations
          $(if [ "${{ fromJson(steps.deployments.outputs.result).frequency }}" != "elite" ]; then echo "- Increase deployment frequency through smaller, more frequent releases"; fi)
          $(if [ "${{ fromJson(steps.lead-time.outputs.result).performance }}" != "elite" ]; then echo "- Reduce lead time by automating more of the review process"; fi)
          EOF

      - name: Upload Report
        uses: actions/upload-artifact@v4
        with:
          name: dora-metrics-report
          path: dora-report.md
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
### Custom Metrics Collection Script
|
|
334
|
+
|
|
335
|
+
```typescript
|
|
336
|
+
// scripts/collect-dora-metrics.ts
|
|
337
|
+
import { Octokit } from '@octokit/rest';
|
|
338
|
+
|
|
339
|
+
/** DORA performance tier, ordered from best (`elite`) to worst (`low`). */
type PerformanceLevel = 'elite' | 'high' | 'medium' | 'low';

/** The four DORA metrics for one repository over one reporting window. */
interface DORAMetrics {
  period: { start: Date; end: Date };
  deploymentFrequency: {
    count: number;
    perDay: number;
    performance: PerformanceLevel;
  };
  leadTime: {
    medianHours: number;
    p90Hours: number;
    performance: PerformanceLevel;
  };
  changeFailureRate: {
    total: number;
    failures: number;
    rate: number;
    performance: PerformanceLevel;
  };
  mttr: {
    medianHours: number;
    incidentCount: number;
    performance: PerformanceLevel;
  };
  overallPerformance: PerformanceLevel;
}

/**
 * Collects DORA metrics for a single GitHub repository.
 *
 * Data sources and proxies used:
 * - deployments: runs of the `deploy.yml` workflow;
 * - lead time: pull-request creation to merge;
 * - incidents: closed issues labelled `incident`.
 */
class DORAMetricsCollector {
  private static readonly MS_PER_HOUR = 1000 * 60 * 60;

  private readonly octokit: Octokit;
  private readonly owner: string;
  private readonly repo: string;

  /**
   * @param token GitHub token used to authenticate API requests.
   * @param owner Repository owner (user or organisation).
   * @param repo  Repository name.
   */
  constructor(token: string, owner: string, repo: string) {
    this.octokit = new Octokit({ auth: token });
    this.owner = owner;
    this.repo = repo;
  }

  /**
   * Fetches deployments, merged pull requests and incidents for the window
   * ending now, then derives the four metrics and an overall tier.
   *
   * @param periodDays Length of the reporting window in days (default 30).
   * @throws RangeError if `periodDays` is not a positive, finite number.
   */
  async collect(periodDays: number = 30): Promise<DORAMetrics> {
    // A zero or negative window would divide by zero in the frequency metric.
    if (!Number.isFinite(periodDays) || periodDays <= 0) {
      throw new RangeError(`periodDays must be a positive number, got ${periodDays}`);
    }

    const end = new Date();
    const start = new Date(end);
    start.setDate(start.getDate() - periodDays);

    const [deployments, prs, incidents] = await Promise.all([
      this.getDeployments(start, end),
      this.getMergedPRs(start, end),
      this.getIncidents(start, end)
    ]);

    // Calculate each metric
    const deploymentFrequency = this.calcDeploymentFrequency(deployments, periodDays);
    const leadTime = this.calcLeadTime(prs);
    const changeFailureRate = this.calcChangeFailureRate(deployments, incidents);
    const mttr = this.calcMTTR(incidents);

    // Determine overall performance
    const overallPerformance = this.getOverallPerformance([
      deploymentFrequency.performance,
      leadTime.performance,
      changeFailureRate.performance,
      mttr.performance
    ]);

    return {
      period: { start, end },
      deploymentFrequency,
      leadTime,
      changeFailureRate,
      mttr,
      overallPerformance
    };
  }

  /**
   * Lists every `deploy.yml` run created inside the window.
   *
   * Paginated, because a single request returns one page only (30 runs by
   * default), which would silently cap the deployment count.
   * NOTE(review): GitHub documents a 1,000-result limit for filtered run
   * listings; confirm this is acceptable for high-volume repositories.
   */
  private async getDeployments(start: Date, end: Date) {
    return this.octokit.paginate(this.octokit.actions.listWorkflowRuns, {
      owner: this.owner,
      repo: this.repo,
      workflow_id: 'deploy.yml',
      created: `${start.toISOString()}..${end.toISOString()}`,
      per_page: 100
    });
  }

  /**
   * Lists pull requests merged inside the window.
   *
   * `direction: 'desc'` is required: with `sort: 'updated'` the API defaults
   * to ascending order and would return the oldest pull requests first.
   */
  private async getMergedPRs(start: Date, end: Date) {
    const closed = await this.octokit.paginate(
      this.octokit.pulls.list,
      {
        owner: this.owner,
        repo: this.repo,
        state: 'closed',
        sort: 'updated',
        direction: 'desc',
        per_page: 100
      },
      (response, done) => {
        // Results are newest-update first and a merge counts as an update,
        // so once a page reaches PRs last updated before the window, no
        // later page can contain a merge inside it.
        if (response.data.some(pr => new Date(pr.updated_at) < start)) {
          done();
        }
        return response.data;
      }
    );

    return closed.filter(pr => {
      if (!pr.merged_at) return false;
      const mergedAt = new Date(pr.merged_at);
      return mergedAt >= start && mergedAt <= end;
    });
  }

  /**
   * Lists incidents closed inside the window.
   *
   * This would typically come from PagerDuty, OpsGenie, or GitHub Issues;
   * this placeholder uses closed issues labelled `incident`.
   */
  private async getIncidents(start: Date, end: Date) {
    const issues = await this.octokit.paginate(this.octokit.issues.listForRepo, {
      owner: this.owner,
      repo: this.repo,
      labels: 'incident',
      state: 'closed',
      since: start.toISOString(), // filters on last update, not on close time
      per_page: 100
    });

    return issues.filter(issue => {
      // The issues endpoint also returns pull requests; skip those.
      if (issue.pull_request || !issue.closed_at) return false;
      const closedAt = new Date(issue.closed_at);
      return closedAt >= start && closedAt <= end;
    });
  }

  /** Counts successful deployments and averages them over the window. */
  private calcDeploymentFrequency(
    deployments: ReadonlyArray<{ conclusion: string | null }>,
    periodDays: number
  ) {
    const count = deployments.filter(d => d.conclusion === 'success').length;
    const perDay = count / periodDays;

    return {
      count,
      perDay,
      performance: this.getFrequencyPerformance(perDay)
    };
  }

  /**
   * Median and 90th-percentile hours from PR creation to merge.
   * Both figures are 0 (and the tier therefore `'elite'`) when no PR merged.
   */
  private calcLeadTime(prs: ReadonlyArray<{ created_at: string; merged_at: string | null }>) {
    const hours = prs
      .flatMap(pr => (pr.merged_at ? [this.hoursBetween(pr.created_at, pr.merged_at)] : []))
      .sort((a, b) => a - b);

    const median = DORAMetricsCollector.median(hours);

    return {
      medianHours: median,
      p90Hours: DORAMetricsCollector.p90(hours),
      performance: this.getLeadTimePerformance(median)
    };
  }

  /**
   * Incidents per successful deployment, as a percentage.
   * Simplified: every incident is attributed to exactly one deployment.
   * The rate is 0 when there were no successful deployments.
   */
  private calcChangeFailureRate(
    deployments: ReadonlyArray<{ conclusion: string | null }>,
    incidents: ReadonlyArray<unknown>
  ) {
    const total = deployments.filter(d => d.conclusion === 'success').length;
    const failures = incidents.length; // Simplified

    // Multiply before dividing to avoid float noise such as 14.000000000000002.
    const rate = total > 0 ? (failures * 100) / total : 0;

    return {
      total,
      failures,
      rate,
      performance: this.getFailureRatePerformance(rate)
    };
  }

  /**
   * Median hours from incident creation to close.
   * The median is 0 (and the tier therefore `'elite'`) when there were no
   * closed incidents.
   */
  private calcMTTR(incidents: ReadonlyArray<{ created_at: string; closed_at: string | null }>) {
    const hours = incidents
      .flatMap(i => (i.closed_at ? [this.hoursBetween(i.created_at, i.closed_at)] : []))
      .sort((a, b) => a - b);

    const median = DORAMetricsCollector.median(hours);

    return {
      medianHours: median,
      incidentCount: incidents.length,
      performance: this.getMTTRPerformance(median)
    };
  }

  /** Hours elapsed between two ISO-8601 timestamps. */
  private hoursBetween(fromIso: string, toIso: string): number {
    return (
      (new Date(toIso).getTime() - new Date(fromIso).getTime()) /
      DORAMetricsCollector.MS_PER_HOUR
    );
  }

  /** Median of an ascending array; mean of the two middle values when even; 0 when empty. */
  private static median(sortedAsc: readonly number[]): number {
    const n = sortedAsc.length;
    if (n === 0) return 0;
    const mid = Math.floor(n / 2);
    return n % 2 === 1 ? sortedAsc[mid] : (sortedAsc[mid - 1] + sortedAsc[mid]) / 2;
  }

  /** Nearest-rank 90th percentile of an ascending array; 0 when empty. */
  private static p90(sortedAsc: readonly number[]): number {
    const n = sortedAsc.length;
    return n === 0 ? 0 : sortedAsc[Math.ceil((n * 9) / 10) - 1];
  }

  private getFrequencyPerformance(perDay: number): PerformanceLevel {
    if (perDay >= 1) return 'elite';
    if (perDay >= 1 / 7) return 'high';
    if (perDay >= 1 / 30) return 'medium';
    return 'low';
  }

  private getLeadTimePerformance(hours: number): PerformanceLevel {
    if (hours < 1) return 'elite';
    if (hours < 24) return 'high';
    if (hours < 168) return 'medium';
    return 'low';
  }

  private getFailureRatePerformance(rate: number): PerformanceLevel {
    if (rate <= 15) return 'elite';
    if (rate <= 30) return 'high';
    if (rate <= 45) return 'medium';
    return 'low';
  }

  private getMTTRPerformance(hours: number): PerformanceLevel {
    if (hours < 1) return 'elite';
    if (hours < 24) return 'high';
    if (hours < 168) return 'medium';
    return 'low';
  }

  /** Averages the tiers (elite=4 ... low=1) and rounds to the nearest tier. */
  private getOverallPerformance(performances: readonly PerformanceLevel[]): PerformanceLevel {
    const scores: Record<PerformanceLevel, number> = { elite: 4, high: 3, medium: 2, low: 1 };
    const avg = performances.reduce((sum, p) => sum + scores[p], 0) / performances.length;

    if (avg >= 3.5) return 'elite';
    if (avg >= 2.5) return 'high';
    if (avg >= 1.5) return 'medium';
    return 'low';
  }
}

// Usage
const token = process.env.GITHUB_TOKEN;
if (!token) {
  // Fail with a clear message instead of sending unauthenticated requests.
  throw new Error('GITHUB_TOKEN environment variable is required');
}

const collector = new DORAMetricsCollector(token, 'myorg', 'myrepo');

const metrics = await collector.collect(30);
console.log(JSON.stringify(metrics, null, 2));
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
### Grafana Dashboard Configuration
|
|
564
|
+
|
|
565
|
+
```json
|
|
566
|
+
{
|
|
567
|
+
"dashboard": {
|
|
568
|
+
"title": "DORA Metrics Dashboard",
|
|
569
|
+
"panels": [
|
|
570
|
+
{
|
|
571
|
+
"title": "Deployment Frequency",
|
|
572
|
+
"type": "stat",
|
|
573
|
+
"targets": [
|
|
574
|
+
{
|
|
575
|
+
"expr": "sum(increase(deployments_total{environment=\"production\"}[30d])) / 30",
|
|
576
|
+
"legendFormat": "Deploys/day"
|
|
577
|
+
}
|
|
578
|
+
],
|
|
579
|
+
"fieldConfig": {
|
|
580
|
+
"defaults": {
|
|
581
|
+
"thresholds": {
|
|
582
|
+
"steps": [
|
|
583
|
+
{ "value": 0, "color": "red" },
|
|
584
|
+
{ "value": 0.03, "color": "orange" },
|
|
585
|
+
{ "value": 0.14, "color": "yellow" },
|
|
586
|
+
{ "value": 1, "color": "green" }
|
|
587
|
+
]
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
}
|
|
591
|
+
},
|
|
592
|
+
{
|
|
593
|
+
"title": "Lead Time for Changes",
|
|
594
|
+
"type": "stat",
|
|
595
|
+
"targets": [
|
|
596
|
+
{
|
|
597
|
+
"expr": "histogram_quantile(0.5, sum(rate(lead_time_hours_bucket[30d])) by (le))",
|
|
598
|
+
"legendFormat": "Median (hours)"
|
|
599
|
+
}
|
|
600
|
+
],
|
|
601
|
+
"fieldConfig": {
|
|
602
|
+
"defaults": {
|
|
603
|
+
"unit": "h",
|
|
604
|
+
"thresholds": {
|
|
605
|
+
"steps": [
|
|
606
|
+
{ "value": 0, "color": "green" },
|
|
607
|
+
{ "value": 1, "color": "yellow" },
|
|
608
|
+
{ "value": 24, "color": "orange" },
|
|
609
|
+
{ "value": 168, "color": "red" }
|
|
610
|
+
]
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
},
|
|
615
|
+
{
|
|
616
|
+
"title": "Change Failure Rate",
|
|
617
|
+
"type": "gauge",
|
|
618
|
+
"targets": [
|
|
619
|
+
{
|
|
620
|
+
"expr": "sum(increase(deployments_failed_total[30d])) / sum(increase(deployments_total[30d])) * 100",
|
|
621
|
+
"legendFormat": "Failure Rate %"
|
|
622
|
+
}
|
|
623
|
+
],
|
|
624
|
+
"fieldConfig": {
|
|
625
|
+
"defaults": {
|
|
626
|
+
"unit": "percent",
|
|
627
|
+
"min": 0,
|
|
628
|
+
"max": 100,
|
|
629
|
+
"thresholds": {
|
|
630
|
+
"steps": [
|
|
631
|
+
{ "value": 0, "color": "green" },
|
|
632
|
+
{ "value": 15, "color": "yellow" },
|
|
633
|
+
{ "value": 30, "color": "orange" },
|
|
634
|
+
{ "value": 45, "color": "red" }
|
|
635
|
+
]
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
},
|
|
640
|
+
{
|
|
641
|
+
"title": "Time to Restore (MTTR)",
|
|
642
|
+
"type": "stat",
|
|
643
|
+
"targets": [
|
|
644
|
+
{
|
|
645
|
+
"expr": "histogram_quantile(0.5, sum(rate(incident_resolution_hours_bucket[30d])) by (le))",
|
|
646
|
+
"legendFormat": "Median (hours)"
|
|
647
|
+
}
|
|
648
|
+
],
|
|
649
|
+
"fieldConfig": {
|
|
650
|
+
"defaults": {
|
|
651
|
+
"unit": "h",
|
|
652
|
+
"thresholds": {
|
|
653
|
+
"steps": [
|
|
654
|
+
{ "value": 0, "color": "green" },
|
|
655
|
+
{ "value": 1, "color": "yellow" },
|
|
656
|
+
{ "value": 24, "color": "orange" },
|
|
657
|
+
{ "value": 168, "color": "red" }
|
|
658
|
+
]
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
]
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
## Tools and Platforms
|
|
669
|
+
|
|
670
|
+
| Tool | Type | Features |
|
|
671
|
+
|------|------|----------|
|
|
672
|
+
| **Four Keys** (Google) | Open Source | GitHub/GitLab integration, BigQuery |
|
|
673
|
+
| **LinearB** | Commercial | Git analytics, workflow metrics |
|
|
674
|
+
| **Sleuth** | Commercial | Deploy tracking, change intelligence |
|
|
675
|
+
| **Faros AI** | Commercial | Multi-source aggregation |
|
|
676
|
+
| **Propelo** | Commercial | SDLC insights |
|
|
677
|
+
| **Jellyfish** | Commercial | Engineering management |
|
|
678
|
+
|
|
679
|
+
### Four Keys Setup (Google)
|
|
680
|
+
|
|
681
|
+
```bash
|
|
682
|
+
# Deploy Four Keys to GCP
|
|
683
|
+
git clone https://github.com/dora-team/fourkeys.git
|
|
684
|
+
cd fourkeys
|
|
685
|
+
|
|
686
|
+
# Configure
|
|
687
|
+
export PROJECT_ID="my-project"
|
|
688
|
+
export REGION="us-central1"
|
|
689
|
+
|
|
690
|
+
# Deploy
|
|
691
|
+
./setup/setup.sh
|
|
692
|
+
|
|
693
|
+
# Configure webhook for GitHub events
|
|
694
|
+
# Add to GitHub repo settings: https://<REGION>-<PROJECT_ID>.cloudfunctions.net/github-parser
|
|
695
|
+
```
|
|
696
|
+
|
|
697
|
+
## Improvement Strategies
|
|
698
|
+
|
|
699
|
+
### Improving Deployment Frequency
|
|
700
|
+
|
|
701
|
+
| Current | Target | Strategy |
|
|
702
|
+
|---------|--------|----------|
|
|
703
|
+
| Monthly | Weekly | Automate deployments, reduce batch size |
|
|
704
|
+
| Weekly | Daily | Feature flags, trunk-based development |
|
|
705
|
+
| Daily | Multiple/day | Continuous deployment, small PRs |
|
|
706
|
+
|
|
707
|
+
### Improving Lead Time
|
|
708
|
+
|
|
709
|
+
| Bottleneck | Solution |
|
|
710
|
+
|------------|----------|
|
|
711
|
+
| Long code reviews | Smaller PRs, async reviews, automation |
|
|
712
|
+
| Manual testing | Automated tests, shift-left |
|
|
713
|
+
| Manual deployments | CI/CD automation |
|
|
714
|
+
| Environment issues | Infrastructure as code |
|
|
715
|
+
|
|
716
|
+
### Reducing Change Failure Rate
|
|
717
|
+
|
|
718
|
+
| Problem | Solution |
|
|
719
|
+
|---------|----------|
|
|
720
|
+
| Insufficient testing | Increase coverage, add integration tests |
|
|
721
|
+
| Big bang releases | Feature flags, canary releases |
|
|
722
|
+
| Lack of review | Automated checks, required reviews |
|
|
723
|
+
| Poor monitoring | Better observability, alerting |
|
|
724
|
+
|
|
725
|
+
### Reducing MTTR
|
|
726
|
+
|
|
727
|
+
| Improvement | Impact |
|
|
728
|
+
|-------------|--------|
|
|
729
|
+
| Runbooks | Faster diagnosis |
|
|
730
|
+
| Feature flags | Instant rollback |
|
|
731
|
+
| Observability | Faster root cause |
|
|
732
|
+
| Chaos engineering | Proactive resilience |
|
|
733
|
+
|
|
734
|
+
## Best Practices
|
|
735
|
+
|
|
736
|
+
### 1. Measure Consistently
|
|
737
|
+
|
|
738
|
+
```typescript
|
|
739
|
+
// Standardized metric definitions
/**
 * Single written-down definition of how each DORA metric is sourced and
 * aggregated, so every report measures the same thing.
 * `as const` keeps each entry readonly and typed as its literal value.
 */
const METRIC_DEFINITIONS = {
  deploymentFrequency: {
    source: 'GitHub Actions',
    filter: 'workflow=deploy.yml, conclusion=success',
    aggregation: 'count per day'
  },
  leadTime: {
    source: 'GitHub PRs',
    measurement: 'created_at to merged_at',
    aggregation: 'median'
  },
  changeFailureRate: {
    source: 'GitHub Issues + Deployments',
    filter: 'label=incident, within 24h of deployment',
    aggregation: 'incidents / deployments * 100'
  },
  mttr: {
    source: 'PagerDuty',
    measurement: 'triggered_at to resolved_at',
    aggregation: 'median'
  }
} as const;
|
|
762
|
+
```
|
|
763
|
+
|
|
764
|
+
### 2. Set Realistic Goals
|
|
765
|
+
|
|
766
|
+
```yaml
|
|
767
|
+
# Quarterly improvement targets
|
|
768
|
+
q1_2024:
|
|
769
|
+
deployment_frequency:
|
|
770
|
+
current: 0.5/day
|
|
771
|
+
target: 1.0/day
|
|
772
|
+
improvement: 100%
|
|
773
|
+
lead_time:
|
|
774
|
+
current: 48h
|
|
775
|
+
target: 24h
|
|
776
|
+
improvement: 50%
|
|
777
|
+
change_failure_rate:
|
|
778
|
+
current: 25%
|
|
779
|
+
target: 20%
|
|
780
|
+
improvement: 20%
|
|
781
|
+
mttr:
|
|
782
|
+
current: 4h
|
|
783
|
+
target: 2h
|
|
784
|
+
improvement: 50%
|
|
785
|
+
```
|
|
786
|
+
|
|
787
|
+
### 3. Avoid Gaming Metrics
|
|
788
|
+
|
|
789
|
+
| Gaming Behavior | Why It's Bad | Better Approach |
|
|
790
|
+
|-----------------|--------------|-----------------|
|
|
791
|
+
| Deploying empty commits | Fake frequency | Track meaningful changes |
|
|
792
|
+
| Not labeling incidents | Hide failures | Blameless culture |
|
|
793
|
+
| Splitting PRs artificially | Fake lead time | Focus on value |
|
|
794
|
+
| Rushing fixes | Lower quality | Fix root cause |
|
|
795
|
+
|
|
796
|
+
## Use Cases
|
|
797
|
+
|
|
798
|
+
### 1. Team Performance Review
|
|
799
|
+
|
|
800
|
+
```typescript
|
|
801
|
+
// Quarterly DORA review
/**
 * Assembles a quarterly review for one team from its last 90 days of metrics.
 *
 * Relies on helpers that are not defined in this snippet and must be supplied
 * by the surrounding project: `collectMetrics`, `findStrongest`,
 * `findWeakest`, `compareToLastQuarter`, `compareToIndustryBenchmarks` and
 * `generateRecommendations`.
 *
 * @param team Team identifier, passed to `collectMetrics` and
 *             `compareToLastQuarter`.
 * @returns An object with `summary`, `comparison` and `recommendations`.
 */
async function quarterlyReview(team: string) {
  const metrics = await collectMetrics({ team, period: '90d' });

  return {
    summary: {
      overallPerformance: metrics.overallPerformance,
      strongestMetric: findStrongest(metrics),
      improvementArea: findWeakest(metrics)
    },
    comparison: {
      vsLastQuarter: await compareToLastQuarter(team, metrics),
      vsIndustry: compareToIndustryBenchmarks(metrics)
    },
    recommendations: generateRecommendations(metrics)
  };
}
|
|
818
|
+
```
|
|
819
|
+
|
|
820
|
+
### 2. DevOps Transformation Tracking
|
|
821
|
+
|
|
822
|
+
```typescript
|
|
823
|
+
// Track transformation progress
/**
 * Target metric levels for each phase of a DevOps transformation.
 * Later phases add metrics as well as tightening earlier ones.
 * `as const` keeps each target readonly and typed as its literal value.
 */
const transformationGoals = {
  phase1: { // Foundation
    deploymentFrequency: 'weekly',
    leadTime: '< 1 week'
  },
  phase2: { // Acceleration
    deploymentFrequency: 'daily',
    leadTime: '< 1 day',
    changeFailureRate: '< 30%'
  },
  phase3: { // Excellence
    deploymentFrequency: 'multiple/day',
    leadTime: '< 1 hour',
    changeFailureRate: '< 15%',
    mttr: '< 1 hour'
  }
} as const;
|
|
841
|
+
```
|
|
842
|
+
|
|
843
|
+
## Related Skills
|
|
844
|
+
|
|
845
|
+
- `devops/github-actions` - CI/CD automation
|
|
846
|
+
- `devops/observability` - Monitoring and metrics
|
|
847
|
+
- `testing/comprehensive-testing` - Quality gates
|
|
848
|
+
- `devops/feature-flags` - Progressive delivery
|
|
849
|
+
|
|
850
|
+
---
|
|
851
|
+
|
|
852
|
+
*Think Omega. Build Omega. Be Omega.*
|