shipwright-cli 1.10.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +221 -55
- package/completions/_shipwright +264 -32
- package/completions/shipwright.bash +118 -26
- package/completions/shipwright.fish +80 -2
- package/dashboard/server.ts +208 -0
- package/docs/strategy/01-market-research.md +619 -0
- package/docs/strategy/02-mission-and-brand.md +587 -0
- package/docs/strategy/03-gtm-and-roadmap.md +759 -0
- package/docs/strategy/QUICK-START.txt +289 -0
- package/docs/strategy/README.md +172 -0
- package/docs/tmux-research/TMUX-ARCHITECTURE.md +567 -0
- package/docs/tmux-research/TMUX-AUDIT.md +925 -0
- package/docs/tmux-research/TMUX-BEST-PRACTICES-2025-2026.md +829 -0
- package/docs/tmux-research/TMUX-QUICK-REFERENCE.md +543 -0
- package/docs/tmux-research/TMUX-RESEARCH-INDEX.md +438 -0
- package/package.json +4 -2
- package/scripts/lib/helpers.sh +7 -0
- package/scripts/sw +323 -2
- package/scripts/sw-activity.sh +500 -0
- package/scripts/sw-adaptive.sh +925 -0
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-auth.sh +613 -0
- package/scripts/sw-autonomous.sh +754 -0
- package/scripts/sw-changelog.sh +704 -0
- package/scripts/sw-checkpoint.sh +1 -1
- package/scripts/sw-ci.sh +602 -0
- package/scripts/sw-cleanup.sh +1 -1
- package/scripts/sw-code-review.sh +698 -0
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-context.sh +605 -0
- package/scripts/sw-cost.sh +44 -3
- package/scripts/sw-daemon.sh +568 -138
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-db.sh +1380 -0
- package/scripts/sw-decompose.sh +539 -0
- package/scripts/sw-deps.sh +551 -0
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-discovery.sh +412 -0
- package/scripts/sw-docs-agent.sh +539 -0
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +107 -1
- package/scripts/sw-dora.sh +615 -0
- package/scripts/sw-durable.sh +710 -0
- package/scripts/sw-e2e-orchestrator.sh +535 -0
- package/scripts/sw-eventbus.sh +393 -0
- package/scripts/sw-feedback.sh +479 -0
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet-discover.sh +567 -0
- package/scripts/sw-fleet-viz.sh +404 -0
- package/scripts/sw-fleet.sh +8 -1
- package/scripts/sw-github-app.sh +596 -0
- package/scripts/sw-github-checks.sh +4 -4
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-guild.sh +569 -0
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-hygiene.sh +559 -0
- package/scripts/sw-incident.sh +656 -0
- package/scripts/sw-init.sh +237 -24
- package/scripts/sw-instrument.sh +699 -0
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +363 -28
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +267 -21
- package/scripts/sw-memory.sh +18 -1
- package/scripts/sw-mission-control.sh +487 -0
- package/scripts/sw-model-router.sh +545 -0
- package/scripts/sw-otel.sh +596 -0
- package/scripts/sw-oversight.sh +764 -0
- package/scripts/sw-pipeline-composer.sh +1 -1
- package/scripts/sw-pipeline-vitals.sh +1 -1
- package/scripts/sw-pipeline.sh +947 -35
- package/scripts/sw-pm.sh +758 -0
- package/scripts/sw-pr-lifecycle.sh +522 -0
- package/scripts/sw-predictive.sh +8 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +1 -1
- package/scripts/sw-public-dashboard.sh +798 -0
- package/scripts/sw-quality.sh +595 -0
- package/scripts/sw-reaper.sh +1 -1
- package/scripts/sw-recruit.sh +2248 -0
- package/scripts/sw-regression.sh +642 -0
- package/scripts/sw-release-manager.sh +736 -0
- package/scripts/sw-release.sh +706 -0
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-replay.sh +520 -0
- package/scripts/sw-retro.sh +691 -0
- package/scripts/sw-scale.sh +444 -0
- package/scripts/sw-security-audit.sh +505 -0
- package/scripts/sw-self-optimize.sh +1 -1
- package/scripts/sw-session.sh +1 -1
- package/scripts/sw-setup.sh +263 -127
- package/scripts/sw-standup.sh +712 -0
- package/scripts/sw-status.sh +44 -2
- package/scripts/sw-strategic.sh +806 -0
- package/scripts/sw-stream.sh +450 -0
- package/scripts/sw-swarm.sh +620 -0
- package/scripts/sw-team-stages.sh +511 -0
- package/scripts/sw-templates.sh +4 -4
- package/scripts/sw-testgen.sh +566 -0
- package/scripts/sw-tmux-pipeline.sh +554 -0
- package/scripts/sw-tmux-role-color.sh +58 -0
- package/scripts/sw-tmux-status.sh +128 -0
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-trace.sh +485 -0
- package/scripts/sw-tracker-github.sh +188 -0
- package/scripts/sw-tracker-jira.sh +172 -0
- package/scripts/sw-tracker-linear.sh +251 -0
- package/scripts/sw-tracker.sh +117 -2
- package/scripts/sw-triage.sh +627 -0
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-ux.sh +677 -0
- package/scripts/sw-webhook.sh +627 -0
- package/scripts/sw-widgets.sh +530 -0
- package/scripts/sw-worktree.sh +1 -1
- package/templates/pipelines/autonomous.json +2 -2
- package/tmux/shipwright-overlay.conf +35 -17
- package/tmux/tmux.conf +23 -21
|
@@ -0,0 +1,656 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# ╔═══════════════════════════════════════════════════════════════════════════╗
# ║ shipwright incident — Autonomous Incident Detection & Response            ║
# ║ Detect failures · Triage · Root cause analysis · Auto-remediate           ║
# ╚═══════════════════════════════════════════════════════════════════════════╝

# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Report the failing location and status on stderr whenever a command errors.
trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR

VERSION="2.1.0"

# Absolute directory holding this script, and the directory one level above it.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# ─── Colors ─────────────────────────────────────────────────────────────────
# Stored as literal backslash sequences; they only become terminal escapes
# when printed through `echo -e` or printf's %b.
CYAN='\033[38;2;0;212;255m'   # #00d4ff — primary accent
PURPLE='\033[38;2;124;58;237m' # #7c3aed — secondary
BLUE='\033[38;2;0;102;255m'   # #0066ff — tertiary
GREEN='\033[38;2;74;222;128m' # success
YELLOW='\033[38;2;250;204;21m' # warning
RED='\033[38;2;248;113;113m'  # error
DIM='\033[2m'
BOLD='\033[1m'
RESET='\033[0m'

# ─── Cross-platform compatibility ──────────────────────────────────────────
# Optional helper library; loaded only when it is present next to the script.
# shellcheck source=lib/compat.sh
if [[ -f "$SCRIPT_DIR/lib/compat.sh" ]]; then
    source "$SCRIPT_DIR/lib/compat.sh"
fi
|
|
27
|
+
|
|
28
|
+
# ─── Output Helpers ─────────────────────────────────────────────────────────
|
|
29
|
+
# Status-line helpers. Each prints a coloured glyph followed by the message.
# Only the glyph prefix is escape-expanded (%b); the message itself is printed
# verbatim (%s) so backslashes in paths, URLs or root-cause text are not
# reinterpreted as escape sequences.
#   info/success/warn -> stdout
#   error             -> stderr
info()    { printf '%b %s\n' "${CYAN}${BOLD}▸${RESET}" "$*"; }
success() { printf '%b %s\n' "${GREEN}${BOLD}✓${RESET}" "$*"; }
warn()    { printf '%b %s\n' "${YELLOW}${BOLD}⚠${RESET}" "$*"; }
error()   { printf '%b %s\n' "${RED}${BOLD}✗${RESET}" "$*" >&2; }
|
|
33
|
+
|
|
34
|
+
# Print the current UTC time as an ISO-8601 timestamp (e.g. 2024-01-31T12:00:00Z).
now_iso() {
    date -u '+%Y-%m-%dT%H:%M:%SZ'
}

# Print the current time as whole seconds since the Unix epoch.
now_epoch() {
    date '+%s'
}
|
|
36
|
+
|
|
37
|
+
# Render a duration given in whole seconds as a compact human-readable string.
# Arguments: $1 - number of seconds (integer)
# Outputs:   "Nh Nm Ns" from one hour up, "Nm Ns" from one minute up,
#            otherwise "Ns" (no trailing newline)
format_duration() {
    local secs="$1"

    if [[ "$secs" -lt 60 ]]; then
        printf "%ds" "$secs"
        return
    fi

    local rem_minutes=$(( (secs % 3600) / 60 ))
    local rem_seconds=$(( secs % 60 ))

    if [[ "$secs" -lt 3600 ]]; then
        printf "%dm %ds" "$rem_minutes" "$rem_seconds"
        return
    fi

    printf "%dh %dm %ds" $(( secs / 3600 )) "$rem_minutes" "$rem_seconds"
}
|
|
47
|
+
|
|
48
|
+
# ─── Structured Event Log ──────────────────────────────────────────────────
|
|
49
|
+
# Append-only JSON-lines event log written by emit_event and read by the
# failure detectors in this script.
EVENTS_FILE="${HOME}/.shipwright/events.jsonl"

# Append one structured event to $EVENTS_FILE as a single JSON object line.
# Arguments:
#   $1   - event type (stored under "type")
#   $2.. - zero or more key=value pairs; the key is everything before the
#          first '=', the value everything after it
# A value is written as a bare JSON number only when it is a valid JSON number
# (no leading zeros, no trailing dot); anything else is written as a string
# with backslash, double quote, newline, carriage return and tab escaped.
# Keys are written as given and are not escaped.
# Reads:  now_iso, now_epoch (timestamp helpers), EVENTS_FILE
emit_event() {
    local event_type="$1"
    shift

    # Kept in a variable so the regex is not subject to shell quoting rules.
    local number_re='^-?(0|[1-9][0-9]*)(\.[0-9]+)?$'
    local nl=$'\n' cr=$'\r' tab=$'\t'
    local json_fields="" kv key val

    for kv in "$@"; do
        key="${kv%%=*}"
        val="${kv#*=}"
        if [[ "$val" =~ $number_re ]]; then
            json_fields+=",\"${key}\":${val}"
        else
            # Backslashes first, otherwise the escapes added below would be
            # escaped a second time.
            val=${val//\\/\\\\}
            val=${val//\"/\\\"}
            val=${val//$nl/\\n}
            val=${val//$cr/\\r}
            val=${val//$tab/\\t}
            json_fields+=",\"${key}\":\"${val}\""
        fi
    done

    # Create the directory that actually holds the log, wherever it points.
    mkdir -p "$(dirname "$EVENTS_FILE")"
    printf '%s\n' "{\"ts\":\"$(now_iso)\",\"ts_epoch\":$(now_epoch),\"type\":\"${event_type}\"${json_fields}}" >> "$EVENTS_FILE"
}
|
|
68
|
+
|
|
69
|
+
# ─── State Directories ──────────────────────────────────────────────────────
|
|
70
|
+
# Locations of incident state: one JSON record per incident, the response
# configuration, and the PID file of the background monitor.
INCIDENTS_DIR="${HOME}/.shipwright/incidents"
INCIDENT_CONFIG="${INCIDENTS_DIR}/config.json"
MONITOR_PID_FILE="${INCIDENTS_DIR}/monitor.pid"

# Make sure the incidents directory exists and seed the default configuration
# the first time. An existing config file is never overwritten.
ensure_incident_dir() {
    mkdir -p "$INCIDENTS_DIR"

    if [[ ! -f "$INCIDENT_CONFIG" ]]; then
        cat > "$INCIDENT_CONFIG" << 'EOF'
{
  "auto_response_enabled": true,
  "p0_auto_hotfix": true,
  "p1_auto_hotfix": false,
  "auto_rollback_enabled": false,
  "notification_channels": ["stdout"],
  "severity_thresholds": {
    "p0_impact_count": 3,
    "p0_deploy_failure": true,
    "p1_test_regression_count": 5,
    "p1_pipeline_failure_rate": 0.3
  },
  "root_cause_patterns": {
    "timeout_keywords": ["timeout", "deadline", "too slow"],
    "memory_keywords": ["out of memory", "OOM", "heap"],
    "dependency_keywords": ["dependency", "import", "require", "not found"],
    "auth_keywords": ["auth", "permission", "forbidden", "401", "403"]
  }
}
EOF
    fi
}
|
|
98
|
+
|
|
99
|
+
# ─── Failure Detection ──────────────────────────────────────────────────────
|
|
100
|
+
|
|
101
|
+
# Print every recent failure event line from $EVENTS_FILE.
# Arguments: $1 - look-back window in seconds (default 3600)
# Outputs:   matching raw event lines on stdout, each at most once
# Returns:   0 when the log file is missing or at least one line matched,
#            1 when the log exists but holds no recent failure
# A line matches when it mentions pipeline.failed, stage.failed, test.failed
# or deploy.failed and its ts_epoch is newer than (now - window).
detect_pipeline_failures() {
    local since="${1:-3600}" # Last N seconds
    local cutoff_time
    cutoff_time=$(( $(now_epoch) - since ))

    [[ -f "$EVENTS_FILE" ]] || return 0

    # Splitting on '"' puts the key name ts_epoch in one field and its value
    # (":<digits>,") in the very next field.
    awk -v cutoff="$cutoff_time" -F'"' '
        /pipeline\.failed|stage\.failed|test\.failed|deploy\.failed/ {
            for (i = 1; i < NF; i++) {
                if ($i == "ts_epoch") {
                    ts_epoch_val = $(i + 1)
                    sub(/^[^0-9]*/, "", ts_epoch_val)
                    sub(/[^0-9].*/, "", ts_epoch_val)
                    if (ts_epoch_val + 0 > cutoff) {
                        print $0
                        count++
                    }
                    break   # one timestamp per event; never print a line twice
                }
            }
        }
        END { exit (count > 0 ? 0 : 1) }
    ' "$EVENTS_FILE"
}
|
|
125
|
+
|
|
126
|
+
# Print recent failure-like events from $EVENTS_FILE as one JSON array.
# Arguments: $1 - look-back window in seconds (default 3600)
# Outputs:   JSON array of {ts, ts_epoch, type, issue, stage, reason, error};
#            "[]" when the log is missing, empty, or cannot be processed
# Returns:   always 0
# An event is included when its ts_epoch is newer than (now - window) and its
# type contains "failed", "error" or "timeout". Lines that are not valid JSON,
# are not objects, or lack a usable type/ts_epoch are skipped individually so
# one bad line cannot hide the remaining failures.
get_recent_failures() {
    local since="${1:-3600}"
    local cutoff_time
    cutoff_time=$(( $(now_epoch) - since ))

    if [[ ! -f "$EVENTS_FILE" ]]; then
        echo "[]"
        return 0
    fi

    local failures
    # Stage 1 parses the log line by line and drops unparsable lines;
    # stage 2 slurps the survivors and filters them.
    failures=$(
        jq -c -R 'fromjson?' "$EVENTS_FILE" 2>/dev/null \
            | jq -s --argjson cutoff "$cutoff_time" '
                map(
                    select(type == "object")
                    | select(((.ts_epoch // 0) | tonumber? // 0) > $cutoff)
                    | select(
                        (.type // "") | tostring
                        | (contains("failed") or contains("error") or contains("timeout"))
                      )
                    | {
                        ts: .ts,
                        ts_epoch: .ts_epoch,
                        type: .type,
                        issue: .issue,
                        stage: .stage,
                        reason: .reason,
                        error: .error
                      }
                )
            ' 2>/dev/null
    ) || failures=""

    if [[ -n "$failures" ]]; then
        printf '%s\n' "$failures"
    else
        echo "[]"
    fi
}
|
|
150
|
+
|
|
151
|
+
# ─── Severity Classification ───────────────────────────────────────────────
|
|
152
|
+
|
|
153
|
+
# Map a failure event type and its blast radius to a priority level.
# Arguments:
#   $1 - failure type (e.g. deploy.failed, stage.failed)
#   $2 - impact scope: number of affected resources
# Outputs: one of P0, P1, P2, P3 on stdout
#   deploy.failed / pipeline.critical_error   -> P0
#   test.regression / stage.failed            -> P0 when scope > 5, else P1
#   stage.timeout / health_check.failed       -> P2
#   anything else                             -> P3
classify_severity() {
    local failure_type="$1"
    local impact_scope="$2" # Number of affected resources
    local level="P3"

    case "$failure_type" in
        deploy.failed | pipeline.critical_error)
            level="P0"
            ;;
        test.regression | stage.failed)
            level="P1"
            if [[ "$impact_scope" -gt 5 ]]; then
                level="P0"
            fi
            ;;
        stage.timeout | health_check.failed)
            level="P2"
            ;;
    esac

    echo "$level"
}
|
|
176
|
+
|
|
177
|
+
# ─── Root Cause Analysis ───────────────────────────────────────────────────
|
|
178
|
+
|
|
179
|
+
# Guess a root-cause category for a failure from keywords in its log text.
# Arguments:
#   $1 - failure log text (may be multi-line)
#   $2 - path to the incident config; accepted for interface compatibility
#        but not read here (the keyword lists below are built in)
# Outputs: a one-line diagnosis on stdout. Categories are tried in order:
#          timeout, memory, dependency; otherwise "Unknown cause".
# The counts are numbers of matching lines (grep -c), case-insensitive.
analyze_root_cause() {
    local failure_log="$1"

    local timeout_hits error_hits memory_hits dependency_hits
    # grep -c prints "0" AND exits 1 when nothing matches, so the fallback must
    # not print a second value; `|| true` only neutralises the exit status.
    timeout_hits=$(grep -Eic 'timeout|deadline|too slow' <<< "$failure_log" || true)
    memory_hits=$(grep -Eic 'out of memory|OOM|heap' <<< "$failure_log" || true)
    dependency_hits=$(grep -Eic 'dependency|import|require|not found' <<< "$failure_log" || true)
    error_hits=$(grep -c . <<< "$failure_log" || true)

    if [[ "${timeout_hits:-0}" -gt 0 ]]; then
        echo "Performance degradation: Timeout detected (${timeout_hits} occurrences)"
    elif [[ "${memory_hits:-0}" -gt 0 ]]; then
        echo "Memory pressure: OOM or heap allocation issue (${memory_hits} occurrences)"
    elif [[ "${dependency_hits:-0}" -gt 0 ]]; then
        echo "Dependency failure: Missing or incompatible dependency (${dependency_hits} occurrences)"
    else
        echo "Unknown cause: Check logs (${error_hits:-0} error lines)"
    fi
}
|
|
199
|
+
|
|
200
|
+
# ─── Incident Record Management ─────────────────────────────────────────────
|
|
201
|
+
|
|
202
|
+
create_incident_record() {
|
|
203
|
+
local incident_id="$1"
|
|
204
|
+
local severity="$2"
|
|
205
|
+
local root_cause="$3"
|
|
206
|
+
local failure_events="$4"
|
|
207
|
+
|
|
208
|
+
local incident_file="${INCIDENTS_DIR}/${incident_id}.json"
|
|
209
|
+
local created_at
|
|
210
|
+
created_at="$(now_iso)"
|
|
211
|
+
|
|
212
|
+
cat > "$incident_file" << EOF
|
|
213
|
+
{
|
|
214
|
+
"id": "$incident_id",
|
|
215
|
+
"created_at": "$created_at",
|
|
216
|
+
"severity": "$severity",
|
|
217
|
+
"status": "open",
|
|
218
|
+
"root_cause": "$root_cause",
|
|
219
|
+
"failure_events": $failure_events,
|
|
220
|
+
"timeline": [],
|
|
221
|
+
"remediation": null,
|
|
222
|
+
"resolved_at": null,
|
|
223
|
+
"mttr_seconds": null,
|
|
224
|
+
"post_mortem_url": null
|
|
225
|
+
}
|
|
226
|
+
EOF
|
|
227
|
+
|
|
228
|
+
emit_event "incident.created" "incident_id=$incident_id" "severity=$severity"
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
# ─── Hotfix Creation ───────────────────────────────────────────────────────
|
|
232
|
+
|
|
233
|
+
# Open a GitHub issue for an incident via the gh CLI.
# Arguments: $1 - incident id, $2 - severity, $3 - root-cause text
# Outputs:   stdout carries ONLY the new issue number (callers capture it);
#            all status messages go to stderr
# Returns:   0 when the issue was created, 1 when gh is missing or the
#            creation failed
create_hotfix_issue() {
    local incident_id="$1"
    local severity="$2"
    local root_cause="$3"

    if ! command -v gh &>/dev/null; then
        warn "gh CLI not found, skipping GitHub issue creation" >&2
        return 1
    fi

    local title="[HOTFIX] $severity: $root_cause"
    local body="**Incident ID:** $incident_id
**Severity:** $severity
**Root Cause:** $root_cause

## Timeline
See incident details: \`shipwright incident show $incident_id\`

## Automated Detection
This issue was automatically created by the incident commander.
"

    # shipwright label so daemon picks up; hotfix for routing
    local issue_url
    issue_url=$(gh issue create --title "$title" --body "$body" --label "hotfix,shipwright" 2>/dev/null) || issue_url=""

    if [[ -z "$issue_url" ]]; then
        warn "Failed to create GitHub issue" >&2
        return 1
    fi

    success "Created hotfix issue: $issue_url" >&2

    # The issue number is the digit run after /issues/ in the URL gh printed.
    if [[ "$issue_url" =~ /issues/([0-9]+) ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
    fi
    return 0
}
|
|
270
|
+
|
|
271
|
+
# Trigger pipeline for P0/P1 hotfix issue (auto-remediation)
|
|
272
|
+
# Start the hotfix pipeline in the background for a freshly created issue.
# Arguments: $1 - GitHub issue number, $2 - incident id
# Returns 0 without doing anything when the issue number is not a plain
# integer or sw-pipeline.sh is not an executable next to this script.
trigger_pipeline_for_incident() {
    local issue_num="$1"
    local incident_id="$2"

    # An empty value fails the digit match as well, so one guard covers both.
    [[ "$issue_num" =~ ^[0-9]+$ ]] || return 0
    [[ -x "$SCRIPT_DIR/sw-pipeline.sh" ]] || return 0

    info "Auto-triggering pipeline for P0/P1 hotfix issue #${issue_num} (incident: $incident_id)"

    # Detached subshell: the pipeline runs from the repo root, stderr discarded.
    (
        cd "$REPO_DIR" \
            && export REPO_DIR SCRIPT_DIR \
            && bash "$SCRIPT_DIR/sw-pipeline.sh" start --issue "$issue_num" --template hotfix 2>/dev/null
    ) &

    emit_event "incident.pipeline_triggered" "incident_id=$incident_id" "issue=$issue_num"
}
|
|
285
|
+
|
|
286
|
+
# Execute rollback when auto_rollback_enabled (wire to sw-feedback / sw-github-deploy)
|
|
287
|
+
# Ask sw-feedback.sh to roll back production in response to an incident.
# Arguments: $1 - incident id, $2 - reason text (default "P0/P1 incident")
# Returns 0 without doing anything when sw-feedback.sh is not an executable
# next to this script. A failing rollback command is tolerated; the
# rollback_triggered event is emitted either way.
trigger_rollback_for_incident() {
    local incident_id="$1"
    local reason="${2:-P0/P1 incident}"
    local feedback_script="$SCRIPT_DIR/sw-feedback.sh"

    [[ -x "$feedback_script" ]] || return 0

    info "Auto-rollback triggered for incident $incident_id: $reason"

    if ! (cd "$REPO_DIR" && bash "$feedback_script" rollback production "$reason" 2>/dev/null); then
        : # best effort — a failed rollback must not abort the caller
    fi

    emit_event "incident.rollback_triggered" "incident_id=$incident_id" "reason=$reason"
}
|
|
297
|
+
|
|
298
|
+
# ─── Watch Command ─────────────────────────────────────────────────────────
|
|
299
|
+
|
|
300
|
+
# Start the background incident monitor.
# Arguments: $1 - polling interval in whole seconds (default 60)
# Returns:   1 when the interval is invalid or a monitor is already running
# Every interval the monitor looks at failures from the last interval, records
# an incident, and for P0/P1 incidents optionally rolls back and opens a
# hotfix issue according to $INCIDENT_CONFIG.
cmd_watch() {
    local interval="${1:-60}"

    # The interval feeds both `sleep` and shell arithmetic, and 0 would
    # busy-loop, so insist on a positive integer before forking anything.
    if [[ ! "$interval" =~ ^[1-9][0-9]*$ ]]; then
        error "Invalid interval '$interval' (expected a positive integer number of seconds)"
        return 1
    fi

    if [[ -f "$MONITOR_PID_FILE" ]]; then
        local old_pid
        old_pid=$(cat "$MONITOR_PID_FILE" 2>/dev/null || echo "")
        if [[ "$old_pid" =~ ^[0-9]+$ ]] && kill -0 "$old_pid" 2>/dev/null; then
            warn "Monitor already running with PID $old_pid"
            return 1
        fi
    fi

    info "Starting incident monitoring (interval: ${interval}s)"

    # Background process
    (
        # Path is expanded now, so the trap does not depend on later state.
        trap 'rm -f "'"$MONITOR_PID_FILE"'"' EXIT

        local failures_json failure_count incident_id severity root_cause
        local auto_rollback hotfix_key auto_hotfix hotfix_output issue_num

        while true; do
            sleep "$interval"

            # Check for recent failures; a failed query counts as "none" so a
            # transient problem does not terminate the monitor.
            failures_json=$(get_recent_failures "$interval") || failures_json="[]"
            failure_count=$(jq 'length' <<< "$failures_json" 2>/dev/null) || failure_count=0
            [[ "$failure_count" =~ ^[0-9]+$ ]] || failure_count=0
            if [[ "$failure_count" -eq 0 ]]; then
                continue
            fi

            info "Detected $failure_count failure(s)"

            # Generate incident from the first failure in the window.
            incident_id="inc-$(date +%s)"
            severity=$(classify_severity "$(jq -r '.[0].type' <<< "$failures_json")" "$failure_count")
            root_cause=$(analyze_root_cause "$(jq -r '.[0] | tostring' <<< "$failures_json")" "$INCIDENT_CONFIG")

            if ! create_incident_record "$incident_id" "$severity" "$root_cause" "$failures_json"; then
                warn "Could not record incident $incident_id"
                continue
            fi

            info "Incident $incident_id created (severity: $severity)"
            emit_event "incident.detected" "incident_id=$incident_id" "severity=$severity"

            # Auto-response for P0/P1: optional rollback, hotfix issue, pipeline
            if [[ "$severity" != "P0" && "$severity" != "P1" ]]; then
                continue
            fi

            auto_rollback=$(jq -r '.auto_rollback_enabled // false' "$INCIDENT_CONFIG" 2>/dev/null || echo "false")
            if [[ "$auto_rollback" == "true" ]]; then
                trigger_rollback_for_incident "$incident_id" "P0/P1 incident: $root_cause"
            fi

            # Each severity has its own switch. jq's `//` cannot be used to
            # combine them: it treats an explicit false as "missing".
            case "$severity" in
                P0) hotfix_key="p0_auto_hotfix" ;;
                *) hotfix_key="p1_auto_hotfix" ;;
            esac
            auto_hotfix=$(jq -r --arg key "$hotfix_key" '.[$key] == true' "$INCIDENT_CONFIG" 2>/dev/null || echo "false")

            if [[ "$auto_hotfix" == "true" ]]; then
                # A failed issue creation must not kill the monitor, and only a
                # purely numeric output line is accepted as the issue number.
                hotfix_output=$(create_hotfix_issue "$incident_id" "$severity" "$root_cause") || hotfix_output=""
                issue_num=$(printf '%s\n' "$hotfix_output" | grep -E '^[0-9]+$' | tail -n 1 || true)
                if [[ -n "$issue_num" ]]; then
                    trigger_pipeline_for_incident "$issue_num" "$incident_id"
                fi
            fi
        done
    ) &

    # $! is the PID of the monitor subshell. ($$ inside the subshell would
    # still be this launcher's PID, which is gone as soon as the command
    # returns.)
    local monitor_pid=$!
    echo "$monitor_pid" > "$MONITOR_PID_FILE"

    success "Monitor started in background (PID: $monitor_pid)"
}
|
|
369
|
+
|
|
370
|
+
# ─── List Command ──────────────────────────────────────────────────────────
|
|
371
|
+
|
|
372
|
+
# List recorded incidents.
# Arguments: $1 - output format: "json", or anything else for a table
#                 (default table)
# Outputs:   json  - a JSON array made of the raw incident files
#                    ("[]" when there are none)
#            table - one line per incident: id, severity, status, root cause
#                    truncated to 50 characters
# Files are listed in sorted path order. The response configuration
# (config.json) lives in the same directory and is not an incident, so it is
# excluded.
cmd_list() {
    local format="${1:-table}"

    local incident_files
    incident_files=$(find "$INCIDENTS_DIR" -name '*.json' \
        -not -name '*postmortem*' -not -name 'config.json' \
        -type f 2>/dev/null | sort || true)

    if [[ -z "$incident_files" ]]; then
        if [[ "$format" == "json" ]]; then
            # Keep machine-readable output parseable even when empty.
            echo "[]"
        else
            info "No incidents recorded"
        fi
        return 0
    fi

    local incident_file
    case "$format" in
        json)
            echo "["
            local first=true
            while IFS= read -r incident_file; do
                [[ -z "$incident_file" ]] && continue
                if [[ "$first" == true ]]; then
                    first=false
                else
                    echo ","
                fi
                cat "$incident_file"
            done <<< "$incident_files"
            echo "]"
            ;;
        *)
            echo -e "${BOLD}Recent Incidents${RESET}"
            echo -e "${DIM}────────────────────────────────────────────────────────────────${RESET}"

            local id severity status cause
            while IFS= read -r incident_file; do
                [[ -z "$incident_file" ]] && continue

                id=$(jq -r '.id // "unknown"' "$incident_file" 2>/dev/null || echo "unknown")
                severity=$(jq -r '.severity // "P3"' "$incident_file" 2>/dev/null || echo "P3")
                status=$(jq -r '.status // "open"' "$incident_file" 2>/dev/null || echo "open")
                cause=$(jq -r '.root_cause // "unknown"' "$incident_file" 2>/dev/null || echo "unknown")
                cause="${cause:0:50}"

                case "$severity" in
                    P0) severity="${RED}${BOLD}$severity${RESET}" ;;
                    P1) severity="${YELLOW}${BOLD}$severity${RESET}" ;;
                    P2) severity="${BLUE}$severity${RESET}" ;;
                    *) severity="${DIM}$severity${RESET}" ;;
                esac

                # %b for the severity column: the colour variables hold
                # literal "\033[…" text that must be expanded when printed.
                printf "%-20s %b %-8s %s\n" "$id" "$severity" "$status" "$cause"
            done <<< "$incident_files"
            ;;
    esac
}
|
|
424
|
+
|
|
425
|
+
# ─── Show Command ──────────────────────────────────────────────────────────
|
|
426
|
+
|
|
427
|
+
# Pretty-print one incident record.
# Arguments: $1 - incident id
# Outputs:   a heading followed by the record as formatted JSON, each line
#            prefixed with one space and jq's own indentation preserved
# Returns:   1 (with a message on stderr) when the id is missing or unknown
cmd_show() {
    # ${1:-} keeps `set -u` from aborting before the usage message is shown.
    local incident_id="${1:-}"
    if [[ -z "$incident_id" ]]; then
        error "Usage: shipwright incident show <incident_id>"
        return 1
    fi

    local incident_file="${INCIDENTS_DIR}/${incident_id}.json"
    if [[ ! -f "$incident_file" ]]; then
        error "Incident not found: $incident_id"
        return 1
    fi

    info "Incident: $incident_id"
    echo ""

    # IFS= stops `read` from trimming the leading whitespace of each line.
    local line
    jq . "$incident_file" | while IFS= read -r line; do
        printf ' %s\n' "$line"
    done
}
|
|
441
|
+
|
|
442
|
+
# ─── Report Command ────────────────────────────────────────────────────────
|
|
443
|
+
|
|
444
|
+
# Generate a Markdown post-mortem for one incident.
# Arguments: $1 - incident id
# Outputs:   a success message, then the path of the written report, on
#            stdout; the report goes to $INCIDENTS_DIR/<id>-postmortem.md
# Returns:   1 (with a message on stderr) when the id is missing or unknown
cmd_report() {
    # ${1:-} keeps `set -u` from aborting before the usage message is shown.
    local incident_id="${1:-}"
    if [[ -z "$incident_id" ]]; then
        error "Usage: shipwright incident report <incident_id>"
        return 1
    fi

    local incident_file="${INCIDENTS_DIR}/${incident_id}.json"
    if [[ ! -f "$incident_file" ]]; then
        error "Incident not found: $incident_id"
        return 1
    fi

    local incident
    incident=$(jq . "$incident_file")

    local report_file="${INCIDENTS_DIR}/${incident_id}-postmortem.md"

    {
        cat << EOF
# Post-Incident Report
**Incident ID:** $incident_id
**Generated:** $(now_iso)

## Summary
$(jq -r '.root_cause' <<< "$incident")

## Timeline
EOF

        # `[]?` and `objects` tolerate a missing/null event list and entries
        # that are not objects, instead of aborting the whole report.
        jq -r '.failure_events[]? | objects | "- \(.ts): \(.type)"' <<< "$incident"

        cat << EOF

## Impact
- Severity: $(jq -r '.severity' <<< "$incident")
- Status: $(jq -r '.status' <<< "$incident")

## Resolution
$(jq -r '.remediation // "Pending"' <<< "$incident")

## Prevention
1. Monitor for similar patterns
2. Add alerting thresholds
3. Improve automated detection
EOF
    } > "$report_file"

    success "Report generated: $report_file"
    echo "$report_file"
}
|
|
487
|
+
|
|
488
|
+
# ─── Stats Command ──────────────────────────────────────────────────────────
|
|
489
|
+
|
|
490
|
+
# Summarise recorded incidents: totals per severity, resolved count and the
# mean time to resolve (integer average of mttr_seconds over resolved
# incidents).
# Arguments: $1 - output format: "json", or anything else for a table
#                 (default table)
# The response configuration (config.json) shares the incidents directory but
# is not an incident, so it is excluded from every count.
cmd_stats() {
    local format="${1:-table}"

    local incident_files
    incident_files=$(find "$INCIDENTS_DIR" -name '*.json' \
        -not -name '*postmortem*' -not -name 'config.json' \
        -type f 2>/dev/null || true)

    if [[ -z "$incident_files" ]]; then
        info "No incident data available"
        return 0
    fi

    local total_incidents=0
    local p0_count=0 p1_count=0 p2_count=0 p3_count=0
    local resolved_count=0 mttr_sum=0 mttr_avg=0
    local incident_file sev status mttr

    while IFS= read -r incident_file; do
        [[ -z "$incident_file" ]] && continue

        sev=$(jq -r '.severity // "P3"' "$incident_file" 2>/dev/null || echo "P3")
        status=$(jq -r '.status // "open"' "$incident_file" 2>/dev/null || echo "open")
        mttr=$(jq -r '.mttr_seconds // 0' "$incident_file" 2>/dev/null || echo "0")

        # x=$((x + 1)) rather than ((x++)): the latter returns status 1 when
        # the old value is 0, which aborts the script under `set -e`.
        total_incidents=$((total_incidents + 1))
        case "$sev" in
            P0) p0_count=$((p0_count + 1)) ;;
            P1) p1_count=$((p1_count + 1)) ;;
            P2) p2_count=$((p2_count + 1)) ;;
            *) p3_count=$((p3_count + 1)) ;;
        esac

        if [[ "$status" == "resolved" ]]; then
            resolved_count=$((resolved_count + 1))
            # Shell arithmetic is integer-only; ignore anything else.
            [[ "$mttr" =~ ^[0-9]+$ ]] || mttr=0
            mttr_sum=$((mttr_sum + mttr))
        fi
    done <<< "$incident_files"

    if [[ "$resolved_count" -gt 0 ]]; then
        mttr_avg=$((mttr_sum / resolved_count))
    fi

    case "$format" in
        json)
            jq -n \
                --argjson total "$total_incidents" \
                --argjson p0 "$p0_count" \
                --argjson p1 "$p1_count" \
                --argjson p2 "$p2_count" \
                --argjson p3 "$p3_count" \
                --argjson resolved "$resolved_count" \
                --argjson mttr "$mttr_avg" \
                '{
                    total: $total,
                    by_severity: {p0: $p0, p1: $p1, p2: $p2, p3: $p3},
                    resolved: $resolved,
                    mttr_seconds: $mttr
                }'
            ;;
        *)
            echo -e "${BOLD}Incident Statistics${RESET}"
            echo -e "${DIM}────────────────────────────────────────────────────────────────${RESET}"
            echo "Total Incidents: $total_incidents"
            echo " P0 (Critical): $p0_count"
            echo " P1 (High): $p1_count"
            echo " P2 (Medium): $p2_count"
            echo " P3 (Low): $p3_count"
            echo ""
            echo "Resolved: $resolved_count"
            echo "MTTR (avg): $(format_duration "$mttr_avg")"
            ;;
    esac
}
|
|
567
|
+
|
|
568
|
+
# ─── Stop Command ──────────────────────────────────────────────────────────
|
|
569
|
+
|
|
570
|
+
# Stop the background incident monitor recorded in $MONITOR_PID_FILE.
# Sends the default signal (TERM) to a live monitor and removes the PID file.
# A PID file that names no live process is stale and is removed as well, so
# it cannot linger and confuse later runs.
cmd_stop() {
    if [[ ! -f "$MONITOR_PID_FILE" ]]; then
        warn "Monitor not running"
        return 0
    fi

    local pid
    pid=$(cat "$MONITOR_PID_FILE" 2>/dev/null || echo "")

    # Accept plain positive integers only: values such as "-1" or "0" have a
    # special meaning for kill and would signal unrelated processes.
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
        kill "$pid"
        rm -f "$MONITOR_PID_FILE"
        success "Monitor stopped (PID: $pid)"
    else
        rm -f "$MONITOR_PID_FILE"
        warn "Monitor not running"
    fi
}
|
|
585
|
+
|
|
586
|
+
# ─── Help Command ──────────────────────────────────────────────────────────
|
|
587
|
+
|
|
588
|
+
# Print the usage summary for `shipwright incident`: synopsis, the available
# sub-commands and a few example invocations. Every line is passed through
# `echo -e` so the colour variables render.
show_help() {
    local -a help_lines=(
        "${CYAN}${BOLD}shipwright incident${RESET} — Autonomous incident detection & response"
        ""
        "${BOLD}USAGE${RESET}"
        " ${CYAN}shipwright incident${RESET} <command> [options]"
        ""
        "${BOLD}COMMANDS${RESET}"
        " ${CYAN}watch${RESET} [interval] Start monitoring for incidents (default: 60s)"
        " ${CYAN}stop${RESET} Stop incident monitoring"
        " ${CYAN}list${RESET} [format] List recent incidents (table|json)"
        " ${CYAN}show${RESET} <incident-id> Show details for an incident"
        " ${CYAN}report${RESET} <incident-id> Generate post-mortem report"
        " ${CYAN}stats${RESET} [format] Show incident statistics (table|json)"
        " ${CYAN}config${RESET} <cmd> Configure incident response (show|set)"
        " ${CYAN}help${RESET} Show this help"
        ""
        "${BOLD}EXAMPLES${RESET}"
        " ${DIM}shipwright incident watch # Start monitoring${RESET}"
        " ${DIM}shipwright incident list # Show all incidents${RESET}"
        " ${DIM}shipwright incident show inc-1702 # Show incident details${RESET}"
        " ${DIM}shipwright incident report inc-1702 # Generate post-mortem${RESET}"
        " ${DIM}shipwright incident stats # Show MTTR and frequency${RESET}"
    )

    local help_line
    for help_line in "${help_lines[@]}"; do
        echo -e "$help_line"
    done
}
|
|
611
|
+
|
|
612
|
+
# ─── Main Router ───────────────────────────────────────────────────────────
|
|
613
|
+
|
|
614
|
+
# Entry point: prepare the incident state directory, then dispatch the first
# argument as a sub-command, passing the remaining arguments through.
# No argument (or an empty one) shows the help. `config` is recognised but
# not implemented and returns 1; an unknown command prints the help and
# exits with status 1.
main() {
    ensure_incident_dir

    local cmd="${1:-help}"
    if [[ $# -gt 0 ]]; then
        shift
    fi

    case "$cmd" in
        watch | stop | list | show | report | stats)
            # Each of these maps 1:1 onto a cmd_<name> function.
            "cmd_${cmd}" "$@"
            ;;
        config)
            error "config command not yet implemented"
            return 1
            ;;
        help | --help | -h)
            show_help
            ;;
        *)
            error "Unknown command: $cmd"
            show_help
            exit 1
            ;;
    esac
}
|
|
653
|
+
|
|
654
|
+
# Run main only when this file is executed directly; when it is sourced
# (BASH_SOURCE[0] differs from $0) just define the functions.
[[ "${BASH_SOURCE[0]}" != "$0" ]] || main "$@"
|