shipwright-cli 1.7.1 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/code-reviewer.md +90 -0
- package/.claude/agents/devops-engineer.md +142 -0
- package/.claude/agents/pipeline-agent.md +80 -0
- package/.claude/agents/shell-script-specialist.md +150 -0
- package/.claude/agents/test-specialist.md +196 -0
- package/.claude/hooks/post-tool-use.sh +38 -0
- package/.claude/hooks/pre-tool-use.sh +25 -0
- package/.claude/hooks/session-started.sh +37 -0
- package/README.md +212 -814
- package/claude-code/CLAUDE.md.shipwright +54 -0
- package/claude-code/hooks/notify-idle.sh +2 -2
- package/claude-code/hooks/session-start.sh +24 -0
- package/claude-code/hooks/task-completed.sh +6 -2
- package/claude-code/settings.json.template +12 -0
- package/dashboard/public/app.js +4422 -0
- package/dashboard/public/index.html +816 -0
- package/dashboard/public/styles.css +4755 -0
- package/dashboard/server.ts +4315 -0
- package/docs/KNOWN-ISSUES.md +18 -10
- package/docs/TIPS.md +38 -26
- package/docs/patterns/README.md +33 -23
- package/package.json +9 -5
- package/scripts/adapters/iterm2-adapter.sh +1 -1
- package/scripts/adapters/tmux-adapter.sh +52 -23
- package/scripts/adapters/wezterm-adapter.sh +26 -14
- package/scripts/lib/compat.sh +200 -0
- package/scripts/lib/helpers.sh +72 -0
- package/scripts/postinstall.mjs +72 -13
- package/scripts/{cct → sw} +109 -21
- package/scripts/sw-adversarial.sh +274 -0
- package/scripts/sw-architecture-enforcer.sh +330 -0
- package/scripts/sw-checkpoint.sh +390 -0
- package/scripts/{cct-cleanup.sh → sw-cleanup.sh} +3 -1
- package/scripts/sw-connect.sh +619 -0
- package/scripts/{cct-cost.sh → sw-cost.sh} +368 -34
- package/scripts/{cct-daemon.sh → sw-daemon.sh} +2217 -204
- package/scripts/sw-dashboard.sh +477 -0
- package/scripts/sw-developer-simulation.sh +252 -0
- package/scripts/sw-docs.sh +635 -0
- package/scripts/sw-doctor.sh +907 -0
- package/scripts/{cct-fix.sh → sw-fix.sh} +10 -6
- package/scripts/{cct-fleet.sh → sw-fleet.sh} +498 -22
- package/scripts/sw-github-checks.sh +521 -0
- package/scripts/sw-github-deploy.sh +533 -0
- package/scripts/sw-github-graphql.sh +972 -0
- package/scripts/sw-heartbeat.sh +293 -0
- package/scripts/{cct-init.sh → sw-init.sh} +144 -11
- package/scripts/sw-intelligence.sh +1196 -0
- package/scripts/sw-jira.sh +643 -0
- package/scripts/sw-launchd.sh +364 -0
- package/scripts/sw-linear.sh +648 -0
- package/scripts/{cct-logs.sh → sw-logs.sh} +72 -2
- package/scripts/{cct-loop.sh → sw-loop.sh} +534 -44
- package/scripts/{cct-memory.sh → sw-memory.sh} +321 -38
- package/scripts/sw-patrol-meta.sh +417 -0
- package/scripts/sw-pipeline-composer.sh +455 -0
- package/scripts/{cct-pipeline.sh → sw-pipeline.sh} +2319 -178
- package/scripts/sw-predictive.sh +820 -0
- package/scripts/{cct-prep.sh → sw-prep.sh} +339 -49
- package/scripts/{cct-ps.sh → sw-ps.sh} +6 -4
- package/scripts/{cct-reaper.sh → sw-reaper.sh} +6 -4
- package/scripts/sw-remote.sh +687 -0
- package/scripts/sw-self-optimize.sh +947 -0
- package/scripts/sw-session.sh +519 -0
- package/scripts/sw-setup.sh +234 -0
- package/scripts/sw-status.sh +605 -0
- package/scripts/{cct-templates.sh → sw-templates.sh} +9 -4
- package/scripts/sw-tmux.sh +591 -0
- package/scripts/sw-tracker-jira.sh +277 -0
- package/scripts/sw-tracker-linear.sh +292 -0
- package/scripts/sw-tracker.sh +409 -0
- package/scripts/{cct-upgrade.sh → sw-upgrade.sh} +103 -46
- package/scripts/{cct-worktree.sh → sw-worktree.sh} +3 -0
- package/templates/pipelines/autonomous.json +27 -5
- package/templates/pipelines/full.json +12 -0
- package/templates/pipelines/standard.json +12 -0
- package/tmux/{claude-teams-overlay.conf → shipwright-overlay.conf} +27 -9
- package/tmux/templates/accessibility.json +34 -0
- package/tmux/templates/api-design.json +35 -0
- package/tmux/templates/architecture.json +1 -0
- package/tmux/templates/bug-fix.json +9 -0
- package/tmux/templates/code-review.json +1 -0
- package/tmux/templates/compliance.json +36 -0
- package/tmux/templates/data-pipeline.json +36 -0
- package/tmux/templates/debt-paydown.json +34 -0
- package/tmux/templates/devops.json +1 -0
- package/tmux/templates/documentation.json +1 -0
- package/tmux/templates/exploration.json +1 -0
- package/tmux/templates/feature-dev.json +1 -0
- package/tmux/templates/full-stack.json +8 -0
- package/tmux/templates/i18n.json +34 -0
- package/tmux/templates/incident-response.json +36 -0
- package/tmux/templates/migration.json +1 -0
- package/tmux/templates/observability.json +35 -0
- package/tmux/templates/onboarding.json +33 -0
- package/tmux/templates/performance.json +35 -0
- package/tmux/templates/refactor.json +1 -0
- package/tmux/templates/release.json +35 -0
- package/tmux/templates/security-audit.json +8 -0
- package/tmux/templates/spike.json +34 -0
- package/tmux/templates/testing.json +1 -0
- package/tmux/tmux.conf +98 -9
- package/scripts/cct-doctor.sh +0 -414
- package/scripts/cct-session.sh +0 -284
- package/scripts/cct-status.sh +0 -169
|
@@ -4,8 +4,9 @@
|
|
|
4
4
|
# ║ Polls for labeled issues · Spawns pipelines · Manages worktrees ║
|
|
5
5
|
# ╚═══════════════════════════════════════════════════════════════════════════╝
|
|
6
6
|
set -euo pipefail
|
|
7
|
+
trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
|
|
7
8
|
|
|
8
|
-
VERSION="1.
|
|
9
|
+
VERSION="1.9.0"
|
|
9
10
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
10
11
|
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
11
12
|
|
|
@@ -20,6 +21,28 @@ DIM='\033[2m'
|
|
|
20
21
|
BOLD='\033[1m'
|
|
21
22
|
RESET='\033[0m'
|
|
22
23
|
|
|
24
|
+
# ─── Cross-platform compatibility ──────────────────────────────────────────
|
|
25
|
+
# shellcheck source=lib/compat.sh
|
|
26
|
+
[[ -f "$SCRIPT_DIR/lib/compat.sh" ]] && source "$SCRIPT_DIR/lib/compat.sh"
|
|
27
|
+
|
|
28
|
+
# ─── Intelligence Engine (optional) ──────────────────────────────────────────
|
|
29
|
+
# shellcheck source=sw-intelligence.sh
|
|
30
|
+
[[ -f "$SCRIPT_DIR/sw-intelligence.sh" ]] && source "$SCRIPT_DIR/sw-intelligence.sh"
|
|
31
|
+
# shellcheck source=sw-pipeline-composer.sh
|
|
32
|
+
[[ -f "$SCRIPT_DIR/sw-pipeline-composer.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-composer.sh"
|
|
33
|
+
# shellcheck source=sw-self-optimize.sh
|
|
34
|
+
[[ -f "$SCRIPT_DIR/sw-self-optimize.sh" ]] && source "$SCRIPT_DIR/sw-self-optimize.sh"
|
|
35
|
+
# shellcheck source=sw-predictive.sh
|
|
36
|
+
[[ -f "$SCRIPT_DIR/sw-predictive.sh" ]] && source "$SCRIPT_DIR/sw-predictive.sh"
|
|
37
|
+
|
|
38
|
+
# ─── GitHub API Modules (optional) ────────────────────────────────────────
|
|
39
|
+
# shellcheck source=sw-github-graphql.sh
|
|
40
|
+
[[ -f "$SCRIPT_DIR/sw-github-graphql.sh" ]] && source "$SCRIPT_DIR/sw-github-graphql.sh"
|
|
41
|
+
# shellcheck source=sw-github-checks.sh
|
|
42
|
+
[[ -f "$SCRIPT_DIR/sw-github-checks.sh" ]] && source "$SCRIPT_DIR/sw-github-checks.sh"
|
|
43
|
+
# shellcheck source=sw-github-deploy.sh
|
|
44
|
+
[[ -f "$SCRIPT_DIR/sw-github-deploy.sh" ]] && source "$SCRIPT_DIR/sw-github-deploy.sh"
|
|
45
|
+
|
|
23
46
|
# ─── Output Helpers ─────────────────────────────────────────────────────────
|
|
24
47
|
info() { echo -e "${CYAN}${BOLD}▸${RESET} $*"; }
|
|
25
48
|
success() { echo -e "${GREEN}${BOLD}✓${RESET} $*"; }
|
|
@@ -49,7 +72,7 @@ format_duration() {
|
|
|
49
72
|
}
|
|
50
73
|
|
|
51
74
|
# ─── Structured Event Log ──────────────────────────────────────────────────
|
|
52
|
-
EVENTS_FILE="${HOME}/.
|
|
75
|
+
EVENTS_FILE="${HOME}/.shipwright/events.jsonl"
|
|
53
76
|
|
|
54
77
|
emit_event() {
|
|
55
78
|
local event_type="$1"
|
|
@@ -61,14 +84,68 @@ emit_event() {
|
|
|
61
84
|
if [[ "$val" =~ ^-?[0-9]+\.?[0-9]*$ ]]; then
|
|
62
85
|
json_fields="${json_fields},\"${key}\":${val}"
|
|
63
86
|
else
|
|
64
|
-
|
|
65
|
-
|
|
87
|
+
local escaped_val
|
|
88
|
+
escaped_val=$(printf '%s' "$val" | jq -Rs '.' 2>/dev/null || printf '"%s"' "${val//\"/\\\"}")
|
|
89
|
+
json_fields="${json_fields},\"${key}\":${escaped_val}"
|
|
66
90
|
fi
|
|
67
91
|
done
|
|
68
|
-
mkdir -p "${HOME}/.
|
|
92
|
+
mkdir -p "${HOME}/.shipwright"
|
|
69
93
|
echo "{\"ts\":\"$(now_iso)\",\"ts_epoch\":$(now_epoch),\"type\":\"${event_type}\"${json_fields}}" >> "$EVENTS_FILE"
|
|
70
94
|
}
|
|
71
95
|
|
|
96
|
+
# ─── Event Log Rotation ─────────────────────────────────────────────────────
|
|
97
|
+
rotate_event_log() {
|
|
98
|
+
local max_size=$((50 * 1024 * 1024)) # 50MB
|
|
99
|
+
local max_rotations=3
|
|
100
|
+
|
|
101
|
+
# Rotate events.jsonl if too large
|
|
102
|
+
if [[ -f "$EVENTS_FILE" ]]; then
|
|
103
|
+
local size
|
|
104
|
+
size=$(wc -c < "$EVENTS_FILE" 2>/dev/null || echo 0)
|
|
105
|
+
if [[ "$size" -gt "$max_size" ]]; then
|
|
106
|
+
# Shift rotations: .3 → delete, .2 → .3, .1 → .2, current → .1
|
|
107
|
+
local i=$max_rotations
|
|
108
|
+
while [[ $i -gt 1 ]]; do
|
|
109
|
+
local prev=$((i - 1))
|
|
110
|
+
[[ -f "${EVENTS_FILE}.${prev}" ]] && mv "${EVENTS_FILE}.${prev}" "${EVENTS_FILE}.${i}"
|
|
111
|
+
i=$((i - 1))
|
|
112
|
+
done
|
|
113
|
+
mv "$EVENTS_FILE" "${EVENTS_FILE}.1"
|
|
114
|
+
touch "$EVENTS_FILE"
|
|
115
|
+
emit_event "daemon.log_rotated" "previous_size=$size"
|
|
116
|
+
info "Rotated events.jsonl (was $(( size / 1048576 ))MB)"
|
|
117
|
+
fi
|
|
118
|
+
fi
|
|
119
|
+
|
|
120
|
+
# Clean old heartbeat files (> 24h)
|
|
121
|
+
local heartbeat_dir="$HOME/.shipwright/heartbeats"
|
|
122
|
+
if [[ -d "$heartbeat_dir" ]]; then
|
|
123
|
+
find "$heartbeat_dir" -name "*.json" -mmin +1440 -delete 2>/dev/null || true
|
|
124
|
+
fi
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
# ─── GitHub Context (loaded once at startup) ──────────────────────────────
|
|
128
|
+
DAEMON_GITHUB_CONTEXT=""
|
|
129
|
+
|
|
130
|
+
daemon_github_context() {
|
|
131
|
+
# Skip if no GitHub
|
|
132
|
+
[[ "${NO_GITHUB:-false}" == "true" ]] && return 0
|
|
133
|
+
type gh_repo_context &>/dev/null 2>&1 || return 0
|
|
134
|
+
type _gh_detect_repo &>/dev/null 2>&1 || return 0
|
|
135
|
+
|
|
136
|
+
_gh_detect_repo 2>/dev/null || return 0
|
|
137
|
+
local owner="${GH_OWNER:-}" repo="${GH_REPO:-}"
|
|
138
|
+
[[ -z "$owner" || -z "$repo" ]] && return 0
|
|
139
|
+
|
|
140
|
+
local context
|
|
141
|
+
context=$(gh_repo_context "$owner" "$repo" 2>/dev/null || echo "{}")
|
|
142
|
+
if [[ -n "$context" && "$context" != "{}" ]]; then
|
|
143
|
+
daemon_log INFO "GitHub context loaded: $(echo "$context" | jq -r '.contributor_count // 0') contributors, $(echo "$context" | jq -r '.security_alert_count // 0') security alerts"
|
|
144
|
+
DAEMON_GITHUB_CONTEXT="$context"
|
|
145
|
+
export DAEMON_GITHUB_CONTEXT
|
|
146
|
+
fi
|
|
147
|
+
}
|
|
148
|
+
|
|
72
149
|
# ─── GitHub API Retry with Backoff ────────────────────────────────────────
|
|
73
150
|
# Retries gh commands up to 3 times with exponential backoff (1s, 3s, 9s).
|
|
74
151
|
# Detects rate-limit (403/429) and transient errors. Returns the gh exit code.
|
|
@@ -106,7 +183,7 @@ gh_retry() {
|
|
|
106
183
|
}
|
|
107
184
|
|
|
108
185
|
# ─── Defaults ───────────────────────────────────────────────────────────────
|
|
109
|
-
DAEMON_DIR="$HOME/.
|
|
186
|
+
DAEMON_DIR="$HOME/.shipwright"
|
|
110
187
|
PID_FILE="$DAEMON_DIR/daemon.pid"
|
|
111
188
|
SHUTDOWN_FLAG="$DAEMON_DIR/daemon.shutdown"
|
|
112
189
|
STATE_FILE=""
|
|
@@ -153,8 +230,17 @@ PATROL_INTERVAL="${PATROL_INTERVAL:-3600}"
|
|
|
153
230
|
PATROL_MAX_ISSUES="${PATROL_MAX_ISSUES:-5}"
|
|
154
231
|
PATROL_LABEL="${PATROL_LABEL:-auto-patrol}"
|
|
155
232
|
PATROL_DRY_RUN=false
|
|
233
|
+
PATROL_AUTO_WATCH=false
|
|
234
|
+
PATROL_FAILURES_THRESHOLD=3
|
|
235
|
+
PATROL_DORA_ENABLED=true
|
|
236
|
+
PATROL_UNTESTED_ENABLED=true
|
|
237
|
+
PATROL_RETRY_ENABLED=true
|
|
238
|
+
PATROL_RETRY_THRESHOLD=2
|
|
156
239
|
LAST_PATROL_EPOCH=0
|
|
157
240
|
|
|
241
|
+
# Team dashboard coordination
|
|
242
|
+
DASHBOARD_URL="${DASHBOARD_URL:-http://localhost:8767}"
|
|
243
|
+
|
|
158
244
|
# Runtime
|
|
159
245
|
NO_GITHUB=false
|
|
160
246
|
CONFIG_PATH=""
|
|
@@ -320,6 +406,12 @@ load_config() {
|
|
|
320
406
|
PATROL_INTERVAL=$(jq -r '.patrol.interval // 3600' "$config_file")
|
|
321
407
|
PATROL_MAX_ISSUES=$(jq -r '.patrol.max_issues // 5' "$config_file")
|
|
322
408
|
PATROL_LABEL=$(jq -r '.patrol.label // "auto-patrol"' "$config_file")
|
|
409
|
+
PATROL_AUTO_WATCH=$(jq -r '.patrol.auto_watch // false' "$config_file")
|
|
410
|
+
PATROL_FAILURES_THRESHOLD=$(jq -r '.patrol.checks.recurring_failures.threshold // 3' "$config_file")
|
|
411
|
+
PATROL_DORA_ENABLED=$(jq -r '.patrol.checks.dora_degradation.enabled // true' "$config_file")
|
|
412
|
+
PATROL_UNTESTED_ENABLED=$(jq -r '.patrol.checks.untested_scripts.enabled // true' "$config_file")
|
|
413
|
+
PATROL_RETRY_ENABLED=$(jq -r '.patrol.checks.retry_exhaustion.enabled // true' "$config_file")
|
|
414
|
+
PATROL_RETRY_THRESHOLD=$(jq -r '.patrol.checks.retry_exhaustion.threshold // 2' "$config_file")
|
|
323
415
|
|
|
324
416
|
# adaptive template selection
|
|
325
417
|
AUTO_TEMPLATE=$(jq -r '.auto_template // false' "$config_file")
|
|
@@ -333,6 +425,18 @@ load_config() {
|
|
|
333
425
|
SELF_OPTIMIZE=$(jq -r '.self_optimize // false' "$config_file")
|
|
334
426
|
OPTIMIZE_INTERVAL=$(jq -r '.optimize_interval // 10' "$config_file")
|
|
335
427
|
|
|
428
|
+
# intelligence engine settings
|
|
429
|
+
INTELLIGENCE_ENABLED=$(jq -r '.intelligence.enabled // false' "$config_file")
|
|
430
|
+
INTELLIGENCE_CACHE_TTL=$(jq -r '.intelligence.cache_ttl_seconds // 3600' "$config_file")
|
|
431
|
+
COMPOSER_ENABLED=$(jq -r '.intelligence.composer_enabled // false' "$config_file")
|
|
432
|
+
OPTIMIZATION_ENABLED=$(jq -r '.intelligence.optimization_enabled // false' "$config_file")
|
|
433
|
+
PREDICTION_ENABLED=$(jq -r '.intelligence.prediction_enabled // false' "$config_file")
|
|
434
|
+
ANOMALY_THRESHOLD=$(jq -r '.intelligence.anomaly_threshold // 3.0' "$config_file")
|
|
435
|
+
|
|
436
|
+
# adaptive thresholds (intelligence-driven operational tuning)
|
|
437
|
+
ADAPTIVE_THRESHOLDS_ENABLED=$(jq -r '.intelligence.adaptive_enabled // false' "$config_file")
|
|
438
|
+
PRIORITY_STRATEGY=$(jq -r '.intelligence.priority_strategy // "quick-wins-first"' "$config_file")
|
|
439
|
+
|
|
336
440
|
# gh_retry: enable retry wrapper on critical GitHub API calls
|
|
337
441
|
GH_RETRY_ENABLED=$(jq -r '.gh_retry // true' "$config_file")
|
|
338
442
|
|
|
@@ -361,6 +465,23 @@ load_config() {
|
|
|
361
465
|
WORKER_MEM_GB=$(jq -r '.worker_mem_gb // 4' "$config_file")
|
|
362
466
|
EST_COST_PER_JOB=$(jq -r '.estimated_cost_per_job_usd // 5.0' "$config_file")
|
|
363
467
|
|
|
468
|
+
# heartbeat + checkpoint recovery
|
|
469
|
+
HEALTH_HEARTBEAT_TIMEOUT=$(jq -r '.health.heartbeat_timeout_s // 120' "$config_file")
|
|
470
|
+
CHECKPOINT_ENABLED=$(jq -r '.health.checkpoint_enabled // true' "$config_file")
|
|
471
|
+
|
|
472
|
+
# progress-based health monitoring (replaces static timeouts)
|
|
473
|
+
PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
|
|
474
|
+
PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 3' "$config_file")
|
|
475
|
+
PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 6' "$config_file")
|
|
476
|
+
PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 10800' "$config_file") # 3hr absolute max
|
|
477
|
+
|
|
478
|
+
# team dashboard URL (for coordinated claiming)
|
|
479
|
+
local cfg_dashboard_url
|
|
480
|
+
cfg_dashboard_url=$(jq -r '.dashboard_url // ""' "$config_file")
|
|
481
|
+
if [[ -n "$cfg_dashboard_url" && "$cfg_dashboard_url" != "null" ]]; then
|
|
482
|
+
DASHBOARD_URL="$cfg_dashboard_url"
|
|
483
|
+
fi
|
|
484
|
+
|
|
364
485
|
success "Config loaded"
|
|
365
486
|
}
|
|
366
487
|
|
|
@@ -375,6 +496,546 @@ setup_dirs() {
|
|
|
375
496
|
WORKTREE_DIR=".worktrees"
|
|
376
497
|
|
|
377
498
|
mkdir -p "$LOG_DIR"
|
|
499
|
+
mkdir -p "$HOME/.shipwright/progress"
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
# ─── Adaptive Threshold Helpers ──────────────────────────────────────────────
|
|
503
|
+
# When intelligence.adaptive_enabled=true, operational thresholds are learned
|
|
504
|
+
# from historical data instead of using fixed defaults.
|
|
505
|
+
# Every function falls back to the current hardcoded value when no data exists.
|
|
506
|
+
|
|
507
|
+
ADAPTIVE_THRESHOLDS_ENABLED="${ADAPTIVE_THRESHOLDS_ENABLED:-false}"
|
|
508
|
+
PRIORITY_STRATEGY="${PRIORITY_STRATEGY:-quick-wins-first}"
|
|
509
|
+
EMPTY_QUEUE_CYCLES=0
|
|
510
|
+
|
|
511
|
+
# Adapt poll interval based on queue state
|
|
512
|
+
# Empty queue 5+ cycles → 120s; queue has items → 30s; processing → 60s
|
|
513
|
+
get_adaptive_poll_interval() {
|
|
514
|
+
local queue_depth="$1"
|
|
515
|
+
local active_count="$2"
|
|
516
|
+
|
|
517
|
+
if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
|
|
518
|
+
echo "$POLL_INTERVAL"
|
|
519
|
+
return
|
|
520
|
+
fi
|
|
521
|
+
|
|
522
|
+
if [[ "$queue_depth" -eq 0 && "$active_count" -eq 0 ]]; then
|
|
523
|
+
EMPTY_QUEUE_CYCLES=$((EMPTY_QUEUE_CYCLES + 1))
|
|
524
|
+
else
|
|
525
|
+
EMPTY_QUEUE_CYCLES=0
|
|
526
|
+
fi
|
|
527
|
+
|
|
528
|
+
local interval="$POLL_INTERVAL"
|
|
529
|
+
if [[ "$EMPTY_QUEUE_CYCLES" -ge 5 ]]; then
|
|
530
|
+
interval=120
|
|
531
|
+
elif [[ "$queue_depth" -gt 0 ]]; then
|
|
532
|
+
interval=30
|
|
533
|
+
else
|
|
534
|
+
interval=60
|
|
535
|
+
fi
|
|
536
|
+
|
|
537
|
+
# Persist current setting for dashboard visibility
|
|
538
|
+
local tuning_file="$HOME/.shipwright/optimization/daemon-tuning.json"
|
|
539
|
+
mkdir -p "$HOME/.shipwright/optimization"
|
|
540
|
+
local tmp_tuning="${tuning_file}.tmp.$$"
|
|
541
|
+
if [[ -f "$tuning_file" ]]; then
|
|
542
|
+
jq --argjson pi "$interval" --argjson eqc "$EMPTY_QUEUE_CYCLES" \
|
|
543
|
+
'.poll_interval = $pi | .empty_queue_cycles = $eqc' \
|
|
544
|
+
"$tuning_file" > "$tmp_tuning" 2>/dev/null && mv "$tmp_tuning" "$tuning_file"
|
|
545
|
+
else
|
|
546
|
+
jq -n --argjson pi "$interval" --argjson eqc "$EMPTY_QUEUE_CYCLES" \
|
|
547
|
+
'{poll_interval: $pi, empty_queue_cycles: $eqc}' > "$tmp_tuning" \
|
|
548
|
+
&& mv "$tmp_tuning" "$tuning_file"
|
|
549
|
+
fi
|
|
550
|
+
|
|
551
|
+
echo "$interval"
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
# Rolling average cost per template from costs.json (last 10 runs)
|
|
555
|
+
get_adaptive_cost_estimate() {
|
|
556
|
+
local template="${1:-autonomous}"
|
|
557
|
+
|
|
558
|
+
if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
|
|
559
|
+
echo "$EST_COST_PER_JOB"
|
|
560
|
+
return
|
|
561
|
+
fi
|
|
562
|
+
|
|
563
|
+
local costs_file="$HOME/.shipwright/costs.json"
|
|
564
|
+
if [[ ! -f "$costs_file" ]]; then
|
|
565
|
+
echo "$EST_COST_PER_JOB"
|
|
566
|
+
return
|
|
567
|
+
fi
|
|
568
|
+
|
|
569
|
+
local avg_cost
|
|
570
|
+
avg_cost=$(jq -r --arg tpl "$template" '
|
|
571
|
+
[.sessions // [] | .[] | select(.template == $tpl) | .total_cost_usd // 0] |
|
|
572
|
+
.[-10:] | if length > 0 then (add / length) else null end
|
|
573
|
+
' "$costs_file" 2>/dev/null || echo "")
|
|
574
|
+
|
|
575
|
+
if [[ -n "$avg_cost" && "$avg_cost" != "null" && "$avg_cost" != "0" ]]; then
|
|
576
|
+
echo "$avg_cost"
|
|
577
|
+
else
|
|
578
|
+
echo "$EST_COST_PER_JOB"
|
|
579
|
+
fi
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
# Per-stage adaptive heartbeat timeout from learned stage durations
|
|
583
|
+
get_adaptive_heartbeat_timeout() {
|
|
584
|
+
local stage="${1:-unknown}"
|
|
585
|
+
|
|
586
|
+
if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
|
|
587
|
+
echo "${HEALTH_HEARTBEAT_TIMEOUT:-120}"
|
|
588
|
+
return
|
|
589
|
+
fi
|
|
590
|
+
|
|
591
|
+
# Stage-specific defaults (used when no learned data)
|
|
592
|
+
local default_timeout="${HEALTH_HEARTBEAT_TIMEOUT:-120}"
|
|
593
|
+
case "$stage" in
|
|
594
|
+
build) default_timeout=300 ;;
|
|
595
|
+
test) default_timeout=180 ;;
|
|
596
|
+
review|compound_quality) default_timeout=180 ;;
|
|
597
|
+
lint|format|intake|plan|design) default_timeout=60 ;;
|
|
598
|
+
esac
|
|
599
|
+
|
|
600
|
+
local durations_file="$HOME/.shipwright/optimization/stage-durations.json"
|
|
601
|
+
if [[ ! -f "$durations_file" ]]; then
|
|
602
|
+
echo "$default_timeout"
|
|
603
|
+
return
|
|
604
|
+
fi
|
|
605
|
+
|
|
606
|
+
local learned_duration
|
|
607
|
+
learned_duration=$(jq -r --arg s "$stage" \
|
|
608
|
+
'.stages[$s].p90_duration_s // 0' "$durations_file" 2>/dev/null || echo "0")
|
|
609
|
+
|
|
610
|
+
if [[ "$learned_duration" -gt 0 ]]; then
|
|
611
|
+
# 150% of p90 duration, floor of 60s
|
|
612
|
+
local adaptive_timeout=$(( (learned_duration * 3) / 2 ))
|
|
613
|
+
[[ "$adaptive_timeout" -lt 60 ]] && adaptive_timeout=60
|
|
614
|
+
echo "$adaptive_timeout"
|
|
615
|
+
else
|
|
616
|
+
echo "$default_timeout"
|
|
617
|
+
fi
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
# Adaptive stale pipeline timeout using 95th percentile of historical durations
|
|
621
|
+
get_adaptive_stale_timeout() {
|
|
622
|
+
local template="${1:-autonomous}"
|
|
623
|
+
|
|
624
|
+
if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
|
|
625
|
+
echo "${HEALTH_STALE_TIMEOUT:-1800}"
|
|
626
|
+
return
|
|
627
|
+
fi
|
|
628
|
+
|
|
629
|
+
local durations_file="$HOME/.shipwright/optimization/pipeline-durations.json"
|
|
630
|
+
if [[ ! -f "$durations_file" ]]; then
|
|
631
|
+
echo "${HEALTH_STALE_TIMEOUT:-1800}"
|
|
632
|
+
return
|
|
633
|
+
fi
|
|
634
|
+
|
|
635
|
+
local p95_duration
|
|
636
|
+
p95_duration=$(jq -r --arg tpl "$template" \
|
|
637
|
+
'.templates[$tpl].p95_duration_s // 0' "$durations_file" 2>/dev/null || echo "0")
|
|
638
|
+
|
|
639
|
+
if [[ "$p95_duration" -gt 0 ]]; then
|
|
640
|
+
# 1.5x safety margin, clamped 600s-7200s
|
|
641
|
+
local adaptive_timeout=$(( (p95_duration * 3) / 2 ))
|
|
642
|
+
[[ "$adaptive_timeout" -lt 600 ]] && adaptive_timeout=600
|
|
643
|
+
[[ "$adaptive_timeout" -gt 7200 ]] && adaptive_timeout=7200
|
|
644
|
+
echo "$adaptive_timeout"
|
|
645
|
+
else
|
|
646
|
+
echo "${HEALTH_STALE_TIMEOUT:-1800}"
|
|
647
|
+
fi
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
# Record pipeline duration for future threshold learning
|
|
651
|
+
record_pipeline_duration() {
|
|
652
|
+
local template="$1" duration_s="$2" result="$3"
|
|
653
|
+
|
|
654
|
+
if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
|
|
655
|
+
return
|
|
656
|
+
fi
|
|
657
|
+
[[ ! "$duration_s" =~ ^[0-9]+$ ]] && return
|
|
658
|
+
|
|
659
|
+
local durations_file="$HOME/.shipwright/optimization/pipeline-durations.json"
|
|
660
|
+
mkdir -p "$HOME/.shipwright/optimization"
|
|
661
|
+
|
|
662
|
+
if [[ ! -f "$durations_file" ]]; then
|
|
663
|
+
echo '{"templates":{}}' > "$durations_file"
|
|
664
|
+
fi
|
|
665
|
+
|
|
666
|
+
local tmp_dur="${durations_file}.tmp.$$"
|
|
667
|
+
jq --arg tpl "$template" --argjson dur "$duration_s" --arg res "$result" --arg ts "$(now_iso)" '
|
|
668
|
+
.templates[$tpl] = (
|
|
669
|
+
(.templates[$tpl] // {durations: [], p95_duration_s: 0}) |
|
|
670
|
+
.durations = ((.durations + [{duration_s: $dur, result: $res, ts: $ts}]) | .[-50:]) |
|
|
671
|
+
.p95_duration_s = (
|
|
672
|
+
[.durations[].duration_s] | sort |
|
|
673
|
+
if length > 0 then .[((length * 95 / 100) | floor)] else 0 end
|
|
674
|
+
)
|
|
675
|
+
)
|
|
676
|
+
' "$durations_file" > "$tmp_dur" 2>/dev/null && mv "$tmp_dur" "$durations_file"
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
# ─── Progress-Based Health Monitoring ─────────────────────────────────────────
|
|
680
|
+
# Instead of killing jobs after a static timeout, we check for forward progress.
|
|
681
|
+
# Progress signals: stage transitions, iteration advances, git diff growth, new files.
|
|
682
|
+
# Graduated response: healthy → slowing → stalled → stuck → kill.
|
|
683
|
+
|
|
684
|
+
PROGRESS_DIR="$HOME/.shipwright/progress"
|
|
685
|
+
|
|
686
|
+
# Collect a progress snapshot for an active job
|
|
687
|
+
# Returns JSON with stage, iteration, diff_lines, files_changed
|
|
688
|
+
daemon_collect_snapshot() {
|
|
689
|
+
local issue_num="$1" worktree="$2" pid="$3"
|
|
690
|
+
|
|
691
|
+
local stage="" iteration=0 diff_lines=0 files_changed=0 last_error=""
|
|
692
|
+
|
|
693
|
+
# Get stage and iteration from heartbeat (fastest source)
|
|
694
|
+
local heartbeat_dir="$HOME/.shipwright/heartbeats"
|
|
695
|
+
if [[ -d "$heartbeat_dir" ]]; then
|
|
696
|
+
local hb_file
|
|
697
|
+
for hb_file in "$heartbeat_dir"/*.json; do
|
|
698
|
+
[[ ! -f "$hb_file" ]] && continue
|
|
699
|
+
local hb_pid
|
|
700
|
+
hb_pid=$(jq -r '.pid // 0' "$hb_file" 2>/dev/null || echo 0)
|
|
701
|
+
if [[ "$hb_pid" == "$pid" ]]; then
|
|
702
|
+
stage=$(jq -r '.stage // "unknown"' "$hb_file" 2>/dev/null || echo "unknown")
|
|
703
|
+
iteration=$(jq -r '.iteration // 0' "$hb_file" 2>/dev/null || echo 0)
|
|
704
|
+
[[ "$iteration" == "null" ]] && iteration=0
|
|
705
|
+
break
|
|
706
|
+
fi
|
|
707
|
+
done
|
|
708
|
+
fi
|
|
709
|
+
|
|
710
|
+
# Fallback: read stage from pipeline-state.md in worktree
|
|
711
|
+
if [[ -z "$stage" || "$stage" == "unknown" ]] && [[ -d "$worktree" ]]; then
|
|
712
|
+
local state_file="$worktree/.claude/pipeline-state.md"
|
|
713
|
+
if [[ -f "$state_file" ]]; then
|
|
714
|
+
stage=$(grep -m1 '^current_stage:' "$state_file" 2>/dev/null | sed 's/^current_stage: *//' || echo "unknown")
|
|
715
|
+
fi
|
|
716
|
+
fi
|
|
717
|
+
|
|
718
|
+
# Get git diff stats from worktree (how much code has been written)
|
|
719
|
+
if [[ -d "$worktree/.git" ]] || [[ -f "$worktree/.git" ]]; then
|
|
720
|
+
diff_lines=$(cd "$worktree" && git diff --stat 2>/dev/null | tail -1 | grep -o '[0-9]* insertion' | grep -o '[0-9]*' || echo "0")
|
|
721
|
+
[[ -z "$diff_lines" ]] && diff_lines=0
|
|
722
|
+
files_changed=$(cd "$worktree" && git diff --name-only 2>/dev/null | wc -l | tr -d ' ' || echo "0")
|
|
723
|
+
# Also count untracked files the agent has created
|
|
724
|
+
local untracked
|
|
725
|
+
untracked=$(cd "$worktree" && git ls-files --others --exclude-standard 2>/dev/null | wc -l | tr -d ' ' || echo "0")
|
|
726
|
+
files_changed=$((files_changed + untracked))
|
|
727
|
+
fi
|
|
728
|
+
|
|
729
|
+
# Check last error from error log
|
|
730
|
+
if [[ -d "$worktree" ]]; then
|
|
731
|
+
local error_log="$worktree/.claude/pipeline-artifacts/error-log.jsonl"
|
|
732
|
+
if [[ -f "$error_log" ]]; then
|
|
733
|
+
last_error=$(tail -1 "$error_log" 2>/dev/null | jq -r '.signature // ""' 2>/dev/null || echo "")
|
|
734
|
+
fi
|
|
735
|
+
fi
|
|
736
|
+
|
|
737
|
+
# Output JSON snapshot
|
|
738
|
+
jq -n \
|
|
739
|
+
--arg stage "$stage" \
|
|
740
|
+
--argjson iteration "${iteration:-0}" \
|
|
741
|
+
--argjson diff_lines "${diff_lines:-0}" \
|
|
742
|
+
--argjson files_changed "${files_changed:-0}" \
|
|
743
|
+
--arg last_error "$last_error" \
|
|
744
|
+
--arg ts "$(now_iso)" \
|
|
745
|
+
'{
|
|
746
|
+
stage: $stage,
|
|
747
|
+
iteration: $iteration,
|
|
748
|
+
diff_lines: $diff_lines,
|
|
749
|
+
files_changed: $files_changed,
|
|
750
|
+
last_error: $last_error,
|
|
751
|
+
ts: $ts
|
|
752
|
+
}'
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
# Assess job progress by comparing current snapshot to previous
|
|
756
|
+
# Returns: healthy | slowing | stalled | stuck
|
|
757
|
+
daemon_assess_progress() {
|
|
758
|
+
local issue_num="$1" current_snapshot="$2"
|
|
759
|
+
|
|
760
|
+
mkdir -p "$PROGRESS_DIR"
|
|
761
|
+
local progress_file="$PROGRESS_DIR/issue-${issue_num}.json"
|
|
762
|
+
|
|
763
|
+
# If no previous snapshot, store this one and return healthy
|
|
764
|
+
if [[ ! -f "$progress_file" ]]; then
|
|
765
|
+
jq -n \
|
|
766
|
+
--argjson snap "$current_snapshot" \
|
|
767
|
+
--arg issue "$issue_num" \
|
|
768
|
+
'{
|
|
769
|
+
issue: $issue,
|
|
770
|
+
snapshots: [$snap],
|
|
771
|
+
no_progress_count: 0,
|
|
772
|
+
last_progress_at: $snap.ts,
|
|
773
|
+
repeated_error_count: 0
|
|
774
|
+
}' > "$progress_file"
|
|
775
|
+
echo "healthy"
|
|
776
|
+
return
|
|
777
|
+
fi
|
|
778
|
+
|
|
779
|
+
local prev_data
|
|
780
|
+
prev_data=$(cat "$progress_file")
|
|
781
|
+
|
|
782
|
+
# Get previous snapshot values
|
|
783
|
+
local prev_stage prev_iteration prev_diff_lines prev_files prev_error prev_no_progress
|
|
784
|
+
prev_stage=$(echo "$prev_data" | jq -r '.snapshots[-1].stage // "unknown"')
|
|
785
|
+
prev_iteration=$(echo "$prev_data" | jq -r '.snapshots[-1].iteration // 0')
|
|
786
|
+
prev_diff_lines=$(echo "$prev_data" | jq -r '.snapshots[-1].diff_lines // 0')
|
|
787
|
+
prev_files=$(echo "$prev_data" | jq -r '.snapshots[-1].files_changed // 0')
|
|
788
|
+
prev_error=$(echo "$prev_data" | jq -r '.snapshots[-1].last_error // ""')
|
|
789
|
+
prev_no_progress=$(echo "$prev_data" | jq -r '.no_progress_count // 0')
|
|
790
|
+
local prev_repeated_errors
|
|
791
|
+
prev_repeated_errors=$(echo "$prev_data" | jq -r '.repeated_error_count // 0')
|
|
792
|
+
|
|
793
|
+
# Get current values
|
|
794
|
+
local cur_stage cur_iteration cur_diff cur_files cur_error
|
|
795
|
+
cur_stage=$(echo "$current_snapshot" | jq -r '.stage')
|
|
796
|
+
cur_iteration=$(echo "$current_snapshot" | jq -r '.iteration')
|
|
797
|
+
cur_diff=$(echo "$current_snapshot" | jq -r '.diff_lines')
|
|
798
|
+
cur_files=$(echo "$current_snapshot" | jq -r '.files_changed')
|
|
799
|
+
cur_error=$(echo "$current_snapshot" | jq -r '.last_error')
|
|
800
|
+
|
|
801
|
+
# Detect progress
|
|
802
|
+
local has_progress=false
|
|
803
|
+
|
|
804
|
+
# Stage advanced → clear progress
|
|
805
|
+
if [[ "$cur_stage" != "$prev_stage" && "$cur_stage" != "unknown" ]]; then
|
|
806
|
+
has_progress=true
|
|
807
|
+
daemon_log INFO "Progress: issue #${issue_num} stage ${prev_stage} → ${cur_stage}"
|
|
808
|
+
fi
|
|
809
|
+
|
|
810
|
+
# Iteration increased → clear progress (agent is looping but advancing)
|
|
811
|
+
if [[ "$cur_iteration" -gt "$prev_iteration" ]]; then
|
|
812
|
+
has_progress=true
|
|
813
|
+
daemon_log INFO "Progress: issue #${issue_num} iteration ${prev_iteration} → ${cur_iteration}"
|
|
814
|
+
fi
|
|
815
|
+
|
|
816
|
+
# Diff lines grew (agent is writing code)
|
|
817
|
+
if [[ "$cur_diff" -gt "$prev_diff_lines" ]]; then
|
|
818
|
+
has_progress=true
|
|
819
|
+
fi
|
|
820
|
+
|
|
821
|
+
# More files touched
|
|
822
|
+
if [[ "$cur_files" -gt "$prev_files" ]]; then
|
|
823
|
+
has_progress=true
|
|
824
|
+
fi
|
|
825
|
+
|
|
826
|
+
# Detect repeated errors (same error signature hitting again)
|
|
827
|
+
local repeated_errors="$prev_repeated_errors"
|
|
828
|
+
if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
|
|
829
|
+
repeated_errors=$((repeated_errors + 1))
|
|
830
|
+
elif [[ -n "$cur_error" && "$cur_error" != "$prev_error" ]]; then
|
|
831
|
+
# Different error — reset counter (agent is making different mistakes, that's progress)
|
|
832
|
+
repeated_errors=0
|
|
833
|
+
fi
|
|
834
|
+
|
|
835
|
+
# Update no_progress counter
|
|
836
|
+
local no_progress_count
|
|
837
|
+
if [[ "$has_progress" == "true" ]]; then
|
|
838
|
+
no_progress_count=0
|
|
839
|
+
repeated_errors=0
|
|
840
|
+
else
|
|
841
|
+
no_progress_count=$((prev_no_progress + 1))
|
|
842
|
+
fi
|
|
843
|
+
|
|
844
|
+
# Update progress file (keep last 10 snapshots)
|
|
845
|
+
local tmp_progress="${progress_file}.tmp.$$"
|
|
846
|
+
jq \
|
|
847
|
+
--argjson snap "$current_snapshot" \
|
|
848
|
+
--argjson npc "$no_progress_count" \
|
|
849
|
+
--argjson rec "$repeated_errors" \
|
|
850
|
+
--arg ts "$(now_iso)" \
|
|
851
|
+
'
|
|
852
|
+
.snapshots = ((.snapshots + [$snap]) | .[-10:]) |
|
|
853
|
+
.no_progress_count = $npc |
|
|
854
|
+
.repeated_error_count = $rec |
|
|
855
|
+
if $npc == 0 then .last_progress_at = $ts else . end
|
|
856
|
+
' "$progress_file" > "$tmp_progress" 2>/dev/null && mv "$tmp_progress" "$progress_file"
|
|
857
|
+
|
|
858
|
+
# Determine verdict
|
|
859
|
+
local warn_threshold="${PROGRESS_CHECKS_BEFORE_WARN:-3}"
|
|
860
|
+
local kill_threshold="${PROGRESS_CHECKS_BEFORE_KILL:-6}"
|
|
861
|
+
|
|
862
|
+
# Stuck in same error loop — accelerate to kill
|
|
863
|
+
if [[ "$repeated_errors" -ge 3 ]]; then
|
|
864
|
+
echo "stuck"
|
|
865
|
+
return
|
|
866
|
+
fi
|
|
867
|
+
|
|
868
|
+
if [[ "$no_progress_count" -ge "$kill_threshold" ]]; then
|
|
869
|
+
echo "stuck"
|
|
870
|
+
elif [[ "$no_progress_count" -ge "$warn_threshold" ]]; then
|
|
871
|
+
echo "stalled"
|
|
872
|
+
elif [[ "$no_progress_count" -ge 1 ]]; then
|
|
873
|
+
echo "slowing"
|
|
874
|
+
else
|
|
875
|
+
echo "healthy"
|
|
876
|
+
fi
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
# Remove the progress-tracking snapshot file for an issue whose job has
# finished (success or failure), so stale snapshots never influence later runs.
daemon_clear_progress() {
  local finished_issue="$1"
  rm -f "${PROGRESS_DIR}/issue-${finished_issue}.json"
}
|
|
884
|
+
|
|
885
|
+
# Learn actual worker memory from peak RSS of pipeline processes
#
# Samples the current RSS of every live job PID listed in $STATE_FILE,
# derives a per-worker memory budget (observed average + 25% headroom,
# rounded UP to a whole GiB, clamped to [1, 16] GiB), persists it in the
# tuning file, and updates WORKER_MEM_GB.
# No-op unless ADAPTIVE_THRESHOLDS_ENABLED=true and the state file exists.
learn_worker_memory() {
  if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
    return
  fi
  if [[ ! -f "$STATE_FILE" ]]; then
    return
  fi

  local total_rss=0
  local process_count=0

  while IFS= read -r job; do
    local pid
    pid=$(echo "$job" | jq -r '.pid // empty')
    # Skip malformed entries; only sample PIDs that are still alive.
    [[ -z "$pid" || ! "$pid" =~ ^[0-9]+$ ]] && continue
    if kill -0 "$pid" 2>/dev/null; then
      local rss_kb
      rss_kb=$(ps -o rss= -p "$pid" 2>/dev/null | tr -d ' ' || echo "0")
      [[ ! "$rss_kb" =~ ^[0-9]+$ ]] && rss_kb=0
      if [[ "$rss_kb" -gt 0 ]]; then
        total_rss=$((total_rss + rss_kb))
        process_count=$((process_count + 1))
      fi
    fi
  done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)

  if [[ "$process_count" -gt 0 ]]; then
    # FIX: apply the headroom to the KB average and only then convert to GiB.
    # The previous code truncated the average to whole GiB first, so e.g. a
    # 1.9 GiB average became 1 GiB and the learned budget under-shot.
    local avg_rss_kb=$(( total_rss / process_count ))
    # 125% headroom, rounded up to a whole GiB; minimum 1GB, max 16GB
    local learned_mem_gb=$(( (avg_rss_kb * 5 / 4 + 1048575) / 1048576 ))
    [[ "$learned_mem_gb" -lt 1 ]] && learned_mem_gb=1
    [[ "$learned_mem_gb" -gt 16 ]] && learned_mem_gb=16

    local tuning_file="$HOME/.shipwright/optimization/daemon-tuning.json"
    mkdir -p "$HOME/.shipwright/optimization"
    local tmp_tuning="${tuning_file}.tmp.$$"
    if [[ -f "$tuning_file" ]]; then
      jq --argjson mem "$learned_mem_gb" --argjson rss "$total_rss" --argjson cnt "$process_count" \
        '.learned_worker_mem_gb = $mem | .last_rss_total_kb = $rss | .last_rss_process_count = $cnt' \
        "$tuning_file" > "$tmp_tuning" 2>/dev/null && mv "$tmp_tuning" "$tuning_file"
    else
      jq -n --argjson mem "$learned_mem_gb" \
        '{learned_worker_mem_gb: $mem}' > "$tmp_tuning" && mv "$tmp_tuning" "$tuning_file"
    fi

    WORKER_MEM_GB="$learned_mem_gb"
  fi
}
|
|
934
|
+
|
|
935
|
+
# Append one {parallelism, result, ts} record to .scaling_history in the
# tuning file, bounded to the most recent 50 entries.
# $1 = parallelism level, $2 = outcome string (e.g. "success").
# No-op unless ADAPTIVE_THRESHOLDS_ENABLED=true.
record_scaling_outcome() {
  local parallelism="$1" result="$2"

  [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0

  local tuning_dir="$HOME/.shipwright/optimization"
  local tuning_file="$tuning_dir/daemon-tuning.json"
  mkdir -p "$tuning_dir"
  local scratch="${tuning_file}.tmp.$$"
  if [[ -f "$tuning_file" ]]; then
    jq --argjson p "$parallelism" --arg r "$result" --arg ts "$(now_iso)" '
      .scaling_history = ((.scaling_history // []) + [{parallelism: $p, result: $r, ts: $ts}]) |
      .scaling_history |= .[-50:]
    ' "$tuning_file" > "$scratch" 2>/dev/null && mv "$scratch" "$tuning_file"
  else
    jq -n --argjson p "$parallelism" --arg r "$result" --arg ts "$(now_iso)" '
      {scaling_history: [{parallelism: $p, result: $r, ts: $ts}]}
    ' > "$scratch" && mv "$scratch" "$tuning_file"
  fi
}
|
|
957
|
+
|
|
958
|
+
# Print the historical success percentage (integer 0-100) for runs recorded
# at the given parallelism level in .scaling_history of the tuning file.
# Optimistically prints 100 when no tuning file, no matching history, or a
# jq failure leaves nothing to go on.
get_success_rate_at_parallelism() {
  local level="$1"
  local tuning_file="$HOME/.shipwright/optimization/daemon-tuning.json"

  [[ -f "$tuning_file" ]] || { echo "100"; return; }

  local pct
  pct=$(jq -r --argjson p "$level" '
    [.scaling_history // [] | .[] | select(.parallelism == $p)] |
    if length > 0 then
      ([.[] | select(.result == "success")] | length) * 100 / length | floor
    else 100 end
  ' "$tuning_file" 2>/dev/null || echo "100")

  echo "${pct:-100}"
}
|
|
978
|
+
|
|
979
|
+
# Nudge the patrol issue cap toward the observed hit rate and persist it:
# grow by 2 (capped at 20) when the last patrol saturated its limit, shrink
# by 1 (floored at 3) when it found nothing, otherwise keep it unchanged.
# $1 = findings from the last patrol, $2 = the cap that patrol ran with.
# No-op unless ADAPTIVE_THRESHOLDS_ENABLED=true.
adapt_patrol_limits() {
  local findings="$1" max_issues="$2"

  [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0

  local tuning_dir="$HOME/.shipwright/optimization"
  local tuning_file="$tuning_dir/daemon-tuning.json"
  mkdir -p "$tuning_dir"

  local new_max="$max_issues"
  if [[ "$findings" -ge "$max_issues" ]]; then
    # Consistently hitting limit — increase
    new_max=$((max_issues + 2))
    [[ "$new_max" -gt 20 ]] && new_max=20
  elif [[ "$findings" -eq 0 ]]; then
    # Finds nothing — reduce, but never below 3
    if [[ "$max_issues" -gt 3 ]]; then
      new_max=$((max_issues - 1))
    else
      new_max=3
    fi
  fi

  local scratch="${tuning_file}.tmp.$$"
  if [[ -f "$tuning_file" ]]; then
    jq --argjson pm "$new_max" --argjson lf "$findings" --arg ts "$(now_iso)" \
      '.patrol_max_issues = $pm | .last_patrol_findings = $lf | .patrol_adapted_at = $ts' \
      "$tuning_file" > "$scratch" 2>/dev/null && mv "$scratch" "$tuning_file"
  else
    jq -n --argjson pm "$new_max" --argjson lf "$findings" --arg ts "$(now_iso)" \
      '{patrol_max_issues: $pm, last_patrol_findings: $lf, patrol_adapted_at: $ts}' \
      > "$scratch" && mv "$scratch" "$tuning_file"
  fi
}
|
|
1015
|
+
|
|
1016
|
+
# Override PATROL_MAX_ISSUES with the learned cap persisted in the tuning
# file, when adaptive thresholds are enabled and a positive value exists.
load_adaptive_patrol_limits() {
  [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0

  local tuning_file="$HOME/.shipwright/optimization/daemon-tuning.json"
  [[ -f "$tuning_file" ]] || return 0

  local learned_cap
  learned_cap=$(jq -r '.patrol_max_issues // 0' "$tuning_file" 2>/dev/null || echo "0")
  if [[ "$learned_cap" -gt 0 ]]; then
    PATROL_MAX_ISSUES="$learned_cap"
  fi
}
|
|
1033
|
+
|
|
1034
|
+
# Print the distinct "#N" issue references that follow a dependency keyword
# ("depends on", "blocked by", "after") in the given text, one per line,
# lexically sorted; prints nothing when there are no matches.
# NOTE(review): matching is case-sensitive — "Depends on #5" is not caught;
# confirm whether that is intended.
extract_issue_dependencies() {
  local issue_text="$1"

  printf '%s\n' "$issue_text" \
    | grep -oE '(depends on|blocked by|after) #[0-9]+' \
    | grep -oE '#[0-9]+' \
    | sort -u || true
}
|
|
379
1040
|
|
|
380
1041
|
# ─── Logging ─────────────────────────────────────────────────────────────────
|
|
@@ -387,6 +1048,18 @@ daemon_log() {
|
|
|
387
1048
|
ts=$(now_iso)
|
|
388
1049
|
echo "[$ts] [$level] $msg" >> "$LOG_FILE"
|
|
389
1050
|
|
|
1051
|
+
# Rotate daemon.log if over 20MB (checked every ~100 writes)
|
|
1052
|
+
if [[ $(( RANDOM % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
|
|
1053
|
+
local log_size
|
|
1054
|
+
log_size=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
|
|
1055
|
+
if [[ "$log_size" -gt 20971520 ]]; then
|
|
1056
|
+
[[ -f "${LOG_FILE}.2" ]] && mv "${LOG_FILE}.2" "${LOG_FILE}.3"
|
|
1057
|
+
[[ -f "${LOG_FILE}.1" ]] && mv "${LOG_FILE}.1" "${LOG_FILE}.2"
|
|
1058
|
+
mv "$LOG_FILE" "${LOG_FILE}.1"
|
|
1059
|
+
touch "$LOG_FILE"
|
|
1060
|
+
fi
|
|
1061
|
+
fi
|
|
1062
|
+
|
|
390
1063
|
# Also print to stdout
|
|
391
1064
|
case "$level" in
|
|
392
1065
|
INFO) info "$msg" ;;
|
|
@@ -431,6 +1104,40 @@ notify() {
|
|
|
431
1104
|
fi
|
|
432
1105
|
}
|
|
433
1106
|
|
|
1107
|
+
# ─── GitHub Rate-Limit Circuit Breaker ─────────────────────────────────────
# Tracks consecutive GitHub API failures. If we hit too many failures in a row,
# we back off exponentially to avoid hammering a rate-limited API.

GH_CONSECUTIVE_FAILURES=0
GH_BACKOFF_UNTIL=0 # epoch seconds — skip gh calls until this time

# Circuit-breaker probe: succeeds (status 0, "limited") while the backoff
# window armed by gh_record_failure still lies in the future, meaning
# GitHub API calls should be skipped for now.
gh_rate_limited() {
  local current_epoch
  current_epoch=$(now_epoch)
  [[ "$GH_BACKOFF_UNTIL" -gt "$current_epoch" ]]
}
|
|
1123
|
+
|
|
1124
|
+
# Register a successful GitHub API call: close the circuit breaker and
# clear any pending backoff window.
gh_record_success() {
  GH_CONSECUTIVE_FAILURES=0; GH_BACKOFF_UNTIL=0
}
|
|
1128
|
+
|
|
1129
|
+
# Register one failed GitHub API call. From the third consecutive failure
# onward the circuit opens: an exponentially growing backoff window
# (30s, doubling per further failure, capped at 300s) is armed via
# GH_BACKOFF_UNTIL, and the event is logged/emitted.
gh_record_failure() {
  GH_CONSECUTIVE_FAILURES=$((GH_CONSECUTIVE_FAILURES + 1))
  if [[ "$GH_CONSECUTIVE_FAILURES" -ge 3 ]]; then
    # Exponential backoff: 30s, 60s, 120s, 240s (capped at 5min)
    local delay=$((30 * (1 << (GH_CONSECUTIVE_FAILURES - 3))))
    [[ "$delay" -gt 300 ]] && delay=300
    GH_BACKOFF_UNTIL=$(( $(now_epoch) + delay ))
    daemon_log WARN "GitHub rate-limit circuit breaker: backing off ${delay}s after ${GH_CONSECUTIVE_FAILURES} failures"
    emit_event "daemon.rate_limit" "failures=$GH_CONSECUTIVE_FAILURES" "backoff_s=$delay"
  fi
}
|
|
1140
|
+
|
|
434
1141
|
# ─── Pre-flight Checks ──────────────────────────────────────────────────────
|
|
435
1142
|
|
|
436
1143
|
preflight_checks() {
|
|
@@ -490,10 +1197,10 @@ preflight_checks() {
|
|
|
490
1197
|
fi
|
|
491
1198
|
|
|
492
1199
|
# 4. Pipeline script
|
|
493
|
-
if [[ -x "$SCRIPT_DIR/
|
|
494
|
-
echo -e " ${GREEN}✓${RESET}
|
|
1200
|
+
if [[ -x "$SCRIPT_DIR/sw-pipeline.sh" ]]; then
|
|
1201
|
+
echo -e " ${GREEN}✓${RESET} sw-pipeline.sh available"
|
|
495
1202
|
else
|
|
496
|
-
echo -e " ${RED}✗${RESET}
|
|
1203
|
+
echo -e " ${RED}✗${RESET} sw-pipeline.sh not found at $SCRIPT_DIR"
|
|
497
1204
|
errors=$((errors + 1))
|
|
498
1205
|
fi
|
|
499
1206
|
|
|
@@ -518,17 +1225,59 @@ preflight_checks() {
|
|
|
518
1225
|
|
|
519
1226
|
# ─── State Management ───────────────────────────────────────────────────────
|
|
520
1227
|
|
|
1228
|
+
# State file lock FD (used by locked_state_update for serialized read-modify-write)
# NOTE(review): the locking helpers below open the lock file on FD 200
# directly (`200>"$lock_file"`); this constant does not appear to be
# referenced anywhere in the visible code — confirm before relying on it.
STATE_LOCK_FD=7
|
|
1230
|
+
|
|
521
1231
|
# Atomic write: write to tmp file, then mv (prevents corruption on crash)
#
# Writes $1 (the full JSON state document) to a mktemp-created sibling of
# $STATE_FILE and renames it into place, so concurrent readers never observe
# a partially written file. Logs and returns non-zero if any step fails.
atomic_write_state() {
  local content="$1"
  local tmp_file
  tmp_file=$(mktemp "${STATE_FILE}.tmp.XXXXXX") || {
    daemon_log ERROR "Failed to create temp file for state write"
    return 1
  }
  # FIX: printf instead of echo — echo can misparse content that looks like
  # its own options (a bare "-n"/"-e") and may expand backslash sequences
  # inside JSON strings depending on shell options; printf writes the
  # payload verbatim plus a trailing newline.
  printf '%s\n' "$content" > "$tmp_file" || {
    daemon_log ERROR "Failed to write state to temp file"
    rm -f "$tmp_file"
    return 1
  }
  mv "$tmp_file" "$STATE_FILE" || {
    daemon_log ERROR "Failed to move temp state file into place"
    rm -f "$tmp_file"
    return 1
  }
}
|
|
1250
|
+
|
|
1251
|
+
# Locked read-modify-write: prevents TOCTOU race on state file.
# Usage: locked_state_update [jq options...] '<jq filter>'
# Every argument is handed to jq verbatim (so --arg/--argjson pairs may
# precede the filter); the filter is applied to $STATE_FILE while holding an
# exclusive flock on ${STATE_FILE}.lock, and the result is written back
# atomically. The subshell scopes FD 200 so it auto-closes on exit.
locked_state_update() {
  local lock_file="${STATE_FILE}.lock"
  (
    # flock may be absent (e.g. stock macOS) — degrade to an unlocked update.
    if command -v flock &>/dev/null; then
      flock -w 5 200 2>/dev/null || {
        daemon_log ERROR "locked_state_update: lock acquisition timed out — aborting"
        return 1
      }
    fi
    local updated
    updated=$(jq "$@" "$STATE_FILE" 2>&1) || {
      daemon_log ERROR "locked_state_update: jq failed — $(echo "$updated" | head -1)"
      return 1
    }
    atomic_write_state "$updated" || {
      daemon_log ERROR "locked_state_update: atomic_write_state failed"
      return 1
    }
  ) 200>"$lock_file"
}
|
|
528
1276
|
|
|
529
1277
|
init_state() {
|
|
530
1278
|
if [[ ! -f "$STATE_FILE" ]]; then
|
|
531
|
-
|
|
1279
|
+
local init_json
|
|
1280
|
+
init_json=$(jq -n \
|
|
532
1281
|
--arg pid "$$" \
|
|
533
1282
|
--arg started "$(now_iso)" \
|
|
534
1283
|
--argjson interval "$POLL_INTERVAL" \
|
|
@@ -550,25 +1299,32 @@ init_state() {
|
|
|
550
1299
|
queued: [],
|
|
551
1300
|
completed: [],
|
|
552
1301
|
retry_counts: {},
|
|
553
|
-
priority_lane_active: []
|
|
554
|
-
|
|
1302
|
+
priority_lane_active: [],
|
|
1303
|
+
titles: {}
|
|
1304
|
+
}')
|
|
1305
|
+
local lock_file="${STATE_FILE}.lock"
|
|
1306
|
+
(
|
|
1307
|
+
if command -v flock &>/dev/null; then
|
|
1308
|
+
flock -w 5 200 2>/dev/null || {
|
|
1309
|
+
daemon_log ERROR "init_state: lock acquisition timed out"
|
|
1310
|
+
return 1
|
|
1311
|
+
}
|
|
1312
|
+
fi
|
|
1313
|
+
atomic_write_state "$init_json"
|
|
1314
|
+
) 200>"$lock_file"
|
|
555
1315
|
else
|
|
556
1316
|
# Update PID and start time in existing state
|
|
557
|
-
|
|
558
|
-
tmp=$(jq \
|
|
1317
|
+
locked_state_update \
|
|
559
1318
|
--arg pid "$$" \
|
|
560
1319
|
--arg started "$(now_iso)" \
|
|
561
|
-
'.pid = ($pid | tonumber) | .started_at = $started'
|
|
562
|
-
"$STATE_FILE")
|
|
563
|
-
atomic_write_state "$tmp"
|
|
1320
|
+
'.pid = ($pid | tonumber) | .started_at = $started'
|
|
564
1321
|
fi
|
|
565
1322
|
}
|
|
566
1323
|
|
|
567
1324
|
# Set a single top-level field of the daemon state file to a string value.
# $1 = field name, $2 = new value (stored as a JSON string).
update_state_field() {
  local key="$1" val="$2"
  locked_state_update --arg field "$key" --arg val "$val" '.[$field] = $val'
}
|
|
573
1329
|
|
|
574
1330
|
# ─── Inflight Check ─────────────────────────────────────────────────────────
|
|
@@ -611,15 +1367,36 @@ get_active_count() {
|
|
|
611
1367
|
jq -r '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo 0
|
|
612
1368
|
}
|
|
613
1369
|
|
|
1370
|
+
# Race-safe active count: acquires state lock before reading.
# Returns MAX_PARALLEL on lock timeout (safe fail — prevents over-spawning).
locked_get_active_count() {
  if [[ ! -f "$STATE_FILE" ]]; then
    echo 0
    return
  fi
  local lock_file="${STATE_FILE}.lock"
  local count
  count=$(
    (
      if command -v flock &>/dev/null; then
        flock -w 5 200 2>/dev/null || {
          # FIX: daemon_log also echoes to stdout for console output; inside
          # this command substitution that text would be captured into
          # $count and corrupt it — route the log call's stdout to stderr.
          daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default" >&2
          echo "$MAX_PARALLEL"
          exit 0
        }
      fi
      jq -r '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo "$MAX_PARALLEL"
    ) 200>"$lock_file"
  )
  echo "${count:-0}"
}
|
|
1393
|
+
|
|
614
1394
|
# ─── Queue Management ───────────────────────────────────────────────────────
|
|
615
1395
|
|
|
616
1396
|
# Park an issue number at the tail of the queue (deduplicated) because all
# worker slots are currently occupied.
enqueue_issue() {
  local queued_issue="$1"
  locked_state_update --argjson num "$queued_issue" \
    '.queued += [$num] | .queued |= unique'
  daemon_log INFO "Queued issue #${queued_issue} (at capacity)"
}
|
|
625
1402
|
|
|
@@ -631,10 +1408,8 @@ dequeue_next() {
|
|
|
631
1408
|
local next
|
|
632
1409
|
next=$(jq -r '.queued[0] // empty' "$STATE_FILE" 2>/dev/null || true)
|
|
633
1410
|
if [[ -n "$next" ]]; then
|
|
634
|
-
# Remove from queue
|
|
635
|
-
|
|
636
|
-
tmp=$(jq '.queued = .queued[1:]' "$STATE_FILE")
|
|
637
|
-
atomic_write_state "$tmp"
|
|
1411
|
+
# Remove from queue (locked to prevent race with enqueue)
|
|
1412
|
+
locked_state_update '.queued = .queued[1:]'
|
|
638
1413
|
echo "$next"
|
|
639
1414
|
fi
|
|
640
1415
|
}
|
|
@@ -667,11 +1442,8 @@ get_priority_active_count() {
|
|
|
667
1442
|
|
|
668
1443
|
# Record an issue as running in the priority lane (kept as a deduplicated
# array in the state file).
track_priority_job() {
  local prio_issue="$1"
  locked_state_update --argjson num "$prio_issue" \
    '.priority_lane_active = ((.priority_lane_active // []) + [$num] | unique)'
}
|
|
676
1448
|
|
|
677
1449
|
untrack_priority_job() {
|
|
@@ -679,11 +1451,63 @@ untrack_priority_job() {
|
|
|
679
1451
|
if [[ ! -f "$STATE_FILE" ]]; then
|
|
680
1452
|
return
|
|
681
1453
|
fi
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
1454
|
+
locked_state_update --argjson num "$issue_num" \
|
|
1455
|
+
'.priority_lane_active = [(.priority_lane_active // [])[] | select(. != $num)]'
|
|
1456
|
+
}
|
|
1457
|
+
|
|
1458
|
+
# ─── Distributed Issue Claiming ───────────────────────────────────────────

# Attempt to claim exclusive ownership of an issue for this machine.
# Returns 0 when the claim is ours, 1 when another machine holds it or the
# claim could not be recorded. Prefers the dashboard's claim API and falls
# back to a GitHub "claimed:<machine>" label when the dashboard is silent.
# NOTE(review): the GitHub fallback is check-then-set, so two machines can
# race between the view and the edit — confirm this window is acceptable.
claim_issue() {
  local issue="$1"
  local machine="$2"

  [[ "$NO_GITHUB" == "true" ]] && return 0 # No claiming in no-github mode

  # Try dashboard-coordinated claim first (atomic label-based)
  local payload resp
  payload=$(jq -n --argjson issue "$issue" --arg machine "$machine" \
    '{issue: $issue, machine: $machine}')
  resp=$(curl -s --max-time 5 -X POST "${DASHBOARD_URL}/api/claim" \
    -H "Content-Type: application/json" \
    -d "$payload" 2>/dev/null || echo "")

  if [[ -n "$resp" ]]; then
    if echo "$resp" | jq -e '.approved == true' &>/dev/null; then
      return 0
    fi
    if echo "$resp" | jq -e '.approved == false' &>/dev/null; then
      local holder
      holder=$(echo "$resp" | jq -r '.claimed_by // "another machine"')
      daemon_log INFO "Issue #${issue} claimed by ${holder} (via dashboard)"
      return 1
    fi
  fi

  # Fallback: direct GitHub label check (dashboard unreachable)
  daemon_log WARN "Dashboard unreachable — falling back to direct GitHub label claim"
  local existing
  existing=$(gh issue view "$issue" --json labels --jq \
    '[.labels[].name | select(startswith("claimed:"))] | .[0] // ""' 2>/dev/null || true)

  if [[ -n "$existing" ]]; then
    daemon_log INFO "Issue #${issue} already claimed: ${existing}"
    return 1
  fi

  gh issue edit "$issue" --add-label "claimed:${machine}" 2>/dev/null || return 1
  return 0
}
|
|
1496
|
+
|
|
1497
|
+
# Release this machine's claim on an issue: best-effort notify the dashboard,
# then remove the "claimed:<machine>" label directly as an idempotent backup.
# Both paths are fire-and-forget; the function always succeeds.
release_claim() {
  local issue_num="$1"
  local machine_name="$2"

  [[ "$NO_GITHUB" == "true" ]] && return 0

  # Try dashboard-coordinated release first.
  # FIX: discard the response body — previously it was left on stdout, which
  # pollutes the daemon's console output / anything capturing this call.
  curl -s --max-time 5 -X POST "${DASHBOARD_URL}/api/claim/release" \
    -H "Content-Type: application/json" \
    -d "$(jq -n --argjson issue "$issue_num" --arg machine "$machine_name" \
      '{issue: $issue, machine: $machine}')" > /dev/null 2>&1 || true

  # Also remove label directly as backup (idempotent)
  gh issue edit "$issue_num" --remove-label "claimed:${machine_name}" 2>/dev/null || true
}
|
|
688
1512
|
|
|
689
1513
|
# ─── Org-Wide Repo Management ─────────────────────────────────────────────
|
|
@@ -718,6 +1542,38 @@ daemon_spawn_pipeline() {
|
|
|
718
1542
|
|
|
719
1543
|
daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
|
|
720
1544
|
|
|
1545
|
+
# Extract goal text from issue (title + first line of body)
|
|
1546
|
+
local issue_goal="$issue_title"
|
|
1547
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
1548
|
+
local issue_body_first
|
|
1549
|
+
issue_body_first=$(gh issue view "$issue_num" --json body --jq '.body' 2>/dev/null | head -3 | tr '\n' ' ' | cut -c1-200 || true)
|
|
1550
|
+
if [[ -n "$issue_body_first" ]]; then
|
|
1551
|
+
issue_goal="${issue_title}: ${issue_body_first}"
|
|
1552
|
+
fi
|
|
1553
|
+
fi
|
|
1554
|
+
|
|
1555
|
+
# ── Predictive risk assessment (if enabled) ──
|
|
1556
|
+
if [[ "${PREDICTION_ENABLED:-false}" == "true" ]] && type predict_pipeline_risk &>/dev/null 2>&1; then
|
|
1557
|
+
local issue_json_for_pred=""
|
|
1558
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
1559
|
+
issue_json_for_pred=$(gh issue view "$issue_num" --json number,title,body,labels 2>/dev/null || echo "")
|
|
1560
|
+
fi
|
|
1561
|
+
if [[ -n "$issue_json_for_pred" ]]; then
|
|
1562
|
+
local risk_result
|
|
1563
|
+
risk_result=$(predict_pipeline_risk "$issue_json_for_pred" "" 2>/dev/null || echo "")
|
|
1564
|
+
if [[ -n "$risk_result" ]]; then
|
|
1565
|
+
local overall_risk
|
|
1566
|
+
overall_risk=$(echo "$risk_result" | jq -r '.overall_risk // 50' 2>/dev/null || echo "50")
|
|
1567
|
+
if [[ "$overall_risk" -gt 80 ]]; then
|
|
1568
|
+
daemon_log WARN "HIGH RISK (${overall_risk}%) predicted for issue #${issue_num} — upgrading model"
|
|
1569
|
+
export CLAUDE_MODEL="opus"
|
|
1570
|
+
elif [[ "$overall_risk" -lt 30 ]]; then
|
|
1571
|
+
daemon_log INFO "LOW RISK (${overall_risk}%) predicted for issue #${issue_num}"
|
|
1572
|
+
fi
|
|
1573
|
+
fi
|
|
1574
|
+
fi
|
|
1575
|
+
fi
|
|
1576
|
+
|
|
721
1577
|
# Check disk space before spawning
|
|
722
1578
|
local free_space_kb
|
|
723
1579
|
free_space_kb=$(df -k "." 2>/dev/null | tail -1 | awk '{print $4}')
|
|
@@ -747,13 +1603,23 @@ daemon_spawn_pipeline() {
|
|
|
747
1603
|
# Standard mode: use git worktree
|
|
748
1604
|
work_dir="${WORKTREE_DIR}/daemon-issue-${issue_num}"
|
|
749
1605
|
|
|
750
|
-
#
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
1606
|
+
# Serialize worktree operations with a lock file (run in subshell to auto-close FD)
|
|
1607
|
+
mkdir -p "$WORKTREE_DIR"
|
|
1608
|
+
local wt_ok=0
|
|
1609
|
+
(
|
|
1610
|
+
flock -w 30 200 2>/dev/null || true
|
|
1611
|
+
|
|
1612
|
+
# Clean up stale worktree if it exists
|
|
1613
|
+
if [[ -d "$work_dir" ]]; then
|
|
1614
|
+
git worktree remove "$work_dir" --force 2>/dev/null || true
|
|
1615
|
+
fi
|
|
1616
|
+
git branch -D "$branch_name" 2>/dev/null || true
|
|
1617
|
+
|
|
1618
|
+
git worktree add "$work_dir" -b "$branch_name" "$BASE_BRANCH" 2>/dev/null
|
|
1619
|
+
) 200>"${WORKTREE_DIR}/.worktree.lock"
|
|
1620
|
+
wt_ok=$?
|
|
755
1621
|
|
|
756
|
-
if
|
|
1622
|
+
if [[ $wt_ok -ne 0 ]]; then
|
|
757
1623
|
daemon_log ERROR "Failed to create worktree for issue #${issue_num}"
|
|
758
1624
|
return 1
|
|
759
1625
|
fi
|
|
@@ -773,17 +1639,19 @@ daemon_spawn_pipeline() {
|
|
|
773
1639
|
fi
|
|
774
1640
|
|
|
775
1641
|
# Run pipeline in work directory (background)
|
|
1642
|
+
echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
|
|
776
1643
|
(
|
|
777
1644
|
cd "$work_dir"
|
|
778
|
-
"$SCRIPT_DIR/
|
|
779
|
-
)
|
|
1645
|
+
"$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
|
|
1646
|
+
) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
|
|
780
1647
|
local pid=$!
|
|
781
1648
|
|
|
782
1649
|
daemon_log INFO "Pipeline started for issue #${issue_num} (PID: ${pid})"
|
|
783
1650
|
|
|
784
|
-
# Track the job (include repo for org mode)
|
|
785
|
-
daemon_track_job "$issue_num" "$pid" "$work_dir" "$issue_title" "$repo_full_name"
|
|
1651
|
+
# Track the job (include repo and goal for org mode)
|
|
1652
|
+
daemon_track_job "$issue_num" "$pid" "$work_dir" "$issue_title" "$repo_full_name" "$issue_goal"
|
|
786
1653
|
emit_event "daemon.spawn" "issue=$issue_num" "pid=$pid" "repo=${repo_full_name:-local}"
|
|
1654
|
+
"$SCRIPT_DIR/sw-tracker.sh" notify "spawn" "$issue_num" 2>/dev/null || true
|
|
787
1655
|
|
|
788
1656
|
# Comment on the issue
|
|
789
1657
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
@@ -791,9 +1659,9 @@ daemon_spawn_pipeline() {
|
|
|
791
1659
|
if [[ -n "$repo_full_name" ]]; then
|
|
792
1660
|
gh_args+=("--repo" "$repo_full_name")
|
|
793
1661
|
fi
|
|
794
|
-
gh issue comment "$issue_num" "${gh_args[@]}" --body "## 🤖 Pipeline Started
|
|
1662
|
+
gh issue comment "$issue_num" ${gh_args[@]+"${gh_args[@]}"} --body "## 🤖 Pipeline Started
|
|
795
1663
|
|
|
796
|
-
**
|
|
1664
|
+
**Delivering:** ${issue_title}
|
|
797
1665
|
|
|
798
1666
|
| Field | Value |
|
|
799
1667
|
|-------|-------|
|
|
@@ -802,32 +1670,31 @@ daemon_spawn_pipeline() {
|
|
|
802
1670
|
| Repo | \`${repo_full_name:-local}\` |
|
|
803
1671
|
| Started | $(now_iso) |
|
|
804
1672
|
|
|
805
|
-
_Progress updates will
|
|
1673
|
+
_Progress updates will appear below as the pipeline advances through each stage._" 2>/dev/null || true
|
|
806
1674
|
fi
|
|
807
1675
|
}
|
|
808
1676
|
|
|
809
1677
|
# ─── Track Job ───────────────────────────────────────────────────────────────

# Append a job record {issue, pid, worktree, title, started_at, repo, goal}
# to .active_jobs in the daemon state file, under the state lock.
# $4-$6 (title, repo, goal) are optional and default to empty strings.
daemon_track_job() {
  local issue="$1" job_pid="$2" wt_path="$3" job_title="${4:-}" job_repo="${5:-}" job_goal="${6:-}"
  locked_state_update \
    --argjson num "$issue" \
    --argjson pid "$job_pid" \
    --arg wt "$wt_path" \
    --arg title "$job_title" \
    --arg started "$(now_iso)" \
    --arg repo "$job_repo" \
    --arg goal "$job_goal" \
    '.active_jobs += [{
      issue: $num,
      pid: $pid,
      worktree: $wt,
      title: $title,
      started_at: $started,
      repo: $repo,
      goal: $goal
    }]'
}
|
|
832
1699
|
|
|
833
1700
|
# ─── Reap Completed Jobs ────────────────────────────────────────────────────
|
|
@@ -843,11 +1710,17 @@ daemon_reap_completed() {
|
|
|
843
1710
|
return
|
|
844
1711
|
fi
|
|
845
1712
|
|
|
1713
|
+
local _retry_spawned_for=""
|
|
1714
|
+
|
|
846
1715
|
while IFS= read -r job; do
|
|
847
1716
|
local issue_num pid worktree
|
|
848
|
-
issue_num=$(echo "$job" | jq -r '.issue')
|
|
849
|
-
pid=$(echo "$job" | jq -r '.pid')
|
|
850
|
-
worktree=$(echo "$job" | jq -r '.worktree')
|
|
1717
|
+
issue_num=$(echo "$job" | jq -r '.issue // empty')
|
|
1718
|
+
pid=$(echo "$job" | jq -r '.pid // empty')
|
|
1719
|
+
worktree=$(echo "$job" | jq -r '.worktree // empty')
|
|
1720
|
+
|
|
1721
|
+
# Skip malformed entries (corrupted state file)
|
|
1722
|
+
[[ -z "$issue_num" || ! "$issue_num" =~ ^[0-9]+$ ]] && continue
|
|
1723
|
+
[[ -z "$pid" || ! "$pid" =~ ^[0-9]+$ ]] && continue
|
|
851
1724
|
|
|
852
1725
|
# Check if process is still running
|
|
853
1726
|
if kill -0 "$pid" 2>/dev/null; then
|
|
@@ -855,13 +1728,30 @@ daemon_reap_completed() {
|
|
|
855
1728
|
fi
|
|
856
1729
|
|
|
857
1730
|
# Process is dead — determine exit code
|
|
1731
|
+
# Note: wait returns 127 if process was already reaped (e.g., by init)
|
|
1732
|
+
# In that case, check pipeline log for success/failure indicators
|
|
858
1733
|
local exit_code=0
|
|
859
1734
|
wait "$pid" 2>/dev/null || exit_code=$?
|
|
1735
|
+
if [[ "$exit_code" -eq 127 ]]; then
|
|
1736
|
+
# Process already reaped — check log file for real outcome
|
|
1737
|
+
local issue_log="$LOG_DIR/issue-${issue_num}.log"
|
|
1738
|
+
if [[ -f "$issue_log" ]]; then
|
|
1739
|
+
if grep -q "Pipeline completed successfully" "$issue_log" 2>/dev/null; then
|
|
1740
|
+
exit_code=0
|
|
1741
|
+
elif grep -q "Pipeline failed\|ERROR.*stage.*failed\|exited with status" "$issue_log" 2>/dev/null; then
|
|
1742
|
+
exit_code=1
|
|
1743
|
+
else
|
|
1744
|
+
daemon_log WARN "Could not determine exit code for issue #${issue_num} (PID ${pid} already reaped) — marking as failure"
|
|
1745
|
+
exit_code=1
|
|
1746
|
+
fi
|
|
1747
|
+
else
|
|
1748
|
+
exit_code=1
|
|
1749
|
+
fi
|
|
1750
|
+
fi
|
|
860
1751
|
|
|
861
|
-
local started_at duration_str=""
|
|
1752
|
+
local started_at duration_str="" start_epoch=0 end_epoch=0
|
|
862
1753
|
started_at=$(echo "$job" | jq -r '.started_at // empty')
|
|
863
1754
|
if [[ -n "$started_at" ]]; then
|
|
864
|
-
local start_epoch end_epoch
|
|
865
1755
|
# macOS date -j for parsing ISO dates (TZ=UTC to parse Z-suffix correctly)
|
|
866
1756
|
start_epoch=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null || date -d "$started_at" +%s 2>/dev/null || echo "0")
|
|
867
1757
|
end_epoch=$(now_epoch)
|
|
@@ -882,31 +1772,52 @@ daemon_reap_completed() {
|
|
|
882
1772
|
daemon_on_failure "$issue_num" "$exit_code" "$duration_str"
|
|
883
1773
|
fi
|
|
884
1774
|
|
|
885
|
-
#
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
#
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
1775
|
+
# Clean up progress tracking for this job
|
|
1776
|
+
daemon_clear_progress "$issue_num"
|
|
1777
|
+
|
|
1778
|
+
# Release claim lock (label-based coordination)
|
|
1779
|
+
local reap_machine_name
|
|
1780
|
+
reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
|
|
1781
|
+
release_claim "$issue_num" "$reap_machine_name"
|
|
1782
|
+
|
|
1783
|
+
# Skip cleanup if a retry was just spawned for this issue
|
|
1784
|
+
if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
|
|
1785
|
+
daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
|
|
1786
|
+
else
|
|
1787
|
+
# Remove from active_jobs and priority lane tracking (locked)
|
|
1788
|
+
locked_state_update --argjson num "$issue_num" \
|
|
1789
|
+
'.active_jobs = [.active_jobs[] | select(.issue != $num)]'
|
|
1790
|
+
untrack_priority_job "$issue_num"
|
|
1791
|
+
|
|
1792
|
+
# Clean up worktree (skip for org-mode clones — they persist)
|
|
1793
|
+
local job_repo
|
|
1794
|
+
job_repo=$(echo "$job" | jq -r '.repo // ""')
|
|
1795
|
+
if [[ -z "$job_repo" ]] && [[ -d "$worktree" ]]; then
|
|
1796
|
+
git worktree remove "$worktree" --force 2>/dev/null || true
|
|
1797
|
+
daemon_log INFO "Cleaned worktree: $worktree"
|
|
1798
|
+
git branch -D "daemon/issue-${issue_num}" 2>/dev/null || true
|
|
1799
|
+
elif [[ -n "$job_repo" ]]; then
|
|
1800
|
+
daemon_log INFO "Org-mode: preserving clone for ${job_repo}"
|
|
1801
|
+
fi
|
|
1802
|
+
fi
|
|
1803
|
+
|
|
1804
|
+
# Dequeue next issue if available AND we have capacity
|
|
1805
|
+
# NOTE: locked_get_active_count prevents TOCTOU race with the
|
|
1806
|
+
# active_jobs removal above. A tiny window remains between
|
|
1807
|
+
# the count read and dequeue_next's own lock acquisition, but
|
|
1808
|
+
# dequeue_next is itself locked, so the worst case is a
|
|
1809
|
+
# missed dequeue that the next poll cycle will pick up.
|
|
1810
|
+
local current_active
|
|
1811
|
+
current_active=$(locked_get_active_count)
|
|
1812
|
+
if [[ "$current_active" -lt "$MAX_PARALLEL" ]]; then
|
|
1813
|
+
local next_issue
|
|
1814
|
+
next_issue=$(dequeue_next)
|
|
1815
|
+
if [[ -n "$next_issue" ]]; then
|
|
1816
|
+
local next_title
|
|
1817
|
+
next_title=$(jq -r --arg n "$next_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)
|
|
1818
|
+
daemon_log INFO "Dequeuing issue #${next_issue}: ${next_title}"
|
|
1819
|
+
daemon_spawn_pipeline "$next_issue" "$next_title"
|
|
1820
|
+
fi
|
|
910
1821
|
fi
|
|
911
1822
|
done <<< "$jobs"
|
|
912
1823
|
}
|
|
@@ -918,9 +1829,23 @@ daemon_on_success() {
|
|
|
918
1829
|
|
|
919
1830
|
daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
|
|
920
1831
|
|
|
921
|
-
# Record
|
|
922
|
-
|
|
923
|
-
|
|
1832
|
+
# Record pipeline duration for adaptive threshold learning
|
|
1833
|
+
if [[ -n "$duration" && "$duration" != "unknown" ]]; then
|
|
1834
|
+
# Parse duration string back to seconds (e.g. "5m 30s" → 330)
|
|
1835
|
+
local dur_secs=0
|
|
1836
|
+
local _h _m _s
|
|
1837
|
+
_h=$(echo "$duration" | grep -oE '[0-9]+h' | grep -oE '[0-9]+' || true)
|
|
1838
|
+
_m=$(echo "$duration" | grep -oE '[0-9]+m' | grep -oE '[0-9]+' || true)
|
|
1839
|
+
_s=$(echo "$duration" | grep -oE '[0-9]+s' | grep -oE '[0-9]+' || true)
|
|
1840
|
+
dur_secs=$(( ${_h:-0} * 3600 + ${_m:-0} * 60 + ${_s:-0} ))
|
|
1841
|
+
if [[ "$dur_secs" -gt 0 ]]; then
|
|
1842
|
+
record_pipeline_duration "$PIPELINE_TEMPLATE" "$dur_secs" "success"
|
|
1843
|
+
record_scaling_outcome "$MAX_PARALLEL" "success"
|
|
1844
|
+
fi
|
|
1845
|
+
fi
|
|
1846
|
+
|
|
1847
|
+
# Record in completed list + clear retry count for this issue
|
|
1848
|
+
locked_state_update \
|
|
924
1849
|
--argjson num "$issue_num" \
|
|
925
1850
|
--arg result "success" \
|
|
926
1851
|
--arg dur "${duration:-unknown}" \
|
|
@@ -930,9 +1855,8 @@ daemon_on_success() {
|
|
|
930
1855
|
result: $result,
|
|
931
1856
|
duration: $dur,
|
|
932
1857
|
completed_at: $completed_at
|
|
933
|
-
}]
|
|
934
|
-
|
|
935
|
-
atomic_write_state "$tmp"
|
|
1858
|
+
}] | .completed = .completed[-500:]
|
|
1859
|
+
| del(.retry_counts[($num | tostring)])'
|
|
936
1860
|
|
|
937
1861
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
938
1862
|
# Remove watch label, add success label
|
|
@@ -960,6 +1884,7 @@ Check the associated PR for the implementation." 2>/dev/null || true
|
|
|
960
1884
|
|
|
961
1885
|
notify "Pipeline Complete — Issue #${issue_num}" \
|
|
962
1886
|
"Duration: ${duration:-unknown}" "success"
|
|
1887
|
+
"$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
|
|
963
1888
|
}
|
|
964
1889
|
|
|
965
1890
|
# ─── Failure Handler ────────────────────────────────────────────────────────
|
|
@@ -969,9 +1894,22 @@ daemon_on_failure() {
|
|
|
969
1894
|
|
|
970
1895
|
daemon_log ERROR "Pipeline failed for issue #${issue_num} (exit: ${exit_code}, ${duration:-unknown})"
|
|
971
1896
|
|
|
1897
|
+
# Record pipeline duration for adaptive threshold learning
|
|
1898
|
+
if [[ -n "$duration" && "$duration" != "unknown" ]]; then
|
|
1899
|
+
local dur_secs=0
|
|
1900
|
+
local _h _m _s
|
|
1901
|
+
_h=$(echo "$duration" | grep -oE '[0-9]+h' | grep -oE '[0-9]+' || true)
|
|
1902
|
+
_m=$(echo "$duration" | grep -oE '[0-9]+m' | grep -oE '[0-9]+' || true)
|
|
1903
|
+
_s=$(echo "$duration" | grep -oE '[0-9]+s' | grep -oE '[0-9]+' || true)
|
|
1904
|
+
dur_secs=$(( ${_h:-0} * 3600 + ${_m:-0} * 60 + ${_s:-0} ))
|
|
1905
|
+
if [[ "$dur_secs" -gt 0 ]]; then
|
|
1906
|
+
record_pipeline_duration "$PIPELINE_TEMPLATE" "$dur_secs" "failure"
|
|
1907
|
+
record_scaling_outcome "$MAX_PARALLEL" "failure"
|
|
1908
|
+
fi
|
|
1909
|
+
fi
|
|
1910
|
+
|
|
972
1911
|
# Record in completed list
|
|
973
|
-
|
|
974
|
-
tmp=$(jq \
|
|
1912
|
+
locked_state_update \
|
|
975
1913
|
--argjson num "$issue_num" \
|
|
976
1914
|
--arg result "failed" \
|
|
977
1915
|
--argjson code "$exit_code" \
|
|
@@ -983,9 +1921,7 @@ daemon_on_failure() {
|
|
|
983
1921
|
exit_code: $code,
|
|
984
1922
|
duration: $dur,
|
|
985
1923
|
completed_at: $completed_at
|
|
986
|
-
}]'
|
|
987
|
-
"$STATE_FILE")
|
|
988
|
-
atomic_write_state "$tmp"
|
|
1924
|
+
}] | .completed = .completed[-500:]'
|
|
989
1925
|
|
|
990
1926
|
# ── Auto-retry with strategy escalation ──
|
|
991
1927
|
if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
|
|
@@ -996,15 +1932,32 @@ daemon_on_failure() {
|
|
|
996
1932
|
if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
|
|
997
1933
|
retry_count=$((retry_count + 1))
|
|
998
1934
|
|
|
999
|
-
# Update retry count in state
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
'.retry_counts[$num] = $count'
|
|
1003
|
-
atomic_write_state "$tmp_state"
|
|
1935
|
+
# Update retry count in state (locked to prevent race)
|
|
1936
|
+
locked_state_update \
|
|
1937
|
+
--arg num "$issue_num" --argjson count "$retry_count" \
|
|
1938
|
+
'.retry_counts[$num] = $count'
|
|
1004
1939
|
|
|
1005
1940
|
daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
|
|
1006
1941
|
emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
|
|
1007
1942
|
|
|
1943
|
+
# Check for checkpoint to enable resume-from-checkpoint
|
|
1944
|
+
local checkpoint_args=()
|
|
1945
|
+
if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
|
|
1946
|
+
# Try to find worktree for this issue to check for checkpoints
|
|
1947
|
+
local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
|
|
1948
|
+
if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
|
|
1949
|
+
local latest_checkpoint=""
|
|
1950
|
+
for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
|
|
1951
|
+
[[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
|
|
1952
|
+
done
|
|
1953
|
+
if [[ -n "$latest_checkpoint" ]]; then
|
|
1954
|
+
daemon_log INFO "Found checkpoint: $latest_checkpoint"
|
|
1955
|
+
emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
|
|
1956
|
+
checkpoint_args+=("--resume")
|
|
1957
|
+
fi
|
|
1958
|
+
fi
|
|
1959
|
+
fi
|
|
1960
|
+
|
|
1008
1961
|
# Build escalated pipeline args
|
|
1009
1962
|
local retry_template="$PIPELINE_TEMPLATE"
|
|
1010
1963
|
local retry_model="${MODEL:-opus}"
|
|
@@ -1038,12 +1991,18 @@ Pipeline failed — retrying with escalated strategy.
|
|
|
1038
1991
|
_Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
|
|
1039
1992
|
fi
|
|
1040
1993
|
|
|
1994
|
+
# Backoff before retry: 30s * retry_count (30s, 60s, ...)
|
|
1995
|
+
local backoff_secs=$((30 * retry_count))
|
|
1996
|
+
daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
|
|
1997
|
+
sleep "$backoff_secs"
|
|
1998
|
+
|
|
1041
1999
|
# Re-spawn with escalated strategy
|
|
1042
2000
|
local orig_template="$PIPELINE_TEMPLATE"
|
|
1043
2001
|
local orig_model="$MODEL"
|
|
1044
2002
|
PIPELINE_TEMPLATE="$retry_template"
|
|
1045
2003
|
MODEL="$retry_model"
|
|
1046
2004
|
daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
|
|
2005
|
+
_retry_spawned_for="$issue_num"
|
|
1047
2006
|
PIPELINE_TEMPLATE="$orig_template"
|
|
1048
2007
|
MODEL="$orig_model"
|
|
1049
2008
|
return
|
|
@@ -1099,18 +2058,66 @@ _Re-add the \`${WATCH_LABEL}\` label to retry._" 2>/dev/null || true
|
|
|
1099
2058
|
|
|
1100
2059
|
notify "Pipeline Failed — Issue #${issue_num}" \
|
|
1101
2060
|
"Exit code: ${exit_code}, Duration: ${duration:-unknown}" "error"
|
|
2061
|
+
"$SCRIPT_DIR/sw-tracker.sh" notify "failed" "$issue_num" "Exit code: ${exit_code}, Duration: ${duration:-unknown}" 2>/dev/null || true
|
|
1102
2062
|
}
|
|
1103
2063
|
|
|
1104
2064
|
# ─── Intelligent Triage ──────────────────────────────────────────────────────
|
|
1105
2065
|
|
|
1106
2066
|
# Score an issue from 0-100 based on multiple signals for intelligent prioritization.
|
|
1107
2067
|
# Combines priority labels, age, complexity, dependencies, type, and memory signals.
|
|
2068
|
+
# When intelligence engine is enabled, uses semantic AI analysis for richer scoring.
|
|
1108
2069
|
triage_score_issue() {
|
|
1109
2070
|
local issue_json="$1"
|
|
1110
2071
|
local issue_num issue_title issue_body labels_csv created_at
|
|
1111
2072
|
issue_num=$(echo "$issue_json" | jq -r '.number')
|
|
1112
2073
|
issue_title=$(echo "$issue_json" | jq -r '.title // ""')
|
|
1113
2074
|
issue_body=$(echo "$issue_json" | jq -r '.body // ""')
|
|
2075
|
+
|
|
2076
|
+
# ── Intelligence-powered triage (if enabled) ──
|
|
2077
|
+
if [[ "${INTELLIGENCE_ENABLED:-false}" == "true" ]] && type intelligence_analyze_issue &>/dev/null 2>&1; then
|
|
2078
|
+
daemon_log INFO "Intelligence: using AI triage (intelligence enabled)"
|
|
2079
|
+
local analysis
|
|
2080
|
+
analysis=$(intelligence_analyze_issue "$issue_json" 2>/dev/null || echo "")
|
|
2081
|
+
if [[ -n "$analysis" && "$analysis" != "{}" && "$analysis" != "null" ]]; then
|
|
2082
|
+
# Extract complexity (1-10) and convert to score (0-100)
|
|
2083
|
+
local ai_complexity ai_risk ai_success_prob
|
|
2084
|
+
ai_complexity=$(echo "$analysis" | jq -r '.complexity // 0' 2>/dev/null || echo "0")
|
|
2085
|
+
ai_risk=$(echo "$analysis" | jq -r '.risk_level // "medium"' 2>/dev/null || echo "medium")
|
|
2086
|
+
ai_success_prob=$(echo "$analysis" | jq -r '.success_probability // 50' 2>/dev/null || echo "50")
|
|
2087
|
+
|
|
2088
|
+
# Store analysis for downstream use (composer, predictions)
|
|
2089
|
+
export INTELLIGENCE_ANALYSIS="$analysis"
|
|
2090
|
+
export INTELLIGENCE_COMPLEXITY="$ai_complexity"
|
|
2091
|
+
|
|
2092
|
+
# Convert AI analysis to triage score:
|
|
2093
|
+
# Higher success probability + lower complexity = higher score (process sooner)
|
|
2094
|
+
local ai_score
|
|
2095
|
+
ai_score=$(( ai_success_prob - (ai_complexity * 3) ))
|
|
2096
|
+
# Risk adjustment
|
|
2097
|
+
case "$ai_risk" in
|
|
2098
|
+
critical) ai_score=$((ai_score + 15)) ;; # Critical = process urgently
|
|
2099
|
+
high) ai_score=$((ai_score + 10)) ;;
|
|
2100
|
+
low) ai_score=$((ai_score - 5)) ;;
|
|
2101
|
+
esac
|
|
2102
|
+
# Clamp
|
|
2103
|
+
[[ "$ai_score" -lt 0 ]] && ai_score=0
|
|
2104
|
+
[[ "$ai_score" -gt 100 ]] && ai_score=100
|
|
2105
|
+
|
|
2106
|
+
emit_event "intelligence.triage" \
|
|
2107
|
+
"issue=$issue_num" \
|
|
2108
|
+
"complexity=$ai_complexity" \
|
|
2109
|
+
"risk=$ai_risk" \
|
|
2110
|
+
"success_prob=$ai_success_prob" \
|
|
2111
|
+
"score=$ai_score"
|
|
2112
|
+
|
|
2113
|
+
echo "$ai_score"
|
|
2114
|
+
return
|
|
2115
|
+
fi
|
|
2116
|
+
# Fall through to heuristic scoring if intelligence call failed
|
|
2117
|
+
daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring"
|
|
2118
|
+
else
|
|
2119
|
+
daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)"
|
|
2120
|
+
fi
|
|
1114
2121
|
labels_csv=$(echo "$issue_json" | jq -r '[.labels[].name] | join(",")')
|
|
1115
2122
|
created_at=$(echo "$issue_json" | jq -r '.createdAt // ""')
|
|
1116
2123
|
|
|
@@ -1211,9 +2218,9 @@ triage_score_issue() {
|
|
|
1211
2218
|
|
|
1212
2219
|
# ── 6. Memory bonus (0-10 points / -5 for prior failures) ──
|
|
1213
2220
|
local memory_score=0
|
|
1214
|
-
if [[ -x "$SCRIPT_DIR/
|
|
2221
|
+
if [[ -x "$SCRIPT_DIR/sw-memory.sh" ]]; then
|
|
1215
2222
|
local memory_result
|
|
1216
|
-
memory_result=$("$SCRIPT_DIR/
|
|
2223
|
+
memory_result=$("$SCRIPT_DIR/sw-memory.sh" search --issue "$issue_num" --json 2>/dev/null || true)
|
|
1217
2224
|
if [[ -n "$memory_result" ]]; then
|
|
1218
2225
|
local prior_result
|
|
1219
2226
|
prior_result=$(echo "$memory_result" | jq -r '.last_result // ""' 2>/dev/null || true)
|
|
@@ -1245,6 +2252,7 @@ triage_score_issue() {
|
|
|
1245
2252
|
}
|
|
1246
2253
|
|
|
1247
2254
|
# Auto-select pipeline template based on issue labels
|
|
2255
|
+
# When intelligence/composer is enabled, composes a custom pipeline instead of static selection.
|
|
1248
2256
|
select_pipeline_template() {
|
|
1249
2257
|
local labels="$1"
|
|
1250
2258
|
local score="${2:-50}"
|
|
@@ -1255,7 +2263,57 @@ select_pipeline_template() {
|
|
|
1255
2263
|
return
|
|
1256
2264
|
fi
|
|
1257
2265
|
|
|
1258
|
-
# ──
|
|
2266
|
+
# ── Intelligence-composed pipeline (if enabled) ──
|
|
2267
|
+
if [[ "${COMPOSER_ENABLED:-false}" == "true" ]] && type composer_create_pipeline &>/dev/null 2>&1; then
|
|
2268
|
+
daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)"
|
|
2269
|
+
local analysis="${INTELLIGENCE_ANALYSIS:-{}}"
|
|
2270
|
+
local repo_context=""
|
|
2271
|
+
if [[ -f "${REPO_DIR:-}/.claude/pipeline-state.md" ]]; then
|
|
2272
|
+
repo_context="has_pipeline_state"
|
|
2273
|
+
fi
|
|
2274
|
+
local budget_json="{}"
|
|
2275
|
+
if [[ -x "$SCRIPT_DIR/sw-cost.sh" ]]; then
|
|
2276
|
+
local remaining
|
|
2277
|
+
remaining=$(bash "$SCRIPT_DIR/sw-cost.sh" remaining-budget 2>/dev/null || echo "")
|
|
2278
|
+
if [[ -n "$remaining" ]]; then
|
|
2279
|
+
budget_json="{\"remaining_usd\": $remaining}"
|
|
2280
|
+
fi
|
|
2281
|
+
fi
|
|
2282
|
+
local composed_path
|
|
2283
|
+
composed_path=$(composer_create_pipeline "$analysis" "$repo_context" "$budget_json" 2>/dev/null || echo "")
|
|
2284
|
+
if [[ -n "$composed_path" && -f "$composed_path" ]]; then
|
|
2285
|
+
emit_event "daemon.composed_pipeline" "labels=$labels" "score=$score"
|
|
2286
|
+
echo "composed"
|
|
2287
|
+
return
|
|
2288
|
+
fi
|
|
2289
|
+
# Fall through to static selection if composition failed
|
|
2290
|
+
daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection"
|
|
2291
|
+
else
|
|
2292
|
+
daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)"
|
|
2293
|
+
fi
|
|
2294
|
+
|
|
2295
|
+
# ── Branch protection escalation (highest priority) ──
|
|
2296
|
+
if type gh_branch_protection &>/dev/null 2>&1 && [[ "${NO_GITHUB:-false}" != "true" ]]; then
|
|
2297
|
+
if type _gh_detect_repo &>/dev/null 2>&1; then
|
|
2298
|
+
_gh_detect_repo 2>/dev/null || true
|
|
2299
|
+
fi
|
|
2300
|
+
local gh_owner="${GH_OWNER:-}" gh_repo="${GH_REPO:-}"
|
|
2301
|
+
if [[ -n "$gh_owner" && -n "$gh_repo" ]]; then
|
|
2302
|
+
local protection
|
|
2303
|
+
protection=$(gh_branch_protection "$gh_owner" "$gh_repo" "${BASE_BRANCH:-main}" 2>/dev/null || echo '{"protected": false}')
|
|
2304
|
+
local strict_protection
|
|
2305
|
+
strict_protection=$(echo "$protection" | jq -r '.enforce_admins.enabled // false' 2>/dev/null || echo "false")
|
|
2306
|
+
local required_reviews
|
|
2307
|
+
required_reviews=$(echo "$protection" | jq -r '.required_pull_request_reviews.required_approving_review_count // 0' 2>/dev/null || echo "0")
|
|
2308
|
+
if [[ "$strict_protection" == "true" ]] || [[ "${required_reviews:-0}" -gt 1 ]]; then
|
|
2309
|
+
daemon_log INFO "Branch has strict protection — escalating to enterprise template"
|
|
2310
|
+
echo "enterprise"
|
|
2311
|
+
return
|
|
2312
|
+
fi
|
|
2313
|
+
fi
|
|
2314
|
+
fi
|
|
2315
|
+
|
|
2316
|
+
# ── Label-based overrides ──
|
|
1259
2317
|
if echo "$labels" | grep -qi "hotfix\|incident"; then
|
|
1260
2318
|
echo "hotfix"
|
|
1261
2319
|
return
|
|
@@ -1363,6 +2421,16 @@ daemon_triage_show() {
|
|
|
1363
2421
|
echo ""
|
|
1364
2422
|
}
|
|
1365
2423
|
|
|
2424
|
+
# ─── Patrol Self-Labeling ─────────────────────────────────────────────────
|
|
2425
|
+
patrol_build_labels() {
|
|
2426
|
+
local check_label="$1"
|
|
2427
|
+
local labels="${PATROL_LABEL},${check_label}"
|
|
2428
|
+
if [[ "$PATROL_AUTO_WATCH" == "true" && -n "${WATCH_LABEL:-}" ]]; then
|
|
2429
|
+
labels="${labels},${WATCH_LABEL}"
|
|
2430
|
+
fi
|
|
2431
|
+
echo "$labels"
|
|
2432
|
+
}
|
|
2433
|
+
|
|
1366
2434
|
# ─── Proactive Patrol Mode ───────────────────────────────────────────────────
|
|
1367
2435
|
|
|
1368
2436
|
daemon_patrol() {
|
|
@@ -1413,7 +2481,7 @@ daemon_patrol() {
|
|
|
1413
2481
|
fi
|
|
1414
2482
|
|
|
1415
2483
|
findings=$((findings + 1))
|
|
1416
|
-
emit_event "patrol.finding" "
|
|
2484
|
+
emit_event "patrol.finding" "check=security" "severity=$severity" "package=$name"
|
|
1417
2485
|
|
|
1418
2486
|
# Check if issue already exists
|
|
1419
2487
|
if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
|
|
@@ -1434,9 +2502,9 @@ daemon_patrol() {
|
|
|
1434
2502
|
| Date | $(now_iso) |
|
|
1435
2503
|
|
|
1436
2504
|
Auto-detected by \`shipwright daemon patrol\`." \
|
|
1437
|
-
--label "security"
|
|
2505
|
+
--label "$(patrol_build_labels "security")" 2>/dev/null || true
|
|
1438
2506
|
issues_created=$((issues_created + 1))
|
|
1439
|
-
emit_event "patrol.issue_created" "
|
|
2507
|
+
emit_event "patrol.issue_created" "check=security" "package=$name"
|
|
1440
2508
|
fi
|
|
1441
2509
|
else
|
|
1442
2510
|
echo -e " ${RED}●${RESET} ${BOLD}${severity}${RESET}: ${title} in ${CYAN}${name}${RESET}"
|
|
@@ -1467,6 +2535,39 @@ Auto-detected by \`shipwright daemon patrol\`." \
|
|
|
1467
2535
|
fi
|
|
1468
2536
|
fi
|
|
1469
2537
|
|
|
2538
|
+
# Enrich with GitHub security alerts
|
|
2539
|
+
if type gh_security_alerts &>/dev/null 2>&1 && [[ "${NO_GITHUB:-false}" != "true" ]]; then
|
|
2540
|
+
if type _gh_detect_repo &>/dev/null 2>&1; then
|
|
2541
|
+
_gh_detect_repo 2>/dev/null || true
|
|
2542
|
+
fi
|
|
2543
|
+
local gh_owner="${GH_OWNER:-}" gh_repo="${GH_REPO:-}"
|
|
2544
|
+
if [[ -n "$gh_owner" && -n "$gh_repo" ]]; then
|
|
2545
|
+
local gh_alerts
|
|
2546
|
+
gh_alerts=$(gh_security_alerts "$gh_owner" "$gh_repo" 2>/dev/null || echo "[]")
|
|
2547
|
+
local gh_alert_count
|
|
2548
|
+
gh_alert_count=$(echo "$gh_alerts" | jq 'length' 2>/dev/null || echo "0")
|
|
2549
|
+
if [[ "${gh_alert_count:-0}" -gt 0 ]]; then
|
|
2550
|
+
daemon_log WARN "Patrol: $gh_alert_count GitHub security alert(s) found"
|
|
2551
|
+
findings=$((findings + gh_alert_count))
|
|
2552
|
+
fi
|
|
2553
|
+
fi
|
|
2554
|
+
fi
|
|
2555
|
+
|
|
2556
|
+
# Enrich with GitHub Dependabot alerts
|
|
2557
|
+
if type gh_dependabot_alerts &>/dev/null 2>&1 && [[ "${NO_GITHUB:-false}" != "true" ]]; then
|
|
2558
|
+
local gh_owner="${GH_OWNER:-}" gh_repo="${GH_REPO:-}"
|
|
2559
|
+
if [[ -n "$gh_owner" && -n "$gh_repo" ]]; then
|
|
2560
|
+
local dep_alerts
|
|
2561
|
+
dep_alerts=$(gh_dependabot_alerts "$gh_owner" "$gh_repo" 2>/dev/null || echo "[]")
|
|
2562
|
+
local dep_alert_count
|
|
2563
|
+
dep_alert_count=$(echo "$dep_alerts" | jq 'length' 2>/dev/null || echo "0")
|
|
2564
|
+
if [[ "${dep_alert_count:-0}" -gt 0 ]]; then
|
|
2565
|
+
daemon_log WARN "Patrol: $dep_alert_count Dependabot alert(s) found"
|
|
2566
|
+
findings=$((findings + dep_alert_count))
|
|
2567
|
+
fi
|
|
2568
|
+
fi
|
|
2569
|
+
fi
|
|
2570
|
+
|
|
1470
2571
|
total_findings=$((total_findings + findings))
|
|
1471
2572
|
if [[ "$findings" -gt 0 ]]; then
|
|
1472
2573
|
daemon_log INFO "Patrol: found ${findings} security vulnerability(ies)"
|
|
@@ -1499,7 +2600,7 @@ Auto-detected by \`shipwright daemon patrol\`." \
|
|
|
1499
2600
|
if [[ "$diff" -ge 2 ]]; then
|
|
1500
2601
|
findings=$((findings + 1))
|
|
1501
2602
|
stale_packages="${stale_packages}\n- \`${name}\`: ${current} → ${latest} (${diff} major versions behind)"
|
|
1502
|
-
emit_event "patrol.finding" "
|
|
2603
|
+
emit_event "patrol.finding" "check=stale_dependency" "package=$name" "current=$current" "latest=$latest"
|
|
1503
2604
|
|
|
1504
2605
|
if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
|
|
1505
2606
|
echo -e " ${YELLOW}●${RESET} ${CYAN}${name}${RESET}: ${current} → ${latest} (${diff} major versions behind)"
|
|
@@ -1522,9 +2623,9 @@ The following packages are 2+ major versions behind:
|
|
|
1522
2623
|
$(echo -e "$stale_packages")
|
|
1523
2624
|
|
|
1524
2625
|
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
1525
|
-
--label "dependencies"
|
|
2626
|
+
--label "$(patrol_build_labels "dependencies")" 2>/dev/null || true
|
|
1526
2627
|
issues_created=$((issues_created + 1))
|
|
1527
|
-
emit_event "patrol.issue_created" "
|
|
2628
|
+
emit_event "patrol.issue_created" "check=stale_dependency" "count=$findings"
|
|
1528
2629
|
fi
|
|
1529
2630
|
fi
|
|
1530
2631
|
fi
|
|
@@ -1586,9 +2687,9 @@ $(echo -e "$dead_files")
|
|
|
1586
2687
|
> **Note:** Some files may be entry points or dynamically loaded. Verify before removing.
|
|
1587
2688
|
|
|
1588
2689
|
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
1589
|
-
--label "tech-debt"
|
|
2690
|
+
--label "$(patrol_build_labels "tech-debt")" 2>/dev/null || true
|
|
1590
2691
|
issues_created=$((issues_created + 1))
|
|
1591
|
-
emit_event "patrol.issue_created" "
|
|
2692
|
+
emit_event "patrol.issue_created" "check=dead_code" "count=$findings"
|
|
1592
2693
|
fi
|
|
1593
2694
|
fi
|
|
1594
2695
|
|
|
@@ -1649,9 +2750,9 @@ These files have < 50% line coverage:
|
|
|
1649
2750
|
$(echo -e "$low_cov_files")
|
|
1650
2751
|
|
|
1651
2752
|
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
1652
|
-
--label "testing"
|
|
2753
|
+
--label "$(patrol_build_labels "testing")" 2>/dev/null || true
|
|
1653
2754
|
issues_created=$((issues_created + 1))
|
|
1654
|
-
emit_event "patrol.issue_created" "
|
|
2755
|
+
emit_event "patrol.issue_created" "check=coverage" "count=$findings"
|
|
1655
2756
|
fi
|
|
1656
2757
|
fi
|
|
1657
2758
|
|
|
@@ -1694,9 +2795,49 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
|
1694
2795
|
tag_epoch=$(git log -1 --format=%ct "$latest_tag" 2>/dev/null || echo "0")
|
|
1695
2796
|
if [[ "$tag_epoch" -gt "$changelog_epoch" ]] && [[ "$changelog_epoch" -gt 0 ]]; then
|
|
1696
2797
|
findings=$((findings + 1))
|
|
1697
|
-
stale_docs="${stale_docs}\n- \`CHANGELOG.md\`: not updated since tag \`${latest_tag}\`"
|
|
2798
|
+
stale_docs="${stale_docs}\n- \`CHANGELOG.md\`: not updated since tag \`${latest_tag}\`"
|
|
2799
|
+
if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
|
|
2800
|
+
echo -e " ${YELLOW}●${RESET} CHANGELOG.md not updated since ${latest_tag}"
|
|
2801
|
+
fi
|
|
2802
|
+
fi
|
|
2803
|
+
fi
|
|
2804
|
+
fi
|
|
2805
|
+
|
|
2806
|
+
# Check CLAUDE.md staleness (same pattern as README)
|
|
2807
|
+
if [[ -f ".claude/CLAUDE.md" ]]; then
|
|
2808
|
+
local claudemd_epoch claudemd_src_epoch
|
|
2809
|
+
claudemd_src_epoch=$(git log -1 --format=%ct -- "*.ts" "*.js" "*.py" "*.go" "*.rs" "*.sh" 2>/dev/null || echo "0")
|
|
2810
|
+
claudemd_epoch=$(git log -1 --format=%ct -- ".claude/CLAUDE.md" 2>/dev/null || echo "0")
|
|
2811
|
+
if [[ "$claudemd_src_epoch" -gt 0 ]] && [[ "$claudemd_epoch" -gt 0 ]]; then
|
|
2812
|
+
local claude_drift=$((claudemd_src_epoch - claudemd_epoch))
|
|
2813
|
+
if [[ "$claude_drift" -gt 2592000 ]]; then
|
|
2814
|
+
findings=$((findings + 1))
|
|
2815
|
+
local claude_days_behind=$((claude_drift / 86400))
|
|
2816
|
+
stale_docs="${stale_docs}\n- \`.claude/CLAUDE.md\`: ${claude_days_behind} days behind source code"
|
|
1698
2817
|
if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
|
|
1699
|
-
echo -e " ${YELLOW}●${RESET}
|
|
2818
|
+
echo -e " ${YELLOW}●${RESET} CLAUDE.md is ${claude_days_behind} days behind source code"
|
|
2819
|
+
fi
|
|
2820
|
+
fi
|
|
2821
|
+
fi
|
|
2822
|
+
fi
|
|
2823
|
+
|
|
2824
|
+
# Check AUTO section freshness (if sw-docs.sh available)
|
|
2825
|
+
if [[ -x "$SCRIPT_DIR/sw-docs.sh" ]]; then
|
|
2826
|
+
local docs_stale=false
|
|
2827
|
+
bash "$SCRIPT_DIR/sw-docs.sh" check >/dev/null 2>&1 || docs_stale=true
|
|
2828
|
+
if [[ "$docs_stale" == "true" ]]; then
|
|
2829
|
+
findings=$((findings + 1))
|
|
2830
|
+
stale_docs="${stale_docs}\n- AUTO sections: some documentation sections are stale"
|
|
2831
|
+
if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
|
|
2832
|
+
echo -e " ${YELLOW}●${RESET} AUTO documentation sections are stale"
|
|
2833
|
+
fi
|
|
2834
|
+
# Auto-sync if not dry run
|
|
2835
|
+
if [[ "$dry_run" != "true" ]] && [[ "$NO_GITHUB" != "true" ]]; then
|
|
2836
|
+
daemon_log INFO "Auto-syncing stale documentation sections"
|
|
2837
|
+
bash "$SCRIPT_DIR/sw-docs.sh" sync 2>/dev/null || true
|
|
2838
|
+
if ! git diff --quiet -- '*.md' 2>/dev/null; then
|
|
2839
|
+
git add -A '*.md' 2>/dev/null || true
|
|
2840
|
+
git commit -m "docs: auto-sync stale documentation sections" 2>/dev/null || true
|
|
1700
2841
|
fi
|
|
1701
2842
|
fi
|
|
1702
2843
|
fi
|
|
@@ -1715,9 +2856,9 @@ The following docs may need updating:
|
|
|
1715
2856
|
$(echo -e "$stale_docs")
|
|
1716
2857
|
|
|
1717
2858
|
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
1718
|
-
--label "documentation"
|
|
2859
|
+
--label "$(patrol_build_labels "documentation")" 2>/dev/null || true
|
|
1719
2860
|
issues_created=$((issues_created + 1))
|
|
1720
|
-
emit_event "patrol.issue_created" "
|
|
2861
|
+
emit_event "patrol.issue_created" "check=documentation" "count=$findings"
|
|
1721
2862
|
fi
|
|
1722
2863
|
fi
|
|
1723
2864
|
|
|
@@ -1754,7 +2895,7 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
|
1754
2895
|
if [[ "$recent_test_dur" -gt "$threshold" ]]; then
|
|
1755
2896
|
total_findings=$((total_findings + 1))
|
|
1756
2897
|
local pct_slower=$(( (recent_test_dur - baseline_dur) * 100 / baseline_dur ))
|
|
1757
|
-
emit_event "patrol.finding" "
|
|
2898
|
+
emit_event "patrol.finding" "check=performance" "baseline=${baseline_dur}s" "current=${recent_test_dur}s" "regression=${pct_slower}%"
|
|
1758
2899
|
|
|
1759
2900
|
if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
|
|
1760
2901
|
echo -e " ${RED}●${RESET} Test suite ${pct_slower}% slower than baseline (${baseline_dur}s → ${recent_test_dur}s)"
|
|
@@ -1774,9 +2915,9 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
|
1774
2915
|
| Regression | ${pct_slower}% |
|
|
1775
2916
|
|
|
1776
2917
|
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
1777
|
-
--label "performance"
|
|
2918
|
+
--label "$(patrol_build_labels "performance")" 2>/dev/null || true
|
|
1778
2919
|
issues_created=$((issues_created + 1))
|
|
1779
|
-
emit_event "patrol.issue_created" "
|
|
2920
|
+
emit_event "patrol.issue_created" "check=performance"
|
|
1780
2921
|
fi
|
|
1781
2922
|
fi
|
|
1782
2923
|
|
|
@@ -1792,31 +2933,557 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
|
1792
2933
|
daemon_log INFO "Patrol: performance baseline updated (${recent_test_dur}s)"
|
|
1793
2934
|
}
|
|
1794
2935
|
|
|
1795
|
-
# ──
|
|
2936
|
+
# ── 7. Recurring Failure Patterns ──
|
|
2937
|
+
patrol_recurring_failures() {
|
|
2938
|
+
if [[ "$PATROL_FAILURES_THRESHOLD" -le 0 ]]; then return; fi
|
|
2939
|
+
daemon_log INFO "Patrol: checking recurring failure patterns"
|
|
2940
|
+
local findings=0
|
|
2941
|
+
|
|
2942
|
+
# Source memory functions if available
|
|
2943
|
+
local memory_script="$SCRIPT_DIR/sw-memory.sh"
|
|
2944
|
+
if [[ ! -f "$memory_script" ]]; then
|
|
2945
|
+
daemon_log INFO "Patrol: memory script not found — skipping recurring failures"
|
|
2946
|
+
return
|
|
2947
|
+
fi
|
|
2948
|
+
|
|
2949
|
+
# Get actionable failures from memory
|
|
2950
|
+
# Note: sw-memory.sh runs its CLI router on source, so we must redirect
|
|
2951
|
+
# the source's stdout to /dev/null and only capture the function's output
|
|
2952
|
+
local failures_json
|
|
2953
|
+
failures_json=$(
|
|
2954
|
+
(
|
|
2955
|
+
source "$memory_script" > /dev/null 2>&1 || true
|
|
2956
|
+
if command -v memory_get_actionable_failures &>/dev/null; then
|
|
2957
|
+
memory_get_actionable_failures "$PATROL_FAILURES_THRESHOLD"
|
|
2958
|
+
else
|
|
2959
|
+
echo "[]"
|
|
2960
|
+
fi
|
|
2961
|
+
)
|
|
2962
|
+
)
|
|
2963
|
+
|
|
2964
|
+
local count
|
|
2965
|
+
count=$(echo "$failures_json" | jq 'length' 2>/dev/null || echo "0")
|
|
2966
|
+
if [[ "${count:-0}" -eq 0 ]]; then
|
|
2967
|
+
daemon_log INFO "Patrol: no recurring failures above threshold ($PATROL_FAILURES_THRESHOLD)"
|
|
2968
|
+
return
|
|
2969
|
+
fi
|
|
2970
|
+
|
|
2971
|
+
while IFS= read -r failure; do
|
|
2972
|
+
local pattern stage seen_count last_seen root_cause
|
|
2973
|
+
pattern=$(echo "$failure" | jq -r '.pattern // "unknown"')
|
|
2974
|
+
stage=$(echo "$failure" | jq -r '.stage // "unknown"')
|
|
2975
|
+
seen_count=$(echo "$failure" | jq -r '.seen_count // 0')
|
|
2976
|
+
last_seen=$(echo "$failure" | jq -r '.last_seen // "unknown"')
|
|
2977
|
+
root_cause=$(echo "$failure" | jq -r '.root_cause // "Not yet identified"')
|
|
2978
|
+
|
|
2979
|
+
# Truncate pattern for title (first 60 chars)
|
|
2980
|
+
local short_pattern
|
|
2981
|
+
short_pattern=$(echo "$pattern" | cut -c1-60)
|
|
2982
|
+
|
|
2983
|
+
findings=$((findings + 1))
|
|
2984
|
+
emit_event "patrol.finding" "check=recurring_failure" "pattern=$short_pattern" "seen_count=$seen_count"
|
|
2985
|
+
|
|
2986
|
+
if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
|
|
2987
|
+
# Deduplicate
|
|
2988
|
+
local existing
|
|
2989
|
+
existing=$(gh issue list --label "$PATROL_LABEL" --label "recurring-failure" \
|
|
2990
|
+
--search "Fix recurring: ${short_pattern}" --json number -q 'length' 2>/dev/null || echo "0")
|
|
2991
|
+
if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
|
|
2992
|
+
gh issue create \
|
|
2993
|
+
--title "Fix recurring: ${short_pattern}" \
|
|
2994
|
+
--body "## Recurring Failure Pattern
|
|
2995
|
+
|
|
2996
|
+
| Field | Value |
|
|
2997
|
+
|-------|-------|
|
|
2998
|
+
| Stage | \`${stage}\` |
|
|
2999
|
+
| Pattern | \`${pattern}\` |
|
|
3000
|
+
| Seen count | **${seen_count}** |
|
|
3001
|
+
| Last seen | ${last_seen} |
|
|
3002
|
+
| Root cause | ${root_cause} |
|
|
3003
|
+
| Found by | Shipwright patrol |
|
|
3004
|
+
| Date | $(now_iso) |
|
|
3005
|
+
|
|
3006
|
+
### Suggested Actions
|
|
3007
|
+
- Investigate the root cause in the \`${stage}\` stage
|
|
3008
|
+
- Check if recent changes introduced the failure
|
|
3009
|
+
- Add a targeted test to prevent regression
|
|
3010
|
+
|
|
3011
|
+
Auto-detected by \`shipwright daemon patrol\`." \
|
|
3012
|
+
--label "$(patrol_build_labels "recurring-failure")" 2>/dev/null || true
|
|
3013
|
+
issues_created=$((issues_created + 1))
|
|
3014
|
+
emit_event "patrol.issue_created" "check=recurring_failure" "pattern=$short_pattern"
|
|
3015
|
+
fi
|
|
3016
|
+
else
|
|
3017
|
+
echo -e " ${RED}●${RESET} ${BOLD}recurring${RESET}: ${short_pattern} (${seen_count}x in ${CYAN}${stage}${RESET})"
|
|
3018
|
+
fi
|
|
3019
|
+
done < <(echo "$failures_json" | jq -c '.[]' 2>/dev/null)
|
|
3020
|
+
|
|
3021
|
+
total_findings=$((total_findings + findings))
|
|
3022
|
+
daemon_log INFO "Patrol: found ${findings} recurring failure pattern(s)"
|
|
3023
|
+
}
|
|
3024
|
+
|
|
3025
|
+
# ── 8. DORA Metric Degradation ──
|
|
3026
|
+
patrol_dora_degradation() {
|
|
3027
|
+
if [[ "$PATROL_DORA_ENABLED" != "true" ]]; then return; fi
|
|
3028
|
+
daemon_log INFO "Patrol: checking DORA metric degradation"
|
|
3029
|
+
|
|
3030
|
+
if [[ ! -f "$EVENTS_FILE" ]]; then
|
|
3031
|
+
daemon_log INFO "Patrol: no events file — skipping DORA check"
|
|
3032
|
+
return
|
|
3033
|
+
fi
|
|
3034
|
+
|
|
3035
|
+
local now_e
|
|
3036
|
+
now_e=$(now_epoch)
|
|
3037
|
+
|
|
3038
|
+
# Current 7-day window
|
|
3039
|
+
local current_start=$((now_e - 604800))
|
|
3040
|
+
# Previous 7-day window
|
|
3041
|
+
local prev_start=$((now_e - 1209600))
|
|
3042
|
+
local prev_end=$current_start
|
|
3043
|
+
|
|
3044
|
+
# Get events for both windows
|
|
3045
|
+
local current_events prev_events
|
|
3046
|
+
current_events=$(jq -s --argjson start "$current_start" \
|
|
3047
|
+
'[.[] | select(.ts_epoch >= $start)]' "$EVENTS_FILE" 2>/dev/null || echo "[]")
|
|
3048
|
+
prev_events=$(jq -s --argjson start "$prev_start" --argjson end "$prev_end" \
|
|
3049
|
+
'[.[] | select(.ts_epoch >= $start and .ts_epoch < $end)]' "$EVENTS_FILE" 2>/dev/null || echo "[]")
|
|
3050
|
+
|
|
3051
|
+
# Helper: calculate DORA metrics from an event set
|
|
3052
|
+
calc_dora() {
|
|
3053
|
+
local events="$1"
|
|
3054
|
+
local total successes failures
|
|
3055
|
+
total=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed")] | length' 2>/dev/null || echo "0")
|
|
3056
|
+
successes=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed" and .result == "success")] | length' 2>/dev/null || echo "0")
|
|
3057
|
+
failures=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed" and .result == "failure")] | length' 2>/dev/null || echo "0")
|
|
3058
|
+
|
|
3059
|
+
local deploy_freq="0"
|
|
3060
|
+
[[ "$total" -gt 0 ]] && deploy_freq=$(echo "$successes 7" | awk '{printf "%.1f", $1 / ($2 / 7)}')
|
|
3061
|
+
|
|
3062
|
+
local cfr="0"
|
|
3063
|
+
[[ "$total" -gt 0 ]] && cfr=$(echo "$failures $total" | awk '{printf "%.1f", ($1 / $2) * 100}')
|
|
3064
|
+
|
|
3065
|
+
local cycle_time="0"
|
|
3066
|
+
cycle_time=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed" and .result == "success") | .duration_s] | sort | if length > 0 then .[length/2 | floor] else 0 end' 2>/dev/null || echo "0")
|
|
3067
|
+
|
|
3068
|
+
echo "{\"deploy_freq\":$deploy_freq,\"cfr\":$cfr,\"cycle_time\":$cycle_time,\"total\":$total}"
|
|
3069
|
+
}
|
|
3070
|
+
|
|
3071
|
+
local current_metrics prev_metrics
|
|
3072
|
+
current_metrics=$(calc_dora "$current_events")
|
|
3073
|
+
prev_metrics=$(calc_dora "$prev_events")
|
|
3074
|
+
|
|
3075
|
+
local prev_total
|
|
3076
|
+
prev_total=$(echo "$prev_metrics" | jq '.total' 2>/dev/null || echo "0")
|
|
3077
|
+
local current_total
|
|
3078
|
+
current_total=$(echo "$current_metrics" | jq '.total' 2>/dev/null || echo "0")
|
|
3079
|
+
|
|
3080
|
+
# Need data in both windows to compare
|
|
3081
|
+
if [[ "${prev_total:-0}" -lt 3 ]] || [[ "${current_total:-0}" -lt 3 ]]; then
|
|
3082
|
+
daemon_log INFO "Patrol: insufficient data for DORA comparison (prev=$prev_total, current=$current_total)"
|
|
3083
|
+
return
|
|
3084
|
+
fi
|
|
3085
|
+
|
|
3086
|
+
# Grade each metric using dora_grade (defined in daemon_metrics, redefined here inline)
|
|
3087
|
+
local_dora_grade() {
|
|
3088
|
+
local metric="$1" value="$2"
|
|
3089
|
+
case "$metric" in
|
|
3090
|
+
deploy_freq)
|
|
3091
|
+
if awk "BEGIN{exit !($value >= 7)}" 2>/dev/null; then echo "Elite"; return; fi
|
|
3092
|
+
if awk "BEGIN{exit !($value >= 1)}" 2>/dev/null; then echo "High"; return; fi
|
|
3093
|
+
if awk "BEGIN{exit !($value >= 0.25)}" 2>/dev/null; then echo "Medium"; return; fi
|
|
3094
|
+
echo "Low" ;;
|
|
3095
|
+
cfr)
|
|
3096
|
+
if awk "BEGIN{exit !($value < 5)}" 2>/dev/null; then echo "Elite"; return; fi
|
|
3097
|
+
if awk "BEGIN{exit !($value < 10)}" 2>/dev/null; then echo "High"; return; fi
|
|
3098
|
+
if awk "BEGIN{exit !($value < 15)}" 2>/dev/null; then echo "Medium"; return; fi
|
|
3099
|
+
echo "Low" ;;
|
|
3100
|
+
cycle_time)
|
|
3101
|
+
[[ "$value" -lt 3600 ]] && echo "Elite" && return
|
|
3102
|
+
[[ "$value" -lt 86400 ]] && echo "High" && return
|
|
3103
|
+
[[ "$value" -lt 604800 ]] && echo "Medium" && return
|
|
3104
|
+
echo "Low" ;;
|
|
3105
|
+
esac
|
|
3106
|
+
}
|
|
3107
|
+
|
|
3108
|
+
grade_rank() {
|
|
3109
|
+
case "$1" in
|
|
3110
|
+
Elite) echo 4 ;; High) echo 3 ;; Medium) echo 2 ;; Low) echo 1 ;; *) echo 0 ;;
|
|
3111
|
+
esac
|
|
3112
|
+
}
|
|
3113
|
+
|
|
3114
|
+
local degraded_metrics=""
|
|
3115
|
+
local degradation_details=""
|
|
3116
|
+
|
|
3117
|
+
# Check deploy frequency
|
|
3118
|
+
local prev_df curr_df
|
|
3119
|
+
prev_df=$(echo "$prev_metrics" | jq -r '.deploy_freq')
|
|
3120
|
+
curr_df=$(echo "$current_metrics" | jq -r '.deploy_freq')
|
|
3121
|
+
local prev_df_grade curr_df_grade
|
|
3122
|
+
prev_df_grade=$(local_dora_grade deploy_freq "$prev_df")
|
|
3123
|
+
curr_df_grade=$(local_dora_grade deploy_freq "$curr_df")
|
|
3124
|
+
if [[ "$(grade_rank "$curr_df_grade")" -lt "$(grade_rank "$prev_df_grade")" ]]; then
|
|
3125
|
+
degraded_metrics="${degraded_metrics}deploy_freq "
|
|
3126
|
+
degradation_details="${degradation_details}\n| Deploy Frequency | ${prev_df_grade} (${prev_df}/wk) | ${curr_df_grade} (${curr_df}/wk) | Check for blocked PRs, increase automation |"
|
|
3127
|
+
fi
|
|
3128
|
+
|
|
3129
|
+
# Check CFR
|
|
3130
|
+
local prev_cfr curr_cfr
|
|
3131
|
+
prev_cfr=$(echo "$prev_metrics" | jq -r '.cfr')
|
|
3132
|
+
curr_cfr=$(echo "$current_metrics" | jq -r '.cfr')
|
|
3133
|
+
local prev_cfr_grade curr_cfr_grade
|
|
3134
|
+
prev_cfr_grade=$(local_dora_grade cfr "$prev_cfr")
|
|
3135
|
+
curr_cfr_grade=$(local_dora_grade cfr "$curr_cfr")
|
|
3136
|
+
if [[ "$(grade_rank "$curr_cfr_grade")" -lt "$(grade_rank "$prev_cfr_grade")" ]]; then
|
|
3137
|
+
degraded_metrics="${degraded_metrics}cfr "
|
|
3138
|
+
degradation_details="${degradation_details}\n| Change Failure Rate | ${prev_cfr_grade} (${prev_cfr}%) | ${curr_cfr_grade} (${curr_cfr}%) | Investigate recent failures, improve test coverage |"
|
|
3139
|
+
fi
|
|
3140
|
+
|
|
3141
|
+
# Check Cycle Time
|
|
3142
|
+
local prev_ct curr_ct
|
|
3143
|
+
prev_ct=$(echo "$prev_metrics" | jq -r '.cycle_time')
|
|
3144
|
+
curr_ct=$(echo "$current_metrics" | jq -r '.cycle_time')
|
|
3145
|
+
local prev_ct_grade curr_ct_grade
|
|
3146
|
+
prev_ct_grade=$(local_dora_grade cycle_time "$prev_ct")
|
|
3147
|
+
curr_ct_grade=$(local_dora_grade cycle_time "$curr_ct")
|
|
3148
|
+
if [[ "$(grade_rank "$curr_ct_grade")" -lt "$(grade_rank "$prev_ct_grade")" ]]; then
|
|
3149
|
+
degraded_metrics="${degraded_metrics}cycle_time "
|
|
3150
|
+
degradation_details="${degradation_details}\n| Cycle Time | ${prev_ct_grade} (${prev_ct}s) | ${curr_ct_grade} (${curr_ct}s) | Profile slow stages, check for new slow tests |"
|
|
3151
|
+
fi
|
|
3152
|
+
|
|
3153
|
+
if [[ -z "$degraded_metrics" ]]; then
|
|
3154
|
+
daemon_log INFO "Patrol: no DORA degradation detected"
|
|
3155
|
+
return
|
|
3156
|
+
fi
|
|
3157
|
+
|
|
3158
|
+
local findings=0
|
|
3159
|
+
findings=1
|
|
3160
|
+
total_findings=$((total_findings + findings))
|
|
3161
|
+
emit_event "patrol.finding" "check=dora_regression" "metrics=$degraded_metrics"
|
|
3162
|
+
|
|
3163
|
+
if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
|
|
3164
|
+
local trimmed
|
|
3165
|
+
trimmed=$(echo "$degraded_metrics" | sed 's/ *$//' | tr ' ' ',')
|
|
3166
|
+
local existing
|
|
3167
|
+
existing=$(gh issue list --label "$PATROL_LABEL" --label "dora-regression" \
|
|
3168
|
+
--search "DORA regression" --json number -q 'length' 2>/dev/null || echo "0")
|
|
3169
|
+
if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
|
|
3170
|
+
gh issue create \
|
|
3171
|
+
--title "DORA regression: ${trimmed}" \
|
|
3172
|
+
--body "## DORA Metric Degradation
|
|
3173
|
+
|
|
3174
|
+
| Metric | Previous (7d) | Current (7d) | Suggested Action |
|
|
3175
|
+
|--------|---------------|--------------|------------------|$(echo -e "$degradation_details")
|
|
3176
|
+
|
|
3177
|
+
> Compared: previous 7-day window vs current 7-day window.
|
|
3178
|
+
|
|
3179
|
+
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
3180
|
+
--label "$(patrol_build_labels "dora-regression")" 2>/dev/null || true
|
|
3181
|
+
issues_created=$((issues_created + 1))
|
|
3182
|
+
emit_event "patrol.issue_created" "check=dora_regression" "metrics=$trimmed"
|
|
3183
|
+
fi
|
|
3184
|
+
else
|
|
3185
|
+
local trimmed
|
|
3186
|
+
trimmed=$(echo "$degraded_metrics" | sed 's/ *$//')
|
|
3187
|
+
echo -e " ${RED}●${RESET} ${BOLD}DORA regression${RESET}: ${trimmed}"
|
|
3188
|
+
fi
|
|
3189
|
+
|
|
3190
|
+
daemon_log INFO "Patrol: DORA degradation detected in: ${degraded_metrics}"
|
|
3191
|
+
}
|
|
3192
|
+
|
|
3193
|
+
# ── 9. Untested Scripts ──
|
|
3194
|
+
patrol_untested_scripts() {
|
|
3195
|
+
if [[ "$PATROL_UNTESTED_ENABLED" != "true" ]]; then return; fi
|
|
3196
|
+
daemon_log INFO "Patrol: checking for untested scripts"
|
|
3197
|
+
local findings=0
|
|
3198
|
+
local untested_list=""
|
|
3199
|
+
|
|
3200
|
+
local scripts_dir="$SCRIPT_DIR"
|
|
3201
|
+
if [[ ! -d "$scripts_dir" ]]; then
|
|
3202
|
+
daemon_log INFO "Patrol: scripts directory not found — skipping"
|
|
3203
|
+
return
|
|
3204
|
+
fi
|
|
3205
|
+
|
|
3206
|
+
# Collect untested scripts with usage counts
|
|
3207
|
+
local untested_entries=""
|
|
3208
|
+
while IFS= read -r script; do
|
|
3209
|
+
local basename
|
|
3210
|
+
basename=$(basename "$script")
|
|
3211
|
+
# Skip test scripts themselves
|
|
3212
|
+
[[ "$basename" == *-test.sh ]] && continue
|
|
3213
|
+
# Skip the main CLI router
|
|
3214
|
+
[[ "$basename" == "sw" ]] && continue
|
|
3215
|
+
|
|
3216
|
+
# Extract the name part (sw-NAME.sh -> NAME)
|
|
3217
|
+
local name
|
|
3218
|
+
name=$(echo "$basename" | sed 's/^sw-//' | sed 's/\.sh$//')
|
|
3219
|
+
|
|
3220
|
+
# Check if a test file exists
|
|
3221
|
+
if [[ ! -f "$scripts_dir/sw-${name}-test.sh" ]]; then
|
|
3222
|
+
# Count usage across other scripts
|
|
3223
|
+
local usage_count
|
|
3224
|
+
usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" || true)
|
|
3225
|
+
usage_count=${usage_count:-0}
|
|
3226
|
+
|
|
3227
|
+
local line_count
|
|
3228
|
+
line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ')
|
|
3229
|
+
|
|
3230
|
+
untested_entries="${untested_entries}${usage_count}|${basename}|${line_count}\n"
|
|
3231
|
+
findings=$((findings + 1))
|
|
3232
|
+
fi
|
|
3233
|
+
done < <(find "$scripts_dir" -maxdepth 1 -name "sw-*.sh" -type f 2>/dev/null | sort)
|
|
3234
|
+
|
|
3235
|
+
if [[ "$findings" -eq 0 ]]; then
|
|
3236
|
+
daemon_log INFO "Patrol: all scripts have test files"
|
|
3237
|
+
return
|
|
3238
|
+
fi
|
|
3239
|
+
|
|
3240
|
+
# Sort by usage count descending
|
|
3241
|
+
local sorted_entries
|
|
3242
|
+
sorted_entries=$(echo -e "$untested_entries" | sort -t'|' -k1 -rn | head -10)
|
|
3243
|
+
|
|
3244
|
+
while IFS='|' read -r usage_count basename line_count; do
|
|
3245
|
+
[[ -z "$basename" ]] && continue
|
|
3246
|
+
untested_list="${untested_list}\n- \`${basename}\` (${line_count} lines, referenced by ${usage_count} scripts)"
|
|
3247
|
+
emit_event "patrol.finding" "check=untested_script" "script=$basename" "lines=$line_count" "usage=$usage_count"
|
|
3248
|
+
|
|
3249
|
+
if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
|
|
3250
|
+
echo -e " ${YELLOW}●${RESET} ${CYAN}${basename}${RESET} (${line_count} lines, ${usage_count} refs)"
|
|
3251
|
+
fi
|
|
3252
|
+
done <<< "$sorted_entries"
|
|
3253
|
+
|
|
3254
|
+
total_findings=$((total_findings + findings))
|
|
3255
|
+
|
|
3256
|
+
if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
|
|
3257
|
+
local existing
|
|
3258
|
+
existing=$(gh issue list --label "$PATROL_LABEL" --label "test-coverage" \
|
|
3259
|
+
--search "Add tests for untested scripts" --json number -q 'length' 2>/dev/null || echo "0")
|
|
3260
|
+
if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
|
|
3261
|
+
gh issue create \
|
|
3262
|
+
--title "Add tests for ${findings} untested script(s)" \
|
|
3263
|
+
--body "## Untested Scripts
|
|
3264
|
+
|
|
3265
|
+
The following scripts have no corresponding test file (\`sw-*-test.sh\`):
|
|
3266
|
+
$(echo -e "$untested_list")
|
|
3267
|
+
|
|
3268
|
+
### How to Add Tests
|
|
3269
|
+
Each test file should follow the pattern in existing test scripts (e.g., \`sw-daemon-test.sh\`):
|
|
3270
|
+
- Mock environment with TEMP_DIR
|
|
3271
|
+
- PASS/FAIL counters
|
|
3272
|
+
- \`run_test\` harness
|
|
3273
|
+
- Register in \`package.json\` test script
|
|
3274
|
+
|
|
3275
|
+
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
3276
|
+
--label "$(patrol_build_labels "test-coverage")" 2>/dev/null || true
|
|
3277
|
+
issues_created=$((issues_created + 1))
|
|
3278
|
+
emit_event "patrol.issue_created" "check=untested_scripts" "count=$findings"
|
|
3279
|
+
fi
|
|
3280
|
+
fi
|
|
3281
|
+
|
|
3282
|
+
daemon_log INFO "Patrol: found ${findings} untested script(s)"
|
|
3283
|
+
}
|
|
3284
|
+
|
|
3285
|
+
# ── 10. Retry Exhaustion Patterns ──
|
|
3286
|
+
patrol_retry_exhaustion() {
|
|
3287
|
+
if [[ "$PATROL_RETRY_ENABLED" != "true" ]]; then return; fi
|
|
3288
|
+
daemon_log INFO "Patrol: checking retry exhaustion patterns"
|
|
3289
|
+
local findings=0
|
|
3290
|
+
|
|
3291
|
+
if [[ ! -f "$EVENTS_FILE" ]]; then
|
|
3292
|
+
daemon_log INFO "Patrol: no events file — skipping retry check"
|
|
3293
|
+
return
|
|
3294
|
+
fi
|
|
3295
|
+
|
|
3296
|
+
local seven_days_ago
|
|
3297
|
+
seven_days_ago=$(($(now_epoch) - 604800))
|
|
3298
|
+
|
|
3299
|
+
# Find retry_exhausted events in last 7 days
|
|
3300
|
+
local exhausted_events
|
|
3301
|
+
exhausted_events=$(jq -s --argjson since "$seven_days_ago" \
|
|
3302
|
+
'[.[] | select(.type == "daemon.retry_exhausted" and (.ts_epoch // 0) >= $since)]' \
|
|
3303
|
+
"$EVENTS_FILE" 2>/dev/null || echo "[]")
|
|
3304
|
+
|
|
3305
|
+
local exhausted_count
|
|
3306
|
+
exhausted_count=$(echo "$exhausted_events" | jq 'length' 2>/dev/null || echo "0")
|
|
3307
|
+
|
|
3308
|
+
if [[ "${exhausted_count:-0}" -lt "$PATROL_RETRY_THRESHOLD" ]]; then
|
|
3309
|
+
daemon_log INFO "Patrol: retry exhaustions ($exhausted_count) below threshold ($PATROL_RETRY_THRESHOLD)"
|
|
3310
|
+
return
|
|
3311
|
+
fi
|
|
3312
|
+
|
|
3313
|
+
findings=1
|
|
3314
|
+
total_findings=$((total_findings + findings))
|
|
3315
|
+
|
|
3316
|
+
# Get unique issue patterns
|
|
3317
|
+
local issue_list
|
|
3318
|
+
issue_list=$(echo "$exhausted_events" | jq -r '[.[] | .issue // "unknown"] | unique | join(", ")' 2>/dev/null || echo "unknown")
|
|
3319
|
+
|
|
3320
|
+
local first_ts last_ts
|
|
3321
|
+
first_ts=$(echo "$exhausted_events" | jq -r '[.[] | .ts] | sort | first // "unknown"' 2>/dev/null || echo "unknown")
|
|
3322
|
+
last_ts=$(echo "$exhausted_events" | jq -r '[.[] | .ts] | sort | last // "unknown"' 2>/dev/null || echo "unknown")
|
|
3323
|
+
|
|
3324
|
+
emit_event "patrol.finding" "check=retry_exhaustion" "count=$exhausted_count" "issues=$issue_list"
|
|
3325
|
+
|
|
3326
|
+
if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
|
|
3327
|
+
local existing
|
|
3328
|
+
existing=$(gh issue list --label "$PATROL_LABEL" --label "reliability" \
|
|
3329
|
+
--search "Retry exhaustion pattern" --json number -q 'length' 2>/dev/null || echo "0")
|
|
3330
|
+
if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
|
|
3331
|
+
gh issue create \
|
|
3332
|
+
--title "Retry exhaustion pattern (${exhausted_count} in 7 days)" \
|
|
3333
|
+
--body "## Retry Exhaustion Pattern
|
|
3334
|
+
|
|
3335
|
+
| Field | Value |
|
|
3336
|
+
|-------|-------|
|
|
3337
|
+
| Exhaustions (7d) | **${exhausted_count}** |
|
|
3338
|
+
| Threshold | ${PATROL_RETRY_THRESHOLD} |
|
|
3339
|
+
| Affected issues | ${issue_list} |
|
|
3340
|
+
| First occurrence | ${first_ts} |
|
|
3341
|
+
| Latest occurrence | ${last_ts} |
|
|
3342
|
+
|
|
3343
|
+
### Investigation Steps
|
|
3344
|
+
1. Check the affected issues for common patterns
|
|
3345
|
+
2. Review pipeline logs for root cause
|
|
3346
|
+
3. Consider if max_retries needs adjustment
|
|
3347
|
+
4. Investigate if an external dependency is flaky
|
|
3348
|
+
|
|
3349
|
+
Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
3350
|
+
--label "$(patrol_build_labels "reliability")" 2>/dev/null || true
|
|
3351
|
+
issues_created=$((issues_created + 1))
|
|
3352
|
+
emit_event "patrol.issue_created" "check=retry_exhaustion" "count=$exhausted_count"
|
|
3353
|
+
fi
|
|
3354
|
+
else
|
|
3355
|
+
echo -e " ${RED}●${RESET} ${BOLD}retry exhaustion${RESET}: ${exhausted_count} exhaustions in 7 days (issues: ${issue_list})"
|
|
3356
|
+
fi
|
|
3357
|
+
|
|
3358
|
+
daemon_log INFO "Patrol: found retry exhaustion pattern (${exhausted_count} in 7 days)"
|
|
3359
|
+
}
|
|
3360
|
+
|
|
3361
|
+
# ── Stage 1: Run all grep-based patrol checks (fast pre-filter) ──
|
|
3362
|
+
local patrol_findings_summary=""
|
|
3363
|
+
local pre_check_findings=0
|
|
3364
|
+
|
|
1796
3365
|
echo -e " ${BOLD}Security Audit${RESET}"
|
|
3366
|
+
pre_check_findings=$total_findings
|
|
1797
3367
|
patrol_security_audit
|
|
3368
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3369
|
+
patrol_findings_summary="${patrol_findings_summary}security: $((total_findings - pre_check_findings)) finding(s); "
|
|
3370
|
+
fi
|
|
1798
3371
|
echo ""
|
|
1799
3372
|
|
|
1800
3373
|
echo -e " ${BOLD}Stale Dependencies${RESET}"
|
|
3374
|
+
pre_check_findings=$total_findings
|
|
1801
3375
|
patrol_stale_dependencies
|
|
3376
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3377
|
+
patrol_findings_summary="${patrol_findings_summary}stale_deps: $((total_findings - pre_check_findings)) finding(s); "
|
|
3378
|
+
fi
|
|
1802
3379
|
echo ""
|
|
1803
3380
|
|
|
1804
3381
|
echo -e " ${BOLD}Dead Code Detection${RESET}"
|
|
3382
|
+
pre_check_findings=$total_findings
|
|
1805
3383
|
patrol_dead_code
|
|
3384
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3385
|
+
patrol_findings_summary="${patrol_findings_summary}dead_code: $((total_findings - pre_check_findings)) finding(s); "
|
|
3386
|
+
fi
|
|
1806
3387
|
echo ""
|
|
1807
3388
|
|
|
1808
3389
|
echo -e " ${BOLD}Test Coverage Gaps${RESET}"
|
|
3390
|
+
pre_check_findings=$total_findings
|
|
1809
3391
|
patrol_coverage_gaps
|
|
3392
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3393
|
+
patrol_findings_summary="${patrol_findings_summary}coverage: $((total_findings - pre_check_findings)) finding(s); "
|
|
3394
|
+
fi
|
|
1810
3395
|
echo ""
|
|
1811
3396
|
|
|
1812
3397
|
echo -e " ${BOLD}Documentation Staleness${RESET}"
|
|
3398
|
+
pre_check_findings=$total_findings
|
|
1813
3399
|
patrol_doc_staleness
|
|
3400
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3401
|
+
patrol_findings_summary="${patrol_findings_summary}docs: $((total_findings - pre_check_findings)) finding(s); "
|
|
3402
|
+
fi
|
|
1814
3403
|
echo ""
|
|
1815
3404
|
|
|
1816
3405
|
echo -e " ${BOLD}Performance Baseline${RESET}"
|
|
3406
|
+
pre_check_findings=$total_findings
|
|
1817
3407
|
patrol_performance_baseline
|
|
3408
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3409
|
+
patrol_findings_summary="${patrol_findings_summary}performance: $((total_findings - pre_check_findings)) finding(s); "
|
|
3410
|
+
fi
|
|
3411
|
+
echo ""
|
|
3412
|
+
|
|
3413
|
+
echo -e " ${BOLD}Recurring Failures${RESET}"
|
|
3414
|
+
pre_check_findings=$total_findings
|
|
3415
|
+
patrol_recurring_failures
|
|
3416
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3417
|
+
patrol_findings_summary="${patrol_findings_summary}recurring_failures: $((total_findings - pre_check_findings)) finding(s); "
|
|
3418
|
+
fi
|
|
3419
|
+
echo ""
|
|
3420
|
+
|
|
3421
|
+
echo -e " ${BOLD}DORA Degradation${RESET}"
|
|
3422
|
+
pre_check_findings=$total_findings
|
|
3423
|
+
patrol_dora_degradation
|
|
3424
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3425
|
+
patrol_findings_summary="${patrol_findings_summary}dora: $((total_findings - pre_check_findings)) finding(s); "
|
|
3426
|
+
fi
|
|
3427
|
+
echo ""
|
|
3428
|
+
|
|
3429
|
+
echo -e " ${BOLD}Untested Scripts${RESET}"
|
|
3430
|
+
pre_check_findings=$total_findings
|
|
3431
|
+
patrol_untested_scripts
|
|
3432
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3433
|
+
patrol_findings_summary="${patrol_findings_summary}untested: $((total_findings - pre_check_findings)) finding(s); "
|
|
3434
|
+
fi
|
|
3435
|
+
echo ""
|
|
3436
|
+
|
|
3437
|
+
echo -e " ${BOLD}Retry Exhaustion${RESET}"
|
|
3438
|
+
pre_check_findings=$total_findings
|
|
3439
|
+
patrol_retry_exhaustion
|
|
3440
|
+
if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
|
|
3441
|
+
patrol_findings_summary="${patrol_findings_summary}retry_exhaustion: $((total_findings - pre_check_findings)) finding(s); "
|
|
3442
|
+
fi
|
|
1818
3443
|
echo ""
|
|
1819
3444
|
|
|
3445
|
+
# ── Stage 2: AI-Powered Confirmation (if enabled) ──
|
|
3446
|
+
if [[ "${PREDICTION_ENABLED:-false}" == "true" ]] && type patrol_ai_analyze &>/dev/null 2>&1; then
|
|
3447
|
+
daemon_log INFO "Intelligence: using AI patrol analysis (prediction enabled)"
|
|
3448
|
+
echo -e " ${BOLD}AI Deep Analysis${RESET}"
|
|
3449
|
+
# Sample recent source files for AI analysis
|
|
3450
|
+
local sample_files=""
|
|
3451
|
+
local git_log_recent=""
|
|
3452
|
+
sample_files=$(git diff --name-only HEAD~5 2>/dev/null | head -10 | tr '\n' ',' || echo "")
|
|
3453
|
+
git_log_recent=$(git log --oneline -10 2>/dev/null || echo "")
|
|
3454
|
+
# Include grep-based findings summary as context for AI confirmation
|
|
3455
|
+
if [[ -n "$patrol_findings_summary" ]]; then
|
|
3456
|
+
git_log_recent="${git_log_recent}
|
|
3457
|
+
|
|
3458
|
+
Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
|
|
3459
|
+
daemon_log INFO "Patrol: passing ${total_findings} grep findings to AI for confirmation"
|
|
3460
|
+
fi
|
|
3461
|
+
if [[ -n "$sample_files" ]]; then
|
|
3462
|
+
local ai_findings
|
|
3463
|
+
ai_findings=$(patrol_ai_analyze "$sample_files" "$git_log_recent" 2>/dev/null || echo "[]")
|
|
3464
|
+
if [[ -n "$ai_findings" && "$ai_findings" != "[]" ]]; then
|
|
3465
|
+
local ai_count
|
|
3466
|
+
ai_count=$(echo "$ai_findings" | jq 'length' 2>/dev/null || echo "0")
|
|
3467
|
+
ai_count=${ai_count:-0}
|
|
3468
|
+
total_findings=$((total_findings + ai_count))
|
|
3469
|
+
echo -e " ${CYAN}●${RESET} AI confirmed findings + found ${ai_count} additional issue(s)"
|
|
3470
|
+
emit_event "patrol.ai_analysis" "findings=$ai_count" "grep_findings=${patrol_findings_summary:-none}"
|
|
3471
|
+
else
|
|
3472
|
+
echo -e " ${GREEN}●${RESET} AI analysis: grep findings confirmed, no additional issues"
|
|
3473
|
+
fi
|
|
3474
|
+
fi
|
|
3475
|
+
echo ""
|
|
3476
|
+
else
|
|
3477
|
+
daemon_log INFO "Intelligence: using grep-only patrol (prediction disabled, enable with intelligence.prediction_enabled=true)"
|
|
3478
|
+
fi
|
|
3479
|
+
|
|
3480
|
+
# ── Meta Self-Improvement Patrol ──
|
|
3481
|
+
if [[ -f "$SCRIPT_DIR/sw-patrol-meta.sh" ]]; then
|
|
3482
|
+
# shellcheck source=sw-patrol-meta.sh
|
|
3483
|
+
source "$SCRIPT_DIR/sw-patrol-meta.sh"
|
|
3484
|
+
patrol_meta_run
|
|
3485
|
+
fi
|
|
3486
|
+
|
|
1820
3487
|
# ── Summary ──
|
|
1821
3488
|
emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
|
|
1822
3489
|
|
|
@@ -1829,6 +3496,9 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
|
1829
3496
|
echo ""
|
|
1830
3497
|
|
|
1831
3498
|
daemon_log INFO "Patrol complete: ${total_findings} findings, ${issues_created} issues created"
|
|
3499
|
+
|
|
3500
|
+
# Adapt patrol limits based on hit rate
|
|
3501
|
+
adapt_patrol_limits "$total_findings" "$PATROL_MAX_ISSUES"
|
|
1832
3502
|
}
|
|
1833
3503
|
|
|
1834
3504
|
# ─── Poll Issues ─────────────────────────────────────────────────────────────
|
|
@@ -1839,6 +3509,18 @@ daemon_poll_issues() {
|
|
|
1839
3509
|
return
|
|
1840
3510
|
fi
|
|
1841
3511
|
|
|
3512
|
+
# Check for pause flag (set by dashboard or disk_low alert)
|
|
3513
|
+
if [[ -f "$HOME/.shipwright/daemon-pause.flag" ]]; then
|
|
3514
|
+
daemon_log INFO "Daemon paused — skipping poll"
|
|
3515
|
+
return
|
|
3516
|
+
fi
|
|
3517
|
+
|
|
3518
|
+
# Circuit breaker: skip poll if in backoff window
|
|
3519
|
+
if gh_rate_limited; then
|
|
3520
|
+
daemon_log INFO "Polling skipped (rate-limit backoff until $(epoch_to_iso "$GH_BACKOFF_UNTIL"))"
|
|
3521
|
+
return
|
|
3522
|
+
fi
|
|
3523
|
+
|
|
1842
3524
|
local issues_json
|
|
1843
3525
|
|
|
1844
3526
|
# Select gh command wrapper: gh_retry for critical poll calls when enabled
|
|
@@ -1865,6 +3547,7 @@ daemon_poll_issues() {
|
|
|
1865
3547
|
fi
|
|
1866
3548
|
fi
|
|
1867
3549
|
daemon_log WARN "GitHub API error (org search) — backing off ${BACKOFF_SECS}s"
|
|
3550
|
+
gh_record_failure
|
|
1868
3551
|
sleep "$BACKOFF_SECS"
|
|
1869
3552
|
return
|
|
1870
3553
|
}
|
|
@@ -1891,6 +3574,7 @@ daemon_poll_issues() {
|
|
|
1891
3574
|
fi
|
|
1892
3575
|
fi
|
|
1893
3576
|
daemon_log WARN "GitHub API error — backing off ${BACKOFF_SECS}s"
|
|
3577
|
+
gh_record_failure
|
|
1894
3578
|
sleep "$BACKOFF_SECS"
|
|
1895
3579
|
return
|
|
1896
3580
|
}
|
|
@@ -1898,6 +3582,7 @@ daemon_poll_issues() {
|
|
|
1898
3582
|
|
|
1899
3583
|
# Reset backoff on success
|
|
1900
3584
|
BACKOFF_SECS=0
|
|
3585
|
+
gh_record_success
|
|
1901
3586
|
|
|
1902
3587
|
local issue_count
|
|
1903
3588
|
issue_count=$(echo "$issues_json" | jq 'length' 2>/dev/null || echo 0)
|
|
@@ -1913,6 +3598,7 @@ daemon_poll_issues() {
|
|
|
1913
3598
|
|
|
1914
3599
|
# Score each issue using intelligent triage and sort by descending score
|
|
1915
3600
|
local scored_issues=()
|
|
3601
|
+
local dep_graph="" # "issue:dep1,dep2" entries for dependency ordering
|
|
1916
3602
|
while IFS= read -r issue; do
|
|
1917
3603
|
local num score
|
|
1918
3604
|
num=$(echo "$issue" | jq -r '.number')
|
|
@@ -1923,14 +3609,85 @@ daemon_poll_issues() {
|
|
|
1923
3609
|
repo_name=$(echo "$issue" | jq -r '.repository.nameWithOwner // ""')
|
|
1924
3610
|
fi
|
|
1925
3611
|
scored_issues+=("${score}|${num}|${repo_name}")
|
|
3612
|
+
|
|
3613
|
+
# Issue dependency detection (adaptive: extract "depends on #X", "blocked by #X")
|
|
3614
|
+
if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
|
|
3615
|
+
local issue_text
|
|
3616
|
+
issue_text=$(echo "$issue" | jq -r '(.title // "") + " " + (.body // "")')
|
|
3617
|
+
local deps
|
|
3618
|
+
deps=$(extract_issue_dependencies "$issue_text")
|
|
3619
|
+
if [[ -n "$deps" ]]; then
|
|
3620
|
+
local dep_nums
|
|
3621
|
+
dep_nums=$(echo "$deps" | tr -d '#' | tr '\n' ',' | sed 's/,$//')
|
|
3622
|
+
dep_graph="${dep_graph}${num}:${dep_nums}\n"
|
|
3623
|
+
daemon_log INFO "Issue #${num} depends on: ${deps//$'\n'/, }"
|
|
3624
|
+
fi
|
|
3625
|
+
fi
|
|
1926
3626
|
done < <(echo "$issues_json" | jq -c '.[]')
|
|
1927
3627
|
|
|
1928
|
-
# Sort by score descending
|
|
3628
|
+
# Sort by score — strategy determines ascending vs descending
|
|
1929
3629
|
local sorted_order
|
|
1930
|
-
|
|
3630
|
+
if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
|
|
3631
|
+
# Complex-first: lower score (more complex) first
|
|
3632
|
+
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -n)
|
|
3633
|
+
else
|
|
3634
|
+
# Quick-wins-first (default): higher score (simpler) first
|
|
3635
|
+
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -rn)
|
|
3636
|
+
fi
|
|
3637
|
+
|
|
3638
|
+
# Dependency-aware reordering: move dependencies before dependents
|
|
3639
|
+
if [[ -n "$dep_graph" && "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
|
|
3640
|
+
local reordered=""
|
|
3641
|
+
local scheduled=""
|
|
3642
|
+
# Multiple passes to resolve transitive dependencies (max 3)
|
|
3643
|
+
local pass=0
|
|
3644
|
+
while [[ $pass -lt 3 ]]; do
|
|
3645
|
+
local changed=false
|
|
3646
|
+
local new_order=""
|
|
3647
|
+
while IFS='|' read -r s_score s_num s_repo; do
|
|
3648
|
+
[[ -z "$s_num" ]] && continue
|
|
3649
|
+
# Check if this issue has unscheduled dependencies
|
|
3650
|
+
local issue_deps
|
|
3651
|
+
issue_deps=$(echo -e "$dep_graph" | grep "^${s_num}:" | head -1 | cut -d: -f2 || true)
|
|
3652
|
+
if [[ -n "$issue_deps" ]]; then
|
|
3653
|
+
# Check if all deps are scheduled (or not in our issue set)
|
|
3654
|
+
local all_deps_ready=true
|
|
3655
|
+
local IFS_SAVE="$IFS"
|
|
3656
|
+
IFS=','
|
|
3657
|
+
for dep in $issue_deps; do
|
|
3658
|
+
dep="${dep## }"
|
|
3659
|
+
dep="${dep%% }"
|
|
3660
|
+
# Is this dep in our scored set and not yet scheduled?
|
|
3661
|
+
if echo "$sorted_order" | grep -q "|${dep}|" && ! echo "$scheduled" | grep -q "|${dep}|"; then
|
|
3662
|
+
all_deps_ready=false
|
|
3663
|
+
break
|
|
3664
|
+
fi
|
|
3665
|
+
done
|
|
3666
|
+
IFS="$IFS_SAVE"
|
|
3667
|
+
if [[ "$all_deps_ready" == "false" ]]; then
|
|
3668
|
+
# Defer this issue — append at end
|
|
3669
|
+
new_order="${new_order}${s_score}|${s_num}|${s_repo}\n"
|
|
3670
|
+
changed=true
|
|
3671
|
+
continue
|
|
3672
|
+
fi
|
|
3673
|
+
fi
|
|
3674
|
+
reordered="${reordered}${s_score}|${s_num}|${s_repo}\n"
|
|
3675
|
+
scheduled="${scheduled}|${s_num}|"
|
|
3676
|
+
done <<< "$sorted_order"
|
|
3677
|
+
# Append deferred issues
|
|
3678
|
+
reordered="${reordered}${new_order}"
|
|
3679
|
+
sorted_order=$(echo -e "$reordered" | grep -v '^$')
|
|
3680
|
+
reordered=""
|
|
3681
|
+
scheduled=""
|
|
3682
|
+
if [[ "$changed" == "false" ]]; then
|
|
3683
|
+
break
|
|
3684
|
+
fi
|
|
3685
|
+
pass=$((pass + 1))
|
|
3686
|
+
done
|
|
3687
|
+
fi
|
|
1931
3688
|
|
|
1932
3689
|
local active_count
|
|
1933
|
-
active_count=$(
|
|
3690
|
+
active_count=$(locked_get_active_count)
|
|
1934
3691
|
|
|
1935
3692
|
# Process each issue in triage order (process substitution keeps state in current shell)
|
|
1936
3693
|
while IFS='|' read -r score issue_num repo_name; do
|
|
@@ -1940,11 +3697,27 @@ daemon_poll_issues() {
|
|
|
1940
3697
|
issue_title=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | .title')
|
|
1941
3698
|
labels_csv=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | [.labels[].name] | join(",")')
|
|
1942
3699
|
|
|
3700
|
+
# Cache title in state for dashboard visibility
|
|
3701
|
+
if [[ -n "$issue_title" ]]; then
|
|
3702
|
+
locked_state_update --arg num "$issue_num" --arg title "$issue_title" \
|
|
3703
|
+
'.titles[$num] = $title'
|
|
3704
|
+
fi
|
|
3705
|
+
|
|
1943
3706
|
# Skip if already inflight
|
|
1944
3707
|
if daemon_is_inflight "$issue_num"; then
|
|
1945
3708
|
continue
|
|
1946
3709
|
fi
|
|
1947
3710
|
|
|
3711
|
+
# Distributed claim (skip if no machines registered)
|
|
3712
|
+
if [[ -f "$HOME/.shipwright/machines.json" ]]; then
|
|
3713
|
+
local machine_name
|
|
3714
|
+
machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
|
|
3715
|
+
if ! claim_issue "$issue_num" "$machine_name"; then
|
|
3716
|
+
daemon_log INFO "Issue #${issue_num} claimed by another machine — skipping"
|
|
3717
|
+
continue
|
|
3718
|
+
fi
|
|
3719
|
+
fi
|
|
3720
|
+
|
|
1948
3721
|
# Priority lane: bypass queue for critical issues
|
|
1949
3722
|
if [[ "$PRIORITY_LANE" == "true" ]]; then
|
|
1950
3723
|
local priority_active
|
|
@@ -1967,7 +3740,7 @@ daemon_poll_issues() {
|
|
|
1967
3740
|
fi
|
|
1968
3741
|
|
|
1969
3742
|
# Check capacity
|
|
1970
|
-
active_count=$(
|
|
3743
|
+
active_count=$(locked_get_active_count)
|
|
1971
3744
|
if [[ "$active_count" -ge "$MAX_PARALLEL" ]]; then
|
|
1972
3745
|
enqueue_issue "$issue_num"
|
|
1973
3746
|
continue
|
|
@@ -1993,33 +3766,95 @@ daemon_poll_issues() {
|
|
|
1993
3766
|
|
|
1994
3767
|
daemon_health_check() {
|
|
1995
3768
|
local findings=0
|
|
1996
|
-
|
|
1997
|
-
# Stale jobs: kill processes running > timeout
|
|
1998
|
-
local stale_timeout="${HEALTH_STALE_TIMEOUT:-1800}" # default 30min
|
|
1999
3769
|
local now_e
|
|
2000
3770
|
now_e=$(now_epoch)
|
|
2001
3771
|
|
|
2002
3772
|
if [[ -f "$STATE_FILE" ]]; then
|
|
3773
|
+
# ── Progress-Based Health Monitoring ──
|
|
3774
|
+
# Instead of killing after a static timeout, check for forward progress.
|
|
3775
|
+
# Only kill when the agent is truly stuck (no stage change, no new code,
|
|
3776
|
+
# same error repeating). A hard wall-clock limit remains as absolute safety net.
|
|
3777
|
+
|
|
3778
|
+
local hard_limit="${PROGRESS_HARD_LIMIT_S:-10800}"
|
|
3779
|
+
local use_progress="${PROGRESS_MONITORING:-true}"
|
|
3780
|
+
|
|
2003
3781
|
while IFS= read -r job; do
|
|
2004
|
-
local pid started_at issue_num
|
|
3782
|
+
local pid started_at issue_num worktree
|
|
2005
3783
|
pid=$(echo "$job" | jq -r '.pid')
|
|
2006
3784
|
started_at=$(echo "$job" | jq -r '.started_at // empty')
|
|
2007
3785
|
issue_num=$(echo "$job" | jq -r '.issue')
|
|
3786
|
+
worktree=$(echo "$job" | jq -r '.worktree // ""')
|
|
3787
|
+
|
|
3788
|
+
# Skip dead processes
|
|
3789
|
+
if ! kill -0 "$pid" 2>/dev/null; then
|
|
3790
|
+
continue
|
|
3791
|
+
fi
|
|
2008
3792
|
|
|
3793
|
+
local elapsed=0
|
|
2009
3794
|
if [[ -n "$started_at" ]]; then
|
|
2010
3795
|
local start_e
|
|
2011
3796
|
start_e=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null || date -d "$started_at" +%s 2>/dev/null || echo "0")
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
3797
|
+
elapsed=$(( now_e - start_e ))
|
|
3798
|
+
fi
|
|
3799
|
+
|
|
3800
|
+
# Hard wall-clock limit — absolute safety net (default 3h)
|
|
3801
|
+
if [[ "$elapsed" -gt "$hard_limit" ]]; then
|
|
3802
|
+
daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
|
|
3803
|
+
emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
|
|
3804
|
+
kill "$pid" 2>/dev/null || true
|
|
3805
|
+
daemon_clear_progress "$issue_num"
|
|
3806
|
+
findings=$((findings + 1))
|
|
3807
|
+
continue
|
|
3808
|
+
fi
|
|
3809
|
+
|
|
3810
|
+
# Progress-based detection (when enabled)
|
|
3811
|
+
if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
|
|
3812
|
+
local snapshot verdict
|
|
3813
|
+
snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
|
|
3814
|
+
|
|
3815
|
+
if [[ "$snapshot" != "{}" ]]; then
|
|
3816
|
+
verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
|
|
3817
|
+
|
|
3818
|
+
case "$verdict" in
|
|
3819
|
+
healthy)
|
|
3820
|
+
# All good — agent is making progress
|
|
3821
|
+
;;
|
|
3822
|
+
slowing)
|
|
3823
|
+
daemon_log INFO "Issue #${issue_num} slowing (no progress for 1-2 checks, ${elapsed}s elapsed)"
|
|
3824
|
+
;;
|
|
3825
|
+
stalled)
|
|
3826
|
+
local no_progress_count
|
|
3827
|
+
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
3828
|
+
daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks (${elapsed}s elapsed, PID $pid)"
|
|
3829
|
+
emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
|
|
3830
|
+
;;
|
|
3831
|
+
stuck)
|
|
3832
|
+
local no_progress_count repeated_errors cur_stage
|
|
3833
|
+
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
3834
|
+
repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
3835
|
+
cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
|
|
3836
|
+
daemon_log WARN "Issue #${issue_num} STUCK: no progress for ${no_progress_count} checks, ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
|
|
3837
|
+
emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid"
|
|
3838
|
+
kill "$pid" 2>/dev/null || true
|
|
3839
|
+
daemon_clear_progress "$issue_num"
|
|
3840
|
+
findings=$((findings + 1))
|
|
3841
|
+
;;
|
|
3842
|
+
esac
|
|
3843
|
+
fi
|
|
3844
|
+
else
|
|
3845
|
+
# Fallback: legacy time-based detection when progress monitoring is off
|
|
3846
|
+
local stale_timeout
|
|
3847
|
+
stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
|
|
3848
|
+
if [[ "$elapsed" -gt "$stale_timeout" ]]; then
|
|
3849
|
+
daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — killing"
|
|
2015
3850
|
kill "$pid" 2>/dev/null || true
|
|
2016
3851
|
findings=$((findings + 1))
|
|
2017
3852
|
fi
|
|
2018
3853
|
fi
|
|
2019
|
-
done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null)
|
|
3854
|
+
done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
|
|
2020
3855
|
fi
|
|
2021
3856
|
|
|
2022
|
-
# Disk space warning
|
|
3857
|
+
# Disk space warning (check both repo dir and ~/.shipwright)
|
|
2023
3858
|
local free_kb
|
|
2024
3859
|
free_kb=$(df -k "." 2>/dev/null | tail -1 | awk '{print $4}')
|
|
2025
3860
|
if [[ -n "$free_kb" ]] && [[ "$free_kb" -lt 1048576 ]] 2>/dev/null; then
|
|
@@ -2027,6 +3862,17 @@ daemon_health_check() {
|
|
|
2027
3862
|
findings=$((findings + 1))
|
|
2028
3863
|
fi
|
|
2029
3864
|
|
|
3865
|
+
# Critical disk space on ~/.shipwright — pause spawning
|
|
3866
|
+
local sw_free_kb
|
|
3867
|
+
sw_free_kb=$(df -k "$HOME/.shipwright" 2>/dev/null | tail -1 | awk '{print $4}')
|
|
3868
|
+
if [[ -n "$sw_free_kb" ]] && [[ "$sw_free_kb" -lt 512000 ]] 2>/dev/null; then
|
|
3869
|
+
daemon_log WARN "Critical disk space on ~/.shipwright: $(( sw_free_kb / 1024 ))MB — pausing spawns"
|
|
3870
|
+
emit_event "daemon.disk_low" "free_mb=$(( sw_free_kb / 1024 ))"
|
|
3871
|
+
mkdir -p "$HOME/.shipwright"
|
|
3872
|
+
echo '{"paused":true,"reason":"disk_low"}' > "$HOME/.shipwright/daemon-pause.flag"
|
|
3873
|
+
findings=$((findings + 1))
|
|
3874
|
+
fi
|
|
3875
|
+
|
|
2030
3876
|
# Events file size warning
|
|
2031
3877
|
if [[ -f "$EVENTS_FILE" ]]; then
|
|
2032
3878
|
local events_size
|
|
@@ -2096,6 +3942,13 @@ daemon_auto_scale() {
|
|
|
2096
3942
|
|
|
2097
3943
|
local prev_max="$MAX_PARALLEL"
|
|
2098
3944
|
|
|
3945
|
+
# ── Learn worker memory from actual RSS (adaptive) ──
|
|
3946
|
+
learn_worker_memory
|
|
3947
|
+
|
|
3948
|
+
# ── Adaptive cost estimate per template ──
|
|
3949
|
+
local effective_cost_per_job
|
|
3950
|
+
effective_cost_per_job=$(get_adaptive_cost_estimate "$PIPELINE_TEMPLATE")
|
|
3951
|
+
|
|
2099
3952
|
# ── CPU cores ──
|
|
2100
3953
|
local cpu_cores=2
|
|
2101
3954
|
if [[ "$(uname -s)" == "Darwin" ]]; then
|
|
@@ -2106,10 +3959,9 @@ daemon_auto_scale() {
|
|
|
2106
3959
|
local max_by_cpu=$(( (cpu_cores * 3) / 4 )) # 75% utilization cap
|
|
2107
3960
|
[[ "$max_by_cpu" -lt 1 ]] && max_by_cpu=1
|
|
2108
3961
|
|
|
2109
|
-
# ── Load average check
|
|
3962
|
+
# ── Load average check — gradual scaling curve (replaces 90% cliff) ──
|
|
2110
3963
|
local load_avg
|
|
2111
3964
|
load_avg=$(uptime | awk -F'load averages?: ' '{print $2}' | awk -F'[, ]+' '{print $1}' 2>/dev/null || echo "0")
|
|
2112
|
-
# Validate numeric
|
|
2113
3965
|
if [[ ! "$load_avg" =~ ^[0-9]+\.?[0-9]*$ ]]; then
|
|
2114
3966
|
load_avg="0"
|
|
2115
3967
|
fi
|
|
@@ -2117,17 +3969,28 @@ daemon_auto_scale() {
|
|
|
2117
3969
|
if [[ "$cpu_cores" -gt 0 ]]; then
|
|
2118
3970
|
load_ratio=$(awk -v load="$load_avg" -v cores="$cpu_cores" 'BEGIN { printf "%.0f", (load / cores) * 100 }')
|
|
2119
3971
|
fi
|
|
2120
|
-
|
|
2121
|
-
|
|
3972
|
+
# Gradual load scaling curve (replaces binary 90% cliff)
|
|
3973
|
+
if [[ "$load_ratio" -gt 95 ]]; then
|
|
3974
|
+
# 95%+: minimum workers only
|
|
2122
3975
|
max_by_cpu="$MIN_WORKERS"
|
|
2123
|
-
daemon_log WARN "Auto-scale:
|
|
3976
|
+
daemon_log WARN "Auto-scale: critical load (${load_ratio}%) — minimum workers only"
|
|
3977
|
+
elif [[ "$load_ratio" -gt 85 ]]; then
|
|
3978
|
+
# 85-95%: reduce by 50%
|
|
3979
|
+
max_by_cpu=$(( max_by_cpu / 2 ))
|
|
3980
|
+
[[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
|
|
3981
|
+
daemon_log WARN "Auto-scale: high load (${load_ratio}%) — reducing capacity 50%"
|
|
3982
|
+
elif [[ "$load_ratio" -gt 70 ]]; then
|
|
3983
|
+
# 70-85%: reduce by 25%
|
|
3984
|
+
max_by_cpu=$(( (max_by_cpu * 3) / 4 ))
|
|
3985
|
+
[[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
|
|
3986
|
+
daemon_log INFO "Auto-scale: moderate load (${load_ratio}%) — reducing capacity 25%"
|
|
2124
3987
|
fi
|
|
3988
|
+
# 0-70%: full capacity (no change)
|
|
2125
3989
|
|
|
2126
3990
|
# ── Available memory ──
|
|
2127
3991
|
local avail_mem_gb=8
|
|
2128
3992
|
if [[ "$(uname -s)" == "Darwin" ]]; then
|
|
2129
3993
|
local page_size free_pages inactive_pages purgeable_pages speculative_pages
|
|
2130
|
-
# Page size is in format: "(page size of 16384 bytes)"
|
|
2131
3994
|
page_size=$(vm_stat | awk '/page size of/ {for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/) print $i}')
|
|
2132
3995
|
page_size="${page_size:-16384}"
|
|
2133
3996
|
free_pages=$(vm_stat | awk '/^Pages free:/ {gsub(/\./, "", $NF); print $NF}')
|
|
@@ -2138,7 +4001,6 @@ daemon_auto_scale() {
|
|
|
2138
4001
|
inactive_pages="${inactive_pages:-0}"
|
|
2139
4002
|
purgeable_pages=$(vm_stat | awk '/^Pages purgeable:/ {gsub(/\./, "", $NF); print $NF}')
|
|
2140
4003
|
purgeable_pages="${purgeable_pages:-0}"
|
|
2141
|
-
# Available ≈ free + speculative + inactive + purgeable
|
|
2142
4004
|
local avail_pages=$(( free_pages + speculative_pages + inactive_pages + purgeable_pages ))
|
|
2143
4005
|
if [[ "$avail_pages" -gt 0 && "$page_size" -gt 0 ]]; then
|
|
2144
4006
|
local free_bytes=$(( avail_pages * page_size ))
|
|
@@ -2153,13 +4015,13 @@ daemon_auto_scale() {
|
|
|
2153
4015
|
local max_by_mem=$(( avail_mem_gb / WORKER_MEM_GB ))
|
|
2154
4016
|
[[ "$max_by_mem" -lt 1 ]] && max_by_mem=1
|
|
2155
4017
|
|
|
2156
|
-
# ── Budget remaining ──
|
|
4018
|
+
# ── Budget remaining (adaptive cost estimate) ──
|
|
2157
4019
|
local max_by_budget="$MAX_WORKERS"
|
|
2158
4020
|
local remaining_usd
|
|
2159
|
-
remaining_usd=$("$SCRIPT_DIR/
|
|
4021
|
+
remaining_usd=$("$SCRIPT_DIR/sw-cost.sh" remaining-budget 2>/dev/null || echo "unlimited")
|
|
2160
4022
|
if [[ "$remaining_usd" != "unlimited" && -n "$remaining_usd" ]]; then
|
|
2161
|
-
if awk -v r="$remaining_usd" -v c="$
|
|
2162
|
-
max_by_budget=$(awk -v r="$remaining_usd" -v c="$
|
|
4023
|
+
if awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { exit !(r > 0 && c > 0) }'; then
|
|
4024
|
+
max_by_budget=$(awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { printf "%.0f", r / c }')
|
|
2163
4025
|
[[ "$max_by_budget" -lt 0 ]] && max_by_budget=0
|
|
2164
4026
|
else
|
|
2165
4027
|
max_by_budget=0
|
|
@@ -2192,10 +4054,31 @@ daemon_auto_scale() {
|
|
|
2192
4054
|
# Clamp to min_workers
|
|
2193
4055
|
[[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"
|
|
2194
4056
|
|
|
4057
|
+
# ── Gradual scaling: change by at most 1 at a time (adaptive) ──
|
|
4058
|
+
if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
|
|
4059
|
+
if [[ "$computed" -gt "$prev_max" ]]; then
|
|
4060
|
+
# Check success rate at target parallelism before scaling up
|
|
4061
|
+
local target_rate
|
|
4062
|
+
target_rate=$(get_success_rate_at_parallelism "$((prev_max + 1))")
|
|
4063
|
+
if [[ "$target_rate" -lt 50 ]]; then
|
|
4064
|
+
# Poor success rate at higher parallelism — hold steady
|
|
4065
|
+
computed="$prev_max"
|
|
4066
|
+
daemon_log INFO "Auto-scale: holding at ${prev_max} (success rate ${target_rate}% at $((prev_max + 1)))"
|
|
4067
|
+
else
|
|
4068
|
+
# Scale up by 1, not jump to target
|
|
4069
|
+
computed=$((prev_max + 1))
|
|
4070
|
+
fi
|
|
4071
|
+
elif [[ "$computed" -lt "$prev_max" ]]; then
|
|
4072
|
+
# Scale down by 1, not drop to minimum
|
|
4073
|
+
computed=$((prev_max - 1))
|
|
4074
|
+
[[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"
|
|
4075
|
+
fi
|
|
4076
|
+
fi
|
|
4077
|
+
|
|
2195
4078
|
MAX_PARALLEL="$computed"
|
|
2196
4079
|
|
|
2197
4080
|
if [[ "$MAX_PARALLEL" -ne "$prev_max" ]]; then
|
|
2198
|
-
daemon_log INFO "Auto-scale: ${prev_max} → ${MAX_PARALLEL} (cpu=${max_by_cpu} mem=${max_by_mem} budget=${max_by_budget} queue=${max_by_queue})"
|
|
4081
|
+
daemon_log INFO "Auto-scale: ${prev_max} → ${MAX_PARALLEL} (cpu=${max_by_cpu} mem=${max_by_mem} budget=${max_by_budget} queue=${max_by_queue} load=${load_ratio}%)"
|
|
2199
4082
|
emit_event "daemon.scale" \
|
|
2200
4083
|
"from=$prev_max" \
|
|
2201
4084
|
"to=$MAX_PARALLEL" \
|
|
@@ -2205,7 +4088,8 @@ daemon_auto_scale() {
|
|
|
2205
4088
|
"max_by_queue=$max_by_queue" \
|
|
2206
4089
|
"cpu_cores=$cpu_cores" \
|
|
2207
4090
|
"avail_mem_gb=$avail_mem_gb" \
|
|
2208
|
-
"remaining_usd=$remaining_usd"
|
|
4091
|
+
"remaining_usd=$remaining_usd" \
|
|
4092
|
+
"load_ratio=$load_ratio"
|
|
2209
4093
|
fi
|
|
2210
4094
|
}
|
|
2211
4095
|
|
|
@@ -2213,7 +4097,7 @@ daemon_auto_scale() {
|
|
|
2213
4097
|
# Checks for fleet-reload.flag and reloads MAX_PARALLEL from fleet-managed config
|
|
2214
4098
|
|
|
2215
4099
|
daemon_reload_config() {
|
|
2216
|
-
local reload_flag="$HOME/.
|
|
4100
|
+
local reload_flag="$HOME/.shipwright/fleet-reload.flag"
|
|
2217
4101
|
if [[ ! -f "$reload_flag" ]]; then
|
|
2218
4102
|
return
|
|
2219
4103
|
fi
|
|
@@ -2245,6 +4129,15 @@ daemon_self_optimize() {
|
|
|
2245
4129
|
return
|
|
2246
4130
|
fi
|
|
2247
4131
|
|
|
4132
|
+
# ── Intelligence-powered optimization (if enabled) ──
|
|
4133
|
+
if [[ "${OPTIMIZATION_ENABLED:-false}" == "true" ]] && type optimize_full_analysis &>/dev/null 2>&1; then
|
|
4134
|
+
daemon_log INFO "Running intelligence-powered optimization"
|
|
4135
|
+
optimize_full_analysis 2>/dev/null || {
|
|
4136
|
+
daemon_log WARN "Intelligence optimization failed — falling back to DORA-based tuning"
|
|
4137
|
+
}
|
|
4138
|
+
# Still run DORA-based tuning below as a complement
|
|
4139
|
+
fi
|
|
4140
|
+
|
|
2248
4141
|
daemon_log INFO "Running self-optimization check"
|
|
2249
4142
|
|
|
2250
4143
|
# Read DORA metrics from recent events (last 7 days)
|
|
@@ -2339,13 +4232,10 @@ daemon_self_optimize() {
|
|
|
2339
4232
|
local adj_str
|
|
2340
4233
|
adj_str=$(printf '%s; ' "${adjustments[@]}")
|
|
2341
4234
|
|
|
2342
|
-
|
|
2343
|
-
tmp_state=$(jq \
|
|
4235
|
+
locked_state_update \
|
|
2344
4236
|
--arg adj "$adj_str" \
|
|
2345
4237
|
--arg ts "$(now_iso)" \
|
|
2346
|
-
'.last_optimization = {timestamp: $ts, adjustments: $adj}'
|
|
2347
|
-
"$STATE_FILE")
|
|
2348
|
-
atomic_write_state "$tmp_state"
|
|
4238
|
+
'.last_optimization = {timestamp: $ts, adjustments: $adj}'
|
|
2349
4239
|
|
|
2350
4240
|
# ── Persist adjustments to daemon-config.json (survives restart) ──
|
|
2351
4241
|
local config_file="${CONFIG_PATH:-.claude/daemon-config.json}"
|
|
@@ -2427,24 +4317,59 @@ daemon_cleanup_stale() {
|
|
|
2427
4317
|
done < <(find "$artifacts_dir" -mindepth 1 -maxdepth 1 -type d 2>/dev/null)
|
|
2428
4318
|
fi
|
|
2429
4319
|
|
|
2430
|
-
# ── 3.
|
|
4320
|
+
# ── 3. Clean orphaned daemon/* branches (no matching worktree or active job) ──
|
|
4321
|
+
if command -v git &>/dev/null; then
|
|
4322
|
+
while IFS= read -r branch; do
|
|
4323
|
+
[[ -z "$branch" ]] && continue
|
|
4324
|
+
branch="${branch## }" # trim leading spaces
|
|
4325
|
+
# Only clean daemon-created branches
|
|
4326
|
+
[[ "$branch" == daemon/issue-* ]] || continue
|
|
4327
|
+
# Extract issue number
|
|
4328
|
+
local branch_issue_num="${branch#daemon/issue-}"
|
|
4329
|
+
# Skip if there's an active job for this issue
|
|
4330
|
+
if daemon_is_inflight "$branch_issue_num" 2>/dev/null; then
|
|
4331
|
+
continue
|
|
4332
|
+
fi
|
|
4333
|
+
daemon_log INFO "Removing orphaned branch: ${branch}"
|
|
4334
|
+
git branch -D "$branch" 2>/dev/null || true
|
|
4335
|
+
cleaned=$((cleaned + 1))
|
|
4336
|
+
done < <(git branch --list 'daemon/issue-*' 2>/dev/null)
|
|
4337
|
+
fi
|
|
4338
|
+
|
|
4339
|
+
# ── 4. Prune completed/failed state entries older than age_days ──
|
|
2431
4340
|
if [[ -f "$STATE_FILE" ]]; then
|
|
2432
4341
|
local cutoff_iso
|
|
2433
4342
|
cutoff_iso=$(epoch_to_iso $((now_e - age_secs)))
|
|
2434
|
-
local before_count
|
|
4343
|
+
local before_count
|
|
2435
4344
|
before_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
4345
|
+
locked_state_update --arg cutoff "$cutoff_iso" \
|
|
4346
|
+
'.completed = [.completed[] | select(.completed_at > $cutoff)]' 2>/dev/null || true
|
|
4347
|
+
local after_count
|
|
4348
|
+
after_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
|
|
4349
|
+
local pruned=$((before_count - after_count))
|
|
4350
|
+
if [[ "$pruned" -gt 0 ]]; then
|
|
4351
|
+
daemon_log INFO "Pruned ${pruned} old completed state entries"
|
|
4352
|
+
cleaned=$((cleaned + pruned))
|
|
4353
|
+
fi
|
|
4354
|
+
fi
|
|
4355
|
+
|
|
4356
|
+
# ── 5. Prune stale retry_counts (issues no longer in flight or queued) ──
|
|
4357
|
+
if [[ -f "$STATE_FILE" ]]; then
|
|
4358
|
+
local retry_keys
|
|
4359
|
+
retry_keys=$(jq -r '.retry_counts // {} | keys[]' "$STATE_FILE" 2>/dev/null || true)
|
|
4360
|
+
local stale_keys=()
|
|
4361
|
+
while IFS= read -r key; do
|
|
4362
|
+
[[ -z "$key" ]] && continue
|
|
4363
|
+
if ! daemon_is_inflight "$key" 2>/dev/null; then
|
|
4364
|
+
stale_keys+=("$key")
|
|
2447
4365
|
fi
|
|
4366
|
+
done <<< "$retry_keys"
|
|
4367
|
+
if [[ ${#stale_keys[@]} -gt 0 ]]; then
|
|
4368
|
+
for sk in "${stale_keys[@]}"; do
|
|
4369
|
+
locked_state_update --arg k "$sk" 'del(.retry_counts[$k])' 2>/dev/null || continue
|
|
4370
|
+
done
|
|
4371
|
+
daemon_log INFO "Pruned ${#stale_keys[@]} stale retry count(s)"
|
|
4372
|
+
cleaned=$((cleaned + ${#stale_keys[@]}))
|
|
2448
4373
|
fi
|
|
2449
4374
|
fi
|
|
2450
4375
|
|
|
@@ -2465,56 +4390,69 @@ daemon_poll_loop() {
|
|
|
2465
4390
|
daemon_log INFO "Watching for label: ${CYAN}${WATCH_LABEL}${RESET}"
|
|
2466
4391
|
|
|
2467
4392
|
while [[ ! -f "$SHUTDOWN_FLAG" ]]; do
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
4393
|
+
# All poll loop calls are error-guarded to prevent set -e from killing the daemon.
|
|
4394
|
+
# The || operator disables set -e for the entire call chain, so transient failures
|
|
4395
|
+
# (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
|
|
4396
|
+
daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
|
|
4397
|
+
daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
|
|
4398
|
+
daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
|
|
2471
4399
|
|
|
2472
4400
|
# Increment cycle counter (must be before all modulo checks)
|
|
2473
4401
|
POLL_CYCLE_COUNT=$((POLL_CYCLE_COUNT + 1))
|
|
2474
4402
|
|
|
2475
4403
|
# Fleet config reload every 3 cycles
|
|
2476
4404
|
if [[ $((POLL_CYCLE_COUNT % 3)) -eq 0 ]]; then
|
|
2477
|
-
daemon_reload_config
|
|
4405
|
+
daemon_reload_config || daemon_log WARN "daemon_reload_config failed — continuing"
|
|
2478
4406
|
fi
|
|
2479
4407
|
|
|
2480
4408
|
# Check degradation every 5 poll cycles
|
|
2481
4409
|
if [[ $((POLL_CYCLE_COUNT % 5)) -eq 0 ]]; then
|
|
2482
|
-
daemon_check_degradation
|
|
4410
|
+
daemon_check_degradation || daemon_log WARN "daemon_check_degradation failed — continuing"
|
|
2483
4411
|
fi
|
|
2484
4412
|
|
|
2485
4413
|
# Auto-scale every N cycles (default: 5)
|
|
2486
4414
|
if [[ $((POLL_CYCLE_COUNT % ${AUTO_SCALE_INTERVAL:-5})) -eq 0 ]]; then
|
|
2487
|
-
daemon_auto_scale
|
|
4415
|
+
daemon_auto_scale || daemon_log WARN "daemon_auto_scale failed — continuing"
|
|
2488
4416
|
fi
|
|
2489
4417
|
|
|
2490
4418
|
# Self-optimize every N cycles (default: 10)
|
|
2491
4419
|
if [[ $((POLL_CYCLE_COUNT % ${OPTIMIZE_INTERVAL:-10})) -eq 0 ]]; then
|
|
2492
|
-
daemon_self_optimize
|
|
4420
|
+
daemon_self_optimize || daemon_log WARN "daemon_self_optimize failed — continuing"
|
|
2493
4421
|
fi
|
|
2494
4422
|
|
|
2495
4423
|
# Stale state reaper every N cycles (default: 10)
|
|
2496
4424
|
if [[ $((POLL_CYCLE_COUNT % ${STALE_REAPER_INTERVAL:-10})) -eq 0 ]]; then
|
|
2497
|
-
daemon_cleanup_stale
|
|
4425
|
+
daemon_cleanup_stale || daemon_log WARN "daemon_cleanup_stale failed — continuing"
|
|
2498
4426
|
fi
|
|
2499
4427
|
|
|
2500
|
-
#
|
|
4428
|
+
# Rotate event log every 10 cycles (~10 min with 60s interval)
|
|
4429
|
+
if [[ $((POLL_CYCLE_COUNT % 10)) -eq 0 ]]; then
|
|
4430
|
+
rotate_event_log || true
|
|
4431
|
+
fi
|
|
4432
|
+
|
|
4433
|
+
# Proactive patrol during quiet periods (with adaptive limits)
|
|
2501
4434
|
local issue_count_now active_count_now
|
|
2502
4435
|
issue_count_now=$(jq -r '.queued | length' "$STATE_FILE" 2>/dev/null || echo 0)
|
|
2503
|
-
active_count_now=$(get_active_count)
|
|
4436
|
+
active_count_now=$(get_active_count || echo 0)
|
|
2504
4437
|
if [[ "$issue_count_now" -eq 0 ]] && [[ "$active_count_now" -eq 0 ]]; then
|
|
2505
4438
|
local now_e
|
|
2506
|
-
now_e=$(now_epoch)
|
|
4439
|
+
now_e=$(now_epoch || date +%s)
|
|
2507
4440
|
if [[ $((now_e - LAST_PATROL_EPOCH)) -ge "$PATROL_INTERVAL" ]]; then
|
|
4441
|
+
load_adaptive_patrol_limits || true
|
|
2508
4442
|
daemon_log INFO "No active work — running patrol"
|
|
2509
|
-
daemon_patrol --once
|
|
4443
|
+
daemon_patrol --once || daemon_log WARN "daemon_patrol failed — continuing"
|
|
2510
4444
|
LAST_PATROL_EPOCH=$now_e
|
|
2511
4445
|
fi
|
|
2512
4446
|
fi
|
|
2513
4447
|
|
|
4448
|
+
# ── Adaptive poll interval: adjust sleep based on queue state ──
|
|
4449
|
+
local effective_interval
|
|
4450
|
+
effective_interval=$(get_adaptive_poll_interval "$issue_count_now" "$active_count_now" || echo "${POLL_INTERVAL:-30}")
|
|
4451
|
+
|
|
2514
4452
|
# Sleep in 1s intervals so we can catch shutdown quickly
|
|
2515
4453
|
local i=0
|
|
2516
|
-
while [[ $i -lt $
|
|
2517
|
-
sleep 1
|
|
4454
|
+
while [[ $i -lt $effective_interval ]] && [[ ! -f "$SHUTDOWN_FLAG" ]]; do
|
|
4455
|
+
sleep 1 || true # Guard against signal interruption under set -e
|
|
2518
4456
|
i=$((i + 1))
|
|
2519
4457
|
done
|
|
2520
4458
|
done
|
|
@@ -2525,7 +4463,39 @@ daemon_poll_loop() {
|
|
|
2525
4463
|
# ─── Graceful Shutdown Handler ───────────────────────────────────────────────
|
|
2526
4464
|
|
|
2527
4465
|
cleanup_on_exit() {
|
|
2528
|
-
|
|
4466
|
+
local exit_code=$?
|
|
4467
|
+
local last_cmd="${BASH_COMMAND:-unknown}"
|
|
4468
|
+
daemon_log INFO "Cleaning up... (exit_code=${exit_code}, last_command=${last_cmd})"
|
|
4469
|
+
|
|
4470
|
+
# Kill all active pipeline child processes
|
|
4471
|
+
if [[ -f "$STATE_FILE" ]]; then
|
|
4472
|
+
local child_pids
|
|
4473
|
+
child_pids=$(jq -r '.active_jobs[].pid // empty' "$STATE_FILE" 2>/dev/null || true)
|
|
4474
|
+
if [[ -n "$child_pids" ]]; then
|
|
4475
|
+
local killed=0
|
|
4476
|
+
while IFS= read -r cpid; do
|
|
4477
|
+
[[ -z "$cpid" ]] && continue
|
|
4478
|
+
if kill -0 "$cpid" 2>/dev/null; then
|
|
4479
|
+
daemon_log INFO "Killing pipeline process PID ${cpid}"
|
|
4480
|
+
kill "$cpid" 2>/dev/null || true
|
|
4481
|
+
killed=$((killed + 1))
|
|
4482
|
+
fi
|
|
4483
|
+
done <<< "$child_pids"
|
|
4484
|
+
if [[ $killed -gt 0 ]]; then
|
|
4485
|
+
daemon_log INFO "Sent SIGTERM to ${killed} pipeline process(es) — waiting 5s"
|
|
4486
|
+
sleep 5
|
|
4487
|
+
# Force-kill any that didn't exit
|
|
4488
|
+
while IFS= read -r cpid; do
|
|
4489
|
+
[[ -z "$cpid" ]] && continue
|
|
4490
|
+
if kill -0 "$cpid" 2>/dev/null; then
|
|
4491
|
+
daemon_log WARN "Force-killing pipeline PID ${cpid}"
|
|
4492
|
+
kill -9 "$cpid" 2>/dev/null || true
|
|
4493
|
+
fi
|
|
4494
|
+
done <<< "$child_pids"
|
|
4495
|
+
fi
|
|
4496
|
+
fi
|
|
4497
|
+
fi
|
|
4498
|
+
|
|
2529
4499
|
rm -f "$PID_FILE" "$SHUTDOWN_FLAG"
|
|
2530
4500
|
daemon_log INFO "Daemon stopped"
|
|
2531
4501
|
emit_event "daemon.stopped" "pid=$$"
|
|
@@ -2544,15 +4514,19 @@ daemon_start() {
|
|
|
2544
4514
|
local existing_pid
|
|
2545
4515
|
existing_pid=$(cat "$PID_FILE" 2>/dev/null || true)
|
|
2546
4516
|
if [[ -n "$existing_pid" ]] && kill -0 "$existing_pid" 2>/dev/null; then
|
|
4517
|
+
exec 9>&- # Release FD before exiting
|
|
2547
4518
|
error "Daemon already running (PID: ${existing_pid})"
|
|
2548
4519
|
info "Use ${CYAN}shipwright daemon stop${RESET} to stop it first"
|
|
2549
4520
|
exit 1
|
|
2550
4521
|
else
|
|
2551
4522
|
warn "Stale PID file found — removing"
|
|
2552
4523
|
rm -f "$PID_FILE"
|
|
4524
|
+
exec 9>&- # Release old FD
|
|
2553
4525
|
exec 9>"$PID_FILE"
|
|
2554
4526
|
fi
|
|
2555
4527
|
fi
|
|
4528
|
+
# Release FD 9 — we only needed it for the startup race check
|
|
4529
|
+
exec 9>&-
|
|
2556
4530
|
|
|
2557
4531
|
# Load config
|
|
2558
4532
|
load_config
|
|
@@ -2569,10 +4543,10 @@ daemon_start() {
|
|
|
2569
4543
|
exit 1
|
|
2570
4544
|
fi
|
|
2571
4545
|
|
|
2572
|
-
info "Starting daemon in detached tmux session: ${CYAN}
|
|
4546
|
+
info "Starting daemon in detached tmux session: ${CYAN}sw-daemon${RESET}"
|
|
2573
4547
|
|
|
2574
4548
|
# Build the command to run in tmux
|
|
2575
|
-
local cmd_args=("$SCRIPT_DIR/
|
|
4549
|
+
local cmd_args=("$SCRIPT_DIR/sw-daemon.sh" "start")
|
|
2576
4550
|
if [[ -n "$CONFIG_PATH" ]]; then
|
|
2577
4551
|
cmd_args+=("--config" "$CONFIG_PATH")
|
|
2578
4552
|
fi
|
|
@@ -2580,14 +4554,16 @@ daemon_start() {
|
|
|
2580
4554
|
cmd_args+=("--no-github")
|
|
2581
4555
|
fi
|
|
2582
4556
|
|
|
2583
|
-
|
|
4557
|
+
# Export current PATH so detached session finds claude, gh, etc.
|
|
4558
|
+
local tmux_cmd="export PATH='${PATH}'; ${cmd_args[*]}"
|
|
4559
|
+
tmux new-session -d -s "sw-daemon" "$tmux_cmd" 2>/dev/null || {
|
|
2584
4560
|
# Session may already exist — try killing and recreating
|
|
2585
|
-
tmux kill-session -t "
|
|
2586
|
-
tmux new-session -d -s "
|
|
4561
|
+
tmux kill-session -t "sw-daemon" 2>/dev/null || true
|
|
4562
|
+
tmux new-session -d -s "sw-daemon" "$tmux_cmd"
|
|
2587
4563
|
}
|
|
2588
4564
|
|
|
2589
|
-
success "Daemon started in tmux session ${CYAN}
|
|
2590
|
-
info "Attach with: ${DIM}tmux attach -t
|
|
4565
|
+
success "Daemon started in tmux session ${CYAN}sw-daemon${RESET}"
|
|
4566
|
+
info "Attach with: ${DIM}tmux attach -t sw-daemon${RESET}"
|
|
2591
4567
|
info "View logs: ${DIM}shipwright daemon logs --follow${RESET}"
|
|
2592
4568
|
return 0
|
|
2593
4569
|
fi
|
|
@@ -2595,8 +4571,10 @@ daemon_start() {
|
|
|
2595
4571
|
# Foreground mode
|
|
2596
4572
|
info "Starting daemon (PID: $$)"
|
|
2597
4573
|
|
|
2598
|
-
# Write PID file
|
|
2599
|
-
|
|
4574
|
+
# Write PID file atomically
|
|
4575
|
+
local pid_tmp="${PID_FILE}.tmp.$$"
|
|
4576
|
+
echo "$$" > "$pid_tmp"
|
|
4577
|
+
mv "$pid_tmp" "$PID_FILE"
|
|
2600
4578
|
|
|
2601
4579
|
# Remove stale shutdown flag
|
|
2602
4580
|
rm -f "$SHUTDOWN_FLAG"
|
|
@@ -2606,10 +4584,26 @@ daemon_start() {
|
|
|
2606
4584
|
|
|
2607
4585
|
# Trap signals for graceful shutdown
|
|
2608
4586
|
trap cleanup_on_exit EXIT
|
|
2609
|
-
trap 'touch "$SHUTDOWN_FLAG"' SIGINT SIGTERM
|
|
4587
|
+
trap '{ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [WARN] SIGINT/SIGTERM received — initiating shutdown" >> "$LOG_FILE" 2>/dev/null; } || true; touch "$SHUTDOWN_FLAG"' SIGINT SIGTERM
|
|
4588
|
+
# Ignore SIGHUP — tmux sends this on attach/detach and we must survive it
|
|
4589
|
+
trap '' SIGHUP
|
|
4590
|
+
# Ignore SIGPIPE — broken pipes in command substitutions must not kill the daemon
|
|
4591
|
+
trap '' SIGPIPE
|
|
4592
|
+
|
|
4593
|
+
# Override global ERR trap to log to daemon log file (not stderr, which is lost when tmux dies)
|
|
4594
|
+
trap '{ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [ERROR] ERR trap: line=$LINENO exit=$? cmd=$BASH_COMMAND" >> "$LOG_FILE" 2>/dev/null; } || true' ERR
|
|
2610
4595
|
|
|
2611
4596
|
# Reap any orphaned jobs from previous runs
|
|
2612
|
-
daemon_reap_completed
|
|
4597
|
+
daemon_reap_completed || daemon_log WARN "Failed to reap orphaned jobs — continuing"
|
|
4598
|
+
|
|
4599
|
+
# Clean up stale temp files from previous crashes
|
|
4600
|
+
find "$(dirname "$STATE_FILE")" -name "*.tmp.*" -mmin +5 -delete 2>/dev/null || true
|
|
4601
|
+
|
|
4602
|
+
# Rotate event log on startup
|
|
4603
|
+
rotate_event_log || true
|
|
4604
|
+
|
|
4605
|
+
# Load GitHub context (repo metadata, security alerts, etc.)
|
|
4606
|
+
daemon_github_context || daemon_log WARN "Failed to load GitHub context — continuing without it"
|
|
2613
4607
|
|
|
2614
4608
|
daemon_log INFO "Daemon started successfully"
|
|
2615
4609
|
daemon_log INFO "Config: poll_interval=${POLL_INTERVAL}s, max_parallel=${MAX_PARALLEL}, label=${WATCH_LABEL}"
|
|
@@ -2673,7 +4667,7 @@ daemon_stop() {
|
|
|
2673
4667
|
rm -f "$PID_FILE" "$SHUTDOWN_FLAG"
|
|
2674
4668
|
|
|
2675
4669
|
# Also kill tmux session if it exists
|
|
2676
|
-
tmux kill-session -t "
|
|
4670
|
+
tmux kill-session -t "sw-daemon" 2>/dev/null || true
|
|
2677
4671
|
|
|
2678
4672
|
success "Daemon stopped"
|
|
2679
4673
|
}
|
|
@@ -2819,7 +4813,14 @@ daemon_init() {
|
|
|
2819
4813
|
"patrol": {
|
|
2820
4814
|
"interval": 3600,
|
|
2821
4815
|
"max_issues": 5,
|
|
2822
|
-
"label": "auto-patrol"
|
|
4816
|
+
"label": "auto-patrol",
|
|
4817
|
+
"auto_watch": false,
|
|
4818
|
+
"checks": {
|
|
4819
|
+
"recurring_failures": { "enabled": true, "threshold": 3 },
|
|
4820
|
+
"dora_degradation": { "enabled": true },
|
|
4821
|
+
"untested_scripts": { "enabled": true },
|
|
4822
|
+
"retry_exhaustion": { "enabled": true, "threshold": 2 }
|
|
4823
|
+
}
|
|
2823
4824
|
},
|
|
2824
4825
|
"auto_template": false,
|
|
2825
4826
|
"template_map": {
|
|
@@ -2841,7 +4842,19 @@ daemon_init() {
|
|
|
2841
4842
|
"max_workers": 8,
|
|
2842
4843
|
"min_workers": 1,
|
|
2843
4844
|
"worker_mem_gb": 4,
|
|
2844
|
-
"estimated_cost_per_job_usd": 5.0
|
|
4845
|
+
"estimated_cost_per_job_usd": 5.0,
|
|
4846
|
+
"intelligence": {
|
|
4847
|
+
"enabled": true,
|
|
4848
|
+
"cache_ttl_seconds": 3600,
|
|
4849
|
+
"composer_enabled": true,
|
|
4850
|
+
"optimization_enabled": true,
|
|
4851
|
+
"prediction_enabled": true,
|
|
4852
|
+
"adversarial_enabled": false,
|
|
4853
|
+
"simulation_enabled": false,
|
|
4854
|
+
"architecture_enabled": false,
|
|
4855
|
+
"ab_test_ratio": 0.2,
|
|
4856
|
+
"anomaly_threshold": 3.0
|
|
4857
|
+
}
|
|
2845
4858
|
}
|
|
2846
4859
|
CONFIGEOF
|
|
2847
4860
|
|
|
@@ -3175,7 +5188,7 @@ case "$SUBCOMMAND" in
|
|
|
3175
5188
|
daemon_patrol "$@"
|
|
3176
5189
|
;;
|
|
3177
5190
|
test)
|
|
3178
|
-
exec "$SCRIPT_DIR/
|
|
5191
|
+
exec "$SCRIPT_DIR/sw-daemon-test.sh" "$@"
|
|
3179
5192
|
;;
|
|
3180
5193
|
help|--help|-h)
|
|
3181
5194
|
show_help
|