shipwright-cli 1.9.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/.claude/hooks/post-tool-use.sh +12 -5
  2. package/README.md +114 -36
  3. package/completions/_shipwright +212 -32
  4. package/completions/shipwright.bash +97 -25
  5. package/docs/strategy/01-market-research.md +619 -0
  6. package/docs/strategy/02-mission-and-brand.md +587 -0
  7. package/docs/strategy/03-gtm-and-roadmap.md +759 -0
  8. package/docs/strategy/QUICK-START.txt +289 -0
  9. package/docs/strategy/README.md +172 -0
  10. package/package.json +4 -2
  11. package/scripts/sw +217 -2
  12. package/scripts/sw-activity.sh +500 -0
  13. package/scripts/sw-adaptive.sh +925 -0
  14. package/scripts/sw-adversarial.sh +1 -1
  15. package/scripts/sw-architecture-enforcer.sh +1 -1
  16. package/scripts/sw-auth.sh +613 -0
  17. package/scripts/sw-autonomous.sh +664 -0
  18. package/scripts/sw-changelog.sh +704 -0
  19. package/scripts/sw-checkpoint.sh +79 -1
  20. package/scripts/sw-ci.sh +602 -0
  21. package/scripts/sw-cleanup.sh +192 -7
  22. package/scripts/sw-code-review.sh +637 -0
  23. package/scripts/sw-connect.sh +1 -1
  24. package/scripts/sw-context.sh +605 -0
  25. package/scripts/sw-cost.sh +1 -1
  26. package/scripts/sw-daemon.sh +812 -138
  27. package/scripts/sw-dashboard.sh +1 -1
  28. package/scripts/sw-db.sh +540 -0
  29. package/scripts/sw-decompose.sh +539 -0
  30. package/scripts/sw-deps.sh +551 -0
  31. package/scripts/sw-developer-simulation.sh +1 -1
  32. package/scripts/sw-discovery.sh +412 -0
  33. package/scripts/sw-docs-agent.sh +539 -0
  34. package/scripts/sw-docs.sh +1 -1
  35. package/scripts/sw-doctor.sh +59 -1
  36. package/scripts/sw-dora.sh +615 -0
  37. package/scripts/sw-durable.sh +710 -0
  38. package/scripts/sw-e2e-orchestrator.sh +535 -0
  39. package/scripts/sw-eventbus.sh +393 -0
  40. package/scripts/sw-feedback.sh +471 -0
  41. package/scripts/sw-fix.sh +1 -1
  42. package/scripts/sw-fleet-discover.sh +567 -0
  43. package/scripts/sw-fleet-viz.sh +404 -0
  44. package/scripts/sw-fleet.sh +8 -1
  45. package/scripts/sw-github-app.sh +596 -0
  46. package/scripts/sw-github-checks.sh +1 -1
  47. package/scripts/sw-github-deploy.sh +1 -1
  48. package/scripts/sw-github-graphql.sh +1 -1
  49. package/scripts/sw-guild.sh +569 -0
  50. package/scripts/sw-heartbeat.sh +1 -1
  51. package/scripts/sw-hygiene.sh +559 -0
  52. package/scripts/sw-incident.sh +617 -0
  53. package/scripts/sw-init.sh +88 -1
  54. package/scripts/sw-instrument.sh +699 -0
  55. package/scripts/sw-intelligence.sh +1 -1
  56. package/scripts/sw-jira.sh +1 -1
  57. package/scripts/sw-launchd.sh +366 -31
  58. package/scripts/sw-linear.sh +1 -1
  59. package/scripts/sw-logs.sh +1 -1
  60. package/scripts/sw-loop.sh +507 -51
  61. package/scripts/sw-memory.sh +198 -3
  62. package/scripts/sw-mission-control.sh +487 -0
  63. package/scripts/sw-model-router.sh +545 -0
  64. package/scripts/sw-otel.sh +596 -0
  65. package/scripts/sw-oversight.sh +689 -0
  66. package/scripts/sw-pipeline-composer.sh +8 -8
  67. package/scripts/sw-pipeline-vitals.sh +1096 -0
  68. package/scripts/sw-pipeline.sh +2451 -180
  69. package/scripts/sw-pm.sh +693 -0
  70. package/scripts/sw-pr-lifecycle.sh +522 -0
  71. package/scripts/sw-predictive.sh +1 -1
  72. package/scripts/sw-prep.sh +1 -1
  73. package/scripts/sw-ps.sh +4 -3
  74. package/scripts/sw-public-dashboard.sh +798 -0
  75. package/scripts/sw-quality.sh +595 -0
  76. package/scripts/sw-reaper.sh +5 -3
  77. package/scripts/sw-recruit.sh +573 -0
  78. package/scripts/sw-regression.sh +642 -0
  79. package/scripts/sw-release-manager.sh +736 -0
  80. package/scripts/sw-release.sh +706 -0
  81. package/scripts/sw-remote.sh +1 -1
  82. package/scripts/sw-replay.sh +520 -0
  83. package/scripts/sw-retro.sh +691 -0
  84. package/scripts/sw-scale.sh +444 -0
  85. package/scripts/sw-security-audit.sh +505 -0
  86. package/scripts/sw-self-optimize.sh +109 -8
  87. package/scripts/sw-session.sh +31 -9
  88. package/scripts/sw-setup.sh +1 -1
  89. package/scripts/sw-standup.sh +712 -0
  90. package/scripts/sw-status.sh +192 -1
  91. package/scripts/sw-strategic.sh +658 -0
  92. package/scripts/sw-stream.sh +450 -0
  93. package/scripts/sw-swarm.sh +583 -0
  94. package/scripts/sw-team-stages.sh +511 -0
  95. package/scripts/sw-templates.sh +1 -1
  96. package/scripts/sw-testgen.sh +515 -0
  97. package/scripts/sw-tmux-pipeline.sh +554 -0
  98. package/scripts/sw-tmux.sh +1 -1
  99. package/scripts/sw-trace.sh +485 -0
  100. package/scripts/sw-tracker-github.sh +188 -0
  101. package/scripts/sw-tracker-jira.sh +172 -0
  102. package/scripts/sw-tracker-linear.sh +251 -0
  103. package/scripts/sw-tracker.sh +117 -2
  104. package/scripts/sw-triage.sh +603 -0
  105. package/scripts/sw-upgrade.sh +1 -1
  106. package/scripts/sw-ux.sh +677 -0
  107. package/scripts/sw-webhook.sh +627 -0
  108. package/scripts/sw-widgets.sh +530 -0
  109. package/scripts/sw-worktree.sh +1 -1
  110. package/templates/pipelines/autonomous.json +8 -1
  111. package/templates/pipelines/cost-aware.json +21 -0
  112. package/templates/pipelines/deployed.json +40 -6
  113. package/templates/pipelines/enterprise.json +16 -2
  114. package/templates/pipelines/fast.json +19 -0
  115. package/templates/pipelines/full.json +16 -2
  116. package/templates/pipelines/hotfix.json +19 -0
  117. package/templates/pipelines/standard.json +19 -0
@@ -6,7 +6,10 @@
6
6
  set -euo pipefail
7
7
  trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
8
8
 
9
- VERSION="1.9.0"
9
+ # Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
10
+ unset CLAUDECODE 2>/dev/null || true
11
+
12
+ VERSION="2.0.0"
10
13
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
14
  REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
12
15
 
@@ -34,6 +37,8 @@ RESET='\033[0m'
34
37
  [[ -f "$SCRIPT_DIR/sw-self-optimize.sh" ]] && source "$SCRIPT_DIR/sw-self-optimize.sh"
35
38
  # shellcheck source=sw-predictive.sh
36
39
  [[ -f "$SCRIPT_DIR/sw-predictive.sh" ]] && source "$SCRIPT_DIR/sw-predictive.sh"
40
+ # shellcheck source=sw-pipeline-vitals.sh
41
+ [[ -f "$SCRIPT_DIR/sw-pipeline-vitals.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-vitals.sh"
37
42
 
38
43
  # ─── GitHub API Modules (optional) ────────────────────────────────────────
39
44
  # shellcheck source=sw-github-graphql.sh
@@ -125,7 +130,6 @@ rotate_event_log() {
125
130
  }
126
131
 
127
132
  # ─── GitHub Context (loaded once at startup) ──────────────────────────────
128
- DAEMON_GITHUB_CONTEXT=""
129
133
 
130
134
  daemon_github_context() {
131
135
  # Skip if no GitHub
@@ -141,8 +145,6 @@ daemon_github_context() {
141
145
  context=$(gh_repo_context "$owner" "$repo" 2>/dev/null || echo "{}")
142
146
  if [[ -n "$context" && "$context" != "{}" ]]; then
143
147
  daemon_log INFO "GitHub context loaded: $(echo "$context" | jq -r '.contributor_count // 0') contributors, $(echo "$context" | jq -r '.security_alert_count // 0') security alerts"
144
- DAEMON_GITHUB_CONTEXT="$context"
145
- export DAEMON_GITHUB_CONTEXT
146
148
  fi
147
149
  }
148
150
 
@@ -166,9 +168,9 @@ gh_retry() {
166
168
 
167
169
  # Check for rate-limit or server error indicators
168
170
  if echo "$output" | grep -qiE "rate limit|403|429|502|503"; then
169
- daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s"
171
+ daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s" >&2
170
172
  else
171
- daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s"
173
+ daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s" >&2
172
174
  fi
173
175
 
174
176
  if [[ $attempt -lt $max_retries ]]; then
@@ -421,6 +423,14 @@ load_config() {
421
423
  MAX_RETRIES=$(jq -r '.max_retries // 2' "$config_file")
422
424
  RETRY_ESCALATION=$(jq -r '.retry_escalation // true' "$config_file")
423
425
 
426
+ # session restart + fast test passthrough
427
+ MAX_RESTARTS_CFG=$(jq -r '.max_restarts // 3' "$config_file" 2>/dev/null || echo "3")
428
+ if ! [[ "$MAX_RESTARTS_CFG" =~ ^[0-9]+$ ]]; then
429
+ daemon_log WARN "Invalid max_restarts in config: $MAX_RESTARTS_CFG (using default: 3)"
430
+ MAX_RESTARTS_CFG="3"
431
+ fi
432
+ FAST_TEST_CMD_CFG=$(jq -r '.fast_test_cmd // ""' "$config_file" 2>/dev/null || echo "")
433
+
424
434
  # self-optimization
425
435
  SELF_OPTIMIZE=$(jq -r '.self_optimize // false' "$config_file")
426
436
  OPTIMIZE_INTERVAL=$(jq -r '.optimize_interval // 10' "$config_file")
@@ -471,9 +481,11 @@ load_config() {
471
481
 
472
482
  # progress-based health monitoring (replaces static timeouts)
473
483
  PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
474
- PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 3' "$config_file")
475
- PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 6' "$config_file")
476
- PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 10800' "$config_file") # 3hr absolute max
484
+ PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
485
+ PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
486
+ PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
487
+ NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
488
+ NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
477
489
 
478
490
  # team dashboard URL (for coordinated claiming)
479
491
  local cfg_dashboard_url
@@ -482,6 +494,12 @@ load_config() {
482
494
  DASHBOARD_URL="$cfg_dashboard_url"
483
495
  fi
484
496
 
497
+ # Auto-enable self_optimize when auto_template is on
498
+ if [[ "${AUTO_TEMPLATE:-false}" == "true" && "${SELF_OPTIMIZE:-false}" == "false" ]]; then
499
+ SELF_OPTIMIZE="true"
500
+ daemon_log INFO "Auto-enabling self_optimize (auto_template is true)"
501
+ fi
502
+
485
503
  success "Config loaded"
486
504
  }
487
505
 
@@ -823,6 +841,31 @@ daemon_assess_progress() {
823
841
  has_progress=true
824
842
  fi
825
843
 
844
+ # Claude subprocess is alive and consuming CPU — agent is thinking/working
845
+ # During build stage, Claude can spend 10+ minutes thinking before any
846
+ # visible git changes appear. Detect this as progress.
847
+ if [[ "$has_progress" != "true" ]]; then
848
+ local _pid_for_check
849
+ _pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
850
+ if [[ -z "$_pid_for_check" ]]; then
851
+ # Fallback: get PID from active_jobs
852
+ _pid_for_check=$(jq -r --argjson num "$issue_num" \
853
+ '.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
854
+ fi
855
+ if [[ -n "$_pid_for_check" ]]; then
856
+ # Check if any child process (claude) is alive and using CPU
857
+ local child_cpu=0
858
+ child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
859
+ if [[ "$child_cpu" -eq 0 ]]; then
860
+ # Check children of the pipeline process
861
+ child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
862
+ fi
863
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
864
+ has_progress=true
865
+ fi
866
+ fi
867
+ fi
868
+
826
869
  # Detect repeated errors (same error signature hitting again)
827
870
  local repeated_errors="$prev_repeated_errors"
828
871
  if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
@@ -855,7 +898,56 @@ daemon_assess_progress() {
855
898
  if $npc == 0 then .last_progress_at = $ts else . end
856
899
  ' "$progress_file" > "$tmp_progress" 2>/dev/null && mv "$tmp_progress" "$progress_file"
857
900
 
858
- # Determine verdict
901
+ # ── Vitals-based verdict (preferred over static thresholds) ──
902
+ if type pipeline_compute_vitals &>/dev/null 2>&1 && type pipeline_health_verdict &>/dev/null 2>&1; then
903
+ # Compute vitals using the worktree's pipeline state if available
904
+ local _worktree_state=""
905
+ local _worktree_artifacts=""
906
+ local _worktree_dir
907
+ _worktree_dir=$(jq -r --arg i "$issue_num" '.active_jobs[] | select(.issue == ($i | tonumber)) | .worktree // ""' "$STATE_FILE" 2>/dev/null || echo "")
908
+ if [[ -n "$_worktree_dir" && -d "$_worktree_dir/.claude" ]]; then
909
+ _worktree_state="$_worktree_dir/.claude/pipeline-state.md"
910
+ _worktree_artifacts="$_worktree_dir/.claude/pipeline-artifacts"
911
+ fi
912
+
913
+ local _vitals_json
914
+ _vitals_json=$(pipeline_compute_vitals "$_worktree_state" "$_worktree_artifacts" "$issue_num" 2>/dev/null) || true
915
+ if [[ -n "$_vitals_json" && "$_vitals_json" != "{}" ]]; then
916
+ local _health_verdict _health_score
917
+ _health_verdict=$(echo "$_vitals_json" | jq -r '.verdict // "continue"' 2>/dev/null || echo "continue")
918
+ _health_score=$(echo "$_vitals_json" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
919
+
920
+ emit_event "pipeline.vitals_check" \
921
+ "issue=$issue_num" \
922
+ "health_score=$_health_score" \
923
+ "verdict=$_health_verdict" \
924
+ "no_progress=$no_progress_count" \
925
+ "repeated_errors=$repeated_errors"
926
+
927
+ # Map vitals verdict to daemon verdict
928
+ case "$_health_verdict" in
929
+ continue)
930
+ echo "healthy"
931
+ return
932
+ ;;
933
+ warn)
934
+ # Sluggish but not dead — equivalent to slowing
935
+ echo "slowing"
936
+ return
937
+ ;;
938
+ intervene)
939
+ echo "stalled"
940
+ return
941
+ ;;
942
+ abort)
943
+ echo "stuck"
944
+ return
945
+ ;;
946
+ esac
947
+ fi
948
+ fi
949
+
950
+ # ── Fallback: static threshold verdict ──
859
951
  local warn_threshold="${PROGRESS_CHECKS_BEFORE_WARN:-3}"
860
952
  local kill_threshold="${PROGRESS_CHECKS_BEFORE_KILL:-6}"
861
953
 
@@ -1039,6 +1131,7 @@ extract_issue_dependencies() {
1039
1131
  }
1040
1132
 
1041
1133
  # ─── Logging ─────────────────────────────────────────────────────────────────
1134
+ DAEMON_LOG_WRITE_COUNT=0
1042
1135
 
1043
1136
  daemon_log() {
1044
1137
  local level="$1"
@@ -1048,8 +1141,9 @@ daemon_log() {
1048
1141
  ts=$(now_iso)
1049
1142
  echo "[$ts] [$level] $msg" >> "$LOG_FILE"
1050
1143
 
1051
- # Rotate daemon.log if over 20MB (checked every ~100 writes)
1052
- if [[ $(( RANDOM % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
1144
+ # Rotate daemon.log if over 20MB (checked every 100 writes)
1145
+ DAEMON_LOG_WRITE_COUNT=$(( DAEMON_LOG_WRITE_COUNT + 1 ))
1146
+ if [[ $(( DAEMON_LOG_WRITE_COUNT % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
1053
1147
  local log_size
1054
1148
  log_size=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
1055
1149
  if [[ "$log_size" -gt 20971520 ]]; then
@@ -1060,11 +1154,14 @@ daemon_log() {
1060
1154
  fi
1061
1155
  fi
1062
1156
 
1063
- # Also print to stdout
1157
+ # Print to stderr (NOT stdout) to avoid corrupting command substitution captures.
1158
+ # This is critical: functions like select_pipeline_template(), triage_score_issue(),
1159
+ # gh_retry(), and locked_get_active_count() return values via echo/stdout and are
1160
+ # called via $(). If daemon_log writes to stdout, the log text corrupts return values.
1064
1161
  case "$level" in
1065
- INFO) info "$msg" ;;
1066
- SUCCESS) success "$msg" ;;
1067
- WARN) warn "$msg" ;;
1162
+ INFO) info "$msg" >&2 ;;
1163
+ SUCCESS) success "$msg" >&2 ;;
1164
+ WARN) warn "$msg" >&2 ;;
1068
1165
  ERROR) error "$msg" ;;
1069
1166
  esac
1070
1167
  }
@@ -1130,7 +1227,10 @@ gh_record_failure() {
1130
1227
  GH_CONSECUTIVE_FAILURES=$((GH_CONSECUTIVE_FAILURES + 1))
1131
1228
  if [[ "$GH_CONSECUTIVE_FAILURES" -ge 3 ]]; then
1132
1229
  # Exponential backoff: 30s, 60s, 120s, 240s (capped at 5min)
1133
- local backoff_secs=$((30 * (1 << (GH_CONSECUTIVE_FAILURES - 3))))
1230
+ # Cap shift to avoid integer overflow for large failure counts
1231
+ local shift_amt=$(( GH_CONSECUTIVE_FAILURES - 3 ))
1232
+ [[ "$shift_amt" -gt 4 ]] && shift_amt=4
1233
+ local backoff_secs=$((30 * (1 << shift_amt)))
1134
1234
  [[ "$backoff_secs" -gt 300 ]] && backoff_secs=300
1135
1235
  GH_BACKOFF_UNTIL=$(( $(now_epoch) + backoff_secs ))
1136
1236
  daemon_log WARN "GitHub rate-limit circuit breaker: backing off ${backoff_secs}s after ${GH_CONSECUTIVE_FAILURES} failures"
@@ -1138,6 +1238,74 @@ gh_record_failure() {
1138
1238
  fi
1139
1239
  }
1140
1240
 
1241
+ # ─── Runtime Auth Check ──────────────────────────────────────────────────────
1242
+
1243
+ LAST_AUTH_CHECK_EPOCH=0
1244
+ AUTH_CHECK_INTERVAL=300 # 5 minutes
1245
+
1246
+ daemon_preflight_auth_check() {
1247
+ local now_e
1248
+ now_e=$(now_epoch)
1249
+ if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
1250
+ return 0
1251
+ fi
1252
+ LAST_AUTH_CHECK_EPOCH="$now_e"
1253
+
1254
+ # gh auth check
1255
+ if [[ "${NO_GITHUB:-false}" != "true" ]]; then
1256
+ if ! gh auth status &>/dev/null 2>&1; then
1257
+ daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
1258
+ local pause_json
1259
+ pause_json=$(jq -n --arg reason "gh_auth_failure" --arg ts "$(now_iso)" \
1260
+ '{reason: $reason, timestamp: $ts}')
1261
+ local _tmp_pause
1262
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1263
+ echo "$pause_json" > "$_tmp_pause"
1264
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1265
+ emit_event "daemon.auto_pause" "reason=gh_auth_failure"
1266
+ return 1
1267
+ fi
1268
+ fi
1269
+
1270
+ # claude auth check with 15s timeout (macOS has no timeout command)
1271
+ local claude_auth_ok=false
1272
+ local _auth_tmp
1273
+ _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
1274
+ ( claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null ) &
1275
+ local _auth_pid=$!
1276
+ local _auth_waited=0
1277
+ while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
1278
+ sleep 1
1279
+ _auth_waited=$((_auth_waited + 1))
1280
+ done
1281
+ if kill -0 "$_auth_pid" 2>/dev/null; then
1282
+ kill "$_auth_pid" 2>/dev/null || true
1283
+ wait "$_auth_pid" 2>/dev/null || true
1284
+ else
1285
+ wait "$_auth_pid" 2>/dev/null || true
1286
+ fi
1287
+
1288
+ if [[ -s "$_auth_tmp" ]]; then
1289
+ claude_auth_ok=true
1290
+ fi
1291
+ rm -f "$_auth_tmp"
1292
+
1293
+ if [[ "$claude_auth_ok" != "true" ]]; then
1294
+ daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
1295
+ local pause_json
1296
+ pause_json=$(jq -n --arg reason "claude_auth_failure" --arg ts "$(now_iso)" \
1297
+ '{reason: $reason, timestamp: $ts}')
1298
+ local _tmp_pause
1299
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1300
+ echo "$pause_json" > "$_tmp_pause"
1301
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1302
+ emit_event "daemon.auto_pause" "reason=claude_auth_failure"
1303
+ return 1
1304
+ fi
1305
+
1306
+ return 0
1307
+ }
1308
+
1141
1309
  # ─── Pre-flight Checks ──────────────────────────────────────────────────────
1142
1310
 
1143
1311
  preflight_checks() {
@@ -1380,7 +1548,7 @@ locked_get_active_count() {
1380
1548
  (
1381
1549
  if command -v flock &>/dev/null; then
1382
1550
  flock -w 5 200 2>/dev/null || {
1383
- daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default"
1551
+ daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default" >&2
1384
1552
  echo "$MAX_PARALLEL"
1385
1553
  exit 0
1386
1554
  }
@@ -1539,9 +1707,24 @@ daemon_spawn_pipeline() {
1539
1707
  local issue_num="$1"
1540
1708
  local issue_title="${2:-}"
1541
1709
  local repo_full_name="${3:-}" # owner/repo (org mode only)
1710
+ shift 3 2>/dev/null || true
1711
+ local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
1542
1712
 
1543
1713
  daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
1544
1714
 
1715
+ # ── Issue decomposition (if decomposer available) ──
1716
+ local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
1717
+ if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
1718
+ local decompose_result=""
1719
+ decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
1720
+ if [[ "$decompose_result" == *"decomposed"* ]]; then
1721
+ daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
1722
+ # Remove the shipwright label so decomposed parent doesn't re-queue
1723
+ gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
1724
+ return 0
1725
+ fi
1726
+ fi
1727
+
1545
1728
  # Extract goal text from issue (title + first line of body)
1546
1729
  local issue_goal="$issue_title"
1547
1730
  if [[ "$NO_GITHUB" != "true" ]]; then
@@ -1626,6 +1809,17 @@ daemon_spawn_pipeline() {
1626
1809
  daemon_log INFO "Worktree created at ${work_dir}"
1627
1810
  fi
1628
1811
 
1812
+ # If template is "composed", copy the composed spec into the worktree
1813
+ if [[ "$PIPELINE_TEMPLATE" == "composed" ]]; then
1814
+ local _src_composed="${REPO_DIR:-.}/.claude/pipeline-artifacts/composed-pipeline.json"
1815
+ if [[ -f "$_src_composed" ]]; then
1816
+ local _dst_artifacts="${work_dir}/.claude/pipeline-artifacts"
1817
+ mkdir -p "$_dst_artifacts"
1818
+ cp "$_src_composed" "$_dst_artifacts/composed-pipeline.json" 2>/dev/null || true
1819
+ daemon_log INFO "Copied composed pipeline spec to worktree"
1820
+ fi
1821
+ fi
1822
+
1629
1823
  # Build pipeline args
1630
1824
  local pipeline_args=("start" "--issue" "$issue_num" "--pipeline" "$PIPELINE_TEMPLATE")
1631
1825
  if [[ "$SKIP_GATES" == "true" ]]; then
@@ -1637,12 +1831,27 @@ daemon_spawn_pipeline() {
1637
1831
  if [[ "$NO_GITHUB" == "true" ]]; then
1638
1832
  pipeline_args+=("--no-github")
1639
1833
  fi
1834
+ # Pass session restart config
1835
+ if [[ "${MAX_RESTARTS_CFG:-0}" -gt 0 ]]; then
1836
+ pipeline_args+=("--max-restarts" "$MAX_RESTARTS_CFG")
1837
+ fi
1838
+ # Pass fast test command
1839
+ if [[ -n "${FAST_TEST_CMD_CFG:-}" ]]; then
1840
+ pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
1841
+ fi
1842
+
1843
+ # Append any extra pipeline args (from retry escalation, etc.)
1844
+ if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
1845
+ pipeline_args+=("${extra_pipeline_args[@]}")
1846
+ fi
1640
1847
 
1641
1848
  # Run pipeline in work directory (background)
1849
+ # Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
1642
1850
  echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
1643
1851
  (
1852
+ trap '' HUP
1644
1853
  cd "$work_dir"
1645
- "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1854
+ exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1646
1855
  ) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
1647
1856
  local pid=$!
1648
1857
 
@@ -1770,6 +1979,41 @@ daemon_reap_completed() {
1770
1979
  daemon_on_success "$issue_num" "$duration_str"
1771
1980
  else
1772
1981
  daemon_on_failure "$issue_num" "$exit_code" "$duration_str"
1982
+
1983
+ # Cancel any lingering in_progress GitHub Check Runs for failed job
1984
+ if [[ "${NO_GITHUB:-false}" != "true" && -n "$worktree" ]]; then
1985
+ local check_ids_file="${worktree}/.claude/pipeline-artifacts/check-run-ids.json"
1986
+ if [[ -f "$check_ids_file" ]]; then
1987
+ daemon_log INFO "Cancelling in-progress check runs for issue #${issue_num}"
1988
+ local _stage
1989
+ while IFS= read -r _stage; do
1990
+ [[ -z "$_stage" ]] && continue
1991
+ # Direct API call since we're in daemon context
1992
+ local _run_id
1993
+ _run_id=$(jq -r --arg s "$_stage" '.[$s] // empty' "$check_ids_file" 2>/dev/null || true)
1994
+ if [[ -n "$_run_id" && "$_run_id" != "null" ]]; then
1995
+ local _detected
1996
+ _detected=$(git remote get-url origin 2>/dev/null | sed 's|.*github.com[:/]\(.*\)\.git$|\1|' || true)
1997
+ if [[ -n "$_detected" ]]; then
1998
+ local _owner="${_detected%%/*}" _repo="${_detected##*/}"
1999
+ gh api "repos/${_owner}/${_repo}/check-runs/${_run_id}" \
2000
+ --method PATCH \
2001
+ --field status=completed \
2002
+ --field conclusion=cancelled \
2003
+ --silent 2>/dev/null || true
2004
+ fi
2005
+ fi
2006
+ done < <(jq -r 'keys[]' "$check_ids_file" 2>/dev/null || true)
2007
+ fi
2008
+ fi
2009
+ fi
2010
+
2011
+ # Finalize memory (capture failure patterns for future runs)
2012
+ if type memory_finalize_pipeline &>/dev/null 2>&1; then
2013
+ local _job_state _job_artifacts
2014
+ _job_state="${worktree:-.}/.claude/pipeline-state.md"
2015
+ _job_artifacts="${worktree:-.}/.claude/pipeline-artifacts"
2016
+ memory_finalize_pipeline "$_job_state" "$_job_artifacts" 2>/dev/null || true
1773
2017
  fi
1774
2018
 
1775
2019
  # Clean up progress tracking for this job
@@ -1780,15 +2024,18 @@ daemon_reap_completed() {
1780
2024
  reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
1781
2025
  release_claim "$issue_num" "$reap_machine_name"
1782
2026
 
1783
- # Skip cleanup if a retry was just spawned for this issue
2027
+ # Always remove the OLD job entry from active_jobs to prevent
2028
+ # re-reaping of the dead PID on the next cycle. When a retry was
2029
+ # spawned, daemon_spawn_pipeline already added a fresh entry with
2030
+ # the new PID — we must not leave the stale one behind.
2031
+ locked_state_update --argjson num "$issue_num" \
2032
+ --argjson old_pid "${pid:-0}" \
2033
+ '.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
2034
+ untrack_priority_job "$issue_num"
2035
+
1784
2036
  if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
1785
2037
  daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
1786
2038
  else
1787
- # Remove from active_jobs and priority lane tracking (locked)
1788
- locked_state_update --argjson num "$issue_num" \
1789
- '.active_jobs = [.active_jobs[] | select(.issue != $num)]'
1790
- untrack_priority_job "$issue_num"
1791
-
1792
2039
  # Clean up worktree (skip for org-mode clones — they persist)
1793
2040
  local job_repo
1794
2041
  job_repo=$(echo "$job" | jq -r '.repo // ""')
@@ -1827,6 +2074,9 @@ daemon_reap_completed() {
1827
2074
  daemon_on_success() {
1828
2075
  local issue_num="$1" duration="${2:-}"
1829
2076
 
2077
+ # Reset consecutive failure tracking on any success
2078
+ reset_failure_tracking
2079
+
1830
2080
  daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
1831
2081
 
1832
2082
  # Record pipeline duration for adaptive threshold learning
@@ -1887,6 +2137,91 @@ Check the associated PR for the implementation." 2>/dev/null || true
1887
2137
  "$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
1888
2138
  }
1889
2139
 
2140
+ # ─── Failure Classification ─────────────────────────────────────────────────
2141
+
2142
+ classify_failure() {
2143
+ local issue_num="$1"
2144
+ if [[ -z "${LOG_DIR:-}" ]]; then
2145
+ echo "unknown"
2146
+ return
2147
+ fi
2148
+ local log_path="$LOG_DIR/issue-${issue_num}.log"
2149
+ if [[ ! -f "$log_path" ]]; then
2150
+ echo "unknown"
2151
+ return
2152
+ fi
2153
+ local tail_content
2154
+ tail_content=$(tail -200 "$log_path" 2>/dev/null || true)
2155
+
2156
+ # Auth errors
2157
+ if echo "$tail_content" | grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required'; then
2158
+ echo "auth_error"
2159
+ return
2160
+ fi
2161
+ # API errors (rate limits, timeouts, server errors)
2162
+ if echo "$tail_content" | grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable'; then
2163
+ echo "api_error"
2164
+ return
2165
+ fi
2166
+ # Invalid issue (not found, empty body)
2167
+ if echo "$tail_content" | grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist'; then
2168
+ echo "invalid_issue"
2169
+ return
2170
+ fi
2171
+ # Context exhaustion — check progress file
2172
+ local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2173
+ local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2174
+ if [[ -f "$progress_file" ]]; then
2175
+ local cf_iter
2176
+ cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2177
+ if ! [[ "${cf_iter:-0}" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
2178
+ local cf_tests
2179
+ cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2180
+ if [[ "${cf_iter:-0}" -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
2181
+ echo "context_exhaustion"
2182
+ return
2183
+ fi
2184
+ fi
2185
+ # Build failure (test errors, compile errors)
2186
+ if echo "$tail_content" | grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]'; then
2187
+ echo "build_failure"
2188
+ return
2189
+ fi
2190
+ echo "unknown"
2191
+ }
2192
+
2193
+ # ─── Consecutive Failure Tracking ──────────────────────────────────────────
2194
+
2195
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2196
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2197
+
2198
+ record_failure_class() {
2199
+ local failure_class="$1"
2200
+ if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
2201
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
2202
+ else
2203
+ DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
2204
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=1
2205
+ fi
2206
+
2207
+ if [[ "$DAEMON_CONSECUTIVE_FAILURE_COUNT" -ge 3 ]]; then
2208
+ daemon_log ERROR "3 consecutive failures (class: ${failure_class}) — auto-pausing daemon"
2209
+ local pause_json
2210
+ pause_json=$(jq -n --arg reason "consecutive_${failure_class}" --arg ts "$(now_iso)" \
2211
+ '{reason: $reason, timestamp: $ts}')
2212
+ local _tmp_pause
2213
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
2214
+ echo "$pause_json" > "$_tmp_pause"
2215
+ mv "$_tmp_pause" "$PAUSE_FLAG"
2216
+ emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$DAEMON_CONSECUTIVE_FAILURE_COUNT"
2217
+ fi
2218
+ }
2219
+
2220
+ reset_failure_tracking() {
2221
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2222
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2223
+ }
2224
+
1890
2225
  # ─── Failure Handler ────────────────────────────────────────────────────────
1891
2226
 
1892
2227
  daemon_on_failure() {
@@ -1923,100 +2258,160 @@ daemon_on_failure() {
1923
2258
  completed_at: $completed_at
1924
2259
  }] | .completed = .completed[-500:]'
1925
2260
 
2261
+ # ── Classify failure and decide retry strategy ──
2262
+ local failure_class
2263
+ failure_class=$(classify_failure "$issue_num")
2264
+ daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
2265
+ emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
2266
+ record_failure_class "$failure_class"
2267
+
1926
2268
  # ── Auto-retry with strategy escalation ──
1927
2269
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
1928
2270
  local retry_count
1929
2271
  retry_count=$(jq -r --arg num "$issue_num" \
1930
2272
  '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
1931
2273
 
1932
- if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
1933
- retry_count=$((retry_count + 1))
1934
-
1935
- # Update retry count in state (locked to prevent race)
1936
- locked_state_update \
1937
- --arg num "$issue_num" --argjson count "$retry_count" \
1938
- '.retry_counts[$num] = $count'
1939
-
1940
- daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
1941
- emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
1942
-
1943
- # Check for checkpoint to enable resume-from-checkpoint
1944
- local checkpoint_args=()
1945
- if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
1946
- # Try to find worktree for this issue to check for checkpoints
1947
- local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
1948
- if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
1949
- local latest_checkpoint=""
1950
- for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
1951
- [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
1952
- done
1953
- if [[ -n "$latest_checkpoint" ]]; then
1954
- daemon_log INFO "Found checkpoint: $latest_checkpoint"
1955
- emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
1956
- checkpoint_args+=("--resume")
1957
- fi
2274
+ # Non-retryable failures skip retry entirely
2275
+ case "$failure_class" in
2276
+ auth_error)
2277
+ daemon_log ERROR "Auth error for issue #${issue_num} skipping retry"
2278
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
2279
+ if [[ "$NO_GITHUB" != "true" ]]; then
2280
+ gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
1958
2281
  fi
1959
- fi
2282
+ ;;
2283
+ invalid_issue)
2284
+ daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
2285
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
2286
+ if [[ "$NO_GITHUB" != "true" ]]; then
2287
+ gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
2288
+ fi
2289
+ ;;
2290
+ *)
2291
+ # Retryable failures — proceed with escalation
2292
+ if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
2293
+ retry_count=$((retry_count + 1))
2294
+
2295
+ # Update retry count in state (locked to prevent race)
2296
+ locked_state_update \
2297
+ --arg num "$issue_num" --argjson count "$retry_count" \
2298
+ '.retry_counts[$num] = $count'
2299
+
2300
+ daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num} (class: ${failure_class})"
2301
+ emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}" "class=$failure_class"
2302
+
2303
+ # Check for checkpoint to enable resume-from-checkpoint
2304
+ local checkpoint_args=()
2305
+ if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2306
+ local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2307
+ if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2308
+ local latest_checkpoint=""
2309
+ for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2310
+ [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2311
+ done
2312
+ if [[ -n "$latest_checkpoint" ]]; then
2313
+ daemon_log INFO "Found checkpoint: $latest_checkpoint"
2314
+ emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2315
+ checkpoint_args+=("--resume")
2316
+ fi
2317
+ fi
2318
+ fi
1960
2319
 
1961
- # Build escalated pipeline args
1962
- local retry_template="$PIPELINE_TEMPLATE"
1963
- local retry_model="${MODEL:-opus}"
1964
- local extra_args=()
1965
-
1966
- if [[ "$retry_count" -eq 1 ]]; then
1967
- # Retry 1: same template, upgrade model, more iterations
1968
- retry_model="opus"
1969
- extra_args+=("--max-iterations" "30")
1970
- daemon_log INFO "Escalation: model=opus, max_iterations=30"
1971
- elif [[ "$retry_count" -ge 2 ]]; then
1972
- # Retry 2: full template, compound quality max cycles
1973
- retry_template="full"
1974
- retry_model="opus"
1975
- extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
1976
- daemon_log INFO "Escalation: template=full, compound_cycles=5"
1977
- fi
2320
+ # Build escalated pipeline args
2321
+ local retry_template="$PIPELINE_TEMPLATE"
2322
+ local retry_model="${MODEL:-opus}"
2323
+ local extra_args=()
2324
+
2325
+ if [[ "$retry_count" -eq 1 ]]; then
2326
+ retry_model="opus"
2327
+ extra_args+=("--max-iterations" "30")
2328
+ daemon_log INFO "Escalation: model=opus, max_iterations=30"
2329
+ elif [[ "$retry_count" -ge 2 ]]; then
2330
+ retry_template="full"
2331
+ retry_model="opus"
2332
+ extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2333
+ daemon_log INFO "Escalation: template=full, compound_cycles=5"
2334
+ fi
1978
2335
 
1979
- if [[ "$NO_GITHUB" != "true" ]]; then
1980
- gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2336
+ # Increase restarts on context exhaustion
2337
+ if [[ "$failure_class" == "context_exhaustion" ]]; then
2338
+ local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2339
+ if [[ "$boosted_restarts" -gt 5 ]]; then
2340
+ boosted_restarts=5
2341
+ fi
2342
+ extra_args+=("--max-restarts" "$boosted_restarts")
2343
+ daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2344
+ fi
1981
2345
 
1982
- Pipeline failed retrying with escalated strategy.
2346
+ # API errors get extended backoff
2347
+ local api_backoff=300
2348
+ local backoff_secs=$((30 * retry_count))
2349
+ if [[ "$failure_class" == "api_error" ]]; then
2350
+ backoff_secs=$((api_backoff * retry_count))
2351
+ daemon_log INFO "API error — extended backoff ${backoff_secs}s"
2352
+ fi
2353
+
2354
+ if [[ "$NO_GITHUB" != "true" ]]; then
2355
+ gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2356
+
2357
+ Pipeline failed (${failure_class}) — retrying with escalated strategy.
1983
2358
 
1984
2359
  | Field | Value |
1985
2360
  |-------|-------|
1986
2361
  | Retry | ${retry_count} / ${MAX_RETRIES:-2} |
2362
+ | Failure | \`${failure_class}\` |
1987
2363
  | Template | \`${retry_template}\` |
1988
2364
  | Model | \`${retry_model}\` |
1989
2365
  | Started | $(now_iso) |
1990
2366
 
1991
2367
  _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
1992
- fi
2368
+ fi
1993
2369
 
1994
- # Backoff before retry: 30s * retry_count (30s, 60s, ...)
1995
- local backoff_secs=$((30 * retry_count))
1996
- daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
1997
- sleep "$backoff_secs"
1998
-
1999
- # Re-spawn with escalated strategy
2000
- local orig_template="$PIPELINE_TEMPLATE"
2001
- local orig_model="$MODEL"
2002
- PIPELINE_TEMPLATE="$retry_template"
2003
- MODEL="$retry_model"
2004
- daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
2005
- _retry_spawned_for="$issue_num"
2006
- PIPELINE_TEMPLATE="$orig_template"
2007
- MODEL="$orig_model"
2008
- return
2009
- fi
2370
+ daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2371
+ sleep "$backoff_secs"
2372
+
2373
+ # Merge checkpoint args + extra args for passthrough
2374
+ local all_extra_args=()
2375
+ if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
2376
+ all_extra_args+=("${checkpoint_args[@]}")
2377
+ fi
2378
+ if [[ ${#extra_args[@]} -gt 0 ]]; then
2379
+ all_extra_args+=("${extra_args[@]}")
2380
+ fi
2381
+
2382
+ # Re-spawn with escalated strategy
2383
+ local orig_template="$PIPELINE_TEMPLATE"
2384
+ local orig_model="$MODEL"
2385
+ PIPELINE_TEMPLATE="$retry_template"
2386
+ MODEL="$retry_model"
2387
+ daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
2388
+ _retry_spawned_for="$issue_num"
2389
+ PIPELINE_TEMPLATE="$orig_template"
2390
+ MODEL="$orig_model"
2391
+ return
2392
+ fi
2010
2393
 
2011
- daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2012
- emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2394
+ daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2395
+ emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2396
+ ;;
2397
+ esac
2013
2398
  fi
2014
2399
 
2015
2400
  # ── No retry — report final failure ──
2016
2401
  if [[ "$NO_GITHUB" != "true" ]]; then
2017
- # Add failure label
2402
+ # Add failure label and remove watch label (prevent re-processing)
2018
2403
  gh issue edit "$issue_num" \
2019
- --add-label "$ON_FAILURE_ADD_LABEL" 2>/dev/null || true
2404
+ --add-label "$ON_FAILURE_ADD_LABEL" \
2405
+ --remove-label "$WATCH_LABEL" 2>/dev/null || true
2406
+
2407
+ # Close any draft PR created for this issue (cleanup abandoned work)
2408
+ local draft_pr
2409
+ draft_pr=$(gh pr list --head "daemon/issue-${issue_num}" --head "pipeline/pipeline-issue-${issue_num}" \
2410
+ --json number,isDraft --jq '.[] | select(.isDraft == true) | .number' 2>/dev/null | head -1 || true)
2411
+ if [[ -n "$draft_pr" ]]; then
2412
+ gh pr close "$draft_pr" --delete-branch 2>/dev/null || true
2413
+ daemon_log INFO "Closed draft PR #${draft_pr} for failed issue #${issue_num}"
2414
+ fi
2020
2415
 
2021
2416
  # Comment with log tail
2022
2417
  local log_tail=""
@@ -2075,7 +2470,7 @@ triage_score_issue() {
2075
2470
 
2076
2471
  # ── Intelligence-powered triage (if enabled) ──
2077
2472
  if [[ "${INTELLIGENCE_ENABLED:-false}" == "true" ]] && type intelligence_analyze_issue &>/dev/null 2>&1; then
2078
- daemon_log INFO "Intelligence: using AI triage (intelligence enabled)"
2473
+ daemon_log INFO "Intelligence: using AI triage (intelligence enabled)" >&2
2079
2474
  local analysis
2080
2475
  analysis=$(intelligence_analyze_issue "$issue_json" 2>/dev/null || echo "")
2081
2476
  if [[ -n "$analysis" && "$analysis" != "{}" && "$analysis" != "null" ]]; then
@@ -2114,9 +2509,9 @@ triage_score_issue() {
2114
2509
  return
2115
2510
  fi
2116
2511
  # Fall through to heuristic scoring if intelligence call failed
2117
- daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring"
2512
+ daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring" >&2
2118
2513
  else
2119
- daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)"
2514
+ daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)" >&2
2120
2515
  fi
2121
2516
  labels_csv=$(echo "$issue_json" | jq -r '[.labels[].name] | join(",")')
2122
2517
  created_at=$(echo "$issue_json" | jq -r '.createdAt // ""')
@@ -2256,6 +2651,7 @@ triage_score_issue() {
2256
2651
  select_pipeline_template() {
2257
2652
  local labels="$1"
2258
2653
  local score="${2:-50}"
2654
+ local _selected_template=""
2259
2655
 
2260
2656
  # When auto_template is disabled, use default pipeline template
2261
2657
  if [[ "${AUTO_TEMPLATE:-false}" != "true" ]]; then
@@ -2265,7 +2661,7 @@ select_pipeline_template() {
2265
2661
 
2266
2662
  # ── Intelligence-composed pipeline (if enabled) ──
2267
2663
  if [[ "${COMPOSER_ENABLED:-false}" == "true" ]] && type composer_create_pipeline &>/dev/null 2>&1; then
2268
- daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)"
2664
+ daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)" >&2
2269
2665
  local analysis="${INTELLIGENCE_ANALYSIS:-{}}"
2270
2666
  local repo_context=""
2271
2667
  if [[ -f "${REPO_DIR:-}/.claude/pipeline-state.md" ]]; then
@@ -2287,9 +2683,69 @@ select_pipeline_template() {
2287
2683
  return
2288
2684
  fi
2289
2685
  # Fall through to static selection if composition failed
2290
- daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection"
2686
+ daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection" >&2
2291
2687
  else
2292
- daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)"
2688
+ daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)" >&2
2689
+ fi
2690
+
2691
+ # ── DORA-driven template escalation ──
2692
+ if [[ -f "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" ]]; then
2693
+ local _dora_events _dora_total _dora_failures _dora_cfr
2694
+ _dora_events=$(tail -500 "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" \
2695
+ | grep '"type":"pipeline.completed"' 2>/dev/null \
2696
+ | tail -5 || true)
2697
+ _dora_total=$(echo "$_dora_events" | grep -c '.' 2>/dev/null || echo "0")
2698
+ _dora_total="${_dora_total:-0}"
2699
+ if [[ "$_dora_total" -ge 3 ]]; then
2700
+ _dora_failures=$(echo "$_dora_events" | grep -c '"result":"failure"' 2>/dev/null || true)
2701
+ _dora_failures="${_dora_failures:-0}"
2702
+ _dora_cfr=$(( _dora_failures * 100 / _dora_total ))
2703
+ if [[ "$_dora_cfr" -gt 40 ]]; then
2704
+ daemon_log INFO "DORA escalation: CFR ${_dora_cfr}% > 40% — forcing enterprise template" >&2
2705
+ emit_event "daemon.dora_escalation" \
2706
+ "cfr=$_dora_cfr" \
2707
+ "total=$_dora_total" \
2708
+ "failures=$_dora_failures" \
2709
+ "template=enterprise"
2710
+ echo "enterprise"
2711
+ return
2712
+ fi
2713
+ if [[ "$_dora_cfr" -lt 10 && "$score" -ge 60 ]]; then
2714
+ daemon_log INFO "DORA: CFR ${_dora_cfr}% < 10% — fast template eligible" >&2
2715
+ # Fall through to allow other factors to also vote for fast
2716
+ fi
2717
+
2718
+ # ── DORA multi-factor ──
2719
+ # Cycle time: if median > 120min, prefer faster templates
2720
+ local _dora_cycle_time=0
2721
+ _dora_cycle_time=$(echo "$_dora_events" | jq -r 'select(.duration_s) | .duration_s' 2>/dev/null \
2722
+ | sort -n | awk '{ a[NR]=$1 } END { if (NR>0) print int(a[int(NR/2)+1]/60); else print 0 }' 2>/dev/null) || _dora_cycle_time=0
2723
+ _dora_cycle_time="${_dora_cycle_time:-0}"
2724
+ if [[ "${_dora_cycle_time:-0}" -gt 120 ]]; then
2725
+ daemon_log INFO "DORA: cycle time ${_dora_cycle_time}min > 120 — preferring fast template" >&2
2726
+ if [[ "${score:-0}" -ge 60 ]]; then
2727
+ echo "fast"
2728
+ return
2729
+ fi
2730
+ fi
2731
+
2732
+ # Deploy frequency: if < 1/week, use cost-aware
2733
+ local _dora_deploy_freq=0
2734
+ local _dora_first_epoch _dora_last_epoch _dora_span_days
2735
+ _dora_first_epoch=$(echo "$_dora_events" | head -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
2736
+ _dora_last_epoch=$(echo "$_dora_events" | tail -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
2737
+ if [[ "${_dora_first_epoch:-0}" -gt 0 && "${_dora_last_epoch:-0}" -gt 0 ]]; then
2738
+ _dora_span_days=$(( (_dora_last_epoch - _dora_first_epoch) / 86400 ))
2739
+ if [[ "${_dora_span_days:-0}" -gt 0 ]]; then
2740
+ _dora_deploy_freq=$(awk -v t="$_dora_total" -v d="$_dora_span_days" 'BEGIN { printf "%.1f", t * 7 / d }' 2>/dev/null) || _dora_deploy_freq=0
2741
+ fi
2742
+ fi
2743
+ if [[ -n "${_dora_deploy_freq:-}" ]] && awk -v f="${_dora_deploy_freq:-0}" 'BEGIN{exit !(f > 0 && f < 1)}' 2>/dev/null; then
2744
+ daemon_log INFO "DORA: deploy freq ${_dora_deploy_freq}/week — using cost-aware" >&2
2745
+ echo "cost-aware"
2746
+ return
2747
+ fi
2748
+ fi
2293
2749
  fi
2294
2750
 
2295
2751
  # ── Branch protection escalation (highest priority) ──
@@ -2306,7 +2762,7 @@ select_pipeline_template() {
2306
2762
  local required_reviews
2307
2763
  required_reviews=$(echo "$protection" | jq -r '.required_pull_request_reviews.required_approving_review_count // 0' 2>/dev/null || echo "0")
2308
2764
  if [[ "$strict_protection" == "true" ]] || [[ "${required_reviews:-0}" -gt 1 ]]; then
2309
- daemon_log INFO "Branch has strict protection — escalating to enterprise template"
2765
+ daemon_log INFO "Branch has strict protection — escalating to enterprise template" >&2
2310
2766
  echo "enterprise"
2311
2767
  return
2312
2768
  fi
@@ -2340,6 +2796,62 @@ select_pipeline_template() {
2340
2796
  fi
2341
2797
  fi
2342
2798
 
2799
+ # ── Quality memory-driven selection ──
2800
+ local quality_scores_file="${HOME}/.shipwright/optimization/quality-scores.jsonl"
2801
+ if [[ -f "$quality_scores_file" ]]; then
2802
+ local repo_hash
2803
+ repo_hash=$(cd "${REPO_DIR:-.}" && git rev-parse --show-toplevel 2>/dev/null | shasum -a 256 | cut -c1-16 || echo "unknown")
2804
+ # Get last 5 quality scores for this repo
2805
+ local recent_scores avg_quality has_critical
2806
+ recent_scores=$(grep "\"repo\":\"$repo_hash\"" "$quality_scores_file" 2>/dev/null | tail -5 || true)
2807
+ if [[ -n "$recent_scores" ]]; then
2808
+ avg_quality=$(echo "$recent_scores" | jq -r '.quality_score // 70' 2>/dev/null | awk '{ sum += $1; count++ } END { if (count > 0) printf "%.0f", sum/count; else print 70 }')
2809
+ has_critical=$(echo "$recent_scores" | jq -r '.findings.critical // 0' 2>/dev/null | awk '{ sum += $1 } END { print (sum > 0) ? "yes" : "no" }')
2810
+
2811
+ # Critical findings in recent history → force enterprise
2812
+ if [[ "$has_critical" == "yes" ]]; then
2813
+ daemon_log INFO "Quality memory: critical findings in recent runs — using enterprise template" >&2
2814
+ echo "enterprise"
2815
+ return
2816
+ fi
2817
+
2818
+ # Poor quality history → use full template
2819
+ if [[ "${avg_quality:-70}" -lt 60 ]]; then
2820
+ daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — using full template" >&2
2821
+ echo "full"
2822
+ return
2823
+ fi
2824
+
2825
+ # Excellent quality history → allow faster template
2826
+ if [[ "${avg_quality:-70}" -gt 80 ]]; then
2827
+ daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — eligible for fast template" >&2
2828
+ # Only upgrade if score also suggests fast
2829
+ if [[ "$score" -ge 60 ]]; then
2830
+ echo "fast"
2831
+ return
2832
+ fi
2833
+ fi
2834
+ fi
2835
+ fi
2836
+
2837
+ # ── Learned template weights ──
2838
+ local _tw_file="${HOME}/.shipwright/optimization/template-weights.json"
2839
+ if [[ -f "$_tw_file" ]]; then
2840
+ local _best_template _best_rate
2841
+ _best_template=$(jq -r '
2842
+ .weights // {} | to_entries
2843
+ | map(select(.value.sample_size >= 3))
2844
+ | sort_by(-.value.success_rate)
2845
+ | .[0].key // ""
2846
+ ' "$_tw_file" 2>/dev/null) || true
2847
+ if [[ -n "${_best_template:-}" && "${_best_template:-}" != "null" && "${_best_template:-}" != "" ]]; then
2848
+ _best_rate=$(jq -r --arg t "$_best_template" '.weights[$t].success_rate // 0' "$_tw_file" 2>/dev/null) || _best_rate=0
2849
+ daemon_log INFO "Template weights: ${_best_template} (${_best_rate} success rate)" >&2
2850
+ echo "$_best_template"
2851
+ return
2852
+ fi
2853
+ fi
2854
+
2343
2855
  # ── Score-based selection ──
2344
2856
  if [[ "$score" -ge 70 ]]; then
2345
2857
  echo "fast"
@@ -2388,8 +2900,12 @@ daemon_triage_show() {
2388
2900
  num=$(echo "$issue" | jq -r '.number')
2389
2901
  title=$(echo "$issue" | jq -r '.title // "—"')
2390
2902
  labels_csv=$(echo "$issue" | jq -r '[.labels[].name] | join(", ")')
2391
- score=$(triage_score_issue "$issue")
2392
- template=$(select_pipeline_template "$labels_csv" "$score")
2903
+ score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
2904
+ score=$(printf '%s' "$score" | tr -cd '[:digit:]')
2905
+ [[ -z "$score" ]] && score=50
2906
+ template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
2907
+ template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
2908
+ [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
2393
2909
 
2394
2910
  scored_lines+=("${score}|${num}|${title}|${labels_csv}|${template}")
2395
2911
  done < <(echo "$issues_json" | jq -c '.[]')
@@ -3221,11 +3737,12 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
3221
3737
  if [[ ! -f "$scripts_dir/sw-${name}-test.sh" ]]; then
3222
3738
  # Count usage across other scripts
3223
3739
  local usage_count
3224
- usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" || true)
3740
+ usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" 2>/dev/null || echo "0")
3225
3741
  usage_count=${usage_count:-0}
3226
3742
 
3227
3743
  local line_count
3228
- line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ')
3744
+ line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ' || echo "0")
3745
+ line_count=${line_count:-0}
3229
3746
 
3230
3747
  untested_entries="${untested_entries}${usage_count}|${basename}|${line_count}\n"
3231
3748
  findings=$((findings + 1))
@@ -3484,6 +4001,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
3484
4001
  patrol_meta_run
3485
4002
  fi
3486
4003
 
4004
+ # ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
4005
+ if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
4006
+ # shellcheck source=sw-strategic.sh
4007
+ source "$SCRIPT_DIR/sw-strategic.sh"
4008
+ strategic_patrol_run || true
4009
+ fi
4010
+
3487
4011
  # ── Summary ──
3488
4012
  emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
3489
4013
 
@@ -3602,7 +4126,9 @@ daemon_poll_issues() {
3602
4126
  while IFS= read -r issue; do
3603
4127
  local num score
3604
4128
  num=$(echo "$issue" | jq -r '.number')
3605
- score=$(triage_score_issue "$issue")
4129
+ score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
4130
+ score=$(printf '%s' "$score" | tr -cd '[:digit:]')
4131
+ [[ -z "$score" ]] && score=50
3606
4132
  # For org mode, include repo name in the scored entry
3607
4133
  local repo_name=""
3608
4134
  if [[ "$WATCH_MODE" == "org" ]]; then
@@ -3629,10 +4155,10 @@ daemon_poll_issues() {
3629
4155
  local sorted_order
3630
4156
  if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
3631
4157
  # Complex-first: lower score (more complex) first
3632
- sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -n)
4158
+ sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -n -k2,2 -n)
3633
4159
  else
3634
- # Quick-wins-first (default): higher score (simpler) first
3635
- sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -rn)
4160
+ # Quick-wins-first (default): higher score (simpler) first, lowest issue# first on ties
4161
+ sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -rn -k2,2 -n)
3636
4162
  fi
3637
4163
 
3638
4164
  # Dependency-aware reordering: move dependencies before dependents
@@ -3727,7 +4253,9 @@ daemon_poll_issues() {
3727
4253
  emit_event "daemon.priority_lane" "issue=$issue_num" "score=$score"
3728
4254
 
3729
4255
  local template
3730
- template=$(select_pipeline_template "$labels_csv" "$score")
4256
+ template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
4257
+ template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
4258
+ [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
3731
4259
  daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template} [PRIORITY]"
3732
4260
 
3733
4261
  local orig_template="$PIPELINE_TEMPLATE"
@@ -3748,7 +4276,9 @@ daemon_poll_issues() {
3748
4276
 
3749
4277
  # Auto-select pipeline template based on labels + triage score
3750
4278
  local template
3751
- template=$(select_pipeline_template "$labels_csv" "$score")
4279
+ template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
4280
+ template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
4281
+ [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
3752
4282
  daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"
3753
4283
 
3754
4284
  # Spawn pipeline (template selection applied via PIPELINE_TEMPLATE override)
@@ -3756,8 +4286,41 @@ daemon_poll_issues() {
3756
4286
  PIPELINE_TEMPLATE="$template"
3757
4287
  daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
3758
4288
  PIPELINE_TEMPLATE="$orig_template"
4289
+
4290
+ # Stagger delay between spawns to avoid API contention
4291
+ local stagger_delay="${SPAWN_STAGGER_SECONDS:-15}"
4292
+ if [[ "$stagger_delay" -gt 0 ]]; then
4293
+ sleep "$stagger_delay"
4294
+ fi
3759
4295
  done <<< "$sorted_order"
3760
4296
 
4297
+ # ── Drain queue if we have capacity (prevents deadlock when queue is
4298
+ # populated but no active jobs exist to trigger dequeue) ──
4299
+ local drain_active
4300
+ drain_active=$(locked_get_active_count)
4301
+ while [[ "$drain_active" -lt "$MAX_PARALLEL" ]]; do
4302
+ local drain_issue
4303
+ drain_issue=$(dequeue_next)
4304
+ [[ -z "$drain_issue" ]] && break
4305
+ local drain_title
4306
+ drain_title=$(jq -r --arg n "$drain_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)
4307
+
4308
+ local drain_labels drain_score drain_template
4309
+ drain_labels=$(echo "$issues_json" | jq -r --argjson n "$drain_issue" \
4310
+ '.[] | select(.number == $n) | [.labels[].name] | join(",")' 2>/dev/null || echo "")
4311
+ drain_score=$(echo "$sorted_order" | grep "|${drain_issue}|" | cut -d'|' -f1 || echo "50")
4312
+ drain_template=$(select_pipeline_template "$drain_labels" "${drain_score:-50}" 2>/dev/null | tail -1)
4313
+ drain_template=$(printf '%s' "$drain_template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
4314
+ [[ -z "$drain_template" ]] && drain_template="$PIPELINE_TEMPLATE"
4315
+
4316
+ daemon_log INFO "Draining queue: issue #${drain_issue}, template=${drain_template}"
4317
+ local orig_template="$PIPELINE_TEMPLATE"
4318
+ PIPELINE_TEMPLATE="$drain_template"
4319
+ daemon_spawn_pipeline "$drain_issue" "$drain_title"
4320
+ PIPELINE_TEMPLATE="$orig_template"
4321
+ drain_active=$(locked_get_active_count)
4322
+ done
4323
+
3761
4324
  # Update last poll
3762
4325
  update_state_field "last_poll" "$(now_iso)"
3763
4326
  }
@@ -3770,13 +4333,15 @@ daemon_health_check() {
3770
4333
  now_e=$(now_epoch)
3771
4334
 
3772
4335
  if [[ -f "$STATE_FILE" ]]; then
3773
- # ── Progress-Based Health Monitoring ──
3774
- # Instead of killing after a static timeout, check for forward progress.
3775
- # Only kill when the agent is truly stuck (no stage change, no new code,
3776
- # same error repeating). A hard wall-clock limit remains as absolute safety net.
4336
+ # ── Intelligent Health Monitoring ──
4337
+ # Instead of killing after a countdown, sense what the agent is doing.
4338
+ # Agents think for long stretches that's normal and expected.
4339
+ # Strategy: sense understand be patient nudge → only kill as last resort.
3777
4340
 
3778
- local hard_limit="${PROGRESS_HARD_LIMIT_S:-10800}"
4341
+ local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
3779
4342
  local use_progress="${PROGRESS_MONITORING:-true}"
4343
+ local nudge_enabled="${NUDGE_ENABLED:-true}"
4344
+ local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
3780
4345
 
3781
4346
  while IFS= read -r job; do
3782
4347
  local pid started_at issue_num worktree
@@ -3797,8 +4362,8 @@ daemon_health_check() {
3797
4362
  elapsed=$(( now_e - start_e ))
3798
4363
  fi
3799
4364
 
3800
- # Hard wall-clock limit — absolute safety net (default 3h)
3801
- if [[ "$elapsed" -gt "$hard_limit" ]]; then
4365
+ # Hard wall-clock limit — disabled by default (0 = off)
4366
+ if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
3802
4367
  daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
3803
4368
  emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
3804
4369
  kill "$pid" 2>/dev/null || true
@@ -3807,7 +4372,7 @@ daemon_health_check() {
3807
4372
  continue
3808
4373
  fi
3809
4374
 
3810
- # Progress-based detection (when enabled)
4375
+ # ── Intelligent Progress Sensing ──
3811
4376
  if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
3812
4377
  local snapshot verdict
3813
4378
  snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
@@ -3815,29 +4380,87 @@ daemon_health_check() {
3815
4380
  if [[ "$snapshot" != "{}" ]]; then
3816
4381
  verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
3817
4382
 
4383
+ local no_progress_count=0
4384
+ no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4385
+ local cur_stage
4386
+ cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4387
+
3818
4388
  case "$verdict" in
3819
4389
  healthy)
3820
4390
  # All good — agent is making progress
3821
4391
  ;;
3822
4392
  slowing)
3823
- daemon_log INFO "Issue #${issue_num} slowing (no progress for 1-2 checks, ${elapsed}s elapsed)"
4393
+ daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
3824
4394
  ;;
3825
4395
  stalled)
3826
- local no_progress_count
3827
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
3828
- daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks (${elapsed}s elapsed, PID $pid)"
3829
- emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4396
+ # Check if agent subprocess is alive and consuming CPU
4397
+ local agent_alive=false
4398
+ local child_cpu=0
4399
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4400
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4401
+ agent_alive=true
4402
+ fi
4403
+
4404
+ if [[ "$agent_alive" == "true" ]]; then
4405
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
4406
+ else
4407
+ daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
4408
+ emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4409
+ fi
3830
4410
  ;;
3831
4411
  stuck)
3832
- local no_progress_count repeated_errors cur_stage
3833
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4412
+ local repeated_errors
3834
4413
  repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
3835
- cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
3836
- daemon_log WARN "Issue #${issue_num} STUCK: no progress for ${no_progress_count} checks, ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
3837
- emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid"
3838
- kill "$pid" 2>/dev/null || true
3839
- daemon_clear_progress "$issue_num"
3840
- findings=$((findings + 1))
4414
+
4415
+ # Even "stuck" check if the process tree is alive first
4416
+ local agent_alive=false
4417
+ local child_cpu=0
4418
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4419
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4420
+ agent_alive=true
4421
+ fi
4422
+
4423
+ if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
4424
+ # Agent is alive — nudge instead of kill
4425
+ if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
4426
+ local nudge_file="${worktree}/.claude/nudge.md"
4427
+ if [[ ! -f "$nudge_file" ]]; then
4428
+ cat > "$nudge_file" <<NUDGE_EOF
4429
+ # Nudge from Daemon Health Monitor
4430
+
4431
+ The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
4432
+ Current stage: ${cur_stage}
4433
+
4434
+ If you're stuck, consider:
4435
+ - Breaking the task into smaller steps
4436
+ - Committing partial progress
4437
+ - Running tests to validate current state
4438
+
4439
+ This is just a gentle check-in — take your time if you're working through a complex problem.
4440
+ NUDGE_EOF
4441
+ daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
4442
+ emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
4443
+ fi
4444
+ else
4445
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
4446
+ fi
4447
+ elif [[ "$repeated_errors" -ge 5 ]]; then
4448
+ # Truly stuck in an error loop — kill as last resort
4449
+ daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4450
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
4451
+ kill "$pid" 2>/dev/null || true
4452
+ daemon_clear_progress "$issue_num"
4453
+ findings=$((findings + 1))
4454
+ elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
4455
+ # Process tree is dead AND no progress for very long time
4456
+ daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
4457
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
4458
+ kill "$pid" 2>/dev/null || true
4459
+ daemon_clear_progress "$issue_num"
4460
+ findings=$((findings + 1))
4461
+ else
4462
+ daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
4463
+ fi
3841
4464
  ;;
3842
4465
  esac
3843
4466
  fi
@@ -3846,8 +4469,9 @@ daemon_health_check() {
3846
4469
  local stale_timeout
3847
4470
  stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
3848
4471
  if [[ "$elapsed" -gt "$stale_timeout" ]]; then
3849
- daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — killing"
3850
- kill "$pid" 2>/dev/null || true
4472
+ daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
4473
+ # Don't kill — just log. Let the process run.
4474
+ emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
3851
4475
  findings=$((findings + 1))
3852
4476
  fi
3853
4477
  fi
@@ -3908,8 +4532,11 @@ daemon_check_degradation() {
3908
4532
  local failures successes
3909
4533
  failures=$(echo "$recent" | jq '[.[] | select(.result == "failure")] | length')
3910
4534
  successes=$(echo "$recent" | jq '[.[] | select(.result == "success")] | length')
3911
- local cfr_pct=$(( failures * 100 / count ))
3912
- local success_pct=$(( successes * 100 / count ))
4535
+ local cfr_pct=0 success_pct=0
4536
+ if [[ "${count:-0}" -gt 0 ]]; then
4537
+ cfr_pct=$(( failures * 100 / count ))
4538
+ success_pct=$(( successes * 100 / count ))
4539
+ fi
3913
4540
 
3914
4541
  local alerts=""
3915
4542
  if [[ "$cfr_pct" -gt "$cfr_threshold" ]]; then
@@ -4039,11 +4666,43 @@ daemon_auto_scale() {
4039
4666
  local max_by_queue=$(( queue_depth + active_count ))
4040
4667
  [[ "$max_by_queue" -lt 1 ]] && max_by_queue=1
4041
4668
 
4669
+ # ── Vitals-driven scaling factor ──
4670
+ local max_by_vitals="$MAX_WORKERS"
4671
+ if type pipeline_compute_vitals &>/dev/null 2>&1 && [[ -f "$STATE_FILE" ]]; then
4672
+ local _total_health=0 _health_count=0
4673
+ while IFS= read -r _job; do
4674
+ local _job_issue _job_worktree
4675
+ _job_issue=$(echo "$_job" | jq -r '.issue // 0')
4676
+ _job_worktree=$(echo "$_job" | jq -r '.worktree // ""')
4677
+ if [[ -n "$_job_worktree" && -d "$_job_worktree/.claude" ]]; then
4678
+ local _job_vitals _job_health
4679
+ _job_vitals=$(pipeline_compute_vitals "$_job_worktree/.claude/pipeline-state.md" "$_job_worktree/.claude/pipeline-artifacts" "$_job_issue" 2>/dev/null) || true
4680
+ if [[ -n "$_job_vitals" && "$_job_vitals" != "{}" ]]; then
4681
+ _job_health=$(echo "$_job_vitals" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
4682
+ _total_health=$((_total_health + _job_health))
4683
+ _health_count=$((_health_count + 1))
4684
+ fi
4685
+ fi
4686
+ done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
4687
+
4688
+ if [[ "$_health_count" -gt 0 ]]; then
4689
+ local _avg_health=$((_total_health / _health_count))
4690
+ if [[ "$_avg_health" -lt 50 ]]; then
4691
+ # Pipelines struggling — reduce workers to give each more resources
4692
+ max_by_vitals=$(( MAX_WORKERS * _avg_health / 100 ))
4693
+ [[ "$max_by_vitals" -lt "$MIN_WORKERS" ]] && max_by_vitals="$MIN_WORKERS"
4694
+ daemon_log INFO "Auto-scale: vitals avg health ${_avg_health}% — capping at ${max_by_vitals} workers"
4695
+ fi
4696
+ # avg_health > 70: no reduction (full capacity available)
4697
+ fi
4698
+ fi
4699
+
4042
4700
  # ── Compute final value ──
4043
4701
  local computed="$max_by_cpu"
4044
4702
  [[ "$max_by_mem" -lt "$computed" ]] && computed="$max_by_mem"
4045
4703
  [[ "$max_by_budget" -lt "$computed" ]] && computed="$max_by_budget"
4046
4704
  [[ "$max_by_queue" -lt "$computed" ]] && computed="$max_by_queue"
4705
+ [[ "$max_by_vitals" -lt "$computed" ]] && computed="$max_by_vitals"
4047
4706
  [[ "$MAX_WORKERS" -lt "$computed" ]] && computed="$MAX_WORKERS"
4048
4707
 
4049
4708
  # Respect fleet-assigned ceiling if set
@@ -4302,7 +4961,19 @@ daemon_cleanup_stale() {
4302
4961
  done < <(git worktree list --porcelain 2>/dev/null | grep '^worktree ' | sed 's/^worktree //')
4303
4962
  fi
4304
4963
 
4305
- # ── 2. Clean old pipeline artifacts ──
4964
+ # ── 2. Expire old checkpoints ──
4965
+ if [[ -x "$SCRIPT_DIR/sw-checkpoint.sh" ]]; then
4966
+ local expired_output
4967
+ expired_output=$(bash "$SCRIPT_DIR/sw-checkpoint.sh" expire --hours "$((age_days * 24))" 2>/dev/null || true)
4968
+ if [[ -n "$expired_output" ]] && echo "$expired_output" | grep -q "Expired"; then
4969
+ local expired_count
4970
+ expired_count=$(echo "$expired_output" | grep -c "Expired" || true)
4971
+ cleaned=$((cleaned + ${expired_count:-0}))
4972
+ daemon_log INFO "Expired ${expired_count:-0} old checkpoint(s)"
4973
+ fi
4974
+ fi
4975
+
4976
+ # ── 3. Clean old pipeline artifacts (subdirectories only) ──
4306
4977
  local artifacts_dir=".claude/pipeline-artifacts"
4307
4978
  if [[ -d "$artifacts_dir" ]]; then
4308
4979
  while IFS= read -r artifact_dir; do
@@ -4393,6 +5064,7 @@ daemon_poll_loop() {
4393
5064
  # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
4394
5065
  # The || operator disables set -e for the entire call chain, so transient failures
4395
5066
  # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
5067
+ daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
4396
5068
  daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
4397
5069
  daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
4398
5070
  daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
@@ -4476,7 +5148,8 @@ cleanup_on_exit() {
4476
5148
  while IFS= read -r cpid; do
4477
5149
  [[ -z "$cpid" ]] && continue
4478
5150
  if kill -0 "$cpid" 2>/dev/null; then
4479
- daemon_log INFO "Killing pipeline process PID ${cpid}"
5151
+ daemon_log INFO "Killing pipeline process tree PID ${cpid}"
5152
+ pkill -TERM -P "$cpid" 2>/dev/null || true
4480
5153
  kill "$cpid" 2>/dev/null || true
4481
5154
  killed=$((killed + 1))
4482
5155
  fi
@@ -4488,7 +5161,8 @@ cleanup_on_exit() {
4488
5161
  while IFS= read -r cpid; do
4489
5162
  [[ -z "$cpid" ]] && continue
4490
5163
  if kill -0 "$cpid" 2>/dev/null; then
4491
- daemon_log WARN "Force-killing pipeline PID ${cpid}"
5164
+ daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
5165
+ pkill -9 -P "$cpid" 2>/dev/null || true
4492
5166
  kill -9 "$cpid" 2>/dev/null || true
4493
5167
  fi
4494
5168
  done <<< "$child_pids"