shipwright-cli 1.10.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/README.md +114 -36
  2. package/completions/_shipwright +212 -32
  3. package/completions/shipwright.bash +97 -25
  4. package/docs/strategy/01-market-research.md +619 -0
  5. package/docs/strategy/02-mission-and-brand.md +587 -0
  6. package/docs/strategy/03-gtm-and-roadmap.md +759 -0
  7. package/docs/strategy/QUICK-START.txt +289 -0
  8. package/docs/strategy/README.md +172 -0
  9. package/package.json +4 -2
  10. package/scripts/sw +208 -1
  11. package/scripts/sw-activity.sh +500 -0
  12. package/scripts/sw-adaptive.sh +925 -0
  13. package/scripts/sw-adversarial.sh +1 -1
  14. package/scripts/sw-architecture-enforcer.sh +1 -1
  15. package/scripts/sw-auth.sh +613 -0
  16. package/scripts/sw-autonomous.sh +664 -0
  17. package/scripts/sw-changelog.sh +704 -0
  18. package/scripts/sw-checkpoint.sh +1 -1
  19. package/scripts/sw-ci.sh +602 -0
  20. package/scripts/sw-cleanup.sh +1 -1
  21. package/scripts/sw-code-review.sh +637 -0
  22. package/scripts/sw-connect.sh +1 -1
  23. package/scripts/sw-context.sh +605 -0
  24. package/scripts/sw-cost.sh +1 -1
  25. package/scripts/sw-daemon.sh +432 -130
  26. package/scripts/sw-dashboard.sh +1 -1
  27. package/scripts/sw-db.sh +540 -0
  28. package/scripts/sw-decompose.sh +539 -0
  29. package/scripts/sw-deps.sh +551 -0
  30. package/scripts/sw-developer-simulation.sh +1 -1
  31. package/scripts/sw-discovery.sh +412 -0
  32. package/scripts/sw-docs-agent.sh +539 -0
  33. package/scripts/sw-docs.sh +1 -1
  34. package/scripts/sw-doctor.sh +59 -1
  35. package/scripts/sw-dora.sh +615 -0
  36. package/scripts/sw-durable.sh +710 -0
  37. package/scripts/sw-e2e-orchestrator.sh +535 -0
  38. package/scripts/sw-eventbus.sh +393 -0
  39. package/scripts/sw-feedback.sh +471 -0
  40. package/scripts/sw-fix.sh +1 -1
  41. package/scripts/sw-fleet-discover.sh +567 -0
  42. package/scripts/sw-fleet-viz.sh +404 -0
  43. package/scripts/sw-fleet.sh +8 -1
  44. package/scripts/sw-github-app.sh +596 -0
  45. package/scripts/sw-github-checks.sh +1 -1
  46. package/scripts/sw-github-deploy.sh +1 -1
  47. package/scripts/sw-github-graphql.sh +1 -1
  48. package/scripts/sw-guild.sh +569 -0
  49. package/scripts/sw-heartbeat.sh +1 -1
  50. package/scripts/sw-hygiene.sh +559 -0
  51. package/scripts/sw-incident.sh +617 -0
  52. package/scripts/sw-init.sh +88 -1
  53. package/scripts/sw-instrument.sh +699 -0
  54. package/scripts/sw-intelligence.sh +1 -1
  55. package/scripts/sw-jira.sh +1 -1
  56. package/scripts/sw-launchd.sh +363 -28
  57. package/scripts/sw-linear.sh +1 -1
  58. package/scripts/sw-logs.sh +1 -1
  59. package/scripts/sw-loop.sh +64 -3
  60. package/scripts/sw-memory.sh +1 -1
  61. package/scripts/sw-mission-control.sh +487 -0
  62. package/scripts/sw-model-router.sh +545 -0
  63. package/scripts/sw-otel.sh +596 -0
  64. package/scripts/sw-oversight.sh +689 -0
  65. package/scripts/sw-pipeline-composer.sh +1 -1
  66. package/scripts/sw-pipeline-vitals.sh +1 -1
  67. package/scripts/sw-pipeline.sh +687 -24
  68. package/scripts/sw-pm.sh +693 -0
  69. package/scripts/sw-pr-lifecycle.sh +522 -0
  70. package/scripts/sw-predictive.sh +1 -1
  71. package/scripts/sw-prep.sh +1 -1
  72. package/scripts/sw-ps.sh +1 -1
  73. package/scripts/sw-public-dashboard.sh +798 -0
  74. package/scripts/sw-quality.sh +595 -0
  75. package/scripts/sw-reaper.sh +1 -1
  76. package/scripts/sw-recruit.sh +573 -0
  77. package/scripts/sw-regression.sh +642 -0
  78. package/scripts/sw-release-manager.sh +736 -0
  79. package/scripts/sw-release.sh +706 -0
  80. package/scripts/sw-remote.sh +1 -1
  81. package/scripts/sw-replay.sh +520 -0
  82. package/scripts/sw-retro.sh +691 -0
  83. package/scripts/sw-scale.sh +444 -0
  84. package/scripts/sw-security-audit.sh +505 -0
  85. package/scripts/sw-self-optimize.sh +1 -1
  86. package/scripts/sw-session.sh +1 -1
  87. package/scripts/sw-setup.sh +1 -1
  88. package/scripts/sw-standup.sh +712 -0
  89. package/scripts/sw-status.sh +1 -1
  90. package/scripts/sw-strategic.sh +658 -0
  91. package/scripts/sw-stream.sh +450 -0
  92. package/scripts/sw-swarm.sh +583 -0
  93. package/scripts/sw-team-stages.sh +511 -0
  94. package/scripts/sw-templates.sh +1 -1
  95. package/scripts/sw-testgen.sh +515 -0
  96. package/scripts/sw-tmux-pipeline.sh +554 -0
  97. package/scripts/sw-tmux.sh +1 -1
  98. package/scripts/sw-trace.sh +485 -0
  99. package/scripts/sw-tracker-github.sh +188 -0
  100. package/scripts/sw-tracker-jira.sh +172 -0
  101. package/scripts/sw-tracker-linear.sh +251 -0
  102. package/scripts/sw-tracker.sh +117 -2
  103. package/scripts/sw-triage.sh +603 -0
  104. package/scripts/sw-upgrade.sh +1 -1
  105. package/scripts/sw-ux.sh +677 -0
  106. package/scripts/sw-webhook.sh +627 -0
  107. package/scripts/sw-widgets.sh +530 -0
  108. package/scripts/sw-worktree.sh +1 -1
@@ -6,7 +6,10 @@
6
6
  set -euo pipefail
7
7
  trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
8
8
 
9
- VERSION="1.10.0"
9
+ # Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
10
+ unset CLAUDECODE 2>/dev/null || true
11
+
12
+ VERSION="2.0.0"
10
13
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
14
  REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
12
15
 
@@ -478,9 +481,11 @@ load_config() {
478
481
 
479
482
  # progress-based health monitoring (replaces static timeouts)
480
483
  PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
481
- PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 3' "$config_file")
482
- PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 6' "$config_file")
483
- PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 10800' "$config_file") # 3hr absolute max
484
+ PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
485
+ PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
486
+ PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
487
+ NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
488
+ NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
484
489
 
485
490
  # team dashboard URL (for coordinated claiming)
486
491
  local cfg_dashboard_url
@@ -836,6 +841,31 @@ daemon_assess_progress() {
836
841
  has_progress=true
837
842
  fi
838
843
 
844
+ # Claude subprocess is alive and consuming CPU — agent is thinking/working
845
+ # During build stage, Claude can spend 10+ minutes thinking before any
846
+ # visible git changes appear. Detect this as progress.
847
+ if [[ "$has_progress" != "true" ]]; then
848
+ local _pid_for_check
849
+ _pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
850
+ if [[ -z "$_pid_for_check" ]]; then
851
+ # Fallback: get PID from active_jobs
852
+ _pid_for_check=$(jq -r --argjson num "$issue_num" \
853
+ '.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
854
+ fi
855
+ if [[ -n "$_pid_for_check" ]]; then
856
+ # Check if any child process (claude) is alive and using CPU
857
+ local child_cpu=0
858
+ child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
859
+ if [[ "$child_cpu" -eq 0 ]]; then
860
+ # Check children of the pipeline process
861
+ child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
862
+ fi
863
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
864
+ has_progress=true
865
+ fi
866
+ fi
867
+ fi
868
+
839
869
  # Detect repeated errors (same error signature hitting again)
840
870
  local repeated_errors="$prev_repeated_errors"
841
871
  if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
@@ -1208,6 +1238,74 @@ gh_record_failure() {
1208
1238
  fi
1209
1239
  }
1210
1240
 
1241
+ # ─── Runtime Auth Check ──────────────────────────────────────────────────────
1242
+
1243
+ LAST_AUTH_CHECK_EPOCH=0
1244
+ AUTH_CHECK_INTERVAL=300 # 5 minutes
1245
+
1246
+ daemon_preflight_auth_check() {
1247
+ local now_e
1248
+ now_e=$(now_epoch)
1249
+ if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
1250
+ return 0
1251
+ fi
1252
+ LAST_AUTH_CHECK_EPOCH="$now_e"
1253
+
1254
+ # gh auth check
1255
+ if [[ "${NO_GITHUB:-false}" != "true" ]]; then
1256
+ if ! gh auth status &>/dev/null 2>&1; then
1257
+ daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
1258
+ local pause_json
1259
+ pause_json=$(jq -n --arg reason "gh_auth_failure" --arg ts "$(now_iso)" \
1260
+ '{reason: $reason, timestamp: $ts}')
1261
+ local _tmp_pause
1262
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1263
+ echo "$pause_json" > "$_tmp_pause"
1264
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1265
+ emit_event "daemon.auto_pause" "reason=gh_auth_failure"
1266
+ return 1
1267
+ fi
1268
+ fi
1269
+
1270
+ # claude auth check with 15s timeout (macOS has no timeout command)
1271
+ local claude_auth_ok=false
1272
+ local _auth_tmp
1273
+ _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
1274
+ ( claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null ) &
1275
+ local _auth_pid=$!
1276
+ local _auth_waited=0
1277
+ while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
1278
+ sleep 1
1279
+ _auth_waited=$((_auth_waited + 1))
1280
+ done
1281
+ if kill -0 "$_auth_pid" 2>/dev/null; then
1282
+ kill "$_auth_pid" 2>/dev/null || true
1283
+ wait "$_auth_pid" 2>/dev/null || true
1284
+ else
1285
+ wait "$_auth_pid" 2>/dev/null || true
1286
+ fi
1287
+
1288
+ if [[ -s "$_auth_tmp" ]]; then
1289
+ claude_auth_ok=true
1290
+ fi
1291
+ rm -f "$_auth_tmp"
1292
+
1293
+ if [[ "$claude_auth_ok" != "true" ]]; then
1294
+ daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
1295
+ local pause_json
1296
+ pause_json=$(jq -n --arg reason "claude_auth_failure" --arg ts "$(now_iso)" \
1297
+ '{reason: $reason, timestamp: $ts}')
1298
+ local _tmp_pause
1299
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1300
+ echo "$pause_json" > "$_tmp_pause"
1301
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1302
+ emit_event "daemon.auto_pause" "reason=claude_auth_failure"
1303
+ return 1
1304
+ fi
1305
+
1306
+ return 0
1307
+ }
1308
+
1211
1309
  # ─── Pre-flight Checks ──────────────────────────────────────────────────────
1212
1310
 
1213
1311
  preflight_checks() {
@@ -1609,9 +1707,24 @@ daemon_spawn_pipeline() {
1609
1707
  local issue_num="$1"
1610
1708
  local issue_title="${2:-}"
1611
1709
  local repo_full_name="${3:-}" # owner/repo (org mode only)
1710
+ shift 3 2>/dev/null || true
1711
+ local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
1612
1712
 
1613
1713
  daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
1614
1714
 
1715
+ # ── Issue decomposition (if decomposer available) ──
1716
+ local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
1717
+ if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
1718
+ local decompose_result=""
1719
+ decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
1720
+ if [[ "$decompose_result" == *"decomposed"* ]]; then
1721
+ daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
1722
+ # Remove the shipwright label so decomposed parent doesn't re-queue
1723
+ gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
1724
+ return 0
1725
+ fi
1726
+ fi
1727
+
1615
1728
  # Extract goal text from issue (title + first line of body)
1616
1729
  local issue_goal="$issue_title"
1617
1730
  if [[ "$NO_GITHUB" != "true" ]]; then
@@ -1727,11 +1840,18 @@ daemon_spawn_pipeline() {
1727
1840
  pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
1728
1841
  fi
1729
1842
 
1843
+ # Append any extra pipeline args (from retry escalation, etc.)
1844
+ if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
1845
+ pipeline_args+=("${extra_pipeline_args[@]}")
1846
+ fi
1847
+
1730
1848
  # Run pipeline in work directory (background)
1849
+ # Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
1731
1850
  echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
1732
1851
  (
1852
+ trap '' HUP
1733
1853
  cd "$work_dir"
1734
- "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1854
+ exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1735
1855
  ) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
1736
1856
  local pid=$!
1737
1857
 
@@ -1904,15 +2024,18 @@ daemon_reap_completed() {
1904
2024
  reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
1905
2025
  release_claim "$issue_num" "$reap_machine_name"
1906
2026
 
1907
- # Skip cleanup if a retry was just spawned for this issue
2027
+ # Always remove the OLD job entry from active_jobs to prevent
2028
+ # re-reaping of the dead PID on the next cycle. When a retry was
2029
+ # spawned, daemon_spawn_pipeline already added a fresh entry with
2030
+ # the new PID — we must not leave the stale one behind.
2031
+ locked_state_update --argjson num "$issue_num" \
2032
+ --argjson old_pid "${pid:-0}" \
2033
+ '.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
2034
+ untrack_priority_job "$issue_num"
2035
+
1908
2036
  if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
1909
2037
  daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
1910
2038
  else
1911
- # Remove from active_jobs and priority lane tracking (locked)
1912
- locked_state_update --argjson num "$issue_num" \
1913
- '.active_jobs = [.active_jobs[] | select(.issue != $num)]'
1914
- untrack_priority_job "$issue_num"
1915
-
1916
2039
  # Clean up worktree (skip for org-mode clones — they persist)
1917
2040
  local job_repo
1918
2041
  job_repo=$(echo "$job" | jq -r '.repo // ""')
@@ -1951,6 +2074,9 @@ daemon_reap_completed() {
1951
2074
  daemon_on_success() {
1952
2075
  local issue_num="$1" duration="${2:-}"
1953
2076
 
2077
+ # Reset consecutive failure tracking on any success
2078
+ reset_failure_tracking
2079
+
1954
2080
  daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
1955
2081
 
1956
2082
  # Record pipeline duration for adaptive threshold learning
@@ -2011,6 +2137,91 @@ Check the associated PR for the implementation." 2>/dev/null || true
2011
2137
  "$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
2012
2138
  }
2013
2139
 
2140
+ # ─── Failure Classification ─────────────────────────────────────────────────
2141
+
2142
+ classify_failure() {
2143
+ local issue_num="$1"
2144
+ if [[ -z "${LOG_DIR:-}" ]]; then
2145
+ echo "unknown"
2146
+ return
2147
+ fi
2148
+ local log_path="$LOG_DIR/issue-${issue_num}.log"
2149
+ if [[ ! -f "$log_path" ]]; then
2150
+ echo "unknown"
2151
+ return
2152
+ fi
2153
+ local tail_content
2154
+ tail_content=$(tail -200 "$log_path" 2>/dev/null || true)
2155
+
2156
+ # Auth errors
2157
+ if echo "$tail_content" | grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required'; then
2158
+ echo "auth_error"
2159
+ return
2160
+ fi
2161
+ # API errors (rate limits, timeouts, server errors)
2162
+ if echo "$tail_content" | grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable'; then
2163
+ echo "api_error"
2164
+ return
2165
+ fi
2166
+ # Invalid issue (not found, empty body)
2167
+ if echo "$tail_content" | grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist'; then
2168
+ echo "invalid_issue"
2169
+ return
2170
+ fi
2171
+ # Context exhaustion — check progress file
2172
+ local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2173
+ local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2174
+ if [[ -f "$progress_file" ]]; then
2175
+ local cf_iter
2176
+ cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2177
+ if ! [[ "${cf_iter:-0}" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
2178
+ local cf_tests
2179
+ cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2180
+ if [[ "${cf_iter:-0}" -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
2181
+ echo "context_exhaustion"
2182
+ return
2183
+ fi
2184
+ fi
2185
+ # Build failure (test errors, compile errors)
2186
+ if echo "$tail_content" | grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]'; then
2187
+ echo "build_failure"
2188
+ return
2189
+ fi
2190
+ echo "unknown"
2191
+ }
2192
+
2193
+ # ─── Consecutive Failure Tracking ──────────────────────────────────────────
2194
+
2195
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2196
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2197
+
2198
+ record_failure_class() {
2199
+ local failure_class="$1"
2200
+ if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
2201
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
2202
+ else
2203
+ DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
2204
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=1
2205
+ fi
2206
+
2207
+ if [[ "$DAEMON_CONSECUTIVE_FAILURE_COUNT" -ge 3 ]]; then
2208
+ daemon_log ERROR "3 consecutive failures (class: ${failure_class}) — auto-pausing daemon"
2209
+ local pause_json
2210
+ pause_json=$(jq -n --arg reason "consecutive_${failure_class}" --arg ts "$(now_iso)" \
2211
+ '{reason: $reason, timestamp: $ts}')
2212
+ local _tmp_pause
2213
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
2214
+ echo "$pause_json" > "$_tmp_pause"
2215
+ mv "$_tmp_pause" "$PAUSE_FLAG"
2216
+ emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$DAEMON_CONSECUTIVE_FAILURE_COUNT"
2217
+ fi
2218
+ }
2219
+
2220
+ reset_failure_tracking() {
2221
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2222
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2223
+ }
2224
+
2014
2225
  # ─── Failure Handler ────────────────────────────────────────────────────────
2015
2226
 
2016
2227
  daemon_on_failure() {
@@ -2047,123 +2258,143 @@ daemon_on_failure() {
2047
2258
  completed_at: $completed_at
2048
2259
  }] | .completed = .completed[-500:]'
2049
2260
 
2261
+ # ── Classify failure and decide retry strategy ──
2262
+ local failure_class
2263
+ failure_class=$(classify_failure "$issue_num")
2264
+ daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
2265
+ emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
2266
+ record_failure_class "$failure_class"
2267
+
2050
2268
  # ── Auto-retry with strategy escalation ──
2051
2269
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
2052
2270
  local retry_count
2053
2271
  retry_count=$(jq -r --arg num "$issue_num" \
2054
2272
  '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
2055
2273
 
2056
- if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
2057
- retry_count=$((retry_count + 1))
2058
-
2059
- # Update retry count in state (locked to prevent race)
2060
- locked_state_update \
2061
- --arg num "$issue_num" --argjson count "$retry_count" \
2062
- '.retry_counts[$num] = $count'
2063
-
2064
- daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
2065
- emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
2066
-
2067
- # Check for checkpoint to enable resume-from-checkpoint
2068
- local checkpoint_args=()
2069
- if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2070
- # Try to find worktree for this issue to check for checkpoints
2071
- local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2072
- if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2073
- local latest_checkpoint=""
2074
- for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2075
- [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2076
- done
2077
- if [[ -n "$latest_checkpoint" ]]; then
2078
- daemon_log INFO "Found checkpoint: $latest_checkpoint"
2079
- emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2080
- checkpoint_args+=("--resume")
2081
- fi
2274
+ # Non-retryable failures skip retry entirely
2275
+ case "$failure_class" in
2276
+ auth_error)
2277
+ daemon_log ERROR "Auth error for issue #${issue_num} — skipping retry"
2278
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
2279
+ if [[ "$NO_GITHUB" != "true" ]]; then
2280
+ gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
2082
2281
  fi
2083
- fi
2084
-
2085
- # Detect context exhaustion from progress file
2086
- local failure_reason="unknown"
2087
- local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2088
- local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2089
- if [[ -f "$progress_file" ]]; then
2090
- local progress_iter
2091
- progress_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2092
- if ! [[ "${progress_iter:-0}" =~ ^[0-9]+$ ]]; then
2093
- progress_iter="0"
2094
- fi
2095
- local progress_tests
2096
- progress_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2097
- if [[ "${progress_iter:-0}" -gt 0 ]] && { [[ "$progress_tests" == "false" ]] || [[ "$progress_tests" == "unknown" ]]; }; then
2098
- failure_reason="context_exhaustion"
2099
- emit_event "daemon.context_exhaustion" "issue=$issue_num" "iterations=$progress_iter"
2100
- daemon_log WARN "Context exhaustion detected for issue #${issue_num} (iterations: ${progress_iter})"
2282
+ ;;
2283
+ invalid_issue)
2284
+ daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
2285
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
2286
+ if [[ "$NO_GITHUB" != "true" ]]; then
2287
+ gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
2101
2288
  fi
2102
- fi
2289
+ ;;
2290
+ *)
2291
+ # Retryable failures — proceed with escalation
2292
+ if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
2293
+ retry_count=$((retry_count + 1))
2294
+
2295
+ # Update retry count in state (locked to prevent race)
2296
+ locked_state_update \
2297
+ --arg num "$issue_num" --argjson count "$retry_count" \
2298
+ '.retry_counts[$num] = $count'
2299
+
2300
+ daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num} (class: ${failure_class})"
2301
+ emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}" "class=$failure_class"
2302
+
2303
+ # Check for checkpoint to enable resume-from-checkpoint
2304
+ local checkpoint_args=()
2305
+ if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2306
+ local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2307
+ if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2308
+ local latest_checkpoint=""
2309
+ for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2310
+ [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2311
+ done
2312
+ if [[ -n "$latest_checkpoint" ]]; then
2313
+ daemon_log INFO "Found checkpoint: $latest_checkpoint"
2314
+ emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2315
+ checkpoint_args+=("--resume")
2316
+ fi
2317
+ fi
2318
+ fi
2103
2319
 
2104
- # Build escalated pipeline args
2105
- local retry_template="$PIPELINE_TEMPLATE"
2106
- local retry_model="${MODEL:-opus}"
2107
- local extra_args=()
2108
-
2109
- if [[ "$retry_count" -eq 1 ]]; then
2110
- # Retry 1: same template, upgrade model, more iterations
2111
- retry_model="opus"
2112
- extra_args+=("--max-iterations" "30")
2113
- daemon_log INFO "Escalation: model=opus, max_iterations=30"
2114
- elif [[ "$retry_count" -ge 2 ]]; then
2115
- # Retry 2: full template, compound quality max cycles
2116
- retry_template="full"
2117
- retry_model="opus"
2118
- extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2119
- daemon_log INFO "Escalation: template=full, compound_cycles=5"
2120
- fi
2320
+ # Build escalated pipeline args
2321
+ local retry_template="$PIPELINE_TEMPLATE"
2322
+ local retry_model="${MODEL:-opus}"
2323
+ local extra_args=()
2324
+
2325
+ if [[ "$retry_count" -eq 1 ]]; then
2326
+ retry_model="opus"
2327
+ extra_args+=("--max-iterations" "30")
2328
+ daemon_log INFO "Escalation: model=opus, max_iterations=30"
2329
+ elif [[ "$retry_count" -ge 2 ]]; then
2330
+ retry_template="full"
2331
+ retry_model="opus"
2332
+ extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2333
+ daemon_log INFO "Escalation: template=full, compound_cycles=5"
2334
+ fi
2121
2335
 
2122
- # Increase restarts on context exhaustion
2123
- if [[ "$failure_reason" == "context_exhaustion" ]]; then
2124
- local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2125
- # Cap at sw-loop's hard limit of 5
2126
- if [[ "$boosted_restarts" -gt 5 ]]; then
2127
- boosted_restarts=5
2128
- fi
2129
- extra_args+=("--max-restarts" "$boosted_restarts")
2130
- daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2131
- fi
2336
+ # Increase restarts on context exhaustion
2337
+ if [[ "$failure_class" == "context_exhaustion" ]]; then
2338
+ local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2339
+ if [[ "$boosted_restarts" -gt 5 ]]; then
2340
+ boosted_restarts=5
2341
+ fi
2342
+ extra_args+=("--max-restarts" "$boosted_restarts")
2343
+ daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2344
+ fi
2132
2345
 
2133
- if [[ "$NO_GITHUB" != "true" ]]; then
2134
- gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2346
+ # API errors get extended backoff
2347
+ local api_backoff=300
2348
+ local backoff_secs=$((30 * retry_count))
2349
+ if [[ "$failure_class" == "api_error" ]]; then
2350
+ backoff_secs=$((api_backoff * retry_count))
2351
+ daemon_log INFO "API error — extended backoff ${backoff_secs}s"
2352
+ fi
2135
2353
 
2136
- Pipeline failed — retrying with escalated strategy.
2354
+ if [[ "$NO_GITHUB" != "true" ]]; then
2355
+ gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2356
+
2357
+ Pipeline failed (${failure_class}) — retrying with escalated strategy.
2137
2358
 
2138
2359
  | Field | Value |
2139
2360
  |-------|-------|
2140
2361
  | Retry | ${retry_count} / ${MAX_RETRIES:-2} |
2362
+ | Failure | \`${failure_class}\` |
2141
2363
  | Template | \`${retry_template}\` |
2142
2364
  | Model | \`${retry_model}\` |
2143
2365
  | Started | $(now_iso) |
2144
2366
 
2145
2367
  _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
2146
- fi
2368
+ fi
2147
2369
 
2148
- # Backoff before retry: 30s * retry_count (30s, 60s, ...)
2149
- local backoff_secs=$((30 * retry_count))
2150
- daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2151
- sleep "$backoff_secs"
2152
-
2153
- # Re-spawn with escalated strategy
2154
- local orig_template="$PIPELINE_TEMPLATE"
2155
- local orig_model="$MODEL"
2156
- PIPELINE_TEMPLATE="$retry_template"
2157
- MODEL="$retry_model"
2158
- daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
2159
- _retry_spawned_for="$issue_num"
2160
- PIPELINE_TEMPLATE="$orig_template"
2161
- MODEL="$orig_model"
2162
- return
2163
- fi
2370
+ daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2371
+ sleep "$backoff_secs"
2164
2372
 
2165
- daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2166
- emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2373
+ # Merge checkpoint args + extra args for passthrough
2374
+ local all_extra_args=()
2375
+ if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
2376
+ all_extra_args+=("${checkpoint_args[@]}")
2377
+ fi
2378
+ if [[ ${#extra_args[@]} -gt 0 ]]; then
2379
+ all_extra_args+=("${extra_args[@]}")
2380
+ fi
2381
+
2382
+ # Re-spawn with escalated strategy
2383
+ local orig_template="$PIPELINE_TEMPLATE"
2384
+ local orig_model="$MODEL"
2385
+ PIPELINE_TEMPLATE="$retry_template"
2386
+ MODEL="$retry_model"
2387
+ daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
2388
+ _retry_spawned_for="$issue_num"
2389
+ PIPELINE_TEMPLATE="$orig_template"
2390
+ MODEL="$orig_model"
2391
+ return
2392
+ fi
2393
+
2394
+ daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2395
+ emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2396
+ ;;
2397
+ esac
2167
2398
  fi
2168
2399
 
2169
2400
  # ── No retry — report final failure ──
@@ -3770,6 +4001,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
3770
4001
  patrol_meta_run
3771
4002
  fi
3772
4003
 
4004
+ # ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
4005
+ if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
4006
+ # shellcheck source=sw-strategic.sh
4007
+ source "$SCRIPT_DIR/sw-strategic.sh"
4008
+ strategic_patrol_run || true
4009
+ fi
4010
+
3773
4011
  # ── Summary ──
3774
4012
  emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
3775
4013
 
@@ -4095,13 +4333,15 @@ daemon_health_check() {
4095
4333
  now_e=$(now_epoch)
4096
4334
 
4097
4335
  if [[ -f "$STATE_FILE" ]]; then
4098
- # ── Progress-Based Health Monitoring ──
4099
- # Instead of killing after a static timeout, check for forward progress.
4100
- # Only kill when the agent is truly stuck (no stage change, no new code,
4101
- # same error repeating). A hard wall-clock limit remains as absolute safety net.
4336
+ # ── Intelligent Health Monitoring ──
4337
+ # Instead of killing after a countdown, sense what the agent is doing.
4338
+ # Agents think for long stretches — that's normal and expected.
4339
+ # Strategy: sense → understand → be patient → nudge → only kill as last resort.
4102
4340
 
4103
- local hard_limit="${PROGRESS_HARD_LIMIT_S:-10800}"
4341
+ local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
4104
4342
  local use_progress="${PROGRESS_MONITORING:-true}"
4343
+ local nudge_enabled="${NUDGE_ENABLED:-true}"
4344
+ local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
4105
4345
 
4106
4346
  while IFS= read -r job; do
4107
4347
  local pid started_at issue_num worktree
@@ -4122,8 +4362,8 @@ daemon_health_check() {
4122
4362
  elapsed=$(( now_e - start_e ))
4123
4363
  fi
4124
4364
 
4125
- # Hard wall-clock limit — absolute safety net (default 3h)
4126
- if [[ "$elapsed" -gt "$hard_limit" ]]; then
4365
+ # Hard wall-clock limit — disabled by default (0 = off)
4366
+ if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
4127
4367
  daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
4128
4368
  emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
4129
4369
  kill "$pid" 2>/dev/null || true
@@ -4132,7 +4372,7 @@ daemon_health_check() {
4132
4372
  continue
4133
4373
  fi
4134
4374
 
4135
- # Progress-based detection (when enabled)
4375
+ # ── Intelligent Progress Sensing ──
4136
4376
  if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
4137
4377
  local snapshot verdict
4138
4378
  snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
@@ -4140,29 +4380,87 @@ daemon_health_check() {
4140
4380
  if [[ "$snapshot" != "{}" ]]; then
4141
4381
  verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
4142
4382
 
4383
+ local no_progress_count=0
4384
+ no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4385
+ local cur_stage
4386
+ cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4387
+
4143
4388
  case "$verdict" in
4144
4389
  healthy)
4145
4390
  # All good — agent is making progress
4146
4391
  ;;
4147
4392
  slowing)
4148
- daemon_log INFO "Issue #${issue_num} slowing (no progress for 1-2 checks, ${elapsed}s elapsed)"
4393
+ daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
4149
4394
  ;;
4150
4395
  stalled)
4151
- local no_progress_count
4152
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4153
- daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks (${elapsed}s elapsed, PID $pid)"
4154
- emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4396
+ # Check if agent subprocess is alive and consuming CPU
4397
+ local agent_alive=false
4398
+ local child_cpu=0
4399
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4400
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4401
+ agent_alive=true
4402
+ fi
4403
+
4404
+ if [[ "$agent_alive" == "true" ]]; then
4405
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
4406
+ else
4407
+ daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
4408
+ emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4409
+ fi
4155
4410
  ;;
4156
4411
  stuck)
4157
- local no_progress_count repeated_errors cur_stage
4158
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4412
+ local repeated_errors
4159
4413
  repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4160
- cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4161
- daemon_log WARN "Issue #${issue_num} STUCK: no progress for ${no_progress_count} checks, ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4162
- emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid"
4163
- kill "$pid" 2>/dev/null || true
4164
- daemon_clear_progress "$issue_num"
4165
- findings=$((findings + 1))
4414
+
4415
+ # Even "stuck" check if the process tree is alive first
4416
+ local agent_alive=false
4417
+ local child_cpu=0
4418
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4419
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4420
+ agent_alive=true
4421
+ fi
4422
+
4423
+ if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
4424
+ # Agent is alive — nudge instead of kill
4425
+ if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
4426
+ local nudge_file="${worktree}/.claude/nudge.md"
4427
+ if [[ ! -f "$nudge_file" ]]; then
4428
+ cat > "$nudge_file" <<NUDGE_EOF
4429
+ # Nudge from Daemon Health Monitor
4430
+
4431
+ The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
4432
+ Current stage: ${cur_stage}
4433
+
4434
+ If you're stuck, consider:
4435
+ - Breaking the task into smaller steps
4436
+ - Committing partial progress
4437
+ - Running tests to validate current state
4438
+
4439
+ This is just a gentle check-in — take your time if you're working through a complex problem.
4440
+ NUDGE_EOF
4441
+ daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
4442
+ emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
4443
+ fi
4444
+ else
4445
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
4446
+ fi
4447
+ elif [[ "$repeated_errors" -ge 5 ]]; then
4448
+ # Truly stuck in an error loop — kill as last resort
4449
+ daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4450
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
4451
+ kill "$pid" 2>/dev/null || true
4452
+ daemon_clear_progress "$issue_num"
4453
+ findings=$((findings + 1))
4454
+ elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
4455
+ # Process tree is dead AND no progress for very long time
4456
+ daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
4457
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
4458
+ kill "$pid" 2>/dev/null || true
4459
+ daemon_clear_progress "$issue_num"
4460
+ findings=$((findings + 1))
4461
+ else
4462
+ daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
4463
+ fi
4166
4464
  ;;
4167
4465
  esac
4168
4466
  fi
@@ -4171,8 +4469,9 @@ daemon_health_check() {
4171
4469
  local stale_timeout
4172
4470
  stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
4173
4471
  if [[ "$elapsed" -gt "$stale_timeout" ]]; then
4174
- daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — killing"
4175
- kill "$pid" 2>/dev/null || true
4472
+ daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
4473
+ # Don't kill — just log. Let the process run.
4474
+ emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
4176
4475
  findings=$((findings + 1))
4177
4476
  fi
4178
4477
  fi
@@ -4765,6 +5064,7 @@ daemon_poll_loop() {
4765
5064
  # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
4766
5065
  # The || operator disables set -e for the entire call chain, so transient failures
4767
5066
  # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
5067
+ daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
4768
5068
  daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
4769
5069
  daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
4770
5070
  daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
@@ -4848,7 +5148,8 @@ cleanup_on_exit() {
4848
5148
  while IFS= read -r cpid; do
4849
5149
  [[ -z "$cpid" ]] && continue
4850
5150
  if kill -0 "$cpid" 2>/dev/null; then
4851
- daemon_log INFO "Killing pipeline process PID ${cpid}"
5151
+ daemon_log INFO "Killing pipeline process tree PID ${cpid}"
5152
+ pkill -TERM -P "$cpid" 2>/dev/null || true
4852
5153
  kill "$cpid" 2>/dev/null || true
4853
5154
  killed=$((killed + 1))
4854
5155
  fi
@@ -4860,7 +5161,8 @@ cleanup_on_exit() {
4860
5161
  while IFS= read -r cpid; do
4861
5162
  [[ -z "$cpid" ]] && continue
4862
5163
  if kill -0 "$cpid" 2>/dev/null; then
4863
- daemon_log WARN "Force-killing pipeline PID ${cpid}"
5164
+ daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
5165
+ pkill -9 -P "$cpid" 2>/dev/null || true
4864
5166
  kill -9 "$cpid" 2>/dev/null || true
4865
5167
  fi
4866
5168
  done <<< "$child_pids"