shipwright-cli 1.10.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. package/README.md +221 -55
  2. package/completions/_shipwright +264 -32
  3. package/completions/shipwright.bash +118 -26
  4. package/completions/shipwright.fish +80 -2
  5. package/dashboard/server.ts +208 -0
  6. package/docs/strategy/01-market-research.md +619 -0
  7. package/docs/strategy/02-mission-and-brand.md +587 -0
  8. package/docs/strategy/03-gtm-and-roadmap.md +759 -0
  9. package/docs/strategy/QUICK-START.txt +289 -0
  10. package/docs/strategy/README.md +172 -0
  11. package/docs/tmux-research/TMUX-ARCHITECTURE.md +567 -0
  12. package/docs/tmux-research/TMUX-AUDIT.md +925 -0
  13. package/docs/tmux-research/TMUX-BEST-PRACTICES-2025-2026.md +829 -0
  14. package/docs/tmux-research/TMUX-QUICK-REFERENCE.md +543 -0
  15. package/docs/tmux-research/TMUX-RESEARCH-INDEX.md +438 -0
  16. package/package.json +4 -2
  17. package/scripts/lib/helpers.sh +7 -0
  18. package/scripts/sw +323 -2
  19. package/scripts/sw-activity.sh +500 -0
  20. package/scripts/sw-adaptive.sh +925 -0
  21. package/scripts/sw-adversarial.sh +1 -1
  22. package/scripts/sw-architecture-enforcer.sh +1 -1
  23. package/scripts/sw-auth.sh +613 -0
  24. package/scripts/sw-autonomous.sh +754 -0
  25. package/scripts/sw-changelog.sh +704 -0
  26. package/scripts/sw-checkpoint.sh +1 -1
  27. package/scripts/sw-ci.sh +602 -0
  28. package/scripts/sw-cleanup.sh +1 -1
  29. package/scripts/sw-code-review.sh +698 -0
  30. package/scripts/sw-connect.sh +1 -1
  31. package/scripts/sw-context.sh +605 -0
  32. package/scripts/sw-cost.sh +44 -3
  33. package/scripts/sw-daemon.sh +568 -138
  34. package/scripts/sw-dashboard.sh +1 -1
  35. package/scripts/sw-db.sh +1380 -0
  36. package/scripts/sw-decompose.sh +539 -0
  37. package/scripts/sw-deps.sh +551 -0
  38. package/scripts/sw-developer-simulation.sh +1 -1
  39. package/scripts/sw-discovery.sh +412 -0
  40. package/scripts/sw-docs-agent.sh +539 -0
  41. package/scripts/sw-docs.sh +1 -1
  42. package/scripts/sw-doctor.sh +107 -1
  43. package/scripts/sw-dora.sh +615 -0
  44. package/scripts/sw-durable.sh +710 -0
  45. package/scripts/sw-e2e-orchestrator.sh +535 -0
  46. package/scripts/sw-eventbus.sh +393 -0
  47. package/scripts/sw-feedback.sh +479 -0
  48. package/scripts/sw-fix.sh +1 -1
  49. package/scripts/sw-fleet-discover.sh +567 -0
  50. package/scripts/sw-fleet-viz.sh +404 -0
  51. package/scripts/sw-fleet.sh +8 -1
  52. package/scripts/sw-github-app.sh +596 -0
  53. package/scripts/sw-github-checks.sh +4 -4
  54. package/scripts/sw-github-deploy.sh +1 -1
  55. package/scripts/sw-github-graphql.sh +1 -1
  56. package/scripts/sw-guild.sh +569 -0
  57. package/scripts/sw-heartbeat.sh +1 -1
  58. package/scripts/sw-hygiene.sh +559 -0
  59. package/scripts/sw-incident.sh +656 -0
  60. package/scripts/sw-init.sh +237 -24
  61. package/scripts/sw-instrument.sh +699 -0
  62. package/scripts/sw-intelligence.sh +1 -1
  63. package/scripts/sw-jira.sh +1 -1
  64. package/scripts/sw-launchd.sh +363 -28
  65. package/scripts/sw-linear.sh +1 -1
  66. package/scripts/sw-logs.sh +1 -1
  67. package/scripts/sw-loop.sh +267 -21
  68. package/scripts/sw-memory.sh +18 -1
  69. package/scripts/sw-mission-control.sh +487 -0
  70. package/scripts/sw-model-router.sh +545 -0
  71. package/scripts/sw-otel.sh +596 -0
  72. package/scripts/sw-oversight.sh +764 -0
  73. package/scripts/sw-pipeline-composer.sh +1 -1
  74. package/scripts/sw-pipeline-vitals.sh +1 -1
  75. package/scripts/sw-pipeline.sh +947 -35
  76. package/scripts/sw-pm.sh +758 -0
  77. package/scripts/sw-pr-lifecycle.sh +522 -0
  78. package/scripts/sw-predictive.sh +8 -1
  79. package/scripts/sw-prep.sh +1 -1
  80. package/scripts/sw-ps.sh +1 -1
  81. package/scripts/sw-public-dashboard.sh +798 -0
  82. package/scripts/sw-quality.sh +595 -0
  83. package/scripts/sw-reaper.sh +1 -1
  84. package/scripts/sw-recruit.sh +2248 -0
  85. package/scripts/sw-regression.sh +642 -0
  86. package/scripts/sw-release-manager.sh +736 -0
  87. package/scripts/sw-release.sh +706 -0
  88. package/scripts/sw-remote.sh +1 -1
  89. package/scripts/sw-replay.sh +520 -0
  90. package/scripts/sw-retro.sh +691 -0
  91. package/scripts/sw-scale.sh +444 -0
  92. package/scripts/sw-security-audit.sh +505 -0
  93. package/scripts/sw-self-optimize.sh +1 -1
  94. package/scripts/sw-session.sh +1 -1
  95. package/scripts/sw-setup.sh +263 -127
  96. package/scripts/sw-standup.sh +712 -0
  97. package/scripts/sw-status.sh +44 -2
  98. package/scripts/sw-strategic.sh +806 -0
  99. package/scripts/sw-stream.sh +450 -0
  100. package/scripts/sw-swarm.sh +620 -0
  101. package/scripts/sw-team-stages.sh +511 -0
  102. package/scripts/sw-templates.sh +4 -4
  103. package/scripts/sw-testgen.sh +566 -0
  104. package/scripts/sw-tmux-pipeline.sh +554 -0
  105. package/scripts/sw-tmux-role-color.sh +58 -0
  106. package/scripts/sw-tmux-status.sh +128 -0
  107. package/scripts/sw-tmux.sh +1 -1
  108. package/scripts/sw-trace.sh +485 -0
  109. package/scripts/sw-tracker-github.sh +188 -0
  110. package/scripts/sw-tracker-jira.sh +172 -0
  111. package/scripts/sw-tracker-linear.sh +251 -0
  112. package/scripts/sw-tracker.sh +117 -2
  113. package/scripts/sw-triage.sh +627 -0
  114. package/scripts/sw-upgrade.sh +1 -1
  115. package/scripts/sw-ux.sh +677 -0
  116. package/scripts/sw-webhook.sh +627 -0
  117. package/scripts/sw-widgets.sh +530 -0
  118. package/scripts/sw-worktree.sh +1 -1
  119. package/templates/pipelines/autonomous.json +2 -2
  120. package/tmux/shipwright-overlay.conf +35 -17
  121. package/tmux/tmux.conf +23 -21
@@ -6,7 +6,10 @@
6
6
  set -euo pipefail
7
7
  trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
8
8
 
9
- VERSION="1.10.0"
9
+ # Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
10
+ unset CLAUDECODE 2>/dev/null || true
11
+
12
+ VERSION="2.1.0"
10
13
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
14
  REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
12
15
 
@@ -37,6 +40,10 @@ RESET='\033[0m'
37
40
  # shellcheck source=sw-pipeline-vitals.sh
38
41
  [[ -f "$SCRIPT_DIR/sw-pipeline-vitals.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-vitals.sh"
39
42
 
43
+ # ─── SQLite Persistence (optional) ──────────────────────────────────────────
44
+ # shellcheck source=sw-db.sh
45
+ [[ -f "$SCRIPT_DIR/sw-db.sh" ]] && source "$SCRIPT_DIR/sw-db.sh"
46
+
40
47
  # ─── GitHub API Modules (optional) ────────────────────────────────────────
41
48
  # shellcheck source=sw-github-graphql.sh
42
49
  [[ -f "$SCRIPT_DIR/sw-github-graphql.sh" ]] && source "$SCRIPT_DIR/sw-github-graphql.sh"
@@ -478,9 +485,11 @@ load_config() {
478
485
 
479
486
  # progress-based health monitoring (replaces static timeouts)
480
487
  PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
481
- PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 3' "$config_file")
482
- PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 6' "$config_file")
483
- PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 10800' "$config_file") # 3hr absolute max
488
+ PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
489
+ PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
490
+ PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
491
+ NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
492
+ NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
484
493
 
485
494
  # team dashboard URL (for coordinated claiming)
486
495
  local cfg_dashboard_url
@@ -502,11 +511,13 @@ load_config() {
502
511
 
503
512
  setup_dirs() {
504
513
  mkdir -p "$DAEMON_DIR"
514
+ mkdir -p "$HOME/.shipwright"
505
515
 
506
516
  STATE_FILE="$DAEMON_DIR/daemon-state.json"
507
517
  LOG_FILE="$DAEMON_DIR/daemon.log"
508
518
  LOG_DIR="$DAEMON_DIR/logs"
509
519
  WORKTREE_DIR=".worktrees"
520
+ PAUSE_FLAG="${HOME}/.shipwright/daemon-pause.flag"
510
521
 
511
522
  mkdir -p "$LOG_DIR"
512
523
  mkdir -p "$HOME/.shipwright/progress"
@@ -836,6 +847,31 @@ daemon_assess_progress() {
836
847
  has_progress=true
837
848
  fi
838
849
 
850
+ # Claude subprocess is alive and consuming CPU — agent is thinking/working
851
+ # During build stage, Claude can spend 10+ minutes thinking before any
852
+ # visible git changes appear. Detect this as progress.
853
+ if [[ "$has_progress" != "true" ]]; then
854
+ local _pid_for_check
855
+ _pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
856
+ if [[ -z "$_pid_for_check" ]]; then
857
+ # Fallback: get PID from active_jobs
858
+ _pid_for_check=$(jq -r --argjson num "$issue_num" \
859
+ '.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
860
+ fi
861
+ if [[ -n "$_pid_for_check" ]]; then
862
+ # Check if any child process (claude) is alive and using CPU
863
+ local child_cpu=0
864
+ child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
865
+ if [[ "$child_cpu" -eq 0 ]]; then
866
+ # Check children of the pipeline process
867
+ child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
868
+ fi
869
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
870
+ has_progress=true
871
+ fi
872
+ fi
873
+ fi
874
+
839
875
  # Detect repeated errors (same error signature hitting again)
840
876
  local repeated_errors="$prev_repeated_errors"
841
877
  if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
@@ -1208,6 +1244,74 @@ gh_record_failure() {
1208
1244
  fi
1209
1245
  }
1210
1246
 
1247
+ # ─── Runtime Auth Check ──────────────────────────────────────────────────────
1248
+
1249
+ LAST_AUTH_CHECK_EPOCH=0
1250
+ AUTH_CHECK_INTERVAL=300 # 5 minutes
1251
+
1252
+ daemon_preflight_auth_check() {
1253
+ local now_e
1254
+ now_e=$(now_epoch)
1255
+ if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
1256
+ return 0
1257
+ fi
1258
+ LAST_AUTH_CHECK_EPOCH="$now_e"
1259
+
1260
+ # gh auth check
1261
+ if [[ "${NO_GITHUB:-false}" != "true" ]]; then
1262
+ if ! gh auth status &>/dev/null 2>&1; then
1263
+ daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
1264
+ local pause_json
1265
+ pause_json=$(jq -n --arg reason "gh_auth_failure" --arg ts "$(now_iso)" \
1266
+ '{reason: $reason, timestamp: $ts}')
1267
+ local _tmp_pause
1268
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1269
+ echo "$pause_json" > "$_tmp_pause"
1270
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1271
+ emit_event "daemon.auto_pause" "reason=gh_auth_failure"
1272
+ return 1
1273
+ fi
1274
+ fi
1275
+
1276
+ # claude auth check with 15s timeout (macOS has no timeout command)
1277
+ local claude_auth_ok=false
1278
+ local _auth_tmp
1279
+ _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
1280
+ ( claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null ) &
1281
+ local _auth_pid=$!
1282
+ local _auth_waited=0
1283
+ while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
1284
+ sleep 1
1285
+ _auth_waited=$((_auth_waited + 1))
1286
+ done
1287
+ if kill -0 "$_auth_pid" 2>/dev/null; then
1288
+ kill "$_auth_pid" 2>/dev/null || true
1289
+ wait "$_auth_pid" 2>/dev/null || true
1290
+ else
1291
+ wait "$_auth_pid" 2>/dev/null || true
1292
+ fi
1293
+
1294
+ if [[ -s "$_auth_tmp" ]]; then
1295
+ claude_auth_ok=true
1296
+ fi
1297
+ rm -f "$_auth_tmp"
1298
+
1299
+ if [[ "$claude_auth_ok" != "true" ]]; then
1300
+ daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
1301
+ local pause_json
1302
+ pause_json=$(jq -n --arg reason "claude_auth_failure" --arg ts "$(now_iso)" \
1303
+ '{reason: $reason, timestamp: $ts}')
1304
+ local _tmp_pause
1305
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1306
+ echo "$pause_json" > "$_tmp_pause"
1307
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1308
+ emit_event "daemon.auto_pause" "reason=claude_auth_failure"
1309
+ return 1
1310
+ fi
1311
+
1312
+ return 0
1313
+ }
1314
+
1211
1315
  # ─── Pre-flight Checks ──────────────────────────────────────────────────────
1212
1316
 
1213
1317
  preflight_checks() {
@@ -1369,6 +1473,7 @@ init_state() {
1369
1473
  queued: [],
1370
1474
  completed: [],
1371
1475
  retry_counts: {},
1476
+ failure_history: [],
1372
1477
  priority_lane_active: [],
1373
1478
  titles: {}
1374
1479
  }')
@@ -1609,9 +1714,24 @@ daemon_spawn_pipeline() {
1609
1714
  local issue_num="$1"
1610
1715
  local issue_title="${2:-}"
1611
1716
  local repo_full_name="${3:-}" # owner/repo (org mode only)
1717
+ shift 3 2>/dev/null || true
1718
+ local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
1612
1719
 
1613
1720
  daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
1614
1721
 
1722
+ # ── Issue decomposition (if decomposer available) ──
1723
+ local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
1724
+ if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
1725
+ local decompose_result=""
1726
+ decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
1727
+ if [[ "$decompose_result" == *"decomposed"* ]]; then
1728
+ daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
1729
+ # Remove the shipwright label so decomposed parent doesn't re-queue
1730
+ gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
1731
+ return 0
1732
+ fi
1733
+ fi
1734
+
1615
1735
  # Extract goal text from issue (title + first line of body)
1616
1736
  local issue_goal="$issue_title"
1617
1737
  if [[ "$NO_GITHUB" != "true" ]]; then
@@ -1727,11 +1847,18 @@ daemon_spawn_pipeline() {
1727
1847
  pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
1728
1848
  fi
1729
1849
 
1850
+ # Append any extra pipeline args (from retry escalation, etc.)
1851
+ if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
1852
+ pipeline_args+=("${extra_pipeline_args[@]}")
1853
+ fi
1854
+
1730
1855
  # Run pipeline in work directory (background)
1856
+ # Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
1731
1857
  echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
1732
1858
  (
1859
+ trap '' HUP
1733
1860
  cd "$work_dir"
1734
- "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1861
+ exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1735
1862
  ) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
1736
1863
  local pid=$!
1737
1864
 
@@ -1767,6 +1894,14 @@ _Progress updates will appear below as the pipeline advances through each stage.
1767
1894
 
1768
1895
  daemon_track_job() {
1769
1896
  local issue_num="$1" pid="$2" worktree="$3" title="${4:-}" repo="${5:-}" goal="${6:-}"
1897
+
1898
+ # Write to SQLite (non-blocking, best-effort)
1899
+ if type db_save_job &>/dev/null; then
1900
+ local job_id="daemon-${issue_num}-$(now_epoch)"
1901
+ db_save_job "$job_id" "$issue_num" "$title" "$pid" "$worktree" "" "${PIPELINE_TEMPLATE:-autonomous}" "$goal" 2>/dev/null || true
1902
+ fi
1903
+
1904
+ # Always write to JSON state file (primary for now)
1770
1905
  locked_state_update \
1771
1906
  --argjson num "$issue_num" \
1772
1907
  --argjson pid "$pid" \
@@ -1855,6 +1990,16 @@ daemon_reap_completed() {
1855
1990
  [[ "$start_epoch" -gt 0 ]] && dur_s=$((end_epoch - start_epoch))
1856
1991
  emit_event "daemon.reap" "issue=$issue_num" "result=$result_str" "duration_s=$dur_s"
1857
1992
 
1993
+ # Update SQLite (mark job complete/failed)
1994
+ if type db_complete_job &>/dev/null && type db_fail_job &>/dev/null; then
1995
+ local _db_job_id="daemon-${issue_num}-${start_epoch}"
1996
+ if [[ "$exit_code" -eq 0 ]]; then
1997
+ db_complete_job "$_db_job_id" "$result_str" 2>/dev/null || true
1998
+ else
1999
+ db_fail_job "$_db_job_id" "$result_str" 2>/dev/null || true
2000
+ fi
2001
+ fi
2002
+
1858
2003
  if [[ "$exit_code" -eq 0 ]]; then
1859
2004
  daemon_on_success "$issue_num" "$duration_str"
1860
2005
  else
@@ -1904,15 +2049,18 @@ daemon_reap_completed() {
1904
2049
  reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
1905
2050
  release_claim "$issue_num" "$reap_machine_name"
1906
2051
 
1907
- # Skip cleanup if a retry was just spawned for this issue
2052
+ # Always remove the OLD job entry from active_jobs to prevent
2053
+ # re-reaping of the dead PID on the next cycle. When a retry was
2054
+ # spawned, daemon_spawn_pipeline already added a fresh entry with
2055
+ # the new PID — we must not leave the stale one behind.
2056
+ locked_state_update --argjson num "$issue_num" \
2057
+ --argjson old_pid "${pid:-0}" \
2058
+ '.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
2059
+ untrack_priority_job "$issue_num"
2060
+
1908
2061
  if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
1909
2062
  daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
1910
2063
  else
1911
- # Remove from active_jobs and priority lane tracking (locked)
1912
- locked_state_update --argjson num "$issue_num" \
1913
- '.active_jobs = [.active_jobs[] | select(.issue != $num)]'
1914
- untrack_priority_job "$issue_num"
1915
-
1916
2064
  # Clean up worktree (skip for org-mode clones — they persist)
1917
2065
  local job_repo
1918
2066
  job_repo=$(echo "$job" | jq -r '.repo // ""')
@@ -1951,6 +2099,9 @@ daemon_reap_completed() {
1951
2099
  daemon_on_success() {
1952
2100
  local issue_num="$1" duration="${2:-}"
1953
2101
 
2102
+ # Reset consecutive failure tracking on any success
2103
+ reset_failure_tracking
2104
+
1954
2105
  daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
1955
2106
 
1956
2107
  # Record pipeline duration for adaptive threshold learning
@@ -2009,6 +2160,149 @@ Check the associated PR for the implementation." 2>/dev/null || true
2009
2160
  notify "Pipeline Complete — Issue #${issue_num}" \
2010
2161
  "Duration: ${duration:-unknown}" "success"
2011
2162
  "$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
2163
+
2164
+ # PM agent: record success for learning
2165
+ if [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
2166
+ bash "$SCRIPT_DIR/sw-pm.sh" learn "$issue_num" success 2>/dev/null || true
2167
+ fi
2168
+ }
2169
+
2170
+ # ─── Failure Classification ─────────────────────────────────────────────────
2171
+
2172
+ classify_failure() {
2173
+ local issue_num="$1"
2174
+ if [[ -z "${LOG_DIR:-}" ]]; then
2175
+ echo "unknown"
2176
+ return
2177
+ fi
2178
+ local log_path="$LOG_DIR/issue-${issue_num}.log"
2179
+ if [[ ! -f "$log_path" ]]; then
2180
+ echo "unknown"
2181
+ return
2182
+ fi
2183
+ local tail_content
2184
+ tail_content=$(tail -200 "$log_path" 2>/dev/null || true)
2185
+
2186
+ # Auth errors
2187
+ if echo "$tail_content" | grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required'; then
2188
+ echo "auth_error"
2189
+ return
2190
+ fi
2191
+ # API errors (rate limits, timeouts, server errors)
2192
+ if echo "$tail_content" | grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable'; then
2193
+ echo "api_error"
2194
+ return
2195
+ fi
2196
+ # Invalid issue (not found, empty body)
2197
+ if echo "$tail_content" | grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist'; then
2198
+ echo "invalid_issue"
2199
+ return
2200
+ fi
2201
+ # Context exhaustion — check progress file
2202
+ local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2203
+ local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2204
+ if [[ -f "$progress_file" ]]; then
2205
+ local cf_iter
2206
+ cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2207
+ if ! [[ "${cf_iter:-0}" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
2208
+ local cf_tests
2209
+ cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2210
+ if [[ "${cf_iter:-0}" -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
2211
+ echo "context_exhaustion"
2212
+ return
2213
+ fi
2214
+ fi
2215
+ # Build failure (test errors, compile errors)
2216
+ if echo "$tail_content" | grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]'; then
2217
+ echo "build_failure"
2218
+ return
2219
+ fi
2220
+ echo "unknown"
2221
+ }
2222
+
2223
+ # ─── Consecutive Failure Tracking (persisted + adaptive) ─────────────────────
2224
+
2225
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2226
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2227
+
2228
+ # Max retries per failure class (adaptive retry strategy)
2229
+ get_max_retries_for_class() {
2230
+ local class="${1:-unknown}"
2231
+ case "$class" in
2232
+ auth_error|invalid_issue) echo 0 ;;
2233
+ api_error) echo "${MAX_RETRIES_API_ERROR:-4}" ;;
2234
+ context_exhaustion) echo "${MAX_RETRIES_CONTEXT_EXHAUSTION:-2}" ;;
2235
+ build_failure) echo "${MAX_RETRIES_BUILD:-2}" ;;
2236
+ *) echo "${MAX_RETRIES:-2}" ;;
2237
+ esac
2238
+ }
2239
+
2240
+ # Append failure to persisted history and compute consecutive count; smart pause with exponential backoff
2241
+ record_failure_class() {
2242
+ local failure_class="$1"
2243
+ # In-memory consecutive (for backward compat)
2244
+ if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
2245
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
2246
+ else
2247
+ DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
2248
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=1
2249
+ fi
2250
+
2251
+ # Persist failure to state (failure_history) for pattern tracking
2252
+ if [[ -f "${STATE_FILE:-}" ]]; then
2253
+ local entry
2254
+ entry=$(jq -n --arg ts "$(now_iso)" --arg class "$failure_class" '{ts: $ts, class: $class}')
2255
+ locked_state_update --argjson entry "$entry" \
2256
+ '.failure_history = ((.failure_history // []) + [$entry] | .[-100:])' 2>/dev/null || true
2257
+ fi
2258
+
2259
+ # Consecutive count from persisted tail: count only the unbroken run of $failure_class
2260
+ # from the newest entry backwards (not total occurrences)
2261
+ local consecutive="$DAEMON_CONSECUTIVE_FAILURE_COUNT"
2262
+ if [[ -f "${STATE_FILE:-}" ]]; then
2263
+ local from_state
2264
+ from_state=$(jq -r --arg c "$failure_class" '
2265
+ (.failure_history // []) | [.[].class] | reverse |
2266
+ if length == 0 then 0
2267
+ elif .[0] != $c then 0
2268
+ else
2269
+ reduce .[] as $x (
2270
+ {count: 0, done: false};
2271
+ if .done then . elif $x == $c then .count += 1 else .done = true end
2272
+ ) | .count
2273
+ end
2274
+ ' "$STATE_FILE" 2>/dev/null || echo "1")
2275
+ consecutive="${from_state:-1}"
2276
+ [[ "$consecutive" -eq 0 ]] && consecutive="$DAEMON_CONSECUTIVE_FAILURE_COUNT"
2277
+ DAEMON_CONSECUTIVE_FAILURE_COUNT="$consecutive"
2278
+ fi
2279
+
2280
+ # Smart pause: exponential backoff instead of hard stop (resume_after so daemon can auto-resume)
2281
+ if [[ "$consecutive" -ge 3 ]]; then
2282
+ local pause_mins=$((5 * (1 << (consecutive - 3))))
2283
+ [[ "$pause_mins" -gt 480 ]] && pause_mins=480
2284
+ local resume_ts resume_after
2285
+ resume_ts=$(($(date +%s) + pause_mins * 60))
2286
+ resume_after=$(epoch_to_iso "$resume_ts")
2287
+ daemon_log ERROR "${consecutive} consecutive failures (class: ${failure_class}) — auto-pausing until ${resume_after} (${pause_mins}m backoff)"
2288
+ local pause_json
2289
+ pause_json=$(jq -n \
2290
+ --arg reason "consecutive_${failure_class}" \
2291
+ --arg ts "$(now_iso)" \
2292
+ --arg resume "$resume_after" \
2293
+ --argjson count "$consecutive" \
2294
+ '{reason: $reason, timestamp: $ts, resume_after: $resume, consecutive_count: $count}')
2295
+ local _tmp_pause
2296
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
2297
+ echo "$pause_json" > "$_tmp_pause"
2298
+ mv "$_tmp_pause" "$PAUSE_FLAG"
2299
+ emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$consecutive" "resume_after=$resume_after"
2300
+ fi
2301
+ }
2302
+
2303
+ reset_failure_tracking() {
2304
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2305
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2012
2306
  }
2013
2307
 
2014
2308
  # ─── Failure Handler ────────────────────────────────────────────────────────
@@ -2047,126 +2341,152 @@ daemon_on_failure() {
2047
2341
  completed_at: $completed_at
2048
2342
  }] | .completed = .completed[-500:]'
2049
2343
 
2344
+ # ── Classify failure and decide retry strategy ──
2345
+ local failure_class
2346
+ failure_class=$(classify_failure "$issue_num")
2347
+ daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
2348
+ emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
2349
+ record_failure_class "$failure_class"
2350
+
2050
2351
  # ── Auto-retry with strategy escalation ──
2051
2352
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
2052
2353
  local retry_count
2053
2354
  retry_count=$(jq -r --arg num "$issue_num" \
2054
2355
  '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
2055
2356
 
2056
- if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
2057
- retry_count=$((retry_count + 1))
2058
-
2059
- # Update retry count in state (locked to prevent race)
2060
- locked_state_update \
2061
- --arg num "$issue_num" --argjson count "$retry_count" \
2062
- '.retry_counts[$num] = $count'
2063
-
2064
- daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
2065
- emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
2066
-
2067
- # Check for checkpoint to enable resume-from-checkpoint
2068
- local checkpoint_args=()
2069
- if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2070
- # Try to find worktree for this issue to check for checkpoints
2071
- local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2072
- if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2073
- local latest_checkpoint=""
2074
- for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2075
- [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2076
- done
2077
- if [[ -n "$latest_checkpoint" ]]; then
2078
- daemon_log INFO "Found checkpoint: $latest_checkpoint"
2079
- emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2080
- checkpoint_args+=("--resume")
2081
- fi
2082
- fi
2083
- fi
2084
-
2085
- # Detect context exhaustion from progress file
2086
- local failure_reason="unknown"
2087
- local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2088
- local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2089
- if [[ -f "$progress_file" ]]; then
2090
- local progress_iter
2091
- progress_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2092
- if ! [[ "${progress_iter:-0}" =~ ^[0-9]+$ ]]; then
2093
- progress_iter="0"
2357
+ # Non-retryable failures skip retry entirely
2358
+ case "$failure_class" in
2359
+ auth_error)
2360
+ daemon_log ERROR "Auth error for issue #${issue_num} skipping retry"
2361
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
2362
+ if [[ "$NO_GITHUB" != "true" ]]; then
2363
+ gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
2094
2364
  fi
2095
- local progress_tests
2096
- progress_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2097
- if [[ "${progress_iter:-0}" -gt 0 ]] && { [[ "$progress_tests" == "false" ]] || [[ "$progress_tests" == "unknown" ]]; }; then
2098
- failure_reason="context_exhaustion"
2099
- emit_event "daemon.context_exhaustion" "issue=$issue_num" "iterations=$progress_iter"
2100
- daemon_log WARN "Context exhaustion detected for issue #${issue_num} (iterations: ${progress_iter})"
2365
+ ;;
2366
+ invalid_issue)
2367
+ daemon_log ERROR "Invalid issue #${issue_num} skipping retry"
2368
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
2369
+ if [[ "$NO_GITHUB" != "true" ]]; then
2370
+ gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
2101
2371
  fi
2102
- fi
2372
+ ;;
2373
+ *)
2374
+ # Retryable failures — per-class max retries and escalation
2375
+ local effective_max
2376
+ effective_max=$(get_max_retries_for_class "$failure_class")
2377
+ if [[ "$retry_count" -lt "$effective_max" ]]; then
2378
+ retry_count=$((retry_count + 1))
2379
+
2380
+ # Update retry count in state (locked to prevent race)
2381
+ locked_state_update \
2382
+ --arg num "$issue_num" --argjson count "$retry_count" \
2383
+ '.retry_counts[$num] = $count'
2384
+
2385
+ daemon_log WARN "Auto-retry #${retry_count}/${effective_max} for issue #${issue_num} (class: ${failure_class})"
2386
+ emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=$effective_max" "class=$failure_class"
2387
+
2388
+ # Check for checkpoint to enable resume-from-checkpoint
2389
+ local checkpoint_args=()
2390
+ if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2391
+ local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2392
+ if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2393
+ local latest_checkpoint=""
2394
+ for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2395
+ [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2396
+ done
2397
+ if [[ -n "$latest_checkpoint" ]]; then
2398
+ daemon_log INFO "Found checkpoint: $latest_checkpoint"
2399
+ emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2400
+ checkpoint_args+=("--resume")
2401
+ fi
2402
+ fi
2403
+ fi
2103
2404
 
2104
- # Build escalated pipeline args
2105
- local retry_template="$PIPELINE_TEMPLATE"
2106
- local retry_model="${MODEL:-opus}"
2107
- local extra_args=()
2108
-
2109
- if [[ "$retry_count" -eq 1 ]]; then
2110
- # Retry 1: same template, upgrade model, more iterations
2111
- retry_model="opus"
2112
- extra_args+=("--max-iterations" "30")
2113
- daemon_log INFO "Escalation: model=opus, max_iterations=30"
2114
- elif [[ "$retry_count" -ge 2 ]]; then
2115
- # Retry 2: full template, compound quality max cycles
2116
- retry_template="full"
2117
- retry_model="opus"
2118
- extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2119
- daemon_log INFO "Escalation: template=full, compound_cycles=5"
2120
- fi
2405
+ # Build escalated pipeline args
2406
+ local retry_template="$PIPELINE_TEMPLATE"
2407
+ local retry_model="${MODEL:-opus}"
2408
+ local extra_args=()
2409
+
2410
+ if [[ "$retry_count" -eq 1 ]]; then
2411
+ retry_model="opus"
2412
+ extra_args+=("--max-iterations" "30")
2413
+ daemon_log INFO "Escalation: model=opus, max_iterations=30"
2414
+ elif [[ "$retry_count" -ge 2 ]]; then
2415
+ retry_template="full"
2416
+ retry_model="opus"
2417
+ extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2418
+ daemon_log INFO "Escalation: template=full, compound_cycles=5"
2419
+ fi
2121
2420
 
2122
- # Increase restarts on context exhaustion
2123
- if [[ "$failure_reason" == "context_exhaustion" ]]; then
2124
- local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2125
- # Cap at sw-loop's hard limit of 5
2126
- if [[ "$boosted_restarts" -gt 5 ]]; then
2127
- boosted_restarts=5
2128
- fi
2129
- extra_args+=("--max-restarts" "$boosted_restarts")
2130
- daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2131
- fi
2421
+ # Increase restarts on context exhaustion
2422
+ if [[ "$failure_class" == "context_exhaustion" ]]; then
2423
+ local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2424
+ if [[ "$boosted_restarts" -gt 5 ]]; then
2425
+ boosted_restarts=5
2426
+ fi
2427
+ extra_args+=("--max-restarts" "$boosted_restarts")
2428
+ daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2429
+ fi
2430
+
2431
+ # Exponential backoff (per-class base); cap at 1h
2432
+ local base_secs=30
2433
+ [[ "$failure_class" == "api_error" ]] && base_secs=300
2434
+ local backoff_secs=$((base_secs * (1 << (retry_count - 1))))
2435
+ [[ "$backoff_secs" -gt 3600 ]] && backoff_secs=3600
2436
+ [[ "$failure_class" == "api_error" ]] && daemon_log INFO "API error — exponential backoff ${backoff_secs}s"
2132
2437
 
2133
- if [[ "$NO_GITHUB" != "true" ]]; then
2134
- gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2438
+ if [[ "$NO_GITHUB" != "true" ]]; then
2439
+ gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2135
2440
 
2136
- Pipeline failed — retrying with escalated strategy.
2441
+ Pipeline failed (${failure_class}) — retrying with escalated strategy.
2137
2442
 
2138
2443
  | Field | Value |
2139
2444
  |-------|-------|
2140
2445
  | Retry | ${retry_count} / ${MAX_RETRIES:-2} |
2446
+ | Failure | \`${failure_class}\` |
2141
2447
  | Template | \`${retry_template}\` |
2142
2448
  | Model | \`${retry_model}\` |
2143
2449
  | Started | $(now_iso) |
2144
2450
 
2145
2451
  _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
2146
- fi
2452
+ fi
2147
2453
 
2148
- # Backoff before retry: 30s * retry_count (30s, 60s, ...)
2149
- local backoff_secs=$((30 * retry_count))
2150
- daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2151
- sleep "$backoff_secs"
2152
-
2153
- # Re-spawn with escalated strategy
2154
- local orig_template="$PIPELINE_TEMPLATE"
2155
- local orig_model="$MODEL"
2156
- PIPELINE_TEMPLATE="$retry_template"
2157
- MODEL="$retry_model"
2158
- daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
2159
- _retry_spawned_for="$issue_num"
2160
- PIPELINE_TEMPLATE="$orig_template"
2161
- MODEL="$orig_model"
2162
- return
2163
- fi
2454
+ daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2455
+ sleep "$backoff_secs"
2164
2456
 
2165
- daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2166
- emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2457
+ # Merge checkpoint args + extra args for passthrough
2458
+ local all_extra_args=()
2459
+ if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
2460
+ all_extra_args+=("${checkpoint_args[@]}")
2461
+ fi
2462
+ if [[ ${#extra_args[@]} -gt 0 ]]; then
2463
+ all_extra_args+=("${extra_args[@]}")
2464
+ fi
2465
+
2466
+ # Re-spawn with escalated strategy
2467
+ local orig_template="$PIPELINE_TEMPLATE"
2468
+ local orig_model="$MODEL"
2469
+ PIPELINE_TEMPLATE="$retry_template"
2470
+ MODEL="$retry_model"
2471
+ daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
2472
+ _retry_spawned_for="$issue_num"
2473
+ PIPELINE_TEMPLATE="$orig_template"
2474
+ MODEL="$orig_model"
2475
+ return
2476
+ fi
2477
+
2478
+ daemon_log WARN "Max retries (${effective_max}) exhausted for issue #${issue_num}"
2479
+ emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2480
+ ;;
2481
+ esac
2167
2482
  fi
2168
2483
 
2169
2484
  # ── No retry — report final failure ──
2485
+ # PM agent: record failure for learning (only when we're done with this issue)
2486
+ if [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
2487
+ bash "$SCRIPT_DIR/sw-pm.sh" learn "$issue_num" failure 2>/dev/null || true
2488
+ fi
2489
+
2170
2490
  if [[ "$NO_GITHUB" != "true" ]]; then
2171
2491
  # Add failure label and remove watch label (prevent re-processing)
2172
2492
  gh issue edit "$issue_num" \
@@ -2191,10 +2511,11 @@ _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increa
2191
2511
 
2192
2512
  local retry_info=""
2193
2513
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
2194
- local final_count
2514
+ local final_count final_max
2195
2515
  final_count=$(jq -r --arg num "$issue_num" \
2196
2516
  '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
2197
- retry_info="| Retries | ${final_count} / ${MAX_RETRIES:-2} (exhausted) |"
2517
+ final_max=$(get_max_retries_for_class "$failure_class")
2518
+ retry_info="| Retries | ${final_count} / ${final_max} (exhausted) |"
2198
2519
  fi
2199
2520
 
2200
2521
  gh issue comment "$issue_num" --body "## ❌ Pipeline Failed
@@ -3770,6 +4091,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
3770
4091
  patrol_meta_run
3771
4092
  fi
3772
4093
 
4094
+ # ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
4095
+ if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
4096
+ # shellcheck source=sw-strategic.sh
4097
+ source "$SCRIPT_DIR/sw-strategic.sh"
4098
+ strategic_patrol_run || true
4099
+ fi
4100
+
3773
4101
  # ── Summary ──
3774
4102
  emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
3775
4103
 
@@ -3795,10 +4123,27 @@ daemon_poll_issues() {
3795
4123
  return
3796
4124
  fi
3797
4125
 
3798
- # Check for pause flag (set by dashboard or disk_low alert)
3799
- if [[ -f "$HOME/.shipwright/daemon-pause.flag" ]]; then
3800
- daemon_log INFO "Daemon paused — skipping poll"
3801
- return
4126
+ # Check for pause flag (set by dashboard, disk_low, or consecutive-failure backoff)
4127
+ local pause_file="${PAUSE_FLAG:-$HOME/.shipwright/daemon-pause.flag}"
4128
+ if [[ -f "$pause_file" ]]; then
4129
+ local resume_after
4130
+ resume_after=$(jq -r '.resume_after // empty' "$pause_file" 2>/dev/null || true)
4131
+ if [[ -n "$resume_after" ]]; then
4132
+ local now_epoch resume_epoch
4133
+ now_epoch=$(date +%s)
4134
+ resume_epoch=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$resume_after" +%s 2>/dev/null || \
4135
+ date -d "$resume_after" +%s 2>/dev/null || echo 0)
4136
+ if [[ "$resume_epoch" -gt 0 ]] && [[ "$now_epoch" -ge "$resume_epoch" ]]; then
4137
+ rm -f "$pause_file"
4138
+ daemon_log INFO "Auto-resuming after backoff (resume_after passed)"
4139
+ else
4140
+ daemon_log INFO "Daemon paused until ${resume_after} — skipping poll"
4141
+ return
4142
+ fi
4143
+ else
4144
+ daemon_log INFO "Daemon paused — skipping poll"
4145
+ return
4146
+ fi
3802
4147
  fi
3803
4148
 
3804
4149
  # Circuit breaker: skip poll if in backoff window
@@ -4036,9 +4381,25 @@ daemon_poll_issues() {
4036
4381
  continue
4037
4382
  fi
4038
4383
 
4039
- # Auto-select pipeline template based on labels + triage score
4384
+ # Auto-select pipeline template: PM recommendation (if available) else labels + triage score
4040
4385
  local template
4041
- template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
4386
+ if [[ "$NO_GITHUB" != "true" ]] && [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
4387
+ local pm_rec
4388
+ pm_rec=$(bash "$SCRIPT_DIR/sw-pm.sh" recommend --json "$issue_num" 2>/dev/null) || true
4389
+ if [[ -n "$pm_rec" ]]; then
4390
+ template=$(echo "$pm_rec" | jq -r '.team_composition.template // empty' 2>/dev/null) || true
4391
+ # Capability self-assessment: low confidence → upgrade to full template
4392
+ local confidence
4393
+ confidence=$(echo "$pm_rec" | jq -r '.team_composition.confidence_percent // 100' 2>/dev/null) || true
4394
+ if [[ -n "$confidence" && "$confidence" != "null" && "$confidence" -lt 60 ]]; then
4395
+ daemon_log INFO "Low PM confidence (${confidence}%) — upgrading to full template"
4396
+ template="full"
4397
+ fi
4398
+ fi
4399
+ fi
4400
+ if [[ -z "$template" ]]; then
4401
+ template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
4402
+ fi
4042
4403
  template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
4043
4404
  [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
4044
4405
  daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"
@@ -4095,13 +4456,15 @@ daemon_health_check() {
4095
4456
  now_e=$(now_epoch)
4096
4457
 
4097
4458
  if [[ -f "$STATE_FILE" ]]; then
4098
- # ── Progress-Based Health Monitoring ──
4099
- # Instead of killing after a static timeout, check for forward progress.
4100
- # Only kill when the agent is truly stuck (no stage change, no new code,
4101
- # same error repeating). A hard wall-clock limit remains as absolute safety net.
4459
+ # ── Intelligent Health Monitoring ──
4460
+ # Instead of killing after a countdown, sense what the agent is doing.
4461
+ # Agents think for long stretches — that's normal and expected.
4462
+ # Strategy: sense → understand → be patient → nudge → only kill as last resort.
4102
4463
 
4103
- local hard_limit="${PROGRESS_HARD_LIMIT_S:-10800}"
4464
+ local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
4104
4465
  local use_progress="${PROGRESS_MONITORING:-true}"
4466
+ local nudge_enabled="${NUDGE_ENABLED:-true}"
4467
+ local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
4105
4468
 
4106
4469
  while IFS= read -r job; do
4107
4470
  local pid started_at issue_num worktree
@@ -4122,8 +4485,8 @@ daemon_health_check() {
4122
4485
  elapsed=$(( now_e - start_e ))
4123
4486
  fi
4124
4487
 
4125
- # Hard wall-clock limit — absolute safety net (default 3h)
4126
- if [[ "$elapsed" -gt "$hard_limit" ]]; then
4488
+ # Hard wall-clock limit — disabled by default (0 = off)
4489
+ if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
4127
4490
  daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
4128
4491
  emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
4129
4492
  kill "$pid" 2>/dev/null || true
@@ -4132,7 +4495,7 @@ daemon_health_check() {
4132
4495
  continue
4133
4496
  fi
4134
4497
 
4135
- # Progress-based detection (when enabled)
4498
+ # ── Intelligent Progress Sensing ──
4136
4499
  if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
4137
4500
  local snapshot verdict
4138
4501
  snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
@@ -4140,29 +4503,87 @@ daemon_health_check() {
4140
4503
  if [[ "$snapshot" != "{}" ]]; then
4141
4504
  verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
4142
4505
 
4506
+ local no_progress_count=0
4507
+ no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4508
+ local cur_stage
4509
+ cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4510
+
4143
4511
  case "$verdict" in
4144
4512
  healthy)
4145
4513
  # All good — agent is making progress
4146
4514
  ;;
4147
4515
  slowing)
4148
- daemon_log INFO "Issue #${issue_num} slowing (no progress for 1-2 checks, ${elapsed}s elapsed)"
4516
+ daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
4149
4517
  ;;
4150
4518
  stalled)
4151
- local no_progress_count
4152
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4153
- daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks (${elapsed}s elapsed, PID $pid)"
4154
- emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4519
+ # Check if agent subprocess is alive and consuming CPU
4520
+ local agent_alive=false
4521
+ local child_cpu=0
4522
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4523
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4524
+ agent_alive=true
4525
+ fi
4526
+
4527
+ if [[ "$agent_alive" == "true" ]]; then
4528
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
4529
+ else
4530
+ daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
4531
+ emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4532
+ fi
4155
4533
  ;;
4156
4534
  stuck)
4157
- local no_progress_count repeated_errors cur_stage
4158
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4535
+ local repeated_errors
4159
4536
  repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4160
- cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4161
- daemon_log WARN "Issue #${issue_num} STUCK: no progress for ${no_progress_count} checks, ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4162
- emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid"
4163
- kill "$pid" 2>/dev/null || true
4164
- daemon_clear_progress "$issue_num"
4165
- findings=$((findings + 1))
4537
+
4538
+ # Even "stuck" check if the process tree is alive first
4539
+ local agent_alive=false
4540
+ local child_cpu=0
4541
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4542
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4543
+ agent_alive=true
4544
+ fi
4545
+
4546
+ if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
4547
+ # Agent is alive — nudge instead of kill
4548
+ if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
4549
+ local nudge_file="${worktree}/.claude/nudge.md"
4550
+ if [[ ! -f "$nudge_file" ]]; then
4551
+ cat > "$nudge_file" <<NUDGE_EOF
4552
+ # Nudge from Daemon Health Monitor
4553
+
4554
+ The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
4555
+ Current stage: ${cur_stage}
4556
+
4557
+ If you're stuck, consider:
4558
+ - Breaking the task into smaller steps
4559
+ - Committing partial progress
4560
+ - Running tests to validate current state
4561
+
4562
+ This is just a gentle check-in — take your time if you're working through a complex problem.
4563
+ NUDGE_EOF
4564
+ daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
4565
+ emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
4566
+ fi
4567
+ else
4568
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
4569
+ fi
4570
+ elif [[ "$repeated_errors" -ge 5 ]]; then
4571
+ # Truly stuck in an error loop — kill as last resort
4572
+ daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4573
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
4574
+ kill "$pid" 2>/dev/null || true
4575
+ daemon_clear_progress "$issue_num"
4576
+ findings=$((findings + 1))
4577
+ elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
4578
+ # Process tree is dead AND no progress for very long time
4579
+ daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
4580
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
4581
+ kill "$pid" 2>/dev/null || true
4582
+ daemon_clear_progress "$issue_num"
4583
+ findings=$((findings + 1))
4584
+ else
4585
+ daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
4586
+ fi
4166
4587
  ;;
4167
4588
  esac
4168
4589
  fi
@@ -4171,8 +4592,9 @@ daemon_health_check() {
4171
4592
  local stale_timeout
4172
4593
  stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
4173
4594
  if [[ "$elapsed" -gt "$stale_timeout" ]]; then
4174
- daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — killing"
4175
- kill "$pid" 2>/dev/null || true
4595
+ daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
4596
+ # Don't kill — just log. Let the process run.
4597
+ emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
4176
4598
  findings=$((findings + 1))
4177
4599
  fi
4178
4600
  fi
@@ -4765,6 +5187,7 @@ daemon_poll_loop() {
4765
5187
  # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
4766
5188
  # The || operator disables set -e for the entire call chain, so transient failures
4767
5189
  # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
5190
+ daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
4768
5191
  daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
4769
5192
  daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
4770
5193
  daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
@@ -4848,7 +5271,8 @@ cleanup_on_exit() {
4848
5271
  while IFS= read -r cpid; do
4849
5272
  [[ -z "$cpid" ]] && continue
4850
5273
  if kill -0 "$cpid" 2>/dev/null; then
4851
- daemon_log INFO "Killing pipeline process PID ${cpid}"
5274
+ daemon_log INFO "Killing pipeline process tree PID ${cpid}"
5275
+ pkill -TERM -P "$cpid" 2>/dev/null || true
4852
5276
  kill "$cpid" 2>/dev/null || true
4853
5277
  killed=$((killed + 1))
4854
5278
  fi
@@ -4860,7 +5284,8 @@ cleanup_on_exit() {
4860
5284
  while IFS= read -r cpid; do
4861
5285
  [[ -z "$cpid" ]] && continue
4862
5286
  if kill -0 "$cpid" 2>/dev/null; then
4863
- daemon_log WARN "Force-killing pipeline PID ${cpid}"
5287
+ daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
5288
+ pkill -9 -P "$cpid" 2>/dev/null || true
4864
5289
  kill -9 "$cpid" 2>/dev/null || true
4865
5290
  fi
4866
5291
  done <<< "$child_pids"
@@ -4951,6 +5376,11 @@ daemon_start() {
4951
5376
  # Remove stale shutdown flag
4952
5377
  rm -f "$SHUTDOWN_FLAG"
4953
5378
 
5379
+ # Initialize SQLite database (if available)
5380
+ if type init_schema &>/dev/null; then
5381
+ init_schema 2>/dev/null || true
5382
+ fi
5383
+
4954
5384
  # Initialize state
4955
5385
  init_state
4956
5386