shipwright-cli 1.9.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/.claude/hooks/post-tool-use.sh +12 -5
  2. package/README.md +114 -36
  3. package/completions/_shipwright +212 -32
  4. package/completions/shipwright.bash +97 -25
  5. package/docs/strategy/01-market-research.md +619 -0
  6. package/docs/strategy/02-mission-and-brand.md +587 -0
  7. package/docs/strategy/03-gtm-and-roadmap.md +759 -0
  8. package/docs/strategy/QUICK-START.txt +289 -0
  9. package/docs/strategy/README.md +172 -0
  10. package/package.json +4 -2
  11. package/scripts/sw +217 -2
  12. package/scripts/sw-activity.sh +500 -0
  13. package/scripts/sw-adaptive.sh +925 -0
  14. package/scripts/sw-adversarial.sh +1 -1
  15. package/scripts/sw-architecture-enforcer.sh +1 -1
  16. package/scripts/sw-auth.sh +613 -0
  17. package/scripts/sw-autonomous.sh +664 -0
  18. package/scripts/sw-changelog.sh +704 -0
  19. package/scripts/sw-checkpoint.sh +79 -1
  20. package/scripts/sw-ci.sh +602 -0
  21. package/scripts/sw-cleanup.sh +192 -7
  22. package/scripts/sw-code-review.sh +637 -0
  23. package/scripts/sw-connect.sh +1 -1
  24. package/scripts/sw-context.sh +605 -0
  25. package/scripts/sw-cost.sh +1 -1
  26. package/scripts/sw-daemon.sh +812 -138
  27. package/scripts/sw-dashboard.sh +1 -1
  28. package/scripts/sw-db.sh +540 -0
  29. package/scripts/sw-decompose.sh +539 -0
  30. package/scripts/sw-deps.sh +551 -0
  31. package/scripts/sw-developer-simulation.sh +1 -1
  32. package/scripts/sw-discovery.sh +412 -0
  33. package/scripts/sw-docs-agent.sh +539 -0
  34. package/scripts/sw-docs.sh +1 -1
  35. package/scripts/sw-doctor.sh +59 -1
  36. package/scripts/sw-dora.sh +615 -0
  37. package/scripts/sw-durable.sh +710 -0
  38. package/scripts/sw-e2e-orchestrator.sh +535 -0
  39. package/scripts/sw-eventbus.sh +393 -0
  40. package/scripts/sw-feedback.sh +471 -0
  41. package/scripts/sw-fix.sh +1 -1
  42. package/scripts/sw-fleet-discover.sh +567 -0
  43. package/scripts/sw-fleet-viz.sh +404 -0
  44. package/scripts/sw-fleet.sh +8 -1
  45. package/scripts/sw-github-app.sh +596 -0
  46. package/scripts/sw-github-checks.sh +1 -1
  47. package/scripts/sw-github-deploy.sh +1 -1
  48. package/scripts/sw-github-graphql.sh +1 -1
  49. package/scripts/sw-guild.sh +569 -0
  50. package/scripts/sw-heartbeat.sh +1 -1
  51. package/scripts/sw-hygiene.sh +559 -0
  52. package/scripts/sw-incident.sh +617 -0
  53. package/scripts/sw-init.sh +88 -1
  54. package/scripts/sw-instrument.sh +699 -0
  55. package/scripts/sw-intelligence.sh +1 -1
  56. package/scripts/sw-jira.sh +1 -1
  57. package/scripts/sw-launchd.sh +366 -31
  58. package/scripts/sw-linear.sh +1 -1
  59. package/scripts/sw-logs.sh +1 -1
  60. package/scripts/sw-loop.sh +507 -51
  61. package/scripts/sw-memory.sh +198 -3
  62. package/scripts/sw-mission-control.sh +487 -0
  63. package/scripts/sw-model-router.sh +545 -0
  64. package/scripts/sw-otel.sh +596 -0
  65. package/scripts/sw-oversight.sh +689 -0
  66. package/scripts/sw-pipeline-composer.sh +8 -8
  67. package/scripts/sw-pipeline-vitals.sh +1096 -0
  68. package/scripts/sw-pipeline.sh +2451 -180
  69. package/scripts/sw-pm.sh +693 -0
  70. package/scripts/sw-pr-lifecycle.sh +522 -0
  71. package/scripts/sw-predictive.sh +1 -1
  72. package/scripts/sw-prep.sh +1 -1
  73. package/scripts/sw-ps.sh +4 -3
  74. package/scripts/sw-public-dashboard.sh +798 -0
  75. package/scripts/sw-quality.sh +595 -0
  76. package/scripts/sw-reaper.sh +5 -3
  77. package/scripts/sw-recruit.sh +573 -0
  78. package/scripts/sw-regression.sh +642 -0
  79. package/scripts/sw-release-manager.sh +736 -0
  80. package/scripts/sw-release.sh +706 -0
  81. package/scripts/sw-remote.sh +1 -1
  82. package/scripts/sw-replay.sh +520 -0
  83. package/scripts/sw-retro.sh +691 -0
  84. package/scripts/sw-scale.sh +444 -0
  85. package/scripts/sw-security-audit.sh +505 -0
  86. package/scripts/sw-self-optimize.sh +109 -8
  87. package/scripts/sw-session.sh +31 -9
  88. package/scripts/sw-setup.sh +1 -1
  89. package/scripts/sw-standup.sh +712 -0
  90. package/scripts/sw-status.sh +192 -1
  91. package/scripts/sw-strategic.sh +658 -0
  92. package/scripts/sw-stream.sh +450 -0
  93. package/scripts/sw-swarm.sh +583 -0
  94. package/scripts/sw-team-stages.sh +511 -0
  95. package/scripts/sw-templates.sh +1 -1
  96. package/scripts/sw-testgen.sh +515 -0
  97. package/scripts/sw-tmux-pipeline.sh +554 -0
  98. package/scripts/sw-tmux.sh +1 -1
  99. package/scripts/sw-trace.sh +485 -0
  100. package/scripts/sw-tracker-github.sh +188 -0
  101. package/scripts/sw-tracker-jira.sh +172 -0
  102. package/scripts/sw-tracker-linear.sh +251 -0
  103. package/scripts/sw-tracker.sh +117 -2
  104. package/scripts/sw-triage.sh +603 -0
  105. package/scripts/sw-upgrade.sh +1 -1
  106. package/scripts/sw-ux.sh +677 -0
  107. package/scripts/sw-webhook.sh +627 -0
  108. package/scripts/sw-widgets.sh +530 -0
  109. package/scripts/sw-worktree.sh +1 -1
  110. package/templates/pipelines/autonomous.json +8 -1
  111. package/templates/pipelines/cost-aware.json +21 -0
  112. package/templates/pipelines/deployed.json +40 -6
  113. package/templates/pipelines/enterprise.json +16 -2
  114. package/templates/pipelines/fast.json +19 -0
  115. package/templates/pipelines/full.json +16 -2
  116. package/templates/pipelines/hotfix.json +19 -0
  117. package/templates/pipelines/standard.json +19 -0
@@ -6,7 +6,10 @@
6
6
  set -euo pipefail
7
7
  trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
8
8
 
9
- VERSION="1.9.0"
9
+ # Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
10
+ unset CLAUDECODE 2>/dev/null || true
11
+
12
+ VERSION="2.0.0"
10
13
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
14
  REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
12
15
 
@@ -34,6 +37,8 @@ RESET='\033[0m'
34
37
  [[ -f "$SCRIPT_DIR/sw-self-optimize.sh" ]] && source "$SCRIPT_DIR/sw-self-optimize.sh"
35
38
  # shellcheck source=sw-predictive.sh
36
39
  [[ -f "$SCRIPT_DIR/sw-predictive.sh" ]] && source "$SCRIPT_DIR/sw-predictive.sh"
40
+ # shellcheck source=sw-pipeline-vitals.sh
41
+ [[ -f "$SCRIPT_DIR/sw-pipeline-vitals.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-vitals.sh"
37
42
 
38
43
  # ─── GitHub API Modules (optional) ────────────────────────────────────────
39
44
  # shellcheck source=sw-github-graphql.sh
@@ -125,7 +130,6 @@ rotate_event_log() {
125
130
  }
126
131
 
127
132
  # ─── GitHub Context (loaded once at startup) ──────────────────────────────
128
- DAEMON_GITHUB_CONTEXT=""
129
133
 
130
134
  daemon_github_context() {
131
135
  # Skip if no GitHub
@@ -141,8 +145,6 @@ daemon_github_context() {
141
145
  context=$(gh_repo_context "$owner" "$repo" 2>/dev/null || echo "{}")
142
146
  if [[ -n "$context" && "$context" != "{}" ]]; then
143
147
  daemon_log INFO "GitHub context loaded: $(echo "$context" | jq -r '.contributor_count // 0') contributors, $(echo "$context" | jq -r '.security_alert_count // 0') security alerts"
144
- DAEMON_GITHUB_CONTEXT="$context"
145
- export DAEMON_GITHUB_CONTEXT
146
148
  fi
147
149
  }
148
150
 
@@ -166,9 +168,9 @@ gh_retry() {
166
168
 
167
169
  # Check for rate-limit or server error indicators
168
170
  if echo "$output" | grep -qiE "rate limit|403|429|502|503"; then
169
- daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s"
171
+ daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s" >&2
170
172
  else
171
- daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s"
173
+ daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s" >&2
172
174
  fi
173
175
 
174
176
  if [[ $attempt -lt $max_retries ]]; then
@@ -421,6 +423,14 @@ load_config() {
421
423
  MAX_RETRIES=$(jq -r '.max_retries // 2' "$config_file")
422
424
  RETRY_ESCALATION=$(jq -r '.retry_escalation // true' "$config_file")
423
425
 
426
+ # session restart + fast test passthrough
427
+ MAX_RESTARTS_CFG=$(jq -r '.max_restarts // 3' "$config_file" 2>/dev/null || echo "3")
428
+ if ! [[ "$MAX_RESTARTS_CFG" =~ ^[0-9]+$ ]]; then
429
+ daemon_log WARN "Invalid max_restarts in config: $MAX_RESTARTS_CFG (using default: 3)"
430
+ MAX_RESTARTS_CFG="3"
431
+ fi
432
+ FAST_TEST_CMD_CFG=$(jq -r '.fast_test_cmd // ""' "$config_file" 2>/dev/null || echo "")
433
+
424
434
  # self-optimization
425
435
  SELF_OPTIMIZE=$(jq -r '.self_optimize // false' "$config_file")
426
436
  OPTIMIZE_INTERVAL=$(jq -r '.optimize_interval // 10' "$config_file")
@@ -471,9 +481,11 @@ load_config() {
471
481
 
472
482
  # progress-based health monitoring (replaces static timeouts)
473
483
  PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
474
- PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 3' "$config_file")
475
- PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 6' "$config_file")
476
- PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 10800' "$config_file") # 3hr absolute max
484
+ PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
485
+ PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
486
+ PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
487
+ NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
488
+ NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
477
489
 
478
490
  # team dashboard URL (for coordinated claiming)
479
491
  local cfg_dashboard_url
@@ -482,6 +494,12 @@ load_config() {
482
494
  DASHBOARD_URL="$cfg_dashboard_url"
483
495
  fi
484
496
 
497
+ # Auto-enable self_optimize when auto_template is on
498
+ if [[ "${AUTO_TEMPLATE:-false}" == "true" && "${SELF_OPTIMIZE:-false}" == "false" ]]; then
499
+ SELF_OPTIMIZE="true"
500
+ daemon_log INFO "Auto-enabling self_optimize (auto_template is true)"
501
+ fi
502
+
485
503
  success "Config loaded"
486
504
  }
487
505
 
@@ -823,6 +841,31 @@ daemon_assess_progress() {
823
841
  has_progress=true
824
842
  fi
825
843
 
844
+ # Claude subprocess is alive and consuming CPU — agent is thinking/working
845
+ # During build stage, Claude can spend 10+ minutes thinking before any
846
+ # visible git changes appear. Detect this as progress.
847
+ if [[ "$has_progress" != "true" ]]; then
848
+ local _pid_for_check
849
+ _pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
850
+ if [[ -z "$_pid_for_check" ]]; then
851
+ # Fallback: get PID from active_jobs
852
+ _pid_for_check=$(jq -r --argjson num "$issue_num" \
853
+ '.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
854
+ fi
855
+ if [[ -n "$_pid_for_check" ]]; then
856
+ # Check if any child process (claude) is alive and using CPU
857
+ local child_cpu=0
858
+ child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
859
+ if [[ "$child_cpu" -eq 0 ]]; then
860
+ # Check children of the pipeline process
861
+ child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
862
+ fi
863
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
864
+ has_progress=true
865
+ fi
866
+ fi
867
+ fi
868
+
826
869
  # Detect repeated errors (same error signature hitting again)
827
870
  local repeated_errors="$prev_repeated_errors"
828
871
  if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
@@ -855,7 +898,56 @@ daemon_assess_progress() {
855
898
  if $npc == 0 then .last_progress_at = $ts else . end
856
899
  ' "$progress_file" > "$tmp_progress" 2>/dev/null && mv "$tmp_progress" "$progress_file"
857
900
 
858
- # Determine verdict
901
+ # ── Vitals-based verdict (preferred over static thresholds) ──
902
+ if type pipeline_compute_vitals &>/dev/null 2>&1 && type pipeline_health_verdict &>/dev/null 2>&1; then
903
+ # Compute vitals using the worktree's pipeline state if available
904
+ local _worktree_state=""
905
+ local _worktree_artifacts=""
906
+ local _worktree_dir
907
+ _worktree_dir=$(jq -r --arg i "$issue_num" '.active_jobs[] | select(.issue == ($i | tonumber)) | .worktree // ""' "$STATE_FILE" 2>/dev/null || echo "")
908
+ if [[ -n "$_worktree_dir" && -d "$_worktree_dir/.claude" ]]; then
909
+ _worktree_state="$_worktree_dir/.claude/pipeline-state.md"
910
+ _worktree_artifacts="$_worktree_dir/.claude/pipeline-artifacts"
911
+ fi
912
+
913
+ local _vitals_json
914
+ _vitals_json=$(pipeline_compute_vitals "$_worktree_state" "$_worktree_artifacts" "$issue_num" 2>/dev/null) || true
915
+ if [[ -n "$_vitals_json" && "$_vitals_json" != "{}" ]]; then
916
+ local _health_verdict _health_score
917
+ _health_verdict=$(echo "$_vitals_json" | jq -r '.verdict // "continue"' 2>/dev/null || echo "continue")
918
+ _health_score=$(echo "$_vitals_json" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
919
+
920
+ emit_event "pipeline.vitals_check" \
921
+ "issue=$issue_num" \
922
+ "health_score=$_health_score" \
923
+ "verdict=$_health_verdict" \
924
+ "no_progress=$no_progress_count" \
925
+ "repeated_errors=$repeated_errors"
926
+
927
+ # Map vitals verdict to daemon verdict
928
+ case "$_health_verdict" in
929
+ continue)
930
+ echo "healthy"
931
+ return
932
+ ;;
933
+ warn)
934
+ # Sluggish but not dead — equivalent to slowing
935
+ echo "slowing"
936
+ return
937
+ ;;
938
+ intervene)
939
+ echo "stalled"
940
+ return
941
+ ;;
942
+ abort)
943
+ echo "stuck"
944
+ return
945
+ ;;
946
+ esac
947
+ fi
948
+ fi
949
+
950
+ # ── Fallback: static threshold verdict ──
859
951
  local warn_threshold="${PROGRESS_CHECKS_BEFORE_WARN:-3}"
860
952
  local kill_threshold="${PROGRESS_CHECKS_BEFORE_KILL:-6}"
861
953
 
@@ -1039,6 +1131,7 @@ extract_issue_dependencies() {
1039
1131
  }
1040
1132
 
1041
1133
  # ─── Logging ─────────────────────────────────────────────────────────────────
1134
+ DAEMON_LOG_WRITE_COUNT=0
1042
1135
 
1043
1136
  daemon_log() {
1044
1137
  local level="$1"
@@ -1048,8 +1141,9 @@ daemon_log() {
1048
1141
  ts=$(now_iso)
1049
1142
  echo "[$ts] [$level] $msg" >> "$LOG_FILE"
1050
1143
 
1051
- # Rotate daemon.log if over 20MB (checked every ~100 writes)
1052
- if [[ $(( RANDOM % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
1144
+ # Rotate daemon.log if over 20MB (checked every 100 writes)
1145
+ DAEMON_LOG_WRITE_COUNT=$(( DAEMON_LOG_WRITE_COUNT + 1 ))
1146
+ if [[ $(( DAEMON_LOG_WRITE_COUNT % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
1053
1147
  local log_size
1054
1148
  log_size=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
1055
1149
  if [[ "$log_size" -gt 20971520 ]]; then
@@ -1060,11 +1154,14 @@ daemon_log() {
1060
1154
  fi
1061
1155
  fi
1062
1156
 
1063
- # Also print to stdout
1157
+ # Print to stderr (NOT stdout) to avoid corrupting command substitution captures.
1158
+ # This is critical: functions like select_pipeline_template(), triage_score_issue(),
1159
+ # gh_retry(), and locked_get_active_count() return values via echo/stdout and are
1160
+ # called via $(). If daemon_log writes to stdout, the log text corrupts return values.
1064
1161
  case "$level" in
1065
- INFO) info "$msg" ;;
1066
- SUCCESS) success "$msg" ;;
1067
- WARN) warn "$msg" ;;
1162
+ INFO) info "$msg" >&2 ;;
1163
+ SUCCESS) success "$msg" >&2 ;;
1164
+ WARN) warn "$msg" >&2 ;;
1068
1165
  ERROR) error "$msg" ;;
1069
1166
  esac
1070
1167
  }
@@ -1130,7 +1227,10 @@ gh_record_failure() {
1130
1227
  GH_CONSECUTIVE_FAILURES=$((GH_CONSECUTIVE_FAILURES + 1))
1131
1228
  if [[ "$GH_CONSECUTIVE_FAILURES" -ge 3 ]]; then
1132
1229
  # Exponential backoff: 30s, 60s, 120s, 240s (capped at 5min)
1133
- local backoff_secs=$((30 * (1 << (GH_CONSECUTIVE_FAILURES - 3))))
1230
+ # Cap shift to avoid integer overflow for large failure counts
1231
+ local shift_amt=$(( GH_CONSECUTIVE_FAILURES - 3 ))
1232
+ [[ "$shift_amt" -gt 4 ]] && shift_amt=4
1233
+ local backoff_secs=$((30 * (1 << shift_amt)))
1134
1234
  [[ "$backoff_secs" -gt 300 ]] && backoff_secs=300
1135
1235
  GH_BACKOFF_UNTIL=$(( $(now_epoch) + backoff_secs ))
1136
1236
  daemon_log WARN "GitHub rate-limit circuit breaker: backing off ${backoff_secs}s after ${GH_CONSECUTIVE_FAILURES} failures"
@@ -1138,6 +1238,74 @@ gh_record_failure() {
1138
1238
  fi
1139
1239
  }
1140
1240
 
1241
+ # ─── Runtime Auth Check ──────────────────────────────────────────────────────
1242
+
1243
+ LAST_AUTH_CHECK_EPOCH=0
1244
+ AUTH_CHECK_INTERVAL=300 # 5 minutes
1245
+
1246
+ daemon_preflight_auth_check() {
1247
+ local now_e
1248
+ now_e=$(now_epoch)
1249
+ if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
1250
+ return 0
1251
+ fi
1252
+ LAST_AUTH_CHECK_EPOCH="$now_e"
1253
+
1254
+ # gh auth check
1255
+ if [[ "${NO_GITHUB:-false}" != "true" ]]; then
1256
+ if ! gh auth status &>/dev/null 2>&1; then
1257
+ daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
1258
+ local pause_json
1259
+ pause_json=$(jq -n --arg reason "gh_auth_failure" --arg ts "$(now_iso)" \
1260
+ '{reason: $reason, timestamp: $ts}')
1261
+ local _tmp_pause
1262
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1263
+ echo "$pause_json" > "$_tmp_pause"
1264
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1265
+ emit_event "daemon.auto_pause" "reason=gh_auth_failure"
1266
+ return 1
1267
+ fi
1268
+ fi
1269
+
1270
+ # claude auth check with 15s timeout (macOS has no timeout command)
1271
+ local claude_auth_ok=false
1272
+ local _auth_tmp
1273
+ _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
1274
+ ( claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null ) &
1275
+ local _auth_pid=$!
1276
+ local _auth_waited=0
1277
+ while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
1278
+ sleep 1
1279
+ _auth_waited=$((_auth_waited + 1))
1280
+ done
1281
+ if kill -0 "$_auth_pid" 2>/dev/null; then
1282
+ kill "$_auth_pid" 2>/dev/null || true
1283
+ wait "$_auth_pid" 2>/dev/null || true
1284
+ else
1285
+ wait "$_auth_pid" 2>/dev/null || true
1286
+ fi
1287
+
1288
+ if [[ -s "$_auth_tmp" ]]; then
1289
+ claude_auth_ok=true
1290
+ fi
1291
+ rm -f "$_auth_tmp"
1292
+
1293
+ if [[ "$claude_auth_ok" != "true" ]]; then
1294
+ daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
1295
+ local pause_json
1296
+ pause_json=$(jq -n --arg reason "claude_auth_failure" --arg ts "$(now_iso)" \
1297
+ '{reason: $reason, timestamp: $ts}')
1298
+ local _tmp_pause
1299
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1300
+ echo "$pause_json" > "$_tmp_pause"
1301
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1302
+ emit_event "daemon.auto_pause" "reason=claude_auth_failure"
1303
+ return 1
1304
+ fi
1305
+
1306
+ return 0
1307
+ }
1308
+
1141
1309
  # ─── Pre-flight Checks ──────────────────────────────────────────────────────
1142
1310
 
1143
1311
  preflight_checks() {
@@ -1380,7 +1548,7 @@ locked_get_active_count() {
1380
1548
  (
1381
1549
  if command -v flock &>/dev/null; then
1382
1550
  flock -w 5 200 2>/dev/null || {
1383
- daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default"
1551
+ daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default" >&2
1384
1552
  echo "$MAX_PARALLEL"
1385
1553
  exit 0
1386
1554
  }
@@ -1539,9 +1707,24 @@ daemon_spawn_pipeline() {
1539
1707
  local issue_num="$1"
1540
1708
  local issue_title="${2:-}"
1541
1709
  local repo_full_name="${3:-}" # owner/repo (org mode only)
1710
+ shift 3 2>/dev/null || true
1711
+ local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
1542
1712
 
1543
1713
  daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
1544
1714
 
1715
+ # ── Issue decomposition (if decomposer available) ──
1716
+ local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
1717
+ if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
1718
+ local decompose_result=""
1719
+ decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
1720
+ if [[ "$decompose_result" == *"decomposed"* ]]; then
1721
+ daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
1722
+ # Remove the shipwright label so decomposed parent doesn't re-queue
1723
+ gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
1724
+ return 0
1725
+ fi
1726
+ fi
1727
+
1545
1728
  # Extract goal text from issue (title + first line of body)
1546
1729
  local issue_goal="$issue_title"
1547
1730
  if [[ "$NO_GITHUB" != "true" ]]; then
@@ -1626,6 +1809,17 @@ daemon_spawn_pipeline() {
1626
1809
  daemon_log INFO "Worktree created at ${work_dir}"
1627
1810
  fi
1628
1811
 
1812
+ # If template is "composed", copy the composed spec into the worktree
1813
+ if [[ "$PIPELINE_TEMPLATE" == "composed" ]]; then
1814
+ local _src_composed="${REPO_DIR:-.}/.claude/pipeline-artifacts/composed-pipeline.json"
1815
+ if [[ -f "$_src_composed" ]]; then
1816
+ local _dst_artifacts="${work_dir}/.claude/pipeline-artifacts"
1817
+ mkdir -p "$_dst_artifacts"
1818
+ cp "$_src_composed" "$_dst_artifacts/composed-pipeline.json" 2>/dev/null || true
1819
+ daemon_log INFO "Copied composed pipeline spec to worktree"
1820
+ fi
1821
+ fi
1822
+
1629
1823
  # Build pipeline args
1630
1824
  local pipeline_args=("start" "--issue" "$issue_num" "--pipeline" "$PIPELINE_TEMPLATE")
1631
1825
  if [[ "$SKIP_GATES" == "true" ]]; then
@@ -1637,12 +1831,27 @@ daemon_spawn_pipeline() {
1637
1831
  if [[ "$NO_GITHUB" == "true" ]]; then
1638
1832
  pipeline_args+=("--no-github")
1639
1833
  fi
1834
+ # Pass session restart config
1835
+ if [[ "${MAX_RESTARTS_CFG:-0}" -gt 0 ]]; then
1836
+ pipeline_args+=("--max-restarts" "$MAX_RESTARTS_CFG")
1837
+ fi
1838
+ # Pass fast test command
1839
+ if [[ -n "${FAST_TEST_CMD_CFG:-}" ]]; then
1840
+ pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
1841
+ fi
1842
+
1843
+ # Append any extra pipeline args (from retry escalation, etc.)
1844
+ if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
1845
+ pipeline_args+=("${extra_pipeline_args[@]}")
1846
+ fi
1640
1847
 
1641
1848
  # Run pipeline in work directory (background)
1849
+ # Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
1642
1850
  echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
1643
1851
  (
1852
+ trap '' HUP
1644
1853
  cd "$work_dir"
1645
- "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1854
+ exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1646
1855
  ) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
1647
1856
  local pid=$!
1648
1857
 
@@ -1770,6 +1979,41 @@ daemon_reap_completed() {
1770
1979
  daemon_on_success "$issue_num" "$duration_str"
1771
1980
  else
1772
1981
  daemon_on_failure "$issue_num" "$exit_code" "$duration_str"
1982
+
1983
+ # Cancel any lingering in_progress GitHub Check Runs for failed job
1984
+ if [[ "${NO_GITHUB:-false}" != "true" && -n "$worktree" ]]; then
1985
+ local check_ids_file="${worktree}/.claude/pipeline-artifacts/check-run-ids.json"
1986
+ if [[ -f "$check_ids_file" ]]; then
1987
+ daemon_log INFO "Cancelling in-progress check runs for issue #${issue_num}"
1988
+ local _stage
1989
+ while IFS= read -r _stage; do
1990
+ [[ -z "$_stage" ]] && continue
1991
+ # Direct API call since we're in daemon context
1992
+ local _run_id
1993
+ _run_id=$(jq -r --arg s "$_stage" '.[$s] // empty' "$check_ids_file" 2>/dev/null || true)
1994
+ if [[ -n "$_run_id" && "$_run_id" != "null" ]]; then
1995
+ local _detected
1996
+ _detected=$(git remote get-url origin 2>/dev/null | sed 's|.*github.com[:/]\(.*\)\.git$|\1|' || true)
1997
+ if [[ -n "$_detected" ]]; then
1998
+ local _owner="${_detected%%/*}" _repo="${_detected##*/}"
1999
+ gh api "repos/${_owner}/${_repo}/check-runs/${_run_id}" \
2000
+ --method PATCH \
2001
+ --field status=completed \
2002
+ --field conclusion=cancelled \
2003
+ --silent 2>/dev/null || true
2004
+ fi
2005
+ fi
2006
+ done < <(jq -r 'keys[]' "$check_ids_file" 2>/dev/null || true)
2007
+ fi
2008
+ fi
2009
+ fi
2010
+
2011
+ # Finalize memory (capture failure patterns for future runs)
2012
+ if type memory_finalize_pipeline &>/dev/null 2>&1; then
2013
+ local _job_state _job_artifacts
2014
+ _job_state="${worktree:-.}/.claude/pipeline-state.md"
2015
+ _job_artifacts="${worktree:-.}/.claude/pipeline-artifacts"
2016
+ memory_finalize_pipeline "$_job_state" "$_job_artifacts" 2>/dev/null || true
1773
2017
  fi
1774
2018
 
1775
2019
  # Clean up progress tracking for this job
@@ -1780,15 +2024,18 @@ daemon_reap_completed() {
1780
2024
  reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
1781
2025
  release_claim "$issue_num" "$reap_machine_name"
1782
2026
 
1783
- # Skip cleanup if a retry was just spawned for this issue
2027
+ # Always remove the OLD job entry from active_jobs to prevent
2028
+ # re-reaping of the dead PID on the next cycle. When a retry was
2029
+ # spawned, daemon_spawn_pipeline already added a fresh entry with
2030
+ # the new PID — we must not leave the stale one behind.
2031
+ locked_state_update --argjson num "$issue_num" \
2032
+ --argjson old_pid "${pid:-0}" \
2033
+ '.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
2034
+ untrack_priority_job "$issue_num"
2035
+
1784
2036
  if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
1785
2037
  daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
1786
2038
  else
1787
- # Remove from active_jobs and priority lane tracking (locked)
1788
- locked_state_update --argjson num "$issue_num" \
1789
- '.active_jobs = [.active_jobs[] | select(.issue != $num)]'
1790
- untrack_priority_job "$issue_num"
1791
-
1792
2039
  # Clean up worktree (skip for org-mode clones — they persist)
1793
2040
  local job_repo
1794
2041
  job_repo=$(echo "$job" | jq -r '.repo // ""')
@@ -1827,6 +2074,9 @@ daemon_reap_completed() {
1827
2074
  daemon_on_success() {
1828
2075
  local issue_num="$1" duration="${2:-}"
1829
2076
 
2077
+ # Reset consecutive failure tracking on any success
2078
+ reset_failure_tracking
2079
+
1830
2080
  daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
1831
2081
 
1832
2082
  # Record pipeline duration for adaptive threshold learning
@@ -1887,6 +2137,91 @@ Check the associated PR for the implementation." 2>/dev/null || true
1887
2137
  "$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
1888
2138
  }
1889
2139
 
2140
+ # ─── Failure Classification ─────────────────────────────────────────────────
2141
+
2142
+ classify_failure() {
2143
+ local issue_num="$1"
2144
+ if [[ -z "${LOG_DIR:-}" ]]; then
2145
+ echo "unknown"
2146
+ return
2147
+ fi
2148
+ local log_path="$LOG_DIR/issue-${issue_num}.log"
2149
+ if [[ ! -f "$log_path" ]]; then
2150
+ echo "unknown"
2151
+ return
2152
+ fi
2153
+ local tail_content
2154
+ tail_content=$(tail -200 "$log_path" 2>/dev/null || true)
2155
+
2156
+ # Auth errors
2157
+ if echo "$tail_content" | grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required'; then
2158
+ echo "auth_error"
2159
+ return
2160
+ fi
2161
+ # API errors (rate limits, timeouts, server errors)
2162
+ if echo "$tail_content" | grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable'; then
2163
+ echo "api_error"
2164
+ return
2165
+ fi
2166
+ # Invalid issue (not found, empty body)
2167
+ if echo "$tail_content" | grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist'; then
2168
+ echo "invalid_issue"
2169
+ return
2170
+ fi
2171
+ # Context exhaustion — check progress file
2172
+ local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2173
+ local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2174
+ if [[ -f "$progress_file" ]]; then
2175
+ local cf_iter
2176
+ cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2177
+ if ! [[ "${cf_iter:-0}" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
2178
+ local cf_tests
2179
+ cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2180
+ if [[ "${cf_iter:-0}" -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
2181
+ echo "context_exhaustion"
2182
+ return
2183
+ fi
2184
+ fi
2185
+ # Build failure (test errors, compile errors)
2186
+ if echo "$tail_content" | grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]'; then
2187
+ echo "build_failure"
2188
+ return
2189
+ fi
2190
+ echo "unknown"
2191
+ }
2192
+
2193
+ # ─── Consecutive Failure Tracking ──────────────────────────────────────────
2194
+
2195
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2196
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2197
+
2198
+ record_failure_class() {
2199
+ local failure_class="$1"
2200
+ if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
2201
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
2202
+ else
2203
+ DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
2204
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=1
2205
+ fi
2206
+
2207
+ if [[ "$DAEMON_CONSECUTIVE_FAILURE_COUNT" -ge 3 ]]; then
2208
+ daemon_log ERROR "3 consecutive failures (class: ${failure_class}) — auto-pausing daemon"
2209
+ local pause_json
2210
+ pause_json=$(jq -n --arg reason "consecutive_${failure_class}" --arg ts "$(now_iso)" \
2211
+ '{reason: $reason, timestamp: $ts}')
2212
+ local _tmp_pause
2213
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
2214
+ echo "$pause_json" > "$_tmp_pause"
2215
+ mv "$_tmp_pause" "$PAUSE_FLAG"
2216
+ emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$DAEMON_CONSECUTIVE_FAILURE_COUNT"
2217
+ fi
2218
+ }
2219
+
2220
+ reset_failure_tracking() {
2221
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2222
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2223
+ }
2224
+
1890
2225
  # ─── Failure Handler ────────────────────────────────────────────────────────
1891
2226
 
1892
2227
  daemon_on_failure() {
@@ -1923,100 +2258,160 @@ daemon_on_failure() {
1923
2258
  completed_at: $completed_at
1924
2259
  }] | .completed = .completed[-500:]'
1925
2260
 
2261
+ # ── Classify failure and decide retry strategy ──
2262
+ local failure_class
2263
+ failure_class=$(classify_failure "$issue_num")
2264
+ daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
2265
+ emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
2266
+ record_failure_class "$failure_class"
2267
+
1926
2268
  # ── Auto-retry with strategy escalation ──
1927
2269
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
1928
2270
  local retry_count
1929
2271
  retry_count=$(jq -r --arg num "$issue_num" \
1930
2272
  '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
1931
2273
 
1932
- if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
1933
- retry_count=$((retry_count + 1))
1934
-
1935
- # Update retry count in state (locked to prevent race)
1936
- locked_state_update \
1937
- --arg num "$issue_num" --argjson count "$retry_count" \
1938
- '.retry_counts[$num] = $count'
1939
-
1940
- daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
1941
- emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
1942
-
1943
- # Check for checkpoint to enable resume-from-checkpoint
1944
- local checkpoint_args=()
1945
- if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
1946
- # Try to find worktree for this issue to check for checkpoints
1947
- local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
1948
- if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
1949
- local latest_checkpoint=""
1950
- for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
1951
- [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
1952
- done
1953
- if [[ -n "$latest_checkpoint" ]]; then
1954
- daemon_log INFO "Found checkpoint: $latest_checkpoint"
1955
- emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
1956
- checkpoint_args+=("--resume")
1957
- fi
2274
+ # Non-retryable failures skip retry entirely
2275
+ case "$failure_class" in
2276
+ auth_error)
2277
+ daemon_log ERROR "Auth error for issue #${issue_num} skipping retry"
2278
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
2279
+ if [[ "$NO_GITHUB" != "true" ]]; then
2280
+ gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
1958
2281
  fi
1959
- fi
2282
+ ;;
2283
+ invalid_issue)
2284
+ daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
2285
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
2286
+ if [[ "$NO_GITHUB" != "true" ]]; then
2287
+ gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
2288
+ fi
2289
+ ;;
2290
+ *)
2291
+ # Retryable failures — proceed with escalation
2292
+ if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
2293
+ retry_count=$((retry_count + 1))
2294
+
2295
+ # Update retry count in state (locked to prevent race)
2296
+ locked_state_update \
2297
+ --arg num "$issue_num" --argjson count "$retry_count" \
2298
+ '.retry_counts[$num] = $count'
2299
+
2300
+ daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num} (class: ${failure_class})"
2301
+ emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}" "class=$failure_class"
2302
+
2303
+ # Check for checkpoint to enable resume-from-checkpoint
2304
+ local checkpoint_args=()
2305
+ if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2306
+ local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2307
+ if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2308
+ local latest_checkpoint=""
2309
+ for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2310
+ [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2311
+ done
2312
+ if [[ -n "$latest_checkpoint" ]]; then
2313
+ daemon_log INFO "Found checkpoint: $latest_checkpoint"
2314
+ emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2315
+ checkpoint_args+=("--resume")
2316
+ fi
2317
+ fi
2318
+ fi
1960
2319
 
1961
- # Build escalated pipeline args
1962
- local retry_template="$PIPELINE_TEMPLATE"
1963
- local retry_model="${MODEL:-opus}"
1964
- local extra_args=()
1965
-
1966
- if [[ "$retry_count" -eq 1 ]]; then
1967
- # Retry 1: same template, upgrade model, more iterations
1968
- retry_model="opus"
1969
- extra_args+=("--max-iterations" "30")
1970
- daemon_log INFO "Escalation: model=opus, max_iterations=30"
1971
- elif [[ "$retry_count" -ge 2 ]]; then
1972
- # Retry 2: full template, compound quality max cycles
1973
- retry_template="full"
1974
- retry_model="opus"
1975
- extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
1976
- daemon_log INFO "Escalation: template=full, compound_cycles=5"
1977
- fi
2320
+ # Build escalated pipeline args
2321
+ local retry_template="$PIPELINE_TEMPLATE"
2322
+ local retry_model="${MODEL:-opus}"
2323
+ local extra_args=()
2324
+
2325
+ if [[ "$retry_count" -eq 1 ]]; then
2326
+ retry_model="opus"
2327
+ extra_args+=("--max-iterations" "30")
2328
+ daemon_log INFO "Escalation: model=opus, max_iterations=30"
2329
+ elif [[ "$retry_count" -ge 2 ]]; then
2330
+ retry_template="full"
2331
+ retry_model="opus"
2332
+ extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2333
+ daemon_log INFO "Escalation: template=full, compound_cycles=5"
2334
+ fi
1978
2335
 
1979
- if [[ "$NO_GITHUB" != "true" ]]; then
1980
- gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2336
+ # Increase restarts on context exhaustion
2337
+ if [[ "$failure_class" == "context_exhaustion" ]]; then
2338
+ local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2339
+ if [[ "$boosted_restarts" -gt 5 ]]; then
2340
+ boosted_restarts=5
2341
+ fi
2342
+ extra_args+=("--max-restarts" "$boosted_restarts")
2343
+ daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2344
+ fi
1981
2345
 
1982
- Pipeline failed retrying with escalated strategy.
2346
+ # API errors get extended backoff
2347
+ local api_backoff=300
2348
+ local backoff_secs=$((30 * retry_count))
2349
+ if [[ "$failure_class" == "api_error" ]]; then
2350
+ backoff_secs=$((api_backoff * retry_count))
2351
+ daemon_log INFO "API error — extended backoff ${backoff_secs}s"
2352
+ fi
2353
+
2354
+ if [[ "$NO_GITHUB" != "true" ]]; then
2355
+ gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2356
+
2357
+ Pipeline failed (${failure_class}) — retrying with escalated strategy.
1983
2358
 
1984
2359
  | Field | Value |
1985
2360
  |-------|-------|
1986
2361
  | Retry | ${retry_count} / ${MAX_RETRIES:-2} |
2362
+ | Failure | \`${failure_class}\` |
1987
2363
  | Template | \`${retry_template}\` |
1988
2364
  | Model | \`${retry_model}\` |
1989
2365
  | Started | $(now_iso) |
1990
2366
 
1991
2367
  _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
1992
- fi
2368
+ fi
1993
2369
 
1994
- # Backoff before retry: 30s * retry_count (30s, 60s, ...)
1995
- local backoff_secs=$((30 * retry_count))
1996
- daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
1997
- sleep "$backoff_secs"
1998
-
1999
- # Re-spawn with escalated strategy
2000
- local orig_template="$PIPELINE_TEMPLATE"
2001
- local orig_model="$MODEL"
2002
- PIPELINE_TEMPLATE="$retry_template"
2003
- MODEL="$retry_model"
2004
- daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
2005
- _retry_spawned_for="$issue_num"
2006
- PIPELINE_TEMPLATE="$orig_template"
2007
- MODEL="$orig_model"
2008
- return
2009
- fi
2370
+ daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2371
+ sleep "$backoff_secs"
2372
+
2373
+ # Merge checkpoint args + extra args for passthrough
2374
+ local all_extra_args=()
2375
+ if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
2376
+ all_extra_args+=("${checkpoint_args[@]}")
2377
+ fi
2378
+ if [[ ${#extra_args[@]} -gt 0 ]]; then
2379
+ all_extra_args+=("${extra_args[@]}")
2380
+ fi
2381
+
2382
+ # Re-spawn with escalated strategy
2383
+ local orig_template="$PIPELINE_TEMPLATE"
2384
+ local orig_model="$MODEL"
2385
+ PIPELINE_TEMPLATE="$retry_template"
2386
+ MODEL="$retry_model"
2387
+ daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
2388
+ _retry_spawned_for="$issue_num"
2389
+ PIPELINE_TEMPLATE="$orig_template"
2390
+ MODEL="$orig_model"
2391
+ return
2392
+ fi
2010
2393
 
2011
- daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2012
- emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2394
+ daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2395
+ emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2396
+ ;;
2397
+ esac
2013
2398
  fi
2014
2399
 
2015
2400
  # ── No retry — report final failure ──
2016
2401
  if [[ "$NO_GITHUB" != "true" ]]; then
2017
- # Add failure label
2402
+ # Add failure label and remove watch label (prevent re-processing)
2018
2403
  gh issue edit "$issue_num" \
2019
- --add-label "$ON_FAILURE_ADD_LABEL" 2>/dev/null || true
2404
+ --add-label "$ON_FAILURE_ADD_LABEL" \
2405
+ --remove-label "$WATCH_LABEL" 2>/dev/null || true
2406
+
2407
+ # Close any draft PR created for this issue (cleanup abandoned work)
2408
+ local draft_pr
2409
+ draft_pr=$(gh pr list --head "daemon/issue-${issue_num}" --head "pipeline/pipeline-issue-${issue_num}" \
2410
+ --json number,isDraft --jq '.[] | select(.isDraft == true) | .number' 2>/dev/null | head -1 || true)
2411
+ if [[ -n "$draft_pr" ]]; then
2412
+ gh pr close "$draft_pr" --delete-branch 2>/dev/null || true
2413
+ daemon_log INFO "Closed draft PR #${draft_pr} for failed issue #${issue_num}"
2414
+ fi
2020
2415
 
2021
2416
  # Comment with log tail
2022
2417
  local log_tail=""
@@ -2075,7 +2470,7 @@ triage_score_issue() {
2075
2470
 
2076
2471
  # ── Intelligence-powered triage (if enabled) ──
2077
2472
  if [[ "${INTELLIGENCE_ENABLED:-false}" == "true" ]] && type intelligence_analyze_issue &>/dev/null 2>&1; then
2078
- daemon_log INFO "Intelligence: using AI triage (intelligence enabled)"
2473
+ daemon_log INFO "Intelligence: using AI triage (intelligence enabled)" >&2
2079
2474
  local analysis
2080
2475
  analysis=$(intelligence_analyze_issue "$issue_json" 2>/dev/null || echo "")
2081
2476
  if [[ -n "$analysis" && "$analysis" != "{}" && "$analysis" != "null" ]]; then
@@ -2114,9 +2509,9 @@ triage_score_issue() {
2114
2509
  return
2115
2510
  fi
2116
2511
  # Fall through to heuristic scoring if intelligence call failed
2117
- daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring"
2512
+ daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring" >&2
2118
2513
  else
2119
- daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)"
2514
+ daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)" >&2
2120
2515
  fi
2121
2516
  labels_csv=$(echo "$issue_json" | jq -r '[.labels[].name] | join(",")')
2122
2517
  created_at=$(echo "$issue_json" | jq -r '.createdAt // ""')
@@ -2256,6 +2651,7 @@ triage_score_issue() {
2256
2651
  select_pipeline_template() {
2257
2652
  local labels="$1"
2258
2653
  local score="${2:-50}"
2654
+ local _selected_template=""
2259
2655
 
2260
2656
  # When auto_template is disabled, use default pipeline template
2261
2657
  if [[ "${AUTO_TEMPLATE:-false}" != "true" ]]; then
@@ -2265,7 +2661,7 @@ select_pipeline_template() {
2265
2661
 
2266
2662
  # ── Intelligence-composed pipeline (if enabled) ──
2267
2663
  if [[ "${COMPOSER_ENABLED:-false}" == "true" ]] && type composer_create_pipeline &>/dev/null 2>&1; then
2268
- daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)"
2664
+ daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)" >&2
2269
2665
  local analysis="${INTELLIGENCE_ANALYSIS:-{}}"
2270
2666
  local repo_context=""
2271
2667
  if [[ -f "${REPO_DIR:-}/.claude/pipeline-state.md" ]]; then
@@ -2287,9 +2683,69 @@ select_pipeline_template() {
2287
2683
  return
2288
2684
  fi
2289
2685
  # Fall through to static selection if composition failed
2290
- daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection"
2686
+ daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection" >&2
2291
2687
  else
2292
- daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)"
2688
+ daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)" >&2
2689
+ fi
2690
+
2691
+ # ── DORA-driven template escalation ──
2692
+ if [[ -f "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" ]]; then
2693
+ local _dora_events _dora_total _dora_failures _dora_cfr
2694
+ _dora_events=$(tail -500 "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" \
2695
+ | grep '"type":"pipeline.completed"' 2>/dev/null \
2696
+ | tail -5 || true)
2697
+ _dora_total=$(echo "$_dora_events" | grep -c '.' 2>/dev/null || echo "0")
2698
+ _dora_total="${_dora_total:-0}"
2699
+ if [[ "$_dora_total" -ge 3 ]]; then
2700
+ _dora_failures=$(echo "$_dora_events" | grep -c '"result":"failure"' 2>/dev/null || true)
2701
+ _dora_failures="${_dora_failures:-0}"
2702
+ _dora_cfr=$(( _dora_failures * 100 / _dora_total ))
2703
+ if [[ "$_dora_cfr" -gt 40 ]]; then
2704
+ daemon_log INFO "DORA escalation: CFR ${_dora_cfr}% > 40% — forcing enterprise template" >&2
2705
+ emit_event "daemon.dora_escalation" \
2706
+ "cfr=$_dora_cfr" \
2707
+ "total=$_dora_total" \
2708
+ "failures=$_dora_failures" \
2709
+ "template=enterprise"
2710
+ echo "enterprise"
2711
+ return
2712
+ fi
2713
+ if [[ "$_dora_cfr" -lt 10 && "$score" -ge 60 ]]; then
2714
+ daemon_log INFO "DORA: CFR ${_dora_cfr}% < 10% — fast template eligible" >&2
2715
+ # Fall through to allow other factors to also vote for fast
2716
+ fi
2717
+
2718
+ # ── DORA multi-factor ──
2719
+ # Cycle time: if median > 120min, prefer faster templates
2720
+ local _dora_cycle_time=0
2721
+ _dora_cycle_time=$(echo "$_dora_events" | jq -r 'select(.duration_s) | .duration_s' 2>/dev/null \
2722
+ | sort -n | awk '{ a[NR]=$1 } END { if (NR>0) print int(a[int(NR/2)+1]/60); else print 0 }' 2>/dev/null) || _dora_cycle_time=0
2723
+ _dora_cycle_time="${_dora_cycle_time:-0}"
2724
+ if [[ "${_dora_cycle_time:-0}" -gt 120 ]]; then
2725
+ daemon_log INFO "DORA: cycle time ${_dora_cycle_time}min > 120 — preferring fast template" >&2
2726
+ if [[ "${score:-0}" -ge 60 ]]; then
2727
+ echo "fast"
2728
+ return
2729
+ fi
2730
+ fi
2731
+
2732
+ # Deploy frequency: if < 1/week, use cost-aware
2733
+ local _dora_deploy_freq=0
2734
+ local _dora_first_epoch _dora_last_epoch _dora_span_days
2735
+ _dora_first_epoch=$(echo "$_dora_events" | head -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
2736
+ _dora_last_epoch=$(echo "$_dora_events" | tail -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
2737
+ if [[ "${_dora_first_epoch:-0}" -gt 0 && "${_dora_last_epoch:-0}" -gt 0 ]]; then
2738
+ _dora_span_days=$(( (_dora_last_epoch - _dora_first_epoch) / 86400 ))
2739
+ if [[ "${_dora_span_days:-0}" -gt 0 ]]; then
2740
+ _dora_deploy_freq=$(awk -v t="$_dora_total" -v d="$_dora_span_days" 'BEGIN { printf "%.1f", t * 7 / d }' 2>/dev/null) || _dora_deploy_freq=0
2741
+ fi
2742
+ fi
2743
+ if [[ -n "${_dora_deploy_freq:-}" ]] && awk -v f="${_dora_deploy_freq:-0}" 'BEGIN{exit !(f > 0 && f < 1)}' 2>/dev/null; then
2744
+ daemon_log INFO "DORA: deploy freq ${_dora_deploy_freq}/week — using cost-aware" >&2
2745
+ echo "cost-aware"
2746
+ return
2747
+ fi
2748
+ fi
2293
2749
  fi
2294
2750
 
2295
2751
  # ── Branch protection escalation (highest priority) ──
@@ -2306,7 +2762,7 @@ select_pipeline_template() {
2306
2762
  local required_reviews
2307
2763
  required_reviews=$(echo "$protection" | jq -r '.required_pull_request_reviews.required_approving_review_count // 0' 2>/dev/null || echo "0")
2308
2764
  if [[ "$strict_protection" == "true" ]] || [[ "${required_reviews:-0}" -gt 1 ]]; then
2309
- daemon_log INFO "Branch has strict protection — escalating to enterprise template"
2765
+ daemon_log INFO "Branch has strict protection — escalating to enterprise template" >&2
2310
2766
  echo "enterprise"
2311
2767
  return
2312
2768
  fi
@@ -2340,6 +2796,62 @@ select_pipeline_template() {
2340
2796
  fi
2341
2797
  fi
2342
2798
 
2799
+ # ── Quality memory-driven selection ──
2800
+ local quality_scores_file="${HOME}/.shipwright/optimization/quality-scores.jsonl"
2801
+ if [[ -f "$quality_scores_file" ]]; then
2802
+ local repo_hash
2803
+ repo_hash=$(cd "${REPO_DIR:-.}" && git rev-parse --show-toplevel 2>/dev/null | shasum -a 256 | cut -c1-16 || echo "unknown")
2804
+ # Get last 5 quality scores for this repo
2805
+ local recent_scores avg_quality has_critical
2806
+ recent_scores=$(grep "\"repo\":\"$repo_hash\"" "$quality_scores_file" 2>/dev/null | tail -5 || true)
2807
+ if [[ -n "$recent_scores" ]]; then
2808
+ avg_quality=$(echo "$recent_scores" | jq -r '.quality_score // 70' 2>/dev/null | awk '{ sum += $1; count++ } END { if (count > 0) printf "%.0f", sum/count; else print 70 }')
2809
+ has_critical=$(echo "$recent_scores" | jq -r '.findings.critical // 0' 2>/dev/null | awk '{ sum += $1 } END { print (sum > 0) ? "yes" : "no" }')
2810
+
2811
+ # Critical findings in recent history → force enterprise
2812
+ if [[ "$has_critical" == "yes" ]]; then
2813
+ daemon_log INFO "Quality memory: critical findings in recent runs — using enterprise template" >&2
2814
+ echo "enterprise"
2815
+ return
2816
+ fi
2817
+
2818
+ # Poor quality history → use full template
2819
+ if [[ "${avg_quality:-70}" -lt 60 ]]; then
2820
+ daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — using full template" >&2
2821
+ echo "full"
2822
+ return
2823
+ fi
2824
+
2825
+ # Excellent quality history → allow faster template
2826
+ if [[ "${avg_quality:-70}" -gt 80 ]]; then
2827
+ daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — eligible for fast template" >&2
2828
+ # Only upgrade if score also suggests fast
2829
+ if [[ "$score" -ge 60 ]]; then
2830
+ echo "fast"
2831
+ return
2832
+ fi
2833
+ fi
2834
+ fi
2835
+ fi
2836
+
2837
+ # ── Learned template weights ──
2838
+ local _tw_file="${HOME}/.shipwright/optimization/template-weights.json"
2839
+ if [[ -f "$_tw_file" ]]; then
2840
+ local _best_template _best_rate
2841
+ _best_template=$(jq -r '
2842
+ .weights // {} | to_entries
2843
+ | map(select(.value.sample_size >= 3))
2844
+ | sort_by(-.value.success_rate)
2845
+ | .[0].key // ""
2846
+ ' "$_tw_file" 2>/dev/null) || true
2847
+ if [[ -n "${_best_template:-}" && "${_best_template:-}" != "null" && "${_best_template:-}" != "" ]]; then
2848
+ _best_rate=$(jq -r --arg t "$_best_template" '.weights[$t].success_rate // 0' "$_tw_file" 2>/dev/null) || _best_rate=0
2849
+ daemon_log INFO "Template weights: ${_best_template} (${_best_rate} success rate)" >&2
2850
+ echo "$_best_template"
2851
+ return
2852
+ fi
2853
+ fi
2854
+
2343
2855
  # ── Score-based selection ──
2344
2856
  if [[ "$score" -ge 70 ]]; then
2345
2857
  echo "fast"
@@ -2388,8 +2900,12 @@ daemon_triage_show() {
2388
2900
  num=$(echo "$issue" | jq -r '.number')
2389
2901
  title=$(echo "$issue" | jq -r '.title // "—"')
2390
2902
  labels_csv=$(echo "$issue" | jq -r '[.labels[].name] | join(", ")')
2391
- score=$(triage_score_issue "$issue")
2392
- template=$(select_pipeline_template "$labels_csv" "$score")
2903
+ score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
2904
+ score=$(printf '%s' "$score" | tr -cd '[:digit:]')
2905
+ [[ -z "$score" ]] && score=50
2906
+ template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
2907
+ template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
2908
+ [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
2393
2909
 
2394
2910
  scored_lines+=("${score}|${num}|${title}|${labels_csv}|${template}")
2395
2911
  done < <(echo "$issues_json" | jq -c '.[]')
@@ -3221,11 +3737,12 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
3221
3737
  if [[ ! -f "$scripts_dir/sw-${name}-test.sh" ]]; then
3222
3738
  # Count usage across other scripts
3223
3739
  local usage_count
3224
- usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" || true)
3740
+ usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" 2>/dev/null || echo "0")
3225
3741
  usage_count=${usage_count:-0}
3226
3742
 
3227
3743
  local line_count
3228
- line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ')
3744
+ line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ' || echo "0")
3745
+ line_count=${line_count:-0}
3229
3746
 
3230
3747
  untested_entries="${untested_entries}${usage_count}|${basename}|${line_count}\n"
3231
3748
  findings=$((findings + 1))
@@ -3484,6 +4001,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
3484
4001
  patrol_meta_run
3485
4002
  fi
3486
4003
 
4004
+ # ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
4005
+ if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
4006
+ # shellcheck source=sw-strategic.sh
4007
+ source "$SCRIPT_DIR/sw-strategic.sh"
4008
+ strategic_patrol_run || true
4009
+ fi
4010
+
3487
4011
  # ── Summary ──
3488
4012
  emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
3489
4013
 
@@ -3602,7 +4126,9 @@ daemon_poll_issues() {
3602
4126
  while IFS= read -r issue; do
3603
4127
  local num score
3604
4128
  num=$(echo "$issue" | jq -r '.number')
3605
- score=$(triage_score_issue "$issue")
4129
+ score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
4130
+ score=$(printf '%s' "$score" | tr -cd '[:digit:]')
4131
+ [[ -z "$score" ]] && score=50
3606
4132
  # For org mode, include repo name in the scored entry
3607
4133
  local repo_name=""
3608
4134
  if [[ "$WATCH_MODE" == "org" ]]; then
@@ -3629,10 +4155,10 @@ daemon_poll_issues() {
3629
4155
  local sorted_order
3630
4156
  if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
3631
4157
  # Complex-first: lower score (more complex) first
3632
- sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -n)
4158
+ sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -n -k2,2 -n)
3633
4159
  else
3634
- # Quick-wins-first (default): higher score (simpler) first
3635
- sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -rn)
4160
+ # Quick-wins-first (default): higher score (simpler) first, lowest issue# first on ties
4161
+ sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -rn -k2,2 -n)
3636
4162
  fi
3637
4163
 
3638
4164
  # Dependency-aware reordering: move dependencies before dependents
@@ -3727,7 +4253,9 @@ daemon_poll_issues() {
3727
4253
  emit_event "daemon.priority_lane" "issue=$issue_num" "score=$score"
3728
4254
 
3729
4255
  local template
3730
- template=$(select_pipeline_template "$labels_csv" "$score")
4256
+ template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
4257
+ template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
4258
+ [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
3731
4259
  daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template} [PRIORITY]"
3732
4260
 
3733
4261
  local orig_template="$PIPELINE_TEMPLATE"
@@ -3748,7 +4276,9 @@ daemon_poll_issues() {
3748
4276
 
3749
4277
  # Auto-select pipeline template based on labels + triage score
3750
4278
  local template
3751
- template=$(select_pipeline_template "$labels_csv" "$score")
4279
+ template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
4280
+ template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
4281
+ [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
3752
4282
  daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"
3753
4283
 
3754
4284
  # Spawn pipeline (template selection applied via PIPELINE_TEMPLATE override)
@@ -3756,8 +4286,41 @@ daemon_poll_issues() {
3756
4286
  PIPELINE_TEMPLATE="$template"
3757
4287
  daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
3758
4288
  PIPELINE_TEMPLATE="$orig_template"
4289
+
4290
+ # Stagger delay between spawns to avoid API contention
4291
+ local stagger_delay="${SPAWN_STAGGER_SECONDS:-15}"
4292
+ if [[ "$stagger_delay" -gt 0 ]]; then
4293
+ sleep "$stagger_delay"
4294
+ fi
3759
4295
  done <<< "$sorted_order"
3760
4296
 
4297
+ # ── Drain queue if we have capacity (prevents deadlock when queue is
4298
+ # populated but no active jobs exist to trigger dequeue) ──
4299
+ local drain_active
4300
+ drain_active=$(locked_get_active_count)
4301
+ while [[ "$drain_active" -lt "$MAX_PARALLEL" ]]; do
4302
+ local drain_issue
4303
+ drain_issue=$(dequeue_next)
4304
+ [[ -z "$drain_issue" ]] && break
4305
+ local drain_title
4306
+ drain_title=$(jq -r --arg n "$drain_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)
4307
+
4308
+ local drain_labels drain_score drain_template
4309
+ drain_labels=$(echo "$issues_json" | jq -r --argjson n "$drain_issue" \
4310
+ '.[] | select(.number == $n) | [.labels[].name] | join(",")' 2>/dev/null || echo "")
4311
+ drain_score=$(echo "$sorted_order" | grep "|${drain_issue}|" | cut -d'|' -f1 || echo "50")
4312
+ drain_template=$(select_pipeline_template "$drain_labels" "${drain_score:-50}" 2>/dev/null | tail -1)
4313
+ drain_template=$(printf '%s' "$drain_template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
4314
+ [[ -z "$drain_template" ]] && drain_template="$PIPELINE_TEMPLATE"
4315
+
4316
+ daemon_log INFO "Draining queue: issue #${drain_issue}, template=${drain_template}"
4317
+ local orig_template="$PIPELINE_TEMPLATE"
4318
+ PIPELINE_TEMPLATE="$drain_template"
4319
+ daemon_spawn_pipeline "$drain_issue" "$drain_title"
4320
+ PIPELINE_TEMPLATE="$orig_template"
4321
+ drain_active=$(locked_get_active_count)
4322
+ done
4323
+
3761
4324
  # Update last poll
3762
4325
  update_state_field "last_poll" "$(now_iso)"
3763
4326
  }
@@ -3770,13 +4333,15 @@ daemon_health_check() {
3770
4333
  now_e=$(now_epoch)
3771
4334
 
3772
4335
  if [[ -f "$STATE_FILE" ]]; then
3773
- # ── Progress-Based Health Monitoring ──
3774
- # Instead of killing after a static timeout, check for forward progress.
3775
- # Only kill when the agent is truly stuck (no stage change, no new code,
3776
- # same error repeating). A hard wall-clock limit remains as absolute safety net.
4336
+ # ── Intelligent Health Monitoring ──
4337
+ # Instead of killing after a countdown, sense what the agent is doing.
4338
+ # Agents think for long stretches that's normal and expected.
4339
+ # Strategy: sense understand be patient nudge → only kill as last resort.
3777
4340
 
3778
- local hard_limit="${PROGRESS_HARD_LIMIT_S:-10800}"
4341
+ local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
3779
4342
  local use_progress="${PROGRESS_MONITORING:-true}"
4343
+ local nudge_enabled="${NUDGE_ENABLED:-true}"
4344
+ local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
3780
4345
 
3781
4346
  while IFS= read -r job; do
3782
4347
  local pid started_at issue_num worktree
@@ -3797,8 +4362,8 @@ daemon_health_check() {
3797
4362
  elapsed=$(( now_e - start_e ))
3798
4363
  fi
3799
4364
 
3800
- # Hard wall-clock limit — absolute safety net (default 3h)
3801
- if [[ "$elapsed" -gt "$hard_limit" ]]; then
4365
+ # Hard wall-clock limit — disabled by default (0 = off)
4366
+ if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
3802
4367
  daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
3803
4368
  emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
3804
4369
  kill "$pid" 2>/dev/null || true
@@ -3807,7 +4372,7 @@ daemon_health_check() {
3807
4372
  continue
3808
4373
  fi
3809
4374
 
3810
- # Progress-based detection (when enabled)
4375
+ # ── Intelligent Progress Sensing ──
3811
4376
  if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
3812
4377
  local snapshot verdict
3813
4378
  snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
@@ -3815,29 +4380,87 @@ daemon_health_check() {
3815
4380
  if [[ "$snapshot" != "{}" ]]; then
3816
4381
  verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
3817
4382
 
4383
+ local no_progress_count=0
4384
+ no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4385
+ local cur_stage
4386
+ cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4387
+
3818
4388
  case "$verdict" in
3819
4389
  healthy)
3820
4390
  # All good — agent is making progress
3821
4391
  ;;
3822
4392
  slowing)
3823
- daemon_log INFO "Issue #${issue_num} slowing (no progress for 1-2 checks, ${elapsed}s elapsed)"
4393
+ daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
3824
4394
  ;;
3825
4395
  stalled)
3826
- local no_progress_count
3827
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
3828
- daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks (${elapsed}s elapsed, PID $pid)"
3829
- emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4396
+ # Check if agent subprocess is alive and consuming CPU
4397
+ local agent_alive=false
4398
+ local child_cpu=0
4399
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4400
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4401
+ agent_alive=true
4402
+ fi
4403
+
4404
+ if [[ "$agent_alive" == "true" ]]; then
4405
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
4406
+ else
4407
+ daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
4408
+ emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4409
+ fi
3830
4410
  ;;
3831
4411
  stuck)
3832
- local no_progress_count repeated_errors cur_stage
3833
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4412
+ local repeated_errors
3834
4413
  repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
3835
- cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
3836
- daemon_log WARN "Issue #${issue_num} STUCK: no progress for ${no_progress_count} checks, ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
3837
- emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid"
3838
- kill "$pid" 2>/dev/null || true
3839
- daemon_clear_progress "$issue_num"
3840
- findings=$((findings + 1))
4414
+
4415
+ # Even "stuck" check if the process tree is alive first
4416
+ local agent_alive=false
4417
+ local child_cpu=0
4418
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4419
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4420
+ agent_alive=true
4421
+ fi
4422
+
4423
+ if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
4424
+ # Agent is alive — nudge instead of kill
4425
+ if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
4426
+ local nudge_file="${worktree}/.claude/nudge.md"
4427
+ if [[ ! -f "$nudge_file" ]]; then
4428
+ cat > "$nudge_file" <<NUDGE_EOF
4429
+ # Nudge from Daemon Health Monitor
4430
+
4431
+ The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
4432
+ Current stage: ${cur_stage}
4433
+
4434
+ If you're stuck, consider:
4435
+ - Breaking the task into smaller steps
4436
+ - Committing partial progress
4437
+ - Running tests to validate current state
4438
+
4439
+ This is just a gentle check-in — take your time if you're working through a complex problem.
4440
+ NUDGE_EOF
4441
+ daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
4442
+ emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
4443
+ fi
4444
+ else
4445
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
4446
+ fi
4447
+ elif [[ "$repeated_errors" -ge 5 ]]; then
4448
+ # Truly stuck in an error loop — kill as last resort
4449
+ daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4450
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
4451
+ kill "$pid" 2>/dev/null || true
4452
+ daemon_clear_progress "$issue_num"
4453
+ findings=$((findings + 1))
4454
+ elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
4455
+ # Process tree is dead AND no progress for very long time
4456
+ daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
4457
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
4458
+ kill "$pid" 2>/dev/null || true
4459
+ daemon_clear_progress "$issue_num"
4460
+ findings=$((findings + 1))
4461
+ else
4462
+ daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
4463
+ fi
3841
4464
  ;;
3842
4465
  esac
3843
4466
  fi
@@ -3846,8 +4469,9 @@ daemon_health_check() {
3846
4469
  local stale_timeout
3847
4470
  stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
3848
4471
  if [[ "$elapsed" -gt "$stale_timeout" ]]; then
3849
- daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — killing"
3850
- kill "$pid" 2>/dev/null || true
4472
+ daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
4473
+ # Don't kill — just log. Let the process run.
4474
+ emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
3851
4475
  findings=$((findings + 1))
3852
4476
  fi
3853
4477
  fi
@@ -3908,8 +4532,11 @@ daemon_check_degradation() {
3908
4532
  local failures successes
3909
4533
  failures=$(echo "$recent" | jq '[.[] | select(.result == "failure")] | length')
3910
4534
  successes=$(echo "$recent" | jq '[.[] | select(.result == "success")] | length')
3911
- local cfr_pct=$(( failures * 100 / count ))
3912
- local success_pct=$(( successes * 100 / count ))
4535
+ local cfr_pct=0 success_pct=0
4536
+ if [[ "${count:-0}" -gt 0 ]]; then
4537
+ cfr_pct=$(( failures * 100 / count ))
4538
+ success_pct=$(( successes * 100 / count ))
4539
+ fi
3913
4540
 
3914
4541
  local alerts=""
3915
4542
  if [[ "$cfr_pct" -gt "$cfr_threshold" ]]; then
@@ -4039,11 +4666,43 @@ daemon_auto_scale() {
4039
4666
  local max_by_queue=$(( queue_depth + active_count ))
4040
4667
  [[ "$max_by_queue" -lt 1 ]] && max_by_queue=1
4041
4668
 
4669
+ # ── Vitals-driven scaling factor ──
4670
+ local max_by_vitals="$MAX_WORKERS"
4671
+ if type pipeline_compute_vitals &>/dev/null 2>&1 && [[ -f "$STATE_FILE" ]]; then
4672
+ local _total_health=0 _health_count=0
4673
+ while IFS= read -r _job; do
4674
+ local _job_issue _job_worktree
4675
+ _job_issue=$(echo "$_job" | jq -r '.issue // 0')
4676
+ _job_worktree=$(echo "$_job" | jq -r '.worktree // ""')
4677
+ if [[ -n "$_job_worktree" && -d "$_job_worktree/.claude" ]]; then
4678
+ local _job_vitals _job_health
4679
+ _job_vitals=$(pipeline_compute_vitals "$_job_worktree/.claude/pipeline-state.md" "$_job_worktree/.claude/pipeline-artifacts" "$_job_issue" 2>/dev/null) || true
4680
+ if [[ -n "$_job_vitals" && "$_job_vitals" != "{}" ]]; then
4681
+ _job_health=$(echo "$_job_vitals" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
4682
+ _total_health=$((_total_health + _job_health))
4683
+ _health_count=$((_health_count + 1))
4684
+ fi
4685
+ fi
4686
+ done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
4687
+
4688
+ if [[ "$_health_count" -gt 0 ]]; then
4689
+ local _avg_health=$((_total_health / _health_count))
4690
+ if [[ "$_avg_health" -lt 50 ]]; then
4691
+ # Pipelines struggling — reduce workers to give each more resources
4692
+ max_by_vitals=$(( MAX_WORKERS * _avg_health / 100 ))
4693
+ [[ "$max_by_vitals" -lt "$MIN_WORKERS" ]] && max_by_vitals="$MIN_WORKERS"
4694
+ daemon_log INFO "Auto-scale: vitals avg health ${_avg_health}% — capping at ${max_by_vitals} workers"
4695
+ fi
4696
+ # avg_health > 70: no reduction (full capacity available)
4697
+ fi
4698
+ fi
4699
+
4042
4700
  # ── Compute final value ──
4043
4701
  local computed="$max_by_cpu"
4044
4702
  [[ "$max_by_mem" -lt "$computed" ]] && computed="$max_by_mem"
4045
4703
  [[ "$max_by_budget" -lt "$computed" ]] && computed="$max_by_budget"
4046
4704
  [[ "$max_by_queue" -lt "$computed" ]] && computed="$max_by_queue"
4705
+ [[ "$max_by_vitals" -lt "$computed" ]] && computed="$max_by_vitals"
4047
4706
  [[ "$MAX_WORKERS" -lt "$computed" ]] && computed="$MAX_WORKERS"
4048
4707
 
4049
4708
  # Respect fleet-assigned ceiling if set
@@ -4302,7 +4961,19 @@ daemon_cleanup_stale() {
4302
4961
  done < <(git worktree list --porcelain 2>/dev/null | grep '^worktree ' | sed 's/^worktree //')
4303
4962
  fi
4304
4963
 
4305
- # ── 2. Clean old pipeline artifacts ──
4964
+ # ── 2. Expire old checkpoints ──
4965
+ if [[ -x "$SCRIPT_DIR/sw-checkpoint.sh" ]]; then
4966
+ local expired_output
4967
+ expired_output=$(bash "$SCRIPT_DIR/sw-checkpoint.sh" expire --hours "$((age_days * 24))" 2>/dev/null || true)
4968
+ if [[ -n "$expired_output" ]] && echo "$expired_output" | grep -q "Expired"; then
4969
+ local expired_count
4970
+ expired_count=$(echo "$expired_output" | grep -c "Expired" || true)
4971
+ cleaned=$((cleaned + ${expired_count:-0}))
4972
+ daemon_log INFO "Expired ${expired_count:-0} old checkpoint(s)"
4973
+ fi
4974
+ fi
4975
+
4976
+ # ── 3. Clean old pipeline artifacts (subdirectories only) ──
4306
4977
  local artifacts_dir=".claude/pipeline-artifacts"
4307
4978
  if [[ -d "$artifacts_dir" ]]; then
4308
4979
  while IFS= read -r artifact_dir; do
@@ -4393,6 +5064,7 @@ daemon_poll_loop() {
4393
5064
  # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
4394
5065
  # The || operator disables set -e for the entire call chain, so transient failures
4395
5066
  # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
5067
+ daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
4396
5068
  daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
4397
5069
  daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
4398
5070
  daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
@@ -4476,7 +5148,8 @@ cleanup_on_exit() {
4476
5148
  while IFS= read -r cpid; do
4477
5149
  [[ -z "$cpid" ]] && continue
4478
5150
  if kill -0 "$cpid" 2>/dev/null; then
4479
- daemon_log INFO "Killing pipeline process PID ${cpid}"
5151
+ daemon_log INFO "Killing pipeline process tree PID ${cpid}"
5152
+ pkill -TERM -P "$cpid" 2>/dev/null || true
4480
5153
  kill "$cpid" 2>/dev/null || true
4481
5154
  killed=$((killed + 1))
4482
5155
  fi
@@ -4488,7 +5161,8 @@ cleanup_on_exit() {
4488
5161
  while IFS= read -r cpid; do
4489
5162
  [[ -z "$cpid" ]] && continue
4490
5163
  if kill -0 "$cpid" 2>/dev/null; then
4491
- daemon_log WARN "Force-killing pipeline PID ${cpid}"
5164
+ daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
5165
+ pkill -9 -P "$cpid" 2>/dev/null || true
4492
5166
  kill -9 "$cpid" 2>/dev/null || true
4493
5167
  fi
4494
5168
  done <<< "$child_pids"