shipwright-cli 1.10.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/README.md +114 -36
  2. package/completions/_shipwright +212 -32
  3. package/completions/shipwright.bash +97 -25
  4. package/docs/strategy/01-market-research.md +619 -0
  5. package/docs/strategy/02-mission-and-brand.md +587 -0
  6. package/docs/strategy/03-gtm-and-roadmap.md +759 -0
  7. package/docs/strategy/QUICK-START.txt +289 -0
  8. package/docs/strategy/README.md +172 -0
  9. package/package.json +4 -2
  10. package/scripts/sw +208 -1
  11. package/scripts/sw-activity.sh +500 -0
  12. package/scripts/sw-adaptive.sh +925 -0
  13. package/scripts/sw-adversarial.sh +1 -1
  14. package/scripts/sw-architecture-enforcer.sh +1 -1
  15. package/scripts/sw-auth.sh +613 -0
  16. package/scripts/sw-autonomous.sh +664 -0
  17. package/scripts/sw-changelog.sh +704 -0
  18. package/scripts/sw-checkpoint.sh +1 -1
  19. package/scripts/sw-ci.sh +602 -0
  20. package/scripts/sw-cleanup.sh +1 -1
  21. package/scripts/sw-code-review.sh +637 -0
  22. package/scripts/sw-connect.sh +1 -1
  23. package/scripts/sw-context.sh +605 -0
  24. package/scripts/sw-cost.sh +1 -1
  25. package/scripts/sw-daemon.sh +432 -130
  26. package/scripts/sw-dashboard.sh +1 -1
  27. package/scripts/sw-db.sh +540 -0
  28. package/scripts/sw-decompose.sh +539 -0
  29. package/scripts/sw-deps.sh +551 -0
  30. package/scripts/sw-developer-simulation.sh +1 -1
  31. package/scripts/sw-discovery.sh +412 -0
  32. package/scripts/sw-docs-agent.sh +539 -0
  33. package/scripts/sw-docs.sh +1 -1
  34. package/scripts/sw-doctor.sh +59 -1
  35. package/scripts/sw-dora.sh +615 -0
  36. package/scripts/sw-durable.sh +710 -0
  37. package/scripts/sw-e2e-orchestrator.sh +535 -0
  38. package/scripts/sw-eventbus.sh +393 -0
  39. package/scripts/sw-feedback.sh +471 -0
  40. package/scripts/sw-fix.sh +1 -1
  41. package/scripts/sw-fleet-discover.sh +567 -0
  42. package/scripts/sw-fleet-viz.sh +404 -0
  43. package/scripts/sw-fleet.sh +8 -1
  44. package/scripts/sw-github-app.sh +596 -0
  45. package/scripts/sw-github-checks.sh +1 -1
  46. package/scripts/sw-github-deploy.sh +1 -1
  47. package/scripts/sw-github-graphql.sh +1 -1
  48. package/scripts/sw-guild.sh +569 -0
  49. package/scripts/sw-heartbeat.sh +1 -1
  50. package/scripts/sw-hygiene.sh +559 -0
  51. package/scripts/sw-incident.sh +617 -0
  52. package/scripts/sw-init.sh +88 -1
  53. package/scripts/sw-instrument.sh +699 -0
  54. package/scripts/sw-intelligence.sh +1 -1
  55. package/scripts/sw-jira.sh +1 -1
  56. package/scripts/sw-launchd.sh +363 -28
  57. package/scripts/sw-linear.sh +1 -1
  58. package/scripts/sw-logs.sh +1 -1
  59. package/scripts/sw-loop.sh +64 -3
  60. package/scripts/sw-memory.sh +1 -1
  61. package/scripts/sw-mission-control.sh +487 -0
  62. package/scripts/sw-model-router.sh +545 -0
  63. package/scripts/sw-otel.sh +596 -0
  64. package/scripts/sw-oversight.sh +689 -0
  65. package/scripts/sw-pipeline-composer.sh +1 -1
  66. package/scripts/sw-pipeline-vitals.sh +1 -1
  67. package/scripts/sw-pipeline.sh +687 -24
  68. package/scripts/sw-pm.sh +693 -0
  69. package/scripts/sw-pr-lifecycle.sh +522 -0
  70. package/scripts/sw-predictive.sh +1 -1
  71. package/scripts/sw-prep.sh +1 -1
  72. package/scripts/sw-ps.sh +1 -1
  73. package/scripts/sw-public-dashboard.sh +798 -0
  74. package/scripts/sw-quality.sh +595 -0
  75. package/scripts/sw-reaper.sh +1 -1
  76. package/scripts/sw-recruit.sh +573 -0
  77. package/scripts/sw-regression.sh +642 -0
  78. package/scripts/sw-release-manager.sh +736 -0
  79. package/scripts/sw-release.sh +706 -0
  80. package/scripts/sw-remote.sh +1 -1
  81. package/scripts/sw-replay.sh +520 -0
  82. package/scripts/sw-retro.sh +691 -0
  83. package/scripts/sw-scale.sh +444 -0
  84. package/scripts/sw-security-audit.sh +505 -0
  85. package/scripts/sw-self-optimize.sh +1 -1
  86. package/scripts/sw-session.sh +1 -1
  87. package/scripts/sw-setup.sh +1 -1
  88. package/scripts/sw-standup.sh +712 -0
  89. package/scripts/sw-status.sh +1 -1
  90. package/scripts/sw-strategic.sh +658 -0
  91. package/scripts/sw-stream.sh +450 -0
  92. package/scripts/sw-swarm.sh +583 -0
  93. package/scripts/sw-team-stages.sh +511 -0
  94. package/scripts/sw-templates.sh +1 -1
  95. package/scripts/sw-testgen.sh +515 -0
  96. package/scripts/sw-tmux-pipeline.sh +554 -0
  97. package/scripts/sw-tmux.sh +1 -1
  98. package/scripts/sw-trace.sh +485 -0
  99. package/scripts/sw-tracker-github.sh +188 -0
  100. package/scripts/sw-tracker-jira.sh +172 -0
  101. package/scripts/sw-tracker-linear.sh +251 -0
  102. package/scripts/sw-tracker.sh +117 -2
  103. package/scripts/sw-triage.sh +603 -0
  104. package/scripts/sw-upgrade.sh +1 -1
  105. package/scripts/sw-ux.sh +677 -0
  106. package/scripts/sw-webhook.sh +627 -0
  107. package/scripts/sw-widgets.sh +530 -0
  108. package/scripts/sw-worktree.sh +1 -1
@@ -6,7 +6,10 @@
6
6
  set -euo pipefail
7
7
  trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
8
8
 
9
- VERSION="1.10.0"
9
+ # Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
10
+ unset CLAUDECODE 2>/dev/null || true
11
+
12
+ VERSION="2.0.0"
10
13
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
14
  REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
12
15
 
@@ -478,9 +481,11 @@ load_config() {
478
481
 
479
482
  # progress-based health monitoring (replaces static timeouts)
480
483
  PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
481
- PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 3' "$config_file")
482
- PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 6' "$config_file")
483
- PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 10800' "$config_file") # 3hr absolute max
484
+ PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
485
+ PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
486
+ PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
487
+ NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
488
+ NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
484
489
 
485
490
  # team dashboard URL (for coordinated claiming)
486
491
  local cfg_dashboard_url
@@ -836,6 +841,31 @@ daemon_assess_progress() {
836
841
  has_progress=true
837
842
  fi
838
843
 
844
+ # Claude subprocess is alive and consuming CPU — agent is thinking/working
845
+ # During build stage, Claude can spend 10+ minutes thinking before any
846
+ # visible git changes appear. Detect this as progress.
847
+ if [[ "$has_progress" != "true" ]]; then
848
+ local _pid_for_check
849
+ _pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
850
+ if [[ -z "$_pid_for_check" ]]; then
851
+ # Fallback: get PID from active_jobs
852
+ _pid_for_check=$(jq -r --argjson num "$issue_num" \
853
+ '.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
854
+ fi
855
+ if [[ -n "$_pid_for_check" ]]; then
856
+ # Check if any child process (claude) is alive and using CPU
857
+ local child_cpu=0
858
+ child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
859
+ if [[ "$child_cpu" -eq 0 ]]; then
860
+ # Check children of the pipeline process
861
+ child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
862
+ fi
863
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
864
+ has_progress=true
865
+ fi
866
+ fi
867
+ fi
868
+
839
869
  # Detect repeated errors (same error signature hitting again)
840
870
  local repeated_errors="$prev_repeated_errors"
841
871
  if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
@@ -1208,6 +1238,74 @@ gh_record_failure() {
1208
1238
  fi
1209
1239
  }
1210
1240
 
1241
+ # ─── Runtime Auth Check ──────────────────────────────────────────────────────
1242
+
1243
+ LAST_AUTH_CHECK_EPOCH=0
1244
+ AUTH_CHECK_INTERVAL=300 # 5 minutes
1245
+
1246
+ daemon_preflight_auth_check() {
1247
+ local now_e
1248
+ now_e=$(now_epoch)
1249
+ if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
1250
+ return 0
1251
+ fi
1252
+ LAST_AUTH_CHECK_EPOCH="$now_e"
1253
+
1254
+ # gh auth check
1255
+ if [[ "${NO_GITHUB:-false}" != "true" ]]; then
1256
+ if ! gh auth status &>/dev/null 2>&1; then
1257
+ daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
1258
+ local pause_json
1259
+ pause_json=$(jq -n --arg reason "gh_auth_failure" --arg ts "$(now_iso)" \
1260
+ '{reason: $reason, timestamp: $ts}')
1261
+ local _tmp_pause
1262
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1263
+ echo "$pause_json" > "$_tmp_pause"
1264
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1265
+ emit_event "daemon.auto_pause" "reason=gh_auth_failure"
1266
+ return 1
1267
+ fi
1268
+ fi
1269
+
1270
+ # claude auth check with 15s timeout (macOS has no timeout command)
1271
+ local claude_auth_ok=false
1272
+ local _auth_tmp
1273
+ _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
1274
+ ( claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null ) &
1275
+ local _auth_pid=$!
1276
+ local _auth_waited=0
1277
+ while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
1278
+ sleep 1
1279
+ _auth_waited=$((_auth_waited + 1))
1280
+ done
1281
+ if kill -0 "$_auth_pid" 2>/dev/null; then
1282
+ kill "$_auth_pid" 2>/dev/null || true
1283
+ wait "$_auth_pid" 2>/dev/null || true
1284
+ else
1285
+ wait "$_auth_pid" 2>/dev/null || true
1286
+ fi
1287
+
1288
+ if [[ -s "$_auth_tmp" ]]; then
1289
+ claude_auth_ok=true
1290
+ fi
1291
+ rm -f "$_auth_tmp"
1292
+
1293
+ if [[ "$claude_auth_ok" != "true" ]]; then
1294
+ daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
1295
+ local pause_json
1296
+ pause_json=$(jq -n --arg reason "claude_auth_failure" --arg ts "$(now_iso)" \
1297
+ '{reason: $reason, timestamp: $ts}')
1298
+ local _tmp_pause
1299
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
1300
+ echo "$pause_json" > "$_tmp_pause"
1301
+ mv "$_tmp_pause" "$PAUSE_FLAG"
1302
+ emit_event "daemon.auto_pause" "reason=claude_auth_failure"
1303
+ return 1
1304
+ fi
1305
+
1306
+ return 0
1307
+ }
1308
+
1211
1309
  # ─── Pre-flight Checks ──────────────────────────────────────────────────────
1212
1310
 
1213
1311
  preflight_checks() {
@@ -1609,9 +1707,24 @@ daemon_spawn_pipeline() {
1609
1707
  local issue_num="$1"
1610
1708
  local issue_title="${2:-}"
1611
1709
  local repo_full_name="${3:-}" # owner/repo (org mode only)
1710
+ shift 3 2>/dev/null || true
1711
+ local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
1612
1712
 
1613
1713
  daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
1614
1714
 
1715
+ # ── Issue decomposition (if decomposer available) ──
1716
+ local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
1717
+ if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
1718
+ local decompose_result=""
1719
+ decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
1720
+ if [[ "$decompose_result" == *"decomposed"* ]]; then
1721
+ daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
1722
+ # Remove the shipwright label so decomposed parent doesn't re-queue
1723
+ gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
1724
+ return 0
1725
+ fi
1726
+ fi
1727
+
1615
1728
  # Extract goal text from issue (title + first line of body)
1616
1729
  local issue_goal="$issue_title"
1617
1730
  if [[ "$NO_GITHUB" != "true" ]]; then
@@ -1727,11 +1840,18 @@ daemon_spawn_pipeline() {
1727
1840
  pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
1728
1841
  fi
1729
1842
 
1843
+ # Append any extra pipeline args (from retry escalation, etc.)
1844
+ if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
1845
+ pipeline_args+=("${extra_pipeline_args[@]}")
1846
+ fi
1847
+
1730
1848
  # Run pipeline in work directory (background)
1849
+ # Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
1731
1850
  echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
1732
1851
  (
1852
+ trap '' HUP
1733
1853
  cd "$work_dir"
1734
- "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1854
+ exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1735
1855
  ) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
1736
1856
  local pid=$!
1737
1857
 
@@ -1904,15 +2024,18 @@ daemon_reap_completed() {
1904
2024
  reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
1905
2025
  release_claim "$issue_num" "$reap_machine_name"
1906
2026
 
1907
- # Skip cleanup if a retry was just spawned for this issue
2027
+ # Always remove the OLD job entry from active_jobs to prevent
2028
+ # re-reaping of the dead PID on the next cycle. When a retry was
2029
+ # spawned, daemon_spawn_pipeline already added a fresh entry with
2030
+ # the new PID — we must not leave the stale one behind.
2031
+ locked_state_update --argjson num "$issue_num" \
2032
+ --argjson old_pid "${pid:-0}" \
2033
+ '.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
2034
+ untrack_priority_job "$issue_num"
2035
+
1908
2036
  if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
1909
2037
  daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
1910
2038
  else
1911
- # Remove from active_jobs and priority lane tracking (locked)
1912
- locked_state_update --argjson num "$issue_num" \
1913
- '.active_jobs = [.active_jobs[] | select(.issue != $num)]'
1914
- untrack_priority_job "$issue_num"
1915
-
1916
2039
  # Clean up worktree (skip for org-mode clones — they persist)
1917
2040
  local job_repo
1918
2041
  job_repo=$(echo "$job" | jq -r '.repo // ""')
@@ -1951,6 +2074,9 @@ daemon_reap_completed() {
1951
2074
  daemon_on_success() {
1952
2075
  local issue_num="$1" duration="${2:-}"
1953
2076
 
2077
+ # Reset consecutive failure tracking on any success
2078
+ reset_failure_tracking
2079
+
1954
2080
  daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
1955
2081
 
1956
2082
  # Record pipeline duration for adaptive threshold learning
@@ -2011,6 +2137,91 @@ Check the associated PR for the implementation." 2>/dev/null || true
2011
2137
  "$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
2012
2138
  }
2013
2139
 
2140
+ # ─── Failure Classification ─────────────────────────────────────────────────
2141
+
2142
+ classify_failure() {
2143
+ local issue_num="$1"
2144
+ if [[ -z "${LOG_DIR:-}" ]]; then
2145
+ echo "unknown"
2146
+ return
2147
+ fi
2148
+ local log_path="$LOG_DIR/issue-${issue_num}.log"
2149
+ if [[ ! -f "$log_path" ]]; then
2150
+ echo "unknown"
2151
+ return
2152
+ fi
2153
+ local tail_content
2154
+ tail_content=$(tail -200 "$log_path" 2>/dev/null || true)
2155
+
2156
+ # Auth errors
2157
+ if echo "$tail_content" | grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required'; then
2158
+ echo "auth_error"
2159
+ return
2160
+ fi
2161
+ # API errors (rate limits, timeouts, server errors)
2162
+ if echo "$tail_content" | grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable'; then
2163
+ echo "api_error"
2164
+ return
2165
+ fi
2166
+ # Invalid issue (not found, empty body)
2167
+ if echo "$tail_content" | grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist'; then
2168
+ echo "invalid_issue"
2169
+ return
2170
+ fi
2171
+ # Context exhaustion — check progress file
2172
+ local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2173
+ local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2174
+ if [[ -f "$progress_file" ]]; then
2175
+ local cf_iter
2176
+ cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2177
+ if ! [[ "${cf_iter:-0}" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
2178
+ local cf_tests
2179
+ cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2180
+ if [[ "${cf_iter:-0}" -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
2181
+ echo "context_exhaustion"
2182
+ return
2183
+ fi
2184
+ fi
2185
+ # Build failure (test errors, compile errors)
2186
+ if echo "$tail_content" | grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]'; then
2187
+ echo "build_failure"
2188
+ return
2189
+ fi
2190
+ echo "unknown"
2191
+ }
2192
+
2193
+ # ─── Consecutive Failure Tracking ──────────────────────────────────────────
2194
+
2195
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2196
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2197
+
2198
+ record_failure_class() {
2199
+ local failure_class="$1"
2200
+ if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
2201
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
2202
+ else
2203
+ DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
2204
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=1
2205
+ fi
2206
+
2207
+ if [[ "$DAEMON_CONSECUTIVE_FAILURE_COUNT" -ge 3 ]]; then
2208
+ daemon_log ERROR "3 consecutive failures (class: ${failure_class}) — auto-pausing daemon"
2209
+ local pause_json
2210
+ pause_json=$(jq -n --arg reason "consecutive_${failure_class}" --arg ts "$(now_iso)" \
2211
+ '{reason: $reason, timestamp: $ts}')
2212
+ local _tmp_pause
2213
+ _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
2214
+ echo "$pause_json" > "$_tmp_pause"
2215
+ mv "$_tmp_pause" "$PAUSE_FLAG"
2216
+ emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$DAEMON_CONSECUTIVE_FAILURE_COUNT"
2217
+ fi
2218
+ }
2219
+
2220
+ reset_failure_tracking() {
2221
+ DAEMON_CONSECUTIVE_FAILURE_CLASS=""
2222
+ DAEMON_CONSECUTIVE_FAILURE_COUNT=0
2223
+ }
2224
+
2014
2225
  # ─── Failure Handler ────────────────────────────────────────────────────────
2015
2226
 
2016
2227
  daemon_on_failure() {
@@ -2047,123 +2258,143 @@ daemon_on_failure() {
2047
2258
  completed_at: $completed_at
2048
2259
  }] | .completed = .completed[-500:]'
2049
2260
 
2261
+ # ── Classify failure and decide retry strategy ──
2262
+ local failure_class
2263
+ failure_class=$(classify_failure "$issue_num")
2264
+ daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
2265
+ emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
2266
+ record_failure_class "$failure_class"
2267
+
2050
2268
  # ── Auto-retry with strategy escalation ──
2051
2269
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
2052
2270
  local retry_count
2053
2271
  retry_count=$(jq -r --arg num "$issue_num" \
2054
2272
  '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
2055
2273
 
2056
- if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
2057
- retry_count=$((retry_count + 1))
2058
-
2059
- # Update retry count in state (locked to prevent race)
2060
- locked_state_update \
2061
- --arg num "$issue_num" --argjson count "$retry_count" \
2062
- '.retry_counts[$num] = $count'
2063
-
2064
- daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
2065
- emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
2066
-
2067
- # Check for checkpoint to enable resume-from-checkpoint
2068
- local checkpoint_args=()
2069
- if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2070
- # Try to find worktree for this issue to check for checkpoints
2071
- local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2072
- if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2073
- local latest_checkpoint=""
2074
- for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2075
- [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2076
- done
2077
- if [[ -n "$latest_checkpoint" ]]; then
2078
- daemon_log INFO "Found checkpoint: $latest_checkpoint"
2079
- emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2080
- checkpoint_args+=("--resume")
2081
- fi
2274
+ # Non-retryable failures skip retry entirely
2275
+ case "$failure_class" in
2276
+ auth_error)
2277
+ daemon_log ERROR "Auth error for issue #${issue_num} — skipping retry"
2278
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
2279
+ if [[ "$NO_GITHUB" != "true" ]]; then
2280
+ gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
2082
2281
  fi
2083
- fi
2084
-
2085
- # Detect context exhaustion from progress file
2086
- local failure_reason="unknown"
2087
- local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
2088
- local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
2089
- if [[ -f "$progress_file" ]]; then
2090
- local progress_iter
2091
- progress_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
2092
- if ! [[ "${progress_iter:-0}" =~ ^[0-9]+$ ]]; then
2093
- progress_iter="0"
2094
- fi
2095
- local progress_tests
2096
- progress_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
2097
- if [[ "${progress_iter:-0}" -gt 0 ]] && { [[ "$progress_tests" == "false" ]] || [[ "$progress_tests" == "unknown" ]]; }; then
2098
- failure_reason="context_exhaustion"
2099
- emit_event "daemon.context_exhaustion" "issue=$issue_num" "iterations=$progress_iter"
2100
- daemon_log WARN "Context exhaustion detected for issue #${issue_num} (iterations: ${progress_iter})"
2282
+ ;;
2283
+ invalid_issue)
2284
+ daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
2285
+ emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
2286
+ if [[ "$NO_GITHUB" != "true" ]]; then
2287
+ gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
2101
2288
  fi
2102
- fi
2289
+ ;;
2290
+ *)
2291
+ # Retryable failures — proceed with escalation
2292
+ if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
2293
+ retry_count=$((retry_count + 1))
2294
+
2295
+ # Update retry count in state (locked to prevent race)
2296
+ locked_state_update \
2297
+ --arg num "$issue_num" --argjson count "$retry_count" \
2298
+ '.retry_counts[$num] = $count'
2299
+
2300
+ daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num} (class: ${failure_class})"
2301
+ emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}" "class=$failure_class"
2302
+
2303
+ # Check for checkpoint to enable resume-from-checkpoint
2304
+ local checkpoint_args=()
2305
+ if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
2306
+ local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
2307
+ if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
2308
+ local latest_checkpoint=""
2309
+ for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
2310
+ [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
2311
+ done
2312
+ if [[ -n "$latest_checkpoint" ]]; then
2313
+ daemon_log INFO "Found checkpoint: $latest_checkpoint"
2314
+ emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
2315
+ checkpoint_args+=("--resume")
2316
+ fi
2317
+ fi
2318
+ fi
2103
2319
 
2104
- # Build escalated pipeline args
2105
- local retry_template="$PIPELINE_TEMPLATE"
2106
- local retry_model="${MODEL:-opus}"
2107
- local extra_args=()
2108
-
2109
- if [[ "$retry_count" -eq 1 ]]; then
2110
- # Retry 1: same template, upgrade model, more iterations
2111
- retry_model="opus"
2112
- extra_args+=("--max-iterations" "30")
2113
- daemon_log INFO "Escalation: model=opus, max_iterations=30"
2114
- elif [[ "$retry_count" -ge 2 ]]; then
2115
- # Retry 2: full template, compound quality max cycles
2116
- retry_template="full"
2117
- retry_model="opus"
2118
- extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2119
- daemon_log INFO "Escalation: template=full, compound_cycles=5"
2120
- fi
2320
+ # Build escalated pipeline args
2321
+ local retry_template="$PIPELINE_TEMPLATE"
2322
+ local retry_model="${MODEL:-opus}"
2323
+ local extra_args=()
2324
+
2325
+ if [[ "$retry_count" -eq 1 ]]; then
2326
+ retry_model="opus"
2327
+ extra_args+=("--max-iterations" "30")
2328
+ daemon_log INFO "Escalation: model=opus, max_iterations=30"
2329
+ elif [[ "$retry_count" -ge 2 ]]; then
2330
+ retry_template="full"
2331
+ retry_model="opus"
2332
+ extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
2333
+ daemon_log INFO "Escalation: template=full, compound_cycles=5"
2334
+ fi
2121
2335
 
2122
- # Increase restarts on context exhaustion
2123
- if [[ "$failure_reason" == "context_exhaustion" ]]; then
2124
- local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2125
- # Cap at sw-loop's hard limit of 5
2126
- if [[ "$boosted_restarts" -gt 5 ]]; then
2127
- boosted_restarts=5
2128
- fi
2129
- extra_args+=("--max-restarts" "$boosted_restarts")
2130
- daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2131
- fi
2336
+ # Increase restarts on context exhaustion
2337
+ if [[ "$failure_class" == "context_exhaustion" ]]; then
2338
+ local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
2339
+ if [[ "$boosted_restarts" -gt 5 ]]; then
2340
+ boosted_restarts=5
2341
+ fi
2342
+ extra_args+=("--max-restarts" "$boosted_restarts")
2343
+ daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
2344
+ fi
2132
2345
 
2133
- if [[ "$NO_GITHUB" != "true" ]]; then
2134
- gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2346
+ # API errors get extended backoff
2347
+ local api_backoff=300
2348
+ local backoff_secs=$((30 * retry_count))
2349
+ if [[ "$failure_class" == "api_error" ]]; then
2350
+ backoff_secs=$((api_backoff * retry_count))
2351
+ daemon_log INFO "API error — extended backoff ${backoff_secs}s"
2352
+ fi
2135
2353
 
2136
- Pipeline failed — retrying with escalated strategy.
2354
+ if [[ "$NO_GITHUB" != "true" ]]; then
2355
+ gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
2356
+
2357
+ Pipeline failed (${failure_class}) — retrying with escalated strategy.
2137
2358
 
2138
2359
  | Field | Value |
2139
2360
  |-------|-------|
2140
2361
  | Retry | ${retry_count} / ${MAX_RETRIES:-2} |
2362
+ | Failure | \`${failure_class}\` |
2141
2363
  | Template | \`${retry_template}\` |
2142
2364
  | Model | \`${retry_model}\` |
2143
2365
  | Started | $(now_iso) |
2144
2366
 
2145
2367
  _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
2146
- fi
2368
+ fi
2147
2369
 
2148
- # Backoff before retry: 30s * retry_count (30s, 60s, ...)
2149
- local backoff_secs=$((30 * retry_count))
2150
- daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2151
- sleep "$backoff_secs"
2152
-
2153
- # Re-spawn with escalated strategy
2154
- local orig_template="$PIPELINE_TEMPLATE"
2155
- local orig_model="$MODEL"
2156
- PIPELINE_TEMPLATE="$retry_template"
2157
- MODEL="$retry_model"
2158
- daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
2159
- _retry_spawned_for="$issue_num"
2160
- PIPELINE_TEMPLATE="$orig_template"
2161
- MODEL="$orig_model"
2162
- return
2163
- fi
2370
+ daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
2371
+ sleep "$backoff_secs"
2164
2372
 
2165
- daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2166
- emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2373
+ # Merge checkpoint args + extra args for passthrough
2374
+ local all_extra_args=()
2375
+ if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
2376
+ all_extra_args+=("${checkpoint_args[@]}")
2377
+ fi
2378
+ if [[ ${#extra_args[@]} -gt 0 ]]; then
2379
+ all_extra_args+=("${extra_args[@]}")
2380
+ fi
2381
+
2382
+ # Re-spawn with escalated strategy
2383
+ local orig_template="$PIPELINE_TEMPLATE"
2384
+ local orig_model="$MODEL"
2385
+ PIPELINE_TEMPLATE="$retry_template"
2386
+ MODEL="$retry_model"
2387
+ daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
2388
+ _retry_spawned_for="$issue_num"
2389
+ PIPELINE_TEMPLATE="$orig_template"
2390
+ MODEL="$orig_model"
2391
+ return
2392
+ fi
2393
+
2394
+ daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
2395
+ emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
2396
+ ;;
2397
+ esac
2167
2398
  fi
2168
2399
 
2169
2400
  # ── No retry — report final failure ──
@@ -3770,6 +4001,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
3770
4001
  patrol_meta_run
3771
4002
  fi
3772
4003
 
4004
+ # ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
4005
+ if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
4006
+ # shellcheck source=sw-strategic.sh
4007
+ source "$SCRIPT_DIR/sw-strategic.sh"
4008
+ strategic_patrol_run || true
4009
+ fi
4010
+
3773
4011
  # ── Summary ──
3774
4012
  emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
3775
4013
 
@@ -4095,13 +4333,15 @@ daemon_health_check() {
4095
4333
  now_e=$(now_epoch)
4096
4334
 
4097
4335
  if [[ -f "$STATE_FILE" ]]; then
4098
- # ── Progress-Based Health Monitoring ──
4099
- # Instead of killing after a static timeout, check for forward progress.
4100
- # Only kill when the agent is truly stuck (no stage change, no new code,
4101
- # same error repeating). A hard wall-clock limit remains as absolute safety net.
4336
+ # ── Intelligent Health Monitoring ──
4337
+ # Instead of killing after a countdown, sense what the agent is doing.
4338
+ # Agents think for long stretches — that's normal and expected.
4339
+ # Strategy: sense → understand → be patient → nudge → only kill as last resort.
4102
4340
 
4103
- local hard_limit="${PROGRESS_HARD_LIMIT_S:-10800}"
4341
+ local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
4104
4342
  local use_progress="${PROGRESS_MONITORING:-true}"
4343
+ local nudge_enabled="${NUDGE_ENABLED:-true}"
4344
+ local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
4105
4345
 
4106
4346
  while IFS= read -r job; do
4107
4347
  local pid started_at issue_num worktree
@@ -4122,8 +4362,8 @@ daemon_health_check() {
4122
4362
  elapsed=$(( now_e - start_e ))
4123
4363
  fi
4124
4364
 
4125
- # Hard wall-clock limit — absolute safety net (default 3h)
4126
- if [[ "$elapsed" -gt "$hard_limit" ]]; then
4365
+ # Hard wall-clock limit — disabled by default (0 = off)
4366
+ if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
4127
4367
  daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
4128
4368
  emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
4129
4369
  kill "$pid" 2>/dev/null || true
@@ -4132,7 +4372,7 @@ daemon_health_check() {
4132
4372
  continue
4133
4373
  fi
4134
4374
 
4135
- # Progress-based detection (when enabled)
4375
+ # ── Intelligent Progress Sensing ──
4136
4376
  if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
4137
4377
  local snapshot verdict
4138
4378
  snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
@@ -4140,29 +4380,87 @@ daemon_health_check() {
4140
4380
  if [[ "$snapshot" != "{}" ]]; then
4141
4381
  verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
4142
4382
 
4383
+ local no_progress_count=0
4384
+ no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4385
+ local cur_stage
4386
+ cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4387
+
4143
4388
  case "$verdict" in
4144
4389
  healthy)
4145
4390
  # All good — agent is making progress
4146
4391
  ;;
4147
4392
  slowing)
4148
- daemon_log INFO "Issue #${issue_num} slowing (no progress for 1-2 checks, ${elapsed}s elapsed)"
4393
+ daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
4149
4394
  ;;
4150
4395
  stalled)
4151
- local no_progress_count
4152
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4153
- daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks (${elapsed}s elapsed, PID $pid)"
4154
- emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4396
+ # Check if agent subprocess is alive and consuming CPU
4397
+ local agent_alive=false
4398
+ local child_cpu=0
4399
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4400
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4401
+ agent_alive=true
4402
+ fi
4403
+
4404
+ if [[ "$agent_alive" == "true" ]]; then
4405
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
4406
+ else
4407
+ daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
4408
+ emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
4409
+ fi
4155
4410
  ;;
4156
4411
  stuck)
4157
- local no_progress_count repeated_errors cur_stage
4158
- no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4412
+ local repeated_errors
4159
4413
  repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
4160
- cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
4161
- daemon_log WARN "Issue #${issue_num} STUCK: no progress for ${no_progress_count} checks, ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4162
- emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid"
4163
- kill "$pid" 2>/dev/null || true
4164
- daemon_clear_progress "$issue_num"
4165
- findings=$((findings + 1))
4414
+
4415
+ # Even "stuck" check if the process tree is alive first
4416
+ local agent_alive=false
4417
+ local child_cpu=0
4418
+ child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
4419
+ if [[ "${child_cpu:-0}" -gt 0 ]]; then
4420
+ agent_alive=true
4421
+ fi
4422
+
4423
+ if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
4424
+ # Agent is alive — nudge instead of kill
4425
+ if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
4426
+ local nudge_file="${worktree}/.claude/nudge.md"
4427
+ if [[ ! -f "$nudge_file" ]]; then
4428
+ cat > "$nudge_file" <<NUDGE_EOF
4429
+ # Nudge from Daemon Health Monitor
4430
+
4431
+ The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
4432
+ Current stage: ${cur_stage}
4433
+
4434
+ If you're stuck, consider:
4435
+ - Breaking the task into smaller steps
4436
+ - Committing partial progress
4437
+ - Running tests to validate current state
4438
+
4439
+ This is just a gentle check-in — take your time if you're working through a complex problem.
4440
+ NUDGE_EOF
4441
+ daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
4442
+ emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
4443
+ fi
4444
+ else
4445
+ daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
4446
+ fi
4447
+ elif [[ "$repeated_errors" -ge 5 ]]; then
4448
+ # Truly stuck in an error loop — kill as last resort
4449
+ daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
4450
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
4451
+ kill "$pid" 2>/dev/null || true
4452
+ daemon_clear_progress "$issue_num"
4453
+ findings=$((findings + 1))
4454
+ elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
4455
+ # Process tree is dead AND no progress for very long time
4456
+ daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
4457
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
4458
+ kill "$pid" 2>/dev/null || true
4459
+ daemon_clear_progress "$issue_num"
4460
+ findings=$((findings + 1))
4461
+ else
4462
+ daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
4463
+ fi
4166
4464
  ;;
4167
4465
  esac
4168
4466
  fi
@@ -4171,8 +4469,9 @@ daemon_health_check() {
4171
4469
  local stale_timeout
4172
4470
  stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
4173
4471
  if [[ "$elapsed" -gt "$stale_timeout" ]]; then
4174
- daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — killing"
4175
- kill "$pid" 2>/dev/null || true
4472
+ daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
4473
+ # Don't kill — just log. Let the process run.
4474
+ emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
4176
4475
  findings=$((findings + 1))
4177
4476
  fi
4178
4477
  fi
@@ -4765,6 +5064,7 @@ daemon_poll_loop() {
4765
5064
  # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
4766
5065
  # The || operator disables set -e for the entire call chain, so transient failures
4767
5066
  # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
5067
+ daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
4768
5068
  daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
4769
5069
  daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
4770
5070
  daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
@@ -4848,7 +5148,8 @@ cleanup_on_exit() {
4848
5148
  while IFS= read -r cpid; do
4849
5149
  [[ -z "$cpid" ]] && continue
4850
5150
  if kill -0 "$cpid" 2>/dev/null; then
4851
- daemon_log INFO "Killing pipeline process PID ${cpid}"
5151
+ daemon_log INFO "Killing pipeline process tree PID ${cpid}"
5152
+ pkill -TERM -P "$cpid" 2>/dev/null || true
4852
5153
  kill "$cpid" 2>/dev/null || true
4853
5154
  killed=$((killed + 1))
4854
5155
  fi
@@ -4860,7 +5161,8 @@ cleanup_on_exit() {
4860
5161
  while IFS= read -r cpid; do
4861
5162
  [[ -z "$cpid" ]] && continue
4862
5163
  if kill -0 "$cpid" 2>/dev/null; then
4863
- daemon_log WARN "Force-killing pipeline PID ${cpid}"
5164
+ daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
5165
+ pkill -9 -P "$cpid" 2>/dev/null || true
4864
5166
  kill -9 "$cpid" 2>/dev/null || true
4865
5167
  fi
4866
5168
  done <<< "$child_pids"