shipwright-cli 1.10.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -36
- package/completions/_shipwright +212 -32
- package/completions/shipwright.bash +97 -25
- package/docs/strategy/01-market-research.md +619 -0
- package/docs/strategy/02-mission-and-brand.md +587 -0
- package/docs/strategy/03-gtm-and-roadmap.md +759 -0
- package/docs/strategy/QUICK-START.txt +289 -0
- package/docs/strategy/README.md +172 -0
- package/package.json +4 -2
- package/scripts/sw +208 -1
- package/scripts/sw-activity.sh +500 -0
- package/scripts/sw-adaptive.sh +925 -0
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-auth.sh +613 -0
- package/scripts/sw-autonomous.sh +664 -0
- package/scripts/sw-changelog.sh +704 -0
- package/scripts/sw-checkpoint.sh +1 -1
- package/scripts/sw-ci.sh +602 -0
- package/scripts/sw-cleanup.sh +1 -1
- package/scripts/sw-code-review.sh +637 -0
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-context.sh +605 -0
- package/scripts/sw-cost.sh +1 -1
- package/scripts/sw-daemon.sh +432 -130
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-db.sh +540 -0
- package/scripts/sw-decompose.sh +539 -0
- package/scripts/sw-deps.sh +551 -0
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-discovery.sh +412 -0
- package/scripts/sw-docs-agent.sh +539 -0
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +59 -1
- package/scripts/sw-dora.sh +615 -0
- package/scripts/sw-durable.sh +710 -0
- package/scripts/sw-e2e-orchestrator.sh +535 -0
- package/scripts/sw-eventbus.sh +393 -0
- package/scripts/sw-feedback.sh +471 -0
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet-discover.sh +567 -0
- package/scripts/sw-fleet-viz.sh +404 -0
- package/scripts/sw-fleet.sh +8 -1
- package/scripts/sw-github-app.sh +596 -0
- package/scripts/sw-github-checks.sh +1 -1
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-guild.sh +569 -0
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-hygiene.sh +559 -0
- package/scripts/sw-incident.sh +617 -0
- package/scripts/sw-init.sh +88 -1
- package/scripts/sw-instrument.sh +699 -0
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +363 -28
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +64 -3
- package/scripts/sw-memory.sh +1 -1
- package/scripts/sw-mission-control.sh +487 -0
- package/scripts/sw-model-router.sh +545 -0
- package/scripts/sw-otel.sh +596 -0
- package/scripts/sw-oversight.sh +689 -0
- package/scripts/sw-pipeline-composer.sh +1 -1
- package/scripts/sw-pipeline-vitals.sh +1 -1
- package/scripts/sw-pipeline.sh +687 -24
- package/scripts/sw-pm.sh +693 -0
- package/scripts/sw-pr-lifecycle.sh +522 -0
- package/scripts/sw-predictive.sh +1 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +1 -1
- package/scripts/sw-public-dashboard.sh +798 -0
- package/scripts/sw-quality.sh +595 -0
- package/scripts/sw-reaper.sh +1 -1
- package/scripts/sw-recruit.sh +573 -0
- package/scripts/sw-regression.sh +642 -0
- package/scripts/sw-release-manager.sh +736 -0
- package/scripts/sw-release.sh +706 -0
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-replay.sh +520 -0
- package/scripts/sw-retro.sh +691 -0
- package/scripts/sw-scale.sh +444 -0
- package/scripts/sw-security-audit.sh +505 -0
- package/scripts/sw-self-optimize.sh +1 -1
- package/scripts/sw-session.sh +1 -1
- package/scripts/sw-setup.sh +1 -1
- package/scripts/sw-standup.sh +712 -0
- package/scripts/sw-status.sh +1 -1
- package/scripts/sw-strategic.sh +658 -0
- package/scripts/sw-stream.sh +450 -0
- package/scripts/sw-swarm.sh +583 -0
- package/scripts/sw-team-stages.sh +511 -0
- package/scripts/sw-templates.sh +1 -1
- package/scripts/sw-testgen.sh +515 -0
- package/scripts/sw-tmux-pipeline.sh +554 -0
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-trace.sh +485 -0
- package/scripts/sw-tracker-github.sh +188 -0
- package/scripts/sw-tracker-jira.sh +172 -0
- package/scripts/sw-tracker-linear.sh +251 -0
- package/scripts/sw-tracker.sh +117 -2
- package/scripts/sw-triage.sh +603 -0
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-ux.sh +677 -0
- package/scripts/sw-webhook.sh +627 -0
- package/scripts/sw-widgets.sh +530 -0
- package/scripts/sw-worktree.sh +1 -1
package/scripts/sw-daemon.sh
CHANGED
|
@@ -6,7 +6,10 @@
|
|
|
6
6
|
set -euo pipefail
|
|
7
7
|
trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
# Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
|
|
10
|
+
unset CLAUDECODE 2>/dev/null || true
|
|
11
|
+
|
|
12
|
+
VERSION="2.0.0"
|
|
10
13
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
11
14
|
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
12
15
|
|
|
@@ -478,9 +481,11 @@ load_config() {
|
|
|
478
481
|
|
|
479
482
|
# progress-based health monitoring (replaces static timeouts)
|
|
480
483
|
PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
|
|
481
|
-
PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn //
|
|
482
|
-
PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill //
|
|
483
|
-
PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s //
|
|
484
|
+
PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
|
|
485
|
+
PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
|
|
486
|
+
PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
|
|
487
|
+
NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
|
|
488
|
+
NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
|
|
484
489
|
|
|
485
490
|
# team dashboard URL (for coordinated claiming)
|
|
486
491
|
local cfg_dashboard_url
|
|
@@ -836,6 +841,31 @@ daemon_assess_progress() {
|
|
|
836
841
|
has_progress=true
|
|
837
842
|
fi
|
|
838
843
|
|
|
844
|
+
# Claude subprocess is alive and consuming CPU — agent is thinking/working
|
|
845
|
+
# During build stage, Claude can spend 10+ minutes thinking before any
|
|
846
|
+
# visible git changes appear. Detect this as progress.
|
|
847
|
+
if [[ "$has_progress" != "true" ]]; then
|
|
848
|
+
local _pid_for_check
|
|
849
|
+
_pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
|
|
850
|
+
if [[ -z "$_pid_for_check" ]]; then
|
|
851
|
+
# Fallback: get PID from active_jobs
|
|
852
|
+
_pid_for_check=$(jq -r --argjson num "$issue_num" \
|
|
853
|
+
'.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
|
|
854
|
+
fi
|
|
855
|
+
if [[ -n "$_pid_for_check" ]]; then
|
|
856
|
+
# Check if any child process (claude) is alive and using CPU
|
|
857
|
+
local child_cpu=0
|
|
858
|
+
child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
|
|
859
|
+
if [[ "$child_cpu" -eq 0 ]]; then
|
|
860
|
+
# Check children of the pipeline process
|
|
861
|
+
child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
862
|
+
fi
|
|
863
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
864
|
+
has_progress=true
|
|
865
|
+
fi
|
|
866
|
+
fi
|
|
867
|
+
fi
|
|
868
|
+
|
|
839
869
|
# Detect repeated errors (same error signature hitting again)
|
|
840
870
|
local repeated_errors="$prev_repeated_errors"
|
|
841
871
|
if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
|
|
@@ -1208,6 +1238,74 @@ gh_record_failure() {
|
|
|
1208
1238
|
fi
|
|
1209
1239
|
}
|
|
1210
1240
|
|
|
1241
|
+
# ─── Runtime Auth Check ──────────────────────────────────────────────────────
|
|
1242
|
+
|
|
1243
|
+
# Epoch (seconds) of the most recent runtime auth check; 0 = never checked.
LAST_AUTH_CHECK_EPOCH=0
# Minimum number of seconds between two runtime auth checks.
AUTH_CHECK_INTERVAL=300  # 5 minutes

# Write the pause flag and announce an auth-triggered auto-pause.
#   $1 - pause reason; stored in the flag file and sent with the event
# Globals: PAUSE_FLAG (read). Calls now_iso and emit_event.
_daemon_auth_auto_pause() {
    local reason="$1"
    local payload staging
    payload=$(jq -n --arg reason "$reason" --arg ts "$(now_iso)" \
        '{reason: $reason, timestamp: $ts}')
    # Stage the file next to the flag so the final mv is a same-filesystem
    # rename and readers never observe a half-written flag.
    staging=$(mktemp "${PAUSE_FLAG}.XXXXXX")
    printf '%s\n' "$payload" > "$staging"
    mv "$staging" "$PAUSE_FLAG"
    emit_event "daemon.auto_pause" "reason=$reason"
}

# Periodically verify that the GitHub and Claude CLIs are still usable.
#
# The check is throttled: it does nothing (and returns 0) when fewer than
# AUTH_CHECK_INTERVAL seconds have passed since the previous check.
#
# Globals: LAST_AUTH_CHECK_EPOCH (read/written), AUTH_CHECK_INTERVAL (read),
#          NO_GITHUB (read, defaults to "false"), PAUSE_FLAG (written on failure)
# Returns: 0 when the check was skipped or both checks passed;
#          1 after writing the pause flag because a check failed.
daemon_preflight_auth_check() {
    local now_e
    now_e=$(now_epoch)
    if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
        return 0
    fi
    # Recorded before the probes run, so a failing check is also throttled.
    LAST_AUTH_CHECK_EPOCH="$now_e"

    # gh auth check (skipped entirely when GitHub integration is disabled)
    if [[ "${NO_GITHUB:-false}" != "true" ]]; then
        if ! gh auth status >/dev/null 2>&1; then
            daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
            _daemon_auth_auto_pause "gh_auth_failure"
            return 1
        fi
    fi

    # claude auth check with 15s timeout (macOS has no timeout command)
    local timeout_s=15
    local probe_out probe_pid
    local waited=0
    probe_out=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
    # Launch the CLI directly (no wrapper subshell) so that $! is the PID of
    # the probe itself and the kill below actually terminates it on timeout.
    claude --print -p "ok" --max-turns 1 > "$probe_out" 2>/dev/null &
    probe_pid=$!
    while kill -0 "$probe_pid" 2>/dev/null && [[ "$waited" -lt "$timeout_s" ]]; do
        sleep 1
        waited=$((waited + 1))
    done
    if kill -0 "$probe_pid" 2>/dev/null; then
        kill "$probe_pid" 2>/dev/null || true
    fi
    # Reap the probe either way; its exit status is deliberately ignored.
    wait "$probe_pid" 2>/dev/null || true

    # The probe counts as successful only if it produced some output.
    local claude_auth_ok=false
    if [[ -s "$probe_out" ]]; then
        claude_auth_ok=true
    fi
    rm -f "$probe_out"

    if [[ "$claude_auth_ok" != "true" ]]; then
        daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
        _daemon_auth_auto_pause "claude_auth_failure"
        return 1
    fi

    return 0
}
|
|
1308
|
+
|
|
1211
1309
|
# ─── Pre-flight Checks ──────────────────────────────────────────────────────
|
|
1212
1310
|
|
|
1213
1311
|
preflight_checks() {
|
|
@@ -1609,9 +1707,24 @@ daemon_spawn_pipeline() {
|
|
|
1609
1707
|
local issue_num="$1"
|
|
1610
1708
|
local issue_title="${2:-}"
|
|
1611
1709
|
local repo_full_name="${3:-}" # owner/repo (org mode only)
|
|
1710
|
+
shift 3 2>/dev/null || true
|
|
1711
|
+
local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
|
|
1612
1712
|
|
|
1613
1713
|
daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
|
|
1614
1714
|
|
|
1715
|
+
# ── Issue decomposition (if decomposer available) ──
|
|
1716
|
+
local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
|
|
1717
|
+
if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
|
|
1718
|
+
local decompose_result=""
|
|
1719
|
+
decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
|
|
1720
|
+
if [[ "$decompose_result" == *"decomposed"* ]]; then
|
|
1721
|
+
daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
|
|
1722
|
+
# Remove the shipwright label so decomposed parent doesn't re-queue
|
|
1723
|
+
gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
|
|
1724
|
+
return 0
|
|
1725
|
+
fi
|
|
1726
|
+
fi
|
|
1727
|
+
|
|
1615
1728
|
# Extract goal text from issue (title + first line of body)
|
|
1616
1729
|
local issue_goal="$issue_title"
|
|
1617
1730
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
@@ -1727,11 +1840,18 @@ daemon_spawn_pipeline() {
|
|
|
1727
1840
|
pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
|
|
1728
1841
|
fi
|
|
1729
1842
|
|
|
1843
|
+
# Append any extra pipeline args (from retry escalation, etc.)
|
|
1844
|
+
if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
|
|
1845
|
+
pipeline_args+=("${extra_pipeline_args[@]}")
|
|
1846
|
+
fi
|
|
1847
|
+
|
|
1730
1848
|
# Run pipeline in work directory (background)
|
|
1849
|
+
# Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
|
|
1731
1850
|
echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
|
|
1732
1851
|
(
|
|
1852
|
+
trap '' HUP
|
|
1733
1853
|
cd "$work_dir"
|
|
1734
|
-
"$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
|
|
1854
|
+
exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
|
|
1735
1855
|
) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
|
|
1736
1856
|
local pid=$!
|
|
1737
1857
|
|
|
@@ -1904,15 +2024,18 @@ daemon_reap_completed() {
|
|
|
1904
2024
|
reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
|
|
1905
2025
|
release_claim "$issue_num" "$reap_machine_name"
|
|
1906
2026
|
|
|
1907
|
-
#
|
|
2027
|
+
# Always remove the OLD job entry from active_jobs to prevent
|
|
2028
|
+
# re-reaping of the dead PID on the next cycle. When a retry was
|
|
2029
|
+
# spawned, daemon_spawn_pipeline already added a fresh entry with
|
|
2030
|
+
# the new PID — we must not leave the stale one behind.
|
|
2031
|
+
locked_state_update --argjson num "$issue_num" \
|
|
2032
|
+
--argjson old_pid "${pid:-0}" \
|
|
2033
|
+
'.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
|
|
2034
|
+
untrack_priority_job "$issue_num"
|
|
2035
|
+
|
|
1908
2036
|
if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
|
|
1909
2037
|
daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
|
|
1910
2038
|
else
|
|
1911
|
-
# Remove from active_jobs and priority lane tracking (locked)
|
|
1912
|
-
locked_state_update --argjson num "$issue_num" \
|
|
1913
|
-
'.active_jobs = [.active_jobs[] | select(.issue != $num)]'
|
|
1914
|
-
untrack_priority_job "$issue_num"
|
|
1915
|
-
|
|
1916
2039
|
# Clean up worktree (skip for org-mode clones — they persist)
|
|
1917
2040
|
local job_repo
|
|
1918
2041
|
job_repo=$(echo "$job" | jq -r '.repo // ""')
|
|
@@ -1951,6 +2074,9 @@ daemon_reap_completed() {
|
|
|
1951
2074
|
daemon_on_success() {
|
|
1952
2075
|
local issue_num="$1" duration="${2:-}"
|
|
1953
2076
|
|
|
2077
|
+
# Reset consecutive failure tracking on any success
|
|
2078
|
+
reset_failure_tracking
|
|
2079
|
+
|
|
1954
2080
|
daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
|
|
1955
2081
|
|
|
1956
2082
|
# Record pipeline duration for adaptive threshold learning
|
|
@@ -2011,6 +2137,91 @@ Check the associated PR for the implementation." 2>/dev/null || true
|
|
|
2011
2137
|
"$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
|
|
2012
2138
|
}
|
|
2013
2139
|
|
|
2140
|
+
# ─── Failure Classification ─────────────────────────────────────────────────
|
|
2141
|
+
|
|
2142
|
+
# Classify why a pipeline run failed by inspecting the tail of its log and,
# for context exhaustion, the loop progress file in the issue's worktree.
#
# Arguments: $1 - issue number (selects $LOG_DIR/issue-<n>.log)
# Globals:   LOG_DIR, WORKTREE_DIR, REPO_DIR (all read)
# Outputs:   exactly one class name on stdout, checked in this order:
#            auth_error, api_error, invalid_issue, context_exhaustion,
#            build_failure; "unknown" when nothing matches or no log exists.
classify_failure() {
    local issue_num="$1"
    if [[ -z "${LOG_DIR:-}" ]]; then
        echo "unknown"
        return
    fi
    local log_path="$LOG_DIR/issue-${issue_num}.log"
    if [[ ! -f "$log_path" ]]; then
        echo "unknown"
        return
    fi
    local tail_content
    tail_content=$(tail -n 200 "$log_path" 2>/dev/null || true)

    # NOTE: patterns are fed through here-strings rather than `echo | grep -q`.
    # With `set -o pipefail`, grep -q exiting on its first match can SIGPIPE
    # the writer and make the whole pipeline report failure despite a match.

    # Auth errors
    if grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required' <<<"$tail_content"; then
        echo "auth_error"
        return
    fi
    # API errors (rate limits, timeouts, server errors)
    if grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable' <<<"$tail_content"; then
        echo "api_error"
        return
    fi
    # Invalid issue (not found, empty body)
    if grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist' <<<"$tail_content"; then
        echo "invalid_issue"
        return
    fi
    # Context exhaustion — check progress file
    local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
    local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
    if [[ -f "$progress_file" ]]; then
        local cf_iter
        cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -n 1 | grep -oE '[0-9]+' || echo "0")
        if ! [[ "${cf_iter:-0}" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
        # Use only the most recent "Tests passing" entry (mirrors cf_iter);
        # collecting every match would yield a multi-line value that never
        # compares equal to "false" or "unknown".
        local cf_tests
        cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | tail -n 1 | awk '{print $NF}' || true)
        if [[ -z "$cf_tests" ]]; then cf_tests="unknown"; fi
        # At least one iteration ran and tests are not known to pass.
        if [[ "${cf_iter:-0}" -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
            echo "context_exhaustion"
            return
        fi
    fi
    # Build failure (test errors, compile errors)
    if grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]' <<<"$tail_content"; then
        echo "build_failure"
        return
    fi
    echo "unknown"
}
|
|
2192
|
+
|
|
2193
|
+
# ─── Consecutive Failure Tracking ──────────────────────────────────────────
|
|
2194
|
+
|
|
2195
|
+
# Failure class of the current streak ("" when there is no streak).
DAEMON_CONSECUTIVE_FAILURE_CLASS=""
# Number of back-to-back failures sharing that class.
DAEMON_CONSECUTIVE_FAILURE_COUNT=0

# Record one pipeline failure and auto-pause the daemon once the same class
# has failed three or more times in a row.
#
# Arguments: $1 - failure class of the run that just failed
# Globals:   DAEMON_CONSECUTIVE_FAILURE_CLASS / _COUNT (read/written),
#            PAUSE_FLAG (written when the threshold is reached)
record_failure_class() {
    local failure_class="$1"
    local pause_threshold=3

    if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
        DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
    else
        # A different class starts a new streak.
        DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
        DAEMON_CONSECUTIVE_FAILURE_COUNT=1
    fi

    if [[ "$DAEMON_CONSECUTIVE_FAILURE_COUNT" -lt "$pause_threshold" ]]; then
        return 0
    fi

    # Report the real streak length: this branch also runs for counts above
    # the threshold, so a hard-coded "3" would be misleading.
    daemon_log ERROR "${DAEMON_CONSECUTIVE_FAILURE_COUNT} consecutive failures (class: ${failure_class}) — auto-pausing daemon"
    local pause_json
    pause_json=$(jq -n --arg reason "consecutive_${failure_class}" --arg ts "$(now_iso)" \
        '{reason: $reason, timestamp: $ts}')
    # Stage the file next to the flag so the final mv is a same-filesystem
    # rename and readers never observe a half-written flag.
    local _tmp_pause
    _tmp_pause=$(mktemp "${PAUSE_FLAG}.XXXXXX")
    printf '%s\n' "$pause_json" > "$_tmp_pause"
    mv "$_tmp_pause" "$PAUSE_FLAG"
    emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$DAEMON_CONSECUTIVE_FAILURE_COUNT"
}

# Clear the streak state so the next failure starts counting from one.
reset_failure_tracking() {
    DAEMON_CONSECUTIVE_FAILURE_CLASS=""
    DAEMON_CONSECUTIVE_FAILURE_COUNT=0
}
|
|
2224
|
+
|
|
2014
2225
|
# ─── Failure Handler ────────────────────────────────────────────────────────
|
|
2015
2226
|
|
|
2016
2227
|
daemon_on_failure() {
|
|
@@ -2047,123 +2258,143 @@ daemon_on_failure() {
|
|
|
2047
2258
|
completed_at: $completed_at
|
|
2048
2259
|
}] | .completed = .completed[-500:]'
|
|
2049
2260
|
|
|
2261
|
+
# ── Classify failure and decide retry strategy ──
|
|
2262
|
+
local failure_class
|
|
2263
|
+
failure_class=$(classify_failure "$issue_num")
|
|
2264
|
+
daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
|
|
2265
|
+
emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
|
|
2266
|
+
record_failure_class "$failure_class"
|
|
2267
|
+
|
|
2050
2268
|
# ── Auto-retry with strategy escalation ──
|
|
2051
2269
|
if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
|
|
2052
2270
|
local retry_count
|
|
2053
2271
|
retry_count=$(jq -r --arg num "$issue_num" \
|
|
2054
2272
|
'.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
|
|
2055
2273
|
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
|
|
2065
|
-
emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
|
|
2066
|
-
|
|
2067
|
-
# Check for checkpoint to enable resume-from-checkpoint
|
|
2068
|
-
local checkpoint_args=()
|
|
2069
|
-
if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
|
|
2070
|
-
# Try to find worktree for this issue to check for checkpoints
|
|
2071
|
-
local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
|
|
2072
|
-
if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
|
|
2073
|
-
local latest_checkpoint=""
|
|
2074
|
-
for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
|
|
2075
|
-
[[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
|
|
2076
|
-
done
|
|
2077
|
-
if [[ -n "$latest_checkpoint" ]]; then
|
|
2078
|
-
daemon_log INFO "Found checkpoint: $latest_checkpoint"
|
|
2079
|
-
emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
|
|
2080
|
-
checkpoint_args+=("--resume")
|
|
2081
|
-
fi
|
|
2274
|
+
# Non-retryable failures — skip retry entirely
|
|
2275
|
+
case "$failure_class" in
|
|
2276
|
+
auth_error)
|
|
2277
|
+
daemon_log ERROR "Auth error for issue #${issue_num} — skipping retry"
|
|
2278
|
+
emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
|
|
2279
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2280
|
+
gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
|
|
2082
2281
|
fi
|
|
2083
|
-
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
if [[ -f "$progress_file" ]]; then
|
|
2090
|
-
local progress_iter
|
|
2091
|
-
progress_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
|
|
2092
|
-
if ! [[ "${progress_iter:-0}" =~ ^[0-9]+$ ]]; then
|
|
2093
|
-
progress_iter="0"
|
|
2094
|
-
fi
|
|
2095
|
-
local progress_tests
|
|
2096
|
-
progress_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
|
|
2097
|
-
if [[ "${progress_iter:-0}" -gt 0 ]] && { [[ "$progress_tests" == "false" ]] || [[ "$progress_tests" == "unknown" ]]; }; then
|
|
2098
|
-
failure_reason="context_exhaustion"
|
|
2099
|
-
emit_event "daemon.context_exhaustion" "issue=$issue_num" "iterations=$progress_iter"
|
|
2100
|
-
daemon_log WARN "Context exhaustion detected for issue #${issue_num} (iterations: ${progress_iter})"
|
|
2282
|
+
;;
|
|
2283
|
+
invalid_issue)
|
|
2284
|
+
daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
|
|
2285
|
+
emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
|
|
2286
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2287
|
+
gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
|
|
2101
2288
|
fi
|
|
2102
|
-
|
|
2289
|
+
;;
|
|
2290
|
+
*)
|
|
2291
|
+
# Retryable failures — proceed with escalation
|
|
2292
|
+
if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
|
|
2293
|
+
retry_count=$((retry_count + 1))
|
|
2294
|
+
|
|
2295
|
+
# Update retry count in state (locked to prevent race)
|
|
2296
|
+
locked_state_update \
|
|
2297
|
+
--arg num "$issue_num" --argjson count "$retry_count" \
|
|
2298
|
+
'.retry_counts[$num] = $count'
|
|
2299
|
+
|
|
2300
|
+
daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num} (class: ${failure_class})"
|
|
2301
|
+
emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}" "class=$failure_class"
|
|
2302
|
+
|
|
2303
|
+
# Check for checkpoint to enable resume-from-checkpoint
|
|
2304
|
+
local checkpoint_args=()
|
|
2305
|
+
if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
|
|
2306
|
+
local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
|
|
2307
|
+
if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
|
|
2308
|
+
local latest_checkpoint=""
|
|
2309
|
+
for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
|
|
2310
|
+
[[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
|
|
2311
|
+
done
|
|
2312
|
+
if [[ -n "$latest_checkpoint" ]]; then
|
|
2313
|
+
daemon_log INFO "Found checkpoint: $latest_checkpoint"
|
|
2314
|
+
emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
|
|
2315
|
+
checkpoint_args+=("--resume")
|
|
2316
|
+
fi
|
|
2317
|
+
fi
|
|
2318
|
+
fi
|
|
2103
2319
|
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
daemon_log INFO "Escalation: template=full, compound_cycles=5"
|
|
2120
|
-
fi
|
|
2320
|
+
# Build escalated pipeline args
|
|
2321
|
+
local retry_template="$PIPELINE_TEMPLATE"
|
|
2322
|
+
local retry_model="${MODEL:-opus}"
|
|
2323
|
+
local extra_args=()
|
|
2324
|
+
|
|
2325
|
+
if [[ "$retry_count" -eq 1 ]]; then
|
|
2326
|
+
retry_model="opus"
|
|
2327
|
+
extra_args+=("--max-iterations" "30")
|
|
2328
|
+
daemon_log INFO "Escalation: model=opus, max_iterations=30"
|
|
2329
|
+
elif [[ "$retry_count" -ge 2 ]]; then
|
|
2330
|
+
retry_template="full"
|
|
2331
|
+
retry_model="opus"
|
|
2332
|
+
extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
|
|
2333
|
+
daemon_log INFO "Escalation: template=full, compound_cycles=5"
|
|
2334
|
+
fi
|
|
2121
2335
|
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
fi
|
|
2336
|
+
# Increase restarts on context exhaustion
|
|
2337
|
+
if [[ "$failure_class" == "context_exhaustion" ]]; then
|
|
2338
|
+
local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
|
|
2339
|
+
if [[ "$boosted_restarts" -gt 5 ]]; then
|
|
2340
|
+
boosted_restarts=5
|
|
2341
|
+
fi
|
|
2342
|
+
extra_args+=("--max-restarts" "$boosted_restarts")
|
|
2343
|
+
daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
|
|
2344
|
+
fi
|
|
2132
2345
|
|
|
2133
|
-
|
|
2134
|
-
|
|
2346
|
+
# API errors get extended backoff
|
|
2347
|
+
local api_backoff=300
|
|
2348
|
+
local backoff_secs=$((30 * retry_count))
|
|
2349
|
+
if [[ "$failure_class" == "api_error" ]]; then
|
|
2350
|
+
backoff_secs=$((api_backoff * retry_count))
|
|
2351
|
+
daemon_log INFO "API error — extended backoff ${backoff_secs}s"
|
|
2352
|
+
fi
|
|
2135
2353
|
|
|
2136
|
-
|
|
2354
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2355
|
+
gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
|
|
2356
|
+
|
|
2357
|
+
Pipeline failed (${failure_class}) — retrying with escalated strategy.
|
|
2137
2358
|
|
|
2138
2359
|
| Field | Value |
|
|
2139
2360
|
|-------|-------|
|
|
2140
2361
|
| Retry | ${retry_count} / ${MAX_RETRIES:-2} |
|
|
2362
|
+
| Failure | \`${failure_class}\` |
|
|
2141
2363
|
| Template | \`${retry_template}\` |
|
|
2142
2364
|
| Model | \`${retry_model}\` |
|
|
2143
2365
|
| Started | $(now_iso) |
|
|
2144
2366
|
|
|
2145
2367
|
_Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
|
|
2146
|
-
|
|
2368
|
+
fi
|
|
2147
2369
|
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
|
|
2151
|
-
sleep "$backoff_secs"
|
|
2152
|
-
|
|
2153
|
-
# Re-spawn with escalated strategy
|
|
2154
|
-
local orig_template="$PIPELINE_TEMPLATE"
|
|
2155
|
-
local orig_model="$MODEL"
|
|
2156
|
-
PIPELINE_TEMPLATE="$retry_template"
|
|
2157
|
-
MODEL="$retry_model"
|
|
2158
|
-
daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
|
|
2159
|
-
_retry_spawned_for="$issue_num"
|
|
2160
|
-
PIPELINE_TEMPLATE="$orig_template"
|
|
2161
|
-
MODEL="$orig_model"
|
|
2162
|
-
return
|
|
2163
|
-
fi
|
|
2370
|
+
daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
|
|
2371
|
+
sleep "$backoff_secs"
|
|
2164
2372
|
|
|
2165
|
-
|
|
2166
|
-
|
|
2373
|
+
# Merge checkpoint args + extra args for passthrough
|
|
2374
|
+
local all_extra_args=()
|
|
2375
|
+
if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
|
|
2376
|
+
all_extra_args+=("${checkpoint_args[@]}")
|
|
2377
|
+
fi
|
|
2378
|
+
if [[ ${#extra_args[@]} -gt 0 ]]; then
|
|
2379
|
+
all_extra_args+=("${extra_args[@]}")
|
|
2380
|
+
fi
|
|
2381
|
+
|
|
2382
|
+
# Re-spawn with escalated strategy
|
|
2383
|
+
local orig_template="$PIPELINE_TEMPLATE"
|
|
2384
|
+
local orig_model="$MODEL"
|
|
2385
|
+
PIPELINE_TEMPLATE="$retry_template"
|
|
2386
|
+
MODEL="$retry_model"
|
|
2387
|
+
daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
|
|
2388
|
+
_retry_spawned_for="$issue_num"
|
|
2389
|
+
PIPELINE_TEMPLATE="$orig_template"
|
|
2390
|
+
MODEL="$orig_model"
|
|
2391
|
+
return
|
|
2392
|
+
fi
|
|
2393
|
+
|
|
2394
|
+
daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
|
|
2395
|
+
emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
|
|
2396
|
+
;;
|
|
2397
|
+
esac
|
|
2167
2398
|
fi
|
|
2168
2399
|
|
|
2169
2400
|
# ── No retry — report final failure ──
|
|
@@ -3770,6 +4001,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
|
|
|
3770
4001
|
patrol_meta_run
|
|
3771
4002
|
fi
|
|
3772
4003
|
|
|
4004
|
+
# ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
|
|
4005
|
+
if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
|
|
4006
|
+
# shellcheck source=sw-strategic.sh
|
|
4007
|
+
source "$SCRIPT_DIR/sw-strategic.sh"
|
|
4008
|
+
strategic_patrol_run || true
|
|
4009
|
+
fi
|
|
4010
|
+
|
|
3773
4011
|
# ── Summary ──
|
|
3774
4012
|
emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
|
|
3775
4013
|
|
|
@@ -4095,13 +4333,15 @@ daemon_health_check() {
|
|
|
4095
4333
|
now_e=$(now_epoch)
|
|
4096
4334
|
|
|
4097
4335
|
if [[ -f "$STATE_FILE" ]]; then
|
|
4098
|
-
# ──
|
|
4099
|
-
# Instead of killing after a
|
|
4100
|
-
#
|
|
4101
|
-
#
|
|
4336
|
+
# ── Intelligent Health Monitoring ──
|
|
4337
|
+
# Instead of killing after a countdown, sense what the agent is doing.
|
|
4338
|
+
# Agents think for long stretches — that's normal and expected.
|
|
4339
|
+
# Strategy: sense → understand → be patient → nudge → only kill as last resort.
|
|
4102
4340
|
|
|
4103
|
-
local hard_limit="${PROGRESS_HARD_LIMIT_S:-
|
|
4341
|
+
local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
|
|
4104
4342
|
local use_progress="${PROGRESS_MONITORING:-true}"
|
|
4343
|
+
local nudge_enabled="${NUDGE_ENABLED:-true}"
|
|
4344
|
+
local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
|
|
4105
4345
|
|
|
4106
4346
|
while IFS= read -r job; do
|
|
4107
4347
|
local pid started_at issue_num worktree
|
|
@@ -4122,8 +4362,8 @@ daemon_health_check() {
|
|
|
4122
4362
|
elapsed=$(( now_e - start_e ))
|
|
4123
4363
|
fi
|
|
4124
4364
|
|
|
4125
|
-
# Hard wall-clock limit —
|
|
4126
|
-
if [[ "$elapsed" -gt "$hard_limit" ]]; then
|
|
4365
|
+
# Hard wall-clock limit — disabled by default (0 = off)
|
|
4366
|
+
if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
|
|
4127
4367
|
daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
|
|
4128
4368
|
emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
|
|
4129
4369
|
kill "$pid" 2>/dev/null || true
|
|
@@ -4132,7 +4372,7 @@ daemon_health_check() {
|
|
|
4132
4372
|
continue
|
|
4133
4373
|
fi
|
|
4134
4374
|
|
|
4135
|
-
# Progress
|
|
4375
|
+
# ── Intelligent Progress Sensing ──
|
|
4136
4376
|
if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
|
|
4137
4377
|
local snapshot verdict
|
|
4138
4378
|
snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
|
|
@@ -4140,29 +4380,87 @@ daemon_health_check() {
|
|
|
4140
4380
|
if [[ "$snapshot" != "{}" ]]; then
|
|
4141
4381
|
verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
|
|
4142
4382
|
|
|
4383
|
+
local no_progress_count=0
|
|
4384
|
+
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4385
|
+
local cur_stage
|
|
4386
|
+
cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
|
|
4387
|
+
|
|
4143
4388
|
case "$verdict" in
|
|
4144
4389
|
healthy)
|
|
4145
4390
|
# All good — agent is making progress
|
|
4146
4391
|
;;
|
|
4147
4392
|
slowing)
|
|
4148
|
-
daemon_log INFO "Issue #${issue_num} slowing (no
|
|
4393
|
+
daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
|
|
4149
4394
|
;;
|
|
4150
4395
|
stalled)
|
|
4151
|
-
|
|
4152
|
-
|
|
4153
|
-
|
|
4154
|
-
|
|
4396
|
+
# Check if agent subprocess is alive and consuming CPU
|
|
4397
|
+
local agent_alive=false
|
|
4398
|
+
local child_cpu=0
|
|
4399
|
+
child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
4400
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
4401
|
+
agent_alive=true
|
|
4402
|
+
fi
|
|
4403
|
+
|
|
4404
|
+
if [[ "$agent_alive" == "true" ]]; then
|
|
4405
|
+
daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
|
|
4406
|
+
else
|
|
4407
|
+
daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
|
|
4408
|
+
emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
|
|
4409
|
+
fi
|
|
4155
4410
|
;;
|
|
4156
4411
|
stuck)
|
|
4157
|
-
local
|
|
4158
|
-
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4412
|
+
local repeated_errors
|
|
4159
4413
|
repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4160
|
-
|
|
4161
|
-
|
|
4162
|
-
|
|
4163
|
-
|
|
4164
|
-
|
|
4165
|
-
|
|
4414
|
+
|
|
4415
|
+
# Even "stuck" — check if the process tree is alive first
|
|
4416
|
+
local agent_alive=false
|
|
4417
|
+
local child_cpu=0
|
|
4418
|
+
child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
4419
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
4420
|
+
agent_alive=true
|
|
4421
|
+
fi
|
|
4422
|
+
|
|
4423
|
+
if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
|
|
4424
|
+
# Agent is alive — nudge instead of kill
|
|
4425
|
+
if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
|
|
4426
|
+
local nudge_file="${worktree}/.claude/nudge.md"
|
|
4427
|
+
if [[ ! -f "$nudge_file" ]]; then
|
|
4428
|
+
cat > "$nudge_file" <<NUDGE_EOF
|
|
4429
|
+
# Nudge from Daemon Health Monitor
|
|
4430
|
+
|
|
4431
|
+
The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
|
|
4432
|
+
Current stage: ${cur_stage}
|
|
4433
|
+
|
|
4434
|
+
If you're stuck, consider:
|
|
4435
|
+
- Breaking the task into smaller steps
|
|
4436
|
+
- Committing partial progress
|
|
4437
|
+
- Running tests to validate current state
|
|
4438
|
+
|
|
4439
|
+
This is just a gentle check-in — take your time if you're working through a complex problem.
|
|
4440
|
+
NUDGE_EOF
|
|
4441
|
+
daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
|
|
4442
|
+
emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
|
|
4443
|
+
fi
|
|
4444
|
+
else
|
|
4445
|
+
daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
|
|
4446
|
+
fi
|
|
4447
|
+
elif [[ "$repeated_errors" -ge 5 ]]; then
|
|
4448
|
+
# Truly stuck in an error loop — kill as last resort
|
|
4449
|
+
daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
|
|
4450
|
+
emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
|
|
4451
|
+
kill "$pid" 2>/dev/null || true
|
|
4452
|
+
daemon_clear_progress "$issue_num"
|
|
4453
|
+
findings=$((findings + 1))
|
|
4454
|
+
elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
|
|
4455
|
+
# Process tree is dead AND no progress for very long time
|
|
4456
|
+
daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
|
|
4457
|
+
emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
|
|
4458
|
+
kill "$pid" 2>/dev/null || true
|
|
4459
|
+
daemon_clear_progress "$issue_num"
|
|
4460
|
+
findings=$((findings + 1))
|
|
4461
|
+
else
|
|
4462
|
+
daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
|
|
4463
|
+
fi
|
|
4166
4464
|
;;
|
|
4167
4465
|
esac
|
|
4168
4466
|
fi
|
|
@@ -4171,8 +4469,9 @@ daemon_health_check() {
|
|
|
4171
4469
|
local stale_timeout
|
|
4172
4470
|
stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
|
|
4173
4471
|
if [[ "$elapsed" -gt "$stale_timeout" ]]; then
|
|
4174
|
-
daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)
|
|
4175
|
-
kill
|
|
4472
|
+
daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
|
|
4473
|
+
# Don't kill — just log. Let the process run.
|
|
4474
|
+
emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
|
|
4176
4475
|
findings=$((findings + 1))
|
|
4177
4476
|
fi
|
|
4178
4477
|
fi
|
|
@@ -4765,6 +5064,7 @@ daemon_poll_loop() {
|
|
|
4765
5064
|
# All poll loop calls are error-guarded to prevent set -e from killing the daemon.
|
|
4766
5065
|
# The || operator disables set -e for the entire call chain, so transient failures
|
|
4767
5066
|
# (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
|
|
5067
|
+
daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
|
|
4768
5068
|
daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
|
|
4769
5069
|
daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
|
|
4770
5070
|
daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
|
|
@@ -4848,7 +5148,8 @@ cleanup_on_exit() {
|
|
|
4848
5148
|
while IFS= read -r cpid; do
|
|
4849
5149
|
[[ -z "$cpid" ]] && continue
|
|
4850
5150
|
if kill -0 "$cpid" 2>/dev/null; then
|
|
4851
|
-
daemon_log INFO "Killing pipeline process PID ${cpid}"
|
|
5151
|
+
daemon_log INFO "Killing pipeline process tree PID ${cpid}"
|
|
5152
|
+
pkill -TERM -P "$cpid" 2>/dev/null || true
|
|
4852
5153
|
kill "$cpid" 2>/dev/null || true
|
|
4853
5154
|
killed=$((killed + 1))
|
|
4854
5155
|
fi
|
|
@@ -4860,7 +5161,8 @@ cleanup_on_exit() {
|
|
|
4860
5161
|
while IFS= read -r cpid; do
|
|
4861
5162
|
[[ -z "$cpid" ]] && continue
|
|
4862
5163
|
if kill -0 "$cpid" 2>/dev/null; then
|
|
4863
|
-
daemon_log WARN "Force-killing pipeline PID ${cpid}"
|
|
5164
|
+
daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
|
|
5165
|
+
pkill -9 -P "$cpid" 2>/dev/null || true
|
|
4864
5166
|
kill -9 "$cpid" 2>/dev/null || true
|
|
4865
5167
|
fi
|
|
4866
5168
|
done <<< "$child_pids"
|