shipwright-cli 1.9.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/hooks/post-tool-use.sh +12 -5
- package/README.md +114 -36
- package/completions/_shipwright +212 -32
- package/completions/shipwright.bash +97 -25
- package/docs/strategy/01-market-research.md +619 -0
- package/docs/strategy/02-mission-and-brand.md +587 -0
- package/docs/strategy/03-gtm-and-roadmap.md +759 -0
- package/docs/strategy/QUICK-START.txt +289 -0
- package/docs/strategy/README.md +172 -0
- package/package.json +4 -2
- package/scripts/sw +217 -2
- package/scripts/sw-activity.sh +500 -0
- package/scripts/sw-adaptive.sh +925 -0
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-auth.sh +613 -0
- package/scripts/sw-autonomous.sh +664 -0
- package/scripts/sw-changelog.sh +704 -0
- package/scripts/sw-checkpoint.sh +79 -1
- package/scripts/sw-ci.sh +602 -0
- package/scripts/sw-cleanup.sh +192 -7
- package/scripts/sw-code-review.sh +637 -0
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-context.sh +605 -0
- package/scripts/sw-cost.sh +1 -1
- package/scripts/sw-daemon.sh +812 -138
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-db.sh +540 -0
- package/scripts/sw-decompose.sh +539 -0
- package/scripts/sw-deps.sh +551 -0
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-discovery.sh +412 -0
- package/scripts/sw-docs-agent.sh +539 -0
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +59 -1
- package/scripts/sw-dora.sh +615 -0
- package/scripts/sw-durable.sh +710 -0
- package/scripts/sw-e2e-orchestrator.sh +535 -0
- package/scripts/sw-eventbus.sh +393 -0
- package/scripts/sw-feedback.sh +471 -0
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet-discover.sh +567 -0
- package/scripts/sw-fleet-viz.sh +404 -0
- package/scripts/sw-fleet.sh +8 -1
- package/scripts/sw-github-app.sh +596 -0
- package/scripts/sw-github-checks.sh +1 -1
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-guild.sh +569 -0
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-hygiene.sh +559 -0
- package/scripts/sw-incident.sh +617 -0
- package/scripts/sw-init.sh +88 -1
- package/scripts/sw-instrument.sh +699 -0
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +366 -31
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +507 -51
- package/scripts/sw-memory.sh +198 -3
- package/scripts/sw-mission-control.sh +487 -0
- package/scripts/sw-model-router.sh +545 -0
- package/scripts/sw-otel.sh +596 -0
- package/scripts/sw-oversight.sh +689 -0
- package/scripts/sw-pipeline-composer.sh +8 -8
- package/scripts/sw-pipeline-vitals.sh +1096 -0
- package/scripts/sw-pipeline.sh +2451 -180
- package/scripts/sw-pm.sh +693 -0
- package/scripts/sw-pr-lifecycle.sh +522 -0
- package/scripts/sw-predictive.sh +1 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +4 -3
- package/scripts/sw-public-dashboard.sh +798 -0
- package/scripts/sw-quality.sh +595 -0
- package/scripts/sw-reaper.sh +5 -3
- package/scripts/sw-recruit.sh +573 -0
- package/scripts/sw-regression.sh +642 -0
- package/scripts/sw-release-manager.sh +736 -0
- package/scripts/sw-release.sh +706 -0
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-replay.sh +520 -0
- package/scripts/sw-retro.sh +691 -0
- package/scripts/sw-scale.sh +444 -0
- package/scripts/sw-security-audit.sh +505 -0
- package/scripts/sw-self-optimize.sh +109 -8
- package/scripts/sw-session.sh +31 -9
- package/scripts/sw-setup.sh +1 -1
- package/scripts/sw-standup.sh +712 -0
- package/scripts/sw-status.sh +192 -1
- package/scripts/sw-strategic.sh +658 -0
- package/scripts/sw-stream.sh +450 -0
- package/scripts/sw-swarm.sh +583 -0
- package/scripts/sw-team-stages.sh +511 -0
- package/scripts/sw-templates.sh +1 -1
- package/scripts/sw-testgen.sh +515 -0
- package/scripts/sw-tmux-pipeline.sh +554 -0
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-trace.sh +485 -0
- package/scripts/sw-tracker-github.sh +188 -0
- package/scripts/sw-tracker-jira.sh +172 -0
- package/scripts/sw-tracker-linear.sh +251 -0
- package/scripts/sw-tracker.sh +117 -2
- package/scripts/sw-triage.sh +603 -0
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-ux.sh +677 -0
- package/scripts/sw-webhook.sh +627 -0
- package/scripts/sw-widgets.sh +530 -0
- package/scripts/sw-worktree.sh +1 -1
- package/templates/pipelines/autonomous.json +8 -1
- package/templates/pipelines/cost-aware.json +21 -0
- package/templates/pipelines/deployed.json +40 -6
- package/templates/pipelines/enterprise.json +16 -2
- package/templates/pipelines/fast.json +19 -0
- package/templates/pipelines/full.json +16 -2
- package/templates/pipelines/hotfix.json +19 -0
- package/templates/pipelines/standard.json +19 -0
package/scripts/sw-daemon.sh
CHANGED
|
@@ -6,7 +6,10 @@
|
|
|
6
6
|
set -euo pipefail
|
|
7
7
|
trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
# Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
|
|
10
|
+
unset CLAUDECODE 2>/dev/null || true
|
|
11
|
+
|
|
12
|
+
VERSION="2.0.0"
|
|
10
13
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
11
14
|
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
12
15
|
|
|
@@ -34,6 +37,8 @@ RESET='\033[0m'
|
|
|
34
37
|
[[ -f "$SCRIPT_DIR/sw-self-optimize.sh" ]] && source "$SCRIPT_DIR/sw-self-optimize.sh"
|
|
35
38
|
# shellcheck source=sw-predictive.sh
|
|
36
39
|
[[ -f "$SCRIPT_DIR/sw-predictive.sh" ]] && source "$SCRIPT_DIR/sw-predictive.sh"
|
|
40
|
+
# shellcheck source=sw-pipeline-vitals.sh
|
|
41
|
+
[[ -f "$SCRIPT_DIR/sw-pipeline-vitals.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-vitals.sh"
|
|
37
42
|
|
|
38
43
|
# ─── GitHub API Modules (optional) ────────────────────────────────────────
|
|
39
44
|
# shellcheck source=sw-github-graphql.sh
|
|
@@ -125,7 +130,6 @@ rotate_event_log() {
|
|
|
125
130
|
}
|
|
126
131
|
|
|
127
132
|
# ─── GitHub Context (loaded once at startup) ──────────────────────────────
|
|
128
|
-
DAEMON_GITHUB_CONTEXT=""
|
|
129
133
|
|
|
130
134
|
daemon_github_context() {
|
|
131
135
|
# Skip if no GitHub
|
|
@@ -141,8 +145,6 @@ daemon_github_context() {
|
|
|
141
145
|
context=$(gh_repo_context "$owner" "$repo" 2>/dev/null || echo "{}")
|
|
142
146
|
if [[ -n "$context" && "$context" != "{}" ]]; then
|
|
143
147
|
daemon_log INFO "GitHub context loaded: $(echo "$context" | jq -r '.contributor_count // 0') contributors, $(echo "$context" | jq -r '.security_alert_count // 0') security alerts"
|
|
144
|
-
DAEMON_GITHUB_CONTEXT="$context"
|
|
145
|
-
export DAEMON_GITHUB_CONTEXT
|
|
146
148
|
fi
|
|
147
149
|
}
|
|
148
150
|
|
|
@@ -166,9 +168,9 @@ gh_retry() {
|
|
|
166
168
|
|
|
167
169
|
# Check for rate-limit or server error indicators
|
|
168
170
|
if echo "$output" | grep -qiE "rate limit|403|429|502|503"; then
|
|
169
|
-
daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s"
|
|
171
|
+
daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s" >&2
|
|
170
172
|
else
|
|
171
|
-
daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s"
|
|
173
|
+
daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s" >&2
|
|
172
174
|
fi
|
|
173
175
|
|
|
174
176
|
if [[ $attempt -lt $max_retries ]]; then
|
|
@@ -421,6 +423,14 @@ load_config() {
|
|
|
421
423
|
MAX_RETRIES=$(jq -r '.max_retries // 2' "$config_file")
|
|
422
424
|
RETRY_ESCALATION=$(jq -r '.retry_escalation // true' "$config_file")
|
|
423
425
|
|
|
426
|
+
# session restart + fast test passthrough
|
|
427
|
+
MAX_RESTARTS_CFG=$(jq -r '.max_restarts // 3' "$config_file" 2>/dev/null || echo "3")
|
|
428
|
+
if ! [[ "$MAX_RESTARTS_CFG" =~ ^[0-9]+$ ]]; then
|
|
429
|
+
daemon_log WARN "Invalid max_restarts in config: $MAX_RESTARTS_CFG (using default: 3)"
|
|
430
|
+
MAX_RESTARTS_CFG="3"
|
|
431
|
+
fi
|
|
432
|
+
FAST_TEST_CMD_CFG=$(jq -r '.fast_test_cmd // ""' "$config_file" 2>/dev/null || echo "")
|
|
433
|
+
|
|
424
434
|
# self-optimization
|
|
425
435
|
SELF_OPTIMIZE=$(jq -r '.self_optimize // false' "$config_file")
|
|
426
436
|
OPTIMIZE_INTERVAL=$(jq -r '.optimize_interval // 10' "$config_file")
|
|
@@ -471,9 +481,11 @@ load_config() {
|
|
|
471
481
|
|
|
472
482
|
# progress-based health monitoring (replaces static timeouts)
|
|
473
483
|
PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
|
|
474
|
-
PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn //
|
|
475
|
-
PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill //
|
|
476
|
-
PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s //
|
|
484
|
+
PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
|
|
485
|
+
PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
|
|
486
|
+
PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
|
|
487
|
+
NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
|
|
488
|
+
NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
|
|
477
489
|
|
|
478
490
|
# team dashboard URL (for coordinated claiming)
|
|
479
491
|
local cfg_dashboard_url
|
|
@@ -482,6 +494,12 @@ load_config() {
|
|
|
482
494
|
DASHBOARD_URL="$cfg_dashboard_url"
|
|
483
495
|
fi
|
|
484
496
|
|
|
497
|
+
# Auto-enable self_optimize when auto_template is on
|
|
498
|
+
if [[ "${AUTO_TEMPLATE:-false}" == "true" && "${SELF_OPTIMIZE:-false}" == "false" ]]; then
|
|
499
|
+
SELF_OPTIMIZE="true"
|
|
500
|
+
daemon_log INFO "Auto-enabling self_optimize (auto_template is true)"
|
|
501
|
+
fi
|
|
502
|
+
|
|
485
503
|
success "Config loaded"
|
|
486
504
|
}
|
|
487
505
|
|
|
@@ -823,6 +841,31 @@ daemon_assess_progress() {
|
|
|
823
841
|
has_progress=true
|
|
824
842
|
fi
|
|
825
843
|
|
|
844
|
+
# Claude subprocess is alive and consuming CPU — agent is thinking/working
|
|
845
|
+
# During build stage, Claude can spend 10+ minutes thinking before any
|
|
846
|
+
# visible git changes appear. Detect this as progress.
|
|
847
|
+
if [[ "$has_progress" != "true" ]]; then
|
|
848
|
+
local _pid_for_check
|
|
849
|
+
_pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
|
|
850
|
+
if [[ -z "$_pid_for_check" ]]; then
|
|
851
|
+
# Fallback: get PID from active_jobs
|
|
852
|
+
_pid_for_check=$(jq -r --argjson num "$issue_num" \
|
|
853
|
+
'.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
|
|
854
|
+
fi
|
|
855
|
+
if [[ -n "$_pid_for_check" ]]; then
|
|
856
|
+
# Check if any child process (claude) is alive and using CPU
|
|
857
|
+
local child_cpu=0
|
|
858
|
+
child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
|
|
859
|
+
if [[ "$child_cpu" -eq 0 ]]; then
|
|
860
|
+
# Check children of the pipeline process
|
|
861
|
+
child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
862
|
+
fi
|
|
863
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
864
|
+
has_progress=true
|
|
865
|
+
fi
|
|
866
|
+
fi
|
|
867
|
+
fi
|
|
868
|
+
|
|
826
869
|
# Detect repeated errors (same error signature hitting again)
|
|
827
870
|
local repeated_errors="$prev_repeated_errors"
|
|
828
871
|
if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
|
|
@@ -855,7 +898,56 @@ daemon_assess_progress() {
|
|
|
855
898
|
if $npc == 0 then .last_progress_at = $ts else . end
|
|
856
899
|
' "$progress_file" > "$tmp_progress" 2>/dev/null && mv "$tmp_progress" "$progress_file"
|
|
857
900
|
|
|
858
|
-
#
|
|
901
|
+
# ── Vitals-based verdict (preferred over static thresholds) ──
|
|
902
|
+
if type pipeline_compute_vitals &>/dev/null 2>&1 && type pipeline_health_verdict &>/dev/null 2>&1; then
|
|
903
|
+
# Compute vitals using the worktree's pipeline state if available
|
|
904
|
+
local _worktree_state=""
|
|
905
|
+
local _worktree_artifacts=""
|
|
906
|
+
local _worktree_dir
|
|
907
|
+
_worktree_dir=$(jq -r --arg i "$issue_num" '.active_jobs[] | select(.issue == ($i | tonumber)) | .worktree // ""' "$STATE_FILE" 2>/dev/null || echo "")
|
|
908
|
+
if [[ -n "$_worktree_dir" && -d "$_worktree_dir/.claude" ]]; then
|
|
909
|
+
_worktree_state="$_worktree_dir/.claude/pipeline-state.md"
|
|
910
|
+
_worktree_artifacts="$_worktree_dir/.claude/pipeline-artifacts"
|
|
911
|
+
fi
|
|
912
|
+
|
|
913
|
+
local _vitals_json
|
|
914
|
+
_vitals_json=$(pipeline_compute_vitals "$_worktree_state" "$_worktree_artifacts" "$issue_num" 2>/dev/null) || true
|
|
915
|
+
if [[ -n "$_vitals_json" && "$_vitals_json" != "{}" ]]; then
|
|
916
|
+
local _health_verdict _health_score
|
|
917
|
+
_health_verdict=$(echo "$_vitals_json" | jq -r '.verdict // "continue"' 2>/dev/null || echo "continue")
|
|
918
|
+
_health_score=$(echo "$_vitals_json" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
|
|
919
|
+
|
|
920
|
+
emit_event "pipeline.vitals_check" \
|
|
921
|
+
"issue=$issue_num" \
|
|
922
|
+
"health_score=$_health_score" \
|
|
923
|
+
"verdict=$_health_verdict" \
|
|
924
|
+
"no_progress=$no_progress_count" \
|
|
925
|
+
"repeated_errors=$repeated_errors"
|
|
926
|
+
|
|
927
|
+
# Map vitals verdict to daemon verdict
|
|
928
|
+
case "$_health_verdict" in
|
|
929
|
+
continue)
|
|
930
|
+
echo "healthy"
|
|
931
|
+
return
|
|
932
|
+
;;
|
|
933
|
+
warn)
|
|
934
|
+
# Sluggish but not dead — equivalent to slowing
|
|
935
|
+
echo "slowing"
|
|
936
|
+
return
|
|
937
|
+
;;
|
|
938
|
+
intervene)
|
|
939
|
+
echo "stalled"
|
|
940
|
+
return
|
|
941
|
+
;;
|
|
942
|
+
abort)
|
|
943
|
+
echo "stuck"
|
|
944
|
+
return
|
|
945
|
+
;;
|
|
946
|
+
esac
|
|
947
|
+
fi
|
|
948
|
+
fi
|
|
949
|
+
|
|
950
|
+
# ── Fallback: static threshold verdict ──
|
|
859
951
|
local warn_threshold="${PROGRESS_CHECKS_BEFORE_WARN:-3}"
|
|
860
952
|
local kill_threshold="${PROGRESS_CHECKS_BEFORE_KILL:-6}"
|
|
861
953
|
|
|
@@ -1039,6 +1131,7 @@ extract_issue_dependencies() {
|
|
|
1039
1131
|
}
|
|
1040
1132
|
|
|
1041
1133
|
# ─── Logging ─────────────────────────────────────────────────────────────────
|
|
1134
|
+
DAEMON_LOG_WRITE_COUNT=0
|
|
1042
1135
|
|
|
1043
1136
|
daemon_log() {
|
|
1044
1137
|
local level="$1"
|
|
@@ -1048,8 +1141,9 @@ daemon_log() {
|
|
|
1048
1141
|
ts=$(now_iso)
|
|
1049
1142
|
echo "[$ts] [$level] $msg" >> "$LOG_FILE"
|
|
1050
1143
|
|
|
1051
|
-
# Rotate daemon.log if over 20MB (checked every
|
|
1052
|
-
|
|
1144
|
+
# Rotate daemon.log if over 20MB (checked every 100 writes)
|
|
1145
|
+
DAEMON_LOG_WRITE_COUNT=$(( DAEMON_LOG_WRITE_COUNT + 1 ))
|
|
1146
|
+
if [[ $(( DAEMON_LOG_WRITE_COUNT % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
|
|
1053
1147
|
local log_size
|
|
1054
1148
|
log_size=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
|
|
1055
1149
|
if [[ "$log_size" -gt 20971520 ]]; then
|
|
@@ -1060,11 +1154,14 @@ daemon_log() {
|
|
|
1060
1154
|
fi
|
|
1061
1155
|
fi
|
|
1062
1156
|
|
|
1063
|
-
#
|
|
1157
|
+
# Print to stderr (NOT stdout) to avoid corrupting command substitution captures.
|
|
1158
|
+
# This is critical: functions like select_pipeline_template(), triage_score_issue(),
|
|
1159
|
+
# gh_retry(), and locked_get_active_count() return values via echo/stdout and are
|
|
1160
|
+
# called via $(). If daemon_log writes to stdout, the log text corrupts return values.
|
|
1064
1161
|
case "$level" in
|
|
1065
|
-
INFO) info "$msg" ;;
|
|
1066
|
-
SUCCESS) success "$msg" ;;
|
|
1067
|
-
WARN) warn "$msg" ;;
|
|
1162
|
+
INFO) info "$msg" >&2 ;;
|
|
1163
|
+
SUCCESS) success "$msg" >&2 ;;
|
|
1164
|
+
WARN) warn "$msg" >&2 ;;
|
|
1068
1165
|
ERROR) error "$msg" ;;
|
|
1069
1166
|
esac
|
|
1070
1167
|
}
|
|
@@ -1130,7 +1227,10 @@ gh_record_failure() {
|
|
|
1130
1227
|
GH_CONSECUTIVE_FAILURES=$((GH_CONSECUTIVE_FAILURES + 1))
|
|
1131
1228
|
if [[ "$GH_CONSECUTIVE_FAILURES" -ge 3 ]]; then
|
|
1132
1229
|
# Exponential backoff: 30s, 60s, 120s, 240s (capped at 5min)
|
|
1133
|
-
|
|
1230
|
+
# Cap shift to avoid integer overflow for large failure counts
|
|
1231
|
+
local shift_amt=$(( GH_CONSECUTIVE_FAILURES - 3 ))
|
|
1232
|
+
[[ "$shift_amt" -gt 4 ]] && shift_amt=4
|
|
1233
|
+
local backoff_secs=$((30 * (1 << shift_amt)))
|
|
1134
1234
|
[[ "$backoff_secs" -gt 300 ]] && backoff_secs=300
|
|
1135
1235
|
GH_BACKOFF_UNTIL=$(( $(now_epoch) + backoff_secs ))
|
|
1136
1236
|
daemon_log WARN "GitHub rate-limit circuit breaker: backing off ${backoff_secs}s after ${GH_CONSECUTIVE_FAILURES} failures"
|
|
@@ -1138,6 +1238,74 @@ gh_record_failure() {
|
|
|
1138
1238
|
fi
|
|
1139
1239
|
}
|
|
1140
1240
|
|
|
1241
|
+
# ─── Runtime Auth Check ──────────────────────────────────────────────────────
|
|
1242
|
+
|
|
1243
|
+
LAST_AUTH_CHECK_EPOCH=0
|
|
1244
|
+
AUTH_CHECK_INTERVAL=300 # 5 minutes
|
|
1245
|
+
|
|
1246
|
+
# Write the auto-pause flag for an auth failure and emit the matching event.
# Arguments: $1 - pause reason recorded in the flag file and in the event
# Globals:   PAUSE_FLAG (read)
# The temp file is created next to PAUSE_FLAG so the final rename stays on
# one filesystem and readers never observe a partially written flag.
_daemon_auth_pause() {
    local reason="$1"
    local pause_json _tmp_pause
    pause_json=$(jq -n --arg reason "$reason" --arg ts "$(now_iso)" \
        '{reason: $reason, timestamp: $ts}')
    _tmp_pause=$(mktemp "${PAUSE_FLAG}.XXXXXX")
    echo "$pause_json" > "$_tmp_pause"
    mv "$_tmp_pause" "$PAUSE_FLAG"
    emit_event "daemon.auto_pause" "reason=$reason"
}

# Periodic runtime auth check for the GitHub and Claude CLIs.
# Globals:   LAST_AUTH_CHECK_EPOCH (read/written), AUTH_CHECK_INTERVAL (read),
#            NO_GITHUB (read), PAUSE_FLAG (written on failure)
# Returns:   0 when the check is throttled or both checks pass;
#            1 after writing the pause flag when either check fails.
daemon_preflight_auth_check() {
    local now_e
    now_e=$(now_epoch)
    # Throttle: do nothing if the previous check ran less than one interval ago.
    if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
        return 0
    fi
    LAST_AUTH_CHECK_EPOCH="$now_e"

    # gh auth check
    if [[ "${NO_GITHUB:-false}" != "true" ]]; then
        if ! gh auth status &>/dev/null; then
            daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
            _daemon_auth_pause "gh_auth_failure"
            return 1
        fi
    fi

    # claude auth check with 15s timeout (macOS has no timeout command).
    # The CLI is backgrounded directly (no wrapper subshell) so that $! is the
    # process the timeout below actually signals.
    local _auth_tmp
    _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
    claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null &
    local _auth_pid=$!
    local _auth_waited=0
    while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
        sleep 1
        _auth_waited=$((_auth_waited + 1))
    done
    # Still alive after the polling window: terminate it before reaping.
    if kill -0 "$_auth_pid" 2>/dev/null; then
        kill "$_auth_pid" 2>/dev/null || true
    fi
    wait "$_auth_pid" 2>/dev/null || true

    # Any output at all is treated as proof that the CLI could authenticate.
    local claude_auth_ok=false
    if [[ -s "$_auth_tmp" ]]; then
        claude_auth_ok=true
    fi
    rm -f "$_auth_tmp"

    if [[ "$claude_auth_ok" != "true" ]]; then
        daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
        _daemon_auth_pause "claude_auth_failure"
        return 1
    fi

    return 0
}
|
|
1308
|
+
|
|
1141
1309
|
# ─── Pre-flight Checks ──────────────────────────────────────────────────────
|
|
1142
1310
|
|
|
1143
1311
|
preflight_checks() {
|
|
@@ -1380,7 +1548,7 @@ locked_get_active_count() {
|
|
|
1380
1548
|
(
|
|
1381
1549
|
if command -v flock &>/dev/null; then
|
|
1382
1550
|
flock -w 5 200 2>/dev/null || {
|
|
1383
|
-
daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default"
|
|
1551
|
+
daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default" >&2
|
|
1384
1552
|
echo "$MAX_PARALLEL"
|
|
1385
1553
|
exit 0
|
|
1386
1554
|
}
|
|
@@ -1539,9 +1707,24 @@ daemon_spawn_pipeline() {
|
|
|
1539
1707
|
local issue_num="$1"
|
|
1540
1708
|
local issue_title="${2:-}"
|
|
1541
1709
|
local repo_full_name="${3:-}" # owner/repo (org mode only)
|
|
1710
|
+
shift 3 2>/dev/null || true
|
|
1711
|
+
local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
|
|
1542
1712
|
|
|
1543
1713
|
daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
|
|
1544
1714
|
|
|
1715
|
+
# ── Issue decomposition (if decomposer available) ──
|
|
1716
|
+
local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
|
|
1717
|
+
if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
|
|
1718
|
+
local decompose_result=""
|
|
1719
|
+
decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
|
|
1720
|
+
if [[ "$decompose_result" == *"decomposed"* ]]; then
|
|
1721
|
+
daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
|
|
1722
|
+
# Remove the shipwright label so decomposed parent doesn't re-queue
|
|
1723
|
+
gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
|
|
1724
|
+
return 0
|
|
1725
|
+
fi
|
|
1726
|
+
fi
|
|
1727
|
+
|
|
1545
1728
|
# Extract goal text from issue (title + first line of body)
|
|
1546
1729
|
local issue_goal="$issue_title"
|
|
1547
1730
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
@@ -1626,6 +1809,17 @@ daemon_spawn_pipeline() {
|
|
|
1626
1809
|
daemon_log INFO "Worktree created at ${work_dir}"
|
|
1627
1810
|
fi
|
|
1628
1811
|
|
|
1812
|
+
# If template is "composed", copy the composed spec into the worktree
|
|
1813
|
+
if [[ "$PIPELINE_TEMPLATE" == "composed" ]]; then
|
|
1814
|
+
local _src_composed="${REPO_DIR:-.}/.claude/pipeline-artifacts/composed-pipeline.json"
|
|
1815
|
+
if [[ -f "$_src_composed" ]]; then
|
|
1816
|
+
local _dst_artifacts="${work_dir}/.claude/pipeline-artifacts"
|
|
1817
|
+
mkdir -p "$_dst_artifacts"
|
|
1818
|
+
cp "$_src_composed" "$_dst_artifacts/composed-pipeline.json" 2>/dev/null || true
|
|
1819
|
+
daemon_log INFO "Copied composed pipeline spec to worktree"
|
|
1820
|
+
fi
|
|
1821
|
+
fi
|
|
1822
|
+
|
|
1629
1823
|
# Build pipeline args
|
|
1630
1824
|
local pipeline_args=("start" "--issue" "$issue_num" "--pipeline" "$PIPELINE_TEMPLATE")
|
|
1631
1825
|
if [[ "$SKIP_GATES" == "true" ]]; then
|
|
@@ -1637,12 +1831,27 @@ daemon_spawn_pipeline() {
|
|
|
1637
1831
|
if [[ "$NO_GITHUB" == "true" ]]; then
|
|
1638
1832
|
pipeline_args+=("--no-github")
|
|
1639
1833
|
fi
|
|
1834
|
+
# Pass session restart config
|
|
1835
|
+
if [[ "${MAX_RESTARTS_CFG:-0}" -gt 0 ]]; then
|
|
1836
|
+
pipeline_args+=("--max-restarts" "$MAX_RESTARTS_CFG")
|
|
1837
|
+
fi
|
|
1838
|
+
# Pass fast test command
|
|
1839
|
+
if [[ -n "${FAST_TEST_CMD_CFG:-}" ]]; then
|
|
1840
|
+
pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
|
|
1841
|
+
fi
|
|
1842
|
+
|
|
1843
|
+
# Append any extra pipeline args (from retry escalation, etc.)
|
|
1844
|
+
if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
|
|
1845
|
+
pipeline_args+=("${extra_pipeline_args[@]}")
|
|
1846
|
+
fi
|
|
1640
1847
|
|
|
1641
1848
|
# Run pipeline in work directory (background)
|
|
1849
|
+
# Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
|
|
1642
1850
|
echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
|
|
1643
1851
|
(
|
|
1852
|
+
trap '' HUP
|
|
1644
1853
|
cd "$work_dir"
|
|
1645
|
-
"$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
|
|
1854
|
+
exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
|
|
1646
1855
|
) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
|
|
1647
1856
|
local pid=$!
|
|
1648
1857
|
|
|
@@ -1770,6 +1979,41 @@ daemon_reap_completed() {
|
|
|
1770
1979
|
daemon_on_success "$issue_num" "$duration_str"
|
|
1771
1980
|
else
|
|
1772
1981
|
daemon_on_failure "$issue_num" "$exit_code" "$duration_str"
|
|
1982
|
+
|
|
1983
|
+
# Cancel any lingering in_progress GitHub Check Runs for failed job
|
|
1984
|
+
if [[ "${NO_GITHUB:-false}" != "true" && -n "$worktree" ]]; then
|
|
1985
|
+
local check_ids_file="${worktree}/.claude/pipeline-artifacts/check-run-ids.json"
|
|
1986
|
+
if [[ -f "$check_ids_file" ]]; then
|
|
1987
|
+
daemon_log INFO "Cancelling in-progress check runs for issue #${issue_num}"
|
|
1988
|
+
local _stage
|
|
1989
|
+
while IFS= read -r _stage; do
|
|
1990
|
+
[[ -z "$_stage" ]] && continue
|
|
1991
|
+
# Direct API call since we're in daemon context
|
|
1992
|
+
local _run_id
|
|
1993
|
+
_run_id=$(jq -r --arg s "$_stage" '.[$s] // empty' "$check_ids_file" 2>/dev/null || true)
|
|
1994
|
+
if [[ -n "$_run_id" && "$_run_id" != "null" ]]; then
|
|
1995
|
+
local _detected
|
|
1996
|
+
_detected=$(git remote get-url origin 2>/dev/null | sed 's|.*github.com[:/]\(.*\)\.git$|\1|' || true)
|
|
1997
|
+
if [[ -n "$_detected" ]]; then
|
|
1998
|
+
local _owner="${_detected%%/*}" _repo="${_detected##*/}"
|
|
1999
|
+
gh api "repos/${_owner}/${_repo}/check-runs/${_run_id}" \
|
|
2000
|
+
--method PATCH \
|
|
2001
|
+
--field status=completed \
|
|
2002
|
+
--field conclusion=cancelled \
|
|
2003
|
+
--silent 2>/dev/null || true
|
|
2004
|
+
fi
|
|
2005
|
+
fi
|
|
2006
|
+
done < <(jq -r 'keys[]' "$check_ids_file" 2>/dev/null || true)
|
|
2007
|
+
fi
|
|
2008
|
+
fi
|
|
2009
|
+
fi
|
|
2010
|
+
|
|
2011
|
+
# Finalize memory (capture failure patterns for future runs)
|
|
2012
|
+
if type memory_finalize_pipeline &>/dev/null 2>&1; then
|
|
2013
|
+
local _job_state _job_artifacts
|
|
2014
|
+
_job_state="${worktree:-.}/.claude/pipeline-state.md"
|
|
2015
|
+
_job_artifacts="${worktree:-.}/.claude/pipeline-artifacts"
|
|
2016
|
+
memory_finalize_pipeline "$_job_state" "$_job_artifacts" 2>/dev/null || true
|
|
1773
2017
|
fi
|
|
1774
2018
|
|
|
1775
2019
|
# Clean up progress tracking for this job
|
|
@@ -1780,15 +2024,18 @@ daemon_reap_completed() {
|
|
|
1780
2024
|
reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
|
|
1781
2025
|
release_claim "$issue_num" "$reap_machine_name"
|
|
1782
2026
|
|
|
1783
|
-
#
|
|
2027
|
+
# Always remove the OLD job entry from active_jobs to prevent
|
|
2028
|
+
# re-reaping of the dead PID on the next cycle. When a retry was
|
|
2029
|
+
# spawned, daemon_spawn_pipeline already added a fresh entry with
|
|
2030
|
+
# the new PID — we must not leave the stale one behind.
|
|
2031
|
+
locked_state_update --argjson num "$issue_num" \
|
|
2032
|
+
--argjson old_pid "${pid:-0}" \
|
|
2033
|
+
'.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
|
|
2034
|
+
untrack_priority_job "$issue_num"
|
|
2035
|
+
|
|
1784
2036
|
if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
|
|
1785
2037
|
daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
|
|
1786
2038
|
else
|
|
1787
|
-
# Remove from active_jobs and priority lane tracking (locked)
|
|
1788
|
-
locked_state_update --argjson num "$issue_num" \
|
|
1789
|
-
'.active_jobs = [.active_jobs[] | select(.issue != $num)]'
|
|
1790
|
-
untrack_priority_job "$issue_num"
|
|
1791
|
-
|
|
1792
2039
|
# Clean up worktree (skip for org-mode clones — they persist)
|
|
1793
2040
|
local job_repo
|
|
1794
2041
|
job_repo=$(echo "$job" | jq -r '.repo // ""')
|
|
@@ -1827,6 +2074,9 @@ daemon_reap_completed() {
|
|
|
1827
2074
|
daemon_on_success() {
|
|
1828
2075
|
local issue_num="$1" duration="${2:-}"
|
|
1829
2076
|
|
|
2077
|
+
# Reset consecutive failure tracking on any success
|
|
2078
|
+
reset_failure_tracking
|
|
2079
|
+
|
|
1830
2080
|
daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
|
|
1831
2081
|
|
|
1832
2082
|
# Record pipeline duration for adaptive threshold learning
|
|
@@ -1887,6 +2137,91 @@ Check the associated PR for the implementation." 2>/dev/null || true
|
|
|
1887
2137
|
"$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
|
|
1888
2138
|
}
|
|
1889
2139
|
|
|
2140
|
+
# ─── Failure Classification ─────────────────────────────────────────────────
|
|
2141
|
+
|
|
2142
|
+
# Classify why a pipeline run failed by inspecting the tail of its log.
# Arguments: $1 - issue number (selects $LOG_DIR/issue-<n>.log)
# Globals:   LOG_DIR, WORKTREE_DIR, REPO_DIR (read)
# Outputs:   one of auth_error | api_error | invalid_issue |
#            context_exhaustion | build_failure | unknown on stdout
# Checks run in that order; the first match wins.
classify_failure() {
    local issue_num="$1"
    if [[ -z "${LOG_DIR:-}" ]]; then
        echo "unknown"
        return
    fi
    local log_path="$LOG_DIR/issue-${issue_num}.log"
    if [[ ! -f "$log_path" ]]; then
        echo "unknown"
        return
    fi
    local tail_content
    tail_content=$(tail -n 200 "$log_path" 2>/dev/null || true)

    # The patterns are matched via here-strings rather than `echo | grep -q`:
    # under `set -o pipefail`, grep -q exiting on an early match can kill the
    # writer with SIGPIPE and make the whole pipeline (and the `if`) fail.

    # Auth errors
    if grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required' <<<"$tail_content"; then
        echo "auth_error"
        return
    fi
    # API errors (rate limits, timeouts, server errors)
    if grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable' <<<"$tail_content"; then
        echo "api_error"
        return
    fi
    # Invalid issue (not found, empty body)
    if grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist' <<<"$tail_content"; then
        echo "invalid_issue"
        return
    fi
    # Context exhaustion — check progress file
    local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
    local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
    if [[ -f "$progress_file" ]]; then
        # Last recorded iteration number; anything non-numeric counts as 0.
        local cf_iter
        cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -n 1 | grep -oE '[0-9]+' || true)
        if ! [[ "$cf_iter" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
        # Last recorded test status only, so several status lines in the file
        # cannot produce a multi-line value that matches nothing.
        local cf_tests
        cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | tail -n 1 | awk '{print $NF}' || true)
        if [[ -z "$cf_tests" ]]; then cf_tests="unknown"; fi
        # 10# forces base 10 so a value such as "08" is not parsed as octal.
        if [[ $((10#$cf_iter)) -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
            echo "context_exhaustion"
            return
        fi
    fi
    # Build failure (test errors, compile errors)
    if grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]' <<<"$tail_content"; then
        echo "build_failure"
        return
    fi
    echo "unknown"
}
|
|
2192
|
+
|
|
2193
|
+
# ─── Consecutive Failure Tracking ──────────────────────────────────────────
|
|
2194
|
+
|
|
2195
|
+
DAEMON_CONSECUTIVE_FAILURE_CLASS=""
|
|
2196
|
+
DAEMON_CONSECUTIVE_FAILURE_COUNT=0
|
|
2197
|
+
|
|
2198
|
+
# Track runs of same-class failures and auto-pause the daemon on a streak.
# Arguments: $1 - failure class of the job that just failed
# Globals:   DAEMON_CONSECUTIVE_FAILURE_CLASS, DAEMON_CONSECUTIVE_FAILURE_COUNT
#            (read/written), PAUSE_FLAG (written once the streak reaches 3)
record_failure_class() {
    local failure_class="$1"
    # Same class as last time extends the streak; a new class restarts it at 1.
    if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
        DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
    else
        DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
        DAEMON_CONSECUTIVE_FAILURE_COUNT=1
    fi

    if [[ "$DAEMON_CONSECUTIVE_FAILURE_COUNT" -ge 3 ]]; then
        # Report the real streak length; this branch also runs for 4, 5, ...
        daemon_log ERROR "${DAEMON_CONSECUTIVE_FAILURE_COUNT} consecutive failures (class: ${failure_class}) — auto-pausing daemon"
        local pause_json
        pause_json=$(jq -n --arg reason "consecutive_${failure_class}" --arg ts "$(now_iso)" \
            '{reason: $reason, timestamp: $ts}')
        # Create the temp file beside PAUSE_FLAG so the rename below stays on
        # one filesystem and the flag never appears half-written.
        local _tmp_pause
        _tmp_pause=$(mktemp "${PAUSE_FLAG}.XXXXXX")
        echo "$pause_json" > "$_tmp_pause"
        mv "$_tmp_pause" "$PAUSE_FLAG"
        emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$DAEMON_CONSECUTIVE_FAILURE_COUNT"
    fi
}
|
|
2219
|
+
|
|
2220
|
+
# Clear the consecutive-failure streak: count back to zero, class emptied.
# Globals: DAEMON_CONSECUTIVE_FAILURE_COUNT, DAEMON_CONSECUTIVE_FAILURE_CLASS (written)
reset_failure_tracking() {
    DAEMON_CONSECUTIVE_FAILURE_COUNT=0
    DAEMON_CONSECUTIVE_FAILURE_CLASS=''
}
|
|
2224
|
+
|
|
1890
2225
|
# ─── Failure Handler ────────────────────────────────────────────────────────
|
|
1891
2226
|
|
|
1892
2227
|
daemon_on_failure() {
|
|
@@ -1923,100 +2258,160 @@ daemon_on_failure() {
|
|
|
1923
2258
|
completed_at: $completed_at
|
|
1924
2259
|
}] | .completed = .completed[-500:]'
|
|
1925
2260
|
|
|
2261
|
+
# ── Classify failure and decide retry strategy ──
|
|
2262
|
+
local failure_class
|
|
2263
|
+
failure_class=$(classify_failure "$issue_num")
|
|
2264
|
+
daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
|
|
2265
|
+
emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
|
|
2266
|
+
record_failure_class "$failure_class"
|
|
2267
|
+
|
|
1926
2268
|
# ── Auto-retry with strategy escalation ──
|
|
1927
2269
|
if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
|
|
1928
2270
|
local retry_count
|
|
1929
2271
|
retry_count=$(jq -r --arg num "$issue_num" \
|
|
1930
2272
|
'.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
|
|
1931
2273
|
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
|
|
1941
|
-
emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
|
|
1942
|
-
|
|
1943
|
-
# Check for checkpoint to enable resume-from-checkpoint
|
|
1944
|
-
local checkpoint_args=()
|
|
1945
|
-
if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
|
|
1946
|
-
# Try to find worktree for this issue to check for checkpoints
|
|
1947
|
-
local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
|
|
1948
|
-
if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
|
|
1949
|
-
local latest_checkpoint=""
|
|
1950
|
-
for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
|
|
1951
|
-
[[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
|
|
1952
|
-
done
|
|
1953
|
-
if [[ -n "$latest_checkpoint" ]]; then
|
|
1954
|
-
daemon_log INFO "Found checkpoint: $latest_checkpoint"
|
|
1955
|
-
emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
|
|
1956
|
-
checkpoint_args+=("--resume")
|
|
1957
|
-
fi
|
|
2274
|
+
# Non-retryable failures — skip retry entirely
|
|
2275
|
+
case "$failure_class" in
|
|
2276
|
+
auth_error)
|
|
2277
|
+
daemon_log ERROR "Auth error for issue #${issue_num} — skipping retry"
|
|
2278
|
+
emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
|
|
2279
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2280
|
+
gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
|
|
1958
2281
|
fi
|
|
1959
|
-
|
|
2282
|
+
;;
|
|
2283
|
+
invalid_issue)
|
|
2284
|
+
daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
|
|
2285
|
+
emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
|
|
2286
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2287
|
+
gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
|
|
2288
|
+
fi
|
|
2289
|
+
;;
|
|
2290
|
+
*)
|
|
2291
|
+
# Retryable failures — proceed with escalation
|
|
2292
|
+
if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
|
|
2293
|
+
retry_count=$((retry_count + 1))
|
|
2294
|
+
|
|
2295
|
+
# Update retry count in state (locked to prevent race)
|
|
2296
|
+
locked_state_update \
|
|
2297
|
+
--arg num "$issue_num" --argjson count "$retry_count" \
|
|
2298
|
+
'.retry_counts[$num] = $count'
|
|
2299
|
+
|
|
2300
|
+
daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num} (class: ${failure_class})"
|
|
2301
|
+
emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}" "class=$failure_class"
|
|
2302
|
+
|
|
2303
|
+
# Check for checkpoint to enable resume-from-checkpoint
|
|
2304
|
+
local checkpoint_args=()
|
|
2305
|
+
if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
|
|
2306
|
+
local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
|
|
2307
|
+
if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
|
|
2308
|
+
local latest_checkpoint=""
|
|
2309
|
+
for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
|
|
2310
|
+
[[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
|
|
2311
|
+
done
|
|
2312
|
+
if [[ -n "$latest_checkpoint" ]]; then
|
|
2313
|
+
daemon_log INFO "Found checkpoint: $latest_checkpoint"
|
|
2314
|
+
emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
|
|
2315
|
+
checkpoint_args+=("--resume")
|
|
2316
|
+
fi
|
|
2317
|
+
fi
|
|
2318
|
+
fi
|
|
1960
2319
|
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
daemon_log INFO "Escalation: template=full, compound_cycles=5"
|
|
1977
|
-
fi
|
|
2320
|
+
# Build escalated pipeline args
|
|
2321
|
+
local retry_template="$PIPELINE_TEMPLATE"
|
|
2322
|
+
local retry_model="${MODEL:-opus}"
|
|
2323
|
+
local extra_args=()
|
|
2324
|
+
|
|
2325
|
+
if [[ "$retry_count" -eq 1 ]]; then
|
|
2326
|
+
retry_model="opus"
|
|
2327
|
+
extra_args+=("--max-iterations" "30")
|
|
2328
|
+
daemon_log INFO "Escalation: model=opus, max_iterations=30"
|
|
2329
|
+
elif [[ "$retry_count" -ge 2 ]]; then
|
|
2330
|
+
retry_template="full"
|
|
2331
|
+
retry_model="opus"
|
|
2332
|
+
extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
|
|
2333
|
+
daemon_log INFO "Escalation: template=full, compound_cycles=5"
|
|
2334
|
+
fi
|
|
1978
2335
|
|
|
1979
|
-
|
|
1980
|
-
|
|
2336
|
+
# Increase restarts on context exhaustion
|
|
2337
|
+
if [[ "$failure_class" == "context_exhaustion" ]]; then
|
|
2338
|
+
local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
|
|
2339
|
+
if [[ "$boosted_restarts" -gt 5 ]]; then
|
|
2340
|
+
boosted_restarts=5
|
|
2341
|
+
fi
|
|
2342
|
+
extra_args+=("--max-restarts" "$boosted_restarts")
|
|
2343
|
+
daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
|
|
2344
|
+
fi
|
|
1981
2345
|
|
|
1982
|
-
|
|
2346
|
+
# API errors get extended backoff
|
|
2347
|
+
local api_backoff=300
|
|
2348
|
+
local backoff_secs=$((30 * retry_count))
|
|
2349
|
+
if [[ "$failure_class" == "api_error" ]]; then
|
|
2350
|
+
backoff_secs=$((api_backoff * retry_count))
|
|
2351
|
+
daemon_log INFO "API error — extended backoff ${backoff_secs}s"
|
|
2352
|
+
fi
|
|
2353
|
+
|
|
2354
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2355
|
+
gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
|
|
2356
|
+
|
|
2357
|
+
Pipeline failed (${failure_class}) — retrying with escalated strategy.
|
|
1983
2358
|
|
|
1984
2359
|
| Field | Value |
|
|
1985
2360
|
|-------|-------|
|
|
1986
2361
|
| Retry | ${retry_count} / ${MAX_RETRIES:-2} |
|
|
2362
|
+
| Failure | \`${failure_class}\` |
|
|
1987
2363
|
| Template | \`${retry_template}\` |
|
|
1988
2364
|
| Model | \`${retry_model}\` |
|
|
1989
2365
|
| Started | $(now_iso) |
|
|
1990
2366
|
|
|
1991
2367
|
_Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
|
|
1992
|
-
|
|
2368
|
+
fi
|
|
1993
2369
|
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2370
|
+
daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
|
|
2371
|
+
sleep "$backoff_secs"
|
|
2372
|
+
|
|
2373
|
+
# Merge checkpoint args + extra args for passthrough
|
|
2374
|
+
local all_extra_args=()
|
|
2375
|
+
if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
|
|
2376
|
+
all_extra_args+=("${checkpoint_args[@]}")
|
|
2377
|
+
fi
|
|
2378
|
+
if [[ ${#extra_args[@]} -gt 0 ]]; then
|
|
2379
|
+
all_extra_args+=("${extra_args[@]}")
|
|
2380
|
+
fi
|
|
2381
|
+
|
|
2382
|
+
# Re-spawn with escalated strategy
|
|
2383
|
+
local orig_template="$PIPELINE_TEMPLATE"
|
|
2384
|
+
local orig_model="$MODEL"
|
|
2385
|
+
PIPELINE_TEMPLATE="$retry_template"
|
|
2386
|
+
MODEL="$retry_model"
|
|
2387
|
+
daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
|
|
2388
|
+
_retry_spawned_for="$issue_num"
|
|
2389
|
+
PIPELINE_TEMPLATE="$orig_template"
|
|
2390
|
+
MODEL="$orig_model"
|
|
2391
|
+
return
|
|
2392
|
+
fi
|
|
2010
2393
|
|
|
2011
|
-
|
|
2012
|
-
|
|
2394
|
+
daemon_log WARN "Max retries (${MAX_RETRIES:-2}) exhausted for issue #${issue_num}"
|
|
2395
|
+
emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
|
|
2396
|
+
;;
|
|
2397
|
+
esac
|
|
2013
2398
|
fi
|
|
2014
2399
|
|
|
2015
2400
|
# ── No retry — report final failure ──
|
|
2016
2401
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2017
|
-
# Add failure label
|
|
2402
|
+
# Add failure label and remove watch label (prevent re-processing)
|
|
2018
2403
|
gh issue edit "$issue_num" \
|
|
2019
|
-
--add-label "$ON_FAILURE_ADD_LABEL"
|
|
2404
|
+
--add-label "$ON_FAILURE_ADD_LABEL" \
|
|
2405
|
+
--remove-label "$WATCH_LABEL" 2>/dev/null || true
|
|
2406
|
+
|
|
2407
|
+
# Close any draft PR created for this issue (cleanup abandoned work)
|
|
2408
|
+
local draft_pr
|
|
2409
|
+
draft_pr=$(gh pr list --head "daemon/issue-${issue_num}" --head "pipeline/pipeline-issue-${issue_num}" \
|
|
2410
|
+
--json number,isDraft --jq '.[] | select(.isDraft == true) | .number' 2>/dev/null | head -1 || true)
|
|
2411
|
+
if [[ -n "$draft_pr" ]]; then
|
|
2412
|
+
gh pr close "$draft_pr" --delete-branch 2>/dev/null || true
|
|
2413
|
+
daemon_log INFO "Closed draft PR #${draft_pr} for failed issue #${issue_num}"
|
|
2414
|
+
fi
|
|
2020
2415
|
|
|
2021
2416
|
# Comment with log tail
|
|
2022
2417
|
local log_tail=""
|
|
@@ -2075,7 +2470,7 @@ triage_score_issue() {
|
|
|
2075
2470
|
|
|
2076
2471
|
# ── Intelligence-powered triage (if enabled) ──
|
|
2077
2472
|
if [[ "${INTELLIGENCE_ENABLED:-false}" == "true" ]] && type intelligence_analyze_issue &>/dev/null 2>&1; then
|
|
2078
|
-
daemon_log INFO "Intelligence: using AI triage (intelligence enabled)"
|
|
2473
|
+
daemon_log INFO "Intelligence: using AI triage (intelligence enabled)" >&2
|
|
2079
2474
|
local analysis
|
|
2080
2475
|
analysis=$(intelligence_analyze_issue "$issue_json" 2>/dev/null || echo "")
|
|
2081
2476
|
if [[ -n "$analysis" && "$analysis" != "{}" && "$analysis" != "null" ]]; then
|
|
@@ -2114,9 +2509,9 @@ triage_score_issue() {
|
|
|
2114
2509
|
return
|
|
2115
2510
|
fi
|
|
2116
2511
|
# Fall through to heuristic scoring if intelligence call failed
|
|
2117
|
-
daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring"
|
|
2512
|
+
daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring" >&2
|
|
2118
2513
|
else
|
|
2119
|
-
daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)"
|
|
2514
|
+
daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)" >&2
|
|
2120
2515
|
fi
|
|
2121
2516
|
labels_csv=$(echo "$issue_json" | jq -r '[.labels[].name] | join(",")')
|
|
2122
2517
|
created_at=$(echo "$issue_json" | jq -r '.createdAt // ""')
|
|
@@ -2256,6 +2651,7 @@ triage_score_issue() {
|
|
|
2256
2651
|
select_pipeline_template() {
|
|
2257
2652
|
local labels="$1"
|
|
2258
2653
|
local score="${2:-50}"
|
|
2654
|
+
local _selected_template=""
|
|
2259
2655
|
|
|
2260
2656
|
# When auto_template is disabled, use default pipeline template
|
|
2261
2657
|
if [[ "${AUTO_TEMPLATE:-false}" != "true" ]]; then
|
|
@@ -2265,7 +2661,7 @@ select_pipeline_template() {
|
|
|
2265
2661
|
|
|
2266
2662
|
# ── Intelligence-composed pipeline (if enabled) ──
|
|
2267
2663
|
if [[ "${COMPOSER_ENABLED:-false}" == "true" ]] && type composer_create_pipeline &>/dev/null 2>&1; then
|
|
2268
|
-
daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)"
|
|
2664
|
+
daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)" >&2
|
|
2269
2665
|
local analysis="${INTELLIGENCE_ANALYSIS:-{}}"
|
|
2270
2666
|
local repo_context=""
|
|
2271
2667
|
if [[ -f "${REPO_DIR:-}/.claude/pipeline-state.md" ]]; then
|
|
@@ -2287,9 +2683,69 @@ select_pipeline_template() {
|
|
|
2287
2683
|
return
|
|
2288
2684
|
fi
|
|
2289
2685
|
# Fall through to static selection if composition failed
|
|
2290
|
-
daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection"
|
|
2686
|
+
daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection" >&2
|
|
2291
2687
|
else
|
|
2292
|
-
daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)"
|
|
2688
|
+
daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)" >&2
|
|
2689
|
+
fi
|
|
2690
|
+
|
|
2691
|
+
# ── DORA-driven template escalation ──
|
|
2692
|
+
if [[ -f "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" ]]; then
|
|
2693
|
+
local _dora_events _dora_total _dora_failures _dora_cfr
|
|
2694
|
+
_dora_events=$(tail -500 "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" \
|
|
2695
|
+
| grep '"type":"pipeline.completed"' 2>/dev/null \
|
|
2696
|
+
| tail -5 || true)
|
|
2697
|
+
_dora_total=$(echo "$_dora_events" | grep -c '.' 2>/dev/null || echo "0")
|
|
2698
|
+
_dora_total="${_dora_total:-0}"
|
|
2699
|
+
if [[ "$_dora_total" -ge 3 ]]; then
|
|
2700
|
+
_dora_failures=$(echo "$_dora_events" | grep -c '"result":"failure"' 2>/dev/null || true)
|
|
2701
|
+
_dora_failures="${_dora_failures:-0}"
|
|
2702
|
+
_dora_cfr=$(( _dora_failures * 100 / _dora_total ))
|
|
2703
|
+
if [[ "$_dora_cfr" -gt 40 ]]; then
|
|
2704
|
+
daemon_log INFO "DORA escalation: CFR ${_dora_cfr}% > 40% — forcing enterprise template" >&2
|
|
2705
|
+
emit_event "daemon.dora_escalation" \
|
|
2706
|
+
"cfr=$_dora_cfr" \
|
|
2707
|
+
"total=$_dora_total" \
|
|
2708
|
+
"failures=$_dora_failures" \
|
|
2709
|
+
"template=enterprise"
|
|
2710
|
+
echo "enterprise"
|
|
2711
|
+
return
|
|
2712
|
+
fi
|
|
2713
|
+
if [[ "$_dora_cfr" -lt 10 && "$score" -ge 60 ]]; then
|
|
2714
|
+
daemon_log INFO "DORA: CFR ${_dora_cfr}% < 10% — fast template eligible" >&2
|
|
2715
|
+
# Fall through to allow other factors to also vote for fast
|
|
2716
|
+
fi
|
|
2717
|
+
|
|
2718
|
+
# ── DORA multi-factor ──
|
|
2719
|
+
# Cycle time: if median > 120min, prefer faster templates
|
|
2720
|
+
local _dora_cycle_time=0
|
|
2721
|
+
_dora_cycle_time=$(echo "$_dora_events" | jq -r 'select(.duration_s) | .duration_s' 2>/dev/null \
|
|
2722
|
+
| sort -n | awk '{ a[NR]=$1 } END { if (NR>0) print int(a[int(NR/2)+1]/60); else print 0 }' 2>/dev/null) || _dora_cycle_time=0
|
|
2723
|
+
_dora_cycle_time="${_dora_cycle_time:-0}"
|
|
2724
|
+
if [[ "${_dora_cycle_time:-0}" -gt 120 ]]; then
|
|
2725
|
+
daemon_log INFO "DORA: cycle time ${_dora_cycle_time}min > 120 — preferring fast template" >&2
|
|
2726
|
+
if [[ "${score:-0}" -ge 60 ]]; then
|
|
2727
|
+
echo "fast"
|
|
2728
|
+
return
|
|
2729
|
+
fi
|
|
2730
|
+
fi
|
|
2731
|
+
|
|
2732
|
+
# Deploy frequency: if < 1/week, use cost-aware
|
|
2733
|
+
local _dora_deploy_freq=0
|
|
2734
|
+
local _dora_first_epoch _dora_last_epoch _dora_span_days
|
|
2735
|
+
_dora_first_epoch=$(echo "$_dora_events" | head -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
|
|
2736
|
+
_dora_last_epoch=$(echo "$_dora_events" | tail -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
|
|
2737
|
+
if [[ "${_dora_first_epoch:-0}" -gt 0 && "${_dora_last_epoch:-0}" -gt 0 ]]; then
|
|
2738
|
+
_dora_span_days=$(( (_dora_last_epoch - _dora_first_epoch) / 86400 ))
|
|
2739
|
+
if [[ "${_dora_span_days:-0}" -gt 0 ]]; then
|
|
2740
|
+
_dora_deploy_freq=$(awk -v t="$_dora_total" -v d="$_dora_span_days" 'BEGIN { printf "%.1f", t * 7 / d }' 2>/dev/null) || _dora_deploy_freq=0
|
|
2741
|
+
fi
|
|
2742
|
+
fi
|
|
2743
|
+
if [[ -n "${_dora_deploy_freq:-}" ]] && awk -v f="${_dora_deploy_freq:-0}" 'BEGIN{exit !(f > 0 && f < 1)}' 2>/dev/null; then
|
|
2744
|
+
daemon_log INFO "DORA: deploy freq ${_dora_deploy_freq}/week — using cost-aware" >&2
|
|
2745
|
+
echo "cost-aware"
|
|
2746
|
+
return
|
|
2747
|
+
fi
|
|
2748
|
+
fi
|
|
2293
2749
|
fi
|
|
2294
2750
|
|
|
2295
2751
|
# ── Branch protection escalation (highest priority) ──
|
|
@@ -2306,7 +2762,7 @@ select_pipeline_template() {
|
|
|
2306
2762
|
local required_reviews
|
|
2307
2763
|
required_reviews=$(echo "$protection" | jq -r '.required_pull_request_reviews.required_approving_review_count // 0' 2>/dev/null || echo "0")
|
|
2308
2764
|
if [[ "$strict_protection" == "true" ]] || [[ "${required_reviews:-0}" -gt 1 ]]; then
|
|
2309
|
-
daemon_log INFO "Branch has strict protection — escalating to enterprise template"
|
|
2765
|
+
daemon_log INFO "Branch has strict protection — escalating to enterprise template" >&2
|
|
2310
2766
|
echo "enterprise"
|
|
2311
2767
|
return
|
|
2312
2768
|
fi
|
|
@@ -2340,6 +2796,62 @@ select_pipeline_template() {
|
|
|
2340
2796
|
fi
|
|
2341
2797
|
fi
|
|
2342
2798
|
|
|
2799
|
+
# ── Quality memory-driven selection ──
|
|
2800
|
+
local quality_scores_file="${HOME}/.shipwright/optimization/quality-scores.jsonl"
|
|
2801
|
+
if [[ -f "$quality_scores_file" ]]; then
|
|
2802
|
+
local repo_hash
|
|
2803
|
+
repo_hash=$(cd "${REPO_DIR:-.}" && git rev-parse --show-toplevel 2>/dev/null | shasum -a 256 | cut -c1-16 || echo "unknown")
|
|
2804
|
+
# Get last 5 quality scores for this repo
|
|
2805
|
+
local recent_scores avg_quality has_critical
|
|
2806
|
+
recent_scores=$(grep "\"repo\":\"$repo_hash\"" "$quality_scores_file" 2>/dev/null | tail -5 || true)
|
|
2807
|
+
if [[ -n "$recent_scores" ]]; then
|
|
2808
|
+
avg_quality=$(echo "$recent_scores" | jq -r '.quality_score // 70' 2>/dev/null | awk '{ sum += $1; count++ } END { if (count > 0) printf "%.0f", sum/count; else print 70 }')
|
|
2809
|
+
has_critical=$(echo "$recent_scores" | jq -r '.findings.critical // 0' 2>/dev/null | awk '{ sum += $1 } END { print (sum > 0) ? "yes" : "no" }')
|
|
2810
|
+
|
|
2811
|
+
# Critical findings in recent history → force enterprise
|
|
2812
|
+
if [[ "$has_critical" == "yes" ]]; then
|
|
2813
|
+
daemon_log INFO "Quality memory: critical findings in recent runs — using enterprise template" >&2
|
|
2814
|
+
echo "enterprise"
|
|
2815
|
+
return
|
|
2816
|
+
fi
|
|
2817
|
+
|
|
2818
|
+
# Poor quality history → use full template
|
|
2819
|
+
if [[ "${avg_quality:-70}" -lt 60 ]]; then
|
|
2820
|
+
daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — using full template" >&2
|
|
2821
|
+
echo "full"
|
|
2822
|
+
return
|
|
2823
|
+
fi
|
|
2824
|
+
|
|
2825
|
+
# Excellent quality history → allow faster template
|
|
2826
|
+
if [[ "${avg_quality:-70}" -gt 80 ]]; then
|
|
2827
|
+
daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — eligible for fast template" >&2
|
|
2828
|
+
# Only upgrade if score also suggests fast
|
|
2829
|
+
if [[ "$score" -ge 60 ]]; then
|
|
2830
|
+
echo "fast"
|
|
2831
|
+
return
|
|
2832
|
+
fi
|
|
2833
|
+
fi
|
|
2834
|
+
fi
|
|
2835
|
+
fi
|
|
2836
|
+
|
|
2837
|
+
# ── Learned template weights ──
|
|
2838
|
+
local _tw_file="${HOME}/.shipwright/optimization/template-weights.json"
|
|
2839
|
+
if [[ -f "$_tw_file" ]]; then
|
|
2840
|
+
local _best_template _best_rate
|
|
2841
|
+
_best_template=$(jq -r '
|
|
2842
|
+
.weights // {} | to_entries
|
|
2843
|
+
| map(select(.value.sample_size >= 3))
|
|
2844
|
+
| sort_by(-.value.success_rate)
|
|
2845
|
+
| .[0].key // ""
|
|
2846
|
+
' "$_tw_file" 2>/dev/null) || true
|
|
2847
|
+
if [[ -n "${_best_template:-}" && "${_best_template:-}" != "null" && "${_best_template:-}" != "" ]]; then
|
|
2848
|
+
_best_rate=$(jq -r --arg t "$_best_template" '.weights[$t].success_rate // 0' "$_tw_file" 2>/dev/null) || _best_rate=0
|
|
2849
|
+
daemon_log INFO "Template weights: ${_best_template} (${_best_rate} success rate)" >&2
|
|
2850
|
+
echo "$_best_template"
|
|
2851
|
+
return
|
|
2852
|
+
fi
|
|
2853
|
+
fi
|
|
2854
|
+
|
|
2343
2855
|
# ── Score-based selection ──
|
|
2344
2856
|
if [[ "$score" -ge 70 ]]; then
|
|
2345
2857
|
echo "fast"
|
|
@@ -2388,8 +2900,12 @@ daemon_triage_show() {
|
|
|
2388
2900
|
num=$(echo "$issue" | jq -r '.number')
|
|
2389
2901
|
title=$(echo "$issue" | jq -r '.title // "—"')
|
|
2390
2902
|
labels_csv=$(echo "$issue" | jq -r '[.labels[].name] | join(", ")')
|
|
2391
|
-
score=$(triage_score_issue "$issue")
|
|
2392
|
-
|
|
2903
|
+
score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
|
|
2904
|
+
score=$(printf '%s' "$score" | tr -cd '[:digit:]')
|
|
2905
|
+
[[ -z "$score" ]] && score=50
|
|
2906
|
+
template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
|
|
2907
|
+
template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
2908
|
+
[[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
|
|
2393
2909
|
|
|
2394
2910
|
scored_lines+=("${score}|${num}|${title}|${labels_csv}|${template}")
|
|
2395
2911
|
done < <(echo "$issues_json" | jq -c '.[]')
|
|
@@ -3221,11 +3737,12 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
|
3221
3737
|
if [[ ! -f "$scripts_dir/sw-${name}-test.sh" ]]; then
|
|
3222
3738
|
# Count usage across other scripts
|
|
3223
3739
|
local usage_count
|
|
3224
|
-
usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" ||
|
|
3740
|
+
usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" 2>/dev/null || echo "0")
|
|
3225
3741
|
usage_count=${usage_count:-0}
|
|
3226
3742
|
|
|
3227
3743
|
local line_count
|
|
3228
|
-
line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ')
|
|
3744
|
+
line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ' || echo "0")
|
|
3745
|
+
line_count=${line_count:-0}
|
|
3229
3746
|
|
|
3230
3747
|
untested_entries="${untested_entries}${usage_count}|${basename}|${line_count}\n"
|
|
3231
3748
|
findings=$((findings + 1))
|
|
@@ -3484,6 +4001,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
|
|
|
3484
4001
|
patrol_meta_run
|
|
3485
4002
|
fi
|
|
3486
4003
|
|
|
4004
|
+
# ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
|
|
4005
|
+
if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
|
|
4006
|
+
# shellcheck source=sw-strategic.sh
|
|
4007
|
+
source "$SCRIPT_DIR/sw-strategic.sh"
|
|
4008
|
+
strategic_patrol_run || true
|
|
4009
|
+
fi
|
|
4010
|
+
|
|
3487
4011
|
# ── Summary ──
|
|
3488
4012
|
emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
|
|
3489
4013
|
|
|
@@ -3602,7 +4126,9 @@ daemon_poll_issues() {
|
|
|
3602
4126
|
while IFS= read -r issue; do
|
|
3603
4127
|
local num score
|
|
3604
4128
|
num=$(echo "$issue" | jq -r '.number')
|
|
3605
|
-
score=$(triage_score_issue "$issue")
|
|
4129
|
+
score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
|
|
4130
|
+
score=$(printf '%s' "$score" | tr -cd '[:digit:]')
|
|
4131
|
+
[[ -z "$score" ]] && score=50
|
|
3606
4132
|
# For org mode, include repo name in the scored entry
|
|
3607
4133
|
local repo_name=""
|
|
3608
4134
|
if [[ "$WATCH_MODE" == "org" ]]; then
|
|
@@ -3629,10 +4155,10 @@ daemon_poll_issues() {
|
|
|
3629
4155
|
local sorted_order
|
|
3630
4156
|
if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
|
|
3631
4157
|
# Complex-first: lower score (more complex) first
|
|
3632
|
-
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -n)
|
|
4158
|
+
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -n -k2,2 -n)
|
|
3633
4159
|
else
|
|
3634
|
-
# Quick-wins-first (default): higher score (simpler) first
|
|
3635
|
-
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -rn)
|
|
4160
|
+
# Quick-wins-first (default): higher score (simpler) first, lowest issue# first on ties
|
|
4161
|
+
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -rn -k2,2 -n)
|
|
3636
4162
|
fi
|
|
3637
4163
|
|
|
3638
4164
|
# Dependency-aware reordering: move dependencies before dependents
|
|
@@ -3727,7 +4253,9 @@ daemon_poll_issues() {
|
|
|
3727
4253
|
emit_event "daemon.priority_lane" "issue=$issue_num" "score=$score"
|
|
3728
4254
|
|
|
3729
4255
|
local template
|
|
3730
|
-
template=$(select_pipeline_template "$labels_csv" "$score")
|
|
4256
|
+
template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
|
|
4257
|
+
template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
4258
|
+
[[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
|
|
3731
4259
|
daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template} [PRIORITY]"
|
|
3732
4260
|
|
|
3733
4261
|
local orig_template="$PIPELINE_TEMPLATE"
|
|
@@ -3748,7 +4276,9 @@ daemon_poll_issues() {
|
|
|
3748
4276
|
|
|
3749
4277
|
# Auto-select pipeline template based on labels + triage score
|
|
3750
4278
|
local template
|
|
3751
|
-
template=$(select_pipeline_template "$labels_csv" "$score")
|
|
4279
|
+
template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
|
|
4280
|
+
template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
4281
|
+
[[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
|
|
3752
4282
|
daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"
|
|
3753
4283
|
|
|
3754
4284
|
# Spawn pipeline (template selection applied via PIPELINE_TEMPLATE override)
|
|
@@ -3756,8 +4286,41 @@ daemon_poll_issues() {
|
|
|
3756
4286
|
PIPELINE_TEMPLATE="$template"
|
|
3757
4287
|
daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
|
|
3758
4288
|
PIPELINE_TEMPLATE="$orig_template"
|
|
4289
|
+
|
|
4290
|
+
# Stagger delay between spawns to avoid API contention
|
|
4291
|
+
local stagger_delay="${SPAWN_STAGGER_SECONDS:-15}"
|
|
4292
|
+
if [[ "$stagger_delay" -gt 0 ]]; then
|
|
4293
|
+
sleep "$stagger_delay"
|
|
4294
|
+
fi
|
|
3759
4295
|
done <<< "$sorted_order"
|
|
3760
4296
|
|
|
4297
|
+
# ── Drain queue if we have capacity (prevents deadlock when queue is
|
|
4298
|
+
# populated but no active jobs exist to trigger dequeue) ──
|
|
4299
|
+
local drain_active
|
|
4300
|
+
drain_active=$(locked_get_active_count)
|
|
4301
|
+
while [[ "$drain_active" -lt "$MAX_PARALLEL" ]]; do
|
|
4302
|
+
local drain_issue
|
|
4303
|
+
drain_issue=$(dequeue_next)
|
|
4304
|
+
[[ -z "$drain_issue" ]] && break
|
|
4305
|
+
local drain_title
|
|
4306
|
+
drain_title=$(jq -r --arg n "$drain_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)
|
|
4307
|
+
|
|
4308
|
+
local drain_labels drain_score drain_template
|
|
4309
|
+
drain_labels=$(echo "$issues_json" | jq -r --argjson n "$drain_issue" \
|
|
4310
|
+
'.[] | select(.number == $n) | [.labels[].name] | join(",")' 2>/dev/null || echo "")
|
|
4311
|
+
drain_score=$(echo "$sorted_order" | grep "|${drain_issue}|" | cut -d'|' -f1 || echo "50")
|
|
4312
|
+
drain_template=$(select_pipeline_template "$drain_labels" "${drain_score:-50}" 2>/dev/null | tail -1)
|
|
4313
|
+
drain_template=$(printf '%s' "$drain_template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
4314
|
+
[[ -z "$drain_template" ]] && drain_template="$PIPELINE_TEMPLATE"
|
|
4315
|
+
|
|
4316
|
+
daemon_log INFO "Draining queue: issue #${drain_issue}, template=${drain_template}"
|
|
4317
|
+
local orig_template="$PIPELINE_TEMPLATE"
|
|
4318
|
+
PIPELINE_TEMPLATE="$drain_template"
|
|
4319
|
+
daemon_spawn_pipeline "$drain_issue" "$drain_title"
|
|
4320
|
+
PIPELINE_TEMPLATE="$orig_template"
|
|
4321
|
+
drain_active=$(locked_get_active_count)
|
|
4322
|
+
done
|
|
4323
|
+
|
|
3761
4324
|
# Update last poll
|
|
3762
4325
|
update_state_field "last_poll" "$(now_iso)"
|
|
3763
4326
|
}
|
|
@@ -3770,13 +4333,15 @@ daemon_health_check() {
|
|
|
3770
4333
|
now_e=$(now_epoch)
|
|
3771
4334
|
|
|
3772
4335
|
if [[ -f "$STATE_FILE" ]]; then
|
|
3773
|
-
# ──
|
|
3774
|
-
# Instead of killing after a
|
|
3775
|
-
#
|
|
3776
|
-
#
|
|
4336
|
+
# ── Intelligent Health Monitoring ──
|
|
4337
|
+
# Instead of killing after a countdown, sense what the agent is doing.
|
|
4338
|
+
# Agents think for long stretches — that's normal and expected.
|
|
4339
|
+
# Strategy: sense → understand → be patient → nudge → only kill as last resort.
|
|
3777
4340
|
|
|
3778
|
-
local hard_limit="${PROGRESS_HARD_LIMIT_S:-
|
|
4341
|
+
local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
|
|
3779
4342
|
local use_progress="${PROGRESS_MONITORING:-true}"
|
|
4343
|
+
local nudge_enabled="${NUDGE_ENABLED:-true}"
|
|
4344
|
+
local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
|
|
3780
4345
|
|
|
3781
4346
|
while IFS= read -r job; do
|
|
3782
4347
|
local pid started_at issue_num worktree
|
|
@@ -3797,8 +4362,8 @@ daemon_health_check() {
|
|
|
3797
4362
|
elapsed=$(( now_e - start_e ))
|
|
3798
4363
|
fi
|
|
3799
4364
|
|
|
3800
|
-
# Hard wall-clock limit —
|
|
3801
|
-
if [[ "$elapsed" -gt "$hard_limit" ]]; then
|
|
4365
|
+
# Hard wall-clock limit — disabled by default (0 = off)
|
|
4366
|
+
if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
|
|
3802
4367
|
daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
|
|
3803
4368
|
emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
|
|
3804
4369
|
kill "$pid" 2>/dev/null || true
|
|
@@ -3807,7 +4372,7 @@ daemon_health_check() {
|
|
|
3807
4372
|
continue
|
|
3808
4373
|
fi
|
|
3809
4374
|
|
|
3810
|
-
# Progress
|
|
4375
|
+
# ── Intelligent Progress Sensing ──
|
|
3811
4376
|
if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
|
|
3812
4377
|
local snapshot verdict
|
|
3813
4378
|
snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
|
|
@@ -3815,29 +4380,87 @@ daemon_health_check() {
|
|
|
3815
4380
|
if [[ "$snapshot" != "{}" ]]; then
|
|
3816
4381
|
verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
|
|
3817
4382
|
|
|
4383
|
+
local no_progress_count=0
|
|
4384
|
+
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4385
|
+
local cur_stage
|
|
4386
|
+
cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
|
|
4387
|
+
|
|
3818
4388
|
case "$verdict" in
|
|
3819
4389
|
healthy)
|
|
3820
4390
|
# All good — agent is making progress
|
|
3821
4391
|
;;
|
|
3822
4392
|
slowing)
|
|
3823
|
-
daemon_log INFO "Issue #${issue_num} slowing (no
|
|
4393
|
+
daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
|
|
3824
4394
|
;;
|
|
3825
4395
|
stalled)
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
|
|
3829
|
-
|
|
4396
|
+
# Check if agent subprocess is alive and consuming CPU
|
|
4397
|
+
local agent_alive=false
|
|
4398
|
+
local child_cpu=0
|
|
4399
|
+
child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
4400
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
4401
|
+
agent_alive=true
|
|
4402
|
+
fi
|
|
4403
|
+
|
|
4404
|
+
if [[ "$agent_alive" == "true" ]]; then
|
|
4405
|
+
daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
|
|
4406
|
+
else
|
|
4407
|
+
daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
|
|
4408
|
+
emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
|
|
4409
|
+
fi
|
|
3830
4410
|
;;
|
|
3831
4411
|
stuck)
|
|
3832
|
-
local
|
|
3833
|
-
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4412
|
+
local repeated_errors
|
|
3834
4413
|
repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
3835
|
-
|
|
3836
|
-
|
|
3837
|
-
|
|
3838
|
-
|
|
3839
|
-
|
|
3840
|
-
|
|
4414
|
+
|
|
4415
|
+
# Even "stuck" — check if the process tree is alive first
|
|
4416
|
+
local agent_alive=false
|
|
4417
|
+
local child_cpu=0
|
|
4418
|
+
child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
4419
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
4420
|
+
agent_alive=true
|
|
4421
|
+
fi
|
|
4422
|
+
|
|
4423
|
+
if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
|
|
4424
|
+
# Agent is alive — nudge instead of kill
|
|
4425
|
+
if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
|
|
4426
|
+
local nudge_file="${worktree}/.claude/nudge.md"
|
|
4427
|
+
if [[ ! -f "$nudge_file" ]]; then
|
|
4428
|
+
cat > "$nudge_file" <<NUDGE_EOF
|
|
4429
|
+
# Nudge from Daemon Health Monitor
|
|
4430
|
+
|
|
4431
|
+
The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
|
|
4432
|
+
Current stage: ${cur_stage}
|
|
4433
|
+
|
|
4434
|
+
If you're stuck, consider:
|
|
4435
|
+
- Breaking the task into smaller steps
|
|
4436
|
+
- Committing partial progress
|
|
4437
|
+
- Running tests to validate current state
|
|
4438
|
+
|
|
4439
|
+
This is just a gentle check-in — take your time if you're working through a complex problem.
|
|
4440
|
+
NUDGE_EOF
|
|
4441
|
+
daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
|
|
4442
|
+
emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
|
|
4443
|
+
fi
|
|
4444
|
+
else
|
|
4445
|
+
daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
|
|
4446
|
+
fi
|
|
4447
|
+
elif [[ "$repeated_errors" -ge 5 ]]; then
|
|
4448
|
+
# Truly stuck in an error loop — kill as last resort
|
|
4449
|
+
daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
|
|
4450
|
+
emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
|
|
4451
|
+
kill "$pid" 2>/dev/null || true
|
|
4452
|
+
daemon_clear_progress "$issue_num"
|
|
4453
|
+
findings=$((findings + 1))
|
|
4454
|
+
elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
|
|
4455
|
+
# Process tree shows no CPU activity AND there has been no progress for a very long time
|
|
4456
|
+
daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
|
|
4457
|
+
emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
|
|
4458
|
+
kill "$pid" 2>/dev/null || true
|
|
4459
|
+
daemon_clear_progress "$issue_num"
|
|
4460
|
+
findings=$((findings + 1))
|
|
4461
|
+
else
|
|
4462
|
+
daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
|
|
4463
|
+
fi
|
|
3841
4464
|
;;
|
|
3842
4465
|
esac
|
|
3843
4466
|
fi
|
|
@@ -3846,8 +4469,9 @@ daemon_health_check() {
|
|
|
3846
4469
|
local stale_timeout
|
|
3847
4470
|
stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
|
|
3848
4471
|
if [[ "$elapsed" -gt "$stale_timeout" ]]; then
|
|
3849
|
-
daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)
|
|
3850
|
-
kill
|
|
4472
|
+
daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
|
|
4473
|
+
# Don't kill — just log. Let the process run.
|
|
4474
|
+
emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
|
|
3851
4475
|
findings=$((findings + 1))
|
|
3852
4476
|
fi
|
|
3853
4477
|
fi
|
|
@@ -3908,8 +4532,11 @@ daemon_check_degradation() {
|
|
|
3908
4532
|
local failures successes
|
|
3909
4533
|
failures=$(echo "$recent" | jq '[.[] | select(.result == "failure")] | length')
|
|
3910
4534
|
successes=$(echo "$recent" | jq '[.[] | select(.result == "success")] | length')
|
|
3911
|
-
local cfr_pct
|
|
3912
|
-
|
|
4535
|
+
local cfr_pct=0 success_pct=0
|
|
4536
|
+
if [[ "${count:-0}" -gt 0 ]]; then
|
|
4537
|
+
cfr_pct=$(( failures * 100 / count ))
|
|
4538
|
+
success_pct=$(( successes * 100 / count ))
|
|
4539
|
+
fi
|
|
3913
4540
|
|
|
3914
4541
|
local alerts=""
|
|
3915
4542
|
if [[ "$cfr_pct" -gt "$cfr_threshold" ]]; then
|
|
@@ -4039,11 +4666,43 @@ daemon_auto_scale() {
|
|
|
4039
4666
|
local max_by_queue=$(( queue_depth + active_count ))
|
|
4040
4667
|
[[ "$max_by_queue" -lt 1 ]] && max_by_queue=1
|
|
4041
4668
|
|
|
4669
|
+
# ── Vitals-driven scaling factor ──
|
|
4670
|
+
local max_by_vitals="$MAX_WORKERS"
|
|
4671
|
+
if type pipeline_compute_vitals &>/dev/null 2>&1 && [[ -f "$STATE_FILE" ]]; then
|
|
4672
|
+
local _total_health=0 _health_count=0
|
|
4673
|
+
while IFS= read -r _job; do
|
|
4674
|
+
local _job_issue _job_worktree
|
|
4675
|
+
_job_issue=$(echo "$_job" | jq -r '.issue // 0')
|
|
4676
|
+
_job_worktree=$(echo "$_job" | jq -r '.worktree // ""')
|
|
4677
|
+
if [[ -n "$_job_worktree" && -d "$_job_worktree/.claude" ]]; then
|
|
4678
|
+
local _job_vitals _job_health
|
|
4679
|
+
_job_vitals=$(pipeline_compute_vitals "$_job_worktree/.claude/pipeline-state.md" "$_job_worktree/.claude/pipeline-artifacts" "$_job_issue" 2>/dev/null) || true
|
|
4680
|
+
if [[ -n "$_job_vitals" && "$_job_vitals" != "{}" ]]; then
|
|
4681
|
+
_job_health=$(echo "$_job_vitals" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
|
|
4682
|
+
_total_health=$((_total_health + _job_health))
|
|
4683
|
+
_health_count=$((_health_count + 1))
|
|
4684
|
+
fi
|
|
4685
|
+
fi
|
|
4686
|
+
done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
|
|
4687
|
+
|
|
4688
|
+
if [[ "$_health_count" -gt 0 ]]; then
|
|
4689
|
+
local _avg_health=$((_total_health / _health_count))
|
|
4690
|
+
if [[ "$_avg_health" -lt 50 ]]; then
|
|
4691
|
+
# Pipelines struggling — reduce workers to give each more resources
|
|
4692
|
+
max_by_vitals=$(( MAX_WORKERS * _avg_health / 100 ))
|
|
4693
|
+
[[ "$max_by_vitals" -lt "$MIN_WORKERS" ]] && max_by_vitals="$MIN_WORKERS"
|
|
4694
|
+
daemon_log INFO "Auto-scale: vitals avg health ${_avg_health}% — capping at ${max_by_vitals} workers"
|
|
4695
|
+
fi
|
|
4696
|
+
# avg_health >= 50: no reduction (full capacity available)
|
|
4697
|
+
fi
|
|
4698
|
+
fi
|
|
4699
|
+
|
|
4042
4700
|
# ── Compute final value ──
|
|
4043
4701
|
local computed="$max_by_cpu"
|
|
4044
4702
|
[[ "$max_by_mem" -lt "$computed" ]] && computed="$max_by_mem"
|
|
4045
4703
|
[[ "$max_by_budget" -lt "$computed" ]] && computed="$max_by_budget"
|
|
4046
4704
|
[[ "$max_by_queue" -lt "$computed" ]] && computed="$max_by_queue"
|
|
4705
|
+
[[ "$max_by_vitals" -lt "$computed" ]] && computed="$max_by_vitals"
|
|
4047
4706
|
[[ "$MAX_WORKERS" -lt "$computed" ]] && computed="$MAX_WORKERS"
|
|
4048
4707
|
|
|
4049
4708
|
# Respect fleet-assigned ceiling if set
|
|
@@ -4302,7 +4961,19 @@ daemon_cleanup_stale() {
|
|
|
4302
4961
|
done < <(git worktree list --porcelain 2>/dev/null | grep '^worktree ' | sed 's/^worktree //')
|
|
4303
4962
|
fi
|
|
4304
4963
|
|
|
4305
|
-
# ── 2.
|
|
4964
|
+
# ── 2. Expire old checkpoints ──
|
|
4965
|
+
if [[ -x "$SCRIPT_DIR/sw-checkpoint.sh" ]]; then
|
|
4966
|
+
local expired_output
|
|
4967
|
+
expired_output=$(bash "$SCRIPT_DIR/sw-checkpoint.sh" expire --hours "$((age_days * 24))" 2>/dev/null || true)
|
|
4968
|
+
if [[ -n "$expired_output" ]] && echo "$expired_output" | grep -q "Expired"; then
|
|
4969
|
+
local expired_count
|
|
4970
|
+
expired_count=$(echo "$expired_output" | grep -c "Expired" || true)
|
|
4971
|
+
cleaned=$((cleaned + ${expired_count:-0}))
|
|
4972
|
+
daemon_log INFO "Expired ${expired_count:-0} old checkpoint(s)"
|
|
4973
|
+
fi
|
|
4974
|
+
fi
|
|
4975
|
+
|
|
4976
|
+
# ── 3. Clean old pipeline artifacts (subdirectories only) ──
|
|
4306
4977
|
local artifacts_dir=".claude/pipeline-artifacts"
|
|
4307
4978
|
if [[ -d "$artifacts_dir" ]]; then
|
|
4308
4979
|
while IFS= read -r artifact_dir; do
|
|
@@ -4393,6 +5064,7 @@ daemon_poll_loop() {
|
|
|
4393
5064
|
# All poll loop calls are error-guarded to prevent set -e from killing the daemon.
|
|
4394
5065
|
# The || operator disables set -e for the entire call chain, so transient failures
|
|
4395
5066
|
# (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
|
|
5067
|
+
daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
|
|
4396
5068
|
daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
|
|
4397
5069
|
daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
|
|
4398
5070
|
daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
|
|
@@ -4476,7 +5148,8 @@ cleanup_on_exit() {
|
|
|
4476
5148
|
while IFS= read -r cpid; do
|
|
4477
5149
|
[[ -z "$cpid" ]] && continue
|
|
4478
5150
|
if kill -0 "$cpid" 2>/dev/null; then
|
|
4479
|
-
daemon_log INFO "Killing pipeline process PID ${cpid}"
|
|
5151
|
+
daemon_log INFO "Killing pipeline process tree PID ${cpid}"
|
|
5152
|
+
pkill -TERM -P "$cpid" 2>/dev/null || true
|
|
4480
5153
|
kill "$cpid" 2>/dev/null || true
|
|
4481
5154
|
killed=$((killed + 1))
|
|
4482
5155
|
fi
|
|
@@ -4488,7 +5161,8 @@ cleanup_on_exit() {
|
|
|
4488
5161
|
while IFS= read -r cpid; do
|
|
4489
5162
|
[[ -z "$cpid" ]] && continue
|
|
4490
5163
|
if kill -0 "$cpid" 2>/dev/null; then
|
|
4491
|
-
daemon_log WARN "Force-killing pipeline PID ${cpid}"
|
|
5164
|
+
daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
|
|
5165
|
+
pkill -9 -P "$cpid" 2>/dev/null || true
|
|
4492
5166
|
kill -9 "$cpid" 2>/dev/null || true
|
|
4493
5167
|
fi
|
|
4494
5168
|
done <<< "$child_pids"
|