shipwright-cli 1.9.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/hooks/post-tool-use.sh +12 -5
- package/package.json +2 -2
- package/scripts/sw +9 -1
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-checkpoint.sh +79 -1
- package/scripts/sw-cleanup.sh +192 -7
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-cost.sh +1 -1
- package/scripts/sw-daemon.sh +409 -37
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +1 -1
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet.sh +1 -1
- package/scripts/sw-github-checks.sh +1 -1
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-init.sh +1 -1
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +4 -4
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +444 -49
- package/scripts/sw-memory.sh +198 -3
- package/scripts/sw-pipeline-composer.sh +8 -8
- package/scripts/sw-pipeline-vitals.sh +1096 -0
- package/scripts/sw-pipeline.sh +1692 -84
- package/scripts/sw-predictive.sh +1 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +4 -3
- package/scripts/sw-reaper.sh +5 -3
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-self-optimize.sh +109 -8
- package/scripts/sw-session.sh +31 -9
- package/scripts/sw-setup.sh +1 -1
- package/scripts/sw-status.sh +192 -1
- package/scripts/sw-templates.sh +1 -1
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-tracker.sh +1 -1
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-worktree.sh +1 -1
- package/templates/pipelines/autonomous.json +8 -1
- package/templates/pipelines/cost-aware.json +21 -0
- package/templates/pipelines/deployed.json +40 -6
- package/templates/pipelines/enterprise.json +16 -2
- package/templates/pipelines/fast.json +19 -0
- package/templates/pipelines/full.json +16 -2
- package/templates/pipelines/hotfix.json +19 -0
- package/templates/pipelines/standard.json +19 -0
package/scripts/sw-daemon.sh
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
set -euo pipefail
|
|
7
7
|
trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
|
|
8
8
|
|
|
9
|
-
VERSION="1.9.0"
|
|
9
|
+
VERSION="1.10.0"
|
|
10
10
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
11
11
|
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
12
12
|
|
|
@@ -34,6 +34,8 @@ RESET='\033[0m'
|
|
|
34
34
|
[[ -f "$SCRIPT_DIR/sw-self-optimize.sh" ]] && source "$SCRIPT_DIR/sw-self-optimize.sh"
|
|
35
35
|
# shellcheck source=sw-predictive.sh
|
|
36
36
|
[[ -f "$SCRIPT_DIR/sw-predictive.sh" ]] && source "$SCRIPT_DIR/sw-predictive.sh"
|
|
37
|
+
# shellcheck source=sw-pipeline-vitals.sh
|
|
38
|
+
[[ -f "$SCRIPT_DIR/sw-pipeline-vitals.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-vitals.sh"
|
|
37
39
|
|
|
38
40
|
# ─── GitHub API Modules (optional) ────────────────────────────────────────
|
|
39
41
|
# shellcheck source=sw-github-graphql.sh
|
|
@@ -125,7 +127,6 @@ rotate_event_log() {
|
|
|
125
127
|
}
|
|
126
128
|
|
|
127
129
|
# ─── GitHub Context (loaded once at startup) ──────────────────────────────
|
|
128
|
-
DAEMON_GITHUB_CONTEXT=""
|
|
129
130
|
|
|
130
131
|
daemon_github_context() {
|
|
131
132
|
# Skip if no GitHub
|
|
@@ -141,8 +142,6 @@ daemon_github_context() {
|
|
|
141
142
|
context=$(gh_repo_context "$owner" "$repo" 2>/dev/null || echo "{}")
|
|
142
143
|
if [[ -n "$context" && "$context" != "{}" ]]; then
|
|
143
144
|
daemon_log INFO "GitHub context loaded: $(echo "$context" | jq -r '.contributor_count // 0') contributors, $(echo "$context" | jq -r '.security_alert_count // 0') security alerts"
|
|
144
|
-
DAEMON_GITHUB_CONTEXT="$context"
|
|
145
|
-
export DAEMON_GITHUB_CONTEXT
|
|
146
145
|
fi
|
|
147
146
|
}
|
|
148
147
|
|
|
@@ -166,9 +165,9 @@ gh_retry() {
|
|
|
166
165
|
|
|
167
166
|
# Check for rate-limit or server error indicators
|
|
168
167
|
if echo "$output" | grep -qiE "rate limit|403|429|502|503"; then
|
|
169
|
-
daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s"
|
|
168
|
+
daemon_log WARN "gh_retry: rate limit / server error on attempt ${attempt}/${max_retries} — backoff ${backoff}s" >&2
|
|
170
169
|
else
|
|
171
|
-
daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s"
|
|
170
|
+
daemon_log WARN "gh_retry: transient error on attempt ${attempt}/${max_retries} (exit ${exit_code}) — backoff ${backoff}s" >&2
|
|
172
171
|
fi
|
|
173
172
|
|
|
174
173
|
if [[ $attempt -lt $max_retries ]]; then
|
|
@@ -421,6 +420,14 @@ load_config() {
|
|
|
421
420
|
MAX_RETRIES=$(jq -r '.max_retries // 2' "$config_file")
|
|
422
421
|
RETRY_ESCALATION=$(jq -r '.retry_escalation // true' "$config_file")
|
|
423
422
|
|
|
423
|
+
# session restart + fast test passthrough
|
|
424
|
+
MAX_RESTARTS_CFG=$(jq -r '.max_restarts // 3' "$config_file" 2>/dev/null || echo "3")
|
|
425
|
+
if ! [[ "$MAX_RESTARTS_CFG" =~ ^[0-9]+$ ]]; then
|
|
426
|
+
daemon_log WARN "Invalid max_restarts in config: $MAX_RESTARTS_CFG (using default: 3)"
|
|
427
|
+
MAX_RESTARTS_CFG="3"
|
|
428
|
+
fi
|
|
429
|
+
FAST_TEST_CMD_CFG=$(jq -r '.fast_test_cmd // ""' "$config_file" 2>/dev/null || echo "")
|
|
430
|
+
|
|
424
431
|
# self-optimization
|
|
425
432
|
SELF_OPTIMIZE=$(jq -r '.self_optimize // false' "$config_file")
|
|
426
433
|
OPTIMIZE_INTERVAL=$(jq -r '.optimize_interval // 10' "$config_file")
|
|
@@ -482,6 +489,12 @@ load_config() {
|
|
|
482
489
|
DASHBOARD_URL="$cfg_dashboard_url"
|
|
483
490
|
fi
|
|
484
491
|
|
|
492
|
+
# Auto-enable self_optimize when auto_template is on
|
|
493
|
+
if [[ "${AUTO_TEMPLATE:-false}" == "true" && "${SELF_OPTIMIZE:-false}" == "false" ]]; then
|
|
494
|
+
SELF_OPTIMIZE="true"
|
|
495
|
+
daemon_log INFO "Auto-enabling self_optimize (auto_template is true)"
|
|
496
|
+
fi
|
|
497
|
+
|
|
485
498
|
success "Config loaded"
|
|
486
499
|
}
|
|
487
500
|
|
|
@@ -855,7 +868,56 @@ daemon_assess_progress() {
|
|
|
855
868
|
if $npc == 0 then .last_progress_at = $ts else . end
|
|
856
869
|
' "$progress_file" > "$tmp_progress" 2>/dev/null && mv "$tmp_progress" "$progress_file"
|
|
857
870
|
|
|
858
|
-
#
|
|
871
|
+
# ── Vitals-based verdict (preferred over static thresholds) ──
|
|
872
|
+
if type pipeline_compute_vitals &>/dev/null 2>&1 && type pipeline_health_verdict &>/dev/null 2>&1; then
|
|
873
|
+
# Compute vitals using the worktree's pipeline state if available
|
|
874
|
+
local _worktree_state=""
|
|
875
|
+
local _worktree_artifacts=""
|
|
876
|
+
local _worktree_dir
|
|
877
|
+
_worktree_dir=$(jq -r --arg i "$issue_num" '.active_jobs[] | select(.issue == ($i | tonumber)) | .worktree // ""' "$STATE_FILE" 2>/dev/null || echo "")
|
|
878
|
+
if [[ -n "$_worktree_dir" && -d "$_worktree_dir/.claude" ]]; then
|
|
879
|
+
_worktree_state="$_worktree_dir/.claude/pipeline-state.md"
|
|
880
|
+
_worktree_artifacts="$_worktree_dir/.claude/pipeline-artifacts"
|
|
881
|
+
fi
|
|
882
|
+
|
|
883
|
+
local _vitals_json
|
|
884
|
+
_vitals_json=$(pipeline_compute_vitals "$_worktree_state" "$_worktree_artifacts" "$issue_num" 2>/dev/null) || true
|
|
885
|
+
if [[ -n "$_vitals_json" && "$_vitals_json" != "{}" ]]; then
|
|
886
|
+
local _health_verdict _health_score
|
|
887
|
+
_health_verdict=$(echo "$_vitals_json" | jq -r '.verdict // "continue"' 2>/dev/null || echo "continue")
|
|
888
|
+
_health_score=$(echo "$_vitals_json" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
|
|
889
|
+
|
|
890
|
+
emit_event "pipeline.vitals_check" \
|
|
891
|
+
"issue=$issue_num" \
|
|
892
|
+
"health_score=$_health_score" \
|
|
893
|
+
"verdict=$_health_verdict" \
|
|
894
|
+
"no_progress=$no_progress_count" \
|
|
895
|
+
"repeated_errors=$repeated_errors"
|
|
896
|
+
|
|
897
|
+
# Map vitals verdict to daemon verdict
|
|
898
|
+
case "$_health_verdict" in
|
|
899
|
+
continue)
|
|
900
|
+
echo "healthy"
|
|
901
|
+
return
|
|
902
|
+
;;
|
|
903
|
+
warn)
|
|
904
|
+
# Sluggish but not dead — equivalent to slowing
|
|
905
|
+
echo "slowing"
|
|
906
|
+
return
|
|
907
|
+
;;
|
|
908
|
+
intervene)
|
|
909
|
+
echo "stalled"
|
|
910
|
+
return
|
|
911
|
+
;;
|
|
912
|
+
abort)
|
|
913
|
+
echo "stuck"
|
|
914
|
+
return
|
|
915
|
+
;;
|
|
916
|
+
esac
|
|
917
|
+
fi
|
|
918
|
+
fi
|
|
919
|
+
|
|
920
|
+
# ── Fallback: static threshold verdict ──
|
|
859
921
|
local warn_threshold="${PROGRESS_CHECKS_BEFORE_WARN:-3}"
|
|
860
922
|
local kill_threshold="${PROGRESS_CHECKS_BEFORE_KILL:-6}"
|
|
861
923
|
|
|
@@ -1039,6 +1101,7 @@ extract_issue_dependencies() {
|
|
|
1039
1101
|
}
|
|
1040
1102
|
|
|
1041
1103
|
# ─── Logging ─────────────────────────────────────────────────────────────────
|
|
1104
|
+
DAEMON_LOG_WRITE_COUNT=0
|
|
1042
1105
|
|
|
1043
1106
|
daemon_log() {
|
|
1044
1107
|
local level="$1"
|
|
@@ -1048,8 +1111,9 @@ daemon_log() {
|
|
|
1048
1111
|
ts=$(now_iso)
|
|
1049
1112
|
echo "[$ts] [$level] $msg" >> "$LOG_FILE"
|
|
1050
1113
|
|
|
1051
|
-
# Rotate daemon.log if over 20MB (checked every
|
|
1052
|
-
|
|
1114
|
+
# Rotate daemon.log if over 20MB (checked every 100 writes)
|
|
1115
|
+
DAEMON_LOG_WRITE_COUNT=$(( DAEMON_LOG_WRITE_COUNT + 1 ))
|
|
1116
|
+
if [[ $(( DAEMON_LOG_WRITE_COUNT % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
|
|
1053
1117
|
local log_size
|
|
1054
1118
|
log_size=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
|
|
1055
1119
|
if [[ "$log_size" -gt 20971520 ]]; then
|
|
@@ -1060,11 +1124,14 @@ daemon_log() {
|
|
|
1060
1124
|
fi
|
|
1061
1125
|
fi
|
|
1062
1126
|
|
|
1063
|
-
#
|
|
1127
|
+
# Print to stderr (NOT stdout) to avoid corrupting command substitution captures.
|
|
1128
|
+
# This is critical: functions like select_pipeline_template(), triage_score_issue(),
|
|
1129
|
+
# gh_retry(), and locked_get_active_count() return values via echo/stdout and are
|
|
1130
|
+
# called via $(). If daemon_log writes to stdout, the log text corrupts return values.
|
|
1064
1131
|
case "$level" in
|
|
1065
|
-
INFO) info "$msg" ;;
|
|
1066
|
-
SUCCESS) success "$msg" ;;
|
|
1067
|
-
WARN) warn "$msg" ;;
|
|
1132
|
+
INFO) info "$msg" >&2 ;;
|
|
1133
|
+
SUCCESS) success "$msg" >&2 ;;
|
|
1134
|
+
WARN) warn "$msg" >&2 ;;
|
|
1068
1135
|
ERROR) error "$msg" ;;
|
|
1069
1136
|
esac
|
|
1070
1137
|
}
|
|
@@ -1130,7 +1197,10 @@ gh_record_failure() {
|
|
|
1130
1197
|
GH_CONSECUTIVE_FAILURES=$((GH_CONSECUTIVE_FAILURES + 1))
|
|
1131
1198
|
if [[ "$GH_CONSECUTIVE_FAILURES" -ge 3 ]]; then
|
|
1132
1199
|
# Exponential backoff: 30s, 60s, 120s, 240s (capped at 5min)
|
|
1133
|
-
|
|
1200
|
+
# Cap shift to avoid integer overflow for large failure counts
|
|
1201
|
+
local shift_amt=$(( GH_CONSECUTIVE_FAILURES - 3 ))
|
|
1202
|
+
[[ "$shift_amt" -gt 4 ]] && shift_amt=4
|
|
1203
|
+
local backoff_secs=$((30 * (1 << shift_amt)))
|
|
1134
1204
|
[[ "$backoff_secs" -gt 300 ]] && backoff_secs=300
|
|
1135
1205
|
GH_BACKOFF_UNTIL=$(( $(now_epoch) + backoff_secs ))
|
|
1136
1206
|
daemon_log WARN "GitHub rate-limit circuit breaker: backing off ${backoff_secs}s after ${GH_CONSECUTIVE_FAILURES} failures"
|
|
@@ -1380,7 +1450,7 @@ locked_get_active_count() {
|
|
|
1380
1450
|
(
|
|
1381
1451
|
if command -v flock &>/dev/null; then
|
|
1382
1452
|
flock -w 5 200 2>/dev/null || {
|
|
1383
|
-
daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default"
|
|
1453
|
+
daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default" >&2
|
|
1384
1454
|
echo "$MAX_PARALLEL"
|
|
1385
1455
|
exit 0
|
|
1386
1456
|
}
|
|
@@ -1626,6 +1696,17 @@ daemon_spawn_pipeline() {
|
|
|
1626
1696
|
daemon_log INFO "Worktree created at ${work_dir}"
|
|
1627
1697
|
fi
|
|
1628
1698
|
|
|
1699
|
+
# If template is "composed", copy the composed spec into the worktree
|
|
1700
|
+
if [[ "$PIPELINE_TEMPLATE" == "composed" ]]; then
|
|
1701
|
+
local _src_composed="${REPO_DIR:-.}/.claude/pipeline-artifacts/composed-pipeline.json"
|
|
1702
|
+
if [[ -f "$_src_composed" ]]; then
|
|
1703
|
+
local _dst_artifacts="${work_dir}/.claude/pipeline-artifacts"
|
|
1704
|
+
mkdir -p "$_dst_artifacts"
|
|
1705
|
+
cp "$_src_composed" "$_dst_artifacts/composed-pipeline.json" 2>/dev/null || true
|
|
1706
|
+
daemon_log INFO "Copied composed pipeline spec to worktree"
|
|
1707
|
+
fi
|
|
1708
|
+
fi
|
|
1709
|
+
|
|
1629
1710
|
# Build pipeline args
|
|
1630
1711
|
local pipeline_args=("start" "--issue" "$issue_num" "--pipeline" "$PIPELINE_TEMPLATE")
|
|
1631
1712
|
if [[ "$SKIP_GATES" == "true" ]]; then
|
|
@@ -1637,6 +1718,14 @@ daemon_spawn_pipeline() {
|
|
|
1637
1718
|
if [[ "$NO_GITHUB" == "true" ]]; then
|
|
1638
1719
|
pipeline_args+=("--no-github")
|
|
1639
1720
|
fi
|
|
1721
|
+
# Pass session restart config
|
|
1722
|
+
if [[ "${MAX_RESTARTS_CFG:-0}" -gt 0 ]]; then
|
|
1723
|
+
pipeline_args+=("--max-restarts" "$MAX_RESTARTS_CFG")
|
|
1724
|
+
fi
|
|
1725
|
+
# Pass fast test command
|
|
1726
|
+
if [[ -n "${FAST_TEST_CMD_CFG:-}" ]]; then
|
|
1727
|
+
pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
|
|
1728
|
+
fi
|
|
1640
1729
|
|
|
1641
1730
|
# Run pipeline in work directory (background)
|
|
1642
1731
|
echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
|
|
@@ -1770,6 +1859,41 @@ daemon_reap_completed() {
|
|
|
1770
1859
|
daemon_on_success "$issue_num" "$duration_str"
|
|
1771
1860
|
else
|
|
1772
1861
|
daemon_on_failure "$issue_num" "$exit_code" "$duration_str"
|
|
1862
|
+
|
|
1863
|
+
# Cancel any lingering in_progress GitHub Check Runs for failed job
|
|
1864
|
+
if [[ "${NO_GITHUB:-false}" != "true" && -n "$worktree" ]]; then
|
|
1865
|
+
local check_ids_file="${worktree}/.claude/pipeline-artifacts/check-run-ids.json"
|
|
1866
|
+
if [[ -f "$check_ids_file" ]]; then
|
|
1867
|
+
daemon_log INFO "Cancelling in-progress check runs for issue #${issue_num}"
|
|
1868
|
+
local _stage
|
|
1869
|
+
while IFS= read -r _stage; do
|
|
1870
|
+
[[ -z "$_stage" ]] && continue
|
|
1871
|
+
# Direct API call since we're in daemon context
|
|
1872
|
+
local _run_id
|
|
1873
|
+
_run_id=$(jq -r --arg s "$_stage" '.[$s] // empty' "$check_ids_file" 2>/dev/null || true)
|
|
1874
|
+
if [[ -n "$_run_id" && "$_run_id" != "null" ]]; then
|
|
1875
|
+
local _detected
|
|
1876
|
+
_detected=$(git remote get-url origin 2>/dev/null | sed 's|.*github.com[:/]\(.*\)\.git$|\1|' || true)
|
|
1877
|
+
if [[ -n "$_detected" ]]; then
|
|
1878
|
+
local _owner="${_detected%%/*}" _repo="${_detected##*/}"
|
|
1879
|
+
gh api "repos/${_owner}/${_repo}/check-runs/${_run_id}" \
|
|
1880
|
+
--method PATCH \
|
|
1881
|
+
--field status=completed \
|
|
1882
|
+
--field conclusion=cancelled \
|
|
1883
|
+
--silent 2>/dev/null || true
|
|
1884
|
+
fi
|
|
1885
|
+
fi
|
|
1886
|
+
done < <(jq -r 'keys[]' "$check_ids_file" 2>/dev/null || true)
|
|
1887
|
+
fi
|
|
1888
|
+
fi
|
|
1889
|
+
fi
|
|
1890
|
+
|
|
1891
|
+
# Finalize memory (capture failure patterns for future runs)
|
|
1892
|
+
if type memory_finalize_pipeline &>/dev/null 2>&1; then
|
|
1893
|
+
local _job_state _job_artifacts
|
|
1894
|
+
_job_state="${worktree:-.}/.claude/pipeline-state.md"
|
|
1895
|
+
_job_artifacts="${worktree:-.}/.claude/pipeline-artifacts"
|
|
1896
|
+
memory_finalize_pipeline "$_job_state" "$_job_artifacts" 2>/dev/null || true
|
|
1773
1897
|
fi
|
|
1774
1898
|
|
|
1775
1899
|
# Clean up progress tracking for this job
|
|
@@ -1958,6 +2082,25 @@ daemon_on_failure() {
|
|
|
1958
2082
|
fi
|
|
1959
2083
|
fi
|
|
1960
2084
|
|
|
2085
|
+
# Detect context exhaustion from progress file
|
|
2086
|
+
local failure_reason="unknown"
|
|
2087
|
+
local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
|
|
2088
|
+
local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
|
|
2089
|
+
if [[ -f "$progress_file" ]]; then
|
|
2090
|
+
local progress_iter
|
|
2091
|
+
progress_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
|
|
2092
|
+
if ! [[ "${progress_iter:-0}" =~ ^[0-9]+$ ]]; then
|
|
2093
|
+
progress_iter="0"
|
|
2094
|
+
fi
|
|
2095
|
+
local progress_tests
|
|
2096
|
+
progress_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | awk '{print $NF}' || echo "unknown")
|
|
2097
|
+
if [[ "${progress_iter:-0}" -gt 0 ]] && { [[ "$progress_tests" == "false" ]] || [[ "$progress_tests" == "unknown" ]]; }; then
|
|
2098
|
+
failure_reason="context_exhaustion"
|
|
2099
|
+
emit_event "daemon.context_exhaustion" "issue=$issue_num" "iterations=$progress_iter"
|
|
2100
|
+
daemon_log WARN "Context exhaustion detected for issue #${issue_num} (iterations: ${progress_iter})"
|
|
2101
|
+
fi
|
|
2102
|
+
fi
|
|
2103
|
+
|
|
1961
2104
|
# Build escalated pipeline args
|
|
1962
2105
|
local retry_template="$PIPELINE_TEMPLATE"
|
|
1963
2106
|
local retry_model="${MODEL:-opus}"
|
|
@@ -1976,6 +2119,17 @@ daemon_on_failure() {
|
|
|
1976
2119
|
daemon_log INFO "Escalation: template=full, compound_cycles=5"
|
|
1977
2120
|
fi
|
|
1978
2121
|
|
|
2122
|
+
# Increase restarts on context exhaustion
|
|
2123
|
+
if [[ "$failure_reason" == "context_exhaustion" ]]; then
|
|
2124
|
+
local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
|
|
2125
|
+
# Cap at sw-loop's hard limit of 5
|
|
2126
|
+
if [[ "$boosted_restarts" -gt 5 ]]; then
|
|
2127
|
+
boosted_restarts=5
|
|
2128
|
+
fi
|
|
2129
|
+
extra_args+=("--max-restarts" "$boosted_restarts")
|
|
2130
|
+
daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
|
|
2131
|
+
fi
|
|
2132
|
+
|
|
1979
2133
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
1980
2134
|
gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
|
|
1981
2135
|
|
|
@@ -2014,9 +2168,19 @@ _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increa
|
|
|
2014
2168
|
|
|
2015
2169
|
# ── No retry — report final failure ──
|
|
2016
2170
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2017
|
-
# Add failure label
|
|
2171
|
+
# Add failure label and remove watch label (prevent re-processing)
|
|
2018
2172
|
gh issue edit "$issue_num" \
|
|
2019
|
-
--add-label "$ON_FAILURE_ADD_LABEL"
|
|
2173
|
+
--add-label "$ON_FAILURE_ADD_LABEL" \
|
|
2174
|
+
--remove-label "$WATCH_LABEL" 2>/dev/null || true
|
|
2175
|
+
|
|
2176
|
+
# Close any draft PR created for this issue (cleanup abandoned work)
|
|
2177
|
+
local draft_pr
|
|
2178
|
+
draft_pr=$(gh pr list --head "daemon/issue-${issue_num}" --head "pipeline/pipeline-issue-${issue_num}" \
|
|
2179
|
+
--json number,isDraft --jq '.[] | select(.isDraft == true) | .number' 2>/dev/null | head -1 || true)
|
|
2180
|
+
if [[ -n "$draft_pr" ]]; then
|
|
2181
|
+
gh pr close "$draft_pr" --delete-branch 2>/dev/null || true
|
|
2182
|
+
daemon_log INFO "Closed draft PR #${draft_pr} for failed issue #${issue_num}"
|
|
2183
|
+
fi
|
|
2020
2184
|
|
|
2021
2185
|
# Comment with log tail
|
|
2022
2186
|
local log_tail=""
|
|
@@ -2075,7 +2239,7 @@ triage_score_issue() {
|
|
|
2075
2239
|
|
|
2076
2240
|
# ── Intelligence-powered triage (if enabled) ──
|
|
2077
2241
|
if [[ "${INTELLIGENCE_ENABLED:-false}" == "true" ]] && type intelligence_analyze_issue &>/dev/null 2>&1; then
|
|
2078
|
-
daemon_log INFO "Intelligence: using AI triage (intelligence enabled)"
|
|
2242
|
+
daemon_log INFO "Intelligence: using AI triage (intelligence enabled)" >&2
|
|
2079
2243
|
local analysis
|
|
2080
2244
|
analysis=$(intelligence_analyze_issue "$issue_json" 2>/dev/null || echo "")
|
|
2081
2245
|
if [[ -n "$analysis" && "$analysis" != "{}" && "$analysis" != "null" ]]; then
|
|
@@ -2114,9 +2278,9 @@ triage_score_issue() {
|
|
|
2114
2278
|
return
|
|
2115
2279
|
fi
|
|
2116
2280
|
# Fall through to heuristic scoring if intelligence call failed
|
|
2117
|
-
daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring"
|
|
2281
|
+
daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring" >&2
|
|
2118
2282
|
else
|
|
2119
|
-
daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)"
|
|
2283
|
+
daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)" >&2
|
|
2120
2284
|
fi
|
|
2121
2285
|
labels_csv=$(echo "$issue_json" | jq -r '[.labels[].name] | join(",")')
|
|
2122
2286
|
created_at=$(echo "$issue_json" | jq -r '.createdAt // ""')
|
|
@@ -2256,6 +2420,7 @@ triage_score_issue() {
|
|
|
2256
2420
|
select_pipeline_template() {
|
|
2257
2421
|
local labels="$1"
|
|
2258
2422
|
local score="${2:-50}"
|
|
2423
|
+
local _selected_template=""
|
|
2259
2424
|
|
|
2260
2425
|
# When auto_template is disabled, use default pipeline template
|
|
2261
2426
|
if [[ "${AUTO_TEMPLATE:-false}" != "true" ]]; then
|
|
@@ -2265,7 +2430,7 @@ select_pipeline_template() {
|
|
|
2265
2430
|
|
|
2266
2431
|
# ── Intelligence-composed pipeline (if enabled) ──
|
|
2267
2432
|
if [[ "${COMPOSER_ENABLED:-false}" == "true" ]] && type composer_create_pipeline &>/dev/null 2>&1; then
|
|
2268
|
-
daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)"
|
|
2433
|
+
daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)" >&2
|
|
2269
2434
|
local analysis="${INTELLIGENCE_ANALYSIS:-{}}"
|
|
2270
2435
|
local repo_context=""
|
|
2271
2436
|
if [[ -f "${REPO_DIR:-}/.claude/pipeline-state.md" ]]; then
|
|
@@ -2287,9 +2452,69 @@ select_pipeline_template() {
|
|
|
2287
2452
|
return
|
|
2288
2453
|
fi
|
|
2289
2454
|
# Fall through to static selection if composition failed
|
|
2290
|
-
daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection"
|
|
2455
|
+
daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection" >&2
|
|
2291
2456
|
else
|
|
2292
|
-
daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)"
|
|
2457
|
+
daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)" >&2
|
|
2458
|
+
fi
|
|
2459
|
+
|
|
2460
|
+
# ── DORA-driven template escalation ──
|
|
2461
|
+
if [[ -f "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" ]]; then
|
|
2462
|
+
local _dora_events _dora_total _dora_failures _dora_cfr
|
|
2463
|
+
_dora_events=$(tail -500 "${EVENTS_FILE:-$HOME/.shipwright/events.jsonl}" \
|
|
2464
|
+
| grep '"type":"pipeline.completed"' 2>/dev/null \
|
|
2465
|
+
| tail -5 || true)
|
|
2466
|
+
_dora_total=$(echo "$_dora_events" | grep -c '.' 2>/dev/null || echo "0")
|
|
2467
|
+
_dora_total="${_dora_total:-0}"
|
|
2468
|
+
if [[ "$_dora_total" -ge 3 ]]; then
|
|
2469
|
+
_dora_failures=$(echo "$_dora_events" | grep -c '"result":"failure"' 2>/dev/null || true)
|
|
2470
|
+
_dora_failures="${_dora_failures:-0}"
|
|
2471
|
+
_dora_cfr=$(( _dora_failures * 100 / _dora_total ))
|
|
2472
|
+
if [[ "$_dora_cfr" -gt 40 ]]; then
|
|
2473
|
+
daemon_log INFO "DORA escalation: CFR ${_dora_cfr}% > 40% — forcing enterprise template" >&2
|
|
2474
|
+
emit_event "daemon.dora_escalation" \
|
|
2475
|
+
"cfr=$_dora_cfr" \
|
|
2476
|
+
"total=$_dora_total" \
|
|
2477
|
+
"failures=$_dora_failures" \
|
|
2478
|
+
"template=enterprise"
|
|
2479
|
+
echo "enterprise"
|
|
2480
|
+
return
|
|
2481
|
+
fi
|
|
2482
|
+
if [[ "$_dora_cfr" -lt 10 && "$score" -ge 60 ]]; then
|
|
2483
|
+
daemon_log INFO "DORA: CFR ${_dora_cfr}% < 10% — fast template eligible" >&2
|
|
2484
|
+
# Fall through to allow other factors to also vote for fast
|
|
2485
|
+
fi
|
|
2486
|
+
|
|
2487
|
+
# ── DORA multi-factor ──
|
|
2488
|
+
# Cycle time: if median > 120min, prefer faster templates
|
|
2489
|
+
local _dora_cycle_time=0
|
|
2490
|
+
_dora_cycle_time=$(echo "$_dora_events" | jq -r 'select(.duration_s) | .duration_s' 2>/dev/null \
|
|
2491
|
+
| sort -n | awk '{ a[NR]=$1 } END { if (NR>0) print int(a[int(NR/2)+1]/60); else print 0 }' 2>/dev/null) || _dora_cycle_time=0
|
|
2492
|
+
_dora_cycle_time="${_dora_cycle_time:-0}"
|
|
2493
|
+
if [[ "${_dora_cycle_time:-0}" -gt 120 ]]; then
|
|
2494
|
+
daemon_log INFO "DORA: cycle time ${_dora_cycle_time}min > 120 — preferring fast template" >&2
|
|
2495
|
+
if [[ "${score:-0}" -ge 60 ]]; then
|
|
2496
|
+
echo "fast"
|
|
2497
|
+
return
|
|
2498
|
+
fi
|
|
2499
|
+
fi
|
|
2500
|
+
|
|
2501
|
+
# Deploy frequency: if < 1/week, use cost-aware
|
|
2502
|
+
local _dora_deploy_freq=0
|
|
2503
|
+
local _dora_first_epoch _dora_last_epoch _dora_span_days
|
|
2504
|
+
_dora_first_epoch=$(echo "$_dora_events" | head -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
|
|
2505
|
+
_dora_last_epoch=$(echo "$_dora_events" | tail -1 | jq -r '.timestamp // empty' 2>/dev/null | xargs -I{} date -j -f "%Y-%m-%dT%H:%M:%SZ" {} +%s 2>/dev/null || echo "0")
|
|
2506
|
+
if [[ "${_dora_first_epoch:-0}" -gt 0 && "${_dora_last_epoch:-0}" -gt 0 ]]; then
|
|
2507
|
+
_dora_span_days=$(( (_dora_last_epoch - _dora_first_epoch) / 86400 ))
|
|
2508
|
+
if [[ "${_dora_span_days:-0}" -gt 0 ]]; then
|
|
2509
|
+
_dora_deploy_freq=$(awk -v t="$_dora_total" -v d="$_dora_span_days" 'BEGIN { printf "%.1f", t * 7 / d }' 2>/dev/null) || _dora_deploy_freq=0
|
|
2510
|
+
fi
|
|
2511
|
+
fi
|
|
2512
|
+
if [[ -n "${_dora_deploy_freq:-}" ]] && awk -v f="${_dora_deploy_freq:-0}" 'BEGIN{exit !(f > 0 && f < 1)}' 2>/dev/null; then
|
|
2513
|
+
daemon_log INFO "DORA: deploy freq ${_dora_deploy_freq}/week — using cost-aware" >&2
|
|
2514
|
+
echo "cost-aware"
|
|
2515
|
+
return
|
|
2516
|
+
fi
|
|
2517
|
+
fi
|
|
2293
2518
|
fi
|
|
2294
2519
|
|
|
2295
2520
|
# ── Branch protection escalation (highest priority) ──
|
|
@@ -2306,7 +2531,7 @@ select_pipeline_template() {
|
|
|
2306
2531
|
local required_reviews
|
|
2307
2532
|
required_reviews=$(echo "$protection" | jq -r '.required_pull_request_reviews.required_approving_review_count // 0' 2>/dev/null || echo "0")
|
|
2308
2533
|
if [[ "$strict_protection" == "true" ]] || [[ "${required_reviews:-0}" -gt 1 ]]; then
|
|
2309
|
-
daemon_log INFO "Branch has strict protection — escalating to enterprise template"
|
|
2534
|
+
daemon_log INFO "Branch has strict protection — escalating to enterprise template" >&2
|
|
2310
2535
|
echo "enterprise"
|
|
2311
2536
|
return
|
|
2312
2537
|
fi
|
|
@@ -2340,6 +2565,62 @@ select_pipeline_template() {
|
|
|
2340
2565
|
fi
|
|
2341
2566
|
fi
|
|
2342
2567
|
|
|
2568
|
+
# ── Quality memory-driven selection ──
|
|
2569
|
+
local quality_scores_file="${HOME}/.shipwright/optimization/quality-scores.jsonl"
|
|
2570
|
+
if [[ -f "$quality_scores_file" ]]; then
|
|
2571
|
+
local repo_hash
|
|
2572
|
+
repo_hash=$(cd "${REPO_DIR:-.}" && git rev-parse --show-toplevel 2>/dev/null | shasum -a 256 | cut -c1-16 || echo "unknown")
|
|
2573
|
+
# Get last 5 quality scores for this repo
|
|
2574
|
+
local recent_scores avg_quality has_critical
|
|
2575
|
+
recent_scores=$(grep "\"repo\":\"$repo_hash\"" "$quality_scores_file" 2>/dev/null | tail -5 || true)
|
|
2576
|
+
if [[ -n "$recent_scores" ]]; then
|
|
2577
|
+
avg_quality=$(echo "$recent_scores" | jq -r '.quality_score // 70' 2>/dev/null | awk '{ sum += $1; count++ } END { if (count > 0) printf "%.0f", sum/count; else print 70 }')
|
|
2578
|
+
has_critical=$(echo "$recent_scores" | jq -r '.findings.critical // 0' 2>/dev/null | awk '{ sum += $1 } END { print (sum > 0) ? "yes" : "no" }')
|
|
2579
|
+
|
|
2580
|
+
# Critical findings in recent history → force enterprise
|
|
2581
|
+
if [[ "$has_critical" == "yes" ]]; then
|
|
2582
|
+
daemon_log INFO "Quality memory: critical findings in recent runs — using enterprise template" >&2
|
|
2583
|
+
echo "enterprise"
|
|
2584
|
+
return
|
|
2585
|
+
fi
|
|
2586
|
+
|
|
2587
|
+
# Poor quality history → use full template
|
|
2588
|
+
if [[ "${avg_quality:-70}" -lt 60 ]]; then
|
|
2589
|
+
daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — using full template" >&2
|
|
2590
|
+
echo "full"
|
|
2591
|
+
return
|
|
2592
|
+
fi
|
|
2593
|
+
|
|
2594
|
+
# Excellent quality history → allow faster template
|
|
2595
|
+
if [[ "${avg_quality:-70}" -gt 80 ]]; then
|
|
2596
|
+
daemon_log INFO "Quality memory: avg score ${avg_quality}/100 in recent runs — eligible for fast template" >&2
|
|
2597
|
+
# Only upgrade if score also suggests fast
|
|
2598
|
+
if [[ "$score" -ge 60 ]]; then
|
|
2599
|
+
echo "fast"
|
|
2600
|
+
return
|
|
2601
|
+
fi
|
|
2602
|
+
fi
|
|
2603
|
+
fi
|
|
2604
|
+
fi
|
|
2605
|
+
|
|
2606
|
+
# ── Learned template weights ──
|
|
2607
|
+
local _tw_file="${HOME}/.shipwright/optimization/template-weights.json"
|
|
2608
|
+
if [[ -f "$_tw_file" ]]; then
|
|
2609
|
+
local _best_template _best_rate
|
|
2610
|
+
_best_template=$(jq -r '
|
|
2611
|
+
.weights // {} | to_entries
|
|
2612
|
+
| map(select(.value.sample_size >= 3))
|
|
2613
|
+
| sort_by(-.value.success_rate)
|
|
2614
|
+
| .[0].key // ""
|
|
2615
|
+
' "$_tw_file" 2>/dev/null) || true
|
|
2616
|
+
if [[ -n "${_best_template:-}" && "${_best_template:-}" != "null" && "${_best_template:-}" != "" ]]; then
|
|
2617
|
+
_best_rate=$(jq -r --arg t "$_best_template" '.weights[$t].success_rate // 0' "$_tw_file" 2>/dev/null) || _best_rate=0
|
|
2618
|
+
daemon_log INFO "Template weights: ${_best_template} (${_best_rate} success rate)" >&2
|
|
2619
|
+
echo "$_best_template"
|
|
2620
|
+
return
|
|
2621
|
+
fi
|
|
2622
|
+
fi
|
|
2623
|
+
|
|
2343
2624
|
# ── Score-based selection ──
|
|
2344
2625
|
if [[ "$score" -ge 70 ]]; then
|
|
2345
2626
|
echo "fast"
|
|
@@ -2388,8 +2669,12 @@ daemon_triage_show() {
|
|
|
2388
2669
|
num=$(echo "$issue" | jq -r '.number')
|
|
2389
2670
|
title=$(echo "$issue" | jq -r '.title // "—"')
|
|
2390
2671
|
labels_csv=$(echo "$issue" | jq -r '[.labels[].name] | join(", ")')
|
|
2391
|
-
score=$(triage_score_issue "$issue")
|
|
2392
|
-
|
|
2672
|
+
score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
|
|
2673
|
+
score=$(printf '%s' "$score" | tr -cd '[:digit:]')
|
|
2674
|
+
[[ -z "$score" ]] && score=50
|
|
2675
|
+
template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
|
|
2676
|
+
template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
2677
|
+
[[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
|
|
2393
2678
|
|
|
2394
2679
|
scored_lines+=("${score}|${num}|${title}|${labels_csv}|${template}")
|
|
2395
2680
|
done < <(echo "$issues_json" | jq -c '.[]')
|
|
@@ -3221,11 +3506,12 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
|
|
|
3221
3506
|
if [[ ! -f "$scripts_dir/sw-${name}-test.sh" ]]; then
|
|
3222
3507
|
# Count usage across other scripts
|
|
3223
3508
|
local usage_count
|
|
3224
|
-
usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" ||
|
|
3509
|
+
usage_count=$(grep -rl "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null | grep -cv "$basename" 2>/dev/null || echo "0")
|
|
3225
3510
|
usage_count=${usage_count:-0}
|
|
3226
3511
|
|
|
3227
3512
|
local line_count
|
|
3228
|
-
line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ')
|
|
3513
|
+
line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ' || echo "0")
|
|
3514
|
+
line_count=${line_count:-0}
|
|
3229
3515
|
|
|
3230
3516
|
untested_entries="${untested_entries}${usage_count}|${basename}|${line_count}\n"
|
|
3231
3517
|
findings=$((findings + 1))
|
|
@@ -3602,7 +3888,9 @@ daemon_poll_issues() {
|
|
|
3602
3888
|
while IFS= read -r issue; do
|
|
3603
3889
|
local num score
|
|
3604
3890
|
num=$(echo "$issue" | jq -r '.number')
|
|
3605
|
-
score=$(triage_score_issue "$issue")
|
|
3891
|
+
score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
|
|
3892
|
+
score=$(printf '%s' "$score" | tr -cd '[:digit:]')
|
|
3893
|
+
[[ -z "$score" ]] && score=50
|
|
3606
3894
|
# For org mode, include repo name in the scored entry
|
|
3607
3895
|
local repo_name=""
|
|
3608
3896
|
if [[ "$WATCH_MODE" == "org" ]]; then
|
|
@@ -3629,10 +3917,10 @@ daemon_poll_issues() {
|
|
|
3629
3917
|
local sorted_order
|
|
3630
3918
|
if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
|
|
3631
3919
|
# Complex-first: lower score (more complex) first, lowest issue# first on ties
|
|
3632
|
-
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -n)
|
|
3920
|
+
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -n -k2,2 -n)
|
|
3633
3921
|
else
|
|
3634
|
-
# Quick-wins-first (default): higher score (simpler) first
|
|
3635
|
-
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -rn)
|
|
3922
|
+
# Quick-wins-first (default): higher score (simpler) first, lowest issue# first on ties
|
|
3923
|
+
sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -rn -k2,2 -n)
|
|
3636
3924
|
fi
|
|
3637
3925
|
|
|
3638
3926
|
# Dependency-aware reordering: move dependencies before dependents
|
|
@@ -3727,7 +4015,9 @@ daemon_poll_issues() {
|
|
|
3727
4015
|
emit_event "daemon.priority_lane" "issue=$issue_num" "score=$score"
|
|
3728
4016
|
|
|
3729
4017
|
local template
|
|
3730
|
-
template=$(select_pipeline_template "$labels_csv" "$score")
|
|
4018
|
+
template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
|
|
4019
|
+
template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
4020
|
+
[[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
|
|
3731
4021
|
daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template} [PRIORITY]"
|
|
3732
4022
|
|
|
3733
4023
|
local orig_template="$PIPELINE_TEMPLATE"
|
|
@@ -3748,7 +4038,9 @@ daemon_poll_issues() {
|
|
|
3748
4038
|
|
|
3749
4039
|
# Auto-select pipeline template based on labels + triage score
|
|
3750
4040
|
local template
|
|
3751
|
-
template=$(select_pipeline_template "$labels_csv" "$score")
|
|
4041
|
+
template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
|
|
4042
|
+
template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
4043
|
+
[[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
|
|
3752
4044
|
daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"
|
|
3753
4045
|
|
|
3754
4046
|
# Spawn pipeline (template selection applied via PIPELINE_TEMPLATE override)
|
|
@@ -3756,8 +4048,41 @@ daemon_poll_issues() {
|
|
|
3756
4048
|
PIPELINE_TEMPLATE="$template"
|
|
3757
4049
|
daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
|
|
3758
4050
|
PIPELINE_TEMPLATE="$orig_template"
|
|
4051
|
+
|
|
4052
|
+
# Stagger delay between spawns to avoid API contention
|
|
4053
|
+
local stagger_delay="${SPAWN_STAGGER_SECONDS:-15}"
|
|
4054
|
+
if [[ "$stagger_delay" -gt 0 ]]; then
|
|
4055
|
+
sleep "$stagger_delay"
|
|
4056
|
+
fi
|
|
3759
4057
|
done <<< "$sorted_order"
|
|
3760
4058
|
|
|
4059
|
+
# ── Drain queue if we have capacity (prevents deadlock when queue is
|
|
4060
|
+
# populated but no active jobs exist to trigger dequeue) ──
|
|
4061
|
+
local drain_active
|
|
4062
|
+
drain_active=$(locked_get_active_count)
|
|
4063
|
+
while [[ "$drain_active" -lt "$MAX_PARALLEL" ]]; do
|
|
4064
|
+
local drain_issue
|
|
4065
|
+
drain_issue=$(dequeue_next)
|
|
4066
|
+
[[ -z "$drain_issue" ]] && break
|
|
4067
|
+
local drain_title
|
|
4068
|
+
drain_title=$(jq -r --arg n "$drain_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)
|
|
4069
|
+
|
|
4070
|
+
local drain_labels drain_score drain_template
|
|
4071
|
+
drain_labels=$(echo "$issues_json" | jq -r --argjson n "$drain_issue" \
|
|
4072
|
+
'.[] | select(.number == $n) | [.labels[].name] | join(",")' 2>/dev/null || echo "")
|
|
4073
|
+
drain_score=$(echo "$sorted_order" | grep "|${drain_issue}|" | cut -d'|' -f1 || echo "50")
|
|
4074
|
+
drain_template=$(select_pipeline_template "$drain_labels" "${drain_score:-50}" 2>/dev/null | tail -1)
|
|
4075
|
+
drain_template=$(printf '%s' "$drain_template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
4076
|
+
[[ -z "$drain_template" ]] && drain_template="$PIPELINE_TEMPLATE"
|
|
4077
|
+
|
|
4078
|
+
daemon_log INFO "Draining queue: issue #${drain_issue}, template=${drain_template}"
|
|
4079
|
+
local orig_template="$PIPELINE_TEMPLATE"
|
|
4080
|
+
PIPELINE_TEMPLATE="$drain_template"
|
|
4081
|
+
daemon_spawn_pipeline "$drain_issue" "$drain_title"
|
|
4082
|
+
PIPELINE_TEMPLATE="$orig_template"
|
|
4083
|
+
drain_active=$(locked_get_active_count)
|
|
4084
|
+
done
|
|
4085
|
+
|
|
3761
4086
|
# Update last poll
|
|
3762
4087
|
update_state_field "last_poll" "$(now_iso)"
|
|
3763
4088
|
}
|
|
@@ -3908,8 +4233,11 @@ daemon_check_degradation() {
|
|
|
3908
4233
|
local failures successes
|
|
3909
4234
|
failures=$(echo "$recent" | jq '[.[] | select(.result == "failure")] | length')
|
|
3910
4235
|
successes=$(echo "$recent" | jq '[.[] | select(.result == "success")] | length')
|
|
3911
|
-
local cfr_pct
|
|
3912
|
-
|
|
4236
|
+
local cfr_pct=0 success_pct=0
|
|
4237
|
+
if [[ "${count:-0}" -gt 0 ]]; then
|
|
4238
|
+
cfr_pct=$(( failures * 100 / count ))
|
|
4239
|
+
success_pct=$(( successes * 100 / count ))
|
|
4240
|
+
fi
|
|
3913
4241
|
|
|
3914
4242
|
local alerts=""
|
|
3915
4243
|
if [[ "$cfr_pct" -gt "$cfr_threshold" ]]; then
|
|
@@ -4039,11 +4367,43 @@ daemon_auto_scale() {
|
|
|
4039
4367
|
local max_by_queue=$(( queue_depth + active_count ))
|
|
4040
4368
|
[[ "$max_by_queue" -lt 1 ]] && max_by_queue=1
|
|
4041
4369
|
|
|
4370
|
+
# ── Vitals-driven scaling factor ──
|
|
4371
|
+
local max_by_vitals="$MAX_WORKERS"
|
|
4372
|
+
if type pipeline_compute_vitals &>/dev/null 2>&1 && [[ -f "$STATE_FILE" ]]; then
|
|
4373
|
+
local _total_health=0 _health_count=0
|
|
4374
|
+
while IFS= read -r _job; do
|
|
4375
|
+
local _job_issue _job_worktree
|
|
4376
|
+
_job_issue=$(echo "$_job" | jq -r '.issue // 0')
|
|
4377
|
+
_job_worktree=$(echo "$_job" | jq -r '.worktree // ""')
|
|
4378
|
+
if [[ -n "$_job_worktree" && -d "$_job_worktree/.claude" ]]; then
|
|
4379
|
+
local _job_vitals _job_health
|
|
4380
|
+
_job_vitals=$(pipeline_compute_vitals "$_job_worktree/.claude/pipeline-state.md" "$_job_worktree/.claude/pipeline-artifacts" "$_job_issue" 2>/dev/null) || true
|
|
4381
|
+
if [[ -n "$_job_vitals" && "$_job_vitals" != "{}" ]]; then
|
|
4382
|
+
_job_health=$(echo "$_job_vitals" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
|
|
4383
|
+
_total_health=$((_total_health + _job_health))
|
|
4384
|
+
_health_count=$((_health_count + 1))
|
|
4385
|
+
fi
|
|
4386
|
+
fi
|
|
4387
|
+
done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
|
|
4388
|
+
|
|
4389
|
+
if [[ "$_health_count" -gt 0 ]]; then
|
|
4390
|
+
local _avg_health=$((_total_health / _health_count))
|
|
4391
|
+
if [[ "$_avg_health" -lt 50 ]]; then
|
|
4392
|
+
# Pipelines struggling — reduce workers to give each more resources
|
|
4393
|
+
max_by_vitals=$(( MAX_WORKERS * _avg_health / 100 ))
|
|
4394
|
+
[[ "$max_by_vitals" -lt "$MIN_WORKERS" ]] && max_by_vitals="$MIN_WORKERS"
|
|
4395
|
+
daemon_log INFO "Auto-scale: vitals avg health ${_avg_health}% — capping at ${max_by_vitals} workers"
|
|
4396
|
+
fi
|
|
4397
|
+
# avg_health >= 50: no reduction (max_by_vitals stays at MAX_WORKERS)
|
|
4398
|
+
fi
|
|
4399
|
+
fi
|
|
4400
|
+
|
|
4042
4401
|
# ── Compute final value ──
|
|
4043
4402
|
local computed="$max_by_cpu"
|
|
4044
4403
|
[[ "$max_by_mem" -lt "$computed" ]] && computed="$max_by_mem"
|
|
4045
4404
|
[[ "$max_by_budget" -lt "$computed" ]] && computed="$max_by_budget"
|
|
4046
4405
|
[[ "$max_by_queue" -lt "$computed" ]] && computed="$max_by_queue"
|
|
4406
|
+
[[ "$max_by_vitals" -lt "$computed" ]] && computed="$max_by_vitals"
|
|
4047
4407
|
[[ "$MAX_WORKERS" -lt "$computed" ]] && computed="$MAX_WORKERS"
|
|
4048
4408
|
|
|
4049
4409
|
# Respect fleet-assigned ceiling if set
|
|
@@ -4302,7 +4662,19 @@ daemon_cleanup_stale() {
|
|
|
4302
4662
|
done < <(git worktree list --porcelain 2>/dev/null | grep '^worktree ' | sed 's/^worktree //')
|
|
4303
4663
|
fi
|
|
4304
4664
|
|
|
4305
|
-
# ── 2.
|
|
4665
|
+
# ── 2. Expire old checkpoints ──
|
|
4666
|
+
if [[ -x "$SCRIPT_DIR/sw-checkpoint.sh" ]]; then
|
|
4667
|
+
local expired_output
|
|
4668
|
+
expired_output=$(bash "$SCRIPT_DIR/sw-checkpoint.sh" expire --hours "$((age_days * 24))" 2>/dev/null || true)
|
|
4669
|
+
if [[ -n "$expired_output" ]] && echo "$expired_output" | grep -q "Expired"; then
|
|
4670
|
+
local expired_count
|
|
4671
|
+
expired_count=$(echo "$expired_output" | grep -c "Expired" || true)
|
|
4672
|
+
cleaned=$((cleaned + ${expired_count:-0}))
|
|
4673
|
+
daemon_log INFO "Expired ${expired_count:-0} old checkpoint(s)"
|
|
4674
|
+
fi
|
|
4675
|
+
fi
|
|
4676
|
+
|
|
4677
|
+
# ── 3. Clean old pipeline artifacts (subdirectories only) ──
|
|
4306
4678
|
local artifacts_dir=".claude/pipeline-artifacts"
|
|
4307
4679
|
if [[ -d "$artifacts_dir" ]]; then
|
|
4308
4680
|
while IFS= read -r artifact_dir; do
|