shipwright-cli 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -16
- package/config/policy.schema.json +104 -29
- package/docs/AGI-PLATFORM-PLAN.md +11 -7
- package/docs/AGI-WHATS-NEXT.md +26 -20
- package/docs/README.md +2 -0
- package/package.json +1 -1
- package/scripts/check-version-consistency.sh +72 -0
- package/scripts/lib/daemon-adaptive.sh +610 -0
- package/scripts/lib/daemon-dispatch.sh +489 -0
- package/scripts/lib/daemon-failure.sh +387 -0
- package/scripts/lib/daemon-patrol.sh +1113 -0
- package/scripts/lib/daemon-poll.sh +1202 -0
- package/scripts/lib/daemon-state.sh +550 -0
- package/scripts/lib/daemon-triage.sh +490 -0
- package/scripts/lib/helpers.sh +81 -1
- package/scripts/lib/pipeline-detection.sh +278 -0
- package/scripts/lib/pipeline-github.sh +196 -0
- package/scripts/lib/pipeline-intelligence.sh +1706 -0
- package/scripts/lib/pipeline-quality-checks.sh +1054 -0
- package/scripts/lib/pipeline-quality.sh +11 -0
- package/scripts/lib/pipeline-stages.sh +2508 -0
- package/scripts/lib/pipeline-state.sh +529 -0
- package/scripts/sw +26 -4
- package/scripts/sw-activity.sh +1 -1
- package/scripts/sw-adaptive.sh +2 -2
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-auth.sh +1 -1
- package/scripts/sw-autonomous.sh +1 -1
- package/scripts/sw-changelog.sh +1 -1
- package/scripts/sw-checkpoint.sh +1 -1
- package/scripts/sw-ci.sh +1 -1
- package/scripts/sw-cleanup.sh +1 -1
- package/scripts/sw-code-review.sh +1 -1
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-context.sh +1 -1
- package/scripts/sw-cost.sh +1 -1
- package/scripts/sw-daemon.sh +52 -4816
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-db.sh +1 -1
- package/scripts/sw-decompose.sh +1 -1
- package/scripts/sw-deps.sh +1 -1
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-discovery.sh +1 -1
- package/scripts/sw-doc-fleet.sh +1 -1
- package/scripts/sw-docs-agent.sh +1 -1
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +42 -1
- package/scripts/sw-dora.sh +1 -1
- package/scripts/sw-durable.sh +1 -1
- package/scripts/sw-e2e-orchestrator.sh +1 -1
- package/scripts/sw-eventbus.sh +1 -1
- package/scripts/sw-feedback.sh +1 -1
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet-discover.sh +1 -1
- package/scripts/sw-fleet-viz.sh +3 -3
- package/scripts/sw-fleet.sh +1 -1
- package/scripts/sw-github-app.sh +1 -1
- package/scripts/sw-github-checks.sh +1 -1
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-guild.sh +1 -1
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-hygiene.sh +1 -1
- package/scripts/sw-incident.sh +1 -1
- package/scripts/sw-init.sh +1 -1
- package/scripts/sw-instrument.sh +1 -1
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +1 -1
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +1 -1
- package/scripts/sw-memory.sh +1 -1
- package/scripts/sw-mission-control.sh +1 -1
- package/scripts/sw-model-router.sh +1 -1
- package/scripts/sw-otel.sh +4 -4
- package/scripts/sw-oversight.sh +1 -1
- package/scripts/sw-pipeline-composer.sh +1 -1
- package/scripts/sw-pipeline-vitals.sh +1 -1
- package/scripts/sw-pipeline.sh +23 -56
- package/scripts/sw-pipeline.sh.mock +7 -0
- package/scripts/sw-pm.sh +1 -1
- package/scripts/sw-pr-lifecycle.sh +1 -1
- package/scripts/sw-predictive.sh +1 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +1 -1
- package/scripts/sw-public-dashboard.sh +1 -1
- package/scripts/sw-quality.sh +1 -1
- package/scripts/sw-reaper.sh +1 -1
- package/scripts/sw-recruit.sh +9 -1
- package/scripts/sw-regression.sh +1 -1
- package/scripts/sw-release-manager.sh +1 -1
- package/scripts/sw-release.sh +1 -1
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-replay.sh +1 -1
- package/scripts/sw-retro.sh +1 -1
- package/scripts/sw-scale.sh +8 -5
- package/scripts/sw-security-audit.sh +1 -1
- package/scripts/sw-self-optimize.sh +158 -7
- package/scripts/sw-session.sh +1 -1
- package/scripts/sw-setup.sh +1 -1
- package/scripts/sw-standup.sh +3 -3
- package/scripts/sw-status.sh +1 -1
- package/scripts/sw-strategic.sh +1 -1
- package/scripts/sw-stream.sh +8 -2
- package/scripts/sw-swarm.sh +7 -10
- package/scripts/sw-team-stages.sh +1 -1
- package/scripts/sw-templates.sh +1 -1
- package/scripts/sw-testgen.sh +1 -1
- package/scripts/sw-tmux-pipeline.sh +1 -1
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-trace.sh +1 -1
- package/scripts/sw-tracker.sh +24 -6
- package/scripts/sw-triage.sh +1 -1
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-ux.sh +1 -1
- package/scripts/sw-webhook.sh +1 -1
- package/scripts/sw-widgets.sh +1 -1
- package/scripts/sw-worktree.sh +1 -1
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
# daemon-failure.sh — Failure classification, retry, backoff (for sw-daemon.sh)
# Source from sw-daemon.sh. Requires state, helpers.
#
# Idempotent source guard: sourcing this file a second time is a no-op.
if [[ -n "${_DAEMON_FAILURE_LOADED:-}" ]]; then
  return 0
fi
_DAEMON_FAILURE_LOADED=1
|
|
5
|
+
|
|
6
|
+
# Classify why the pipeline for an issue failed by scanning its log tail.
# Arguments:
#   $1 - issue number
# Globals (read): LOG_DIR, WORKTREE_DIR, REPO_DIR
# Outputs: one of auth_error | api_error | invalid_issue |
#          context_exhaustion | build_failure | unknown (to stdout)
classify_failure() {
  local issue="$1"

  # Without a log directory (or a log file for this issue) there is
  # nothing to inspect.
  if [[ -z "${LOG_DIR:-}" ]]; then
    echo "unknown"
    return
  fi
  local logfile="$LOG_DIR/issue-${issue}.log"
  if [[ ! -f "$logfile" ]]; then
    echo "unknown"
    return
  fi

  local recent
  recent=$(tail -200 "$logfile" 2>/dev/null || true)

  # Credential / token problems — checked first; these are never retryable.
  if grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required' <<<"$recent"; then
    echo "auth_error"
    return
  fi

  # Transient API trouble: rate limits, timeouts, upstream 5xx errors.
  if grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable' <<<"$recent"; then
    echo "api_error"
    return
  fi

  # The issue itself is unusable (missing, empty body, unresolvable).
  if grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist' <<<"$recent"; then
    echo "invalid_issue"
    return
  fi

  # Context-exhaustion heuristic: the loop made iterations but the progress
  # file says tests never passed (explicitly "false" or "unknown"; an empty
  # read deliberately does NOT trigger this branch).
  local wt="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue}"
  local progress="${wt}/.claude/loop-logs/progress.md"
  if [[ -f "$progress" ]]; then
    local iters
    iters=$(grep -oE 'Iteration: [0-9]+' "$progress" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
    [[ "${iters:-0}" =~ ^[0-9]+$ ]] || iters="0"
    local tests_state
    tests_state=$(grep -oE 'Tests passing: (true|false)' "$progress" 2>/dev/null | awk '{print $NF}' || echo "unknown")
    if { [[ "$tests_state" == "false" ]] || [[ "$tests_state" == "unknown" ]]; } && [[ "${iters:-0}" -gt 0 ]]; then
      echo "context_exhaustion"
      return
    fi
  fi

  # Generic build/test breakage (compile, lint, npm, non-zero exits).
  if grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]' <<<"$recent"; then
    echo "build_failure"
    return
  fi

  echo "unknown"
}
|
|
56
|
+
|
|
57
|
+
# ─── Consecutive Failure Tracking (persisted + adaptive) ─────────────────────

# In-memory run of identical failure classes (mirrors persisted history).
DAEMON_CONSECUTIVE_FAILURE_CLASS=""
DAEMON_CONSECUTIVE_FAILURE_COUNT=0

# Echo the retry budget for a failure class (adaptive retry strategy).
# Each class has its own env-overridable ceiling; auth and invalid-issue
# failures are never retried.
# Arguments:
#   $1 - failure class (defaults to "unknown")
get_max_retries_for_class() {
  local klass="${1:-unknown}"
  case "$klass" in
    auth_error|invalid_issue)
      echo 0
      ;;
    api_error)
      echo "${MAX_RETRIES_API_ERROR:-4}"
      ;;
    context_exhaustion)
      echo "${MAX_RETRIES_CONTEXT_EXHAUSTION:-2}"
      ;;
    build_failure)
      echo "${MAX_RETRIES_BUILD:-2}"
      ;;
    *)
      echo "${MAX_RETRIES:-2}"
      ;;
  esac
}
|
|
73
|
+
|
|
74
|
+
# Record one failure occurrence: bump the in-memory consecutive counter,
# append to the persisted failure_history (capped at 100 entries), recompute
# the consecutive run length from state, and — at 3+ consecutive failures of
# the same class — write a pause flag with exponential backoff so the daemon
# auto-resumes later instead of stopping hard.
# Arguments:
#   $1 - failure class (from classify_failure)
# Globals: DAEMON_CONSECUTIVE_FAILURE_CLASS/_COUNT (read+write),
#          STATE_FILE, PAUSE_FLAG (read)
record_failure_class() {
  local klass="$1"

  # In-memory consecutive tracking (kept for backward compat).
  if [[ "$klass" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
    DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
  else
    DAEMON_CONSECUTIVE_FAILURE_CLASS="$klass"
    DAEMON_CONSECUTIVE_FAILURE_COUNT=1
  fi

  # Persist this failure into state (failure_history) for pattern tracking.
  if [[ -f "${STATE_FILE:-}" ]]; then
    local record
    record=$(jq -n --arg ts "$(now_iso)" --arg class "$klass" '{ts: $ts, class: $class}')
    locked_state_update --argjson entry "$record" \
      '.failure_history = ((.failure_history // []) + [$entry] | .[-100:])' 2>/dev/null || true
  fi

  # Derive the consecutive count from the persisted tail: only the unbroken
  # run of $klass counting back from the newest entry, not total occurrences.
  local run_len="$DAEMON_CONSECUTIVE_FAILURE_COUNT"
  if [[ -f "${STATE_FILE:-}" ]]; then
    local persisted
    persisted=$(jq -r --arg c "$klass" '
      (.failure_history // []) | [.[].class] | reverse |
      if length == 0 then 0
      elif .[0] != $c then 0
      else
        reduce .[] as $x (
          {count: 0, done: false};
          if .done then . elif $x == $c then .count += 1 else .done = true end
        ) | .count
      end
    ' "$STATE_FILE" 2>/dev/null || echo "1")
    run_len="${persisted:-1}"
    # Fall back to the in-memory counter when state yielded nothing useful.
    [[ "$run_len" -eq 0 ]] && run_len="$DAEMON_CONSECUTIVE_FAILURE_COUNT"
    DAEMON_CONSECUTIVE_FAILURE_COUNT="$run_len"
  fi

  # Smart pause: exponential backoff (5m doubling, capped at 8h) with a
  # resume_after timestamp so the daemon can auto-resume.
  if [[ "$run_len" -ge 3 ]]; then
    local pause_mins=$((5 * (1 << (run_len - 3))))
    [[ "$pause_mins" -gt 480 ]] && pause_mins=480
    local resume_epoch resume_at
    resume_epoch=$(($(date +%s) + pause_mins * 60))
    resume_at=$(epoch_to_iso "$resume_epoch")
    daemon_log ERROR "${run_len} consecutive failures (class: ${klass}) — auto-pausing until ${resume_at} (${pause_mins}m backoff)"
    local flag_json
    flag_json=$(jq -n \
      --arg reason "consecutive_${klass}" \
      --arg ts "$(now_iso)" \
      --arg resume "$resume_at" \
      --argjson count "$run_len" \
      '{reason: $reason, timestamp: $ts, resume_after: $resume, consecutive_count: $count}')
    # Stage in a temp file, then mv so the pause flag appears atomically.
    local staged
    staged=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
    echo "$flag_json" > "$staged"
    mv "$staged" "$PAUSE_FLAG"
    emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$klass" "count=$run_len" "resume_after=$resume_at"
  fi
}
|
|
136
|
+
|
|
137
|
+
# Clear the in-memory consecutive-failure counters (e.g. after a success).
reset_failure_tracking() {
  DAEMON_CONSECUTIVE_FAILURE_CLASS=''
  DAEMON_CONSECUTIVE_FAILURE_COUNT=0
}
|
|
141
|
+
|
|
142
|
+
# ─── Failure Handler ────────────────────────────────────────────────────────

# Handle a failed pipeline run for an issue: record duration/scaling metrics,
# log the failure into daemon state, classify it, auto-retry with an
# escalated strategy when the class allows, and otherwise report the final
# failure on the GitHub issue and clean up abandoned draft PRs.
# Arguments:
#   $1 - issue number
#   $2 - pipeline exit code (default 1)
#   $3 - human-readable duration, e.g. "1h2m3s" (optional)
# Globals (read): PIPELINE_TEMPLATE, MODEL, MAX_PARALLEL, STATE_FILE,
#   NO_GITHUB, WATCH_LABEL, ON_FAILURE_ADD_LABEL, ON_FAILURE_LOG_LINES,
#   LOG_DIR, REPO_DIR, SCRIPT_DIR, RETRY_ESCALATION, CHECKPOINT_ENABLED
daemon_on_failure() {
  local issue_num="$1" exit_code="${2:-1}" duration="${3:-}"

  daemon_log ERROR "Pipeline failed for issue #${issue_num} (exit: ${exit_code}, ${duration:-unknown})"

  # Record pipeline duration for adaptive threshold learning.
  # Duration strings look like "1h2m3s"; parse each unit independently.
  if [[ -n "$duration" && "$duration" != "unknown" ]]; then
    local dur_secs=0
    local _h _m _s
    _h=$(echo "$duration" | grep -oE '[0-9]+h' | grep -oE '[0-9]+' || true)
    _m=$(echo "$duration" | grep -oE '[0-9]+m' | grep -oE '[0-9]+' || true)
    _s=$(echo "$duration" | grep -oE '[0-9]+s' | grep -oE '[0-9]+' || true)
    dur_secs=$(( ${_h:-0} * 3600 + ${_m:-0} * 60 + ${_s:-0} ))
    if [[ "$dur_secs" -gt 0 ]]; then
      record_pipeline_duration "$PIPELINE_TEMPLATE" "$dur_secs" "failure"
      record_scaling_outcome "$MAX_PARALLEL" "failure"
    fi
  fi

  # Record in completed list (capped at the most recent 500 entries).
  locked_state_update \
    --argjson num "$issue_num" \
    --arg result "failed" \
    --argjson code "$exit_code" \
    --arg dur "${duration:-unknown}" \
    --arg completed_at "$(now_iso)" \
    '.completed += [{
      issue: $num,
      result: $result,
      exit_code: $code,
      duration: $dur,
      completed_at: $completed_at
    }] | .completed = .completed[-500:]'

  # ── Classify failure and decide retry strategy ──
  local failure_class
  failure_class=$(classify_failure "$issue_num")
  daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
  emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
  record_failure_class "$failure_class"

  # ── Auto-retry with strategy escalation ──
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
    local retry_count
    retry_count=$(jq -r --arg num "$issue_num" \
      '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")

    # Non-retryable failures — skip retry entirely
    case "$failure_class" in
      auth_error)
        daemon_log ERROR "Auth error for issue #${issue_num} — skipping retry"
        emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
        if [[ "$NO_GITHUB" != "true" ]]; then
          gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
        fi
        ;;
      invalid_issue)
        daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
        emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
        if [[ "$NO_GITHUB" != "true" ]]; then
          gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
        fi
        ;;
      *)
        # Retryable failures — per-class max retries and escalation
        local effective_max
        effective_max=$(get_max_retries_for_class "$failure_class")
        if [[ "$retry_count" -lt "$effective_max" ]]; then
          retry_count=$((retry_count + 1))

          # Update retry count in state (locked to prevent race)
          locked_state_update \
            --arg num "$issue_num" --argjson count "$retry_count" \
            '.retry_counts[$num] = $count'

          daemon_log WARN "Auto-retry #${retry_count}/${effective_max} for issue #${issue_num} (class: ${failure_class})"
          emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=$effective_max" "class=$failure_class"

          # Check for checkpoint to enable resume-from-checkpoint.
          # The glob iterates in lexical order, so the last match wins.
          local checkpoint_args=()
          if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
            local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
            if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
              local latest_checkpoint=""
              for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
                [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
              done
              if [[ -n "$latest_checkpoint" ]]; then
                daemon_log INFO "Found checkpoint: $latest_checkpoint"
                emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
                checkpoint_args+=("--resume")
              fi
            fi
          fi

          # Build escalated pipeline args: retry 1 upgrades the model and
          # iteration budget; retry 2+ also switches to the full template.
          local retry_template="$PIPELINE_TEMPLATE"
          local retry_model="${MODEL:-opus}"
          local extra_args=()

          if [[ "$retry_count" -eq 1 ]]; then
            retry_model="opus"
            extra_args+=("--max-iterations" "30")
            daemon_log INFO "Escalation: model=opus, max_iterations=30"
          elif [[ "$retry_count" -ge 2 ]]; then
            retry_template="full"
            retry_model="opus"
            extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
            daemon_log INFO "Escalation: template=full, compound_cycles=5"
          fi

          # Increase restarts on context exhaustion (capped at 5)
          if [[ "$failure_class" == "context_exhaustion" ]]; then
            local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
            if [[ "$boosted_restarts" -gt 5 ]]; then
              boosted_restarts=5
            fi
            extra_args+=("--max-restarts" "$boosted_restarts")
            daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
          fi

          # Exponential backoff (per-class base); cap at 1h
          local base_secs=30
          [[ "$failure_class" == "api_error" ]] && base_secs=300
          local backoff_secs=$((base_secs * (1 << (retry_count - 1))))
          [[ "$backoff_secs" -gt 3600 ]] && backoff_secs=3600
          [[ "$failure_class" == "api_error" ]] && daemon_log INFO "API error — exponential backoff ${backoff_secs}s"

          if [[ "$NO_GITHUB" != "true" ]]; then
            # FIX: report the per-class effective max, not the generic
            # MAX_RETRIES default — they can differ (e.g. api_error allows 4).
            gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}

Pipeline failed (${failure_class}) — retrying with escalated strategy.

| Field | Value |
|-------|-------|
| Retry | ${retry_count} / ${effective_max} |
| Failure | \`${failure_class}\` |
| Template | \`${retry_template}\` |
| Model | \`${retry_model}\` |
| Started | $(now_iso) |

_Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
          fi

          daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
          sleep "$backoff_secs"

          # Merge checkpoint args + extra args for passthrough
          local all_extra_args=()
          if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
            all_extra_args+=("${checkpoint_args[@]}")
          fi
          if [[ ${#extra_args[@]} -gt 0 ]]; then
            all_extra_args+=("${extra_args[@]}")
          fi

          # Re-spawn with escalated strategy; temporarily override the
          # template/model globals and restore them afterwards.
          local orig_template="$PIPELINE_TEMPLATE"
          local orig_model="$MODEL"
          PIPELINE_TEMPLATE="$retry_template"
          MODEL="$retry_model"
          # FIX: guard the expansion — an unguarded empty array errors under
          # `set -u` on bash < 4.4.
          daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" \
            ${all_extra_args[@]+"${all_extra_args[@]}"}
          _retry_spawned_for="$issue_num"
          PIPELINE_TEMPLATE="$orig_template"
          MODEL="$orig_model"
          return
        fi

        daemon_log WARN "Max retries (${effective_max}) exhausted for issue #${issue_num}"
        emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
        ;;
    esac
  fi

  # ── No retry — report final failure ──
  # PM agent: record failure for learning (only when we're done with this issue)
  if [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
    bash "$SCRIPT_DIR/sw-pm.sh" learn "$issue_num" failure 2>/dev/null || true
  fi

  if [[ "$NO_GITHUB" != "true" ]]; then
    # Add failure label and remove watch label (prevent re-processing)
    gh issue edit "$issue_num" \
      --add-label "$ON_FAILURE_ADD_LABEL" \
      --remove-label "$WATCH_LABEL" 2>/dev/null || true

    # Close any draft PR created for this issue (cleanup abandoned work).
    # FIX: `gh pr list` honors only ONE --head flag (a repeated flag simply
    # overrides the earlier one), so query each candidate branch separately.
    local draft_pr="" head_branch
    for head_branch in "daemon/issue-${issue_num}" "pipeline/pipeline-issue-${issue_num}"; do
      draft_pr=$(gh pr list --head "$head_branch" \
        --json number,isDraft --jq '.[] | select(.isDraft == true) | .number' 2>/dev/null | head -1 || true)
      [[ -n "$draft_pr" ]] && break
    done
    if [[ -n "$draft_pr" ]]; then
      gh pr close "$draft_pr" --delete-branch 2>/dev/null || true
      daemon_log INFO "Closed draft PR #${draft_pr} for failed issue #${issue_num}"
    fi

    # Comment with log tail
    local log_tail=""
    local log_path="$LOG_DIR/issue-${issue_num}.log"
    if [[ -f "$log_path" ]]; then
      log_tail=$(tail -"$ON_FAILURE_LOG_LINES" "$log_path" 2>/dev/null || true)
    fi

    local retry_info=""
    if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
      local final_count final_max
      final_count=$(jq -r --arg num "$issue_num" \
        '.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
      final_max=$(get_max_retries_for_class "$failure_class")
      retry_info="| Retries | ${final_count} / ${final_max} (exhausted) |"
    fi

    gh issue comment "$issue_num" --body "## ❌ Pipeline Failed

The autonomous pipeline encountered an error.

| Field | Value |
|-------|-------|
| Exit Code | ${exit_code} |
| Duration | ${duration:-unknown} |
| Failed At | $(now_iso) |
${retry_info}

<details>
<summary>Last ${ON_FAILURE_LOG_LINES} lines of log</summary>

\`\`\`
${log_tail}
\`\`\`

</details>

_Re-add the \`${WATCH_LABEL}\` label to retry._" 2>/dev/null || true
  fi

  notify "Pipeline Failed — Issue #${issue_num}" \
    "Exit code: ${exit_code}, Duration: ${duration:-unknown}" "error"
  "$SCRIPT_DIR/sw-tracker.sh" notify "failed" "$issue_num" "Exit code: ${exit_code}, Duration: ${duration:-unknown}" 2>/dev/null || true
}
|
|
382
|
+
|
|
383
|
+
# ─── Intelligent Triage ──────────────────────────────────────────────────────
|
|
384
|
+
|
|
385
|
+
# Score an issue from 0-100 based on multiple signals for intelligent prioritization.
|
|
386
|
+
# Combines priority labels, age, complexity, dependencies, type, and memory signals.
|
|
387
|
+
# When intelligence engine is enabled, uses semantic AI analysis for richer scoring.
|