shipwright-cli 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -16
- package/config/policy.schema.json +104 -29
- package/docs/AGI-PLATFORM-PLAN.md +11 -7
- package/docs/AGI-WHATS-NEXT.md +26 -20
- package/docs/README.md +2 -0
- package/package.json +1 -1
- package/scripts/check-version-consistency.sh +72 -0
- package/scripts/lib/daemon-adaptive.sh +610 -0
- package/scripts/lib/daemon-dispatch.sh +489 -0
- package/scripts/lib/daemon-failure.sh +387 -0
- package/scripts/lib/daemon-patrol.sh +1113 -0
- package/scripts/lib/daemon-poll.sh +1202 -0
- package/scripts/lib/daemon-state.sh +550 -0
- package/scripts/lib/daemon-triage.sh +490 -0
- package/scripts/lib/helpers.sh +81 -1
- package/scripts/lib/pipeline-detection.sh +278 -0
- package/scripts/lib/pipeline-github.sh +196 -0
- package/scripts/lib/pipeline-intelligence.sh +1706 -0
- package/scripts/lib/pipeline-quality-checks.sh +1054 -0
- package/scripts/lib/pipeline-quality.sh +11 -0
- package/scripts/lib/pipeline-stages.sh +2508 -0
- package/scripts/lib/pipeline-state.sh +529 -0
- package/scripts/sw +26 -4
- package/scripts/sw-activity.sh +1 -1
- package/scripts/sw-adaptive.sh +2 -2
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-auth.sh +1 -1
- package/scripts/sw-autonomous.sh +1 -1
- package/scripts/sw-changelog.sh +1 -1
- package/scripts/sw-checkpoint.sh +1 -1
- package/scripts/sw-ci.sh +1 -1
- package/scripts/sw-cleanup.sh +1 -1
- package/scripts/sw-code-review.sh +1 -1
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-context.sh +1 -1
- package/scripts/sw-cost.sh +1 -1
- package/scripts/sw-daemon.sh +52 -4816
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-db.sh +1 -1
- package/scripts/sw-decompose.sh +1 -1
- package/scripts/sw-deps.sh +1 -1
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-discovery.sh +1 -1
- package/scripts/sw-doc-fleet.sh +1 -1
- package/scripts/sw-docs-agent.sh +1 -1
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +42 -1
- package/scripts/sw-dora.sh +1 -1
- package/scripts/sw-durable.sh +1 -1
- package/scripts/sw-e2e-orchestrator.sh +1 -1
- package/scripts/sw-eventbus.sh +1 -1
- package/scripts/sw-feedback.sh +1 -1
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet-discover.sh +1 -1
- package/scripts/sw-fleet-viz.sh +3 -3
- package/scripts/sw-fleet.sh +1 -1
- package/scripts/sw-github-app.sh +1 -1
- package/scripts/sw-github-checks.sh +1 -1
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-guild.sh +1 -1
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-hygiene.sh +1 -1
- package/scripts/sw-incident.sh +1 -1
- package/scripts/sw-init.sh +1 -1
- package/scripts/sw-instrument.sh +1 -1
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +1 -1
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +1 -1
- package/scripts/sw-memory.sh +1 -1
- package/scripts/sw-mission-control.sh +1 -1
- package/scripts/sw-model-router.sh +1 -1
- package/scripts/sw-otel.sh +4 -4
- package/scripts/sw-oversight.sh +1 -1
- package/scripts/sw-pipeline-composer.sh +1 -1
- package/scripts/sw-pipeline-vitals.sh +1 -1
- package/scripts/sw-pipeline.sh +23 -56
- package/scripts/sw-pipeline.sh.mock +7 -0
- package/scripts/sw-pm.sh +1 -1
- package/scripts/sw-pr-lifecycle.sh +1 -1
- package/scripts/sw-predictive.sh +1 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +1 -1
- package/scripts/sw-public-dashboard.sh +1 -1
- package/scripts/sw-quality.sh +1 -1
- package/scripts/sw-reaper.sh +1 -1
- package/scripts/sw-recruit.sh +9 -1
- package/scripts/sw-regression.sh +1 -1
- package/scripts/sw-release-manager.sh +1 -1
- package/scripts/sw-release.sh +1 -1
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-replay.sh +1 -1
- package/scripts/sw-retro.sh +1 -1
- package/scripts/sw-scale.sh +8 -5
- package/scripts/sw-security-audit.sh +1 -1
- package/scripts/sw-self-optimize.sh +158 -7
- package/scripts/sw-session.sh +1 -1
- package/scripts/sw-setup.sh +1 -1
- package/scripts/sw-standup.sh +3 -3
- package/scripts/sw-status.sh +1 -1
- package/scripts/sw-strategic.sh +1 -1
- package/scripts/sw-stream.sh +8 -2
- package/scripts/sw-swarm.sh +7 -10
- package/scripts/sw-team-stages.sh +1 -1
- package/scripts/sw-templates.sh +1 -1
- package/scripts/sw-testgen.sh +1 -1
- package/scripts/sw-tmux-pipeline.sh +1 -1
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-trace.sh +1 -1
- package/scripts/sw-tracker.sh +24 -6
- package/scripts/sw-triage.sh +1 -1
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-ux.sh +1 -1
- package/scripts/sw-webhook.sh +1 -1
- package/scripts/sw-widgets.sh +1 -1
- package/scripts/sw-worktree.sh +1 -1
|
@@ -0,0 +1,1202 @@
|
|
|
1
|
+
# daemon-poll.sh — Poll loop, health, scale, cleanup (for sw-daemon.sh)
# Source from sw-daemon.sh. Requires daemon-health, state, dispatch, failure, patrol.

# Include guard: make repeated `source` calls idempotent.
if [[ -n "${_DAEMON_POLL_LOADED:-}" ]]; then
  return 0
fi
_DAEMON_POLL_LOADED=1
#######################################
# daemon_poll_issues — one poll cycle of the daemon's issue intake.
# Flow: honor pause flag / rate-limit backoff → fetch open issues carrying
# WATCH_LABEL (org-wide search or single-repo list) → triage-score and order
# them by PRIORITY_STRATEGY → optionally reorder so "depends on #X" issues run
# after their dependencies → spawn pipelines up to MAX_PARALLEL (priority-lane
# issues may bypass the cap) → enqueue overflow and drain the queue while
# capacity remains → stamp last_poll in the state file.
# Globals read: NO_GITHUB, PAUSE_FLAG, GH_BACKOFF_UNTIL, GH_RETRY_ENABLED,
#   WATCH_MODE, ORG, WATCH_LABEL, REPO_FILTER, ADAPTIVE_THRESHOLDS_ENABLED,
#   PRIORITY_STRATEGY, PRIORITY_LANE, PRIORITY_LANE_MAX, MAX_PARALLEL,
#   SCRIPT_DIR, SPAWN_STAGGER_SECONDS, STATE_FILE, HOME
# Globals written: BACKOFF_SECS; PIPELINE_TEMPLATE (overridden around each
#   spawn, then restored)
# Outputs: log lines via daemon_log; events via emit_event
# Returns: 0 (early-returns on skip conditions and API failures)
#######################################
daemon_poll_issues() {
  if [[ "$NO_GITHUB" == "true" ]]; then
    daemon_log INFO "Polling skipped (--no-github)"
    return
  fi

  # Check for pause flag (set by dashboard, disk_low, or consecutive-failure backoff)
  local pause_file="${PAUSE_FLAG:-$HOME/.shipwright/daemon-pause.flag}"
  if [[ -f "$pause_file" ]]; then
    local resume_after
    resume_after=$(jq -r '.resume_after // empty' "$pause_file" 2>/dev/null || true)
    if [[ -n "$resume_after" ]]; then
      local now_epoch resume_epoch
      now_epoch=$(date +%s)
      # BSD date (-j -f) first, GNU date (-d) as fallback; 0 means unparseable.
      resume_epoch=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$resume_after" +%s 2>/dev/null || \
        date -d "$resume_after" +%s 2>/dev/null || echo 0)
      if [[ "$resume_epoch" -gt 0 ]] && [[ "$now_epoch" -ge "$resume_epoch" ]]; then
        rm -f "$pause_file"
        daemon_log INFO "Auto-resuming after backoff (resume_after passed)"
      else
        daemon_log INFO "Daemon paused until ${resume_after} — skipping poll"
        return
      fi
    else
      daemon_log INFO "Daemon paused — skipping poll"
      return
    fi
  fi

  # Circuit breaker: skip poll if in backoff window
  if gh_rate_limited; then
    daemon_log INFO "Polling skipped (rate-limit backoff until $(epoch_to_iso "$GH_BACKOFF_UNTIL"))"
    return
  fi

  local issues_json

  # Select gh command wrapper: gh_retry for critical poll calls when enabled
  local gh_cmd="gh"
  if [[ "${GH_RETRY_ENABLED:-true}" == "true" ]]; then
    gh_cmd="gh_retry gh"
  fi

  if [[ "$WATCH_MODE" == "org" && -n "$ORG" ]]; then
    # Org-wide mode: search issues across all org repos
    issues_json=$($gh_cmd search issues \
      --label "$WATCH_LABEL" \
      --owner "$ORG" \
      --state open \
      --json repository,number,title,labels,body,createdAt \
      --limit 20 2>/dev/null) || {
      # Handle rate limiting with exponential backoff (30s doubling, capped at 300s)
      if [[ $BACKOFF_SECS -eq 0 ]]; then
        BACKOFF_SECS=30
      elif [[ $BACKOFF_SECS -lt 300 ]]; then
        BACKOFF_SECS=$((BACKOFF_SECS * 2))
        if [[ $BACKOFF_SECS -gt 300 ]]; then
          BACKOFF_SECS=300
        fi
      fi
      daemon_log WARN "GitHub API error (org search) — backing off ${BACKOFF_SECS}s"
      gh_record_failure
      sleep "$BACKOFF_SECS"
      return
    }

    # Filter by repo_filter regex if set
    if [[ -n "$REPO_FILTER" ]]; then
      issues_json=$(echo "$issues_json" | jq -c --arg filter "$REPO_FILTER" \
        '[.[] | select(.repository.nameWithOwner | test($filter))]')
    fi
  else
    # Standard single-repo mode
    issues_json=$($gh_cmd issue list \
      --label "$WATCH_LABEL" \
      --state open \
      --json number,title,labels,body,createdAt \
      --limit 20 2>/dev/null) || {
      # Handle rate limiting with exponential backoff (same policy as org mode)
      if [[ $BACKOFF_SECS -eq 0 ]]; then
        BACKOFF_SECS=30
      elif [[ $BACKOFF_SECS -lt 300 ]]; then
        BACKOFF_SECS=$((BACKOFF_SECS * 2))
        if [[ $BACKOFF_SECS -gt 300 ]]; then
          BACKOFF_SECS=300
        fi
      fi
      daemon_log WARN "GitHub API error — backing off ${BACKOFF_SECS}s"
      gh_record_failure
      sleep "$BACKOFF_SECS"
      return
    }
  fi

  # Reset backoff on success
  BACKOFF_SECS=0
  gh_record_success

  local issue_count
  issue_count=$(echo "$issues_json" | jq 'length' 2>/dev/null || echo 0)

  if [[ "$issue_count" -eq 0 ]]; then
    return
  fi

  local mode_label="repo"
  [[ "$WATCH_MODE" == "org" ]] && mode_label="org:${ORG}"
  daemon_log INFO "Found ${issue_count} issue(s) with label '${WATCH_LABEL}' (${mode_label})"
  emit_event "daemon.poll" "issues_found=$issue_count" "active=$(get_active_count)" "mode=$WATCH_MODE"

  # Score each issue using intelligent triage and sort by descending score
  local scored_issues=()
  local dep_graph="" # "issue:dep1,dep2" entries for dependency ordering
  while IFS= read -r issue; do
    local num score
    num=$(echo "$issue" | jq -r '.number')
    # tail -1 + digit-stripping guard against scorers that also log to stdout.
    score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
    score=$(printf '%s' "$score" | tr -cd '[:digit:]')
    [[ -z "$score" ]] && score=50
    # For org mode, include repo name in the scored entry
    local repo_name=""
    if [[ "$WATCH_MODE" == "org" ]]; then
      repo_name=$(echo "$issue" | jq -r '.repository.nameWithOwner // ""')
    fi
    scored_issues+=("${score}|${num}|${repo_name}")

    # Issue dependency detection (adaptive: extract "depends on #X", "blocked by #X")
    if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
      local issue_text
      issue_text=$(echo "$issue" | jq -r '(.title // "") + " " + (.body // "")')
      local deps
      deps=$(extract_issue_dependencies "$issue_text")
      if [[ -n "$deps" ]]; then
        local dep_nums
        dep_nums=$(echo "$deps" | tr -d '#' | tr '\n' ',' | sed 's/,$//')
        # Literal "\n" separator — consumers below expand it with `echo -e`.
        dep_graph="${dep_graph}${num}:${dep_nums}\n"
        daemon_log INFO "Issue #${num} depends on: ${deps//$'\n'/, }"
      fi
    fi
  done < <(echo "$issues_json" | jq -c '.[]')

  # Sort by score — strategy determines ascending vs descending
  local sorted_order
  if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
    # Complex-first: lower score (more complex) first
    sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -n -k2,2 -n)
  else
    # Quick-wins-first (default): higher score (simpler) first, lowest issue# first on ties
    sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -rn -k2,2 -n)
  fi

  # Dependency-aware reordering: move dependencies before dependents
  if [[ -n "$dep_graph" && "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
    local reordered=""
    local scheduled=""
    # Multiple passes to resolve transitive dependencies (max 3)
    local pass=0
    while [[ $pass -lt 3 ]]; do
      local changed=false
      local new_order=""
      while IFS='|' read -r s_score s_num s_repo; do
        [[ -z "$s_num" ]] && continue
        # Check if this issue has unscheduled dependencies
        local issue_deps
        issue_deps=$(echo -e "$dep_graph" | grep "^${s_num}:" | head -1 | cut -d: -f2 || true)
        if [[ -n "$issue_deps" ]]; then
          # Check if all deps are scheduled (or not in our issue set)
          local all_deps_ready=true
          local IFS_SAVE="$IFS"
          IFS=','
          for dep in $issue_deps; do
            # NOTE(review): ${dep## } / ${dep%% } strip at most ONE leading/
            # trailing space each — multi-space padding would survive. Confirm
            # extract_issue_dependencies never emits padded entries.
            dep="${dep## }"
            dep="${dep%% }"
            # Is this dep in our scored set and not yet scheduled?
            if echo "$sorted_order" | grep -q "|${dep}|" && ! echo "$scheduled" | grep -q "|${dep}|"; then
              all_deps_ready=false
              break
            fi
          done
          IFS="$IFS_SAVE"
          if [[ "$all_deps_ready" == "false" ]]; then
            # Defer this issue — append at end
            new_order="${new_order}${s_score}|${s_num}|${s_repo}\n"
            changed=true
            continue
          fi
        fi
        reordered="${reordered}${s_score}|${s_num}|${s_repo}\n"
        scheduled="${scheduled}|${s_num}|"
      done <<< "$sorted_order"
      # Append deferred issues
      reordered="${reordered}${new_order}"
      sorted_order=$(echo -e "$reordered" | grep -v '^$')
      reordered=""
      scheduled=""
      if [[ "$changed" == "false" ]]; then
        break
      fi
      pass=$((pass + 1))
    done
  fi

  local active_count
  active_count=$(locked_get_active_count)

  # Process each issue in triage order (process substitution keeps state in current shell)
  while IFS='|' read -r score issue_num repo_name; do
    [[ -z "$issue_num" ]] && continue

    local issue_title labels_csv
    issue_title=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | .title')
    labels_csv=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | [.labels[].name] | join(",")')

    # Cache title in state for dashboard visibility
    if [[ -n "$issue_title" ]]; then
      locked_state_update --arg num "$issue_num" --arg title "$issue_title" \
        '.titles[$num] = $title'
    fi

    # Skip if already inflight
    if daemon_is_inflight "$issue_num"; then
      continue
    fi

    # Distributed claim (skip if no machines registered)
    if [[ -f "$HOME/.shipwright/machines.json" ]]; then
      local machine_name
      machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
      if ! claim_issue "$issue_num" "$machine_name"; then
        daemon_log INFO "Issue #${issue_num} claimed by another machine — skipping"
        continue
      fi
    fi

    # Priority lane: bypass queue for critical issues
    if [[ "$PRIORITY_LANE" == "true" ]]; then
      local priority_active
      priority_active=$(get_priority_active_count)
      if is_priority_issue "$labels_csv" && [[ "$priority_active" -lt "$PRIORITY_LANE_MAX" ]]; then
        daemon_log WARN "PRIORITY LANE: issue #${issue_num} bypassing queue (${labels_csv})"
        emit_event "daemon.priority_lane" "issue=$issue_num" "score=$score"

        local template
        # Strip ANSI color codes and anything non-identifier from the helper's output.
        template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
        template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
        [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
        daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template} [PRIORITY]"

        # Temporarily override the global template for this spawn, then restore.
        local orig_template="$PIPELINE_TEMPLATE"
        PIPELINE_TEMPLATE="$template"
        daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
        PIPELINE_TEMPLATE="$orig_template"
        track_priority_job "$issue_num"
        continue
      fi
    fi

    # Check capacity
    active_count=$(locked_get_active_count)
    if [[ "$active_count" -ge "$MAX_PARALLEL" ]]; then
      enqueue_issue "$issue_num"
      continue
    fi

    # Auto-select pipeline template: PM recommendation (if available) else labels + triage score
    # NOTE(review): `local template` here does not reset a value carried over
    # from a previous loop iteration, and if the PM branch is skipped entirely
    # the variable may be unset when tested below (errors under `set -u`).
    # TODO confirm the daemon runs without `set -u` or initialize template="".
    local template
    if [[ "$NO_GITHUB" != "true" ]] && [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
      local pm_rec
      pm_rec=$(bash "$SCRIPT_DIR/sw-pm.sh" recommend --json "$issue_num" 2>/dev/null) || true
      if [[ -n "$pm_rec" ]]; then
        template=$(echo "$pm_rec" | jq -r '.team_composition.template // empty' 2>/dev/null) || true
        # Capability self-assessment: low confidence → upgrade to full template
        local confidence
        confidence=$(echo "$pm_rec" | jq -r '.team_composition.confidence_percent // 100' 2>/dev/null) || true
        if [[ -n "$confidence" && "$confidence" != "null" && "$confidence" -lt 60 ]]; then
          daemon_log INFO "Low PM confidence (${confidence}%) — upgrading to full template"
          template="full"
        fi
      fi
    fi
    if [[ -z "$template" ]]; then
      template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
    fi
    template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
    [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
    daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"

    # Spawn pipeline (template selection applied via PIPELINE_TEMPLATE override)
    local orig_template="$PIPELINE_TEMPLATE"
    PIPELINE_TEMPLATE="$template"
    daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
    PIPELINE_TEMPLATE="$orig_template"

    # Stagger delay between spawns to avoid API contention
    local stagger_delay="${SPAWN_STAGGER_SECONDS:-15}"
    if [[ "$stagger_delay" -gt 0 ]]; then
      sleep "$stagger_delay"
    fi
  done <<< "$sorted_order"

  # ── Drain queue if we have capacity (prevents deadlock when queue is
  # populated but no active jobs exist to trigger dequeue) ──
  local drain_active
  drain_active=$(locked_get_active_count)
  while [[ "$drain_active" -lt "$MAX_PARALLEL" ]]; do
    local drain_issue
    drain_issue=$(dequeue_next)
    [[ -z "$drain_issue" ]] && break
    local drain_title
    drain_title=$(jq -r --arg n "$drain_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)

    local drain_labels drain_score drain_template
    drain_labels=$(echo "$issues_json" | jq -r --argjson n "$drain_issue" \
      '.[] | select(.number == $n) | [.labels[].name] | join(",")' 2>/dev/null || echo "")
    drain_score=$(echo "$sorted_order" | grep "|${drain_issue}|" | cut -d'|' -f1 || echo "50")
    drain_template=$(select_pipeline_template "$drain_labels" "${drain_score:-50}" 2>/dev/null | tail -1)
    drain_template=$(printf '%s' "$drain_template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
    [[ -z "$drain_template" ]] && drain_template="$PIPELINE_TEMPLATE"

    daemon_log INFO "Draining queue: issue #${drain_issue}, template=${drain_template}"
    local orig_template="$PIPELINE_TEMPLATE"
    PIPELINE_TEMPLATE="$drain_template"
    # NOTE(review): unlike the spawns above, no repo argument is passed here —
    # org-mode issues drained from the queue lose their repo. TODO confirm
    # daemon_spawn_pipeline's default repo is correct for this path.
    daemon_spawn_pipeline "$drain_issue" "$drain_title"
    PIPELINE_TEMPLATE="$orig_template"
    drain_active=$(locked_get_active_count)
  done

  # Update last poll
  update_state_field "last_poll" "$(now_iso)"
}
|
|
336
|
+
|
|
337
|
+
# ─── Health Check ─────────────────────────────────────────────────────────────

#######################################
# daemon_health_check — periodic watchdog over active pipeline jobs plus
# disk-space and event-log hygiene checks.
# Per active job (from STATE_FILE .active_jobs): enforce the optional hard
# wall-clock limit; otherwise sense progress via snapshots and classify the
# job healthy/slowing/stalled/stuck, escalating gently — log, write a nudge
# file into the worktree, and only kill on a sustained error loop or a dead
# process tree. When progress monitoring is off, falls back to legacy
# time-based staleness (warn at 1x timeout, kill at 2x).
# Globals read: STATE_FILE, PROGRESS_HARD_LIMIT_S, PROGRESS_MONITORING,
#   NUDGE_ENABLED, NUDGE_AFTER_CHECKS, PROGRESS_DIR,
#   PROGRESS_CHECKS_BEFORE_KILL, PIPELINE_TEMPLATE, EVENTS_FILE, HOME
# Side effects: may kill job PIDs, write "${worktree}/.claude/nudge.md",
#   write the daemon pause flag on critical disk space, emit events.
# Returns: 0 always; emits a daemon.health event when findings > 0.
#######################################
daemon_health_check() {
  local findings=0
  local now_e
  now_e=$(now_epoch)

  if [[ -f "$STATE_FILE" ]]; then
    # ── Intelligent Health Monitoring ──
    # Instead of killing after a countdown, sense what the agent is doing.
    # Agents think for long stretches — that's normal and expected.
    # Strategy: sense → understand → be patient → nudge → only kill as last resort.

    local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
    local use_progress="${PROGRESS_MONITORING:-true}"
    local nudge_enabled="${NUDGE_ENABLED:-true}"
    local nudge_after="${NUDGE_AFTER_CHECKS:-40}"

    while IFS= read -r job; do
      local pid started_at issue_num worktree
      pid=$(echo "$job" | jq -r '.pid')
      started_at=$(echo "$job" | jq -r '.started_at // empty')
      issue_num=$(echo "$job" | jq -r '.issue')
      worktree=$(echo "$job" | jq -r '.worktree // ""')

      # Skip dead processes
      if ! kill -0 "$pid" 2>/dev/null; then
        continue
      fi

      local elapsed=0
      if [[ -n "$started_at" ]]; then
        local start_e
        # BSD date first, GNU date fallback; 0 on parse failure.
        start_e=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null || date -d "$started_at" +%s 2>/dev/null || echo "0")
        elapsed=$(( now_e - start_e ))
      fi

      # Hard wall-clock limit — disabled by default (0 = off)
      if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
        daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
        emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
        kill "$pid" 2>/dev/null || true
        daemon_clear_progress "$issue_num"
        findings=$((findings + 1))
        continue
      fi

      # ── Intelligent Progress Sensing ──
      if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
        local snapshot verdict
        snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')

        if [[ "$snapshot" != "{}" ]]; then
          verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")

          local no_progress_count=0
          no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
          local cur_stage
          cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')

          case "$verdict" in
            healthy)
              # All good — agent is making progress
              ;;
            slowing)
              daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
              ;;
            stalled)
              # Check if agent subprocess is alive and consuming CPU
              # (sums %CPU across the job's direct children).
              local agent_alive=false
              local child_cpu=0
              child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
              if [[ "${child_cpu:-0}" -gt 0 ]]; then
                agent_alive=true
              fi

              if [[ "$agent_alive" == "true" ]]; then
                daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
              else
                daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
                emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
              fi
              ;;
            stuck)
              local repeated_errors
              repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)

              # Even "stuck" — check if the process tree is alive first
              # (same child-CPU probe as the stalled branch).
              local agent_alive=false
              local child_cpu=0
              child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
              if [[ "${child_cpu:-0}" -gt 0 ]]; then
                agent_alive=true
              fi

              if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
                # Agent is alive — nudge instead of kill
                if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
                  local nudge_file="${worktree}/.claude/nudge.md"
                  # Only nudge once: skip if a nudge file already exists.
                  if [[ ! -f "$nudge_file" ]]; then
                    cat > "$nudge_file" <<NUDGE_EOF
# Nudge from Daemon Health Monitor

The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
Current stage: ${cur_stage}

If you're stuck, consider:
- Breaking the task into smaller steps
- Committing partial progress
- Running tests to validate current state

This is just a gentle check-in — take your time if you're working through a complex problem.
NUDGE_EOF
                    daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
                    emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
                  fi
                else
                  daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
                fi
              elif [[ "$repeated_errors" -ge 5 ]]; then
                # Truly stuck in an error loop — kill as last resort
                daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
                kill "$pid" 2>/dev/null || true
                daemon_clear_progress "$issue_num"
                findings=$((findings + 1))
              elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
                # Process tree is dead AND no progress for very long time
                # NOTE(review): PROGRESS_CHECKS_BEFORE_KILL is used unguarded
                # here — errors under `set -u` if unset. TODO confirm the
                # daemon config always defines it.
                daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
                kill "$pid" 2>/dev/null || true
                daemon_clear_progress "$issue_num"
                findings=$((findings + 1))
              else
                daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
              fi
              ;;
          esac
        fi
      else
        # Fallback: legacy time-based detection when progress monitoring is off
        local stale_timeout
        stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
        if [[ "$elapsed" -gt "$stale_timeout" ]]; then
          # Check if process is still alive
          if kill -0 "$pid" 2>/dev/null; then
            # Kill at 2x stale timeout — the process is truly hung
            local kill_threshold=$(( stale_timeout * 2 ))
            if [[ "$elapsed" -gt "$kill_threshold" ]]; then
              daemon_log WARN "Killing stale job (legacy): issue #${issue_num} (${elapsed}s > ${kill_threshold}s kill threshold, PID $pid)"
              emit_event "daemon.stale_kill" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
              # TERM first, then KILL after a 2s grace period.
              kill "$pid" 2>/dev/null || true
              sleep 2
              kill -9 "$pid" 2>/dev/null || true
            else
              daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — will kill at ${kill_threshold}s"
              emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
            fi
          else
            daemon_log WARN "Stale job with dead process: issue #${issue_num} (PID $pid no longer exists)"
            emit_event "daemon.stale_dead" "issue=$issue_num" "pid=$pid"
          fi
          findings=$((findings + 1))
        fi
      fi
    done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
  fi

  # Disk space warning (check both repo dir and ~/.shipwright)
  local free_kb
  free_kb=$(df -k "." 2>/dev/null | tail -1 | awk '{print $4}')
  # Warn below 1 GiB free in the current directory.
  if [[ -n "$free_kb" ]] && [[ "$free_kb" -lt 1048576 ]] 2>/dev/null; then
    daemon_log WARN "Low disk space: $(( free_kb / 1024 ))MB free"
    findings=$((findings + 1))
  fi

  # Critical disk space on ~/.shipwright — pause spawning
  local sw_free_kb
  sw_free_kb=$(df -k "$HOME/.shipwright" 2>/dev/null | tail -1 | awk '{print $4}')
  # Below ~500 MiB: write the pause flag so daemon_poll_issues stops spawning.
  if [[ -n "$sw_free_kb" ]] && [[ "$sw_free_kb" -lt 512000 ]] 2>/dev/null; then
    daemon_log WARN "Critical disk space on ~/.shipwright: $(( sw_free_kb / 1024 ))MB — pausing spawns"
    emit_event "daemon.disk_low" "free_mb=$(( sw_free_kb / 1024 ))"
    mkdir -p "$HOME/.shipwright"
    echo '{"paused":true,"reason":"disk_low"}' > "$HOME/.shipwright/daemon-pause.flag"
    findings=$((findings + 1))
  fi

  # Events file size warning
  if [[ -f "$EVENTS_FILE" ]]; then
    local events_size
    events_size=$(wc -c < "$EVENTS_FILE" 2>/dev/null || echo 0)
    if [[ "$events_size" -gt 104857600 ]]; then # 100MB
      daemon_log WARN "Events file large ($(( events_size / 1048576 ))MB) — consider rotating"
      findings=$((findings + 1))
    fi
  fi

  if [[ "$findings" -gt 0 ]]; then
    emit_event "daemon.health" "findings=$findings"
  fi
}
|
|
538
|
+
|
|
539
|
+
# ─── Degradation Alerting ─────────────────────────────────────────────────────

#######################################
# daemon_check_degradation — alert when recent pipeline outcomes degrade.
# Examines the last $DEGRADATION_WINDOW "pipeline.completed" events in
# EVENTS_FILE; warns when the change-failure rate exceeds
# DEGRADATION_CFR_THRESHOLD or the success rate drops below
# DEGRADATION_SUCCESS_THRESHOLD. Emits a daemon.alert event and, when
# SLACK_WEBHOOK is set, a Slack notification via notify().
# Globals read: EVENTS_FILE, DEGRADATION_WINDOW, DEGRADATION_CFR_THRESHOLD,
#   DEGRADATION_SUCCESS_THRESHOLD, SLACK_WEBHOOK
# Returns: 0 always; silently skips when fewer than window events exist.
#######################################
daemon_check_degradation() {
  if [[ ! -f "$EVENTS_FILE" ]]; then return; fi

  local window="${DEGRADATION_WINDOW:-5}"
  local cfr_threshold="${DEGRADATION_CFR_THRESHOLD:-30}"
  local success_threshold="${DEGRADATION_SUCCESS_THRESHOLD:-50}"

  # Fix: these values are interpolated into a jq program and used in bash
  # numeric comparisons; non-numeric config previously produced a silent jq
  # parse failure / comparison error. Fall back to defaults instead.
  [[ "$window" =~ ^[0-9]+$ ]] || window=5
  [[ "$cfr_threshold" =~ ^[0-9]+$ ]] || cfr_threshold=30
  [[ "$success_threshold" =~ ^[0-9]+$ ]] || success_threshold=50

  # Get last N pipeline completions (window is validated numeric above, so
  # direct interpolation into the slice is safe).
  local recent
  recent=$(tail -200 "$EVENTS_FILE" | jq -s "[.[] | select(.type == \"pipeline.completed\")] | .[-${window}:]" 2>/dev/null)
  local count
  count=$(echo "$recent" | jq 'length' 2>/dev/null || echo 0)

  # Not enough history for a meaningful rate yet. ${count:-0} guards the
  # empty-string case when jq produced no output.
  if [[ "${count:-0}" -lt "$window" ]]; then return; fi

  local failures successes
  failures=$(echo "$recent" | jq '[.[] | select(.result == "failure")] | length')
  successes=$(echo "$recent" | jq '[.[] | select(.result == "success")] | length')
  local cfr_pct=0 success_pct=0
  if [[ "${count:-0}" -gt 0 ]]; then
    cfr_pct=$(( failures * 100 / count ))
    success_pct=$(( successes * 100 / count ))
  fi

  local alerts=""
  if [[ "$cfr_pct" -gt "$cfr_threshold" ]]; then
    alerts="CFR ${cfr_pct}% exceeds threshold ${cfr_threshold}%"
    daemon_log WARN "DEGRADATION: $alerts"
  fi
  if [[ "$success_pct" -lt "$success_threshold" ]]; then
    local msg="Success rate ${success_pct}% below threshold ${success_threshold}%"
    [[ -n "$alerts" ]] && alerts="$alerts; $msg" || alerts="$msg"
    daemon_log WARN "DEGRADATION: $msg"
  fi

  if [[ -n "$alerts" ]]; then
    emit_event "daemon.alert" "alerts=$alerts" "cfr_pct=$cfr_pct" "success_pct=$success_pct"

    # Slack notification
    if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
      notify "Pipeline Degradation Alert" "$alerts" "warn"
    fi
  fi
}
|
|
585
|
+
|
|
586
|
+
# ─── Auto-Scaling ─────────────────────────────────────────────────────────
|
|
587
|
+
# Dynamically adjusts MAX_PARALLEL based on CPU, memory, budget, and queue depth
|
|
588
|
+
|
|
589
|
+
# ───────────────────────────────────────────────────────────────────────────
# daemon_auto_scale — recompute MAX_PARALLEL as the minimum of several caps.
#
# No-op unless AUTO_SCALE=true. Candidate caps, computed independently:
#   max_by_cpu    : 75% of cores, further reduced by a gradual load curve
#   max_by_mem    : available RAM divided by per-worker footprint (WORKER_MEM_GB)
#   max_by_budget : remaining budget divided by adaptive per-job cost estimate
#   max_by_queue  : queued + active jobs (never provision beyond demand)
#   max_by_vitals : reduced when running pipelines report poor health (<50)
# Final value = min(all caps, MAX_WORKERS, FLEET_MAX_PARALLEL if set), clamped
# up to MIN_WORKERS; with ADAPTIVE_THRESHOLDS_ENABLED=true the value may move
# by at most 1 per invocation.
#
# Globals read:    MAX_PARALLEL, MIN_WORKERS, MAX_WORKERS, WORKER_MEM_GB,
#                  PIPELINE_TEMPLATE, STATE_FILE, SCRIPT_DIR, FLEET_MAX_PARALLEL
# Globals written: MAX_PARALLEL
# Emits:           daemon.scale event when the value changes.
daemon_auto_scale() {
  if [[ "${AUTO_SCALE:-false}" != "true" ]]; then
    return
  fi

  local prev_max="$MAX_PARALLEL"

  # ── Learn worker memory from actual RSS (adaptive) ──
  learn_worker_memory

  # ── Adaptive cost estimate per template ──
  local effective_cost_per_job
  effective_cost_per_job=$(get_adaptive_cost_estimate "$PIPELINE_TEMPLATE")

  # ── CPU cores ──
  local cpu_cores=2
  if [[ "$(uname -s)" == "Darwin" ]]; then
    cpu_cores=$(sysctl -n hw.ncpu 2>/dev/null || echo 2)
  else
    cpu_cores=$(nproc 2>/dev/null || echo 2)
  fi
  local max_by_cpu=$(( (cpu_cores * 3) / 4 )) # 75% utilization cap
  [[ "$max_by_cpu" -lt 1 ]] && max_by_cpu=1

  # ── Load average check — gradual scaling curve (replaces 90% cliff) ──
  # NOTE(review): the 2>/dev/null binds only to the final awk stage, not to
  # uptime; a failing uptime would surface on stderr — confirm acceptable.
  local load_avg
  load_avg=$(uptime | awk -F'load averages?: ' '{print $2}' | awk -F'[, ]+' '{print $1}' 2>/dev/null || echo "0")
  if [[ ! "$load_avg" =~ ^[0-9]+\.?[0-9]*$ ]]; then
    load_avg="0"
  fi
  local load_ratio=0
  if [[ "$cpu_cores" -gt 0 ]]; then
    # load_ratio = 1-minute load average as a percentage of core count.
    load_ratio=$(awk -v load="$load_avg" -v cores="$cpu_cores" 'BEGIN { printf "%.0f", (load / cores) * 100 }')
  fi
  # Gradual load scaling curve (replaces binary 90% cliff)
  if [[ "$load_ratio" -gt 95 ]]; then
    # 95%+: minimum workers only
    max_by_cpu="$MIN_WORKERS"
    daemon_log WARN "Auto-scale: critical load (${load_ratio}%) — minimum workers only"
  elif [[ "$load_ratio" -gt 85 ]]; then
    # 85-95%: reduce by 50%
    max_by_cpu=$(( max_by_cpu / 2 ))
    [[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
    daemon_log WARN "Auto-scale: high load (${load_ratio}%) — reducing capacity 50%"
  elif [[ "$load_ratio" -gt 70 ]]; then
    # 70-85%: reduce by 25%
    max_by_cpu=$(( (max_by_cpu * 3) / 4 ))
    [[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
    daemon_log INFO "Auto-scale: moderate load (${load_ratio}%) — reducing capacity 25%"
  fi
  # 0-70%: full capacity (no change)

  # ── Available memory ──
  # macOS: sum free + speculative + inactive + purgeable pages from vm_stat
  # (vm_stat prints counts with a trailing '.', stripped by gsub).
  local avail_mem_gb=8
  if [[ "$(uname -s)" == "Darwin" ]]; then
    local page_size free_pages inactive_pages purgeable_pages speculative_pages
    page_size=$(vm_stat | awk '/page size of/ {for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/) print $i}')
    page_size="${page_size:-16384}"
    free_pages=$(vm_stat | awk '/^Pages free:/ {gsub(/\./, "", $NF); print $NF}')
    free_pages="${free_pages:-0}"
    speculative_pages=$(vm_stat | awk '/^Pages speculative:/ {gsub(/\./, "", $NF); print $NF}')
    speculative_pages="${speculative_pages:-0}"
    inactive_pages=$(vm_stat | awk '/^Pages inactive:/ {gsub(/\./, "", $NF); print $NF}')
    inactive_pages="${inactive_pages:-0}"
    purgeable_pages=$(vm_stat | awk '/^Pages purgeable:/ {gsub(/\./, "", $NF); print $NF}')
    purgeable_pages="${purgeable_pages:-0}"
    local avail_pages=$(( free_pages + speculative_pages + inactive_pages + purgeable_pages ))
    if [[ "$avail_pages" -gt 0 && "$page_size" -gt 0 ]]; then
      local free_bytes=$(( avail_pages * page_size ))
      avail_mem_gb=$(( free_bytes / 1073741824 ))
    fi
  else
    # Linux: MemAvailable from /proc/meminfo (kB); default assumes 8GB.
    local avail_kb
    avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo 2>/dev/null || echo "8388608")
    avail_mem_gb=$(( avail_kb / 1048576 ))
  fi
  [[ "$avail_mem_gb" -lt 1 ]] && avail_mem_gb=1
  # NOTE(review): assumes WORKER_MEM_GB >= 1 (integer division; 0 would be a
  # divide-by-zero) — confirm learn_worker_memory guarantees this.
  local max_by_mem=$(( avail_mem_gb / WORKER_MEM_GB ))
  [[ "$max_by_mem" -lt 1 ]] && max_by_mem=1

  # ── Budget remaining (adaptive cost estimate) ──
  local max_by_budget="$MAX_WORKERS"
  local remaining_usd
  remaining_usd=$("$SCRIPT_DIR/sw-cost.sh" remaining-budget 2>/dev/null || echo "unlimited")
  if [[ "$remaining_usd" != "unlimited" && -n "$remaining_usd" ]]; then
    # awk does the float comparison; exit 0 means both budget and cost > 0.
    if awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { exit !(r > 0 && c > 0) }'; then
      max_by_budget=$(awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { printf "%.0f", r / c }')
      [[ "$max_by_budget" -lt 0 ]] && max_by_budget=0
    else
      max_by_budget=0
    fi
  fi

  # ── Queue depth (don't over-provision) ──
  local queue_depth active_count
  queue_depth=$(jq -r '.queued | length' "$STATE_FILE" 2>/dev/null || echo 0)
  queue_depth="${queue_depth:-0}"
  [[ ! "$queue_depth" =~ ^[0-9]+$ ]] && queue_depth=0
  active_count=$(get_active_count)
  active_count="${active_count:-0}"
  [[ ! "$active_count" =~ ^[0-9]+$ ]] && active_count=0
  local max_by_queue=$(( queue_depth + active_count ))
  [[ "$max_by_queue" -lt 1 ]] && max_by_queue=1

  # ── Vitals-driven scaling factor ──
  # Averages health_score across active jobs' pipeline state; only reduces
  # capacity when the average is poor. (The `2>&1` after `&>` is redundant.)
  local max_by_vitals="$MAX_WORKERS"
  if type pipeline_compute_vitals &>/dev/null 2>&1 && [[ -f "$STATE_FILE" ]]; then
    local _total_health=0 _health_count=0
    while IFS= read -r _job; do
      local _job_issue _job_worktree
      _job_issue=$(echo "$_job" | jq -r '.issue // 0')
      _job_worktree=$(echo "$_job" | jq -r '.worktree // ""')
      if [[ -n "$_job_worktree" && -d "$_job_worktree/.claude" ]]; then
        local _job_vitals _job_health
        _job_vitals=$(pipeline_compute_vitals "$_job_worktree/.claude/pipeline-state.md" "$_job_worktree/.claude/pipeline-artifacts" "$_job_issue" 2>/dev/null) || true
        if [[ -n "$_job_vitals" && "$_job_vitals" != "{}" ]]; then
          _job_health=$(echo "$_job_vitals" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
          _total_health=$((_total_health + _job_health))
          _health_count=$((_health_count + 1))
        fi
      fi
    done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)

    if [[ "$_health_count" -gt 0 ]]; then
      local _avg_health=$((_total_health / _health_count))
      if [[ "$_avg_health" -lt 50 ]]; then
        # Pipelines struggling — reduce workers to give each more resources
        max_by_vitals=$(( MAX_WORKERS * _avg_health / 100 ))
        [[ "$max_by_vitals" -lt "$MIN_WORKERS" ]] && max_by_vitals="$MIN_WORKERS"
        daemon_log INFO "Auto-scale: vitals avg health ${_avg_health}% — capping at ${max_by_vitals} workers"
      fi
      # avg_health > 70: no reduction (full capacity available)
    fi
  fi

  # ── Compute final value ── (minimum across all caps)
  local computed="$max_by_cpu"
  [[ "$max_by_mem" -lt "$computed" ]] && computed="$max_by_mem"
  [[ "$max_by_budget" -lt "$computed" ]] && computed="$max_by_budget"
  [[ "$max_by_queue" -lt "$computed" ]] && computed="$max_by_queue"
  [[ "$max_by_vitals" -lt "$computed" ]] && computed="$max_by_vitals"
  [[ "$MAX_WORKERS" -lt "$computed" ]] && computed="$MAX_WORKERS"

  # Respect fleet-assigned ceiling if set
  if [[ -n "${FLEET_MAX_PARALLEL:-}" && "$FLEET_MAX_PARALLEL" -lt "$computed" ]]; then
    computed="$FLEET_MAX_PARALLEL"
  fi

  # Clamp to min_workers
  [[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"

  # ── Gradual scaling: change by at most 1 at a time (adaptive) ──
  if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
    if [[ "$computed" -gt "$prev_max" ]]; then
      # Check success rate at target parallelism before scaling up
      local target_rate
      target_rate=$(get_success_rate_at_parallelism "$((prev_max + 1))")
      if [[ "$target_rate" -lt 50 ]]; then
        # Poor success rate at higher parallelism — hold steady
        computed="$prev_max"
        daemon_log INFO "Auto-scale: holding at ${prev_max} (success rate ${target_rate}% at $((prev_max + 1)))"
      else
        # Scale up by 1, not jump to target
        computed=$((prev_max + 1))
      fi
    elif [[ "$computed" -lt "$prev_max" ]]; then
      # Scale down by 1, not drop to minimum
      computed=$((prev_max - 1))
      [[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"
    fi
  fi

  MAX_PARALLEL="$computed"

  if [[ "$MAX_PARALLEL" -ne "$prev_max" ]]; then
    daemon_log INFO "Auto-scale: ${prev_max} → ${MAX_PARALLEL} (cpu=${max_by_cpu} mem=${max_by_mem} budget=${max_by_budget} queue=${max_by_queue} load=${load_ratio}%)"
    emit_event "daemon.scale" \
      "from=$prev_max" \
      "to=$MAX_PARALLEL" \
      "max_by_cpu=$max_by_cpu" \
      "max_by_mem=$max_by_mem" \
      "max_by_budget=$max_by_budget" \
      "max_by_queue=$max_by_queue" \
      "cpu_cores=$cpu_cores" \
      "avail_mem_gb=$avail_mem_gb" \
      "remaining_usd=$remaining_usd" \
      "load_ratio=$load_ratio"
  fi
}
|
|
778
|
+
|
|
779
|
+
# ─── Fleet Config Reload ──────────────────────────────────────────────────
|
|
780
|
+
# Checks for fleet-reload.flag and reloads MAX_PARALLEL from fleet-managed config
|
|
781
|
+
|
|
782
|
+
# Apply a fleet-initiated configuration reload, if one is pending.
# The fleet manager requests a reload by touching ~/.shipwright/fleet-reload.flag;
# when present, MAX_PARALLEL (and the FLEET_MAX_PARALLEL ceiling) are re-read
# from .claude/.fleet-daemon-config.json and the flag file is consumed.
daemon_reload_config() {
  local flag_file="$HOME/.shipwright/fleet-reload.flag"
  [[ -f "$flag_file" ]] || return 0

  local cfg=".claude/.fleet-daemon-config.json"
  if [[ -f "$cfg" ]]; then
    local ceiling
    ceiling=$(jq -r '.max_parallel // empty' "$cfg" 2>/dev/null || true)
    if [[ -n "$ceiling" && "$ceiling" != "null" ]]; then
      local old_max="$MAX_PARALLEL"
      FLEET_MAX_PARALLEL="$ceiling"
      MAX_PARALLEL="$ceiling"
      daemon_log INFO "Fleet reload: max_parallel ${old_max} → ${MAX_PARALLEL} (fleet ceiling: ${FLEET_MAX_PARALLEL})"
      emit_event "daemon.fleet_reload" "from=$old_max" "to=$MAX_PARALLEL"
    fi
  fi

  # Consume the flag even when no new value could be read, so a broken
  # config does not retrigger the reload every cycle.
  rm -f "$flag_file"
}
|
|
803
|
+
|
|
804
|
+
# ─── Self-Optimizing Metrics Loop ──────────────────────────────────────────
|
|
805
|
+
|
|
806
|
+
# ───────────────────────────────────────────────────────────────────────────
# daemon_self_optimize — DORA-metrics feedback loop (only when SELF_OPTIMIZE=true).
#
# Reads the last 7 days of events and derives: CFR (change failure rate),
# median cycle time, deploy frequency, and MTTR; then nudges daemon tunables
# (PIPELINE_TEMPLATE, MAX_PARALLEL, POLL_INTERVAL, AUTO_TEMPLATE), records the
# adjustments in the daemon state, and persists them to daemon-config.json so
# they survive a restart. Emits a daemon.optimize event when anything changed.
daemon_self_optimize() {
  if [[ "${SELF_OPTIMIZE:-false}" != "true" ]]; then
    return
  fi

  if [[ ! -f "$EVENTS_FILE" ]]; then
    return
  fi

  # ── Intelligence-powered optimization (if enabled) ──
  # (The `2>&1` after `&>` is redundant but harmless.)
  if [[ "${OPTIMIZATION_ENABLED:-false}" == "true" ]] && type optimize_full_analysis &>/dev/null 2>&1; then
    daemon_log INFO "Running intelligence-powered optimization"
    optimize_full_analysis 2>/dev/null || {
      daemon_log WARN "Intelligence optimization failed — falling back to DORA-based tuning"
    }
    # Still run DORA-based tuning below as a complement
  fi

  daemon_log INFO "Running self-optimization check"

  # Read DORA metrics from recent events (last 7 days)
  local cutoff_epoch
  cutoff_epoch=$(( $(now_epoch) - (7 * 86400) ))

  # Events are one JSON object per line; keep only those inside the window.
  local period_events
  period_events=$(jq -c "select(.ts_epoch >= $cutoff_epoch)" "$EVENTS_FILE" 2>/dev/null || true)

  if [[ -z "$period_events" ]]; then
    daemon_log INFO "No recent events for optimization"
    return
  fi

  local total_completed successes failures
  total_completed=$(echo "$period_events" | jq -s '[.[] | select(.type == "pipeline.completed")] | length')
  successes=$(echo "$period_events" | jq -s '[.[] | select(.type == "pipeline.completed" and .result == "success")] | length')
  failures=$(echo "$period_events" | jq -s '[.[] | select(.type == "pipeline.completed" and .result == "failure")] | length')

  # Change Failure Rate (integer percent, rounded by awk)
  local cfr=0
  if [[ "$total_completed" -gt 0 ]]; then
    cfr=$(echo "$failures $total_completed" | awk '{printf "%.0f", ($1 / $2) * 100}')
  fi

  # Cycle time (median, in seconds) across successful completions
  local cycle_time_median
  cycle_time_median=$(echo "$period_events" | \
    jq -s '[.[] | select(.type == "pipeline.completed" and .result == "success") | .duration_s // 0] | sort | if length > 0 then .[length/2 | floor] else 0 end')

  # Deploy frequency (per week) — window is already 7 days, so the count IS
  # the weekly rate; the awk call only normalizes the format to one decimal.
  local deploy_freq
  deploy_freq=$(echo "$successes" | awk '{printf "%.1f", $1 / 1}') # Already 7 days

  # MTTR: for each failure, time until the next success (mean, seconds)
  local mttr
  mttr=$(echo "$period_events" | \
    jq -s '
      [.[] | select(.type == "pipeline.completed")] | sort_by(.ts_epoch // 0) |
      [range(length) as $i |
        if .[$i].result == "failure" then
          [.[$i+1:][] | select(.result == "success")][0] as $next |
          if $next and $next.ts_epoch and .[$i].ts_epoch then
            ($next.ts_epoch - .[$i].ts_epoch)
          else null end
        else null end
      ] | map(select(. != null)) |
      if length > 0 then (add / length | floor) else 0 end
    ')

  local adjustments=()

  # ── CFR > 20%: enable compound_quality, increase max_cycles ──
  # NOTE(review): the >20% branch only logs and records a recommendation —
  # no compound_quality flag is actually set here; confirm intended.
  if [[ "$cfr" -gt 40 ]]; then
    PIPELINE_TEMPLATE="full"
    adjustments+=("template→full (CFR ${cfr}% > 40%)")
    daemon_log WARN "Self-optimize: CFR ${cfr}% critical — switching to full template"
  elif [[ "$cfr" -gt 20 ]]; then
    adjustments+=("compound_quality enabled (CFR ${cfr}% > 20%)")
    daemon_log WARN "Self-optimize: CFR ${cfr}% elevated — enabling compound quality"
  fi

  # ── Lead time > 4hrs: increase max_parallel, reduce poll_interval ──
  if [[ "$cycle_time_median" -gt 14400 ]]; then
    MAX_PARALLEL=$((MAX_PARALLEL + 1))
    if [[ "$POLL_INTERVAL" -gt 30 ]]; then
      POLL_INTERVAL=$((POLL_INTERVAL / 2))
    fi
    adjustments+=("max_parallel→${MAX_PARALLEL}, poll_interval→${POLL_INTERVAL}s (lead time > 4hrs)")
    daemon_log WARN "Self-optimize: lead time $(format_duration "$cycle_time_median") — increasing parallelism"
  elif [[ "$cycle_time_median" -gt 7200 ]]; then
    # ── Lead time > 2hrs: enable auto_template for fast-pathing ──
    AUTO_TEMPLATE="true"
    adjustments+=("auto_template enabled (lead time > 2hrs)")
    daemon_log INFO "Self-optimize: lead time $(format_duration "$cycle_time_median") — enabling adaptive templates"
  fi

  # ── Deploy freq < 1/day (< 7/week): enable merge stage ──
  # NOTE(review): requires `bc`; when bc is missing the fallback echoes 0 and
  # this recommendation is silently skipped — confirm acceptable.
  if [[ "$(echo "$deploy_freq < 7" | bc -l 2>/dev/null || echo 0)" == "1" ]]; then
    adjustments+=("merge stage recommended (deploy freq ${deploy_freq}/week)")
    daemon_log INFO "Self-optimize: low deploy frequency — consider enabling merge stage"
  fi

  # ── MTTR > 2hrs: enable auto_rollback ──
  if [[ "$mttr" -gt 7200 ]]; then
    adjustments+=("auto_rollback recommended (MTTR $(format_duration "$mttr"))")
    daemon_log WARN "Self-optimize: high MTTR $(format_duration "$mttr") — consider enabling auto-rollback"
  fi

  # Write adjustments to state and persist to config
  if [[ ${#adjustments[@]} -gt 0 ]]; then
    local adj_str
    adj_str=$(printf '%s; ' "${adjustments[@]}")

    locked_state_update \
      --arg adj "$adj_str" \
      --arg ts "$(now_iso)" \
      '.last_optimization = {timestamp: $ts, adjustments: $adj}'

    # ── Persist adjustments to daemon-config.json (survives restart) ──
    local config_file="${CONFIG_PATH:-.claude/daemon-config.json}"
    if [[ -f "$config_file" ]]; then
      local tmp_config
      tmp_config=$(jq \
        --argjson max_parallel "$MAX_PARALLEL" \
        --argjson poll_interval "$POLL_INTERVAL" \
        --arg template "$PIPELINE_TEMPLATE" \
        --arg auto_template "${AUTO_TEMPLATE:-false}" \
        --arg ts "$(now_iso)" \
        --arg adj "$adj_str" \
        '.max_parallel = $max_parallel |
         .poll_interval = $poll_interval |
         .pipeline_template = $template |
         .auto_template = ($auto_template == "true") |
         .last_optimization = {timestamp: $ts, adjustments: $adj}' \
        "$config_file")
      # Atomic write: tmp file + mv
      local tmp_cfg_file="${config_file}.tmp.$$"
      echo "$tmp_config" > "$tmp_cfg_file"
      mv "$tmp_cfg_file" "$config_file"
      daemon_log INFO "Self-optimize: persisted adjustments to ${config_file}"
    fi

    emit_event "daemon.optimize" "adjustments=${adj_str}" "cfr=$cfr" "cycle_time=$cycle_time_median" "deploy_freq=$deploy_freq" "mttr=$mttr"
    daemon_log SUCCESS "Self-optimization applied ${#adjustments[@]} adjustment(s)"
  else
    daemon_log INFO "Self-optimization: all metrics within thresholds"
  fi
}
|
|
953
|
+
|
|
954
|
+
# ─── Stale State Reaper ──────────────────────────────────────────────────────
|
|
955
|
+
# Cleans old worktrees, pipeline artifacts, and completed state entries.
|
|
956
|
+
# Called every N poll cycles (configurable via stale_reaper_interval).
|
|
957
|
+
|
|
958
|
+
# Stale state reaper: removes aged daemon worktrees, expired checkpoints,
# old pipeline artifacts, orphaned daemon/pipeline branches, and stale
# entries in the daemon state file. Runs unless STALE_REAPER_ENABLED=false.
#
# Globals read:  STALE_REAPER_ENABLED, STALE_REAPER_AGE_DAYS (default 7),
#                SCRIPT_DIR, STATE_FILE, NO_GITHUB
# Side effects:  deletes worktrees/branches/directories, rewrites state via
#                locked_state_update, emits a daemon.cleanup event.
#
# Fix: `git branch --list` prefixes every name with two spaces ("  name") or
# "* " for the current branch. The previous `${branch## }` removed at most
# ONE leading space, so the daemon/issue-* and pipeline/* glob checks never
# matched and orphaned branches were never cleaned. Branch names are now
# stripped of the current-branch marker and all whitespace (git branch names
# cannot contain whitespace).
daemon_cleanup_stale() {
  if [[ "${STALE_REAPER_ENABLED:-true}" != "true" ]]; then
    return
  fi

  daemon_log INFO "Running stale state reaper"
  local cleaned=0
  local age_days="${STALE_REAPER_AGE_DAYS:-7}"
  local age_secs=$((age_days * 86400))
  local now_e
  now_e=$(now_epoch)

  # ── 1. Clean old git worktrees ──
  if command -v git &>/dev/null; then
    while IFS= read -r line; do
      local wt_path
      wt_path=$(echo "$line" | awk '{print $1}')
      # Only clean daemon-created worktrees
      [[ "$wt_path" == *"daemon-issue-"* ]] || continue
      # Check worktree age via directory mtime (BSD stat first, then GNU)
      local mtime
      mtime=$(stat -f '%m' "$wt_path" 2>/dev/null || stat -c '%Y' "$wt_path" 2>/dev/null || echo "0")
      if [[ $((now_e - mtime)) -gt $age_secs ]]; then
        daemon_log INFO "Removing stale worktree: ${wt_path}"
        git worktree remove "$wt_path" --force 2>/dev/null || true
        cleaned=$((cleaned + 1))
      fi
    done < <(git worktree list --porcelain 2>/dev/null | grep '^worktree ' | sed 's/^worktree //')
  fi

  # ── 2. Expire old checkpoints ──
  if [[ -x "$SCRIPT_DIR/sw-checkpoint.sh" ]]; then
    local expired_output
    expired_output=$(bash "$SCRIPT_DIR/sw-checkpoint.sh" expire --hours "$((age_days * 24))" 2>/dev/null || true)
    if [[ -n "$expired_output" ]] && echo "$expired_output" | grep -q "Expired"; then
      local expired_count
      expired_count=$(echo "$expired_output" | grep -c "Expired" || true)
      cleaned=$((cleaned + ${expired_count:-0}))
      daemon_log INFO "Expired ${expired_count:-0} old checkpoint(s)"
    fi
  fi

  # ── 3. Clean old pipeline artifacts (subdirectories only) ──
  local artifacts_dir=".claude/pipeline-artifacts"
  if [[ -d "$artifacts_dir" ]]; then
    while IFS= read -r artifact_dir; do
      [[ -d "$artifact_dir" ]] || continue
      local mtime
      mtime=$(stat -f '%m' "$artifact_dir" 2>/dev/null || stat -c '%Y' "$artifact_dir" 2>/dev/null || echo "0")
      if [[ $((now_e - mtime)) -gt $age_secs ]]; then
        daemon_log INFO "Removing stale artifact: ${artifact_dir}"
        rm -rf "$artifact_dir"
        cleaned=$((cleaned + 1))
      fi
    done < <(find "$artifacts_dir" -mindepth 1 -maxdepth 1 -type d 2>/dev/null)
  fi

  # ── 4. Clean orphaned daemon/* branches (no matching worktree or active job) ──
  if command -v git &>/dev/null; then
    while IFS= read -r branch; do
      [[ -z "$branch" ]] && continue
      # Strip the current-branch marker and ALL whitespace from the
      # `git branch --list` output (see function header for the bug this fixes).
      branch="${branch#\*}"
      branch="${branch//[[:space:]]/}"
      # Only clean daemon-created branches
      [[ "$branch" == daemon/issue-* ]] || continue
      # Extract issue number
      local branch_issue_num="${branch#daemon/issue-}"
      # Skip if there's an active job for this issue
      if daemon_is_inflight "$branch_issue_num" 2>/dev/null; then
        continue
      fi
      daemon_log INFO "Removing orphaned branch: ${branch}"
      git branch -D "$branch" 2>/dev/null || true
      cleaned=$((cleaned + 1))
    done < <(git branch --list 'daemon/issue-*' 2>/dev/null)
  fi

  # ── 5. Prune completed/failed state entries older than age_days ──
  if [[ -f "$STATE_FILE" ]]; then
    local cutoff_iso
    cutoff_iso=$(epoch_to_iso $((now_e - age_secs)))
    local before_count
    before_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
    locked_state_update --arg cutoff "$cutoff_iso" \
      '.completed = [.completed[] | select(.completed_at > $cutoff)]' 2>/dev/null || true
    local after_count
    after_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
    local pruned=$((before_count - after_count))
    if [[ "$pruned" -gt 0 ]]; then
      daemon_log INFO "Pruned ${pruned} old completed state entries"
      cleaned=$((cleaned + pruned))
    fi
  fi

  # ── 6. Prune stale retry_counts (issues no longer in flight or queued) ──
  if [[ -f "$STATE_FILE" ]]; then
    local retry_keys
    retry_keys=$(jq -r '.retry_counts // {} | keys[]' "$STATE_FILE" 2>/dev/null || true)
    local stale_keys=()
    while IFS= read -r key; do
      [[ -z "$key" ]] && continue
      if ! daemon_is_inflight "$key" 2>/dev/null; then
        stale_keys+=("$key")
      fi
    done <<< "$retry_keys"
    if [[ ${#stale_keys[@]} -gt 0 ]]; then
      for sk in "${stale_keys[@]}"; do
        locked_state_update --arg k "$sk" 'del(.retry_counts[$k])' 2>/dev/null || continue
      done
      daemon_log INFO "Pruned ${#stale_keys[@]} stale retry count(s)"
      cleaned=$((cleaned + ${#stale_keys[@]}))
    fi
  fi

  # ── 7. Detect stale pipeline-state.md stuck in "running" ──
  local pipeline_state=".claude/pipeline-state.md"
  if [[ -f "$pipeline_state" ]]; then
    local ps_status=""
    ps_status=$(sed -n 's/^status: *//p' "$pipeline_state" 2>/dev/null | head -1 | tr -d ' ')
    if [[ "$ps_status" == "running" ]]; then
      local ps_mtime
      ps_mtime=$(stat -f '%m' "$pipeline_state" 2>/dev/null || stat -c '%Y' "$pipeline_state" 2>/dev/null || echo "0")
      local ps_age=$((now_e - ps_mtime))
      # If pipeline-state.md has been "running" for more than 2 hours and no active job
      if [[ "$ps_age" -gt 7200 ]]; then
        local has_active=false
        if [[ -f "$STATE_FILE" ]]; then
          local active_count
          active_count=$(jq '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo "0")
          [[ "${active_count:-0}" -gt 0 ]] && has_active=true
        fi
        if [[ "$has_active" == "false" ]]; then
          daemon_log WARN "Stale pipeline-state.md stuck in 'running' for ${ps_age}s with no active jobs — marking failed"
          # Atomically update status to failed
          local tmp_ps="${pipeline_state}.tmp.$$"
          sed 's/^status: *running/status: failed (stale — cleaned by daemon)/' "$pipeline_state" > "$tmp_ps" 2>/dev/null && mv "$tmp_ps" "$pipeline_state" || rm -f "$tmp_ps"
          emit_event "daemon.stale_pipeline_state" "age_s=$ps_age"
          cleaned=$((cleaned + 1))
        fi
      fi
    fi
  fi

  # ── 8. Clean local + remote pipeline/* branches with no in-flight job ──
  # NOTE(review): this deletes ANY pipeline/* branch whose issue is not in
  # flight, merged or not; branches not matching pipeline/pipeline-issue-*
  # pass their raw name to daemon_is_inflight — confirm the naming scheme.
  if command -v git &>/dev/null && [[ "${NO_GITHUB:-}" != "true" ]]; then
    while IFS= read -r branch; do
      [[ -z "$branch" ]] && continue
      # Same trimming fix as section 4.
      branch="${branch#\*}"
      branch="${branch//[[:space:]]/}"
      [[ "$branch" == pipeline/* ]] || continue
      local br_issue="${branch#pipeline/pipeline-issue-}"
      if ! daemon_is_inflight "$br_issue" 2>/dev/null; then
        daemon_log INFO "Removing orphaned pipeline branch: ${branch}"
        git branch -D "$branch" 2>/dev/null || true
        git push origin --delete "$branch" 2>/dev/null || true
        cleaned=$((cleaned + 1))
      fi
    done < <(git branch --list 'pipeline/*' 2>/dev/null)
  fi

  if [[ "$cleaned" -gt 0 ]]; then
    emit_event "daemon.cleanup" "cleaned=$cleaned" "age_days=$age_days"
    daemon_log SUCCESS "Stale reaper cleaned ${cleaned} item(s)"
  else
    daemon_log INFO "Stale reaper: nothing to clean"
  fi
}
|
|
1123
|
+
|
|
1124
|
+
# ─── Poll Loop ───────────────────────────────────────────────────────────────
|
|
1125
|
+
|
|
1126
|
+
# Global poll-cycle counter: incremented once per iteration of daemon_poll_loop
# and used with modulo checks there to schedule periodic maintenance tasks.
POLL_CYCLE_COUNT=0
|
|
1127
|
+
|
|
1128
|
+
# ───────────────────────────────────────────────────────────────────────────
# daemon_poll_loop — main daemon loop; runs until the SHUTDOWN_FLAG file appears.
#
# Each cycle: auth preflight → poll issues → reap finished jobs → health check,
# then periodic maintenance on modulo schedules (fleet reload, degradation
# check, auto-scale, self-optimize, stale reaper, log rotation), an idle-time
# patrol, and an adaptive sleep broken into 1s slices so shutdown is noticed
# promptly.
#
# NOTE(review): setting AUTO_SCALE_INTERVAL / OPTIMIZE_INTERVAL /
# STALE_REAPER_INTERVAL to 0 would make the modulo expression a division by
# zero — confirm config validation rejects 0.
daemon_poll_loop() {
  daemon_log INFO "Entering poll loop (interval: ${POLL_INTERVAL}s, max_parallel: ${MAX_PARALLEL})"
  daemon_log INFO "Watching for label: ${CYAN}${WATCH_LABEL}${RESET}"

  while [[ ! -f "$SHUTDOWN_FLAG" ]]; do
    # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
    # The || operator disables set -e for the entire call chain, so transient failures
    # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
    daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
    daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
    daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
    daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"

    # Increment cycle counter (must be before all modulo checks)
    POLL_CYCLE_COUNT=$((POLL_CYCLE_COUNT + 1))

    # Fleet config reload every 3 cycles
    if [[ $((POLL_CYCLE_COUNT % 3)) -eq 0 ]]; then
      daemon_reload_config || daemon_log WARN "daemon_reload_config failed — continuing"
    fi

    # Check degradation every 5 poll cycles
    if [[ $((POLL_CYCLE_COUNT % 5)) -eq 0 ]]; then
      daemon_check_degradation || daemon_log WARN "daemon_check_degradation failed — continuing"
    fi

    # Auto-scale every N cycles (default: 5)
    if [[ $((POLL_CYCLE_COUNT % ${AUTO_SCALE_INTERVAL:-5})) -eq 0 ]]; then
      daemon_auto_scale || daemon_log WARN "daemon_auto_scale failed — continuing"
    fi

    # Self-optimize every N cycles (default: 10)
    if [[ $((POLL_CYCLE_COUNT % ${OPTIMIZE_INTERVAL:-10})) -eq 0 ]]; then
      daemon_self_optimize || daemon_log WARN "daemon_self_optimize failed — continuing"
    fi

    # Stale state reaper every N cycles (default: 10)
    if [[ $((POLL_CYCLE_COUNT % ${STALE_REAPER_INTERVAL:-10})) -eq 0 ]]; then
      daemon_cleanup_stale || daemon_log WARN "daemon_cleanup_stale failed — continuing"
    fi

    # Rotate event log every 10 cycles (~10 min with 60s interval)
    if [[ $((POLL_CYCLE_COUNT % 10)) -eq 0 ]]; then
      rotate_event_log || true
    fi

    # Proactive patrol during quiet periods (with adaptive limits):
    # only when nothing is queued AND nothing is active, rate-limited by
    # PATROL_INTERVAL via LAST_PATROL_EPOCH.
    local issue_count_now active_count_now
    issue_count_now=$(jq -r '.queued | length' "$STATE_FILE" 2>/dev/null || echo 0)
    active_count_now=$(get_active_count || echo 0)
    if [[ "$issue_count_now" -eq 0 ]] && [[ "$active_count_now" -eq 0 ]]; then
      local now_e
      now_e=$(now_epoch || date +%s)
      if [[ $((now_e - LAST_PATROL_EPOCH)) -ge "$PATROL_INTERVAL" ]]; then
        load_adaptive_patrol_limits || true
        daemon_log INFO "No active work — running patrol"
        daemon_patrol --once || daemon_log WARN "daemon_patrol failed — continuing"
        LAST_PATROL_EPOCH=$now_e
      fi
    fi

    # ── Adaptive poll interval: adjust sleep based on queue state ──
    local effective_interval
    effective_interval=$(get_adaptive_poll_interval "$issue_count_now" "$active_count_now" || echo "${POLL_INTERVAL:-30}")

    # Sleep in 1s intervals so we can catch shutdown quickly
    local i=0
    while [[ $i -lt $effective_interval ]] && [[ ! -f "$SHUTDOWN_FLAG" ]]; do
      sleep 1 || true # Guard against signal interruption under set -e
      i=$((i + 1))
    done
  done

  daemon_log INFO "Shutdown flag detected — exiting poll loop"
}
|