shipwright-cli 2.2.1 → 2.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/README.md +12 -13
  2. package/docs/AGI-PLATFORM-PLAN.md +5 -5
  3. package/docs/AGI-WHATS-NEXT.md +11 -8
  4. package/docs/README.md +2 -0
  5. package/package.json +1 -1
  6. package/scripts/check-version-consistency.sh +72 -0
  7. package/scripts/lib/daemon-adaptive.sh +610 -0
  8. package/scripts/lib/daemon-dispatch.sh +489 -0
  9. package/scripts/lib/daemon-failure.sh +387 -0
  10. package/scripts/lib/daemon-patrol.sh +1113 -0
  11. package/scripts/lib/daemon-poll.sh +1202 -0
  12. package/scripts/lib/daemon-state.sh +550 -0
  13. package/scripts/lib/daemon-triage.sh +490 -0
  14. package/scripts/lib/helpers.sh +51 -0
  15. package/scripts/lib/pipeline-intelligence.sh +0 -6
  16. package/scripts/lib/pipeline-quality-checks.sh +2 -0
  17. package/scripts/lib/pipeline-stages.sh +20 -0
  18. package/scripts/sw +26 -4
  19. package/scripts/sw-activity.sh +1 -1
  20. package/scripts/sw-adaptive.sh +2 -2
  21. package/scripts/sw-adversarial.sh +1 -1
  22. package/scripts/sw-architecture-enforcer.sh +1 -1
  23. package/scripts/sw-auth.sh +1 -1
  24. package/scripts/sw-autonomous.sh +1 -1
  25. package/scripts/sw-changelog.sh +1 -1
  26. package/scripts/sw-checkpoint.sh +1 -1
  27. package/scripts/sw-ci.sh +1 -1
  28. package/scripts/sw-cleanup.sh +1 -1
  29. package/scripts/sw-code-review.sh +1 -1
  30. package/scripts/sw-connect.sh +1 -1
  31. package/scripts/sw-context.sh +1 -1
  32. package/scripts/sw-cost.sh +1 -1
  33. package/scripts/sw-daemon.sh +52 -4816
  34. package/scripts/sw-dashboard.sh +1 -1
  35. package/scripts/sw-db.sh +1 -1
  36. package/scripts/sw-decompose.sh +1 -1
  37. package/scripts/sw-deps.sh +1 -1
  38. package/scripts/sw-developer-simulation.sh +1 -1
  39. package/scripts/sw-discovery.sh +1 -1
  40. package/scripts/sw-doc-fleet.sh +1 -1
  41. package/scripts/sw-docs-agent.sh +1 -1
  42. package/scripts/sw-docs.sh +1 -1
  43. package/scripts/sw-doctor.sh +42 -1
  44. package/scripts/sw-dora.sh +1 -1
  45. package/scripts/sw-durable.sh +1 -1
  46. package/scripts/sw-e2e-orchestrator.sh +1 -1
  47. package/scripts/sw-eventbus.sh +1 -1
  48. package/scripts/sw-feedback.sh +1 -1
  49. package/scripts/sw-fix.sh +1 -1
  50. package/scripts/sw-fleet-discover.sh +1 -1
  51. package/scripts/sw-fleet-viz.sh +3 -3
  52. package/scripts/sw-fleet.sh +1 -1
  53. package/scripts/sw-github-app.sh +1 -1
  54. package/scripts/sw-github-checks.sh +1 -1
  55. package/scripts/sw-github-deploy.sh +1 -1
  56. package/scripts/sw-github-graphql.sh +1 -1
  57. package/scripts/sw-guild.sh +1 -1
  58. package/scripts/sw-heartbeat.sh +1 -1
  59. package/scripts/sw-hygiene.sh +1 -1
  60. package/scripts/sw-incident.sh +1 -1
  61. package/scripts/sw-init.sh +1 -1
  62. package/scripts/sw-instrument.sh +1 -1
  63. package/scripts/sw-intelligence.sh +1 -1
  64. package/scripts/sw-jira.sh +1 -1
  65. package/scripts/sw-launchd.sh +1 -1
  66. package/scripts/sw-linear.sh +1 -1
  67. package/scripts/sw-logs.sh +1 -1
  68. package/scripts/sw-loop.sh +1 -1
  69. package/scripts/sw-memory.sh +1 -1
  70. package/scripts/sw-mission-control.sh +1 -1
  71. package/scripts/sw-model-router.sh +1 -1
  72. package/scripts/sw-otel.sh +4 -4
  73. package/scripts/sw-oversight.sh +1 -1
  74. package/scripts/sw-pipeline-composer.sh +1 -1
  75. package/scripts/sw-pipeline-vitals.sh +1 -1
  76. package/scripts/sw-pipeline.sh +16 -56
  77. package/scripts/sw-pipeline.sh.mock +7 -0
  78. package/scripts/sw-pm.sh +1 -1
  79. package/scripts/sw-pr-lifecycle.sh +1 -1
  80. package/scripts/sw-predictive.sh +1 -1
  81. package/scripts/sw-prep.sh +1 -1
  82. package/scripts/sw-ps.sh +1 -1
  83. package/scripts/sw-public-dashboard.sh +1 -1
  84. package/scripts/sw-quality.sh +1 -1
  85. package/scripts/sw-reaper.sh +1 -1
  86. package/scripts/sw-recruit.sh +9 -1
  87. package/scripts/sw-regression.sh +1 -1
  88. package/scripts/sw-release-manager.sh +1 -1
  89. package/scripts/sw-release.sh +1 -1
  90. package/scripts/sw-remote.sh +1 -1
  91. package/scripts/sw-replay.sh +1 -1
  92. package/scripts/sw-retro.sh +1 -1
  93. package/scripts/sw-scale.sh +8 -5
  94. package/scripts/sw-security-audit.sh +1 -1
  95. package/scripts/sw-self-optimize.sh +158 -7
  96. package/scripts/sw-session.sh +1 -1
  97. package/scripts/sw-setup.sh +1 -1
  98. package/scripts/sw-standup.sh +3 -3
  99. package/scripts/sw-status.sh +1 -1
  100. package/scripts/sw-strategic.sh +1 -1
  101. package/scripts/sw-stream.sh +8 -2
  102. package/scripts/sw-swarm.sh +7 -10
  103. package/scripts/sw-team-stages.sh +1 -1
  104. package/scripts/sw-templates.sh +1 -1
  105. package/scripts/sw-testgen.sh +1 -1
  106. package/scripts/sw-tmux-pipeline.sh +1 -1
  107. package/scripts/sw-tmux.sh +1 -1
  108. package/scripts/sw-trace.sh +1 -1
  109. package/scripts/sw-tracker.sh +24 -6
  110. package/scripts/sw-triage.sh +1 -1
  111. package/scripts/sw-upgrade.sh +1 -1
  112. package/scripts/sw-ux.sh +1 -1
  113. package/scripts/sw-webhook.sh +1 -1
  114. package/scripts/sw-widgets.sh +1 -1
  115. package/scripts/sw-worktree.sh +1 -1
@@ -0,0 +1,1202 @@
1
+ # daemon-poll.sh — Poll loop, health, scale, cleanup (for sw-daemon.sh)
2
+ # Source from sw-daemon.sh. Requires daemon-health, state, dispatch, failure, patrol.
3
+ [[ -n "${_DAEMON_POLL_LOADED:-}" ]] && return 0
4
+ _DAEMON_POLL_LOADED=1
5
+
6
# _daemon_poll_backoff CONTEXT — shared GitHub-API failure path for both poll
# modes: grow BACKOFF_SECS (30s, then ×2, capped at 300s), log with the given
# context suffix (e.g. " (org search)"), record the failure for the circuit
# breaker, and sleep out the backoff.
_daemon_poll_backoff() {
  local context="${1:-}"
  if [[ $BACKOFF_SECS -eq 0 ]]; then
    BACKOFF_SECS=30
  elif [[ $BACKOFF_SECS -lt 300 ]]; then
    BACKOFF_SECS=$((BACKOFF_SECS * 2))
    [[ $BACKOFF_SECS -gt 300 ]] && BACKOFF_SECS=300
  fi
  daemon_log WARN "GitHub API error${context} — backing off ${BACKOFF_SECS}s"
  gh_record_failure
  sleep "$BACKOFF_SECS"
}

# daemon_poll_issues — one poll cycle: fetch open issues carrying WATCH_LABEL
# (single repo, or org-wide when WATCH_MODE=org), triage-score them, order by
# PRIORITY_STRATEGY (optionally reordered by a "depends on #X" graph), then
# spawn pipelines up to MAX_PARALLEL, enqueueing the overflow and draining the
# queue while capacity remains.
#
# Globals (read):  NO_GITHUB, PAUSE_FLAG, WATCH_MODE, ORG, WATCH_LABEL,
#                  REPO_FILTER, GH_RETRY_ENABLED, PRIORITY_STRATEGY,
#                  ADAPTIVE_THRESHOLDS_ENABLED, PRIORITY_LANE,
#                  PRIORITY_LANE_MAX, MAX_PARALLEL, SCRIPT_DIR,
#                  PIPELINE_TEMPLATE, SPAWN_STAGGER_SECONDS, STATE_FILE
# Globals (write): BACKOFF_SECS (exponential-backoff state)
# Requires (defined in sibling daemon-* libs — TODO confirm on integration):
#   daemon_log, emit_event, gh_rate_limited, gh_record_failure,
#   gh_record_success, epoch_to_iso, triage_score_issue,
#   extract_issue_dependencies, locked_get_active_count, get_active_count,
#   locked_state_update, daemon_is_inflight, claim_issue,
#   get_priority_active_count, is_priority_issue, select_pipeline_template,
#   daemon_spawn_pipeline, track_priority_job, enqueue_issue, dequeue_next,
#   update_state_field, now_iso, gh_retry.
daemon_poll_issues() {
  if [[ "$NO_GITHUB" == "true" ]]; then
    daemon_log INFO "Polling skipped (--no-github)"
    return
  fi

  # Pause flag (set by dashboard, disk_low, or consecutive-failure backoff).
  # A flag carrying resume_after auto-clears once that timestamp has passed.
  local pause_file="${PAUSE_FLAG:-$HOME/.shipwright/daemon-pause.flag}"
  if [[ -f "$pause_file" ]]; then
    local resume_after
    resume_after=$(jq -r '.resume_after // empty' "$pause_file" 2>/dev/null || true)
    if [[ -n "$resume_after" ]]; then
      local now_epoch resume_epoch
      now_epoch=$(date +%s)
      # BSD date first (macOS), then GNU date; 0 means "unparseable".
      resume_epoch=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$resume_after" +%s 2>/dev/null || \
        date -d "$resume_after" +%s 2>/dev/null || echo 0)
      if [[ "$resume_epoch" -gt 0 ]] && [[ "$now_epoch" -ge "$resume_epoch" ]]; then
        rm -f "$pause_file"
        daemon_log INFO "Auto-resuming after backoff (resume_after passed)"
      else
        daemon_log INFO "Daemon paused until ${resume_after} — skipping poll"
        return
      fi
    else
      daemon_log INFO "Daemon paused — skipping poll"
      return
    fi
  fi

  # Circuit breaker: skip the poll entirely while inside a rate-limit backoff window.
  if gh_rate_limited; then
    daemon_log INFO "Polling skipped (rate-limit backoff until $(epoch_to_iso "$GH_BACKOFF_UNTIL"))"
    return
  fi

  local issues_json

  # gh_retry wraps gh with retries for these critical poll calls when enabled.
  local gh_cmd="gh"
  if [[ "${GH_RETRY_ENABLED:-true}" == "true" ]]; then
    gh_cmd="gh_retry gh"
  fi

  if [[ "$WATCH_MODE" == "org" && -n "$ORG" ]]; then
    # Org-wide mode: search issues across all org repos.
    issues_json=$($gh_cmd search issues \
      --label "$WATCH_LABEL" \
      --owner "$ORG" \
      --state open \
      --json repository,number,title,labels,body,createdAt \
      --limit 20 2>/dev/null) || {
      _daemon_poll_backoff " (org search)"
      return
    }

    # Optional repo_filter regex narrows the org result set.
    if [[ -n "$REPO_FILTER" ]]; then
      issues_json=$(echo "$issues_json" | jq -c --arg filter "$REPO_FILTER" \
        '[.[] | select(.repository.nameWithOwner | test($filter))]')
    fi
  else
    # Standard single-repo mode.
    issues_json=$($gh_cmd issue list \
      --label "$WATCH_LABEL" \
      --state open \
      --json number,title,labels,body,createdAt \
      --limit 20 2>/dev/null) || {
      _daemon_poll_backoff ""
      return
    }
  fi

  # Reset backoff on success.
  BACKOFF_SECS=0
  gh_record_success

  local issue_count
  issue_count=$(echo "$issues_json" | jq 'length' 2>/dev/null || echo 0)
  if [[ "$issue_count" -eq 0 ]]; then
    return
  fi

  local mode_label="repo"
  [[ "$WATCH_MODE" == "org" ]] && mode_label="org:${ORG}"
  daemon_log INFO "Found ${issue_count} issue(s) with label '${WATCH_LABEL}' (${mode_label})"
  emit_event "daemon.poll" "issues_found=$issue_count" "active=$(get_active_count)" "mode=$WATCH_MODE"

  # Score each issue via intelligent triage; collect "score|number|repo" records.
  local scored_issues=()
  local dep_graph=""   # "issue:dep1,dep2" entries for dependency ordering
  while IFS= read -r issue; do
    local num score
    num=$(echo "$issue" | jq -r '.number')
    # triage_score_issue may print diagnostics; keep only the last line, then
    # strip everything non-numeric. Default to a neutral 50 when unusable.
    score=$(triage_score_issue "$issue" 2>/dev/null | tail -1)
    score=$(printf '%s' "$score" | tr -cd '[:digit:]')
    [[ -z "$score" ]] && score=50
    # For org mode, include the repo name in the scored entry.
    local repo_name=""
    if [[ "$WATCH_MODE" == "org" ]]; then
      repo_name=$(echo "$issue" | jq -r '.repository.nameWithOwner // ""')
    fi
    scored_issues+=("${score}|${num}|${repo_name}")

    # Issue dependency detection (adaptive: "depends on #X", "blocked by #X").
    if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
      local issue_text deps
      issue_text=$(echo "$issue" | jq -r '(.title // "") + " " + (.body // "")')
      deps=$(extract_issue_dependencies "$issue_text")
      if [[ -n "$deps" ]]; then
        local dep_nums
        dep_nums=$(echo "$deps" | tr -d '#' | tr '\n' ',' | sed 's/,$//')
        dep_graph="${dep_graph}${num}:${dep_nums}\n"
        daemon_log INFO "Issue #${num} depends on: ${deps//$'\n'/, }"
      fi
    fi
  done < <(echo "$issues_json" | jq -c '.[]')

  # Sort by score — strategy determines ascending vs descending.
  local sorted_order
  if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
    # Complex-first: lower score (more complex) first.
    sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -n -k2,2 -n)
  else
    # Quick-wins-first (default): higher score (simpler) first, lowest issue# on ties.
    sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1,1 -rn -k2,2 -n)
  fi

  # Dependency-aware reordering: move dependencies before dependents.
  if [[ -n "$dep_graph" && "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
    local reordered="" scheduled=""
    # Multiple passes resolve transitive dependencies (max 3).
    local pass=0
    while [[ $pass -lt 3 ]]; do
      local changed=false
      local new_order=""
      while IFS='|' read -r s_score s_num s_repo; do
        [[ -z "$s_num" ]] && continue
        # Does this issue have unscheduled dependencies?
        local issue_deps
        issue_deps=$(echo -e "$dep_graph" | grep "^${s_num}:" | head -1 | cut -d: -f2 || true)
        if [[ -n "$issue_deps" ]]; then
          # A dep blocks us only if it is in our issue set and not yet scheduled.
          local all_deps_ready=true
          local IFS_SAVE="$IFS"
          IFS=','
          for dep in $issue_deps; do
            dep="${dep## }"
            dep="${dep%% }"
            if echo "$sorted_order" | grep -q "|${dep}|" && ! echo "$scheduled" | grep -q "|${dep}|"; then
              all_deps_ready=false
              break
            fi
          done
          IFS="$IFS_SAVE"
          if [[ "$all_deps_ready" == "false" ]]; then
            # Defer this issue — append at end.
            new_order="${new_order}${s_score}|${s_num}|${s_repo}\n"
            changed=true
            continue
          fi
        fi
        reordered="${reordered}${s_score}|${s_num}|${s_repo}\n"
        scheduled="${scheduled}|${s_num}|"
      done <<< "$sorted_order"
      # Append deferred issues after everything that was ready this pass.
      reordered="${reordered}${new_order}"
      sorted_order=$(echo -e "$reordered" | grep -v '^$')
      reordered=""
      scheduled=""
      [[ "$changed" == "false" ]] && break
      pass=$((pass + 1))
    done
  fi

  local active_count
  active_count=$(locked_get_active_count)

  # Process each issue in triage order (process substitution keeps state in
  # the current shell).
  while IFS='|' read -r score issue_num repo_name; do
    [[ -z "$issue_num" ]] && continue

    local issue_title labels_csv
    issue_title=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | .title')
    labels_csv=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | [.labels[].name] | join(",")')

    # Cache title in state for dashboard visibility.
    if [[ -n "$issue_title" ]]; then
      locked_state_update --arg num "$issue_num" --arg title "$issue_title" \
        '.titles[$num] = $title'
    fi

    # Skip if already inflight.
    if daemon_is_inflight "$issue_num"; then
      continue
    fi

    # Distributed claim (skipped when no machines are registered).
    if [[ -f "$HOME/.shipwright/machines.json" ]]; then
      local machine_name
      machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
      if ! claim_issue "$issue_num" "$machine_name"; then
        daemon_log INFO "Issue #${issue_num} claimed by another machine — skipping"
        continue
      fi
    fi

    # Priority lane: bypass the capacity queue for critical issues.
    if [[ "$PRIORITY_LANE" == "true" ]]; then
      local priority_active
      priority_active=$(get_priority_active_count)
      if is_priority_issue "$labels_csv" && [[ "$priority_active" -lt "$PRIORITY_LANE_MAX" ]]; then
        daemon_log WARN "PRIORITY LANE: issue #${issue_num} bypassing queue (${labels_csv})"
        emit_event "daemon.priority_lane" "issue=$issue_num" "score=$score"

        local template
        template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
        # Strip ANSI color codes and any character unsafe in a template name.
        template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
        [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
        daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template} [PRIORITY]"

        local orig_template="$PIPELINE_TEMPLATE"
        PIPELINE_TEMPLATE="$template"
        daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
        PIPELINE_TEMPLATE="$orig_template"
        track_priority_job "$issue_num"
        continue
      fi
    fi

    # Check capacity; overflow is queued for the drain loop below.
    active_count=$(locked_get_active_count)
    if [[ "$active_count" -ge "$MAX_PARALLEL" ]]; then
      enqueue_issue "$issue_num"
      continue
    fi

    # Auto-select pipeline template: PM recommendation (if available), else
    # labels + triage score.
    # BUGFIX: must initialize to "" — a bare `local template` re-declared in a
    # later loop iteration keeps the previous iteration's value, so an empty PM
    # recommendation would silently reuse the wrong issue's template.
    local template=""
    if [[ "$NO_GITHUB" != "true" ]] && [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
      local pm_rec
      pm_rec=$(bash "$SCRIPT_DIR/sw-pm.sh" recommend --json "$issue_num" 2>/dev/null) || true
      if [[ -n "$pm_rec" ]]; then
        template=$(echo "$pm_rec" | jq -r '.team_composition.template // empty' 2>/dev/null) || true
        # Capability self-assessment: low confidence → upgrade to full template.
        local confidence
        confidence=$(echo "$pm_rec" | jq -r '.team_composition.confidence_percent // 100' 2>/dev/null) || true
        if [[ -n "$confidence" && "$confidence" != "null" && "$confidence" -lt 60 ]]; then
          daemon_log INFO "Low PM confidence (${confidence}%) — upgrading to full template"
          template="full"
        fi
      fi
    fi
    if [[ -z "$template" ]]; then
      template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
    fi
    # Strip ANSI color codes and any character unsafe in a template name.
    template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
    [[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
    daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"

    # Spawn pipeline (template selection applied via PIPELINE_TEMPLATE override).
    local orig_template="$PIPELINE_TEMPLATE"
    PIPELINE_TEMPLATE="$template"
    daemon_spawn_pipeline "$issue_num" "$issue_title" "$repo_name"
    PIPELINE_TEMPLATE="$orig_template"

    # Stagger delay between spawns to avoid API contention.
    local stagger_delay="${SPAWN_STAGGER_SECONDS:-15}"
    if [[ "$stagger_delay" -gt 0 ]]; then
      sleep "$stagger_delay"
    fi
  done <<< "$sorted_order"

  # Drain the queue while we have capacity (prevents deadlock when the queue
  # is populated but no active jobs exist to trigger a dequeue).
  local drain_active
  drain_active=$(locked_get_active_count)
  while [[ "$drain_active" -lt "$MAX_PARALLEL" ]]; do
    local drain_issue
    drain_issue=$(dequeue_next)
    [[ -z "$drain_issue" ]] && break
    local drain_title
    drain_title=$(jq -r --arg n "$drain_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)

    local drain_labels drain_score drain_template
    drain_labels=$(echo "$issues_json" | jq -r --argjson n "$drain_issue" \
      '.[] | select(.number == $n) | [.labels[].name] | join(",")' 2>/dev/null || echo "")
    drain_score=$(echo "$sorted_order" | grep "|${drain_issue}|" | cut -d'|' -f1 || echo "50")
    drain_template=$(select_pipeline_template "$drain_labels" "${drain_score:-50}" 2>/dev/null | tail -1)
    drain_template=$(printf '%s' "$drain_template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
    [[ -z "$drain_template" ]] && drain_template="$PIPELINE_TEMPLATE"

    daemon_log INFO "Draining queue: issue #${drain_issue}, template=${drain_template}"
    local orig_template="$PIPELINE_TEMPLATE"
    PIPELINE_TEMPLATE="$drain_template"
    daemon_spawn_pipeline "$drain_issue" "$drain_title"
    PIPELINE_TEMPLATE="$orig_template"
    drain_active=$(locked_get_active_count)
  done

  # Record poll completion timestamp.
  update_state_field "last_poll" "$(now_iso)"
}
336
+
337
+ # ─── Health Check ─────────────────────────────────────────────────────────────
338
+
339
# _daemon_child_cpu_pct PID — total %CPU (integer) consumed by PID's direct
# children, summed via ps; prints "0" when there are no children.
_daemon_child_cpu_pct() {
  local parent_pid="$1"
  pgrep -P "$parent_pid" 2>/dev/null \
    | xargs -I{} ps -o pcpu= -p {} 2>/dev/null \
    | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0"
}

# _daemon_write_nudge FILE STAGE COUNT — drop a gentle check-in note into the
# agent's worktree. Returns 0 if a new nudge was written, 1 if one already
# exists (caller logs/emits only on a fresh nudge).
# Kept as a top-level helper so the <<NUDGE_EOF terminator sits at column 0;
# nested inside deep control flow an indented terminator would never match
# and the here-doc would swallow the rest of the function.
_daemon_write_nudge() {
  local nudge_file="$1" cur_stage="$2" no_progress_count="$3"
  [[ -f "$nudge_file" ]] && return 1
  cat > "$nudge_file" <<NUDGE_EOF
# Nudge from Daemon Health Monitor

The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
Current stage: ${cur_stage}

If you're stuck, consider:
- Breaking the task into smaller steps
- Committing partial progress
- Running tests to validate current state

This is just a gentle check-in — take your time if you're working through a complex problem.
NUDGE_EOF
}

# daemon_health_check — one health sweep: assess every active job (progress
# sensing, nudges, last-resort kills), then host-level checks (disk space,
# events-file size). Emits a daemon.health event when anything was found.
#
# Globals (read): STATE_FILE, PROGRESS_HARD_LIMIT_S, PROGRESS_MONITORING,
#   NUDGE_ENABLED, NUDGE_AFTER_CHECKS, PROGRESS_DIR,
#   PROGRESS_CHECKS_BEFORE_KILL, PIPELINE_TEMPLATE, EVENTS_FILE
# Requires (defined in sibling daemon-* libs — TODO confirm on integration):
#   now_epoch, daemon_log, emit_event, daemon_collect_snapshot,
#   daemon_assess_progress, daemon_clear_progress, get_adaptive_stale_timeout.
daemon_health_check() {
  local findings=0
  local now_e
  now_e=$(now_epoch)

  if [[ -f "$STATE_FILE" ]]; then
    # ── Intelligent Health Monitoring ──
    # Instead of killing after a countdown, sense what the agent is doing.
    # Agents think for long stretches — that's normal and expected.
    # Strategy: sense → understand → be patient → nudge → only kill as last resort.
    local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
    local use_progress="${PROGRESS_MONITORING:-true}"
    local nudge_enabled="${NUDGE_ENABLED:-true}"
    local nudge_after="${NUDGE_AFTER_CHECKS:-40}"

    while IFS= read -r job; do
      local pid started_at issue_num worktree
      pid=$(echo "$job" | jq -r '.pid')
      started_at=$(echo "$job" | jq -r '.started_at // empty')
      issue_num=$(echo "$job" | jq -r '.issue')
      worktree=$(echo "$job" | jq -r '.worktree // ""')

      # Skip dead processes.
      if ! kill -0 "$pid" 2>/dev/null; then
        continue
      fi

      local elapsed=0
      if [[ -n "$started_at" ]]; then
        local start_e
        # BSD date first (macOS), then GNU date; 0 means "unparseable".
        start_e=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null || date -d "$started_at" +%s 2>/dev/null || echo "0")
        elapsed=$(( now_e - start_e ))
      fi

      # Hard wall-clock limit — disabled by default (0 = off).
      if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
        daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
        emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
        kill "$pid" 2>/dev/null || true
        daemon_clear_progress "$issue_num"
        findings=$((findings + 1))
        continue
      fi

      # ── Intelligent Progress Sensing ──
      if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
        local snapshot verdict
        snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')

        if [[ "$snapshot" != "{}" ]]; then
          verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")

          local no_progress_count=0
          no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
          local cur_stage
          cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')

          case "$verdict" in
            healthy)
              # All good — agent is making progress.
              ;;
            slowing)
              daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
              ;;
            stalled)
              # Check if the agent subprocess is alive and consuming CPU.
              local child_cpu agent_alive=false
              child_cpu=$(_daemon_child_cpu_pct "$pid")
              if [[ "${child_cpu:-0}" -gt 0 ]]; then
                agent_alive=true
              fi

              if [[ "$agent_alive" == "true" ]]; then
                daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
              else
                daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
                emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
              fi
              ;;
            stuck)
              local repeated_errors
              repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)

              # Even "stuck" — check if the process tree is alive first.
              local child_cpu agent_alive=false
              child_cpu=$(_daemon_child_cpu_pct "$pid")
              if [[ "${child_cpu:-0}" -gt 0 ]]; then
                agent_alive=true
              fi

              if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
                # Agent is alive — nudge instead of kill.
                if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
                  if _daemon_write_nudge "${worktree}/.claude/nudge.md" "$cur_stage" "$no_progress_count"; then
                    daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
                    emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
                  fi
                else
                  daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
                fi
              elif [[ "$repeated_errors" -ge 5 ]]; then
                # Truly stuck in an error loop — kill as last resort.
                daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
                kill "$pid" 2>/dev/null || true
                daemon_clear_progress "$issue_num"
                findings=$((findings + 1))
              elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
                # Process tree is dead AND no progress for a very long time.
                daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
                kill "$pid" 2>/dev/null || true
                daemon_clear_progress "$issue_num"
                findings=$((findings + 1))
              else
                daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
              fi
              ;;
          esac
        fi
      else
        # Fallback: legacy time-based detection when progress monitoring is off.
        local stale_timeout
        stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
        if [[ "$elapsed" -gt "$stale_timeout" ]]; then
          # Check if the process is still alive.
          if kill -0 "$pid" 2>/dev/null; then
            # Kill at 2x stale timeout — the process is truly hung.
            local kill_threshold=$(( stale_timeout * 2 ))
            if [[ "$elapsed" -gt "$kill_threshold" ]]; then
              daemon_log WARN "Killing stale job (legacy): issue #${issue_num} (${elapsed}s > ${kill_threshold}s kill threshold, PID $pid)"
              emit_event "daemon.stale_kill" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
              kill "$pid" 2>/dev/null || true
              sleep 2
              kill -9 "$pid" 2>/dev/null || true
            else
              daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — will kill at ${kill_threshold}s"
              emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
            fi
          else
            daemon_log WARN "Stale job with dead process: issue #${issue_num} (PID $pid no longer exists)"
            emit_event "daemon.stale_dead" "issue=$issue_num" "pid=$pid"
          fi
          findings=$((findings + 1))
        fi
      fi
    done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
  fi

  # Disk space warning in the current (repo) dir — warn below 1GB free.
  local free_kb
  free_kb=$(df -k "." 2>/dev/null | tail -1 | awk '{print $4}')
  if [[ -n "$free_kb" ]] && [[ "$free_kb" -lt 1048576 ]] 2>/dev/null; then
    daemon_log WARN "Low disk space: $(( free_kb / 1024 ))MB free"
    findings=$((findings + 1))
  fi

  # Critical disk space on ~/.shipwright (<500MB) — pause spawning.
  local sw_free_kb
  sw_free_kb=$(df -k "$HOME/.shipwright" 2>/dev/null | tail -1 | awk '{print $4}')
  if [[ -n "$sw_free_kb" ]] && [[ "$sw_free_kb" -lt 512000 ]] 2>/dev/null; then
    daemon_log WARN "Critical disk space on ~/.shipwright: $(( sw_free_kb / 1024 ))MB — pausing spawns"
    emit_event "daemon.disk_low" "free_mb=$(( sw_free_kb / 1024 ))"
    mkdir -p "$HOME/.shipwright"
    echo '{"paused":true,"reason":"disk_low"}' > "$HOME/.shipwright/daemon-pause.flag"
    findings=$((findings + 1))
  fi

  # Events file size warning.
  if [[ -f "$EVENTS_FILE" ]]; then
    local events_size
    events_size=$(wc -c < "$EVENTS_FILE" 2>/dev/null || echo 0)
    if [[ "$events_size" -gt 104857600 ]]; then # 100MB
      daemon_log WARN "Events file large ($(( events_size / 1048576 ))MB) — consider rotating"
      findings=$((findings + 1))
    fi
  fi

  if [[ "$findings" -gt 0 ]]; then
    emit_event "daemon.health" "findings=$findings"
  fi
}
538
+
539
+ # ─── Degradation Alerting ─────────────────────────────────────────────────────
540
+
541
# daemon_check_degradation — alert when the recent window of pipeline
# completions shows an elevated change-failure rate or a depressed success
# rate. Reads the last DEGRADATION_WINDOW (default 5) pipeline.completed
# events from the tail of EVENTS_FILE; does nothing until a full window
# exists. On breach: logs WARN, emits a daemon.alert event, and — when
# SLACK_WEBHOOK is set — sends a Slack notification via notify().
daemon_check_degradation() {
  [[ -f "$EVENTS_FILE" ]] || return 0

  local window="${DEGRADATION_WINDOW:-5}"
  local cfr_threshold="${DEGRADATION_CFR_THRESHOLD:-30}"
  local success_threshold="${DEGRADATION_SUCCESS_THRESHOLD:-50}"

  # Last N pipeline completions from the event log tail.
  local recent count
  recent=$(tail -200 "$EVENTS_FILE" | jq -s "[.[] | select(.type == \"pipeline.completed\")] | .[-${window}:]" 2>/dev/null)
  count=$(echo "$recent" | jq 'length' 2>/dev/null || echo 0)

  # Not enough data yet — wait for a full window.
  if [[ "$count" -lt "$window" ]]; then
    return 0
  fi

  local failures successes
  failures=$(echo "$recent" | jq '[.[] | select(.result == "failure")] | length')
  successes=$(echo "$recent" | jq '[.[] | select(.result == "success")] | length')
  local cfr_pct=0 success_pct=0
  if [[ "${count:-0}" -gt 0 ]]; then
    cfr_pct=$(( failures * 100 / count ))
    success_pct=$(( successes * 100 / count ))
  fi

  # Collect breach descriptions into one semicolon-joined alert string.
  local alerts=""
  if [[ "$cfr_pct" -gt "$cfr_threshold" ]]; then
    alerts="CFR ${cfr_pct}% exceeds threshold ${cfr_threshold}%"
    daemon_log WARN "DEGRADATION: $alerts"
  fi
  if [[ "$success_pct" -lt "$success_threshold" ]]; then
    local msg="Success rate ${success_pct}% below threshold ${success_threshold}%"
    if [[ -n "$alerts" ]]; then
      alerts="$alerts; $msg"
    else
      alerts="$msg"
    fi
    daemon_log WARN "DEGRADATION: $msg"
  fi

  if [[ -z "$alerts" ]]; then
    return 0
  fi

  emit_event "daemon.alert" "alerts=$alerts" "cfr_pct=$cfr_pct" "success_pct=$success_pct"

  # Slack notification.
  if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
    notify "Pipeline Degradation Alert" "$alerts" "warn"
  fi
}
585
+
586
+ # ─── Auto-Scaling ─────────────────────────────────────────────────────────
587
+ # Dynamically adjusts MAX_PARALLEL based on CPU, memory, budget, and queue depth
588
+
589
+ # daemon_auto_scale — recompute MAX_PARALLEL from the tightest of several independent
+ # ceilings: CPU cores (75% cap, further degraded by load average), available memory
+ # divided by per-worker memory (WORKER_MEM_GB, refreshed via learn_worker_memory),
+ # remaining budget divided by the adaptive per-job cost, queue depth + active jobs,
+ # pipeline vitals health, MAX_WORKERS, and an optional fleet-assigned ceiling
+ # (FLEET_MAX_PARALLEL). With ADAPTIVE_THRESHOLDS_ENABLED=true the value moves at
+ # most ±1 per call. Emits a "daemon.scale" event when the value changes.
+ # No-op unless AUTO_SCALE=true.
+ # NOTE(review): reads globals MAX_PARALLEL, MIN_WORKERS, MAX_WORKERS, WORKER_MEM_GB,
+ # PIPELINE_TEMPLATE, STATE_FILE, SCRIPT_DIR — assumed set by the sourcing daemon.
+ daemon_auto_scale() {
590
+ if [[ "${AUTO_SCALE:-false}" != "true" ]]; then
591
+ return
592
+ fi
593
+
594
+ local prev_max="$MAX_PARALLEL"
595
+
596
+ # ── Learn worker memory from actual RSS (adaptive) ──
597
+ learn_worker_memory
598
+
599
+ # ── Adaptive cost estimate per template ──
600
+ local effective_cost_per_job
601
+ effective_cost_per_job=$(get_adaptive_cost_estimate "$PIPELINE_TEMPLATE")
602
+
603
+ # ── CPU cores ──
604
+ local cpu_cores=2
605
+ if [[ "$(uname -s)" == "Darwin" ]]; then
606
+ cpu_cores=$(sysctl -n hw.ncpu 2>/dev/null || echo 2)
607
+ else
608
+ cpu_cores=$(nproc 2>/dev/null || echo 2)
609
+ fi
610
+ local max_by_cpu=$(( (cpu_cores * 3) / 4 )) # 75% utilization cap
611
+ [[ "$max_by_cpu" -lt 1 ]] && max_by_cpu=1
612
+
613
+ # ── Load average check — gradual scaling curve (replaces 90% cliff) ──
614
+ local load_avg
615
+ load_avg=$(uptime | awk -F'load averages?: ' '{print $2}' | awk -F'[, ]+' '{print $1}' 2>/dev/null || echo "0")
616
+ if [[ ! "$load_avg" =~ ^[0-9]+\.?[0-9]*$ ]]; then
617
+ load_avg="0"
618
+ fi
619
+ local load_ratio=0
620
+ if [[ "$cpu_cores" -gt 0 ]]; then
621
+ load_ratio=$(awk -v load="$load_avg" -v cores="$cpu_cores" 'BEGIN { printf "%.0f", (load / cores) * 100 }')
622
+ fi
623
+ # Gradual load scaling curve (replaces binary 90% cliff)
624
+ if [[ "$load_ratio" -gt 95 ]]; then
625
+ # 95%+: minimum workers only
626
+ max_by_cpu="$MIN_WORKERS"
627
+ daemon_log WARN "Auto-scale: critical load (${load_ratio}%) — minimum workers only"
628
+ elif [[ "$load_ratio" -gt 85 ]]; then
629
+ # 85-95%: reduce by 50%
630
+ max_by_cpu=$(( max_by_cpu / 2 ))
631
+ [[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
632
+ daemon_log WARN "Auto-scale: high load (${load_ratio}%) — reducing capacity 50%"
633
+ elif [[ "$load_ratio" -gt 70 ]]; then
634
+ # 70-85%: reduce by 25%
635
+ max_by_cpu=$(( (max_by_cpu * 3) / 4 ))
636
+ [[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
637
+ daemon_log INFO "Auto-scale: moderate load (${load_ratio}%) — reducing capacity 25%"
638
+ fi
639
+ # 0-70%: full capacity (no change)
640
+
641
+ # ── Available memory ──
642
+ local avail_mem_gb=8
643
+ if [[ "$(uname -s)" == "Darwin" ]]; then
644
+ local page_size free_pages inactive_pages purgeable_pages speculative_pages
645
+ page_size=$(vm_stat | awk '/page size of/ {for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/) print $i}')
646
+ page_size="${page_size:-16384}"
647
+ free_pages=$(vm_stat | awk '/^Pages free:/ {gsub(/\./, "", $NF); print $NF}')
648
+ free_pages="${free_pages:-0}"
649
+ speculative_pages=$(vm_stat | awk '/^Pages speculative:/ {gsub(/\./, "", $NF); print $NF}')
650
+ speculative_pages="${speculative_pages:-0}"
651
+ inactive_pages=$(vm_stat | awk '/^Pages inactive:/ {gsub(/\./, "", $NF); print $NF}')
652
+ inactive_pages="${inactive_pages:-0}"
653
+ purgeable_pages=$(vm_stat | awk '/^Pages purgeable:/ {gsub(/\./, "", $NF); print $NF}')
654
+ purgeable_pages="${purgeable_pages:-0}"
655
+ local avail_pages=$(( free_pages + speculative_pages + inactive_pages + purgeable_pages ))
656
+ if [[ "$avail_pages" -gt 0 && "$page_size" -gt 0 ]]; then
657
+ local free_bytes=$(( avail_pages * page_size ))
658
+ avail_mem_gb=$(( free_bytes / 1073741824 ))
659
+ fi
660
+ else
661
+ local avail_kb
662
+ avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo 2>/dev/null || echo "8388608")
663
+ avail_mem_gb=$(( avail_kb / 1048576 ))
664
+ fi
665
+ [[ "$avail_mem_gb" -lt 1 ]] && avail_mem_gb=1
666
+ local max_by_mem=$(( avail_mem_gb / WORKER_MEM_GB ))
667
+ [[ "$max_by_mem" -lt 1 ]] && max_by_mem=1
668
+
669
+ # ── Budget remaining (adaptive cost estimate) ──
670
+ local max_by_budget="$MAX_WORKERS"
671
+ local remaining_usd
672
+ remaining_usd=$("$SCRIPT_DIR/sw-cost.sh" remaining-budget 2>/dev/null || echo "unlimited")
673
+ if [[ "$remaining_usd" != "unlimited" && -n "$remaining_usd" ]]; then
674
+ if awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { exit !(r > 0 && c > 0) }'; then
675
+ max_by_budget=$(awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { printf "%.0f", r / c }')
676
+ [[ "$max_by_budget" -lt 0 ]] && max_by_budget=0
677
+ else
678
+ max_by_budget=0
679
+ fi
680
+ fi
681
+
682
+ # ── Queue depth (don't over-provision) ──
683
+ local queue_depth active_count
684
+ queue_depth=$(jq -r '.queued | length' "$STATE_FILE" 2>/dev/null || echo 0)
685
+ queue_depth="${queue_depth:-0}"
686
+ [[ ! "$queue_depth" =~ ^[0-9]+$ ]] && queue_depth=0
687
+ active_count=$(get_active_count)
688
+ active_count="${active_count:-0}"
689
+ [[ ! "$active_count" =~ ^[0-9]+$ ]] && active_count=0
690
+ local max_by_queue=$(( queue_depth + active_count ))
691
+ [[ "$max_by_queue" -lt 1 ]] && max_by_queue=1
692
+
693
+ # ── Vitals-driven scaling factor ──
694
+ local max_by_vitals="$MAX_WORKERS"
695
+ if type pipeline_compute_vitals &>/dev/null 2>&1 && [[ -f "$STATE_FILE" ]]; then
696
+ local _total_health=0 _health_count=0
697
+ while IFS= read -r _job; do
698
+ local _job_issue _job_worktree
699
+ _job_issue=$(echo "$_job" | jq -r '.issue // 0')
700
+ _job_worktree=$(echo "$_job" | jq -r '.worktree // ""')
701
+ if [[ -n "$_job_worktree" && -d "$_job_worktree/.claude" ]]; then
702
+ local _job_vitals _job_health
703
+ _job_vitals=$(pipeline_compute_vitals "$_job_worktree/.claude/pipeline-state.md" "$_job_worktree/.claude/pipeline-artifacts" "$_job_issue" 2>/dev/null) || true
704
+ if [[ -n "$_job_vitals" && "$_job_vitals" != "{}" ]]; then
705
+ _job_health=$(echo "$_job_vitals" | jq -r '.health_score // 50' 2>/dev/null || echo "50")
706
+ _total_health=$((_total_health + _job_health))
707
+ _health_count=$((_health_count + 1))
708
+ fi
709
+ fi
710
+ done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
711
+
712
+ if [[ "$_health_count" -gt 0 ]]; then
713
+ local _avg_health=$((_total_health / _health_count))
714
+ if [[ "$_avg_health" -lt 50 ]]; then
715
+ # Pipelines struggling — reduce workers to give each more resources
716
+ max_by_vitals=$(( MAX_WORKERS * _avg_health / 100 ))
717
+ [[ "$max_by_vitals" -lt "$MIN_WORKERS" ]] && max_by_vitals="$MIN_WORKERS"
718
+ daemon_log INFO "Auto-scale: vitals avg health ${_avg_health}% — capping at ${max_by_vitals} workers"
719
+ fi
720
+ # avg_health >= 50: no reduction (full capacity available)
721
+ fi
722
+ fi
723
+
724
+ # ── Compute final value ──
725
+ local computed="$max_by_cpu"
726
+ [[ "$max_by_mem" -lt "$computed" ]] && computed="$max_by_mem"
727
+ [[ "$max_by_budget" -lt "$computed" ]] && computed="$max_by_budget"
728
+ [[ "$max_by_queue" -lt "$computed" ]] && computed="$max_by_queue"
729
+ [[ "$max_by_vitals" -lt "$computed" ]] && computed="$max_by_vitals"
730
+ [[ "$MAX_WORKERS" -lt "$computed" ]] && computed="$MAX_WORKERS"
731
+
732
+ # Respect fleet-assigned ceiling if set
733
+ if [[ -n "${FLEET_MAX_PARALLEL:-}" && "$FLEET_MAX_PARALLEL" -lt "$computed" ]]; then
734
+ computed="$FLEET_MAX_PARALLEL"
735
+ fi
736
+
737
+ # Clamp to min_workers
738
+ [[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"
739
+
740
+ # ── Gradual scaling: change by at most 1 at a time (adaptive) ──
741
+ if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
742
+ if [[ "$computed" -gt "$prev_max" ]]; then
743
+ # Check success rate at target parallelism before scaling up
744
+ local target_rate
745
+ target_rate=$(get_success_rate_at_parallelism "$((prev_max + 1))")
746
+ if [[ "$target_rate" -lt 50 ]]; then
747
+ # Poor success rate at higher parallelism — hold steady
748
+ computed="$prev_max"
749
+ daemon_log INFO "Auto-scale: holding at ${prev_max} (success rate ${target_rate}% at $((prev_max + 1)))"
750
+ else
751
+ # Scale up by 1, not jump to target
752
+ computed=$((prev_max + 1))
753
+ fi
754
+ elif [[ "$computed" -lt "$prev_max" ]]; then
755
+ # Scale down by 1, not drop to minimum
756
+ computed=$((prev_max - 1))
757
+ [[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"
758
+ fi
759
+ fi
760
+
761
+ MAX_PARALLEL="$computed"
762
+
763
+ if [[ "$MAX_PARALLEL" -ne "$prev_max" ]]; then
764
+ daemon_log INFO "Auto-scale: ${prev_max} → ${MAX_PARALLEL} (cpu=${max_by_cpu} mem=${max_by_mem} budget=${max_by_budget} queue=${max_by_queue} load=${load_ratio}%)"
765
+ emit_event "daemon.scale" \
766
+ "from=$prev_max" \
767
+ "to=$MAX_PARALLEL" \
768
+ "max_by_cpu=$max_by_cpu" \
769
+ "max_by_mem=$max_by_mem" \
770
+ "max_by_budget=$max_by_budget" \
771
+ "max_by_queue=$max_by_queue" \
772
+ "cpu_cores=$cpu_cores" \
773
+ "avail_mem_gb=$avail_mem_gb" \
774
+ "remaining_usd=$remaining_usd" \
775
+ "load_ratio=$load_ratio"
776
+ fi
777
+ }
778
+
779
+ # ─── Fleet Config Reload ──────────────────────────────────────────────────
780
+ # Checks for fleet-reload.flag and reloads MAX_PARALLEL from fleet-managed config
781
+
782
+ # daemon_reload_config — one-shot fleet config reload. If $HOME/.shipwright/fleet-reload.flag
+ # exists, reads max_parallel from .claude/.fleet-daemon-config.json (when present), applies
+ # it to both MAX_PARALLEL and the FLEET_MAX_PARALLEL ceiling, emits a "daemon.fleet_reload"
+ # event, and always removes the flag file afterward (even if no config was found).
+ daemon_reload_config() {
783
+ local reload_flag="$HOME/.shipwright/fleet-reload.flag"
784
+ if [[ ! -f "$reload_flag" ]]; then
785
+ return
786
+ fi
787
+
788
+ local fleet_config=".claude/.fleet-daemon-config.json"
789
+ if [[ -f "$fleet_config" ]]; then
790
+ local new_max
791
+ new_max=$(jq -r '.max_parallel // empty' "$fleet_config" 2>/dev/null || true)
792
+ if [[ -n "$new_max" && "$new_max" != "null" ]]; then
793
+ local prev="$MAX_PARALLEL"
794
+ FLEET_MAX_PARALLEL="$new_max"
795
+ MAX_PARALLEL="$new_max"
796
+ daemon_log INFO "Fleet reload: max_parallel ${prev} → ${MAX_PARALLEL} (fleet ceiling: ${FLEET_MAX_PARALLEL})"
797
+ emit_event "daemon.fleet_reload" "from=$prev" "to=$MAX_PARALLEL"
798
+ fi
799
+ fi
800
+
801
+ # Consume the flag so the reload is one-shot.
+ rm -f "$reload_flag"
802
+ }
803
+
804
+ # ─── Self-Optimizing Metrics Loop ──────────────────────────────────────────
805
+
806
+ # daemon_self_optimize — DORA-metrics feedback loop. Reads the last 7 days of events
+ # from EVENTS_FILE, derives change-failure rate (CFR), median cycle time, deploy
+ # frequency, and MTTR, then tunes daemon globals (PIPELINE_TEMPLATE, MAX_PARALLEL,
+ # POLL_INTERVAL, AUTO_TEMPLATE) and persists the adjustments to daemon-config.json
+ # plus the daemon state. No-op unless SELF_OPTIMIZE=true or when EVENTS_FILE is missing.
+ daemon_self_optimize() {
807
+ if [[ "${SELF_OPTIMIZE:-false}" != "true" ]]; then
808
+ return
809
+ fi
810
+
811
+ if [[ ! -f "$EVENTS_FILE" ]]; then
812
+ return
813
+ fi
814
+
815
+ # ── Intelligence-powered optimization (if enabled) ──
816
+ if [[ "${OPTIMIZATION_ENABLED:-false}" == "true" ]] && type optimize_full_analysis &>/dev/null 2>&1; then
817
+ daemon_log INFO "Running intelligence-powered optimization"
818
+ optimize_full_analysis 2>/dev/null || {
819
+ daemon_log WARN "Intelligence optimization failed — falling back to DORA-based tuning"
820
+ }
821
+ # Still run DORA-based tuning below as a complement
822
+ fi
823
+
824
+ daemon_log INFO "Running self-optimization check"
825
+
826
+ # Read DORA metrics from recent events (last 7 days)
827
+ local cutoff_epoch
828
+ cutoff_epoch=$(( $(now_epoch) - (7 * 86400) ))
829
+
830
+ local period_events
831
+ period_events=$(jq -c "select(.ts_epoch >= $cutoff_epoch)" "$EVENTS_FILE" 2>/dev/null || true)
832
+
833
+ if [[ -z "$period_events" ]]; then
834
+ daemon_log INFO "No recent events for optimization"
835
+ return
836
+ fi
837
+
838
+ local total_completed successes failures
839
+ total_completed=$(echo "$period_events" | jq -s '[.[] | select(.type == "pipeline.completed")] | length')
840
+ successes=$(echo "$period_events" | jq -s '[.[] | select(.type == "pipeline.completed" and .result == "success")] | length')
841
+ failures=$(echo "$period_events" | jq -s '[.[] | select(.type == "pipeline.completed" and .result == "failure")] | length')
842
+
843
+ # Change Failure Rate
844
+ local cfr=0
845
+ if [[ "$total_completed" -gt 0 ]]; then
846
+ cfr=$(echo "$failures $total_completed" | awk '{printf "%.0f", ($1 / $2) * 100}')
847
+ fi
848
+
849
+ # Cycle time (median, in seconds)
850
+ local cycle_time_median
851
+ cycle_time_median=$(echo "$period_events" | \
852
+ jq -s '[.[] | select(.type == "pipeline.completed" and .result == "success") | .duration_s // 0] | sort | if length > 0 then .[length/2 | floor] else 0 end')
853
+
854
+ # Deploy frequency (per week)
855
+ local deploy_freq
856
+ deploy_freq=$(echo "$successes" | awk '{printf "%.1f", $1 / 1}') # Already 7 days
857
+
858
+ # MTTR: mean seconds between each failure and the next success, per jq scan below.
+ local mttr
860
+ mttr=$(echo "$period_events" | \
861
+ jq -s '
862
+ [.[] | select(.type == "pipeline.completed")] | sort_by(.ts_epoch // 0) |
863
+ [range(length) as $i |
864
+ if .[$i].result == "failure" then
865
+ [.[$i+1:][] | select(.result == "success")][0] as $next |
866
+ if $next and $next.ts_epoch and .[$i].ts_epoch then
867
+ ($next.ts_epoch - .[$i].ts_epoch)
868
+ else null end
869
+ else null end
870
+ ] | map(select(. != null)) |
871
+ if length > 0 then (add / length | floor) else 0 end
872
+ ')
873
+
874
+ local adjustments=()
875
+
876
+ # ── CFR > 20%: enable compound_quality, increase max_cycles ──
877
+ if [[ "$cfr" -gt 40 ]]; then
878
+ PIPELINE_TEMPLATE="full"
879
+ adjustments+=("template→full (CFR ${cfr}% > 40%)")
880
+ daemon_log WARN "Self-optimize: CFR ${cfr}% critical — switching to full template"
881
+ elif [[ "$cfr" -gt 20 ]]; then
882
+ # NOTE(review): this branch only records/logs the adjustment — no variable is
+ # actually set here, so "compound quality" is advisory only. Confirm intended.
+ adjustments+=("compound_quality enabled (CFR ${cfr}% > 20%)")
883
+ daemon_log WARN "Self-optimize: CFR ${cfr}% elevated — enabling compound quality"
884
+ fi
885
+
886
+ # ── Lead time > 4hrs: increase max_parallel, reduce poll_interval ──
887
+ if [[ "$cycle_time_median" -gt 14400 ]]; then
888
+ MAX_PARALLEL=$((MAX_PARALLEL + 1))
889
+ if [[ "$POLL_INTERVAL" -gt 30 ]]; then
890
+ POLL_INTERVAL=$((POLL_INTERVAL / 2))
891
+ fi
892
+ adjustments+=("max_parallel→${MAX_PARALLEL}, poll_interval→${POLL_INTERVAL}s (lead time > 4hrs)")
893
+ daemon_log WARN "Self-optimize: lead time $(format_duration "$cycle_time_median") — increasing parallelism"
894
+ elif [[ "$cycle_time_median" -gt 7200 ]]; then
895
+ # ── Lead time > 2hrs: enable auto_template for fast-pathing ──
896
+ AUTO_TEMPLATE="true"
897
+ adjustments+=("auto_template enabled (lead time > 2hrs)")
898
+ daemon_log INFO "Self-optimize: lead time $(format_duration "$cycle_time_median") — enabling adaptive templates"
899
+ fi
900
+
901
+ # ── Deploy freq < 1/day (< 7/week): enable merge stage ──
902
+ # NOTE: relies on bc; when bc is absent the fallback "0" suppresses the recommendation.
+ if [[ "$(echo "$deploy_freq < 7" | bc -l 2>/dev/null || echo 0)" == "1" ]]; then
903
+ adjustments+=("merge stage recommended (deploy freq ${deploy_freq}/week)")
904
+ daemon_log INFO "Self-optimize: low deploy frequency — consider enabling merge stage"
905
+ fi
906
+
907
+ # ── MTTR > 2hrs: enable auto_rollback ──
908
+ if [[ "$mttr" -gt 7200 ]]; then
909
+ adjustments+=("auto_rollback recommended (MTTR $(format_duration "$mttr"))")
910
+ daemon_log WARN "Self-optimize: high MTTR $(format_duration "$mttr") — consider enabling auto-rollback"
911
+ fi
912
+
913
+ # Write adjustments to state and persist to config
914
+ if [[ ${#adjustments[@]} -gt 0 ]]; then
915
+ local adj_str
916
+ adj_str=$(printf '%s; ' "${adjustments[@]}")
917
+
918
+ locked_state_update \
919
+ --arg adj "$adj_str" \
920
+ --arg ts "$(now_iso)" \
921
+ '.last_optimization = {timestamp: $ts, adjustments: $adj}'
922
+
923
+ # ── Persist adjustments to daemon-config.json (survives restart) ──
924
+ local config_file="${CONFIG_PATH:-.claude/daemon-config.json}"
925
+ if [[ -f "$config_file" ]]; then
926
+ local tmp_config
927
+ tmp_config=$(jq \
928
+ --argjson max_parallel "$MAX_PARALLEL" \
929
+ --argjson poll_interval "$POLL_INTERVAL" \
930
+ --arg template "$PIPELINE_TEMPLATE" \
931
+ --arg auto_template "${AUTO_TEMPLATE:-false}" \
932
+ --arg ts "$(now_iso)" \
933
+ --arg adj "$adj_str" \
934
+ '.max_parallel = $max_parallel |
935
+ .poll_interval = $poll_interval |
936
+ .pipeline_template = $template |
937
+ .auto_template = ($auto_template == "true") |
938
+ .last_optimization = {timestamp: $ts, adjustments: $adj}' \
939
+ "$config_file")
940
+ # Atomic write: tmp file + mv
941
+ local tmp_cfg_file="${config_file}.tmp.$$"
942
+ echo "$tmp_config" > "$tmp_cfg_file"
943
+ mv "$tmp_cfg_file" "$config_file"
944
+ daemon_log INFO "Self-optimize: persisted adjustments to ${config_file}"
945
+ fi
946
+
947
+ emit_event "daemon.optimize" "adjustments=${adj_str}" "cfr=$cfr" "cycle_time=$cycle_time_median" "deploy_freq=$deploy_freq" "mttr=$mttr"
948
+ daemon_log SUCCESS "Self-optimization applied ${#adjustments[@]} adjustment(s)"
949
+ else
950
+ daemon_log INFO "Self-optimization: all metrics within thresholds"
951
+ fi
952
+ }
953
+
954
+ # ─── Stale State Reaper ──────────────────────────────────────────────────────
955
+ # Cleans old worktrees, pipeline artifacts, and completed state entries.
956
+ # Called every N poll cycles (configurable via stale_reaper_interval).
957
+
958
+ # daemon_cleanup_stale — stale state reaper. Removes daemon-created worktrees and
+ # branches, expired checkpoints, old pipeline artifacts, pruned state entries,
+ # stale retry counts, a pipeline-state.md stuck in "running", and orphaned remote
+ # pipeline/* branches — everything older than STALE_REAPER_AGE_DAYS (default 7).
+ # Emits a "daemon.cleanup" event with the total item count. No-op when
+ # STALE_REAPER_ENABLED != true.
+ daemon_cleanup_stale() {
959
+ if [[ "${STALE_REAPER_ENABLED:-true}" != "true" ]]; then
960
+ return
961
+ fi
962
+
963
+ daemon_log INFO "Running stale state reaper"
964
+ local cleaned=0
965
+ local age_days="${STALE_REAPER_AGE_DAYS:-7}"
966
+ local age_secs=$((age_days * 86400))
967
+ local now_e
968
+ now_e=$(now_epoch)
969
+
970
+ # ── 1. Clean old git worktrees ──
971
+ if command -v git &>/dev/null; then
972
+ while IFS= read -r line; do
973
+ local wt_path
974
+ wt_path=$(echo "$line" | awk '{print $1}')
975
+ # Only clean daemon-created worktrees
976
+ [[ "$wt_path" == *"daemon-issue-"* ]] || continue
977
+ # Check worktree age via directory mtime
978
+ local mtime
979
+ # stat -f is BSD/macOS, stat -c is GNU/Linux — try both.
+ mtime=$(stat -f '%m' "$wt_path" 2>/dev/null || stat -c '%Y' "$wt_path" 2>/dev/null || echo "0")
980
+ if [[ $((now_e - mtime)) -gt $age_secs ]]; then
981
+ daemon_log INFO "Removing stale worktree: ${wt_path}"
982
+ git worktree remove "$wt_path" --force 2>/dev/null || true
983
+ cleaned=$((cleaned + 1))
984
+ fi
985
+ done < <(git worktree list --porcelain 2>/dev/null | grep '^worktree ' | sed 's/^worktree //')
986
+ fi
987
+
988
+ # ── 2. Expire old checkpoints ──
989
+ if [[ -x "$SCRIPT_DIR/sw-checkpoint.sh" ]]; then
990
+ local expired_output
991
+ expired_output=$(bash "$SCRIPT_DIR/sw-checkpoint.sh" expire --hours "$((age_days * 24))" 2>/dev/null || true)
992
+ if [[ -n "$expired_output" ]] && echo "$expired_output" | grep -q "Expired"; then
993
+ local expired_count
994
+ expired_count=$(echo "$expired_output" | grep -c "Expired" || true)
995
+ cleaned=$((cleaned + ${expired_count:-0}))
996
+ daemon_log INFO "Expired ${expired_count:-0} old checkpoint(s)"
997
+ fi
998
+ fi
999
+
1000
+ # ── 3. Clean old pipeline artifacts (subdirectories only) ──
1001
+ local artifacts_dir=".claude/pipeline-artifacts"
1002
+ if [[ -d "$artifacts_dir" ]]; then
1003
+ while IFS= read -r artifact_dir; do
1004
+ [[ -d "$artifact_dir" ]] || continue
1005
+ local mtime
1006
+ mtime=$(stat -f '%m' "$artifact_dir" 2>/dev/null || stat -c '%Y' "$artifact_dir" 2>/dev/null || echo "0")
1007
+ if [[ $((now_e - mtime)) -gt $age_secs ]]; then
1008
+ daemon_log INFO "Removing stale artifact: ${artifact_dir}"
1009
+ rm -rf "$artifact_dir"
1010
+ cleaned=$((cleaned + 1))
1011
+ fi
1012
+ done < <(find "$artifacts_dir" -mindepth 1 -maxdepth 1 -type d 2>/dev/null)
1013
+ fi
1014
+
1015
+ # ── 4. Clean orphaned daemon/* branches (no matching worktree or active job) ──
1016
+ if command -v git &>/dev/null; then
1017
+ while IFS= read -r branch; do
1018
+ [[ -z "$branch" ]] && continue
1019
+ branch="${branch## }" # trim leading spaces
1020
+ # Only clean daemon-created branches
1021
+ [[ "$branch" == daemon/issue-* ]] || continue
1022
+ # Extract issue number
1023
+ local branch_issue_num="${branch#daemon/issue-}"
1024
+ # Skip if there's an active job for this issue
1025
+ if daemon_is_inflight "$branch_issue_num" 2>/dev/null; then
1026
+ continue
1027
+ fi
1028
+ daemon_log INFO "Removing orphaned branch: ${branch}"
1029
+ git branch -D "$branch" 2>/dev/null || true
1030
+ cleaned=$((cleaned + 1))
1031
+ done < <(git branch --list 'daemon/issue-*' 2>/dev/null)
1032
+ fi
1033
+
1034
+ # ── 5. Prune completed/failed state entries older than age_days ──
1035
+ if [[ -f "$STATE_FILE" ]]; then
1036
+ local cutoff_iso
1037
+ cutoff_iso=$(epoch_to_iso $((now_e - age_secs)))
1038
+ local before_count
1039
+ before_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
1040
+ locked_state_update --arg cutoff "$cutoff_iso" \
1041
+ '.completed = [.completed[] | select(.completed_at > $cutoff)]' 2>/dev/null || true
1042
+ local after_count
1043
+ after_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
1044
+ local pruned=$((before_count - after_count))
1045
+ if [[ "$pruned" -gt 0 ]]; then
1046
+ daemon_log INFO "Pruned ${pruned} old completed state entries"
1047
+ cleaned=$((cleaned + pruned))
1048
+ fi
1049
+ fi
1050
+
1051
+ # ── 6. Prune stale retry_counts (issues no longer in flight or queued) ──
1052
+ if [[ -f "$STATE_FILE" ]]; then
1053
+ local retry_keys
1054
+ retry_keys=$(jq -r '.retry_counts // {} | keys[]' "$STATE_FILE" 2>/dev/null || true)
1055
+ local stale_keys=()
1056
+ while IFS= read -r key; do
1057
+ [[ -z "$key" ]] && continue
1058
+ if ! daemon_is_inflight "$key" 2>/dev/null; then
1059
+ stale_keys+=("$key")
1060
+ fi
1061
+ done <<< "$retry_keys"
1062
+ if [[ ${#stale_keys[@]} -gt 0 ]]; then
1063
+ for sk in "${stale_keys[@]}"; do
1064
+ locked_state_update --arg k "$sk" 'del(.retry_counts[$k])' 2>/dev/null || continue
1065
+ done
1066
+ daemon_log INFO "Pruned ${#stale_keys[@]} stale retry count(s)"
1067
+ cleaned=$((cleaned + ${#stale_keys[@]}))
1068
+ fi
1069
+ fi
1070
+
1071
+ # ── 7. Detect stale pipeline-state.md stuck in "running" ──
1072
+ local pipeline_state=".claude/pipeline-state.md"
1073
+ if [[ -f "$pipeline_state" ]]; then
1074
+ local ps_status=""
1075
+ ps_status=$(sed -n 's/^status: *//p' "$pipeline_state" 2>/dev/null | head -1 | tr -d ' ')
1076
+ if [[ "$ps_status" == "running" ]]; then
1077
+ local ps_mtime
1078
+ ps_mtime=$(stat -f '%m' "$pipeline_state" 2>/dev/null || stat -c '%Y' "$pipeline_state" 2>/dev/null || echo "0")
1079
+ local ps_age=$((now_e - ps_mtime))
1080
+ # If pipeline-state.md has been "running" for more than 2 hours and no active job
1081
+ if [[ "$ps_age" -gt 7200 ]]; then
1082
+ local has_active=false
1083
+ if [[ -f "$STATE_FILE" ]]; then
1084
+ local active_count
1085
+ active_count=$(jq '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo "0")
1086
+ [[ "${active_count:-0}" -gt 0 ]] && has_active=true
1087
+ fi
1088
+ if [[ "$has_active" == "false" ]]; then
1089
+ daemon_log WARN "Stale pipeline-state.md stuck in 'running' for ${ps_age}s with no active jobs — marking failed"
1090
+ # Atomically update status to failed
1091
+ local tmp_ps="${pipeline_state}.tmp.$$"
1092
+ sed 's/^status: *running/status: failed (stale — cleaned by daemon)/' "$pipeline_state" > "$tmp_ps" 2>/dev/null && mv "$tmp_ps" "$pipeline_state" || rm -f "$tmp_ps"
1093
+ emit_event "daemon.stale_pipeline_state" "age_s=$ps_age"
1094
+ cleaned=$((cleaned + 1))
1095
+ fi
1096
+ fi
1097
+ fi
1098
+ fi
1099
+
1100
+ # ── 8. Clean remote branches for merged pipeline/* branches ──
1101
+ if command -v git &>/dev/null && [[ "${NO_GITHUB:-}" != "true" ]]; then
1102
+ while IFS= read -r branch; do
1103
+ [[ -z "$branch" ]] && continue
1104
+ branch="${branch## }"
1105
+ [[ "$branch" == pipeline/* ]] || continue
1106
+ local br_issue="${branch#pipeline/pipeline-issue-}"
1107
+ if ! daemon_is_inflight "$br_issue" 2>/dev/null; then
1108
+ daemon_log INFO "Removing orphaned pipeline branch: ${branch}"
1109
+ git branch -D "$branch" 2>/dev/null || true
1110
+ git push origin --delete "$branch" 2>/dev/null || true
1111
+ cleaned=$((cleaned + 1))
1112
+ fi
1113
+ done < <(git branch --list 'pipeline/*' 2>/dev/null)
1114
+ fi
1115
+
1116
+ if [[ "$cleaned" -gt 0 ]]; then
1117
+ emit_event "daemon.cleanup" "cleaned=$cleaned" "age_days=$age_days"
1118
+ daemon_log SUCCESS "Stale reaper cleaned ${cleaned} item(s)"
1119
+ else
1120
+ daemon_log INFO "Stale reaper: nothing to clean"
1121
+ fi
1122
+ }
1123
+
1124
+ # ─── Poll Loop ───────────────────────────────────────────────────────────────
1125
+
1126
+ POLL_CYCLE_COUNT=0
1127
+
1128
+ # daemon_poll_loop — main daemon loop. Each cycle: auth preflight, issue polling,
+ # job reaping, health check, then modulo-scheduled maintenance (config reload /3,
+ # degradation check /5, auto-scale, self-optimize, stale reaper, log rotation).
+ # Runs a patrol when the queue is idle, then sleeps for an adaptive interval in 1s
+ # slices so SHUTDOWN_FLAG is honored within ~1s. Every call is ||-guarded so a
+ # transient failure never kills the daemon under set -e.
+ daemon_poll_loop() {
1129
+ daemon_log INFO "Entering poll loop (interval: ${POLL_INTERVAL}s, max_parallel: ${MAX_PARALLEL})"
1130
+ daemon_log INFO "Watching for label: ${CYAN}${WATCH_LABEL}${RESET}"
1131
+
1132
+ while [[ ! -f "$SHUTDOWN_FLAG" ]]; do
1133
+ # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
1134
+ # The || operator disables set -e for the entire call chain, so transient failures
1135
+ # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
1136
+ daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
1137
+ daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
1138
+ daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
1139
+ daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
1140
+
1141
+ # Increment cycle counter (must be before all modulo checks)
1142
+ POLL_CYCLE_COUNT=$((POLL_CYCLE_COUNT + 1))
1143
+
1144
+ # Fleet config reload every 3 cycles
1145
+ if [[ $((POLL_CYCLE_COUNT % 3)) -eq 0 ]]; then
1146
+ daemon_reload_config || daemon_log WARN "daemon_reload_config failed — continuing"
1147
+ fi
1148
+
1149
+ # Check degradation every 5 poll cycles
1150
+ if [[ $((POLL_CYCLE_COUNT % 5)) -eq 0 ]]; then
1151
+ daemon_check_degradation || daemon_log WARN "daemon_check_degradation failed — continuing"
1152
+ fi
1153
+
1154
+ # Auto-scale every N cycles (default: 5)
1155
+ if [[ $((POLL_CYCLE_COUNT % ${AUTO_SCALE_INTERVAL:-5})) -eq 0 ]]; then
1156
+ daemon_auto_scale || daemon_log WARN "daemon_auto_scale failed — continuing"
1157
+ fi
1158
+
1159
+ # Self-optimize every N cycles (default: 10)
1160
+ if [[ $((POLL_CYCLE_COUNT % ${OPTIMIZE_INTERVAL:-10})) -eq 0 ]]; then
1161
+ daemon_self_optimize || daemon_log WARN "daemon_self_optimize failed — continuing"
1162
+ fi
1163
+
1164
+ # Stale state reaper every N cycles (default: 10)
1165
+ if [[ $((POLL_CYCLE_COUNT % ${STALE_REAPER_INTERVAL:-10})) -eq 0 ]]; then
1166
+ daemon_cleanup_stale || daemon_log WARN "daemon_cleanup_stale failed — continuing"
1167
+ fi
1168
+
1169
+ # Rotate event log every 10 cycles (~10 min with 60s interval)
1170
+ if [[ $((POLL_CYCLE_COUNT % 10)) -eq 0 ]]; then
1171
+ rotate_event_log || true
1172
+ fi
1173
+
1174
+ # Proactive patrol during quiet periods (with adaptive limits)
1175
+ local issue_count_now active_count_now
1176
+ issue_count_now=$(jq -r '.queued | length' "$STATE_FILE" 2>/dev/null || echo 0)
1177
+ active_count_now=$(get_active_count || echo 0)
1178
+ if [[ "$issue_count_now" -eq 0 ]] && [[ "$active_count_now" -eq 0 ]]; then
1179
+ local now_e
1180
+ now_e=$(now_epoch || date +%s)
1181
+ if [[ $((now_e - LAST_PATROL_EPOCH)) -ge "$PATROL_INTERVAL" ]]; then
1182
+ load_adaptive_patrol_limits || true
1183
+ daemon_log INFO "No active work — running patrol"
1184
+ daemon_patrol --once || daemon_log WARN "daemon_patrol failed — continuing"
1185
+ LAST_PATROL_EPOCH=$now_e
1186
+ fi
1187
+ fi
1188
+
1189
+ # ── Adaptive poll interval: adjust sleep based on queue state ──
1190
+ local effective_interval
1191
+ effective_interval=$(get_adaptive_poll_interval "$issue_count_now" "$active_count_now" || echo "${POLL_INTERVAL:-30}")
1192
+
1193
+ # Sleep in 1s intervals so we can catch shutdown quickly
1194
+ local i=0
1195
+ while [[ $i -lt $effective_interval ]] && [[ ! -f "$SHUTDOWN_FLAG" ]]; do
1196
+ sleep 1 || true # Guard against signal interruption under set -e
1197
+ i=$((i + 1))
1198
+ done
1199
+ done
1200
+
1201
+ daemon_log INFO "Shutdown flag detected — exiting poll loop"
1202
+ }