shipwright-cli 3.1.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (283) hide show
  1. package/.claude/agents/code-reviewer.md +2 -0
  2. package/.claude/agents/devops-engineer.md +2 -0
  3. package/.claude/agents/doc-fleet-agent.md +2 -0
  4. package/.claude/agents/pipeline-agent.md +2 -0
  5. package/.claude/agents/shell-script-specialist.md +2 -0
  6. package/.claude/agents/test-specialist.md +2 -0
  7. package/.claude/hooks/agent-crash-capture.sh +32 -0
  8. package/.claude/hooks/post-tool-use.sh +3 -2
  9. package/.claude/hooks/pre-tool-use.sh +35 -3
  10. package/README.md +22 -8
  11. package/claude-code/hooks/config-change.sh +18 -0
  12. package/claude-code/hooks/instructions-reloaded.sh +7 -0
  13. package/claude-code/hooks/worktree-create.sh +25 -0
  14. package/claude-code/hooks/worktree-remove.sh +20 -0
  15. package/config/code-constitution.json +130 -0
  16. package/config/defaults.json +25 -2
  17. package/config/policy.json +1 -1
  18. package/dashboard/middleware/auth.ts +134 -0
  19. package/dashboard/middleware/constants.ts +21 -0
  20. package/dashboard/public/index.html +8 -6
  21. package/dashboard/public/styles.css +176 -97
  22. package/dashboard/routes/auth.ts +38 -0
  23. package/dashboard/server.ts +117 -25
  24. package/dashboard/services/config.ts +26 -0
  25. package/dashboard/services/db.ts +118 -0
  26. package/dashboard/src/canvas/pixel-agent.ts +298 -0
  27. package/dashboard/src/canvas/pixel-sprites.ts +440 -0
  28. package/dashboard/src/canvas/shipyard-effects.ts +367 -0
  29. package/dashboard/src/canvas/shipyard-scene.ts +616 -0
  30. package/dashboard/src/canvas/submarine-layout.ts +267 -0
  31. package/dashboard/src/components/header.ts +8 -7
  32. package/dashboard/src/core/api.ts +5 -0
  33. package/dashboard/src/core/router.ts +1 -0
  34. package/dashboard/src/design/submarine-theme.ts +253 -0
  35. package/dashboard/src/main.ts +2 -0
  36. package/dashboard/src/types/api.ts +12 -1
  37. package/dashboard/src/views/activity.ts +2 -1
  38. package/dashboard/src/views/metrics.ts +69 -1
  39. package/dashboard/src/views/shipyard.ts +39 -0
  40. package/dashboard/types/index.ts +166 -0
  41. package/docs/plans/2026-02-28-compound-audit-and-shipyard-design.md +186 -0
  42. package/docs/plans/2026-02-28-skipper-shipwright-implementation-plan.md +1182 -0
  43. package/docs/plans/2026-02-28-skipper-shipwright-integration-design.md +531 -0
  44. package/docs/plans/2026-03-01-ai-powered-skill-injection-design.md +298 -0
  45. package/docs/plans/2026-03-01-ai-powered-skill-injection-plan.md +1109 -0
  46. package/docs/plans/2026-03-01-capabilities-cleanup-plan.md +658 -0
  47. package/docs/plans/2026-03-01-clean-architecture-plan.md +924 -0
  48. package/docs/plans/2026-03-01-compound-audit-cascade-design.md +191 -0
  49. package/docs/plans/2026-03-01-compound-audit-cascade-plan.md +921 -0
  50. package/docs/plans/2026-03-01-deep-integration-plan.md +851 -0
  51. package/docs/plans/2026-03-01-pipeline-audit-trail-design.md +145 -0
  52. package/docs/plans/2026-03-01-pipeline-audit-trail-plan.md +770 -0
  53. package/docs/plans/2026-03-01-refined-depths-brand-design.md +382 -0
  54. package/docs/plans/2026-03-01-refined-depths-implementation.md +599 -0
  55. package/docs/plans/2026-03-01-skipper-kernel-integration-design.md +203 -0
  56. package/docs/plans/2026-03-01-unified-platform-design.md +272 -0
  57. package/docs/plans/2026-03-07-claude-code-feature-integration-design.md +189 -0
  58. package/docs/plans/2026-03-07-claude-code-feature-integration-plan.md +1165 -0
  59. package/docs/research/BACKLOG_QUICK_REFERENCE.md +352 -0
  60. package/docs/research/CUTTING_EDGE_RESEARCH_2026.md +546 -0
  61. package/docs/research/RESEARCH_INDEX.md +439 -0
  62. package/docs/research/RESEARCH_SOURCES.md +440 -0
  63. package/docs/research/RESEARCH_SUMMARY.txt +275 -0
  64. package/docs/superpowers/specs/2026-03-10-pipeline-quality-revolution-design.md +341 -0
  65. package/package.json +2 -2
  66. package/scripts/lib/adaptive-model.sh +427 -0
  67. package/scripts/lib/adaptive-timeout.sh +316 -0
  68. package/scripts/lib/audit-trail.sh +309 -0
  69. package/scripts/lib/auto-recovery.sh +471 -0
  70. package/scripts/lib/bandit-selector.sh +431 -0
  71. package/scripts/lib/bootstrap.sh +104 -2
  72. package/scripts/lib/causal-graph.sh +455 -0
  73. package/scripts/lib/compat.sh +126 -0
  74. package/scripts/lib/compound-audit.sh +337 -0
  75. package/scripts/lib/constitutional.sh +454 -0
  76. package/scripts/lib/context-budget.sh +359 -0
  77. package/scripts/lib/convergence.sh +594 -0
  78. package/scripts/lib/cost-optimizer.sh +634 -0
  79. package/scripts/lib/daemon-adaptive.sh +14 -2
  80. package/scripts/lib/daemon-dispatch.sh +106 -17
  81. package/scripts/lib/daemon-failure.sh +34 -4
  82. package/scripts/lib/daemon-patrol.sh +25 -4
  83. package/scripts/lib/daemon-poll-github.sh +361 -0
  84. package/scripts/lib/daemon-poll-health.sh +299 -0
  85. package/scripts/lib/daemon-poll.sh +27 -611
  86. package/scripts/lib/daemon-state.sh +119 -66
  87. package/scripts/lib/daemon-triage.sh +10 -0
  88. package/scripts/lib/dod-scorecard.sh +442 -0
  89. package/scripts/lib/error-actionability.sh +300 -0
  90. package/scripts/lib/formal-spec.sh +461 -0
  91. package/scripts/lib/helpers.sh +180 -5
  92. package/scripts/lib/intent-analysis.sh +409 -0
  93. package/scripts/lib/loop-convergence.sh +350 -0
  94. package/scripts/lib/loop-iteration.sh +682 -0
  95. package/scripts/lib/loop-progress.sh +48 -0
  96. package/scripts/lib/loop-restart.sh +185 -0
  97. package/scripts/lib/memory-effectiveness.sh +506 -0
  98. package/scripts/lib/mutation-executor.sh +352 -0
  99. package/scripts/lib/outcome-feedback.sh +521 -0
  100. package/scripts/lib/pipeline-cli.sh +336 -0
  101. package/scripts/lib/pipeline-commands.sh +1216 -0
  102. package/scripts/lib/pipeline-detection.sh +101 -3
  103. package/scripts/lib/pipeline-execution.sh +897 -0
  104. package/scripts/lib/pipeline-github.sh +28 -3
  105. package/scripts/lib/pipeline-intelligence-compound.sh +431 -0
  106. package/scripts/lib/pipeline-intelligence-scoring.sh +407 -0
  107. package/scripts/lib/pipeline-intelligence-skip.sh +181 -0
  108. package/scripts/lib/pipeline-intelligence.sh +104 -1138
  109. package/scripts/lib/pipeline-quality-bash-compat.sh +182 -0
  110. package/scripts/lib/pipeline-quality-checks.sh +17 -711
  111. package/scripts/lib/pipeline-quality-gates.sh +563 -0
  112. package/scripts/lib/pipeline-stages-build.sh +730 -0
  113. package/scripts/lib/pipeline-stages-delivery.sh +965 -0
  114. package/scripts/lib/pipeline-stages-intake.sh +1133 -0
  115. package/scripts/lib/pipeline-stages-monitor.sh +407 -0
  116. package/scripts/lib/pipeline-stages-review.sh +1022 -0
  117. package/scripts/lib/pipeline-stages.sh +161 -2901
  118. package/scripts/lib/pipeline-state.sh +36 -5
  119. package/scripts/lib/pipeline-util.sh +487 -0
  120. package/scripts/lib/policy-learner.sh +438 -0
  121. package/scripts/lib/process-reward.sh +493 -0
  122. package/scripts/lib/project-detect.sh +649 -0
  123. package/scripts/lib/quality-profile.sh +334 -0
  124. package/scripts/lib/recruit-commands.sh +885 -0
  125. package/scripts/lib/recruit-learning.sh +739 -0
  126. package/scripts/lib/recruit-roles.sh +648 -0
  127. package/scripts/lib/reward-aggregator.sh +458 -0
  128. package/scripts/lib/rl-optimizer.sh +362 -0
  129. package/scripts/lib/root-cause.sh +427 -0
  130. package/scripts/lib/scope-enforcement.sh +445 -0
  131. package/scripts/lib/session-restart.sh +493 -0
  132. package/scripts/lib/skill-memory.sh +300 -0
  133. package/scripts/lib/skill-registry.sh +775 -0
  134. package/scripts/lib/spec-driven.sh +476 -0
  135. package/scripts/lib/test-helpers.sh +18 -7
  136. package/scripts/lib/test-holdout.sh +429 -0
  137. package/scripts/lib/test-optimizer.sh +511 -0
  138. package/scripts/shipwright-file-suggest.sh +45 -0
  139. package/scripts/skills/adversarial-quality.md +61 -0
  140. package/scripts/skills/api-design.md +44 -0
  141. package/scripts/skills/architecture-design.md +50 -0
  142. package/scripts/skills/brainstorming.md +43 -0
  143. package/scripts/skills/data-pipeline.md +44 -0
  144. package/scripts/skills/deploy-safety.md +64 -0
  145. package/scripts/skills/documentation.md +38 -0
  146. package/scripts/skills/frontend-design.md +45 -0
  147. package/scripts/skills/generated/.gitkeep +0 -0
  148. package/scripts/skills/generated/_refinements/.gitkeep +0 -0
  149. package/scripts/skills/generated/_refinements/adversarial-quality.patch.md +3 -0
  150. package/scripts/skills/generated/_refinements/architecture-design.patch.md +3 -0
  151. package/scripts/skills/generated/_refinements/brainstorming.patch.md +3 -0
  152. package/scripts/skills/generated/cli-version-management.md +29 -0
  153. package/scripts/skills/generated/collection-system-validation.md +99 -0
  154. package/scripts/skills/generated/large-scale-c-refactoring-coordination.md +97 -0
  155. package/scripts/skills/generated/pattern-matching-similarity-scoring.md +195 -0
  156. package/scripts/skills/generated/test-parallelization-detection.md +65 -0
  157. package/scripts/skills/observability.md +79 -0
  158. package/scripts/skills/performance.md +48 -0
  159. package/scripts/skills/pr-quality.md +49 -0
  160. package/scripts/skills/product-thinking.md +43 -0
  161. package/scripts/skills/security-audit.md +49 -0
  162. package/scripts/skills/systematic-debugging.md +40 -0
  163. package/scripts/skills/testing-strategy.md +47 -0
  164. package/scripts/skills/two-stage-review.md +52 -0
  165. package/scripts/skills/validation-thoroughness.md +55 -0
  166. package/scripts/sw +9 -3
  167. package/scripts/sw-activity.sh +9 -8
  168. package/scripts/sw-adaptive.sh +8 -7
  169. package/scripts/sw-adversarial.sh +2 -1
  170. package/scripts/sw-architecture-enforcer.sh +3 -1
  171. package/scripts/sw-auth.sh +12 -2
  172. package/scripts/sw-autonomous.sh +5 -1
  173. package/scripts/sw-changelog.sh +4 -1
  174. package/scripts/sw-checkpoint.sh +2 -1
  175. package/scripts/sw-ci.sh +15 -6
  176. package/scripts/sw-cleanup.sh +4 -26
  177. package/scripts/sw-code-review.sh +45 -20
  178. package/scripts/sw-connect.sh +2 -1
  179. package/scripts/sw-context.sh +2 -1
  180. package/scripts/sw-cost.sh +107 -5
  181. package/scripts/sw-daemon.sh +71 -11
  182. package/scripts/sw-dashboard.sh +3 -1
  183. package/scripts/sw-db.sh +71 -20
  184. package/scripts/sw-decide.sh +8 -2
  185. package/scripts/sw-decompose.sh +360 -17
  186. package/scripts/sw-deps.sh +4 -1
  187. package/scripts/sw-developer-simulation.sh +4 -1
  188. package/scripts/sw-discovery.sh +378 -5
  189. package/scripts/sw-doc-fleet.sh +4 -1
  190. package/scripts/sw-docs-agent.sh +3 -1
  191. package/scripts/sw-docs.sh +2 -1
  192. package/scripts/sw-doctor.sh +453 -2
  193. package/scripts/sw-dora.sh +4 -1
  194. package/scripts/sw-durable.sh +12 -7
  195. package/scripts/sw-e2e-orchestrator.sh +17 -16
  196. package/scripts/sw-eventbus.sh +13 -4
  197. package/scripts/sw-evidence.sh +364 -12
  198. package/scripts/sw-feedback.sh +550 -9
  199. package/scripts/sw-fix.sh +20 -1
  200. package/scripts/sw-fleet-discover.sh +6 -2
  201. package/scripts/sw-fleet-viz.sh +9 -4
  202. package/scripts/sw-fleet.sh +5 -1
  203. package/scripts/sw-github-app.sh +18 -4
  204. package/scripts/sw-github-checks.sh +3 -2
  205. package/scripts/sw-github-deploy.sh +3 -2
  206. package/scripts/sw-github-graphql.sh +18 -7
  207. package/scripts/sw-guild.sh +5 -1
  208. package/scripts/sw-heartbeat.sh +5 -30
  209. package/scripts/sw-hello.sh +67 -0
  210. package/scripts/sw-hygiene.sh +10 -3
  211. package/scripts/sw-incident.sh +273 -5
  212. package/scripts/sw-init.sh +18 -2
  213. package/scripts/sw-instrument.sh +10 -2
  214. package/scripts/sw-intelligence.sh +44 -7
  215. package/scripts/sw-jira.sh +5 -1
  216. package/scripts/sw-launchd.sh +2 -1
  217. package/scripts/sw-linear.sh +4 -1
  218. package/scripts/sw-logs.sh +4 -1
  219. package/scripts/sw-loop.sh +436 -1076
  220. package/scripts/sw-memory.sh +357 -3
  221. package/scripts/sw-mission-control.sh +6 -1
  222. package/scripts/sw-model-router.sh +483 -27
  223. package/scripts/sw-otel.sh +15 -4
  224. package/scripts/sw-oversight.sh +14 -5
  225. package/scripts/sw-patrol-meta.sh +334 -0
  226. package/scripts/sw-pipeline-composer.sh +7 -1
  227. package/scripts/sw-pipeline-vitals.sh +12 -6
  228. package/scripts/sw-pipeline.sh +54 -2653
  229. package/scripts/sw-pm.sh +16 -8
  230. package/scripts/sw-pr-lifecycle.sh +2 -1
  231. package/scripts/sw-predictive.sh +17 -5
  232. package/scripts/sw-prep.sh +185 -2
  233. package/scripts/sw-ps.sh +5 -25
  234. package/scripts/sw-public-dashboard.sh +17 -4
  235. package/scripts/sw-quality.sh +14 -6
  236. package/scripts/sw-reaper.sh +8 -25
  237. package/scripts/sw-recruit.sh +156 -2303
  238. package/scripts/sw-regression.sh +19 -12
  239. package/scripts/sw-release-manager.sh +3 -1
  240. package/scripts/sw-release.sh +4 -1
  241. package/scripts/sw-remote.sh +3 -1
  242. package/scripts/sw-replay.sh +7 -1
  243. package/scripts/sw-retro.sh +158 -1
  244. package/scripts/sw-review-rerun.sh +3 -1
  245. package/scripts/sw-scale.sh +14 -5
  246. package/scripts/sw-security-audit.sh +6 -1
  247. package/scripts/sw-self-optimize.sh +173 -6
  248. package/scripts/sw-session.sh +9 -3
  249. package/scripts/sw-setup.sh +3 -1
  250. package/scripts/sw-stall-detector.sh +406 -0
  251. package/scripts/sw-standup.sh +15 -7
  252. package/scripts/sw-status.sh +3 -1
  253. package/scripts/sw-strategic.sh +14 -6
  254. package/scripts/sw-stream.sh +13 -4
  255. package/scripts/sw-swarm.sh +20 -7
  256. package/scripts/sw-team-stages.sh +13 -6
  257. package/scripts/sw-templates.sh +7 -31
  258. package/scripts/sw-testgen.sh +17 -6
  259. package/scripts/sw-tmux-pipeline.sh +4 -1
  260. package/scripts/sw-tmux-role-color.sh +2 -0
  261. package/scripts/sw-tmux-status.sh +1 -1
  262. package/scripts/sw-tmux.sh +37 -1
  263. package/scripts/sw-trace.sh +3 -1
  264. package/scripts/sw-tracker-github.sh +3 -0
  265. package/scripts/sw-tracker-jira.sh +3 -0
  266. package/scripts/sw-tracker-linear.sh +3 -0
  267. package/scripts/sw-tracker.sh +3 -1
  268. package/scripts/sw-triage.sh +3 -2
  269. package/scripts/sw-upgrade.sh +3 -1
  270. package/scripts/sw-ux.sh +5 -2
  271. package/scripts/sw-webhook.sh +5 -2
  272. package/scripts/sw-widgets.sh +9 -4
  273. package/scripts/sw-worktree.sh +15 -3
  274. package/scripts/test-skill-injection.sh +1233 -0
  275. package/templates/pipelines/autonomous.json +27 -3
  276. package/templates/pipelines/cost-aware.json +34 -8
  277. package/templates/pipelines/deployed.json +12 -0
  278. package/templates/pipelines/enterprise.json +12 -0
  279. package/templates/pipelines/fast.json +6 -0
  280. package/templates/pipelines/full.json +27 -3
  281. package/templates/pipelines/hotfix.json +6 -0
  282. package/templates/pipelines/standard.json +12 -0
  283. package/templates/pipelines/tdd.json +12 -0
@@ -0,0 +1,299 @@
1
# daemon-poll-health.sh — Health checks and degradation detection for daemon-poll.sh
# Meant to be sourced from daemon-poll.sh. The functions in this file call
# helpers defined elsewhere (daemon_log, emit_event, now_epoch, ...), so the
# daemon state and helper libraries must already be loaded by the caller.

# Include guard: sourcing this file a second time is a no-op.
if [[ -n "${_DAEMON_POLL_HEALTH_LOADED:-}" ]]; then
    return 0
fi
_DAEMON_POLL_HEALTH_LOADED=1

# Fallback values for variables normally provided by sw-daemon.sh. Each one is
# only filled in when unset or empty, which keeps this file safe under `set -u`.

# Filesystem locations.
: "${DAEMON_DIR:=${HOME}/.shipwright}"
: "${STATE_FILE:=${DAEMON_DIR}/daemon-state.json}"
: "${PAUSE_FLAG:=${DAEMON_DIR}/daemon-pause.flag}"
: "${SHUTDOWN_FLAG:=${DAEMON_DIR}/daemon.shutdown}"
: "${EVENTS_FILE:=${DAEMON_DIR}/events.jsonl}"

# Parent of the directory holding this file. Kept as a plain assignment so a
# failing `cd` still fails the statement, exactly like the one-line form.
if [[ -z "${SCRIPT_DIR:-}" ]]; then
    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
fi

# Polling / dispatch settings.
: "${NO_GITHUB:=false}"
: "${POLL_INTERVAL:=60}"
: "${MAX_PARALLEL:=4}"
: "${WATCH_LABEL:=shipwright}"
: "${WATCH_MODE:=repo}"
: "${PIPELINE_TEMPLATE:=autonomous}"
: "${ISSUE_LIMIT:=100}"

# These three default to empty/zero, so only guarantee that they are defined.
SLACK_WEBHOOK="${SLACK_WEBHOOK:-}"
: "${BACKOFF_SECS:=0}"
: "${POLL_CYCLE_COUNT:=0}"
23
+
24
# Print the whole seconds between an ISO-8601 UTC timestamp ($1) and the epoch
# value in $2. Prints 0 when the timestamp is empty, unparseable or in the
# future, so a bad timestamp can never look like a very old job.
_daemon_health_elapsed_s() {
    local started_at="${1:-}" now_e="${2:-0}" start_e=""

    if [[ -n "$started_at" ]]; then
        # BSD/macOS `date -j -f` first, GNU `date -d` as the fallback.
        start_e=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null \
            || date -d "$started_at" +%s 2>/dev/null \
            || true)
    fi

    if [[ "$start_e" =~ ^[0-9]+$ && "$now_e" =~ ^[0-9]+$ ]] \
        && (( start_e > 0 && now_e >= start_e )); then
        echo $(( now_e - start_e ))
    else
        echo 0
    fi
}

# Print the summed %CPU (truncated to an integer) of the direct children of
# PID $1. Always prints exactly one integer, also when there are no children
# or when `pipefail` is active in the calling shell.
_daemon_health_child_cpu() {
    local parent_pid="$1" child_pids="" cpu_total=""

    child_pids=$(pgrep -P "$parent_pid" 2>/dev/null || true)
    if [[ -z "$child_pids" ]]; then
        echo 0
        return 0
    fi

    # `|| true` only neutralises the pipeline status (a child may exit between
    # pgrep and ps); it must not print a second number.
    cpu_total=$(printf '%s\n' "$child_pids" \
        | xargs -I{} ps -o pcpu= -p {} 2>/dev/null \
        | awk '{sum+=$1} END{printf "%d", sum+0}' || true)
    [[ "$cpu_total" =~ ^[0-9]+$ ]] || cpu_total=0
    echo "$cpu_total"
}

# Write the nudge note $1 for an agent that shows no visible progress.
# $2 = number of checks without progress, $3 = current stage name.
# Creates the parent directory first; returns non-zero if the note could not
# be written.
_daemon_health_write_nudge() {
    local nudge_file="$1" no_progress_count="$2" cur_stage="$3"

    mkdir -p "$(dirname "$nudge_file")" 2>/dev/null || return 1
    cat > "$nudge_file" <<NUDGE_EOF || return 1
# Nudge from Daemon Health Monitor

The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
Current stage: ${cur_stage}

If you're stuck, consider:
- Breaking the task into smaller steps
- Committing partial progress
- Running tests to validate current state

This is just a gentle check-in — take your time if you're working through a complex problem.
NUDGE_EOF
}

# Run one health pass over the daemon.
#
# For every entry of .active_jobs in $STATE_FILE whose PID is alive:
#   - enforce the optional wall-clock limit PROGRESS_HARD_LIMIT_S (0 = off);
#   - when PROGRESS_MONITORING is "true" and the job has a worktree, ask
#     daemon_collect_snapshot / daemon_assess_progress for a verdict
#     (healthy | slowing | stalled | stuck) and log, nudge or kill accordingly;
#   - otherwise fall back to the time-based check built on
#     get_adaptive_stale_timeout.
# Afterwards it checks free disk space (current directory and the daemon
# directory) and the size of $EVENTS_FILE.
#
# Globals read: STATE_FILE, EVENTS_FILE, DAEMON_DIR, PAUSE_FLAG, PROGRESS_DIR,
#   PROGRESS_HARD_LIMIT_S, PROGRESS_MONITORING, PROGRESS_CHECKS_BEFORE_KILL,
#   NUDGE_ENABLED, NUDGE_AFTER_CHECKS, PIPELINE_TEMPLATE.
# External helpers called: now_epoch, daemon_log, emit_event,
#   daemon_clear_progress, daemon_collect_snapshot, daemon_assess_progress,
#   get_adaptive_stale_timeout.
# Side effects: may signal job PIDs, write <worktree>/.claude/nudge.md, write
#   the pause flag, and emit events (a final "daemon.health" event when at
#   least one finding was recorded).
daemon_health_check() {
    local findings=0
    local now_e
    now_e=$(now_epoch)

    local daemon_dir="${DAEMON_DIR:-${HOME}/.shipwright}"
    local pause_flag="${PAUSE_FLAG:-${daemon_dir}/daemon-pause.flag}"

    if [[ -f "$STATE_FILE" ]]; then
        # ── Intelligent Health Monitoring ──
        # Instead of killing after a countdown, sense what the agent is doing.
        # Agents think for long stretches — that's normal and expected.
        # Strategy: sense → understand → be patient → nudge → only kill as last resort.

        local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
        local use_progress="${PROGRESS_MONITORING:-true}"
        local nudge_enabled="${NUDGE_ENABLED:-true}"
        local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
        # NOTE(review): the fallback location is an assumption; PROGRESS_DIR is
        # normally expected to be set by the daemon — confirm against sw-daemon.sh.
        local progress_dir="${PROGRESS_DIR:-${daemon_dir}/progress}"
        # 0 / unset disables the "dead process" kill path. Without this guard an
        # unset threshold evaluated to 0 and killed on the first "stuck" verdict.
        local kill_checks="${PROGRESS_CHECKS_BEFORE_KILL:-0}"
        [[ "$hard_limit" =~ ^[0-9]+$ ]] || hard_limit=0
        [[ "$kill_checks" =~ ^[0-9]+$ ]] || kill_checks=0

        local job pid started_at issue_num worktree elapsed
        local snapshot verdict no_progress_count cur_stage progress_file
        local agent_alive child_cpu repeated_errors nudge_file
        local stale_timeout kill_threshold

        while IFS= read -r job; do
            pid=$(jq -r '.pid' <<<"$job")
            started_at=$(jq -r '.started_at // empty' <<<"$job")
            issue_num=$(jq -r '.issue' <<<"$job")
            worktree=$(jq -r '.worktree // ""' <<<"$job")

            # Only ever signal a concrete positive PID: 0 or a negative value
            # would address a whole process group.
            if ! [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
                continue
            fi

            # Skip dead processes
            if ! kill -0 "$pid" 2>/dev/null; then
                continue
            fi

            elapsed=$(_daemon_health_elapsed_s "$started_at" "$now_e")

            # Hard wall-clock limit — disabled by default (0 = off)
            if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
                daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
                emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
                kill "$pid" 2>/dev/null || true
                daemon_clear_progress "$issue_num"
                findings=$((findings + 1))
                continue
            fi

            # ── Intelligent Progress Sensing ──
            if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
                snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')

                if [[ "$snapshot" != "{}" ]]; then
                    verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")

                    progress_file="${progress_dir}/issue-${issue_num}.json"
                    no_progress_count=$(jq -r '.no_progress_count // 0' "$progress_file" 2>/dev/null || echo 0)
                    [[ "$no_progress_count" =~ ^[0-9]+$ ]] || no_progress_count=0
                    cur_stage=$(jq -r '.stage // "unknown"' <<<"$snapshot")

                    case "$verdict" in
                        healthy)
                            # All good — agent is making progress
                            ;;
                        slowing)
                            daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
                            ;;
                        stalled)
                            # Check if agent subprocess is alive and consuming CPU
                            child_cpu=$(_daemon_health_child_cpu "$pid")
                            if [[ "$child_cpu" -gt 0 ]]; then
                                daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
                            else
                                daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
                                emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
                            fi
                            ;;
                        stuck)
                            repeated_errors=$(jq -r '.repeated_error_count // 0' "$progress_file" 2>/dev/null || echo 0)
                            [[ "$repeated_errors" =~ ^[0-9]+$ ]] || repeated_errors=0

                            # Even "stuck" — check if the process tree is alive first
                            agent_alive=false
                            child_cpu=$(_daemon_health_child_cpu "$pid")
                            if [[ "$child_cpu" -gt 0 ]]; then
                                agent_alive=true
                            fi

                            if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
                                # Agent is alive — nudge instead of kill
                                if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
                                    nudge_file="${worktree}/.claude/nudge.md"
                                    # An existing nudge file is left alone, so the agent is nudged once.
                                    if [[ ! -f "$nudge_file" ]]; then
                                        if _daemon_health_write_nudge "$nudge_file" "$no_progress_count" "$cur_stage"; then
                                            daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
                                            emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
                                        else
                                            daemon_log WARN "Issue #${issue_num} could not write nudge file ${nudge_file}"
                                        fi
                                    fi
                                else
                                    daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
                                fi
                            elif [[ "$repeated_errors" -ge 5 ]]; then
                                # Truly stuck in an error loop — kill as last resort
                                daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
                                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
                                kill "$pid" 2>/dev/null || true
                                daemon_clear_progress "$issue_num"
                                findings=$((findings + 1))
                            elif [[ "$agent_alive" != "true" && "$kill_checks" -gt 0 && "$no_progress_count" -ge "$(( kill_checks * 2 ))" ]]; then
                                # Process tree is dead AND no progress for very long time
                                daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
                                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
                                kill "$pid" 2>/dev/null || true
                                daemon_clear_progress "$issue_num"
                                findings=$((findings + 1))
                            else
                                daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
                            fi
                            ;;
                    esac
                fi
            else
                # Fallback: legacy time-based detection. Used when progress
                # monitoring is off and also when the job has no worktree.
                stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
                if [[ "$elapsed" -gt "$stale_timeout" ]]; then
                    # Re-check liveness: the process may have exited since the
                    # check at the top of the loop.
                    if kill -0 "$pid" 2>/dev/null; then
                        # Kill at 2x stale timeout — the process is truly hung
                        kill_threshold=$(( stale_timeout * 2 ))
                        if [[ "$elapsed" -gt "$kill_threshold" ]]; then
                            daemon_log WARN "Killing stale job (legacy): issue #${issue_num} (${elapsed}s > ${kill_threshold}s kill threshold, PID $pid)"
                            emit_event "daemon.stale_kill" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
                            # TERM first, then KILL after a short grace period.
                            kill "$pid" 2>/dev/null || true
                            sleep 2
                            kill -9 "$pid" 2>/dev/null || true
                        else
                            daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — will kill at ${kill_threshold}s"
                            emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
                        fi
                    else
                        daemon_log WARN "Stale job with dead process: issue #${issue_num} (PID $pid no longer exists)"
                        emit_event "daemon.stale_dead" "issue=$issue_num" "pid=$pid"
                    fi
                    findings=$((findings + 1))
                fi
            fi
        done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
    fi

    # Disk space warning for the current directory (< 1 GiB free).
    # `df -P` guarantees one line per filesystem, so column 4 is always the
    # available-KB field; `|| true` keeps a df failure from aborting the caller
    # under errexit + pipefail.
    local free_kb
    free_kb=$(df -Pk "." 2>/dev/null | awk 'NR > 1 { kb = $4 } END { print kb }' || true)
    if [[ "$free_kb" =~ ^[0-9]+$ ]] && (( free_kb < 1048576 )); then
        daemon_log WARN "Low disk space: $(( free_kb / 1024 ))MB free"
        findings=$((findings + 1))
    fi

    # Critical disk space on the daemon directory (< 500 MB) — pause spawning.
    # Uses DAEMON_DIR / PAUSE_FLAG so an overridden daemon directory is honoured.
    local sw_free_kb
    sw_free_kb=$(df -Pk "$daemon_dir" 2>/dev/null | awk 'NR > 1 { kb = $4 } END { print kb }' || true)
    if [[ "$sw_free_kb" =~ ^[0-9]+$ ]] && (( sw_free_kb < 512000 )); then
        daemon_log WARN "Critical disk space on ${daemon_dir}: $(( sw_free_kb / 1024 ))MB — pausing spawns"
        emit_event "daemon.disk_low" "free_mb=$(( sw_free_kb / 1024 ))"
        # The disk is nearly full, so the write itself can fail; report it
        # instead of aborting the health pass.
        if ! { mkdir -p "$(dirname "$pause_flag")" \
            && echo '{"paused":true,"reason":"disk_low"}' > "$pause_flag"; } 2>/dev/null; then
            daemon_log WARN "Could not write pause flag ${pause_flag}"
        fi
        findings=$((findings + 1))
    fi

    # Events file size warning
    if [[ -f "$EVENTS_FILE" ]]; then
        local events_size
        events_size=$(wc -c < "$EVENTS_FILE" 2>/dev/null || echo 0)
        events_size="${events_size//[[:space:]]/}"   # BSD wc pads with spaces
        [[ "$events_size" =~ ^[0-9]+$ ]] || events_size=0
        if [[ "$events_size" -gt 104857600 ]]; then  # 100MB
            daemon_log WARN "Events file large ($(( events_size / 1048576 ))MB) — consider rotating"
            findings=$((findings + 1))
        fi
    fi

    if [[ "$findings" -gt 0 ]]; then
        emit_event "daemon.health" "findings=$findings"
    fi
}
223
+
224
+ # ─── Degradation Alerting ─────────────────────────────────────────────────────
225
+
226
+
227
# Inspect the most recent pipeline completions and raise a degradation alert.
#
# Reads the last 200 lines of $EVENTS_FILE, keeps the events whose .type is
# "pipeline.completed" and looks at the newest DEGRADATION_WINDOW of them
# (default 5). Nothing happens until a full window of completions exists.
# An alert is raised when the share of .result == "failure" exceeds
# DEGRADATION_CFR_THRESHOLD percent (default 30) and/or the share of
# .result == "success" is below DEGRADATION_SUCCESS_THRESHOLD percent
# (default 50).
#
# Lines that are not valid JSON (for example a line still being appended) are
# skipped instead of invalidating the whole read.
#
# Globals read: EVENTS_FILE, DEGRADATION_WINDOW, DEGRADATION_CFR_THRESHOLD,
#   DEGRADATION_SUCCESS_THRESHOLD, SLACK_WEBHOOK.
# External helpers called: daemon_log, emit_event, notify.
# Side effects: logs warnings, emits "daemon.alert", and calls notify when
#   SLACK_WEBHOOK is non-empty.
daemon_check_degradation() {
    if [[ ! -f "$EVENTS_FILE" ]]; then return 0; fi

    local window="${DEGRADATION_WINDOW:-5}"
    local cfr_threshold="${DEGRADATION_CFR_THRESHOLD:-30}"
    local success_threshold="${DEGRADATION_SUCCESS_THRESHOLD:-50}"

    # A window that is not a positive integer cannot be evaluated; treat it as
    # "degradation check disabled" rather than feeding it to jq.
    if ! [[ "$window" =~ ^[1-9][0-9]*$ ]]; then return 0; fi
    [[ "$cfr_threshold" =~ ^[0-9]+$ ]] || cfr_threshold=30
    [[ "$success_threshold" =~ ^[0-9]+$ ]] || success_threshold=50

    # Get last N pipeline completions. Each line is parsed on its own
    # (`fromjson?`), so one malformed line is dropped instead of failing the read.
    local recent
    recent=$(tail -n 200 "$EVENTS_FILE" 2>/dev/null \
        | jq -c -n -R --argjson window "$window" \
            '[inputs | fromjson? | select(type == "object" and .type == "pipeline.completed")] | .[(0 - $window):]' \
            2>/dev/null) || recent="[]"
    [[ -n "$recent" ]] || recent="[]"

    local count
    count=$(jq 'length' <<<"$recent" 2>/dev/null) || count=0
    [[ "$count" =~ ^[0-9]+$ ]] || count=0

    # Not enough history yet for a meaningful rate.
    if (( count < window )); then return 0; fi

    local failures=0 successes=0
    read -r failures successes < <(
        jq -r '[ (map(select(.result == "failure")) | length),
                 (map(select(.result == "success")) | length) ] | @tsv' <<<"$recent" 2>/dev/null
    ) || true
    [[ "$failures" =~ ^[0-9]+$ ]] || failures=0
    [[ "$successes" =~ ^[0-9]+$ ]] || successes=0

    # count >= window >= 1 here, so the division is safe.
    local cfr_pct=$(( failures * 100 / count ))
    local success_pct=$(( successes * 100 / count ))

    local alerts=""
    if (( cfr_pct > cfr_threshold )); then
        alerts="CFR ${cfr_pct}% exceeds threshold ${cfr_threshold}%"
        daemon_log WARN "DEGRADATION: $alerts"
    fi
    if (( success_pct < success_threshold )); then
        local msg="Success rate ${success_pct}% below threshold ${success_threshold}%"
        if [[ -n "$alerts" ]]; then
            alerts="$alerts; $msg"
        else
            alerts="$msg"
        fi
        daemon_log WARN "DEGRADATION: $msg"
    fi

    if [[ -n "$alerts" ]]; then
        emit_event "daemon.alert" "alerts=$alerts" "cfr_pct=$cfr_pct" "success_pct=$success_pct"

        # Slack notification
        if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
            notify "Pipeline Degradation Alert" "$alerts" "warn"
        fi
    fi
}
271
+
272
+ # ─── Auto-Scaling ─────────────────────────────────────────────────────────
273
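+ # Dynamically adjusts MAX_PARALLEL. In this file only the fleet-provided max_parallel override is applied (see daemon_reload_config); scaling on CPU, memory, budget, and queue depth is not implemented here.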
274
+
275
+
276
# Apply a fleet-requested configuration reload.
#
# Does nothing unless $HOME/.shipwright/fleet-reload.flag exists. When it does,
# reads .max_parallel from .claude/.fleet-daemon-config.json (relative to the
# current directory) and, if the value is a positive integer, assigns it to
# both FLEET_MAX_PARALLEL and MAX_PARALLEL. Any other value is ignored with a
# warning so a bad config cannot corrupt MAX_PARALLEL. The reload flag is
# removed in every case so the request is handled once.
#
# Globals read:    HOME, MAX_PARALLEL
# Globals written: MAX_PARALLEL, FLEET_MAX_PARALLEL
# External helpers called: daemon_log, emit_event.
daemon_reload_config() {
    local reload_flag="$HOME/.shipwright/fleet-reload.flag"
    if [[ ! -f "$reload_flag" ]]; then
        return 0
    fi

    local fleet_config=".claude/.fleet-daemon-config.json"
    if [[ -f "$fleet_config" ]]; then
        local new_max
        new_max=$(jq -r '.max_parallel // empty' "$fleet_config" 2>/dev/null || true)
        if [[ "$new_max" =~ ^[1-9][0-9]*$ ]]; then
            local prev="${MAX_PARALLEL:-}"
            FLEET_MAX_PARALLEL="$new_max"
            MAX_PARALLEL="$new_max"
            daemon_log INFO "Fleet reload: max_parallel ${prev} → ${MAX_PARALLEL} (fleet ceiling: ${FLEET_MAX_PARALLEL})"
            emit_event "daemon.fleet_reload" "from=$prev" "to=$MAX_PARALLEL"
        elif [[ -n "$new_max" && "$new_max" != "null" ]]; then
            # Present but unusable (string, zero, negative, fractional).
            daemon_log WARN "Fleet reload: ignoring invalid max_parallel '${new_max}' (expected a positive integer)"
        fi
    fi

    rm -f -- "$reload_flag"
}
297
+
298
+ # ─── Self-Optimizing Metrics Loop ──────────────────────────────────────────
299
+