shipwright-cli 3.1.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/code-reviewer.md +2 -0
- package/.claude/agents/devops-engineer.md +2 -0
- package/.claude/agents/doc-fleet-agent.md +2 -0
- package/.claude/agents/pipeline-agent.md +2 -0
- package/.claude/agents/shell-script-specialist.md +2 -0
- package/.claude/agents/test-specialist.md +2 -0
- package/.claude/hooks/agent-crash-capture.sh +32 -0
- package/.claude/hooks/post-tool-use.sh +3 -2
- package/.claude/hooks/pre-tool-use.sh +35 -3
- package/README.md +22 -8
- package/claude-code/hooks/config-change.sh +18 -0
- package/claude-code/hooks/instructions-reloaded.sh +7 -0
- package/claude-code/hooks/worktree-create.sh +25 -0
- package/claude-code/hooks/worktree-remove.sh +20 -0
- package/config/code-constitution.json +130 -0
- package/config/defaults.json +25 -2
- package/config/policy.json +1 -1
- package/dashboard/middleware/auth.ts +134 -0
- package/dashboard/middleware/constants.ts +21 -0
- package/dashboard/public/index.html +8 -6
- package/dashboard/public/styles.css +176 -97
- package/dashboard/routes/auth.ts +38 -0
- package/dashboard/server.ts +117 -25
- package/dashboard/services/config.ts +26 -0
- package/dashboard/services/db.ts +118 -0
- package/dashboard/src/canvas/pixel-agent.ts +298 -0
- package/dashboard/src/canvas/pixel-sprites.ts +440 -0
- package/dashboard/src/canvas/shipyard-effects.ts +367 -0
- package/dashboard/src/canvas/shipyard-scene.ts +616 -0
- package/dashboard/src/canvas/submarine-layout.ts +267 -0
- package/dashboard/src/components/header.ts +8 -7
- package/dashboard/src/core/api.ts +5 -0
- package/dashboard/src/core/router.ts +1 -0
- package/dashboard/src/design/submarine-theme.ts +253 -0
- package/dashboard/src/main.ts +2 -0
- package/dashboard/src/types/api.ts +12 -1
- package/dashboard/src/views/activity.ts +2 -1
- package/dashboard/src/views/metrics.ts +69 -1
- package/dashboard/src/views/shipyard.ts +39 -0
- package/dashboard/types/index.ts +166 -0
- package/docs/plans/2026-02-28-compound-audit-and-shipyard-design.md +186 -0
- package/docs/plans/2026-02-28-skipper-shipwright-implementation-plan.md +1182 -0
- package/docs/plans/2026-02-28-skipper-shipwright-integration-design.md +531 -0
- package/docs/plans/2026-03-01-ai-powered-skill-injection-design.md +298 -0
- package/docs/plans/2026-03-01-ai-powered-skill-injection-plan.md +1109 -0
- package/docs/plans/2026-03-01-capabilities-cleanup-plan.md +658 -0
- package/docs/plans/2026-03-01-clean-architecture-plan.md +924 -0
- package/docs/plans/2026-03-01-compound-audit-cascade-design.md +191 -0
- package/docs/plans/2026-03-01-compound-audit-cascade-plan.md +921 -0
- package/docs/plans/2026-03-01-deep-integration-plan.md +851 -0
- package/docs/plans/2026-03-01-pipeline-audit-trail-design.md +145 -0
- package/docs/plans/2026-03-01-pipeline-audit-trail-plan.md +770 -0
- package/docs/plans/2026-03-01-refined-depths-brand-design.md +382 -0
- package/docs/plans/2026-03-01-refined-depths-implementation.md +599 -0
- package/docs/plans/2026-03-01-skipper-kernel-integration-design.md +203 -0
- package/docs/plans/2026-03-01-unified-platform-design.md +272 -0
- package/docs/plans/2026-03-07-claude-code-feature-integration-design.md +189 -0
- package/docs/plans/2026-03-07-claude-code-feature-integration-plan.md +1165 -0
- package/docs/research/BACKLOG_QUICK_REFERENCE.md +352 -0
- package/docs/research/CUTTING_EDGE_RESEARCH_2026.md +546 -0
- package/docs/research/RESEARCH_INDEX.md +439 -0
- package/docs/research/RESEARCH_SOURCES.md +440 -0
- package/docs/research/RESEARCH_SUMMARY.txt +275 -0
- package/docs/superpowers/specs/2026-03-10-pipeline-quality-revolution-design.md +341 -0
- package/package.json +2 -2
- package/scripts/lib/adaptive-model.sh +427 -0
- package/scripts/lib/adaptive-timeout.sh +316 -0
- package/scripts/lib/audit-trail.sh +309 -0
- package/scripts/lib/auto-recovery.sh +471 -0
- package/scripts/lib/bandit-selector.sh +431 -0
- package/scripts/lib/bootstrap.sh +104 -2
- package/scripts/lib/causal-graph.sh +455 -0
- package/scripts/lib/compat.sh +126 -0
- package/scripts/lib/compound-audit.sh +337 -0
- package/scripts/lib/constitutional.sh +454 -0
- package/scripts/lib/context-budget.sh +359 -0
- package/scripts/lib/convergence.sh +594 -0
- package/scripts/lib/cost-optimizer.sh +634 -0
- package/scripts/lib/daemon-adaptive.sh +14 -2
- package/scripts/lib/daemon-dispatch.sh +106 -17
- package/scripts/lib/daemon-failure.sh +34 -4
- package/scripts/lib/daemon-patrol.sh +25 -4
- package/scripts/lib/daemon-poll-github.sh +361 -0
- package/scripts/lib/daemon-poll-health.sh +299 -0
- package/scripts/lib/daemon-poll.sh +27 -611
- package/scripts/lib/daemon-state.sh +119 -66
- package/scripts/lib/daemon-triage.sh +10 -0
- package/scripts/lib/dod-scorecard.sh +442 -0
- package/scripts/lib/error-actionability.sh +300 -0
- package/scripts/lib/formal-spec.sh +461 -0
- package/scripts/lib/helpers.sh +180 -5
- package/scripts/lib/intent-analysis.sh +409 -0
- package/scripts/lib/loop-convergence.sh +350 -0
- package/scripts/lib/loop-iteration.sh +682 -0
- package/scripts/lib/loop-progress.sh +48 -0
- package/scripts/lib/loop-restart.sh +185 -0
- package/scripts/lib/memory-effectiveness.sh +506 -0
- package/scripts/lib/mutation-executor.sh +352 -0
- package/scripts/lib/outcome-feedback.sh +521 -0
- package/scripts/lib/pipeline-cli.sh +336 -0
- package/scripts/lib/pipeline-commands.sh +1216 -0
- package/scripts/lib/pipeline-detection.sh +101 -3
- package/scripts/lib/pipeline-execution.sh +897 -0
- package/scripts/lib/pipeline-github.sh +28 -3
- package/scripts/lib/pipeline-intelligence-compound.sh +431 -0
- package/scripts/lib/pipeline-intelligence-scoring.sh +407 -0
- package/scripts/lib/pipeline-intelligence-skip.sh +181 -0
- package/scripts/lib/pipeline-intelligence.sh +104 -1138
- package/scripts/lib/pipeline-quality-bash-compat.sh +182 -0
- package/scripts/lib/pipeline-quality-checks.sh +17 -711
- package/scripts/lib/pipeline-quality-gates.sh +563 -0
- package/scripts/lib/pipeline-stages-build.sh +730 -0
- package/scripts/lib/pipeline-stages-delivery.sh +965 -0
- package/scripts/lib/pipeline-stages-intake.sh +1133 -0
- package/scripts/lib/pipeline-stages-monitor.sh +407 -0
- package/scripts/lib/pipeline-stages-review.sh +1022 -0
- package/scripts/lib/pipeline-stages.sh +161 -2901
- package/scripts/lib/pipeline-state.sh +36 -5
- package/scripts/lib/pipeline-util.sh +487 -0
- package/scripts/lib/policy-learner.sh +438 -0
- package/scripts/lib/process-reward.sh +493 -0
- package/scripts/lib/project-detect.sh +649 -0
- package/scripts/lib/quality-profile.sh +334 -0
- package/scripts/lib/recruit-commands.sh +885 -0
- package/scripts/lib/recruit-learning.sh +739 -0
- package/scripts/lib/recruit-roles.sh +648 -0
- package/scripts/lib/reward-aggregator.sh +458 -0
- package/scripts/lib/rl-optimizer.sh +362 -0
- package/scripts/lib/root-cause.sh +427 -0
- package/scripts/lib/scope-enforcement.sh +445 -0
- package/scripts/lib/session-restart.sh +493 -0
- package/scripts/lib/skill-memory.sh +300 -0
- package/scripts/lib/skill-registry.sh +775 -0
- package/scripts/lib/spec-driven.sh +476 -0
- package/scripts/lib/test-helpers.sh +18 -7
- package/scripts/lib/test-holdout.sh +429 -0
- package/scripts/lib/test-optimizer.sh +511 -0
- package/scripts/shipwright-file-suggest.sh +45 -0
- package/scripts/skills/adversarial-quality.md +61 -0
- package/scripts/skills/api-design.md +44 -0
- package/scripts/skills/architecture-design.md +50 -0
- package/scripts/skills/brainstorming.md +43 -0
- package/scripts/skills/data-pipeline.md +44 -0
- package/scripts/skills/deploy-safety.md +64 -0
- package/scripts/skills/documentation.md +38 -0
- package/scripts/skills/frontend-design.md +45 -0
- package/scripts/skills/generated/.gitkeep +0 -0
- package/scripts/skills/generated/_refinements/.gitkeep +0 -0
- package/scripts/skills/generated/_refinements/adversarial-quality.patch.md +3 -0
- package/scripts/skills/generated/_refinements/architecture-design.patch.md +3 -0
- package/scripts/skills/generated/_refinements/brainstorming.patch.md +3 -0
- package/scripts/skills/generated/cli-version-management.md +29 -0
- package/scripts/skills/generated/collection-system-validation.md +99 -0
- package/scripts/skills/generated/large-scale-c-refactoring-coordination.md +97 -0
- package/scripts/skills/generated/pattern-matching-similarity-scoring.md +195 -0
- package/scripts/skills/generated/test-parallelization-detection.md +65 -0
- package/scripts/skills/observability.md +79 -0
- package/scripts/skills/performance.md +48 -0
- package/scripts/skills/pr-quality.md +49 -0
- package/scripts/skills/product-thinking.md +43 -0
- package/scripts/skills/security-audit.md +49 -0
- package/scripts/skills/systematic-debugging.md +40 -0
- package/scripts/skills/testing-strategy.md +47 -0
- package/scripts/skills/two-stage-review.md +52 -0
- package/scripts/skills/validation-thoroughness.md +55 -0
- package/scripts/sw +9 -3
- package/scripts/sw-activity.sh +9 -8
- package/scripts/sw-adaptive.sh +8 -7
- package/scripts/sw-adversarial.sh +2 -1
- package/scripts/sw-architecture-enforcer.sh +3 -1
- package/scripts/sw-auth.sh +12 -2
- package/scripts/sw-autonomous.sh +5 -1
- package/scripts/sw-changelog.sh +4 -1
- package/scripts/sw-checkpoint.sh +2 -1
- package/scripts/sw-ci.sh +15 -6
- package/scripts/sw-cleanup.sh +4 -26
- package/scripts/sw-code-review.sh +45 -20
- package/scripts/sw-connect.sh +2 -1
- package/scripts/sw-context.sh +2 -1
- package/scripts/sw-cost.sh +107 -5
- package/scripts/sw-daemon.sh +71 -11
- package/scripts/sw-dashboard.sh +3 -1
- package/scripts/sw-db.sh +71 -20
- package/scripts/sw-decide.sh +8 -2
- package/scripts/sw-decompose.sh +360 -17
- package/scripts/sw-deps.sh +4 -1
- package/scripts/sw-developer-simulation.sh +4 -1
- package/scripts/sw-discovery.sh +378 -5
- package/scripts/sw-doc-fleet.sh +4 -1
- package/scripts/sw-docs-agent.sh +3 -1
- package/scripts/sw-docs.sh +2 -1
- package/scripts/sw-doctor.sh +453 -2
- package/scripts/sw-dora.sh +4 -1
- package/scripts/sw-durable.sh +12 -7
- package/scripts/sw-e2e-orchestrator.sh +17 -16
- package/scripts/sw-eventbus.sh +13 -4
- package/scripts/sw-evidence.sh +364 -12
- package/scripts/sw-feedback.sh +550 -9
- package/scripts/sw-fix.sh +20 -1
- package/scripts/sw-fleet-discover.sh +6 -2
- package/scripts/sw-fleet-viz.sh +9 -4
- package/scripts/sw-fleet.sh +5 -1
- package/scripts/sw-github-app.sh +18 -4
- package/scripts/sw-github-checks.sh +3 -2
- package/scripts/sw-github-deploy.sh +3 -2
- package/scripts/sw-github-graphql.sh +18 -7
- package/scripts/sw-guild.sh +5 -1
- package/scripts/sw-heartbeat.sh +5 -30
- package/scripts/sw-hello.sh +67 -0
- package/scripts/sw-hygiene.sh +10 -3
- package/scripts/sw-incident.sh +273 -5
- package/scripts/sw-init.sh +18 -2
- package/scripts/sw-instrument.sh +10 -2
- package/scripts/sw-intelligence.sh +44 -7
- package/scripts/sw-jira.sh +5 -1
- package/scripts/sw-launchd.sh +2 -1
- package/scripts/sw-linear.sh +4 -1
- package/scripts/sw-logs.sh +4 -1
- package/scripts/sw-loop.sh +436 -1076
- package/scripts/sw-memory.sh +357 -3
- package/scripts/sw-mission-control.sh +6 -1
- package/scripts/sw-model-router.sh +483 -27
- package/scripts/sw-otel.sh +15 -4
- package/scripts/sw-oversight.sh +14 -5
- package/scripts/sw-patrol-meta.sh +334 -0
- package/scripts/sw-pipeline-composer.sh +7 -1
- package/scripts/sw-pipeline-vitals.sh +12 -6
- package/scripts/sw-pipeline.sh +54 -2653
- package/scripts/sw-pm.sh +16 -8
- package/scripts/sw-pr-lifecycle.sh +2 -1
- package/scripts/sw-predictive.sh +17 -5
- package/scripts/sw-prep.sh +185 -2
- package/scripts/sw-ps.sh +5 -25
- package/scripts/sw-public-dashboard.sh +17 -4
- package/scripts/sw-quality.sh +14 -6
- package/scripts/sw-reaper.sh +8 -25
- package/scripts/sw-recruit.sh +156 -2303
- package/scripts/sw-regression.sh +19 -12
- package/scripts/sw-release-manager.sh +3 -1
- package/scripts/sw-release.sh +4 -1
- package/scripts/sw-remote.sh +3 -1
- package/scripts/sw-replay.sh +7 -1
- package/scripts/sw-retro.sh +158 -1
- package/scripts/sw-review-rerun.sh +3 -1
- package/scripts/sw-scale.sh +14 -5
- package/scripts/sw-security-audit.sh +6 -1
- package/scripts/sw-self-optimize.sh +173 -6
- package/scripts/sw-session.sh +9 -3
- package/scripts/sw-setup.sh +3 -1
- package/scripts/sw-stall-detector.sh +406 -0
- package/scripts/sw-standup.sh +15 -7
- package/scripts/sw-status.sh +3 -1
- package/scripts/sw-strategic.sh +14 -6
- package/scripts/sw-stream.sh +13 -4
- package/scripts/sw-swarm.sh +20 -7
- package/scripts/sw-team-stages.sh +13 -6
- package/scripts/sw-templates.sh +7 -31
- package/scripts/sw-testgen.sh +17 -6
- package/scripts/sw-tmux-pipeline.sh +4 -1
- package/scripts/sw-tmux-role-color.sh +2 -0
- package/scripts/sw-tmux-status.sh +1 -1
- package/scripts/sw-tmux.sh +37 -1
- package/scripts/sw-trace.sh +3 -1
- package/scripts/sw-tracker-github.sh +3 -0
- package/scripts/sw-tracker-jira.sh +3 -0
- package/scripts/sw-tracker-linear.sh +3 -0
- package/scripts/sw-tracker.sh +3 -1
- package/scripts/sw-triage.sh +3 -2
- package/scripts/sw-upgrade.sh +3 -1
- package/scripts/sw-ux.sh +5 -2
- package/scripts/sw-webhook.sh +5 -2
- package/scripts/sw-widgets.sh +9 -4
- package/scripts/sw-worktree.sh +15 -3
- package/scripts/test-skill-injection.sh +1233 -0
- package/templates/pipelines/autonomous.json +27 -3
- package/templates/pipelines/cost-aware.json +34 -8
- package/templates/pipelines/deployed.json +12 -0
- package/templates/pipelines/enterprise.json +12 -0
- package/templates/pipelines/fast.json +6 -0
- package/templates/pipelines/full.json +27 -3
- package/templates/pipelines/hotfix.json +6 -0
- package/templates/pipelines/standard.json +12 -0
- package/templates/pipelines/tdd.json +12 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# daemon-poll-health.sh — Health checks and degradation detection for daemon-poll.sh
# Meant to be sourced by daemon-poll.sh; depends on the daemon state and helper libraries.

# Include guard: sourcing this file a second time is a no-op.
if [[ -n "${_DAEMON_POLL_HEALTH_LOADED:-}" ]]; then
  return 0
fi
_DAEMON_POLL_HEALTH_LOADED=1

# Fallbacks for settings that sw-daemon.sh normally provides, so this file can be
# sourced on its own under `set -u`. A value that is already set and non-empty wins.
: "${DAEMON_DIR:=${HOME}/.shipwright}"
: "${STATE_FILE:=${DAEMON_DIR}/daemon-state.json}"
: "${PAUSE_FLAG:=${DAEMON_DIR}/daemon-pause.flag}"
: "${SHUTDOWN_FLAG:=${DAEMON_DIR}/daemon.shutdown}"
: "${EVENTS_FILE:=${DAEMON_DIR}/events.jsonl}"

# Resolved to the parent of this file's directory. Kept as a plain assignment so a
# failing `cd` still yields a non-zero status for the statement.
if [[ -z "${SCRIPT_DIR:-}" ]]; then
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
fi

: "${NO_GITHUB:=false}"
: "${POLL_INTERVAL:=60}"
: "${MAX_PARALLEL:=4}"
: "${WATCH_LABEL:=shipwright}"
: "${WATCH_MODE:=repo}"
: "${PIPELINE_TEMPLATE:=autonomous}"
: "${ISSUE_LIMIT:=100}"
: "${SLACK_WEBHOOK:=}"
: "${BACKOFF_SECS:=0}"
: "${POLL_CYCLE_COUNT:=0}"
|
|
23
|
+
|
|
24
|
+
#######################################
# Sum the %CPU of the direct children of a process.
# Arguments: $1 - parent PID
# Outputs:   one non-negative integer on stdout ("0" when there are no
#            children or the lookup fails)
# Returns:   0 always
#######################################
_daemon_child_cpu_pct() {
  local parent_pid="$1"
  local total=""
  # pgrep exits non-zero when the PID has no children. awk's END block still
  # prints a number, so the pipeline status is ignored rather than used to
  # append a fallback digit to the output.
  total=$(pgrep -P "$parent_pid" 2>/dev/null \
    | xargs -I{} ps -o pcpu= -p {} 2>/dev/null \
    | awk '{sum+=$1} END{printf "%d", sum+0}') || true
  [[ "$total" =~ ^[0-9]+$ ]] || total=0
  printf '%s\n' "$total"
}

#######################################
# Write the nudge note into <worktree>/.claude/nudge.md.
# Arguments: $1 - worktree path
#            $2 - number of consecutive no-progress checks
#            $3 - current stage name
# Returns:   0 if the file was written, non-zero otherwise
#######################################
_daemon_write_nudge() {
  local worktree="$1" no_progress_count="$2" cur_stage="$3"
  local nudge_dir="${worktree}/.claude"

  # Never create a worktree that no longer exists; only the .claude subdir.
  [[ -d "$worktree" ]] || return 1
  mkdir -p "$nudge_dir" 2>/dev/null || return 1

  # NOTE(review): the minutes figure assumes one check every 30 seconds —
  # confirm against the actual health-check cadence.
  cat > "${nudge_dir}/nudge.md" <<NUDGE_EOF
# Nudge from Daemon Health Monitor

The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
Current stage: ${cur_stage}

If you're stuck, consider:
- Breaking the task into smaller steps
- Committing partial progress
- Running tests to validate current state

This is just a gentle check-in — take your time if you're working through a complex problem.
NUDGE_EOF
}

#######################################
# Inspect every active job in STATE_FILE plus local disk/event-log health.
#
# Per live job: enforce the optional hard wall-clock limit, then either use
# progress sensing (snapshot + verdict: healthy/slowing/stalled/stuck) or,
# when progress monitoring is off or the job has no worktree, the legacy
# time-based stale detection. Afterwards: warn on low disk space, write the
# pause flag when the daemon directory is critically low, and warn when the
# events file exceeds 100MB.
#
# Globals read:  STATE_FILE, EVENTS_FILE, DAEMON_DIR, PAUSE_FLAG, HOME,
#                PIPELINE_TEMPLATE, PROGRESS_DIR, PROGRESS_CHECKS_BEFORE_KILL,
#                PROGRESS_HARD_LIMIT_S (default 0 = off),
#                PROGRESS_MONITORING (default true),
#                NUDGE_ENABLED (default true), NUDGE_AFTER_CHECKS (default 40)
# Calls (defined outside this block): now_epoch, daemon_log, emit_event,
#                daemon_clear_progress, daemon_collect_snapshot,
#                daemon_assess_progress, get_adaptive_stale_timeout
# Outputs:       log lines and events via the helpers above; may write a
#                nudge file into a worktree and the pause flag file
#######################################
daemon_health_check() {
  local findings=0
  local now_e
  now_e=$(now_epoch)

  if [[ -f "$STATE_FILE" ]]; then
    # ── Intelligent Health Monitoring ──
    # Instead of killing after a countdown, sense what the agent is doing.
    # Agents think for long stretches — that's normal and expected.
    # Strategy: sense → understand → be patient → nudge → only kill as last resort.

    local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
    local use_progress="${PROGRESS_MONITORING:-true}"
    local nudge_enabled="${NUDGE_ENABLED:-true}"
    local nudge_after="${NUDGE_AFTER_CHECKS:-40}"

    local job
    while IFS= read -r job; do
      local pid started_at issue_num worktree
      pid=$(jq -r '.pid' <<<"$job")
      started_at=$(jq -r '.started_at // empty' <<<"$job")
      issue_num=$(jq -r '.issue' <<<"$job")
      worktree=$(jq -r '.worktree // ""' <<<"$job")

      # Skip dead processes
      if ! kill -0 "$pid" 2>/dev/null; then
        continue
      fi

      local elapsed=0
      if [[ -n "$started_at" ]]; then
        local start_e
        # BSD date first, GNU date second; "0" marks "could not parse".
        start_e=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null \
          || date -d "$started_at" +%s 2>/dev/null \
          || echo "0")
        # An unparseable timestamp must not be read as "started at the epoch":
        # that would make elapsed ≈ now and get a healthy job killed by the
        # hard limit or the legacy stale timeout. Leave elapsed at 0 instead.
        if [[ "$start_e" =~ ^[1-9][0-9]*$ ]]; then
          elapsed=$(( now_e - start_e ))
        fi
      fi

      # Hard wall-clock limit — disabled by default (0 = off)
      if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
        daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
        emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
        kill "$pid" 2>/dev/null || true
        daemon_clear_progress "$issue_num"
        findings=$((findings + 1))
        continue
      fi

      # ── Intelligent Progress Sensing ──
      if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
        local snapshot verdict
        snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')

        if [[ "$snapshot" != "{}" ]]; then
          verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")

          local no_progress_count=0
          no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
          local cur_stage
          cur_stage=$(jq -r '.stage // "unknown"' <<<"$snapshot")

          local agent_alive child_cpu
          case "$verdict" in
            healthy)
              # All good — agent is making progress
              ;;
            slowing)
              daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
              ;;
            stalled)
              # Check if agent subprocess is alive and consuming CPU
              agent_alive=false
              child_cpu=$(_daemon_child_cpu_pct "$pid")
              if [[ "$child_cpu" -gt 0 ]]; then
                agent_alive=true
              fi

              if [[ "$agent_alive" == "true" ]]; then
                daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
              else
                daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
                emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
              fi
              ;;
            stuck)
              local repeated_errors
              repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)

              # Even "stuck" — check if the process tree is alive first
              agent_alive=false
              child_cpu=$(_daemon_child_cpu_pct "$pid")
              if [[ "$child_cpu" -gt 0 ]]; then
                agent_alive=true
              fi

              if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
                # Agent is alive — nudge instead of kill
                if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
                  local nudge_file="${worktree}/.claude/nudge.md"
                  # An existing nudge file is left untouched (one nudge at a time).
                  if [[ ! -f "$nudge_file" ]]; then
                    # Only report the nudge when the file really was written.
                    if _daemon_write_nudge "$worktree" "$no_progress_count" "$cur_stage"; then
                      daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
                      emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
                    else
                      daemon_log WARN "Issue #${issue_num} nudge could not be written to ${nudge_file}"
                    fi
                  fi
                else
                  daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
                fi
              elif [[ "$repeated_errors" -ge 5 ]]; then
                # Truly stuck in an error loop — kill as last resort
                daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
                kill "$pid" 2>/dev/null || true
                daemon_clear_progress "$issue_num"
                findings=$((findings + 1))
              elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
                # Process tree is dead AND no progress for very long time
                daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
                emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
                kill "$pid" 2>/dev/null || true
                daemon_clear_progress "$issue_num"
                findings=$((findings + 1))
              else
                daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
              fi
              ;;
          esac
        fi
      else
        # Fallback: legacy time-based detection when progress monitoring is off
        local stale_timeout
        stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
        if [[ "$elapsed" -gt "$stale_timeout" ]]; then
          # Check if process is still alive
          if kill -0 "$pid" 2>/dev/null; then
            # Kill at 2x stale timeout — the process is truly hung
            local kill_threshold=$(( stale_timeout * 2 ))
            if [[ "$elapsed" -gt "$kill_threshold" ]]; then
              daemon_log WARN "Killing stale job (legacy): issue #${issue_num} (${elapsed}s > ${kill_threshold}s kill threshold, PID $pid)"
              emit_event "daemon.stale_kill" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
              # TERM first, then KILL after a short grace period.
              kill "$pid" 2>/dev/null || true
              sleep 2
              kill -9 "$pid" 2>/dev/null || true
            else
              daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — will kill at ${kill_threshold}s"
              emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
            fi
          else
            daemon_log WARN "Stale job with dead process: issue #${issue_num} (PID $pid no longer exists)"
            emit_event "daemon.stale_dead" "issue=$issue_num" "pid=$pid"
          fi
          findings=$((findings + 1))
        fi
      fi
    done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
  fi

  # Disk space warning for the current directory (below 1GB free).
  # `df -P` keeps each filesystem on one line so column 4 is always "Available".
  local free_kb
  free_kb=$(df -Pk "." 2>/dev/null | tail -1 | awk '{print $4}') || free_kb=""
  if [[ "$free_kb" =~ ^[0-9]+$ ]] && (( free_kb < 1048576 )); then
    daemon_log WARN "Low disk space: $(( free_kb / 1024 ))MB free"
    findings=$((findings + 1))
  fi

  # Critical disk space on the daemon directory — pause spawning.
  # Uses DAEMON_DIR / PAUSE_FLAG (same defaults as the old hard-coded
  # ~/.shipwright paths) so overrides are honoured.
  local sw_dir="${DAEMON_DIR:-$HOME/.shipwright}"
  local pause_flag="${PAUSE_FLAG:-${sw_dir}/daemon-pause.flag}"
  local sw_free_kb
  sw_free_kb=$(df -Pk "$sw_dir" 2>/dev/null | tail -1 | awk '{print $4}') || sw_free_kb=""
  if [[ "$sw_free_kb" =~ ^[0-9]+$ ]] && (( sw_free_kb < 512000 )); then
    daemon_log WARN "Critical disk space on ~/.shipwright: $(( sw_free_kb / 1024 ))MB — pausing spawns"
    emit_event "daemon.disk_low" "free_mb=$(( sw_free_kb / 1024 ))"
    mkdir -p "$sw_dir"
    echo '{"paused":true,"reason":"disk_low"}' > "$pause_flag"
    findings=$((findings + 1))
  fi

  # Events file size warning
  if [[ -f "$EVENTS_FILE" ]]; then
    local events_size
    events_size=$(wc -c < "$EVENTS_FILE" 2>/dev/null || echo 0)
    if [[ "$events_size" -gt 104857600 ]]; then  # 100MB
      daemon_log WARN "Events file large ($(( events_size / 1048576 ))MB) — consider rotating"
      findings=$((findings + 1))
    fi
  fi

  if [[ "$findings" -gt 0 ]]; then
    emit_event "daemon.health" "findings=$findings"
  fi
}
|
|
223
|
+
|
|
224
|
+
# ─── Degradation Alerting ─────────────────────────────────────────────────────
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
#######################################
# Alert when recent pipeline outcomes degrade.
#
# Looks at the `pipeline.completed` events found in the last 200 lines of
# EVENTS_FILE, keeps the most recent DEGRADATION_WINDOW of them, and computes
# the failure percentage ("CFR") and success percentage over that window.
# Nothing happens until a full window of completions is available.
#
# Globals read:  EVENTS_FILE, SLACK_WEBHOOK,
#                DEGRADATION_WINDOW (default 5),
#                DEGRADATION_CFR_THRESHOLD (default 30, percent),
#                DEGRADATION_SUCCESS_THRESHOLD (default 50, percent)
# Calls (defined outside this block): daemon_log, emit_event, notify
# Outputs:       WARN log lines and a "daemon.alert" event when a threshold
#                is crossed; a notify call when SLACK_WEBHOOK is non-empty
#######################################
daemon_check_degradation() {
  if [[ ! -f "$EVENTS_FILE" ]]; then return; fi

  local window="${DEGRADATION_WINDOW:-5}"
  local cfr_threshold="${DEGRADATION_CFR_THRESHOLD:-30}"
  local success_threshold="${DEGRADATION_SUCCESS_THRESHOLD:-50}"

  # The window is passed to jq as a number; anything else cannot be evaluated.
  if [[ ! "$window" =~ ^[1-9][0-9]*$ ]]; then return 0; fi

  # Parse line by line (`fromjson?`) so one malformed or half-written line
  # cannot hide every other event, then reduce the last N completions to
  # "count<TAB>failures<TAB>successes" in a single pass.
  local stats=""
  stats=$(tail -200 "$EVENTS_FILE" 2>/dev/null \
    | jq -R 'fromjson? | objects | select(.type == "pipeline.completed")' 2>/dev/null \
    | jq -rs --argjson n "$window" '
        .[(0 - $n):]
        | [ length,
            (map(select(.result == "failure")) | length),
            (map(select(.result == "success")) | length) ]
        | @tsv' 2>/dev/null) || true

  local count="" failures="" successes=""
  IFS=$'\t' read -r count failures successes <<<"$stats" || true

  # jq missing or failed → no usable numbers → nothing to assess.
  if [[ ! "$count" =~ ^[0-9]+$ || ! "$failures" =~ ^[0-9]+$ || ! "$successes" =~ ^[0-9]+$ ]]; then
    return 0
  fi

  # Wait for a full window before judging.
  if (( count < window )); then return 0; fi

  # count >= window >= 1 here, so the divisions are safe.
  local cfr_pct=$(( failures * 100 / count ))
  local success_pct=$(( successes * 100 / count ))

  local alerts=""
  if [[ "$cfr_pct" -gt "$cfr_threshold" ]]; then
    alerts="CFR ${cfr_pct}% exceeds threshold ${cfr_threshold}%"
    daemon_log WARN "DEGRADATION: $alerts"
  fi
  if [[ "$success_pct" -lt "$success_threshold" ]]; then
    local msg="Success rate ${success_pct}% below threshold ${success_threshold}%"
    if [[ -n "$alerts" ]]; then
      alerts="$alerts; $msg"
    else
      alerts="$msg"
    fi
    daemon_log WARN "DEGRADATION: $msg"
  fi

  if [[ -n "$alerts" ]]; then
    emit_event "daemon.alert" "alerts=$alerts" "cfr_pct=$cfr_pct" "success_pct=$success_pct"

    # Slack notification
    if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
      notify "Pipeline Degradation Alert" "$alerts" "warn"
    fi
  fi
}
|
|
271
|
+
|
|
272
|
+
# ─── Auto-Scaling ─────────────────────────────────────────────────────────
|
|
273
|
+
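# Dynamically adjusts MAX_PARALLEL. NOTE: the only logic in this section is daemon_reload_config, which applies a fleet-provided max_parallel override; no CPU, memory, budget, or queue-depth scaling is implemented in this file.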
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
#######################################
# Apply a fleet-provided max_parallel override, if one has been requested.
#
# Does nothing unless $HOME/.shipwright/fleet-reload.flag exists. When it
# does, reads `.max_parallel` from .claude/.fleet-daemon-config.json (path is
# relative to the current working directory) and, if it is a positive
# integer, stores it in both MAX_PARALLEL and FLEET_MAX_PARALLEL. The flag
# file is removed afterwards whether or not a value was applied.
#
# Globals read:    HOME, MAX_PARALLEL
# Globals written: MAX_PARALLEL, FLEET_MAX_PARALLEL
# Calls (defined outside this block): daemon_log, emit_event
#######################################
daemon_reload_config() {
  local reload_flag="$HOME/.shipwright/fleet-reload.flag"
  if [[ ! -f "$reload_flag" ]]; then
    return
  fi

  local fleet_config=".claude/.fleet-daemon-config.json"
  if [[ -f "$fleet_config" ]]; then
    local new_max
    # `// empty` already maps a missing or null key to an empty string.
    new_max=$(jq -r '.max_parallel // empty' "$fleet_config" 2>/dev/null || true)
    if [[ "$new_max" =~ ^[1-9][0-9]*$ ]]; then
      local prev="$MAX_PARALLEL"
      FLEET_MAX_PARALLEL="$new_max"
      MAX_PARALLEL="$new_max"
      daemon_log INFO "Fleet reload: max_parallel ${prev} → ${MAX_PARALLEL} (fleet ceiling: ${FLEET_MAX_PARALLEL})"
      emit_event "daemon.fleet_reload" "from=$prev" "to=$MAX_PARALLEL"
    elif [[ -n "$new_max" ]]; then
      # A non-numeric, zero or negative value would poison later arithmetic
      # on MAX_PARALLEL; keep the current setting instead.
      daemon_log WARN "Fleet reload: ignoring invalid max_parallel '${new_max}' in ${fleet_config} (keeping ${MAX_PARALLEL})"
    fi
  fi

  rm -f "$reload_flag"
}
|
|
297
|
+
|
|
298
|
+
# ─── Self-Optimizing Metrics Loop ──────────────────────────────────────────
|
|
299
|
+
|