shipwright-cli 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -16
- package/config/policy.schema.json +104 -29
- package/docs/AGI-PLATFORM-PLAN.md +11 -7
- package/docs/AGI-WHATS-NEXT.md +26 -20
- package/docs/README.md +2 -0
- package/package.json +1 -1
- package/scripts/check-version-consistency.sh +72 -0
- package/scripts/lib/daemon-adaptive.sh +610 -0
- package/scripts/lib/daemon-dispatch.sh +489 -0
- package/scripts/lib/daemon-failure.sh +387 -0
- package/scripts/lib/daemon-patrol.sh +1113 -0
- package/scripts/lib/daemon-poll.sh +1202 -0
- package/scripts/lib/daemon-state.sh +550 -0
- package/scripts/lib/daemon-triage.sh +490 -0
- package/scripts/lib/helpers.sh +81 -1
- package/scripts/lib/pipeline-detection.sh +278 -0
- package/scripts/lib/pipeline-github.sh +196 -0
- package/scripts/lib/pipeline-intelligence.sh +1706 -0
- package/scripts/lib/pipeline-quality-checks.sh +1054 -0
- package/scripts/lib/pipeline-quality.sh +11 -0
- package/scripts/lib/pipeline-stages.sh +2508 -0
- package/scripts/lib/pipeline-state.sh +529 -0
- package/scripts/sw +26 -4
- package/scripts/sw-activity.sh +1 -1
- package/scripts/sw-adaptive.sh +2 -2
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-auth.sh +1 -1
- package/scripts/sw-autonomous.sh +1 -1
- package/scripts/sw-changelog.sh +1 -1
- package/scripts/sw-checkpoint.sh +1 -1
- package/scripts/sw-ci.sh +1 -1
- package/scripts/sw-cleanup.sh +1 -1
- package/scripts/sw-code-review.sh +1 -1
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-context.sh +1 -1
- package/scripts/sw-cost.sh +1 -1
- package/scripts/sw-daemon.sh +52 -4816
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-db.sh +1 -1
- package/scripts/sw-decompose.sh +1 -1
- package/scripts/sw-deps.sh +1 -1
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-discovery.sh +1 -1
- package/scripts/sw-doc-fleet.sh +1 -1
- package/scripts/sw-docs-agent.sh +1 -1
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +42 -1
- package/scripts/sw-dora.sh +1 -1
- package/scripts/sw-durable.sh +1 -1
- package/scripts/sw-e2e-orchestrator.sh +1 -1
- package/scripts/sw-eventbus.sh +1 -1
- package/scripts/sw-feedback.sh +1 -1
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet-discover.sh +1 -1
- package/scripts/sw-fleet-viz.sh +3 -3
- package/scripts/sw-fleet.sh +1 -1
- package/scripts/sw-github-app.sh +1 -1
- package/scripts/sw-github-checks.sh +1 -1
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-guild.sh +1 -1
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-hygiene.sh +1 -1
- package/scripts/sw-incident.sh +1 -1
- package/scripts/sw-init.sh +1 -1
- package/scripts/sw-instrument.sh +1 -1
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +1 -1
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +1 -1
- package/scripts/sw-memory.sh +1 -1
- package/scripts/sw-mission-control.sh +1 -1
- package/scripts/sw-model-router.sh +1 -1
- package/scripts/sw-otel.sh +4 -4
- package/scripts/sw-oversight.sh +1 -1
- package/scripts/sw-pipeline-composer.sh +1 -1
- package/scripts/sw-pipeline-vitals.sh +1 -1
- package/scripts/sw-pipeline.sh +23 -56
- package/scripts/sw-pipeline.sh.mock +7 -0
- package/scripts/sw-pm.sh +1 -1
- package/scripts/sw-pr-lifecycle.sh +1 -1
- package/scripts/sw-predictive.sh +1 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +1 -1
- package/scripts/sw-public-dashboard.sh +1 -1
- package/scripts/sw-quality.sh +1 -1
- package/scripts/sw-reaper.sh +1 -1
- package/scripts/sw-recruit.sh +9 -1
- package/scripts/sw-regression.sh +1 -1
- package/scripts/sw-release-manager.sh +1 -1
- package/scripts/sw-release.sh +1 -1
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-replay.sh +1 -1
- package/scripts/sw-retro.sh +1 -1
- package/scripts/sw-scale.sh +8 -5
- package/scripts/sw-security-audit.sh +1 -1
- package/scripts/sw-self-optimize.sh +158 -7
- package/scripts/sw-session.sh +1 -1
- package/scripts/sw-setup.sh +1 -1
- package/scripts/sw-standup.sh +3 -3
- package/scripts/sw-status.sh +1 -1
- package/scripts/sw-strategic.sh +1 -1
- package/scripts/sw-stream.sh +8 -2
- package/scripts/sw-swarm.sh +7 -10
- package/scripts/sw-team-stages.sh +1 -1
- package/scripts/sw-templates.sh +1 -1
- package/scripts/sw-testgen.sh +1 -1
- package/scripts/sw-tmux-pipeline.sh +1 -1
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-trace.sh +1 -1
- package/scripts/sw-tracker.sh +24 -6
- package/scripts/sw-triage.sh +1 -1
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-ux.sh +1 -1
- package/scripts/sw-webhook.sh +1 -1
- package/scripts/sw-widgets.sh +1 -1
- package/scripts/sw-worktree.sh +1 -1
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
# daemon-state.sh — State, queue, claim (for sw-daemon.sh)
# Source from sw-daemon.sh. Requires STATE_FILE, helpers.
# Include guard: if this library was already sourced, bail out so functions
# are not redefined and module-level globals are not reset mid-run.
[[ -n "${_DAEMON_STATE_LOADED:-}" ]] && return 0
_DAEMON_STATE_LOADED=1
|
|
5
|
+
|
|
6
|
+
# daemon_log LEVEL MSG...
# Append a timestamped "[ts] [LEVEL] msg" line to $LOG_FILE and mirror it to
# the terminal via the info/success/warn/error helpers.
# Globals: LOG_FILE (read), DAEMON_LOG_WRITE_COUNT (read/write).
daemon_log() {
    local level="$1"
    shift
    local msg="$*"
    local ts
    ts=$(now_iso)
    echo "[$ts] [$level] $msg" >> "$LOG_FILE"

    # Robustness: make sure the write counter exists even if the sourcing
    # script never initialized it (don't rely on implicit-zero arithmetic).
    : "${DAEMON_LOG_WRITE_COUNT:=0}"

    # Rotate daemon.log if over 20MB (checked every 100 writes)
    DAEMON_LOG_WRITE_COUNT=$(( DAEMON_LOG_WRITE_COUNT + 1 ))
    if [[ $(( DAEMON_LOG_WRITE_COUNT % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
        local log_size
        log_size=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
        if [[ "$log_size" -gt 20971520 ]]; then
            # Keep up to three rotated generations: .1 (newest) … .3 (oldest).
            [[ -f "${LOG_FILE}.2" ]] && mv "${LOG_FILE}.2" "${LOG_FILE}.3"
            [[ -f "${LOG_FILE}.1" ]] && mv "${LOG_FILE}.1" "${LOG_FILE}.2"
            mv "$LOG_FILE" "${LOG_FILE}.1"
            touch "$LOG_FILE"
        fi
    fi

    # Print to stderr (NOT stdout) to avoid corrupting command substitution captures.
    # This is critical: functions like select_pipeline_template(), triage_score_issue(),
    # gh_retry(), and locked_get_active_count() return values via echo/stdout and are
    # called via $(). If daemon_log writes to stdout, the log text corrupts return values.
    case "$level" in
        INFO) info "$msg" >&2 ;;
        SUCCESS) success "$msg" >&2 ;;
        WARN) warn "$msg" >&2 ;;
        ERROR) error "$msg" ;;
    esac
}
|
|
38
|
+
|
|
39
|
+
# ─── Notification Helper ────────────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
# notify TITLE MESSAGE [LEVEL]
# Best-effort fan-out of a notification to any configured webhooks.
# LEVEL (default "info") selects the emoji prefix. Both webhook posts are
# fire-and-forget: failures are swallowed so notifications never break callers.
# Globals read: SLACK_WEBHOOK, SHIPWRIGHT_WEBHOOK_URL, CCT_WEBHOOK_URL.
notify() {
    local title="$1" message="$2" level="${3:-info}"

    # Map level -> emoji; anything unrecognized gets the generic bell.
    local emoji="🔔"
    case "$level" in
        success) emoji="✅" ;;
        error)   emoji="❌" ;;
        warn)    emoji="⚠️" ;;
    esac

    # Slack webhook
    if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
        local slack_payload
        slack_payload=$(jq -n \
            --arg text "${emoji} *${title}*\n${message}" \
            '{text: $text}')
        curl -sf -X POST -H 'Content-Type: application/json' \
            -d "$slack_payload" "$SLACK_WEBHOOK" >/dev/null 2>&1 || true
    fi

    # Custom webhook (env var SHIPWRIGHT_WEBHOOK_URL, with CCT_WEBHOOK_URL fallback)
    local hook_url="${SHIPWRIGHT_WEBHOOK_URL:-${CCT_WEBHOOK_URL:-}}"
    if [[ -n "$hook_url" ]]; then
        local hook_payload
        hook_payload=$(jq -n \
            --arg title "$title" --arg message "$message" \
            --arg level "$level" \
            '{title:$title, message:$message, level:$level}')
        curl -sf -X POST -H 'Content-Type: application/json' \
            -d "$hook_payload" "$hook_url" >/dev/null 2>&1 || true
    fi
}
|
|
73
|
+
|
|
74
|
+
# ─── GitHub Rate-Limit Circuit Breaker ─────────────────────────────────────
|
|
75
|
+
# Tracks consecutive GitHub API failures. If we hit too many failures in a row,
|
|
76
|
+
# we back off exponentially to avoid hammering a rate-limited API.
|
|
77
|
+
|
|
78
|
+
# Circuit-breaker state: consecutive GitHub API failures, and the epoch
# deadline until which gh calls should be skipped. Reset by
# gh_record_success; advanced by gh_record_failure.
GH_CONSECUTIVE_FAILURES=0
GH_BACKOFF_UNTIL=0 # epoch seconds — skip gh calls until this time

# gh_rate_limited — returns 0 (true) while the breaker is open, i.e. the
# backoff deadline is still in the future; callers should skip GitHub calls.
gh_rate_limited() {
    local current_epoch
    current_epoch=$(now_epoch)
    [[ "$GH_BACKOFF_UNTIL" -gt "$current_epoch" ]]
}
|
|
90
|
+
|
|
91
|
+
# gh_record_success — close the circuit breaker: clear the failure streak
# and any pending backoff deadline after a successful GitHub call.
gh_record_success() {
    GH_BACKOFF_UNTIL=0
    GH_CONSECUTIVE_FAILURES=0
}
|
|
95
|
+
|
|
96
|
+
# gh_record_failure — record one GitHub API failure. From the third
# consecutive failure on, open the breaker with exponential backoff:
# 30s, 60s, 120s, 240s, capped at 5 minutes.
gh_record_failure() {
    GH_CONSECUTIVE_FAILURES=$((GH_CONSECUTIVE_FAILURES + 1))

    # Fewer than 3 failures in a row: breaker stays closed.
    [[ "$GH_CONSECUTIVE_FAILURES" -lt 3 ]] && return 0

    # Cap the shift amount to avoid integer overflow for large streaks.
    local exp=$(( GH_CONSECUTIVE_FAILURES - 3 ))
    [[ "$exp" -gt 4 ]] && exp=4
    local backoff_secs=$(( 30 * (1 << exp) ))
    [[ "$backoff_secs" -gt 300 ]] && backoff_secs=300

    GH_BACKOFF_UNTIL=$(( $(now_epoch) + backoff_secs ))
    daemon_log WARN "GitHub rate-limit circuit breaker: backing off ${backoff_secs}s after ${GH_CONSECUTIVE_FAILURES} failures"
    emit_event "daemon.rate_limit" "failures=$GH_CONSECUTIVE_FAILURES" "backoff_s=$backoff_secs"
}
|
|
110
|
+
|
|
111
|
+
# ─── Runtime Auth Check ──────────────────────────────────────────────────────
|
|
112
|
+
|
|
113
|
+
# Throttle state for the runtime auth probe below: run at most once per
# AUTH_CHECK_INTERVAL seconds.
LAST_AUTH_CHECK_EPOCH=0
AUTH_CHECK_INTERVAL=300 # 5 minutes

# daemon_preflight_auth_check — periodic runtime check that GitHub (gh) and
# Claude credentials still work. On either failure, writes a pause-flag file
# (JSON with reason/timestamp) and returns 1 so the daemon auto-pauses
# instead of burning the queue on doomed jobs. Returns 0 when healthy or
# when checked too recently.
# Globals: LAST_AUTH_CHECK_EPOCH (r/w), AUTH_CHECK_INTERVAL, NO_GITHUB,
# PAUSE_FLAG, TMPDIR. Requires: now_epoch, now_iso, daemon_log, emit_event.
daemon_preflight_auth_check() {
    local now_e
    now_e=$(now_epoch)
    # Skip entirely if the last probe was recent enough.
    if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
        return 0
    fi
    LAST_AUTH_CHECK_EPOCH="$now_e"

    # gh auth check
    if [[ "${NO_GITHUB:-false}" != "true" ]]; then
        # NOTE(review): "&>/dev/null 2>&1" — the trailing 2>&1 is redundant
        # (&> already covers both streams) but harmless.
        if ! gh auth status &>/dev/null 2>&1; then
            daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
            local pause_json
            pause_json=$(jq -n --arg reason "gh_auth_failure" --arg ts "$(now_iso)" \
                '{reason: $reason, timestamp: $ts}')
            local _tmp_pause
            _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
            echo "$pause_json" > "$_tmp_pause"
            # mv makes the pause-flag appear atomically.
            mv "$_tmp_pause" "$PAUSE_FLAG"
            emit_event "daemon.auto_pause" "reason=gh_auth_failure"
            return 1
        fi
    fi

    # claude auth check with 15s timeout (macOS has no timeout command)
    local claude_auth_ok=false
    local _auth_tmp
    _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
    # Run the probe in the background so we can enforce our own deadline.
    ( claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null ) &
    local _auth_pid=$!
    local _auth_waited=0
    # Poll once a second for up to 15 seconds.
    while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
        sleep 1
        _auth_waited=$((_auth_waited + 1))
    done
    if kill -0 "$_auth_pid" 2>/dev/null; then
        # Still running after the deadline — kill it, then reap.
        kill "$_auth_pid" 2>/dev/null || true
        wait "$_auth_pid" 2>/dev/null || true
    else
        # Finished on its own — just reap the zombie.
        wait "$_auth_pid" 2>/dev/null || true
    fi

    # Any output at all counts as an authenticated response.
    if [[ -s "$_auth_tmp" ]]; then
        claude_auth_ok=true
    fi
    rm -f "$_auth_tmp"

    if [[ "$claude_auth_ok" != "true" ]]; then
        daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
        local pause_json
        pause_json=$(jq -n --arg reason "claude_auth_failure" --arg ts "$(now_iso)" \
            '{reason: $reason, timestamp: $ts}')
        local _tmp_pause
        _tmp_pause=$(mktemp "${TMPDIR:-/tmp}/sw-pause.XXXXXX")
        echo "$pause_json" > "$_tmp_pause"
        mv "$_tmp_pause" "$PAUSE_FLAG"
        emit_event "daemon.auto_pause" "reason=claude_auth_failure"
        return 1
    fi

    return 0
}
|
|
178
|
+
|
|
179
|
+
# ─── Pre-flight Checks ──────────────────────────────────────────────────────
|
|
180
|
+
|
|
181
|
+
# preflight_checks — one-shot startup validation before the daemon loop.
# Checks required/optional CLI tools, git repo + base branch, GitHub auth,
# the pipeline script, and free disk space, printing a colored report.
# Globals read: PURPLE BOLD RESET GREEN RED DIM YELLOW (color codes),
# BASE_BRANCH, NO_GITHUB, SCRIPT_DIR. Requires: error, success helpers.
# Returns 0 if all required checks pass, 1 otherwise.
preflight_checks() {
    local errors=0

    echo -e "${PURPLE}${BOLD}━━━ Pre-flight Checks ━━━${RESET}"
    echo ""

    # 1. Required tools
    local required_tools=("git" "jq" "gh" "claude")
    local optional_tools=("tmux" "curl")

    for tool in "${required_tools[@]}"; do
        if command -v "$tool" &>/dev/null; then
            echo -e "  ${GREEN}✓${RESET} $tool"
        else
            echo -e "  ${RED}✗${RESET} $tool ${RED}(required)${RESET}"
            errors=$((errors + 1))
        fi
    done

    # Optional tools are reported but never counted as errors.
    for tool in "${optional_tools[@]}"; do
        if command -v "$tool" &>/dev/null; then
            echo -e "  ${GREEN}✓${RESET} $tool"
        else
            echo -e "  ${DIM}○${RESET} $tool ${DIM}(optional — some features disabled)${RESET}"
        fi
    done

    # 2. Git state
    echo ""
    if git rev-parse --is-inside-work-tree &>/dev/null; then
        echo -e "  ${GREEN}✓${RESET} Inside git repo"
    else
        echo -e "  ${RED}✗${RESET} Not inside a git repository"
        errors=$((errors + 1))
    fi

    # Check base branch exists
    if git rev-parse --verify "$BASE_BRANCH" &>/dev/null; then
        echo -e "  ${GREEN}✓${RESET} Base branch: $BASE_BRANCH"
    else
        echo -e "  ${RED}✗${RESET} Base branch not found: $BASE_BRANCH"
        errors=$((errors + 1))
    fi

    # 3. GitHub auth (required for daemon — it needs to poll issues)
    if [[ "$NO_GITHUB" != "true" ]]; then
        if gh auth status &>/dev/null 2>&1; then
            echo -e "  ${GREEN}✓${RESET} GitHub authenticated"
        else
            echo -e "  ${RED}✗${RESET} GitHub not authenticated (required for daemon)"
            errors=$((errors + 1))
        fi
    else
        echo -e "  ${DIM}○${RESET} GitHub disabled (--no-github)"
    fi

    # 4. Pipeline script
    if [[ -x "$SCRIPT_DIR/sw-pipeline.sh" ]]; then
        echo -e "  ${GREEN}✓${RESET} sw-pipeline.sh available"
    else
        echo -e "  ${RED}✗${RESET} sw-pipeline.sh not found at $SCRIPT_DIR"
        errors=$((errors + 1))
    fi

    # 5. Disk space check (warn if < 1GB free) — warning only, not an error.
    local free_space_kb
    free_space_kb=$(df -k "." 2>/dev/null | tail -1 | awk '{print $4}')
    if [[ -n "$free_space_kb" ]] && [[ "$free_space_kb" -lt 1048576 ]] 2>/dev/null; then
        echo -e "  ${YELLOW}⚠${RESET} Low disk space: $(( free_space_kb / 1024 ))MB free"
    fi

    echo ""

    if [[ "$errors" -gt 0 ]]; then
        error "Pre-flight failed: $errors error(s)"
        return 1
    fi

    success "Pre-flight passed"
    echo ""
    return 0
}
|
|
263
|
+
|
|
264
|
+
# ─── State Management ───────────────────────────────────────────────────────
|
|
265
|
+
|
|
266
|
+
# State file lock FD (used by locked_state_update for serialized read-modify-write)
|
|
267
|
+
# State file lock FD (used by locked_state_update for serialized read-modify-write)
# NOTE(review): the lock blocks in this file hardcode FD 200 rather than
# using this constant — confirm before relying on it.
STATE_LOCK_FD=7

# atomic_write_state CONTENT
# Atomic write: write CONTENT to a temp file next to STATE_FILE, then mv it
# into place (prevents a half-written state file on crash).
# Returns 1 (after logging) on any step failure.
atomic_write_state() {
    local content="$1"
    local tmp_file
    tmp_file=$(mktemp "${STATE_FILE}.tmp.XXXXXX") || {
        daemon_log ERROR "Failed to create temp file for state write"
        return 1
    }
    # printf, not echo: content that starts with "-n"/"-e" or contains
    # backslashes must be written verbatim.
    printf '%s\n' "$content" > "$tmp_file" || {
        daemon_log ERROR "Failed to write state to temp file"
        rm -f "$tmp_file"
        return 1
    }
    mv "$tmp_file" "$STATE_FILE" || {
        daemon_log ERROR "Failed to move temp state file into place"
        rm -f "$tmp_file"
        return 1
    }
}
|
|
288
|
+
|
|
289
|
+
# Locked read-modify-write: prevents TOCTOU race on state file.
|
|
290
|
+
# Usage: locked_state_update '.queued += [42]'
|
|
291
|
+
# The jq expression is applied to the current state file atomically.
|
|
292
|
+
# Locked read-modify-write: prevents TOCTOU race on state file.
# Usage: locked_state_update '.queued += [42]'  [extra jq args...]
# The jq expression is applied to the current state file atomically.
# Extra args after the expression (e.g. --arg/--argjson) are forwarded to jq.
# Runs in a subshell holding an flock on ${STATE_FILE}.lock via FD 200; if
# flock is unavailable (e.g. stock macOS) the update proceeds unlocked as a
# best-effort fallback.
locked_state_update() {
    local jq_expr="$1"
    shift
    local lock_file="${STATE_FILE}.lock"
    (
        if command -v flock &>/dev/null; then
            # Wait up to 5s for the lock; give up rather than block the daemon.
            flock -w 5 200 2>/dev/null || {
                daemon_log ERROR "locked_state_update: lock acquisition timed out — aborting"
                return 1
            }
        fi
        local tmp
        # 2>&1 folds jq's error text into $tmp so it can be logged below.
        # NOTE(review): if jq ever emitted a warning on stderr while exiting 0,
        # that text would end up inside the state file — confirm acceptable.
        tmp=$(jq "$jq_expr" "$@" "$STATE_FILE" 2>&1) || {
            daemon_log ERROR "locked_state_update: jq failed — $(echo "$tmp" | head -1)"
            return 1
        }
        atomic_write_state "$tmp" || {
            daemon_log ERROR "locked_state_update: atomic_write_state failed"
            return 1
        }
    ) 200>"$lock_file"
}
|
|
314
|
+
|
|
315
|
+
# init_state — create STATE_FILE (under lock) if it does not exist yet;
# otherwise refresh only the pid/started_at fields of the existing state so
# queue/retry history survives a daemon restart.
# Globals read: STATE_FILE, POLL_INTERVAL, MAX_PARALLEL, WATCH_LABEL,
# WATCH_MODE. Requires: now_iso, atomic_write_state, locked_state_update.
init_state() {
    if [[ ! -f "$STATE_FILE" ]]; then
        local init_json
        # Build the initial state document. pid is passed as a string and
        # converted with tonumber inside jq.
        init_json=$(jq -n \
            --arg pid "$$" \
            --arg started "$(now_iso)" \
            --argjson interval "$POLL_INTERVAL" \
            --argjson max_parallel "$MAX_PARALLEL" \
            --arg label "$WATCH_LABEL" \
            --arg watch_mode "$WATCH_MODE" \
            '{
                version: 1,
                pid: ($pid | tonumber),
                started_at: $started,
                last_poll: null,
                config: {
                    poll_interval: $interval,
                    max_parallel: $max_parallel,
                    watch_label: $label,
                    watch_mode: $watch_mode
                },
                active_jobs: [],
                queued: [],
                completed: [],
                retry_counts: {},
                failure_history: [],
                priority_lane_active: [],
                titles: {}
            }')
        local lock_file="${STATE_FILE}.lock"
        # Same locking discipline as locked_state_update (flock on FD 200),
        # so a concurrent updater cannot interleave with the initial write.
        (
            if command -v flock &>/dev/null; then
                flock -w 5 200 2>/dev/null || {
                    daemon_log ERROR "init_state: lock acquisition timed out"
                    return 1
                }
            fi
            atomic_write_state "$init_json"
        ) 200>"$lock_file"
    else
        # Update PID and start time in existing state
        locked_state_update \
            --arg pid "$$" \
            --arg started "$(now_iso)" \
            '.pid = ($pid | tonumber) | .started_at = $started'
    fi
}
|
|
362
|
+
|
|
363
|
+
# update_state_field FIELD VALUE
# Set a single top-level field of the state file to a string VALUE,
# serialized through locked_state_update.
update_state_field() {
    local key="$1"
    local val="$2"
    locked_state_update --arg field "$key" --arg val "$val" \
        '.[$field] = $val'
}
|
|
368
|
+
|
|
369
|
+
# ─── Inflight Check ─────────────────────────────────────────────────────────
|
|
370
|
+
|
|
371
|
+
# daemon_is_inflight ISSUE_NUM
# Returns 0 if the issue is currently being worked on (in .active_jobs) or
# waiting to be worked on (in .queued); 1 otherwise or if no state exists.
daemon_is_inflight() {
    local issue_num="$1"

    # Without a state file nothing can be in flight.
    [[ -f "$STATE_FILE" ]] || return 1

    local hit
    # Active jobs are objects keyed by .issue.
    hit=$(jq -r --argjson num "$issue_num" \
        '.active_jobs[] | select(.issue == $num) | .issue' \
        "$STATE_FILE" 2>/dev/null || true)
    [[ -n "$hit" ]] && return 0

    # Queue entries are bare issue numbers.
    hit=$(jq -r --argjson num "$issue_num" \
        '.queued[] | select(. == $num)' \
        "$STATE_FILE" 2>/dev/null || true)
    [[ -n "$hit" ]] && return 0

    return 1
}
|
|
398
|
+
|
|
399
|
+
# ─── Active Job Count ───────────────────────────────────────────────────────
|
|
400
|
+
|
|
401
|
+
# get_active_count — print the number of entries in .active_jobs.
# Prints 0 when the state file is missing or unreadable (unlocked read;
# see locked_get_active_count for the race-safe variant).
get_active_count() {
    if [[ -f "$STATE_FILE" ]]; then
        jq -r '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo 0
    else
        echo 0
    fi
}
|
|
408
|
+
|
|
409
|
+
# Race-safe active count: acquires state lock before reading.
|
|
410
|
+
# Returns MAX_PARALLEL on lock timeout (safe fail — prevents over-spawning).
|
|
411
|
+
# Race-safe active count: acquires state lock before reading.
# Returns MAX_PARALLEL on lock timeout (safe fail — prevents over-spawning).
# Structure: a $() capture around a subshell that holds the flock (FD 200),
# mirroring the locking discipline of locked_state_update.
locked_get_active_count() {
    if [[ ! -f "$STATE_FILE" ]]; then
        echo 0
        return
    fi
    local lock_file="${STATE_FILE}.lock"
    local count
    count=$(
        (
            if command -v flock &>/dev/null; then
                flock -w 5 200 2>/dev/null || {
                    # >&2 keeps any daemon_log terminal output out of the $() capture.
                    daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default" >&2
                    echo "$MAX_PARALLEL"
                    # exit terminates only this inner subshell, not the daemon.
                    exit 0
                }
            fi
            jq -r '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo "$MAX_PARALLEL"
        ) 200>"$lock_file"
    )
    # Fall back to 0 if the capture somehow came back empty.
    echo "${count:-0}"
}
|
|
432
|
+
|
|
433
|
+
# ─── Queue Management ───────────────────────────────────────────────────────
|
|
434
|
+
|
|
435
|
+
# enqueue_issue ISSUE_NUM
# Append an issue number to the state-file queue (deduplicated via unique)
# and log that it was queued because capacity was full.
enqueue_issue() {
    local n="$1"
    locked_state_update --argjson num "$n" \
        '.queued += [$num] | .queued |= unique'
    daemon_log INFO "Queued issue #${n} (at capacity)"
}
|
|
441
|
+
|
|
442
|
+
# dequeue_next — pop and print the first queued issue number, or print
# nothing when the queue (or state file) is empty.
dequeue_next() {
    if [[ ! -f "$STATE_FILE" ]]; then
        return
    fi

    local next
    # Peek at the head of the queue (unlocked read).
    next=$(jq -r '.queued[0] // empty' "$STATE_FILE" 2>/dev/null || true)
    if [[ -n "$next" ]]; then
        # Remove from queue (locked to prevent race with enqueue)
        # NOTE(review): the peek above happens outside the lock, so two
        # concurrent dequeuers could both read the same head before either
        # pops — confirm this is only called from a single control loop.
        locked_state_update '.queued = .queued[1:]'
        echo "$next"
    fi
}
|
|
455
|
+
|
|
456
|
+
# ─── Priority Lane Helpers ─────────────────────────────────────────────────
|
|
457
|
+
|
|
458
|
+
# is_priority_issue LABELS_CSV
# Returns 0 if any label in the issue's comma-separated label list exactly
# matches one of the configured priority-lane labels.
# Globals read: PRIORITY_LANE_LABELS (comma-separated, whitespace tolerated).
is_priority_issue() {
    local labels_csv="$1"
    local IFS=','
    local lane_labels
    read -ra lane_labels <<< "$PRIORITY_LANE_LABELS"
    for lane_label in "${lane_labels[@]}"; do
        # Trim ALL surrounding whitespace. (The previous "${x## }"/"${x%% }"
        # form stripped at most one space, so labels padded with two or more
        # spaces never matched.)
        lane_label="${lane_label#"${lane_label%%[![:space:]]*}"}"
        lane_label="${lane_label%"${lane_label##*[![:space:]]}"}"
        # Wrap both sides in commas so we only match whole labels,
        # never substrings (e.g. "urgent" must not match "urgently").
        if [[ ",$labels_csv," == *",$lane_label,"* ]]; then
            return 0
        fi
    done
    return 1
}
|
|
473
|
+
|
|
474
|
+
# get_priority_active_count — print the number of jobs currently tracked in
# the priority lane; 0 when the state file or the array is absent.
get_priority_active_count() {
    if [[ -f "$STATE_FILE" ]]; then
        jq -r '.priority_lane_active // [] | length' "$STATE_FILE" 2>/dev/null || echo 0
    else
        echo 0
    fi
}
|
|
481
|
+
|
|
482
|
+
# track_priority_job ISSUE_NUM
# Add an issue number to the priority-lane active set (deduplicated).
track_priority_job() {
    local n="$1"
    locked_state_update --argjson num "$n" \
        '.priority_lane_active = ((.priority_lane_active // []) + [$num] | unique)'
}
|
|
487
|
+
|
|
488
|
+
# untrack_priority_job ISSUE_NUM
# Remove an issue number from the priority-lane active set; no-op when the
# state file does not exist.
untrack_priority_job() {
    local n="$1"
    [[ -f "$STATE_FILE" ]] || return 0
    locked_state_update --argjson num "$n" \
        '.priority_lane_active = [(.priority_lane_active // [])[] | select(. != $num)]'
}
|
|
496
|
+
|
|
497
|
+
# ─── Distributed Issue Claiming ───────────────────────────────────────────
|
|
498
|
+
|
|
499
|
+
# claim_issue ISSUE_NUM MACHINE_NAME
# Acquire a distributed claim on an issue so multiple daemon hosts never
# work the same issue. Two mechanisms, tried in order:
#   1. dashboard API POST /api/claim (preferred, coordinator arbitrates)
#   2. direct GitHub "claimed:<machine>" label (fallback when dashboard is
#      unreachable or returns an unexpected response)
# Returns 0 when the claim is ours, 1 when another machine holds it.
# NOTE(review): the label fallback is check-then-set, not atomic — two
# machines can race between the view and the edit; confirm acceptable.
claim_issue() {
    local issue_num="$1"
    local machine_name="$2"

    [[ "$NO_GITHUB" == "true" ]] && return 0 # No claiming in no-github mode

    # Try dashboard-coordinated claim first (atomic label-based)
    local resp
    resp=$(curl -s --max-time 5 -X POST "${DASHBOARD_URL}/api/claim" \
        -H "Content-Type: application/json" \
        -d "$(jq -n --argjson issue "$issue_num" --arg machine "$machine_name" \
            '{issue: $issue, machine: $machine}')" 2>/dev/null || echo "")

    if [[ -n "$resp" ]] && echo "$resp" | jq -e '.approved == true' &>/dev/null; then
        return 0
    elif [[ -n "$resp" ]] && echo "$resp" | jq -e '.approved == false' &>/dev/null; then
        # Explicitly denied: someone else holds the claim.
        local claimed_by
        claimed_by=$(echo "$resp" | jq -r '.claimed_by // "another machine"')
        daemon_log INFO "Issue #${issue_num} claimed by ${claimed_by} (via dashboard)"
        return 1
    fi

    # Fallback: direct GitHub label check (dashboard unreachable)
    daemon_log WARN "Dashboard unreachable — falling back to direct GitHub label claim"
    local existing_claim
    existing_claim=$(gh issue view "$issue_num" --json labels --jq \
        '[.labels[].name | select(startswith("claimed:"))] | .[0] // ""' 2>/dev/null || true)

    if [[ -n "$existing_claim" ]]; then
        daemon_log INFO "Issue #${issue_num} already claimed: ${existing_claim}"
        return 1
    fi

    # No existing claim label — take it for this machine.
    gh issue edit "$issue_num" --add-label "claimed:${machine_name}" 2>/dev/null || return 1
    return 0
}
|
|
535
|
+
|
|
536
|
+
# release_claim ISSUE_NUM MACHINE_NAME
# Release a distributed claim on an issue. Both release paths are
# best-effort and idempotent: the dashboard is notified first, then the
# GitHub label is removed directly as a backup. Never fails the caller.
release_claim() {
    local issue_num="$1"
    local machine_name="$2"

    # Nothing to release when GitHub integration is off.
    if [[ "$NO_GITHUB" == "true" ]]; then
        return 0
    fi

    # Try dashboard-coordinated release first
    curl -s --max-time 5 -X POST "${DASHBOARD_URL}/api/claim/release" \
        -H "Content-Type: application/json" \
        -d "$(jq -n --argjson issue "$issue_num" --arg machine "$machine_name" \
            '{issue: $issue, machine: $machine}')" 2>/dev/null || true

    # Also remove label directly as backup (idempotent)
    gh issue edit "$issue_num" --remove-label "claimed:${machine_name}" 2>/dev/null || true
}
|