codeharness 0.26.5 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-F6L7CXLK.js → chunk-2BBYPR57.js} +104 -411
- package/dist/{docker-VHOP56YP.js → docker-72QTSBOK.js} +1 -1
- package/dist/index.js +5015 -2014
- package/package.json +5 -3
- package/templates/agents/analyst.yaml +10 -0
- package/templates/agents/architect.yaml +11 -0
- package/templates/agents/dev.yaml +10 -0
- package/templates/agents/evaluator.yaml +92 -0
- package/templates/agents/pm.yaml +12 -0
- package/templates/agents/qa.yaml +15 -0
- package/templates/agents/retro.yaml +63 -0
- package/templates/agents/reviewer.yaml +76 -0
- package/templates/agents/sm.yaml +10 -0
- package/templates/agents/tech-writer.yaml +11 -0
- package/templates/agents/ux-designer.yaml +13 -0
- package/templates/workflows/default.yaml +41 -0
- package/ralph/AGENTS.md +0 -48
- package/ralph/bridge.sh +0 -424
- package/ralph/db_schema_gen.sh +0 -109
- package/ralph/drivers/claude-code.sh +0 -140
- package/ralph/exec_plans.sh +0 -252
- package/ralph/harness_status.sh +0 -147
- package/ralph/lib/circuit_breaker.sh +0 -210
- package/ralph/lib/date_utils.sh +0 -60
- package/ralph/lib/timeout_utils.sh +0 -77
- package/ralph/onboard.sh +0 -83
- package/ralph/ralph.sh +0 -1407
- package/ralph/validate_epic_docs.sh +0 -129
- package/ralph/verify_gates.sh +0 -210
package/ralph/ralph.sh
DELETED
|
@@ -1,1407 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
# codeharness Ralph Loop — Vendored from snarktank/ralph
|
|
3
|
-
# Autonomous execution loop that spawns fresh Claude Code instances per iteration
|
|
4
|
-
# with verification gates, crash recovery, rate limiting, and circuit breaker protection.
|
|
5
|
-
#
|
|
6
|
-
# Usage: ralph/ralph.sh --plugin-dir ./codeharness [OPTIONS]
|
|
7
|
-
|
|
8
|
-
# NOTE: set -e intentionally NOT used — it causes silent crashes in the main
|
|
9
|
-
# loop when grep/jq/sed return non-zero. The loop handles errors via exit codes.
|
|
10
|
-
|
|
11
|
-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
12
|
-
source "$SCRIPT_DIR/lib/date_utils.sh"
|
|
13
|
-
source "$SCRIPT_DIR/lib/timeout_utils.sh"
|
|
14
|
-
source "$SCRIPT_DIR/lib/circuit_breaker.sh"
|
|
15
|
-
|
|
16
|
-
# ─── Configuration ───────────────────────────────────────────────────────────

# Script version (embedded in the status JSON written by update_status)
VERSION="0.1.0"

# Paths — all start out empty
PLUGIN_DIR=""            # Plugin directory (required — set via --plugin-dir)
HARNESS_STATE_DIR=""     # Harness state directory (derived from project root)
PROGRESS_FILE=""         # Progress file (legacy — kept for backwards compat, optional)
SPRINT_STATUS_FILE=""    # Sprint status file (primary task source — read by /harness-run skill)
PROMPT_FILE=""           # Prompt file for each iteration
LOG_DIR=""               # Logging directory

# Loop limits — an exported/non-empty environment value wins over the default
: "${MAX_ITERATIONS:=50}"
: "${MAX_STORY_RETRIES:=10}"
: "${LOOP_TIMEOUT_SECONDS:=43200}"   # 12 hours default
: "${ITERATION_TIMEOUT_MINUTES:=30}"

# Rate limiting
: "${MAX_CALLS_PER_HOUR:=100}"
RATE_LIMIT_SLEEP=3600   # 1 hour

# Driver selection and driver options
: "${PLATFORM_DRIVER:=claude-code}"
: "${CLAUDE_OUTPUT_FORMAT:=stream-json}"
: "${CLAUDE_ALLOWED_TOOLS:=}"
: "${CLAUDE_USE_CONTINUE:=false}"    # Fresh context per iteration by default

# Behaviour flags
RESET_RETRIES=false     # Reset retry state on start
LIVE_OUTPUT=false       # Live output

# ANSI color escape sequences (kept as literal backslash text)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m'

# ─── Internal state ─────────────────────────────────────────────────────────

# State file paths — empty at this point
CALL_COUNT_FILE=""
TIMESTAMP_FILE=""
STATUS_FILE=""
LIVE_LOG_FILE=""
STORY_RETRY_FILE=""
FLAGGED_STORIES_FILE=""

# Global arrays for driver command building
declare -a CLAUDE_CMD_ARGS=()
declare -a LIVE_CMD_ARGS=()
declare -a VALID_TOOL_PATTERNS=()

# Iteration counter and loop start time (empty until set)
loop_count=0
loop_start_time=""
|
|
84
|
-
|
|
85
|
-
# ─── Logging ─────────────────────────────────────────────────────────────────
|
|
86
|
-
|
|
87
|
-
# Print a timestamped log line to stderr (colorized by level) and append the
# uncolored line to "$LOG_DIR/ralph.log" when LOG_DIR is non-empty.
#
# Arguments:
#   $1 - level: INFO, WARN, ERROR, SUCCESS or LOOP select a color; DEBUG is
#        written to the log file only and never to the terminal; any other
#        level is printed without color
#   $2 - message text, printed verbatim
log_status() {
    local level=$1
    local message=$2
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    local color=""

    case "$level" in
        INFO)    color=$BLUE ;;
        WARN)    color=$YELLOW ;;
        ERROR)   color=$RED ;;
        SUCCESS) color=$GREEN ;;
        LOOP)    color=$PURPLE ;;
    esac

    local line="[$timestamp] [$level] $message"

    # DEBUG level: log file only, no terminal output
    if [[ "$level" != "DEBUG" ]]; then
        # %b expands the escape sequences stored in the color variables; the
        # line itself goes through %s so backslashes in the message (paths,
        # regexes, JSON) reach the terminal exactly as they reach the log file.
        printf '%b%s%b\n' "$color" "$line" "$NC" >&2
    fi

    if [[ -n "$LOG_DIR" ]]; then
        printf '%s\n' "$line" >> "$LOG_DIR/ralph.log"
    fi
}
|
|
114
|
-
|
|
115
|
-
# ─── Rate Limiting ───────────────────────────────────────────────────────────
|
|
116
|
-
|
|
117
|
-
# Start a fresh call-count window when the wall-clock hour has changed.
# Compares the current hour (YYYYMMDDHH) with the hour stored in
# TIMESTAMP_FILE; on mismatch (or when that file is missing) the counter in
# CALL_COUNT_FILE is reset to 0 and the new hour is recorded.
init_call_tracking() {
    local hour_now recorded_hour=""
    hour_now=$(date +%Y%m%d%H)

    [[ -f "$TIMESTAMP_FILE" ]] && recorded_hour=$(cat "$TIMESTAMP_FILE")

    # Still inside the recorded hour: leave the counter alone
    [[ "$hour_now" == "$recorded_hour" ]] && return

    echo "0" > "$CALL_COUNT_FILE"
    echo "$hour_now" > "$TIMESTAMP_FILE"
}
|
|
130
|
-
|
|
131
|
-
# Succeed (status 0) while the number recorded in CALL_COUNT_FILE is below
# MAX_CALLS_PER_HOUR; fail (status 1) once the limit has been reached.
# A missing counter file counts as zero calls.
can_make_call() {
    local used=0
    [[ -f "$CALL_COUNT_FILE" ]] && used=$(cat "$CALL_COUNT_FILE")

    # Negated "limit reached" test
    ! [[ $used -ge $MAX_CALLS_PER_HOUR ]]
}
|
|
143
|
-
|
|
144
|
-
# Block until the start of the next wall-clock hour, then reset the call
# counter (CALL_COUNT_FILE) and record the new hour in TIMESTAMP_FILE.
# Logs the wait via log_status.
wait_for_reset() {
    local calls_made
    calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
    log_status "WARN" "Rate limit reached ($calls_made/$MAX_CALLS_PER_HOUR). Waiting for reset..."

    # Read minute and second from ONE date call so they cannot straddle a
    # minute boundary.
    local current_minute current_second
    read -r current_minute current_second <<< "$(date '+%M %S')"

    # date zero-pads its output; "08" and "09" are invalid octal literals in
    # shell arithmetic, so force base 10 with the 10# prefix.
    local wait_time=$(( (59 - 10#$current_minute) * 60 + (60 - 10#$current_second) ))

    log_status "INFO" "Sleeping for $wait_time seconds until next hour..."
    sleep "$wait_time"

    echo "0" > "$CALL_COUNT_FILE"
    date +%Y%m%d%H > "$TIMESTAMP_FILE"
    log_status "SUCCESS" "Rate limit reset."
}
|
|
159
|
-
|
|
160
|
-
# ─── Progress Tracking ───────────────────────────────────────────────────────
|
|
161
|
-
|
|
162
|
-
# Write the loop status JSON document to STATUS_FILE.
#
# Arguments:
#   $1 - loop iteration count
#   $2 - calls made in the current hour
#   $3 - last action (free text)
#   $4 - status (free text)
#   $5 - exit reason (optional, default empty)
#
# Does nothing when STATUS_FILE is empty. Story counts come from
# get_task_counts, the current story from get_current_task and the flagged
# list from FLAGGED_STORIES_FILE.
# Returns 0 when the file was written, 1 when jq failed (the previous status
# file is then left untouched).
update_status() {
    local loop_count=$1
    local calls_made=$2
    local last_action=$3
    local status=$4
    local exit_reason=${5:-""}

    if [[ -z "$STATUS_FILE" ]]; then
        return
    fi

    # --argjson needs valid JSON; an empty or garbled counter would make jq
    # fail, so fall back to 0 for the two caller-supplied numbers.
    [[ "$loop_count" =~ ^[0-9]+$ ]] || loop_count=0
    [[ "$calls_made" =~ ^[0-9]+$ ]] || calls_made=0

    # codeharness: Include sprint-status story counts in status JSON
    local stories_total=0
    local stories_completed=0
    if [[ -n "$SPRINT_STATUS_FILE" && -f "$SPRINT_STATUS_FILE" ]]; then
        local sprint_counts
        sprint_counts=$(get_task_counts)
        stories_total=${sprint_counts%% *}
        stories_completed=${sprint_counts##* }
    fi

    local stories_remaining=$((stories_total - stories_completed))
    local elapsed_seconds=0
    if [[ -n "$loop_start_time" ]]; then
        elapsed_seconds=$(( $(date +%s) - loop_start_time ))
    fi

    # Build flagged stories JSON array (one non-empty line per story key)
    local flagged_json="[]"
    if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
        flagged_json=$(jq -R -s 'split("\n") | map(select(length > 0))' < "$FLAGGED_STORIES_FILE") \
            || flagged_json="[]"
        [[ -n "$flagged_json" ]] || flagged_json="[]"
    fi

    # Get current story key for status tracking
    local current_story
    current_story=$(get_current_task)

    # Render into a temp file and rename it into place: a failing jq no
    # longer truncates the existing status file, and a concurrent reader
    # never sees a half-written document.
    local tmp_file="${STATUS_FILE}.tmp.$$"
    if jq -n \
        --arg timestamp "$(get_iso_timestamp)" \
        --argjson loop_count "$loop_count" \
        --argjson calls_made "$calls_made" \
        --argjson max_calls "$MAX_CALLS_PER_HOUR" \
        --argjson max_iterations "$MAX_ITERATIONS" \
        --arg last_action "$last_action" \
        --arg status "$status" \
        --arg exit_reason "$exit_reason" \
        --arg version "$VERSION" \
        --arg story "${current_story:-}" \
        --argjson stories_total "$stories_total" \
        --argjson stories_completed "$stories_completed" \
        --argjson stories_remaining "$stories_remaining" \
        --argjson elapsed_seconds "$elapsed_seconds" \
        --argjson flagged_stories "$flagged_json" \
        '{
            timestamp: $timestamp,
            version: $version,
            loop_count: $loop_count,
            calls_made_this_hour: $calls_made,
            max_calls_per_hour: $max_calls,
            max_iterations: $max_iterations,
            last_action: $last_action,
            status: $status,
            story: $story,
            exit_reason: $exit_reason,
            stories_total: $stories_total,
            stories_completed: $stories_completed,
            stories_remaining: $stories_remaining,
            elapsed_seconds: $elapsed_seconds,
            flagged_stories: $flagged_stories
        }' > "$tmp_file"; then
        mv -f "$tmp_file" "$STATUS_FILE"
    else
        rm -f "$tmp_file" 2>/dev/null
        return 1
    fi
}
|
|
233
|
-
|
|
234
|
-
# codeharness: Task picking is handled by /harness-run skill inside each Claude session.
# Ralph just spawns sessions and checks sprint-status.yaml for completion.
#
# Print the key of the story Ralph should report on, read from
# sprint-state.json in the current directory: the first story whose status is
# "in-progress", otherwise the first "ready-for-dev" one. Prints an empty line
# when the file is missing or nothing matches. Always returns 0.
get_current_task() {
    local state_file="sprint-state.json"
    local wanted_status candidate

    if [[ -f "$state_file" ]]; then
        # Statuses are tried in priority order; the first hit wins
        for wanted_status in "in-progress" "ready-for-dev"; do
            candidate=$(jq -r --arg wanted "$wanted_status" '
                .stories // {} | to_entries[]
                | select(.value.status == $wanted)
                | .key
            ' "$state_file" 2>/dev/null | head -1)

            if [[ -n "$candidate" ]]; then
                echo "$candidate"
                return 0
            fi
        done
    fi

    echo ""
    return 0
}
|
|
269
|
-
|
|
270
|
-
# codeharness: Check if all stories in sprint-status.yaml are done.
# Reads "key: value" lines of SPRINT_STATUS_FILE whose key matches the
# N-N-slug story pattern (e.g. 5-1-ralph-loop-integration).
# Returns 0 when there is at least one story and every story is either "done"
# or listed in FLAGGED_STORIES_FILE; returns 1 otherwise (including when the
# status file is missing or contains no story entries).
check_sprint_complete() {
    if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
        return 1
    fi

    local total=0
    local done_count=0
    local flagged_count=0
    local key value entry

    # Flagged story keys, newline-delimited with a sentinel newline on both
    # ends so membership can be tested with one literal pattern match.
    local flagged_list=$'\n'
    if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
        while IFS= read -r entry || [[ -n "$entry" ]]; do
            entry="${entry#"${entry%%[![:space:]]*}"}"
            entry="${entry%"${entry##*[![:space:]]}"}"
            if [[ -n "$entry" ]]; then
                flagged_list+="${entry}"$'\n'
            fi
        done < "$FLAGGED_STORIES_FILE"
    fi

    # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline;
    # dropping it could make an unfinished sprint look complete.
    while IFS=: read -r key value || [[ -n "$key" ]]; do
        # Trim whitespace without forking sed for every line
        key="${key#"${key%%[![:space:]]*}"}"
        key="${key%"${key##*[![:space:]]}"}"
        value="${value#"${value%%[![:space:]]*}"}"
        value="${value%"${value##*[![:space:]]}"}"

        # Skip comments and empty lines
        [[ -z "$key" || "$key" == \#* ]] && continue

        # Match story keys: N-N-slug
        if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
            total=$((total + 1))
            if [[ "$value" == "done" ]]; then
                done_count=$((done_count + 1))
            elif [[ "$flagged_list" == *$'\n'"$key"$'\n'* ]]; then
                # Retry-exhausted/flagged stories count as "effectively done"
                # — no autonomous work can be done on them
                flagged_count=$((flagged_count + 1))
            fi
        fi
    done < "$SPRINT_STATUS_FILE"

    if [[ $total -eq 0 ]]; then
        return 1
    fi

    # Sprint is complete if all stories are either done or flagged
    [[ $((done_count + flagged_count)) -eq $total ]]
}
|
|
316
|
-
|
|
317
|
-
# codeharness: completion check kept under its all_tasks_complete() name;
# it delegates to check_sprint_complete and passes that function's exit
# status through unchanged.
all_tasks_complete() {
    local rc=0
    check_sprint_complete || rc=$?
    return "$rc"
}
|
|
321
|
-
|
|
322
|
-
# codeharness: Get story counts from sprint-status.yaml.
# Prints "total completed" (space-separated) for the N-N-slug story entries
# in SPRINT_STATUS_FILE; a story counts as completed when its value is "done"
# or "failed". Prints "0 0" when the file does not exist.
get_task_counts() {
    if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
        echo "0 0"
        return
    fi

    local total=0
    local completed=0
    local key value

    # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline
    while IFS=: read -r key value || [[ -n "$key" ]]; do
        # Trim whitespace without forking sed for every line
        key="${key#"${key%%[![:space:]]*}"}"
        key="${key%"${key##*[![:space:]]}"}"
        value="${value#"${value%%[![:space:]]*}"}"
        value="${value%"${value##*[![:space:]]}"}"

        # Skip comments and empty lines
        [[ -z "$key" || "$key" == \#* ]] && continue

        if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
            total=$((total + 1))
            if [[ "$value" == "done" || "$value" == "failed" ]]; then
                completed=$((completed + 1))
            fi
        fi
    done < "$SPRINT_STATUS_FILE"

    echo "$total $completed"
}
|
|
349
|
-
|
|
350
|
-
# ─── Retry Tracking ─────────────────────────────────────────────────────────
|
|
351
|
-
|
|
352
|
-
# Increment the retry count for a story and print the new count.
#
# Arguments:
#   $1 - story key
#
# STORY_RETRY_FILE holds one "key count" pair per line. The file is rewritten
# through a temp file that is then renamed over it. Prints "0" and changes
# nothing when STORY_RETRY_FILE is empty.
increment_story_retry() {
    local story_key=$1

    if [[ -z "$STORY_RETRY_FILE" ]]; then
        echo "0"
        return
    fi

    local count=0
    local temp_file="${STORY_RETRY_FILE}.tmp"
    local key val i
    local found=false
    local -a keys=()
    local -a vals=()

    # Single read pass. "|| [[ -n ... ]]" keeps a last entry that lacks a
    # trailing newline, so it is not silently dropped on rewrite.
    if [[ -f "$STORY_RETRY_FILE" ]]; then
        while IFS=' ' read -r key val || [[ -n "$key" ]]; do
            [[ -z "$key" ]] && continue
            keys+=("$key")
            vals+=("$val")
            if [[ "$key" == "$story_key" && "$val" =~ ^[0-9]+$ ]]; then
                # 10# keeps zero-padded values such as "08" out of octal parsing
                count=$((10#$val))
            fi
        done < "$STORY_RETRY_FILE"
    fi

    count=$((count + 1))

    # Clean up stale temp file from any previous crash
    rm -f "$temp_file" 2>/dev/null

    # Rewrite every entry in its original order; the story's line is replaced
    # in place (duplicates collapse into the first one) or appended if new.
    if {
        for i in "${!keys[@]}"; do
            if [[ "${keys[$i]}" != "$story_key" ]]; then
                printf '%s %s\n' "${keys[$i]}" "${vals[$i]}"
            elif [[ "$found" == "false" ]]; then
                printf '%s %s\n' "$story_key" "$count"
                found=true
            fi
        done
        if [[ "$found" == "false" ]]; then
            printf '%s %s\n' "$story_key" "$count"
        fi
    } > "$temp_file"; then
        mv -f "$temp_file" "$STORY_RETRY_FILE"
    else
        rm -f "$temp_file" 2>/dev/null
    fi

    echo "$count"
}
|
|
400
|
-
|
|
401
|
-
# Print the current retry count for a story (0 if not tracked).
#
# Arguments:
#   $1 - story key
#
# Reads "key count" lines from STORY_RETRY_FILE and reports the first entry
# whose key matches. Prints 0 when the file is unset or missing, the key is
# absent, or the stored value is not a plain non-negative integer.
get_story_retry_count() {
    local story_key=$1

    if [[ -z "$STORY_RETRY_FILE" || ! -f "$STORY_RETRY_FILE" ]]; then
        echo "0"
        return
    fi

    local key val
    # "|| [[ -n ... ]]" keeps a last entry that lacks a trailing newline
    while IFS=' ' read -r key val || [[ -n "$key" ]]; do
        if [[ "$key" == "$story_key" ]]; then
            if [[ "$val" =~ ^[0-9]+$ ]]; then
                # 10# keeps zero-padded values such as "08" out of octal parsing
                echo "$((10#$val))"
            else
                echo "0"
            fi
            return
        fi
    done < "$STORY_RETRY_FILE"

    echo "0"
}
|
|
419
|
-
|
|
420
|
-
# Check whether a story key is listed in FLAGGED_STORIES_FILE.
#
# Arguments:
#   $1 - story key
#
# Returns 0 when a whole line of the file equals the key, non-zero otherwise
# (including when the file is unset or missing).
is_story_flagged() {
    local story_key=$1

    if [[ -z "$FLAGGED_STORIES_FILE" || ! -f "$FLAGGED_STORIES_FILE" ]]; then
        return 1
    fi

    # -F: compare the key as a literal string rather than a regex (a "." in a
    # key must not match any character); "--" guards keys starting with "-".
    grep -qxF -- "$story_key" "$FLAGGED_STORIES_FILE" 2>/dev/null
}
|
|
430
|
-
|
|
431
|
-
# Flag a story that exceeded its retry limit.
#
# Arguments:
#   $1 - story key
#
# Appends the key to FLAGGED_STORIES_FILE (once) and rewrites the story's
# "key: value" line in SPRINT_STATUS_FILE to "key: failed". Does nothing when
# FLAGGED_STORIES_FILE is empty.
flag_story() {
    local story_key=$1

    if [[ -z "$FLAGGED_STORIES_FILE" ]]; then
        return
    fi

    if ! is_story_flagged "$story_key"; then
        echo "$story_key" >> "$FLAGGED_STORIES_FILE"
    fi

    # Also update sprint-status.yaml to 'failed' so reconciliation picks it up
    # and sprint-state.json stays consistent (prevents flagged stories stuck at 'review')
    local sprint_yaml="${SPRINT_STATUS_FILE:-}"
    if [[ -n "$sprint_yaml" && -f "$sprint_yaml" ]]; then
        # Escape regex metacharacters so the key is matched literally
        local key_re
        key_re=$(printf '%s' "$story_key" | sed 's|[][\\.*^$/]|\\&|g')

        # The leading indentation and "key:" are captured and re-emitted, so
        # the line keeps whatever indentation the file uses. Best effort:
        # failures are deliberately ignored, as flagging itself already
        # succeeded above.
        sed -i.bak "s/^\([[:space:]]*${key_re}:\)[[:space:]].*/\1 failed/" "$sprint_yaml" 2>/dev/null
        rm -f "${sprint_yaml}.bak" 2>/dev/null
    fi
}
|
|
451
|
-
|
|
452
|
-
# Print the flagged story keys, one per line, exactly as stored in
# FLAGGED_STORIES_FILE. Prints a single empty line when that file is unset or
# does not exist.
get_flagged_stories() {
    if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
        cat "$FLAGGED_STORIES_FILE"
        return
    fi

    echo ""
}
|
|
460
|
-
|
|
461
|
-
# Snapshot sprint-status.yaml story statuses as "key:status" lines.
# Only entries of SPRINT_STATUS_FILE whose key matches the N-N-slug story
# pattern are emitted; keys and values are whitespace-trimmed.
# Prints a single empty line when the file does not exist.
snapshot_story_statuses() {
    if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
        echo ""
        return
    fi

    local key value

    # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline, so
    # the last story is not missing from the snapshot.
    while IFS=: read -r key value || [[ -n "$key" ]]; do
        # Trim whitespace without forking sed for every line
        key="${key#"${key%%[![:space:]]*}"}"
        key="${key%"${key##*[![:space:]]}"}"
        value="${value#"${value%%[![:space:]]*}"}"
        value="${value%"${value##*[![:space:]]}"}"

        # Skip comments and empty lines
        [[ -z "$key" || "$key" == \#* ]] && continue

        if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
            echo "$key:$value"
        fi
    done < "$SPRINT_STATUS_FILE"
}
|
|
477
|
-
|
|
478
|
-
# Diff two "key:status" snapshots (as produced by snapshot_story_statuses).
#
# Arguments:
#   $1 - snapshot taken before the iteration
#   $2 - snapshot taken after the iteration
#
# Results are left in two globals, each a list of keys with a newline after
# every key:
#   CHANGED_STORIES   - stories that are "done" now but were not "done" before
#   UNCHANGED_STORIES - stories that are not "done" now
detect_story_changes() {
    local before_snapshot=$1
    local after_snapshot=$2

    local nl=$'\n'
    # Sentinel newlines on both ends let one substring search locate a
    # "key:" prefix at the start of any line of the before snapshot.
    local haystack="${nl}${before_snapshot}${nl}"
    local before_status needle remainder

    CHANGED_STORIES=""
    UNCHANGED_STORIES=""

    while IFS=: read -r key status; do
        if [[ -z "$key" ]]; then
            continue
        fi

        # Status of the first before-snapshot line carrying this key
        before_status=""
        needle="${nl}${key}:"
        if [[ "$haystack" == *"$needle"* ]]; then
            remainder=${haystack#*"$needle"}
            before_status=${remainder%%"$nl"*}
        fi

        if [[ "$status" != "done" ]]; then
            UNCHANGED_STORIES+="${key}${nl}"
        elif [[ "$before_status" != "done" ]]; then
            CHANGED_STORIES+="${key}${nl}"
        fi
    done <<< "$after_snapshot"
}
|
|
508
|
-
|
|
509
|
-
# ─── Sprint State Progress Polling ─────────────────────────────────────────
|
|
510
|
-
|
|
511
|
-
# Previous state tracking for change detection
PREV_STORY=""
PREV_PHASE=""
PREV_AC_PROGRESS=""
PREV_LAST_ACTION=""

# Poll sprint-state.json (in the current directory) for progress changes
# during background execution and log an INFO line when the story, phase,
# acceptance-criteria progress or last action differs from the previous poll.
# The values seen are remembered in the PREV_* globals.
# Always returns 0; a missing or unreadable state file is silently ignored.
poll_sprint_state_progress() {
    local state_file="sprint-state.json"
    [[ -f "$state_file" ]] || return 0

    # Single jq call to extract all fields (avoids 4 process spawns per poll
    # cycle). Fields are joined with the ASCII unit separator rather than a
    # tab: tab is IFS *whitespace*, so `read` would collapse adjacent tabs
    # and an empty field (e.g. no phase yet) would shift later fields left.
    local raw
    raw=$(jq -r '[.run.currentStory // "", .run.currentPhase // "", .run.lastAction // "", .run.acProgress // ""] | join("\u001f")' "$state_file" 2>/dev/null) || return 0
    [[ -n "$raw" ]] || return 0

    local cur_story cur_phase cur_action cur_ac
    IFS=$'\037' read -r cur_story cur_phase cur_action cur_ac <<< "$raw"

    # Nothing to report if no story is active
    [[ -z "$cur_story" ]] && return 0

    # Detect changes and print structured updates
    if [[ "$cur_story" != "$PREV_STORY" || "$cur_phase" != "$PREV_PHASE" ]]; then
        if [[ -n "$cur_action" && "$cur_action" != "null" ]]; then
            log_status "INFO" "Story ${cur_story}: ${cur_phase} (${cur_action})"
        else
            log_status "INFO" "Story ${cur_story}: ${cur_phase}"
        fi
    elif [[ "$cur_ac" != "$PREV_AC_PROGRESS" && -n "$cur_ac" && "$cur_ac" != "null" ]]; then
        log_status "INFO" "Story ${cur_story}: verify (AC ${cur_ac})"
    elif [[ "$cur_action" != "$PREV_LAST_ACTION" && -n "$cur_action" && "$cur_action" != "null" ]]; then
        log_status "INFO" "Story ${cur_story}: ${cur_phase} (${cur_action})"
    fi

    PREV_STORY="$cur_story"
    PREV_PHASE="$cur_phase"
    PREV_AC_PROGRESS="$cur_ac"
    PREV_LAST_ACTION="$cur_action"
}
|
|
552
|
-
|
|
553
|
-
# Forget everything remembered by the previous poll, so the next
# poll_sprint_state_progress call compares against empty values.
reset_poll_state() {
    PREV_STORY="" PREV_PHASE="" PREV_AC_PROGRESS="" PREV_LAST_ACTION=""
}
|
|
560
|
-
|
|
561
|
-
# ─── Progress Summary ───────────────────────────────────────────────────────
|
|
562
|
-
|
|
563
|
-
# Log a progress summary through log_status:
#   - one INFO line with done/total/remaining counts (from get_task_counts),
#     iteration count, elapsed time and, when non-zero, the cost read from
#     sprint-state.json
#   - one SUCCESS line per story marked "done" in SPRINT_STATUS_FILE
#   - one ERROR line per entry of .run.failed in sprint-state.json
#   - one WARN line per entry of FLAGGED_STORIES_FILE
#   - an INFO "Next up" line for the first story that is neither done nor
#     flagged (per is_story_flagged)
print_progress_summary() {
    local counts
    counts=$(get_task_counts)
    local total=${counts%% *}
    local completed=${counts##* }
    local remaining=$((total - completed))

    # loop_start_time is empty until the loop has started; treat that as zero
    # elapsed instead of failing the arithmetic.
    local elapsed=0
    if [[ -n "$loop_start_time" ]]; then
        elapsed=$(( $(date +%s) - loop_start_time ))
    fi

    local elapsed_fmt
    if [[ $elapsed -ge 3600 ]]; then
        elapsed_fmt="$((elapsed / 3600))h$((elapsed % 3600 / 60))m"
    elif [[ $elapsed -ge 60 ]]; then
        elapsed_fmt="$((elapsed / 60))m$((elapsed % 60))s"
    else
        elapsed_fmt="${elapsed}s"
    fi

    # Read cost and failed stories from sprint-state.json (single jq call):
    # first output line is the cost, the remaining lines are failed keys.
    local cost=""
    local cost_fmt=""
    local failed_stories=""
    if [[ -f "sprint-state.json" ]]; then
        local state_data
        state_data=$(jq -r '(.run.cost // 0 | tostring) + "\n" + ((.run.failed // []) | join("\n"))' "sprint-state.json" 2>/dev/null) || state_data=""
        if [[ -n "$state_data" ]]; then
            cost=$(head -1 <<< "$state_data")
            failed_stories=$(tail -n +2 <<< "$state_data")
            if [[ -n "$cost" && "$cost" != "0" && "$cost" != "null" ]]; then
                cost_fmt=", cost: \$${cost}"
            fi
        fi
    fi

    log_status "INFO" "Progress: ${completed}/${total} done, ${remaining} remaining (iterations: ${loop_count}, elapsed: ${elapsed_fmt}${cost_fmt})"

    # One pass over the sprint status file collects both the done stories and
    # the next story in line. "|| [[ -n ... ]]" keeps a final line that lacks
    # a trailing newline.
    local -a done_stories=()
    local next_story=""
    local key value
    if [[ -f "$SPRINT_STATUS_FILE" ]]; then
        while IFS=: read -r key value || [[ -n "$key" ]]; do
            key="${key#"${key%%[![:space:]]*}"}"
            key="${key%"${key##*[![:space:]]}"}"
            value="${value#"${value%%[![:space:]]*}"}"
            value="${value%"${value##*[![:space:]]}"}"

            [[ -z "$key" || "$key" == \#* ]] && continue
            [[ "$key" =~ ^[0-9]+-[0-9]+- ]] || continue

            if [[ "$value" == "done" ]]; then
                done_stories+=("$key")
            elif [[ -z "$next_story" ]] && ! is_story_flagged "$key"; then
                next_story="$key ($value)"
            fi
        done < "$SPRINT_STATUS_FILE"
    fi

    # Show completed stories with ✓
    for key in "${done_stories[@]}"; do
        log_status "SUCCESS" " ✓ ${key}"
    done

    # Show failed stories with ✗ from sprint-state.json
    local fkey
    if [[ -n "$failed_stories" ]]; then
        while IFS= read -r fkey; do
            [[ -z "$fkey" ]] && continue
            log_status "ERROR" " ✗ ${fkey}"
        done <<< "$failed_stories"
    fi

    # Show flagged/blocked stories with ✕
    local bkey
    if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
        while IFS= read -r bkey || [[ -n "$bkey" ]]; do
            [[ -z "$bkey" ]] && continue
            log_status "WARN" " ✕ ${bkey} (blocked)"
        done < "$FLAGGED_STORIES_FILE"
    fi

    # Show the next story in line (first non-done, non-flagged)
    if [[ -n "$next_story" ]]; then
        log_status "INFO" "Next up: ${next_story}"
    fi
}
|
|
645
|
-
|
|
646
|
-
# ─── Iteration Insights ──────────────────────────────────────────────────────
|
|
647
|
-
|
|
648
|
-
# Print end-of-iteration insights taken from files under
# "<cwd>/_bmad-output/implementation-artifacts":
#   - .session-issues.md: when it has "### " entries, a header with the entry
#     count followed by the first 15 lines of the last entry
#   - session-retro-<today>.md: the "## Action items" section (at most 20
#     lines) when present, otherwise the first 10 lines of the file
# Section headers go through log_status; file excerpts are written to stdout.
print_iteration_insights() {
    local project_root
    project_root="$(pwd)"
    local issues_file="$project_root/_bmad-output/implementation-artifacts/.session-issues.md"
    local today
    today=$(date +%Y-%m-%d)
    local retro_file="$project_root/_bmad-output/implementation-artifacts/session-retro-${today}.md"

    # Show session issues (most recent subagent block)
    if [[ -f "$issues_file" ]]; then
        local issue_count
        # grep -c already prints "0" and exits non-zero when nothing matches,
        # so the fallback must replace the value, not append a second "0".
        issue_count=$(grep -c '^### ' "$issues_file" 2>/dev/null) || issue_count=0
        [[ "$issue_count" =~ ^[0-9]+$ ]] || issue_count=0

        if [[ $issue_count -gt 0 ]]; then
            echo ""
            log_status "INFO" "━━━ Session Issues ($issue_count entries) ━━━"
            # Print the last subagent's issues block
            awk '/^### /{block=""} {block=block $0 "\n"} END{printf "%s", block}' "$issues_file" | head -15
            echo ""
        fi
    fi

    # Show retro summary if generated
    if [[ -f "$retro_file" ]]; then
        log_status "INFO" "━━━ Session Retro ━━━"
        # Print action items section if present, otherwise first 10 lines.
        # The check uses the same anchored pattern as the sed range below, so
        # the branch is only taken when sed will actually print something.
        if grep -Eq '^## Action [Ii]tems' "$retro_file" 2>/dev/null; then
            sed -n '/^## Action [Ii]tems/,/^## /p' "$retro_file" | head -20
        else
            head -10 "$retro_file"
        fi
        echo ""
    fi
}
|
|
681
|
-
|
|
682
|
-
# ─── Driver Management ──────────────────────────────────────────────────────
|
|
683
|
-
|
|
684
|
-
# Source the driver script "$SCRIPT_DIR/drivers/$PLATFORM_DRIVER.sh" and run
# its driver_valid_tools hook. When CLAUDE_ALLOWED_TOOLS is empty and the
# VALID_TOOL_PATTERNS array is not, CLAUDE_ALLOWED_TOOLS becomes the
# comma-joined patterns. Exits the script with status 1 if the driver file
# does not exist.
load_platform_driver() {
    local driver_file="$SCRIPT_DIR/drivers/${PLATFORM_DRIVER}.sh"

    [[ -f "$driver_file" ]] || {
        log_status "ERROR" "Platform driver not found: $driver_file"
        exit 1
    }

    # shellcheck source=/dev/null
    source "$driver_file"

    driver_valid_tools

    # Auto-populate CLAUDE_ALLOWED_TOOLS from driver's valid tool patterns
    # so Ralph runs autonomously without permission prompts
    if [[ -z "$CLAUDE_ALLOWED_TOOLS" ]] && (( ${#VALID_TOOL_PATTERNS[@]} > 0 )); then
        local joined_patterns
        printf -v joined_patterns '%s,' "${VALID_TOOL_PATTERNS[@]}"
        CLAUDE_ALLOWED_TOOLS=${joined_patterns%,}
    fi

    log_status "DEBUG" "Platform driver: $(driver_display_name) ($(driver_cli_binary))"
}
|
|
704
|
-
|
|
705
|
-
# ─── Execution ───────────────────────────────────────────────────────────────
|
|
706
|
-
|
|
707
|
-
execute_iteration() {
|
|
708
|
-
local iteration=$1
|
|
709
|
-
local task_id=$2
|
|
710
|
-
local timestamp=$(date '+%Y-%m-%d_%H-%M-%S')
|
|
711
|
-
local output_file="$LOG_DIR/claude_output_${timestamp}.log"
|
|
712
|
-
local calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
|
|
713
|
-
calls_made=$((calls_made + 1))
|
|
714
|
-
|
|
715
|
-
# Capture git HEAD SHA at iteration start for progress detection
|
|
716
|
-
local loop_start_sha=""
|
|
717
|
-
if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null 2>&1; then
|
|
718
|
-
loop_start_sha=$(git rev-parse HEAD 2>/dev/null || echo "")
|
|
719
|
-
fi
|
|
720
|
-
|
|
721
|
-
# Snapshot sprint-state.json before iteration (for timeout delta capture)
|
|
722
|
-
local state_snapshot_path="ralph/.state-snapshot.json"
|
|
723
|
-
if [[ -f "sprint-state.json" ]]; then
|
|
724
|
-
cp "sprint-state.json" "$state_snapshot_path" 2>/dev/null || true
|
|
725
|
-
fi
|
|
726
|
-
|
|
727
|
-
log_status "LOOP" "Iteration $iteration — Task: ${task_id:-'(reading from prompt)'}"
|
|
728
|
-
local timeout_seconds=$((ITERATION_TIMEOUT_MINUTES * 60))
|
|
729
|
-
|
|
730
|
-
# Build loop context — pass time budget so the session can prioritize retro
|
|
731
|
-
local start_time
|
|
732
|
-
start_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
|
733
|
-
local loop_context="Loop #${iteration}. Time budget: ${ITERATION_TIMEOUT_MINUTES} minutes (started: ${start_time}). Reserve the last 5 minutes for Step 8 (session retrospective) — do not start new story work if less than 10 minutes remain."
|
|
734
|
-
if [[ -n "$task_id" ]]; then
|
|
735
|
-
loop_context+=" Current task: $task_id."
|
|
736
|
-
fi
|
|
737
|
-
|
|
738
|
-
# Build the command via driver
|
|
739
|
-
local session_id="" # Fresh context per iteration
|
|
740
|
-
if ! driver_build_command "$PROMPT_FILE" "$loop_context" "$session_id" "$PLUGIN_DIR"; then
|
|
741
|
-
log_status "ERROR" "Failed to build CLI command"
|
|
742
|
-
return 1
|
|
743
|
-
fi
|
|
744
|
-
|
|
745
|
-
# Write deadline file for time-warning hook
|
|
746
|
-
local deadline=$(( $(date +%s) + timeout_seconds ))
|
|
747
|
-
echo "$deadline" > "ralph/.iteration_deadline"
|
|
748
|
-
|
|
749
|
-
# DEBUG: log command (truncate prompt content to avoid dumping entire prompt to terminal)
|
|
750
|
-
local cmd_summary="${CLAUDE_CMD_ARGS[*]}"
|
|
751
|
-
if [[ ${#cmd_summary} -gt 200 ]]; then
|
|
752
|
-
cmd_summary="${cmd_summary:0:200}... (truncated)"
|
|
753
|
-
fi
|
|
754
|
-
log_status "DEBUG" "Command: $cmd_summary"
|
|
755
|
-
log_status "DEBUG" "Output file: $output_file"
|
|
756
|
-
log_status "DEBUG" "LIVE_OUTPUT=$LIVE_OUTPUT, timeout=${timeout_seconds}s"
|
|
757
|
-
|
|
758
|
-
log_status "INFO" "Starting $(driver_display_name) (timeout: ${ITERATION_TIMEOUT_MINUTES}m)..."
|
|
759
|
-
|
|
760
|
-
# Execute with timeout
|
|
761
|
-
local exit_code=0
|
|
762
|
-
|
|
763
|
-
if [[ "$LIVE_OUTPUT" == "true" ]]; then
|
|
764
|
-
# Live streaming mode
|
|
765
|
-
echo -e "\n=== Iteration #$iteration — $(date '+%Y-%m-%d %H:%M:%S') ===" > "$LIVE_LOG_FILE"
|
|
766
|
-
echo -e "${PURPLE}━━━━━━━━━━━━━ $(driver_display_name) Output ━━━━━━━━━━━━━${NC}"
|
|
767
|
-
|
|
768
|
-
set -o pipefail
|
|
769
|
-
portable_timeout ${timeout_seconds}s "${CLAUDE_CMD_ARGS[@]}" \
|
|
770
|
-
< /dev/null 2>&1 | tee "$output_file" | tee "$LIVE_LOG_FILE"
|
|
771
|
-
exit_code=${PIPESTATUS[0]}
|
|
772
|
-
set +o pipefail
|
|
773
|
-
|
|
774
|
-
echo -e "${PURPLE}━━━━━━━━━━━━━ End of Output ━━━━━━━━━━━━━━━━━━━${NC}"
|
|
775
|
-
else
|
|
776
|
-
# Background mode with progress monitoring
|
|
777
|
-
portable_timeout ${timeout_seconds}s "${CLAUDE_CMD_ARGS[@]}" \
|
|
778
|
-
< /dev/null > "$output_file" 2>&1 &
|
|
779
|
-
|
|
780
|
-
local claude_pid=$!
|
|
781
|
-
local progress_counter=0
|
|
782
|
-
|
|
783
|
-
log_status "DEBUG" "Background PID: $claude_pid"
|
|
784
|
-
|
|
785
|
-
reset_poll_state
|
|
786
|
-
while kill -0 $claude_pid 2>/dev/null; do
|
|
787
|
-
progress_counter=$((progress_counter + 1))
|
|
788
|
-
if [[ -f "$output_file" && -s "$output_file" ]]; then
|
|
789
|
-
cp "$output_file" "$LIVE_LOG_FILE" 2>/dev/null
|
|
790
|
-
fi
|
|
791
|
-
poll_sprint_state_progress
|
|
792
|
-
sleep 10
|
|
793
|
-
done
|
|
794
|
-
|
|
795
|
-
wait $claude_pid
|
|
796
|
-
exit_code=$?
|
|
797
|
-
log_status "DEBUG" "Claude exited with code: $exit_code, output size: $(wc -c < "$output_file" 2>/dev/null || echo 0) bytes"
|
|
798
|
-
|
|
799
|
-
# If output is empty and exit code is non-zero, log diagnostic info
|
|
800
|
-
if [[ ! -s "$output_file" && $exit_code -ne 0 ]]; then
|
|
801
|
-
log_status "ERROR" "Claude produced no output and exited with code $exit_code"
|
|
802
|
-
log_status "DEBUG" "Checking if claude binary is responsive..."
|
|
803
|
-
if claude --version > /dev/null 2>&1; then
|
|
804
|
-
log_status "DEBUG" "claude binary OK: $(claude --version 2>&1)"
|
|
805
|
-
else
|
|
806
|
-
log_status "ERROR" "claude binary not responding"
|
|
807
|
-
fi
|
|
808
|
-
fi
|
|
809
|
-
fi
|
|
810
|
-
|
|
811
|
-
if [[ $exit_code -eq 0 ]]; then
|
|
812
|
-
echo "$calls_made" > "$CALL_COUNT_FILE"
|
|
813
|
-
log_status "SUCCESS" "$(driver_display_name) iteration completed successfully"
|
|
814
|
-
|
|
815
|
-
# Detect progress: check for file changes (committed or uncommitted)
|
|
816
|
-
local files_changed=0
|
|
817
|
-
if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null 2>&1; then
|
|
818
|
-
local current_sha=$(git rev-parse HEAD 2>/dev/null || echo "")
|
|
819
|
-
|
|
820
|
-
if [[ -n "$loop_start_sha" && -n "$current_sha" && "$loop_start_sha" != "$current_sha" ]]; then
|
|
821
|
-
files_changed=$(
|
|
822
|
-
{
|
|
823
|
-
git diff --name-only "$loop_start_sha" "$current_sha" 2>/dev/null
|
|
824
|
-
git diff --name-only HEAD 2>/dev/null
|
|
825
|
-
git diff --name-only --cached 2>/dev/null
|
|
826
|
-
} | sort -u | wc -l
|
|
827
|
-
)
|
|
828
|
-
else
|
|
829
|
-
files_changed=$(
|
|
830
|
-
{
|
|
831
|
-
git diff --name-only 2>/dev/null
|
|
832
|
-
git diff --name-only --cached 2>/dev/null
|
|
833
|
-
} | sort -u | wc -l
|
|
834
|
-
)
|
|
835
|
-
fi
|
|
836
|
-
fi
|
|
837
|
-
|
|
838
|
-
# If harness-run reported NO_WORK, don't count file changes as progress.
|
|
839
|
-
# Writing session-issues.md with "NO_WORK" creates git diffs but is NOT real progress.
|
|
840
|
-
# IMPORTANT: Only check non-JSON lines. The prompt text is echoed inside JSON objects
|
|
841
|
-
# and contains these strings as instructions — those are false positives.
|
|
842
|
-
if grep -v '^[[:space:]]*{' "$output_file" 2>/dev/null | grep -qE 'Result: NO_WORK'; then
|
|
843
|
-
files_changed=0
|
|
844
|
-
log_status "INFO" "NO_WORK detected — overriding files_changed to 0 for circuit breaker"
|
|
845
|
-
fi
|
|
846
|
-
|
|
847
|
-
local has_errors="false"
|
|
848
|
-
# Only check non-JSON lines for errors. Stream-json output is NDJSON
|
|
849
|
-
# (one JSON object per line), so any line starting with '{' is Claude
|
|
850
|
-
# content — which naturally contains words like "error" and "Exception"
|
|
851
|
-
# in code reviews, test output, and discussion. Grepping those produces
|
|
852
|
-
# false positives that trip the circuit breaker.
|
|
853
|
-
if grep -v '^[[:space:]]*{' "$output_file" 2>/dev/null | \
|
|
854
|
-
grep -qE '(^Error:|^ERROR:|^error:|\]: error|Error occurred|failed with error|[Ee]xception|Fatal|FATAL)'; then
|
|
855
|
-
has_errors="true"
|
|
856
|
-
log_status "WARN" "Errors detected in output"
|
|
857
|
-
fi
|
|
858
|
-
|
|
859
|
-
local output_length=$(wc -c < "$output_file" 2>/dev/null || echo 0)
|
|
860
|
-
|
|
861
|
-
# Record in circuit breaker
|
|
862
|
-
record_loop_result "$iteration" "$files_changed" "$has_errors" "$output_length"
|
|
863
|
-
local circuit_result=$?
|
|
864
|
-
|
|
865
|
-
if [[ $circuit_result -ne 0 ]]; then
|
|
866
|
-
log_status "WARN" "Circuit breaker opened — halting execution"
|
|
867
|
-
return 3
|
|
868
|
-
fi
|
|
869
|
-
|
|
870
|
-
return 0
|
|
871
|
-
elif [[ $exit_code -eq 124 ]]; then
|
|
872
|
-
log_status "WARN" "Iteration timed out after ${ITERATION_TIMEOUT_MINUTES}m"
|
|
873
|
-
|
|
874
|
-
# Capture timeout report
|
|
875
|
-
if command -v npx &>/dev/null; then
|
|
876
|
-
log_status "INFO" "Capturing timeout report..."
|
|
877
|
-
npx codeharness timeout-report \
|
|
878
|
-
--story "${task_id:-unknown}" \
|
|
879
|
-
--iteration "$iteration" \
|
|
880
|
-
--duration "$ITERATION_TIMEOUT_MINUTES" \
|
|
881
|
-
--output-file "$output_file" \
|
|
882
|
-
--state-snapshot "$state_snapshot_path" 2>/dev/null && \
|
|
883
|
-
log_status "INFO" "Timeout report saved" || \
|
|
884
|
-
log_status "WARN" "Failed to capture timeout report"
|
|
885
|
-
fi
|
|
886
|
-
|
|
887
|
-
# Verify report file exists with non-zero content
|
|
888
|
-
local report_file="ralph/logs/timeout-report-${iteration}-${task_id:-unknown}.md"
|
|
889
|
-
if [[ -s "$report_file" ]]; then
|
|
890
|
-
log_status "INFO" "Timeout report verified: $report_file"
|
|
891
|
-
fi
|
|
892
|
-
|
|
893
|
-
return 1
|
|
894
|
-
else
|
|
895
|
-
# Check for API limit
|
|
896
|
-
if grep -qi "5.*hour.*limit\|limit.*reached.*try.*back\|usage.*limit.*reached" "$output_file" 2>/dev/null; then
|
|
897
|
-
log_status "ERROR" "Claude API usage limit reached"
|
|
898
|
-
return 2
|
|
899
|
-
# Check for transient API errors (500, 529, overloaded) — don't count against story
|
|
900
|
-
# Status code patterns exclude decimal prefixes (e.g., cost_usd=0.503 ≠ HTTP 503)
|
|
901
|
-
elif grep -qiE 'Internal server error|api_error|overloaded|(^|[^0-9.])529([^0-9]|$)|(^|[^0-9.])503([^0-9]|$)' "$output_file" 2>/dev/null; then
|
|
902
|
-
log_status "WARN" "Transient API error (not story's fault) — will retry"
|
|
903
|
-
return 4
|
|
904
|
-
else
|
|
905
|
-
log_status "ERROR" "$(driver_display_name) execution failed (exit code: $exit_code)"
|
|
906
|
-
return 1
|
|
907
|
-
fi
|
|
908
|
-
fi
|
|
909
|
-
}
|
|
910
|
-
|
|
911
|
-
# ─── Cleanup ─────────────────────────────────────────────────────────────────
|
|
912
|
-
|
|
913
|
-
# Signal handler for SIGINT/SIGTERM.
# Records the interrupted state through update_status, prints a short progress
# summary when the loop has started and a sprint status file is configured,
# and then exits with status 0.
# Globals read: loop_count, CALL_COUNT_FILE, loop_start_time, SPRINT_STATUS_FILE
cleanup() {
    log_status "INFO" "Ralph loop interrupted. Cleaning up..."

    # Fall back to "0" when the call counter file cannot be read.
    local calls_so_far
    calls_so_far="$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")"
    update_status "$loop_count" "$calls_so_far" "interrupted" "stopped" "user_cancelled"

    # No summary is possible before the loop start time is recorded or
    # when no sprint status file is configured.
    if [[ -z "$loop_start_time" || -z "$SPRINT_STATUS_FILE" ]]; then
        exit 0
    fi

    # get_task_counts output is split on spaces: first field is the total,
    # last field is the completed count.
    local task_counts
    task_counts=$(get_task_counts)
    local total_stories=${task_counts%% *}
    local done_stories=${task_counts##* }

    local now
    now=$(date +%s)
    local elapsed_minutes=$(( (now - loop_start_time) / 60 ))

    log_status "INFO" " Iterations: $loop_count"
    log_status "INFO" " Stories completed: $done_stories/$total_stories"
    log_status "INFO" " Elapsed: ${elapsed_minutes}m"

    exit 0
}

# Run cleanup when the loop is interrupted or terminated.
trap cleanup INT TERM
|
|
935
|
-
|
|
936
|
-
# ─── Help ────────────────────────────────────────────────────────────────────
|
|
937
|
-
|
|
938
|
-
# Print the command-line usage text to stdout, one argument per output line.
show_help() {
    printf '%s\n' \
        'codeharness Ralph Loop — Autonomous execution with verification gates' \
        '' \
        'Usage: ralph/ralph.sh --plugin-dir DIR [OPTIONS]' \
        '' \
        'Required:' \
        '--plugin-dir DIR Path to codeharness plugin directory' \
        '' \
        'Options:' \
        '-h, --help Show this help message' \
        '--max-iterations NUM Maximum loop iterations (default: 50)' \
        '--max-story-retries NUM Max retries per story before flagging (default: 3)' \
        '--timeout SECONDS Total loop timeout in seconds (default: 14400 = 4h)' \
        '--iteration-timeout MIN Per-iteration timeout in minutes (default: 30)' \
        '--calls NUM Max API calls per hour (default: 100)' \
        '--prompt FILE Prompt file for each iteration' \
        '--progress FILE Progress file (tasks JSON)' \
        '--live Show live output streaming' \
        '--reset Clear retry counters, flagged stories, and circuit breaker before starting' \
        '--reset-circuit Reset circuit breaker and exit' \
        '--status Show current status and exit' \
        '' \
        'The loop:' \
        '1. Reads next task from progress file' \
        '2. Spawns fresh Claude Code instance with --plugin-dir' \
        '3. Agent implements story (harness hooks enforce verification)' \
        '4. Circuit breaker monitors for stagnation' \
        '5. On completion or gate failure, picks next task or iterates'
}
|
|
969
|
-
|
|
970
|
-
# ─── Sprint Summary ──────────────────────────────────────────────────────────
|
|
971
|
-
|
|
972
|
-
# Print a compact sprint summary at startup
|
|
973
|
-
# Print a compact one-line sprint summary at startup.
#
# Reads "<total> ... <completed>" from get_task_counts, then scans
# $SPRINT_STATUS_FILE (colon-separated "key: value" lines) for the first
# story key matching N-N-* whose value is not "done" and which
# is_story_flagged does not report as flagged.
#
# Globals read: SPRINT_STATUS_FILE
# Outputs:      one INFO line via log_status
print_sprint_summary() {
    local counts
    counts=$(get_task_counts)
    local total=${counts%% *}
    local completed=${counts##* }
    local remaining=$((total - completed))

    # Find next story
    local next_story=""
    local next_status=""
    # Keep the loop variables local so they do not leak into the caller's scope.
    local key value
    if [[ -f "$SPRINT_STATUS_FILE" ]]; then
        # "|| [[ -n ... ]]" keeps the final line when the file has no trailing
        # newline; plain "read" returns non-zero there and would drop it.
        while IFS=: read -r key value || [[ -n "$key$value" ]]; do
            # Trim leading/trailing whitespace without forking sed per line.
            key="${key#"${key%%[![:space:]]*}"}"
            key="${key%"${key##*[![:space:]]}"}"
            value="${value#"${value%%[![:space:]]*}"}"
            value="${value%"${value##*[![:space:]]}"}"

            # Skip blank lines and comments.
            [[ -z "$key" || "$key" == \#* ]] && continue

            if [[ "$key" =~ ^[0-9]+-[0-9]+- && "$value" != "done" ]]; then
                if ! is_story_flagged "$key"; then
                    next_story="$key"
                    next_status="$value"
                    break
                fi
            fi
        done < "$SPRINT_STATUS_FILE"
    fi

    if [[ -n "$next_story" ]]; then
        log_status "INFO" "Sprint: ${completed}/${total} done, ${remaining} remaining — next: ${next_story} (${next_status})"
    else
        log_status "INFO" "Sprint: ${completed}/${total} done, ${remaining} remaining"
    fi
}
|
|
1004
|
-
|
|
1005
|
-
# ─── Main ────────────────────────────────────────────────────────────────────
|
|
1006
|
-
|
|
1007
|
-
# Entry point of the Ralph loop.
#
# Validates and resolves PLUGIN_DIR, derives all state/log paths from the
# current working directory, checks dependencies (jq, the driver CLI),
# initialises the circuit breaker and call tracking, and then runs iterations
# until one of the stop conditions is hit: max iterations, total loop timeout,
# circuit breaker halt, all tasks complete, or too many consecutive failures.
#
# execute_iteration return codes handled here:
#   0 = success, 2 = API usage limit, 3 = circuit breaker opened,
#   4 = transient API error, anything else = failure (timeout or crash).
#
# Globals read:    PLUGIN_DIR, PROGRESS_FILE, PROMPT_FILE, RESET_RETRIES,
#                  MAX_ITERATIONS, LOOP_TIMEOUT_SECONDS, MAX_STORY_RETRIES,
#                  CB_STATE_FILE, UNCHANGED_STORIES, CHANGED_STORIES, loop_count
# Globals written: PLUGIN_DIR, HARNESS_STATE_DIR (exported), LOG_DIR,
#                  STATUS_FILE, LIVE_LOG_FILE, CALL_COUNT_FILE, TIMESTAMP_FILE,
#                  STORY_RETRY_FILE, FLAGGED_STORIES_FILE, PROGRESS_FILE,
#                  SPRINT_STATUS_FILE, PROMPT_FILE, loop_start_time, loop_count
# Exits with status 1 on missing/invalid plugin dir or missing dependencies.
main() {
    if [[ -z "$PLUGIN_DIR" ]]; then
        log_status "ERROR" "Missing required --plugin-dir argument"
        show_help
        exit 1
    fi

    # Resolve paths.
    # Resolve into a temporary first: assigning the command substitution
    # directly to PLUGIN_DIR would blank it on failure, and the error message
    # below would then report an empty path.
    local resolved_plugin_dir
    if ! resolved_plugin_dir="$(cd "$PLUGIN_DIR" 2>/dev/null && pwd)"; then
        log_status "ERROR" "Plugin directory does not exist: $PLUGIN_DIR"
        exit 1
    fi
    PLUGIN_DIR="$resolved_plugin_dir"

    # Derive state paths from project root (cwd)
    local project_root
    project_root="$(pwd)"

    HARNESS_STATE_DIR="${project_root}/.claude"
    LOG_DIR="${project_root}/ralph/logs"
    STATUS_FILE="${project_root}/ralph/status.json"
    LIVE_LOG_FILE="${project_root}/ralph/live.log"
    CALL_COUNT_FILE="${project_root}/ralph/.call_count"
    TIMESTAMP_FILE="${project_root}/ralph/.last_reset"
    STORY_RETRY_FILE="${project_root}/ralph/.story_retries"
    FLAGGED_STORIES_FILE="${project_root}/ralph/.flagged_stories"

    # Use progress file from argument or default (legacy, optional)
    PROGRESS_FILE="${PROGRESS_FILE:-${project_root}/ralph/progress.json}"

    # codeharness: Sprint status file is the primary task source
    SPRINT_STATUS_FILE="${project_root}/_bmad-output/implementation-artifacts/sprint-status.yaml"

    # Use prompt file from argument or default
    PROMPT_FILE="${PROMPT_FILE:-${project_root}/.ralph/PROMPT.md}"

    # Create directories
    mkdir -p "$LOG_DIR"

    # Check dependencies
    if ! command -v jq &>/dev/null; then
        log_status "ERROR" "Required dependency 'jq' is not installed"
        exit 1
    fi

    # Load platform driver
    load_platform_driver

    # Check CLI binary
    if ! driver_check_available; then
        log_status "ERROR" "$(driver_display_name) CLI not found: $(driver_cli_binary)"
        exit 1
    fi

    # Initialize circuit breaker
    export HARNESS_STATE_DIR
    init_circuit_breaker

    # Initialize rate limiting
    init_call_tracking

    # Crash recovery: detect if resuming from a previous run
    if [[ -f "$STATUS_FILE" ]]; then
        local prev_status
        prev_status=$(jq -r '.status // ""' "$STATUS_FILE" 2>/dev/null || echo "")
        if [[ -n "$prev_status" && "$prev_status" != "completed" ]]; then
            log_status "INFO" "Resuming from last completed story"
        fi
    fi

    # Reset retry state if --reset flag was passed
    if [[ "$RESET_RETRIES" == "true" ]]; then
        if [[ -f "$STORY_RETRY_FILE" ]]; then
            rm -f "$STORY_RETRY_FILE"
            log_status "INFO" "Cleared story retry counters"
        fi
        if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
            rm -f "$FLAGGED_STORIES_FILE"
            log_status "INFO" "Cleared flagged stories"
        fi
        reset_circuit_breaker "Reset via --reset flag"
        log_status "INFO" "Circuit breaker reset to CLOSED"
    fi

    # .story_retries and .flagged_stories are file-based — they persist automatically

    log_status "SUCCESS" "Ralph loop starting"
    log_status "DEBUG" "Plugin: $PLUGIN_DIR"
    log_status "DEBUG" "Max iterations: $MAX_ITERATIONS | Timeout: $((LOOP_TIMEOUT_SECONDS / 3600))h"
    log_status "DEBUG" "Prompt: $PROMPT_FILE"
    log_status "DEBUG" "Sprint status: $SPRINT_STATUS_FILE"
    log_status "DEBUG" "Max story retries: $MAX_STORY_RETRIES"

    # Record loop start time for timeout
    loop_start_time=$(date +%s)

    print_sprint_summary

    local consecutive_failures=0
    local max_consecutive_failures=3

    while true; do
        loop_count=$((loop_count + 1))

        # ── Check loop limits ──

        if [[ $loop_count -gt $MAX_ITERATIONS ]]; then
            update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "max_iterations" "stopped" "max_iterations_reached"

            local counts
            counts=$(get_task_counts)
            local total=${counts%% *}
            local completed=${counts##* }
            log_status "INFO" "Max iterations ($MAX_ITERATIONS) reached. ${completed}/${total} stories complete."
            break
        fi

        # Check total timeout
        local elapsed=$(( $(date +%s) - loop_start_time ))
        if [[ $elapsed -ge $LOOP_TIMEOUT_SECONDS ]]; then
            log_status "WARN" "Loop timeout reached (${LOOP_TIMEOUT_SECONDS}s)"
            update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "timeout" "stopped" "loop_timeout"
            break
        fi

        # ── Check circuit breaker ──

        if should_halt_execution; then
            # Auto-reset: if there are actionable stories (sprint not complete),
            # the breaker was tripped by a previous session's no-ops. Reset and retry.
            if ! all_tasks_complete; then
                log_status "INFO" "Circuit breaker open but actionable stories exist — auto-resetting"
                reset_circuit_breaker "Auto-reset: actionable stories detected"
            else
                local cb_no_progress=0
                if [[ -f "$CB_STATE_FILE" ]]; then
                    cb_no_progress=$(jq -r '.consecutive_no_progress // 0' "$CB_STATE_FILE" 2>/dev/null || echo "0")
                fi
                log_status "WARN" "Circuit breaker: no progress in ${cb_no_progress} iterations"
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "circuit_breaker" "halted" "stagnation_detected"
                break
            fi
        fi

        # ── Check rate limit ──

        if ! can_make_call; then
            wait_for_reset
            continue
        fi

        # ── Check task completion ──

        if all_tasks_complete; then
            local counts
            counts=$(get_task_counts)
            local total=${counts%% *}

            update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "all_complete" "completed" "all_tasks_done"
            log_status "SUCCESS" "All stories complete. ${total} stories verified in ${loop_count} iterations."
            break
        fi

        # ── Get current task ──

        local current_task
        current_task=$(get_current_task)

        log_status "LOOP" "=== Iteration #$loop_count ==="
        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "executing" "running"

        # ── Snapshot story statuses before iteration ──
        local before_snapshot
        before_snapshot=$(snapshot_story_statuses)

        # ── Execute ──

        execute_iteration "$loop_count" "$current_task"
        local exec_result=$?

        case $exec_result in
            0)
                consecutive_failures=0
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "completed" "success"

                # ── Retry tracking: compare sprint-status before/after ──
                local after_snapshot
                after_snapshot=$(snapshot_story_statuses)
                detect_story_changes "$before_snapshot" "$after_snapshot"

                # Only increment retry for the FIRST non-done, non-flagged story
                # (the one harness-run would have picked up). Other stories were
                # never attempted — don't penalise them for not progressing.
                if [[ -n "$UNCHANGED_STORIES" ]]; then
                    while IFS= read -r skey; do
                        [[ -z "$skey" ]] && continue
                        if is_story_flagged "$skey"; then
                            continue
                        fi
                        local retry_count
                        retry_count=$(increment_story_retry "$skey")
                        if [[ $retry_count -ge $MAX_STORY_RETRIES ]]; then
                            log_status "WARN" "Story ${skey} exceeded retry limit (${retry_count}) — flagging and moving on"
                            flag_story "$skey"
                        else
                            log_status "WARN" "Story ${skey} — retry ${retry_count}/${MAX_STORY_RETRIES}"
                        fi
                        break # only retry the first actionable story
                    done <<< "$UNCHANGED_STORIES"
                fi

                if [[ -n "$CHANGED_STORIES" ]]; then
                    while IFS= read -r skey; do
                        [[ -z "$skey" ]] && continue
                        # Extract story title from story file if available
                        local story_file="$project_root/_bmad-output/implementation-artifacts/${skey}.md"
                        local story_title=""
                        if [[ -f "$story_file" ]]; then
                            story_title=$(grep -m1 '^# \|^## Story' "$story_file" 2>/dev/null | sed 's/^#* *//' | head -c 60)
                        fi
                        local proof_file="$project_root/verification/${skey}-proof.md"
                        local proof_info=""
                        if [[ -f "$proof_file" ]]; then
                            proof_info=" [proof: verification/${skey}-proof.md]"
                        fi
                        if [[ -n "$story_title" ]]; then
                            log_status "SUCCESS" "Story ${skey}: DONE — ${story_title}${proof_info}"
                        else
                            log_status "SUCCESS" "Story ${skey}: DONE${proof_info}"
                        fi
                    done <<< "$CHANGED_STORIES"
                fi

                sleep 5 # Brief pause between iterations
                ;;
            2)
                # API limit — wait or exit
                log_status "WARN" "API usage limit reached. Waiting 60 minutes..."
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "api_limit" "paused"
                sleep 3600
                ;;
            3)
                # Circuit breaker
                log_status "ERROR" "Circuit breaker opened — halting loop"
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "circuit_breaker" "halted"
                break
                ;;
            4)
                # Transient API error — retry after brief pause, don't count against story
                consecutive_failures=0 # reset — this isn't the story's fault
                log_status "INFO" "Transient API error — retrying in 30s (not counting against story)"
                sleep 30
                ;;
            *)
                # Failure (timeout or crash) — increment retry for the story that was being worked on
                consecutive_failures=$((consecutive_failures + 1))

                # Increment retry for the first non-done, non-flagged story (the one that caused the timeout)
                local after_snap_fail
                after_snap_fail=$(snapshot_story_statuses)
                while IFS=: read -r fkey fstatus; do
                    [[ -z "$fkey" ]] && continue
                    [[ "$fstatus" == "done" ]] && continue
                    if ! is_story_flagged "$fkey"; then
                        local fail_retry
                        fail_retry=$(increment_story_retry "$fkey")
                        if [[ $fail_retry -ge $MAX_STORY_RETRIES ]]; then
                            log_status "WARN" "Story ${fkey} exceeded retry limit (${fail_retry}) after timeout — flagging"
                            flag_story "$fkey"
                        else
                            log_status "WARN" "Story ${fkey} — timeout retry ${fail_retry}/${MAX_STORY_RETRIES}"
                        fi
                        break
                    fi
                done <<< "$after_snap_fail"

                if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
                    log_status "ERROR" "$max_consecutive_failures consecutive failures — halting"
                    update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "consecutive_failures" "halted"
                    break
                fi

                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "failed" "error"
                log_status "WARN" "Iteration failed ($consecutive_failures/$max_consecutive_failures). Waiting 30s..."
                sleep 30
                ;;
        esac

        # Print progress summary after every iteration
        print_progress_summary

        # ── Show session issues and retro highlights ──
        print_iteration_insights

        log_status "LOOP" "=== End Iteration #$loop_count ==="
    done

    # Final summary — reads from sprint-status.yaml
    local counts
    counts=$(get_task_counts)
    local total=${counts%% *}
    local completed=${counts##* }

    local elapsed_total=$(( $(date +%s) - loop_start_time ))
    local elapsed_min=$(( elapsed_total / 60 ))

    log_status "SUCCESS" "Ralph loop finished"
    log_status "INFO" " Iterations: $loop_count"
    log_status "INFO" " Stories completed: $completed/$total"
    log_status "INFO" " Elapsed: ${elapsed_min}m"
    log_status "INFO" " API calls: $(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")"

    if [[ $completed -eq $total && $total -gt 0 ]]; then
        log_status "SUCCESS" "All stories complete. $total stories verified in $loop_count iterations."
    fi

    # Write final summary to status file
    update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "final_summary" \
        "$(if [[ $completed -eq $total && $total -gt 0 ]]; then echo "completed"; else echo "stopped"; fi)" \
        "completed:$completed/$total"

}
|
|
1328
|
-
|
|
1329
|
-
# ─── CLI Parsing ─────────────────────────────────────────────────────────────
|
|
1330
|
-
|
|
1331
|
-
# Parse command-line options into the script's global configuration variables
# (PLUGIN_DIR, MAX_ITERATIONS, MAX_STORY_RETRIES, LOOP_TIMEOUT_SECONDS,
# ITERATION_TIMEOUT_MINUTES, MAX_CALLS_PER_HOUR, PROMPT_FILE, PROGRESS_FILE,
# LIVE_OUTPUT, RESET_RETRIES).
#
# Arguments: the script's positional parameters ("$@")
# Exits:     0 after --help, --reset-circuit or --status;
#            1 on an unknown option or a value-taking option without a value.
parse_cli_args() {
    local _status_file

    while [[ $# -gt 0 ]]; do
        # Value-taking options must be followed by a value. Without this guard,
        # "shift 2" with a single remaining argument fails without shifting and
        # the loop would never make progress.
        case $1 in
            --plugin-dir|--max-iterations|--max-story-retries|--timeout|--iteration-timeout|--calls|--prompt|--progress)
                if [[ $# -lt 2 ]]; then
                    echo "Missing value for option: $1"
                    show_help
                    exit 1
                fi
                ;;
        esac

        case $1 in
            -h|--help)
                show_help
                exit 0
                ;;
            --plugin-dir)
                PLUGIN_DIR="$2"
                shift 2
                ;;
            --max-iterations)
                MAX_ITERATIONS="$2"
                shift 2
                ;;
            --max-story-retries)
                MAX_STORY_RETRIES="$2"
                shift 2
                ;;
            --timeout)
                LOOP_TIMEOUT_SECONDS="$2"
                shift 2
                ;;
            --iteration-timeout)
                ITERATION_TIMEOUT_MINUTES="$2"
                shift 2
                ;;
            --calls)
                MAX_CALLS_PER_HOUR="$2"
                shift 2
                ;;
            --prompt)
                PROMPT_FILE="$2"
                shift 2
                ;;
            --progress)
                PROGRESS_FILE="$2"
                shift 2
                ;;
            --live)
                LIVE_OUTPUT=true
                shift
                ;;
            --reset)
                RESET_RETRIES=true
                shift
                ;;
            --reset-circuit)
                # Derive state paths so circuit breaker uses the correct directory
                HARNESS_STATE_DIR="$(pwd)/.claude"
                export HARNESS_STATE_DIR
                init_circuit_breaker
                reset_circuit_breaker "Manual reset via CLI"
                echo "Circuit breaker reset to CLOSED"
                exit 0
                ;;
            --status)
                _status_file="$(pwd)/ralph/status.json"
                if [[ -f "$_status_file" ]]; then
                    # Pretty-print with jq; fall back to raw output if jq fails.
                    jq . "$_status_file" 2>/dev/null || cat "$_status_file"
                else
                    echo "No status file found."
                fi
                exit 0
                ;;
            *)
                echo "Unknown option: $1"
                show_help
                exit 1
                ;;
        esac
    done
}

# Only parse arguments and start the loop when executed directly,
# not when the file is sourced.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then

    parse_cli_args "$@"

    main

fi
|