codeharness 0.26.4 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/ralph/ralph.sh DELETED
@@ -1,1402 +0,0 @@
1
#!/usr/bin/env bash
# codeharness Ralph Loop — vendored from snarktank/ralph.
#
# Runs an autonomous loop that starts a fresh Claude Code instance for every
# iteration, with verification gates, crash recovery, rate limiting and
# circuit-breaker protection.
#
# Usage: ralph/ralph.sh --plugin-dir ./codeharness [OPTIONS]
#
# NOTE: `set -e` is deliberately NOT enabled. grep/jq/sed legitimately return
# non-zero inside the main loop and would make the script die silently; the
# loop inspects exit codes itself instead.

# Absolute directory of this script; helper libraries are resolved against it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Helper libraries (their contents are not part of this file).
# shellcheck source=/dev/null
. "$SCRIPT_DIR/lib/date_utils.sh"
# shellcheck source=/dev/null
. "$SCRIPT_DIR/lib/timeout_utils.sh"
# shellcheck source=/dev/null
. "$SCRIPT_DIR/lib/circuit_breaker.sh"
15
-
16
# ─── Configuration ───────────────────────────────────────────────────────────
# Values written with ":=" keep whatever the caller exported and only fall
# back to the default when the variable is unset or empty.

VERSION="0.1.0"

# Paths — empty at load time, filled in later.
PLUGIN_DIR=""          # plugin directory (required — set via --plugin-dir)
HARNESS_STATE_DIR=""   # harness state directory (derived from project root)
PROGRESS_FILE=""       # legacy progress file, kept for backwards compat (optional)
SPRINT_STATUS_FILE=""  # primary task source — read by the /harness-run skill
PROMPT_FILE=""         # prompt handed to each iteration
LOG_DIR=""             # log directory

# Loop limits
: "${MAX_ITERATIONS:=50}"
: "${MAX_STORY_RETRIES:=10}"
: "${LOOP_TIMEOUT_SECONDS:=43200}"   # 12 hours default
: "${ITERATION_TIMEOUT_MINUTES:=30}"

# Rate limiting
: "${MAX_CALLS_PER_HOUR:=100}"
RATE_LIMIT_SLEEP=3600                # 1 hour

# Driver
: "${PLATFORM_DRIVER:=claude-code}"
: "${CLAUDE_OUTPUT_FORMAT:=stream-json}"
: "${CLAUDE_ALLOWED_TOOLS:=}"
: "${CLAUDE_USE_CONTINUE:=false}"    # fresh context per iteration by default

# Behaviour switches
RESET_RETRIES=false    # reset retry state on start
LIVE_OUTPUT=false      # stream driver output live

# ANSI color sequences, stored unexpanded (interpreted when printed)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m'
67
-
68
# ─── Internal state ─────────────────────────────────────────────────────────

# Bookkeeping file paths; all start empty.
CALL_COUNT_FILE=
TIMESTAMP_FILE=
STATUS_FILE=
LIVE_LOG_FILE=
STORY_RETRY_FILE=
FLAGGED_STORIES_FILE=

# Global arrays for driver command building
CLAUDE_CMD_ARGS=()
LIVE_CMD_ARGS=()
VALID_TOOL_PATTERNS=()

# Iteration counter and loop start time
loop_count=0
loop_start_time=
84
-
85
- # ─── Logging ─────────────────────────────────────────────────────────────────
86
-
87
# Emit one timestamped log line.
# Globals:   LOG_DIR, BLUE, YELLOW, RED, GREEN, PURPLE, NC (read)
# Arguments: $1 - level: INFO | WARN | ERROR | SUCCESS | LOOP | DEBUG
#                 (any other level is printed without color)
#            $2 - message text, printed verbatim
# Outputs:   colorized line on stderr (every level except DEBUG) and a plain
#            copy appended to "$LOG_DIR/ralph.log" when LOG_DIR is non-empty.
#            DEBUG goes to the log file only.
log_status() {
  local level=$1
  local message=$2
  local timestamp
  local color=""
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local line="[$timestamp] [$level] $message"

  case "$level" in
    "INFO") color=$BLUE ;;
    "WARN") color=$YELLOW ;;
    "ERROR") color=$RED ;;
    "SUCCESS") color=$GREEN ;;
    "LOOP") color=$PURPLE ;;
  esac

  if [[ "$level" != "DEBUG" ]]; then
    # Only the color variables go through %b (they hold unexpanded escape
    # sequences). The line itself goes through %s so that backslashes in the
    # message (paths, regexes, JSON) are not interpreted as escapes.
    printf '%b%s%b\n' "$color" "$line" "$NC" >&2
  fi

  if [[ -n "$LOG_DIR" ]]; then
    printf '%s\n' "$line" >> "$LOG_DIR/ralph.log"
  fi
}
114
-
115
- # ─── Rate Limiting ───────────────────────────────────────────────────────────
116
-
117
# Start a fresh call counter whenever the wall-clock hour differs from the one
# recorded in TIMESTAMP_FILE (or no hour has been recorded yet).
# Globals: CALL_COUNT_FILE, TIMESTAMP_FILE (files written)
init_call_tracking() {
  local last_reset_hour=""
  local current_hour=$(date +%Y%m%d%H)

  [[ -f "$TIMESTAMP_FILE" ]] && last_reset_hour=$(cat "$TIMESTAMP_FILE")

  # Same hour as the last reset: leave the counter untouched.
  if [[ "$current_hour" == "$last_reset_hour" ]]; then
    return 0
  fi

  echo "0" > "$CALL_COUNT_FILE"
  echo "$current_hour" > "$TIMESTAMP_FILE"
}
130
-
131
# Report whether the hourly call budget still has room.
# Globals: CALL_COUNT_FILE, MAX_CALLS_PER_HOUR (read)
# Returns: 1 when the recorded count has reached MAX_CALLS_PER_HOUR,
#          0 otherwise (including when no count file exists).
can_make_call() {
  local calls_made=0

  [[ -f "$CALL_COUNT_FILE" ]] && calls_made=$(cat "$CALL_COUNT_FILE")

  # Negating the "limit reached" test keeps the "allowed" result for every
  # case in which that test does not succeed.
  ! [[ $calls_made -ge $MAX_CALLS_PER_HOUR ]]
}
143
-
144
# Sleep until the top of the next wall-clock hour, then reset the call
# counter and record the new hour.
# Globals: CALL_COUNT_FILE, TIMESTAMP_FILE (files written),
#          MAX_CALLS_PER_HOUR (read)
wait_for_reset() {
  local calls_made
  calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
  log_status "WARN" "Rate limit reached ($calls_made/$MAX_CALLS_PER_HOUR). Waiting for reset..."

  # One date call so minute and second belong to the same instant.
  local current_minute current_second
  read -r current_minute current_second <<< "$(date '+%M %S')"

  # date zero-pads its fields; "08" and "09" are invalid octal literals in
  # shell arithmetic, so force base 10 explicitly.
  local wait_time=$(( (60 - 10#$current_minute - 1) * 60 + (60 - 10#$current_second) ))

  log_status "INFO" "Sleeping for $wait_time seconds until next hour..."
  sleep "$wait_time"

  echo "0" > "$CALL_COUNT_FILE"
  date +%Y%m%d%H > "$TIMESTAMP_FILE"
  log_status "SUCCESS" "Rate limit reset."
}
159
-
160
- # ─── Progress Tracking ───────────────────────────────────────────────────────
161
-
162
# Write the loop status JSON document to STATUS_FILE.
# Globals:   STATUS_FILE, SPRINT_STATUS_FILE, FLAGGED_STORIES_FILE,
#            MAX_CALLS_PER_HOUR, MAX_ITERATIONS, VERSION, loop_start_time (read)
# Arguments: $1 - loop count
#            $2 - calls made this hour
#            $3 - last action
#            $4 - status
#            $5 - exit reason (optional, default empty)
# Returns:   0 on success or when STATUS_FILE is unset; non-zero when the
#            document could not be produced — the previous file is then kept.
update_status() {
  local loop_count=$1
  local calls_made=$2
  local last_action=$3
  local status=$4
  local exit_reason=${5:-""}

  if [[ -z "$STATUS_FILE" ]]; then
    return 0
  fi

  # --argjson requires valid JSON numbers; an empty or garbage counter would
  # otherwise make jq fail. Zero-padded values are normalised to base 10.
  if [[ "$loop_count" =~ ^[0-9]+$ ]]; then
    loop_count=$((10#$loop_count))
  else
    loop_count=0
  fi
  if [[ "$calls_made" =~ ^[0-9]+$ ]]; then
    calls_made=$((10#$calls_made))
  else
    calls_made=0
  fi

  # codeharness: include sprint-status story counts in status JSON
  local stories_total=0
  local stories_completed=0
  if [[ -n "$SPRINT_STATUS_FILE" && -f "$SPRINT_STATUS_FILE" ]]; then
    local sprint_counts
    sprint_counts=$(get_task_counts)
    stories_total=${sprint_counts%% *}
    stories_completed=${sprint_counts##* }
  fi

  local stories_remaining=$((stories_total - stories_completed))
  local elapsed_seconds=0
  if [[ -n "$loop_start_time" ]]; then
    elapsed_seconds=$(( $(date +%s) - loop_start_time ))
  fi

  # Flagged stories as a JSON array of non-empty lines
  local flagged_json="[]"
  if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
    flagged_json=$(jq -R -s 'split("\n") | map(select(length > 0))' \
      < "$FLAGGED_STORIES_FILE" 2>/dev/null) || flagged_json="[]"
    [[ -n "$flagged_json" ]] || flagged_json="[]"
  fi

  # Current story key for status tracking
  local current_story
  current_story=$(get_current_task)

  # Build the document in a sibling temp file and rename it into place.
  # Redirecting jq straight into STATUS_FILE truncates the file before jq
  # runs, so a jq failure used to leave readers with an empty status file.
  local tmp_file="${STATUS_FILE}.tmp.$$"
  if jq -n \
    --arg timestamp "$(get_iso_timestamp)" \
    --argjson loop_count "$loop_count" \
    --argjson calls_made "$calls_made" \
    --argjson max_calls "$MAX_CALLS_PER_HOUR" \
    --argjson max_iterations "$MAX_ITERATIONS" \
    --arg last_action "$last_action" \
    --arg status "$status" \
    --arg exit_reason "$exit_reason" \
    --arg version "$VERSION" \
    --arg story "${current_story:-}" \
    --argjson stories_total "$stories_total" \
    --argjson stories_completed "$stories_completed" \
    --argjson stories_remaining "$stories_remaining" \
    --argjson elapsed_seconds "$elapsed_seconds" \
    --argjson flagged_stories "$flagged_json" \
    '{
      timestamp: $timestamp,
      version: $version,
      loop_count: $loop_count,
      calls_made_this_hour: $calls_made,
      max_calls_per_hour: $max_calls,
      max_iterations: $max_iterations,
      last_action: $last_action,
      status: $status,
      story: $story,
      exit_reason: $exit_reason,
      stories_total: $stories_total,
      stories_completed: $stories_completed,
      stories_remaining: $stories_remaining,
      elapsed_seconds: $elapsed_seconds,
      flagged_stories: $flagged_stories
    }' > "$tmp_file"; then
    mv -f "$tmp_file" "$STATUS_FILE"
  else
    rm -f "$tmp_file"
    return 1
  fi
}
233
-
234
# codeharness: task picking itself happens in the /harness-run skill inside
# each Claude session; Ralph only needs a story key for timeout reports and
# status tracking.
#
# Prints the key of the first story in ./sprint-state.json whose status is
# "in-progress", or failing that the first one that is "ready-for-dev".
# Prints an empty line when the file is missing or nothing matches.
# Always returns 0.
get_current_task() {
  local state_file="sprint-state.json"
  local wanted_status
  local story_key=""

  if [[ -f "$state_file" ]]; then
    # Statuses in priority order; stop at the first one that yields a key.
    for wanted_status in "in-progress" "ready-for-dev"; do
      story_key=$(jq -r --arg wanted "$wanted_status" '
        .stories // {} | to_entries[]
        | select(.value.status == $wanted)
        | .key
      ' "$state_file" 2>/dev/null | head -1)
      [[ -n "$story_key" ]] && break
    done
  fi

  echo "${story_key:-}"
  return 0
}
269
-
270
# codeharness: decide whether the sprint has no autonomous work left.
# Scans SPRINT_STATUS_FILE for "key: value" lines whose key looks like a story
# key (N-N-slug, e.g. 5-1-ralph-loop-integration). A story counts as finished
# when its value is "done" or when its key is listed in FLAGGED_STORIES_FILE.
# Globals: SPRINT_STATUS_FILE, FLAGGED_STORIES_FILE (read)
# Returns: 0 when at least one story exists and every story is done or
#          flagged; 1 otherwise (including a missing file or no stories).
check_sprint_complete() {
  if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
    return 1
  fi

  local total=0
  local done_count=0
  local flagged_count=0
  local key value

  # Flagged story keys, for O(1) lookups below
  local -A flagged_map=()
  if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
    # "|| [[ -n ... ]]" keeps a final line that has no trailing newline
    while IFS= read -r key || [[ -n "$key" ]]; do
      key="${key#"${key%%[![:space:]]*}"}"
      key="${key%"${key##*[![:space:]]}"}"
      [[ -n "$key" ]] && flagged_map["$key"]=1
    done < "$FLAGGED_STORIES_FILE"
  fi

  # Without the "|| [[ -n ... ]]" guard a last story line lacking a trailing
  # newline was silently skipped, which could report an unfinished sprint as
  # complete.
  while IFS=: read -r key value || [[ -n "$key" ]]; do
    # Trim surrounding whitespace without forking sed for every line
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    # Skip comments and empty lines
    [[ -z "$key" || "$key" == \#* ]] && continue

    # Only story keys take part in the count
    [[ "$key" =~ ^[0-9]+-[0-9]+- ]] || continue

    total=$((total + 1))
    if [[ "$value" == "done" ]]; then
      done_count=$((done_count + 1))
    elif [[ -n "${flagged_map[$key]+x}" ]]; then
      # Retry-exhausted/flagged stories are "effectively done":
      # no autonomous work can be done on them.
      flagged_count=$((flagged_count + 1))
    fi
  done < "$SPRINT_STATUS_FILE"

  if [[ $total -eq 0 ]]; then
    return 1
  fi

  [[ $((done_count + flagged_count)) -eq $total ]]
}
319
-
320
# codeharness: all_tasks_complete() is answered by the sprint-status check.
# Returns: whatever check_sprint_complete returns.
all_tasks_complete() {
  check_sprint_complete
  return $?
}
324
-
325
# codeharness: count stories listed in SPRINT_STATUS_FILE.
# A story is any "key: value" line whose key matches N-N-slug; it is
# completed when its value is exactly "done".
# Globals: SPRINT_STATUS_FILE (read)
# Outputs: "<total> <completed>" on stdout ("0 0" when the file is missing).
get_task_counts() {
  if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
    echo "0 0"
    return
  fi

  local total=0
  local completed=0
  local key value

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline;
  # it used to be dropped, under-counting the stories.
  while IFS=: read -r key value || [[ -n "$key" ]]; do
    # Trim surrounding whitespace without forking sed for every line
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    [[ -z "$key" || "$key" == \#* ]] && continue
    [[ "$key" =~ ^[0-9]+-[0-9]+- ]] || continue

    total=$((total + 1))
    if [[ "$value" == "done" ]]; then
      completed=$((completed + 1))
    fi
  done < "$SPRINT_STATUS_FILE"

  echo "$total $completed"
}
352
-
353
- # ─── Retry Tracking ─────────────────────────────────────────────────────────
354
-
355
# Increment the retry counter of a story and print the new value.
# STORY_RETRY_FILE holds one "<story-key> <count>" record per line.
# Globals:   STORY_RETRY_FILE (file read and rewritten)
# Arguments: $1 - story key
# Outputs:   the new count on stdout ("0" when STORY_RETRY_FILE is unset)
increment_story_retry() {
  local story_key=$1

  if [[ -z "$STORY_RETRY_FILE" ]]; then
    echo "0"
    return
  fi

  local count=0
  local key val
  local -a keys=() vals=()

  # Single pass: remember every record and pick up the current count.
  # "|| [[ -n ... ]]" keeps a last record that has no trailing newline.
  if [[ -f "$STORY_RETRY_FILE" ]]; then
    while IFS=' ' read -r key val || [[ -n "$key" ]]; do
      [[ -z "$key" ]] && continue
      keys+=("$key")
      vals+=("$val")
      if [[ "$key" == "$story_key" ]]; then
        # Force base 10 so zero-padded counts ("08") are not parsed as octal;
        # anything non-numeric counts as 0.
        if [[ "$val" =~ ^[0-9]+$ ]]; then
          count=$((10#$val))
        else
          count=0
        fi
      fi
    done < "$STORY_RETRY_FILE"
  fi

  count=$((count + 1))

  # Rewrite via a temp file + mv so readers never see a half-written file.
  # The redirection truncates any stale temp file left by an earlier crash.
  local temp_file="${STORY_RETRY_FILE}.tmp"
  local i
  local found=false
  {
    for i in "${!keys[@]}"; do
      if [[ "${keys[i]}" == "$story_key" ]]; then
        printf '%s %s\n' "$story_key" "$count"
        found=true
      else
        printf '%s %s\n' "${keys[i]}" "${vals[i]}"
      fi
    done
    if [[ "$found" == "false" ]]; then
      printf '%s %s\n' "$story_key" "$count"
    fi
  } > "$temp_file" && mv -f "$temp_file" "$STORY_RETRY_FILE"

  echo "$count"
}
403
-
404
# Print the recorded retry count of a story.
# Globals:   STORY_RETRY_FILE (read; "<story-key> <count>" per line)
# Arguments: $1 - story key
# Outputs:   the count of the first matching record, or "0" when the file is
#            unset or missing, or the story is not tracked.
get_story_retry_count() {
  local story_key=$1
  # Loop variables are local so a lookup does not clobber the caller's
  # key/val variables.
  local key val

  if [[ -z "$STORY_RETRY_FILE" || ! -f "$STORY_RETRY_FILE" ]]; then
    echo "0"
    return
  fi

  # "|| [[ -n ... ]]" also reads a last record without a trailing newline.
  while IFS=' ' read -r key val || [[ -n "$key" ]]; do
    [[ "$key" == "$story_key" ]] || continue
    # Force base 10 so zero-padded counts ("08") are not parsed as octal;
    # anything non-numeric counts as 0.
    if [[ "$val" =~ ^[0-9]+$ ]]; then
      echo "$((10#$val))"
    else
      echo "0"
    fi
    return
  done < "$STORY_RETRY_FILE"

  echo "0"
}
422
-
423
# Check whether a story is flagged (listed in FLAGGED_STORIES_FILE).
# Globals:   FLAGGED_STORIES_FILE (read; one story key per line)
# Arguments: $1 - story key
# Returns:   0 when a line equals the key exactly; non-zero otherwise
#            (including an unset or missing file, or an empty key).
is_story_flagged() {
  local story_key=$1

  if [[ -z "$FLAGGED_STORIES_FILE" || ! -f "$FLAGGED_STORIES_FILE" ]]; then
    return 1
  fi
  [[ -n "$story_key" ]] || return 1

  # -F: compare the key literally (as a regex, "." would match any character)
  # --: a key starting with "-" must not be parsed as a grep option
  grep -qxF -- "$story_key" "$FLAGGED_STORIES_FILE" 2>/dev/null
}
433
-
434
# Record a story as flagged (retry limit exceeded). Does nothing when
# FLAGGED_STORIES_FILE is unset or the story is already listed.
# Globals:   FLAGGED_STORIES_FILE (appended to)
# Arguments: $1 - story key
flag_story() {
  local story_key=$1

  [[ -n "$FLAGGED_STORIES_FILE" ]] || return 0

  # Append only when the key is not listed yet, so the file stays duplicate-free.
  is_story_flagged "$story_key" || echo "$story_key" >> "$FLAGGED_STORIES_FILE"
}
446
-
447
# Print the flagged story keys, one per line.
# Globals: FLAGGED_STORIES_FILE (read)
# Outputs: the file's content, or a single empty line when the file is unset
#          or missing.
get_flagged_stories() {
  if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
    cat "$FLAGGED_STORIES_FILE"
  else
    echo ""
  fi
}
455
-
456
# Snapshot the story statuses found in SPRINT_STATUS_FILE.
# Globals: SPRINT_STATUS_FILE (read)
# Outputs: one "key:status" line per story key (N-N-slug); a single empty
#          line when the file is missing.
snapshot_story_statuses() {
  if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
    echo ""
    return
  fi

  local key value

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline;
  # it used to be dropped, hiding that story from change detection.
  while IFS=: read -r key value || [[ -n "$key" ]]; do
    # Trim surrounding whitespace without forking sed for every line
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    [[ -z "$key" || "$key" == \#* ]] && continue
    if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
      echo "$key:$value"
    fi
  done < "$SPRINT_STATUS_FILE"
}
472
-
473
# Compare two "key:status" snapshots (see snapshot_story_statuses).
# Arguments: $1 - snapshot taken before the iteration
#            $2 - snapshot taken after the iteration
# Sets:      CHANGED_STORIES   - keys that are "done" now but were not before
#            UNCHANGED_STORIES - keys that are still not "done"
#            Both are newline-terminated lists; stories that were already
#            done appear in neither.
detect_story_changes() {
  local before_snapshot=$1
  local after_snapshot=$2

  CHANGED_STORIES=""
  UNCHANGED_STORIES=""

  local key status

  # Index the before snapshot once instead of re-scanning it for every story.
  # The first occurrence of a key wins.
  local -A status_before=()
  while IFS=: read -r key status; do
    [[ -z "$key" ]] && continue
    [[ -n "${status_before[$key]+x}" ]] || status_before["$key"]=$status
  done <<< "$before_snapshot"

  while IFS=: read -r key status; do
    [[ -z "$key" ]] && continue

    if [[ "$status" != "done" ]]; then
      UNCHANGED_STORIES+="${key}"$'\n'
    elif [[ "${status_before[$key]:-}" != "done" ]]; then
      CHANGED_STORIES+="${key}"$'\n'
    fi
  done <<< "$after_snapshot"
}
503
-
504
- # ─── Sprint State Progress Polling ─────────────────────────────────────────
505
-
506
# Values seen by the previous poll, used for change detection
PREV_STORY=""
PREV_PHASE=""
PREV_AC_PROGRESS=""
PREV_LAST_ACTION=""

# Poll ./sprint-state.json for progress changes during background execution
# and log one structured update line when a progress field changed.
# Reads .run.currentStory, .run.currentPhase, .run.lastAction and
# .run.acProgress.
# Globals: PREV_STORY, PREV_PHASE, PREV_AC_PROGRESS, PREV_LAST_ACTION
#          (read and updated)
# Returns: always 0; a missing/unreadable file or no active story is a no-op.
poll_sprint_state_progress() {
  local state_file="sprint-state.json"
  [[ -f "$state_file" ]] || return 0

  # Single jq call for all fields (avoids 4 process spawns per poll cycle).
  # The fields are joined with the ASCII unit separator (0x1f) instead of a
  # tab: tab is IFS *whitespace*, so `read` collapses consecutive tabs and an
  # empty field (e.g. no phase yet) shifted every later field one slot left.
  local raw
  raw=$(jq -r '[.run.currentStory // "", .run.currentPhase // "", .run.lastAction // "", .run.acProgress // ""] | join("\u001f")' "$state_file" 2>/dev/null) || return 0
  [[ -n "$raw" ]] || return 0

  local cur_story cur_phase cur_action cur_ac
  IFS=$'\x1f' read -r cur_story cur_phase cur_action cur_ac <<< "$raw"

  # Nothing to report if no story is active
  [[ -z "$cur_story" ]] && return 0

  # Report the most significant change only: story/phase, then AC progress,
  # then last action.
  if [[ "$cur_story" != "$PREV_STORY" || "$cur_phase" != "$PREV_PHASE" ]]; then
    if [[ -n "$cur_action" && "$cur_action" != "null" ]]; then
      log_status "INFO" "Story ${cur_story}: ${cur_phase} (${cur_action})"
    else
      log_status "INFO" "Story ${cur_story}: ${cur_phase}"
    fi
  elif [[ "$cur_ac" != "$PREV_AC_PROGRESS" && -n "$cur_ac" && "$cur_ac" != "null" ]]; then
    log_status "INFO" "Story ${cur_story}: verify (AC ${cur_ac})"
  elif [[ "$cur_action" != "$PREV_LAST_ACTION" && -n "$cur_action" && "$cur_action" != "null" ]]; then
    log_status "INFO" "Story ${cur_story}: ${cur_phase} (${cur_action})"
  fi

  PREV_STORY="$cur_story"
  PREV_PHASE="$cur_phase"
  PREV_AC_PROGRESS="$cur_ac"
  PREV_LAST_ACTION="$cur_action"
}
547
-
548
# Forget the previously polled values so that the first poll of the next
# iteration is reported as a change.
# Globals: PREV_STORY, PREV_PHASE, PREV_AC_PROGRESS, PREV_LAST_ACTION (cleared)
reset_poll_state() {
  PREV_STORY= PREV_PHASE= PREV_AC_PROGRESS= PREV_LAST_ACTION=
}
555
-
556
- # ─── Progress Summary ───────────────────────────────────────────────────────
557
-
558
# Log a human-readable sprint progress report: a totals line, then completed
# stories (✓), failed stories from ./sprint-state.json (✗), flagged stories
# (✕) and finally the next story in line.
# Globals: SPRINT_STATUS_FILE, FLAGGED_STORIES_FILE, loop_start_time,
#          loop_count (read)
print_progress_summary() {
  local counts
  counts=$(get_task_counts)
  local total=${counts%% *}
  local completed=${counts##* }
  local remaining=$((total - completed))

  # An unset start time used to produce an arithmetic syntax error.
  local elapsed=0
  if [[ -n "$loop_start_time" ]]; then
    elapsed=$(( $(date +%s) - loop_start_time ))
  fi

  local elapsed_fmt
  if [[ $elapsed -ge 3600 ]]; then
    elapsed_fmt="$((elapsed / 3600))h$((elapsed % 3600 / 60))m"
  elif [[ $elapsed -ge 60 ]]; then
    elapsed_fmt="$((elapsed / 60))m$((elapsed % 60))s"
  else
    elapsed_fmt="${elapsed}s"
  fi

  # Cost and failed stories from sprint-state.json (single jq call):
  # first output line is the cost, every further line is one failed story.
  local cost=""
  local cost_fmt=""
  local failed_stories=""
  if [[ -f "sprint-state.json" ]]; then
    local state_data
    state_data=$(jq -r '(.run.cost // 0 | tostring) + "\n" + ((.run.failed // []) | join("\n"))' "sprint-state.json" 2>/dev/null) || state_data=""
    if [[ -n "$state_data" ]]; then
      cost=${state_data%%$'\n'*}
      if [[ "$state_data" == *$'\n'* ]]; then
        failed_stories=${state_data#*$'\n'}
      fi
      if [[ -n "$cost" && "$cost" != "0" && "$cost" != "null" ]]; then
        cost_fmt=", cost: \$${cost}"
      fi
    fi
  fi

  log_status "INFO" "Progress: ${completed}/${total} done, ${remaining} remaining (iterations: ${loop_count}, elapsed: ${elapsed_fmt}${cost_fmt})"

  # One pass over the status file collects both the completed stories and the
  # next story in line (first non-done, non-flagged).
  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  local -a done_keys=()
  local next_story=""
  local key value
  if [[ -f "$SPRINT_STATUS_FILE" ]]; then
    while IFS=: read -r key value || [[ -n "$key" ]]; do
      key="${key#"${key%%[![:space:]]*}"}"
      key="${key%"${key##*[![:space:]]}"}"
      value="${value#"${value%%[![:space:]]*}"}"
      value="${value%"${value##*[![:space:]]}"}"

      [[ -z "$key" || "$key" == \#* ]] && continue
      [[ "$key" =~ ^[0-9]+-[0-9]+- ]] || continue

      if [[ "$value" == "done" ]]; then
        done_keys+=("$key")
      elif [[ -z "$next_story" ]] && ! is_story_flagged "$key"; then
        next_story="$key ($value)"
      fi
    done < "$SPRINT_STATUS_FILE"
  fi

  # Completed stories with ✓
  for key in "${done_keys[@]}"; do
    log_status "SUCCESS" "  ✓ ${key}"
  done

  # Failed stories with ✗ from sprint-state.json
  if [[ -n "$failed_stories" ]]; then
    local fkey
    while IFS= read -r fkey; do
      [[ -z "$fkey" ]] && continue
      log_status "ERROR" "  ✗ ${fkey}"
    done <<< "$failed_stories"
  fi

  # Flagged/blocked stories with ✕
  if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
    local bkey
    while IFS= read -r bkey || [[ -n "$bkey" ]]; do
      [[ -z "$bkey" ]] && continue
      log_status "WARN" "  ✕ ${bkey} (blocked)"
    done < "$FLAGGED_STORIES_FILE"
  fi

  if [[ -n "$next_story" ]]; then
    log_status "INFO" "Next up: ${next_story}"
  fi
}
640
-
641
- # ─── Iteration Insights ──────────────────────────────────────────────────────
642
-
643
# Print post-iteration insights found under
# ./_bmad-output/implementation-artifacts:
#   - .session-issues.md: the most recent "### " block (first 15 lines)
#   - session-retro-<today>.md: the action-items section (up to 20 lines),
#     or the first 10 lines when there is no such section
# Outputs: headings via log_status, excerpts on stdout.
print_iteration_insights() {
  local project_root
  project_root="$(pwd)"
  local issues_file="$project_root/_bmad-output/implementation-artifacts/.session-issues.md"
  local today
  today=$(date +%Y-%m-%d)
  local retro_file="$project_root/_bmad-output/implementation-artifacts/session-retro-${today}.md"

  # Session issues: show the most recent "### " block
  if [[ -f "$issues_file" ]]; then
    local issue_count
    # grep -c prints "0" AND exits 1 when nothing matches, so the former
    # `|| echo "0"` produced "0<newline>0" and broke the numeric test below.
    issue_count=$(grep -c '^### ' "$issues_file" 2>/dev/null) || issue_count=0
    [[ "$issue_count" =~ ^[0-9]+$ ]] || issue_count=0

    if [[ $issue_count -gt 0 ]]; then
      echo ""
      log_status "INFO" "━━━ Session Issues ($issue_count entries) ━━━"
      # awk restarts the buffer at every "### " heading, so only the last
      # block survives to END; head caps it at 15 lines.
      awk '/^### /{block=""} {block=block $0 "\n"} END{printf "%s", block}' "$issues_file" | head -15
      echo ""
    fi
  fi

  # Retro summary, if one was generated today
  if [[ -f "$retro_file" ]]; then
    log_status "INFO" "━━━ Session Retro ━━━"
    # Bracket expression instead of "\|": alternation is not part of POSIX
    # basic regular expressions.
    if grep -q '## Action [Ii]tems' "$retro_file" 2>/dev/null; then
      sed -n '/^## Action [Ii]tems/,/^## /p' "$retro_file" | head -20
    else
      head -10 "$retro_file"
    fi
    echo ""
  fi
}
676
-
677
- # ─── Driver Management ──────────────────────────────────────────────────────
678
-
679
# Source the platform driver "$SCRIPT_DIR/drivers/$PLATFORM_DRIVER.sh" and
# derive the allowed-tools list from it.
# Globals: SCRIPT_DIR, PLATFORM_DRIVER, VALID_TOOL_PATTERNS (read),
#          CLAUDE_ALLOWED_TOOLS (set when empty)
# Exits the script with status 1 when the driver file does not exist.
load_platform_driver() {
  local driver_file="$SCRIPT_DIR/drivers/${PLATFORM_DRIVER}.sh"

  if [[ -f "$driver_file" ]]; then
    # shellcheck source=/dev/null
    source "$driver_file"
  else
    log_status "ERROR" "Platform driver not found: $driver_file"
    exit 1
  fi

  # Driver hook; the code below relies on VALID_TOOL_PATTERNS afterwards.
  driver_valid_tools

  # Auto-populate CLAUDE_ALLOWED_TOOLS (comma-separated) from the driver's
  # valid tool patterns so Ralph runs autonomously without permission
  # prompts. A value supplied by the user is left untouched.
  if [[ -z "$CLAUDE_ALLOWED_TOOLS" ]] && (( ${#VALID_TOOL_PATTERNS[@]} > 0 )); then
    CLAUDE_ALLOWED_TOOLS=$(IFS=','; echo "${VALID_TOOL_PATTERNS[*]}")
  fi

  log_status "DEBUG" "Platform driver: $(driver_display_name) ($(driver_cli_binary))"
}
699
-
700
- # ─── Execution ───────────────────────────────────────────────────────────────
701
-
702
# Run one loop iteration: build the driver command, execute it under a
# timeout (live-streamed or in the background with progress polling) and
# classify the outcome.
# Globals:   LOG_DIR, CALL_COUNT_FILE, LIVE_LOG_FILE, LIVE_OUTPUT, PROMPT_FILE,
#            PLUGIN_DIR, ITERATION_TIMEOUT_MINUTES, CLAUDE_CMD_ARGS (read);
#            writes ralph/.state-snapshot.json and ralph/.iteration_deadline
#            relative to the current directory
# Arguments: $1 - iteration number
#            $2 - task/story id (may be empty)
# Returns:   0 - iteration succeeded
#            1 - command build failure, timeout, or other execution failure
#            2 - API usage limit reached
#            3 - circuit breaker opened
#            4 - transient API error (should be retried)
execute_iteration() {
  local iteration=$1
  local task_id=$2
  local timestamp output_file calls_made
  timestamp=$(date '+%Y-%m-%d_%H-%M-%S')
  output_file="$LOG_DIR/claude_output_${timestamp}.log"
  calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
  calls_made=$((calls_made + 1))

  # Git HEAD at iteration start, for progress detection
  local loop_start_sha=""
  if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null; then
    loop_start_sha=$(git rev-parse HEAD 2>/dev/null || echo "")
  fi

  # Snapshot sprint-state.json before the iteration (for timeout delta capture)
  local state_snapshot_path="ralph/.state-snapshot.json"
  if [[ -f "sprint-state.json" ]]; then
    cp "sprint-state.json" "$state_snapshot_path" 2>/dev/null || true
  fi

  log_status "LOOP" "Iteration $iteration — Task: ${task_id:-'(reading from prompt)'}"
  local timeout_seconds=$((ITERATION_TIMEOUT_MINUTES * 60))

  # Loop context — pass the time budget so the session can prioritise the retro
  local start_time
  start_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  local loop_context="Loop #${iteration}. Time budget: ${ITERATION_TIMEOUT_MINUTES} minutes (started: ${start_time}). Reserve the last 5 minutes for Step 8 (session retrospective) — do not start new story work if less than 10 minutes remain."
  if [[ -n "$task_id" ]]; then
    loop_context+=" Current task: $task_id."
  fi

  # Build the command via the driver
  local session_id=""  # fresh context per iteration
  if ! driver_build_command "$PROMPT_FILE" "$loop_context" "$session_id" "$PLUGIN_DIR"; then
    log_status "ERROR" "Failed to build CLI command"
    return 1
  fi

  # Deadline file for the time-warning hook
  local deadline=$(( $(date +%s) + timeout_seconds ))
  echo "$deadline" > "ralph/.iteration_deadline"

  # DEBUG: log the command, truncated so the whole prompt is not dumped
  local cmd_summary="${CLAUDE_CMD_ARGS[*]}"
  if [[ ${#cmd_summary} -gt 200 ]]; then
    cmd_summary="${cmd_summary:0:200}... (truncated)"
  fi
  log_status "DEBUG" "Command: $cmd_summary"
  log_status "DEBUG" "Output file: $output_file"
  log_status "DEBUG" "LIVE_OUTPUT=$LIVE_OUTPUT, timeout=${timeout_seconds}s"

  log_status "INFO" "Starting $(driver_display_name) (timeout: ${ITERATION_TIMEOUT_MINUTES}m)..."

  # Execute with timeout
  local exit_code=0

  if [[ "$LIVE_OUTPUT" == "true" ]]; then
    # Live streaming mode
    echo -e "\n=== Iteration #$iteration — $(date '+%Y-%m-%d %H:%M:%S') ===" > "$LIVE_LOG_FILE"
    echo -e "${PURPLE}━━━━━━━━━━━━━ $(driver_display_name) Output ━━━━━━━━━━━━━${NC}"

    # tee -a: a plain tee re-truncated the live log and wiped the iteration
    # header written just above.
    # PIPESTATUS reports each stage's status regardless of `pipefail`, so the
    # option is no longer toggled (which also reset it for the whole shell).
    portable_timeout "${timeout_seconds}s" "${CLAUDE_CMD_ARGS[@]}" \
      < /dev/null 2>&1 | tee "$output_file" | tee -a "$LIVE_LOG_FILE"
    exit_code=${PIPESTATUS[0]}

    echo -e "${PURPLE}━━━━━━━━━━━━━ End of Output ━━━━━━━━━━━━━━━━━━━${NC}"
  else
    # Background mode with progress monitoring
    portable_timeout "${timeout_seconds}s" "${CLAUDE_CMD_ARGS[@]}" \
      < /dev/null > "$output_file" 2>&1 &

    local claude_pid=$!

    log_status "DEBUG" "Background PID: $claude_pid"

    reset_poll_state
    # Every 10s while the child is alive: mirror its output to the live log
    # and report sprint-state progress.
    while kill -0 "$claude_pid" 2>/dev/null; do
      if [[ -f "$output_file" && -s "$output_file" ]]; then
        cp "$output_file" "$LIVE_LOG_FILE" 2>/dev/null
      fi
      poll_sprint_state_progress
      sleep 10
    done

    wait "$claude_pid"
    exit_code=$?
    log_status "DEBUG" "Claude exited with code: $exit_code, output size: $(wc -c < "$output_file" 2>/dev/null || echo 0) bytes"

    # Empty output plus a non-zero exit: log diagnostic info
    if [[ ! -s "$output_file" && $exit_code -ne 0 ]]; then
      log_status "ERROR" "Claude produced no output and exited with code $exit_code"
      log_status "DEBUG" "Checking if claude binary is responsive..."
      if claude --version > /dev/null 2>&1; then
        log_status "DEBUG" "claude binary OK: $(claude --version 2>&1)"
      else
        log_status "ERROR" "claude binary not responding"
      fi
    fi
  fi

  if [[ $exit_code -eq 0 ]]; then
    echo "$calls_made" > "$CALL_COUNT_FILE"
    log_status "SUCCESS" "$(driver_display_name) iteration completed successfully"

    # Detect progress: count changed files (committed or uncommitted)
    local files_changed=0
    if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null; then
      local current_sha
      current_sha=$(git rev-parse HEAD 2>/dev/null || echo "")

      if [[ -n "$loop_start_sha" && -n "$current_sha" && "$loop_start_sha" != "$current_sha" ]]; then
        files_changed=$(
          {
            git diff --name-only "$loop_start_sha" "$current_sha" 2>/dev/null
            git diff --name-only HEAD 2>/dev/null
            git diff --name-only --cached 2>/dev/null
          } | sort -u | wc -l
        )
      else
        files_changed=$(
          {
            git diff --name-only 2>/dev/null
            git diff --name-only --cached 2>/dev/null
          } | sort -u | wc -l
        )
      fi
    fi

    # If harness-run reported NO_WORK, don't count file changes as progress.
    # Writing session-issues.md with "NO_WORK" creates git diffs but is NOT
    # real progress.
    # IMPORTANT: only non-JSON lines are checked. The prompt text is echoed
    # inside JSON objects and contains these strings as instructions — those
    # would be false positives.
    if grep -v '^[[:space:]]*{' "$output_file" 2>/dev/null | grep -qE 'Result: NO_WORK'; then
      files_changed=0
      log_status "INFO" "NO_WORK detected — overriding files_changed to 0 for circuit breaker"
    fi

    local has_errors="false"
    # Only non-JSON lines are checked for errors. Stream-json output is NDJSON
    # (one JSON object per line), so any line starting with '{' is Claude
    # content — which naturally contains words like "error" and "Exception"
    # in code reviews, test output and discussion. Grepping those produces
    # false positives that trip the circuit breaker.
    if grep -v '^[[:space:]]*{' "$output_file" 2>/dev/null | \
      grep -qE '(^Error:|^ERROR:|^error:|\]: error|Error occurred|failed with error|[Ee]xception|Fatal|FATAL)'; then
      has_errors="true"
      log_status "WARN" "Errors detected in output"
    fi

    local output_length
    output_length=$(wc -c < "$output_file" 2>/dev/null || echo 0)

    # Record in circuit breaker
    record_loop_result "$iteration" "$files_changed" "$has_errors" "$output_length"
    local circuit_result=$?

    if [[ $circuit_result -ne 0 ]]; then
      log_status "WARN" "Circuit breaker opened — halting execution"
      return 3
    fi

    return 0
  elif [[ $exit_code -eq 124 ]]; then
    log_status "WARN" "Iteration timed out after ${ITERATION_TIMEOUT_MINUTES}m"

    # Capture timeout report
    if command -v npx &>/dev/null; then
      log_status "INFO" "Capturing timeout report..."
      # Explicit if/else: with `cmd && ok || fail` the failure branch also
      # ran whenever logging the success message itself returned non-zero.
      if npx codeharness timeout-report \
        --story "${task_id:-unknown}" \
        --iteration "$iteration" \
        --duration "$ITERATION_TIMEOUT_MINUTES" \
        --output-file "$output_file" \
        --state-snapshot "$state_snapshot_path" 2>/dev/null; then
        log_status "INFO" "Timeout report saved"
      else
        log_status "WARN" "Failed to capture timeout report"
      fi
    fi

    # Verify the report file exists with non-zero content
    local report_file="ralph/logs/timeout-report-${iteration}-${task_id:-unknown}.md"
    if [[ -s "$report_file" ]]; then
      log_status "INFO" "Timeout report verified: $report_file"
    fi

    return 1
  else
    # Check for API limit
    if grep -qi "5.*hour.*limit\|limit.*reached.*try.*back\|usage.*limit.*reached" "$output_file" 2>/dev/null; then
      log_status "ERROR" "Claude API usage limit reached"
      return 2
    # Transient API errors (500, 529, overloaded) — don't count against story.
    # Status code patterns exclude decimal prefixes (e.g. cost_usd=0.503 ≠ HTTP 503)
    elif grep -qiE 'Internal server error|api_error|overloaded|(^|[^0-9.])529([^0-9]|$)|(^|[^0-9.])503([^0-9]|$)' "$output_file" 2>/dev/null; then
      log_status "WARN" "Transient API error (not story's fault) — will retry"
      return 4
    else
      log_status "ERROR" "$(driver_display_name) execution failed (exit code: $exit_code)"
      return 1
    fi
  fi
}
905
-
906
- # ─── Cleanup ─────────────────────────────────────────────────────────────────
907
-
908
# Signal handler for SIGINT/SIGTERM.
# Records the interruption in the status file, prints a short progress
# summary when the loop had already started, and exits with status 0.
# Globals (read): loop_count, loop_start_time, CALL_COUNT_FILE,
#                 SPRINT_STATUS_FILE
cleanup() {
  log_status "INFO" "Ralph loop interrupted. Cleaning up..."

  # Fall back to "0" when the call counter file is missing or unreadable
  local api_calls
  api_calls="$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")"
  update_status "$loop_count" "$api_calls" "interrupted" "stopped" "user_cancelled"

  # Nothing to summarise if the loop never started or no sprint file is set
  if [[ -z "$loop_start_time" || -z "$SPRINT_STATUS_FILE" ]]; then
    exit 0
  fi

  # get_task_counts prints "<total> <completed>"
  local task_counts
  task_counts=$(get_task_counts)
  local stories_total=${task_counts%% *}
  local stories_done=${task_counts##* }

  local now
  now=$(date +%s)
  local minutes_elapsed=$(( (now - loop_start_time) / 60 ))

  log_status "INFO" " Iterations: $loop_count"
  log_status "INFO" " Stories completed: $stories_done/$stories_total"
  log_status "INFO" " Elapsed: ${minutes_elapsed}m"

  exit 0
}

trap cleanup SIGINT SIGTERM
930
-
931
- # ─── Help ────────────────────────────────────────────────────────────────────
932
-
933
# Print usage information to stdout.
# The defaults quoted here mirror the built-in values in the Configuration
# section at the top of this script (MAX_ITERATIONS=50, MAX_STORY_RETRIES=10,
# LOOP_TIMEOUT_SECONDS=43200, ITERATION_TIMEOUT_MINUTES=30,
# MAX_CALLS_PER_HOUR=100); keep both in sync when changing either.
show_help() {
  # Quoted delimiter: the text is emitted literally, with no expansion
  cat << 'HELPEOF'
codeharness Ralph Loop — Autonomous execution with verification gates

Usage: ralph/ralph.sh --plugin-dir DIR [OPTIONS]

Required:
  --plugin-dir DIR          Path to codeharness plugin directory

Options:
  -h, --help                Show this help message
  --max-iterations NUM      Maximum loop iterations (default: 50)
  --max-story-retries NUM   Max retries per story before flagging (default: 10)
  --timeout SECONDS         Total loop timeout in seconds (default: 43200 = 12h)
  --iteration-timeout MIN   Per-iteration timeout in minutes (default: 30)
  --calls NUM               Max API calls per hour (default: 100)
  --prompt FILE             Prompt file for each iteration
  --progress FILE           Progress file (tasks JSON)
  --live                    Show live output streaming
  --reset                   Clear retry counters, flagged stories, and circuit breaker before starting
  --reset-circuit           Reset circuit breaker and exit
  --status                  Show current status and exit

The loop:
  1. Reads next task from progress file
  2. Spawns fresh Claude Code instance with --plugin-dir
  3. Agent implements story (harness hooks enforce verification)
  4. Circuit breaker monitors for stagnation
  5. On completion or gate failure, picks next task or iterates
HELPEOF
}
964
-
965
- # ─── Sprint Summary ──────────────────────────────────────────────────────────
966
-
967
# Print a compact one-line sprint summary at startup.
# Logs "<completed>/<total> done, <remaining> remaining" and, when one exists,
# the first story in the sprint status file that is neither "done" nor flagged.
# Globals (read): SPRINT_STATUS_FILE
# Calls: get_task_counts (prints "<total> <completed>"), is_story_flagged,
#        log_status
print_sprint_summary() {
  local counts
  counts=$(get_task_counts)
  local total=${counts%% *}
  local completed=${counts##* }
  local remaining=$((total - completed))

  # Find next story
  local next_story=""
  local next_status=""
  # Loop variables are local so the scan cannot clobber caller state
  local key value
  if [[ -f "$SPRINT_STATUS_FILE" ]]; then
    # "|| [[ -n $key ]]" keeps a final line that lacks a trailing newline
    while IFS=: read -r key value || [[ -n "$key" ]]; do
      # Trim leading/trailing whitespace without spawning subprocesses
      key="${key#"${key%%[![:space:]]*}"}"
      key="${key%"${key##*[![:space:]]}"}"
      value="${value#"${value%%[![:space:]]*}"}"
      value="${value%"${value##*[![:space:]]}"}"

      # Skip blank lines and comments
      [[ -z "$key" || "$key" == \#* ]] && continue

      # Story keys look like "<epic>-<story>-<slug>"
      if [[ "$key" =~ ^[0-9]+-[0-9]+- && "$value" != "done" ]]; then
        if ! is_story_flagged "$key"; then
          next_story="$key"
          next_status="$value"
          break
        fi
      fi
    done < "$SPRINT_STATUS_FILE"
  fi

  if [[ -n "$next_story" ]]; then
    log_status "INFO" "Sprint: ${completed}/${total} done, ${remaining} remaining — next: ${next_story} (${next_status})"
  else
    log_status "INFO" "Sprint: ${completed}/${total} done, ${remaining} remaining"
  fi
}
999
-
1000
- # ─── Main ────────────────────────────────────────────────────────────────────
1001
-
1002
# Entry point of the Ralph loop.
#
# Validates --plugin-dir, derives all state/log paths from the current
# working directory, checks dependencies, initialises the circuit breaker and
# call tracking, then runs iterations until a stop condition is hit:
# max iterations, total loop timeout, circuit breaker halt, all stories
# complete, or too many consecutive failures.
#
# Each iteration calls execute_iteration and dispatches on its return code:
#   0      success (retry tracking compares sprint status before/after)
#   2      API usage limit reached (sleep 60 minutes)
#   3      circuit breaker opened (halt)
#   4      transient API error (sleep 30s, not counted against the story)
#   other  failure such as timeout or crash (counts towards the retry limit)
#
# Globals (read):    PLUGIN_DIR, PROGRESS_FILE, PROMPT_FILE, RESET_RETRIES,
#                    MAX_ITERATIONS, MAX_STORY_RETRIES, LOOP_TIMEOUT_SECONDS,
#                    CB_STATE_FILE, UNCHANGED_STORIES, CHANGED_STORIES
# Globals (written): PLUGIN_DIR, HARNESS_STATE_DIR, LOG_DIR, STATUS_FILE,
#                    LIVE_LOG_FILE, CALL_COUNT_FILE, TIMESTAMP_FILE,
#                    STORY_RETRY_FILE, FLAGGED_STORIES_FILE, PROGRESS_FILE,
#                    SPRINT_STATUS_FILE, PROMPT_FILE, loop_start_time,
#                    loop_count
# Exits 1 on missing/invalid plugin dir, missing jq, or missing driver CLI.
main() {
  if [[ -z "$PLUGIN_DIR" ]]; then
    log_status "ERROR" "Missing required --plugin-dir argument"
    show_help
    exit 1
  fi

  # Resolve paths.
  # Resolve into a temporary first: assigning straight to PLUGIN_DIR would
  # blank it when cd fails, and the error message would lose the bad path.
  local resolved_plugin_dir
  if ! resolved_plugin_dir="$(cd "$PLUGIN_DIR" 2>/dev/null && pwd)"; then
    log_status "ERROR" "Plugin directory does not exist: $PLUGIN_DIR"
    exit 1
  fi
  PLUGIN_DIR="$resolved_plugin_dir"

  # Derive state paths from project root (cwd)
  local project_root
  project_root="$(pwd)"

  HARNESS_STATE_DIR="${project_root}/.claude"
  LOG_DIR="${project_root}/ralph/logs"
  STATUS_FILE="${project_root}/ralph/status.json"
  LIVE_LOG_FILE="${project_root}/ralph/live.log"
  CALL_COUNT_FILE="${project_root}/ralph/.call_count"
  TIMESTAMP_FILE="${project_root}/ralph/.last_reset"
  STORY_RETRY_FILE="${project_root}/ralph/.story_retries"
  FLAGGED_STORIES_FILE="${project_root}/ralph/.flagged_stories"

  # Use progress file from argument or default (legacy, optional)
  PROGRESS_FILE="${PROGRESS_FILE:-${project_root}/ralph/progress.json}"

  # codeharness: Sprint status file is the primary task source
  SPRINT_STATUS_FILE="${project_root}/_bmad-output/implementation-artifacts/sprint-status.yaml"

  # Use prompt file from argument or default
  PROMPT_FILE="${PROMPT_FILE:-${project_root}/.ralph/PROMPT.md}"

  # Create directories
  mkdir -p "$LOG_DIR"

  # Check dependencies
  if ! command -v jq &>/dev/null; then
    log_status "ERROR" "Required dependency 'jq' is not installed"
    exit 1
  fi

  # Load platform driver
  load_platform_driver

  # Check CLI binary
  if ! driver_check_available; then
    log_status "ERROR" "$(driver_display_name) CLI not found: $(driver_cli_binary)"
    exit 1
  fi

  # Initialize circuit breaker
  export HARNESS_STATE_DIR
  init_circuit_breaker

  # Initialize rate limiting
  init_call_tracking

  # Crash recovery: detect if resuming from a previous run
  if [[ -f "$STATUS_FILE" ]]; then
    local prev_status
    prev_status=$(jq -r '.status // ""' "$STATUS_FILE" 2>/dev/null || echo "")
    if [[ -n "$prev_status" && "$prev_status" != "completed" ]]; then
      log_status "INFO" "Resuming from last completed story"
    fi
  fi

  # Reset retry state if --reset flag was passed
  if [[ "$RESET_RETRIES" == "true" ]]; then
    if [[ -f "$STORY_RETRY_FILE" ]]; then
      rm -f "$STORY_RETRY_FILE"
      log_status "INFO" "Cleared story retry counters"
    fi
    if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
      rm -f "$FLAGGED_STORIES_FILE"
      log_status "INFO" "Cleared flagged stories"
    fi
    reset_circuit_breaker "Reset via --reset flag"
    log_status "INFO" "Circuit breaker reset to CLOSED"
  fi

  # .story_retries and .flagged_stories are file-based — they persist automatically

  log_status "SUCCESS" "Ralph loop starting"
  log_status "DEBUG" "Plugin: $PLUGIN_DIR"
  log_status "DEBUG" "Max iterations: $MAX_ITERATIONS | Timeout: $((LOOP_TIMEOUT_SECONDS / 3600))h"
  log_status "DEBUG" "Prompt: $PROMPT_FILE"
  log_status "DEBUG" "Sprint status: $SPRINT_STATUS_FILE"
  log_status "DEBUG" "Max story retries: $MAX_STORY_RETRIES"

  # Record loop start time for timeout
  loop_start_time=$(date +%s)

  print_sprint_summary

  local consecutive_failures=0
  local max_consecutive_failures=3

  while true; do
    loop_count=$((loop_count + 1))

    # ── Check loop limits ──

    if [[ $loop_count -gt $MAX_ITERATIONS ]]; then
      update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "max_iterations" "stopped" "max_iterations_reached"

      local counts
      counts=$(get_task_counts)
      local total=${counts%% *}
      local completed=${counts##* }
      log_status "INFO" "Max iterations ($MAX_ITERATIONS) reached. ${completed}/${total} stories complete."
      break
    fi

    # Check total timeout
    local elapsed=$(( $(date +%s) - loop_start_time ))
    if [[ $elapsed -ge $LOOP_TIMEOUT_SECONDS ]]; then
      log_status "WARN" "Loop timeout reached (${LOOP_TIMEOUT_SECONDS}s)"
      update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "timeout" "stopped" "loop_timeout"
      break
    fi

    # ── Check circuit breaker ──

    if should_halt_execution; then
      # Auto-reset: if there are actionable stories (sprint not complete),
      # the breaker was tripped by a previous session's no-ops. Reset and retry.
      if ! all_tasks_complete; then
        log_status "INFO" "Circuit breaker open but actionable stories exist — auto-resetting"
        reset_circuit_breaker "Auto-reset: actionable stories detected"
      else
        local cb_no_progress=0
        if [[ -f "$CB_STATE_FILE" ]]; then
          cb_no_progress=$(jq -r '.consecutive_no_progress // 0' "$CB_STATE_FILE" 2>/dev/null || echo "0")
        fi
        log_status "WARN" "Circuit breaker: no progress in ${cb_no_progress} iterations"
        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "circuit_breaker" "halted" "stagnation_detected"
        break
      fi
    fi

    # ── Check rate limit ──

    if ! can_make_call; then
      wait_for_reset
      continue
    fi

    # ── Check task completion ──

    if all_tasks_complete; then
      local counts
      counts=$(get_task_counts)
      local total=${counts%% *}

      update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "all_complete" "completed" "all_tasks_done"
      log_status "SUCCESS" "All stories complete. ${total} stories verified in ${loop_count} iterations."
      break
    fi

    # ── Get current task ──

    local current_task
    current_task=$(get_current_task)

    log_status "LOOP" "=== Iteration #$loop_count ==="
    update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "executing" "running"

    # ── Snapshot story statuses before iteration ──
    local before_snapshot
    before_snapshot=$(snapshot_story_statuses)

    # ── Execute ──

    execute_iteration "$loop_count" "$current_task"
    local exec_result=$?

    case $exec_result in
      0)
        consecutive_failures=0
        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "completed" "success"

        # ── Retry tracking: compare sprint-status before/after ──
        local after_snapshot
        after_snapshot=$(snapshot_story_statuses)
        detect_story_changes "$before_snapshot" "$after_snapshot"

        # Only increment retry for the FIRST non-done, non-flagged story
        # (the one harness-run would have picked up). Other stories were
        # never attempted — don't penalise them for not progressing.
        if [[ -n "$UNCHANGED_STORIES" ]]; then
          while IFS= read -r skey; do
            [[ -z "$skey" ]] && continue
            if is_story_flagged "$skey"; then
              continue
            fi
            local retry_count
            retry_count=$(increment_story_retry "$skey")
            if [[ $retry_count -ge $MAX_STORY_RETRIES ]]; then
              log_status "WARN" "Story ${skey} exceeded retry limit (${retry_count}) — flagging and moving on"
              flag_story "$skey"
            else
              log_status "WARN" "Story ${skey} — retry ${retry_count}/${MAX_STORY_RETRIES}"
            fi
            break # only retry the first actionable story
          done <<< "$UNCHANGED_STORIES"
        fi

        if [[ -n "$CHANGED_STORIES" ]]; then
          while IFS= read -r skey; do
            [[ -z "$skey" ]] && continue
            # Extract story title from story file if available
            local story_file="$project_root/_bmad-output/implementation-artifacts/${skey}.md"
            local story_title=""
            if [[ -f "$story_file" ]]; then
              story_title=$(grep -m1 '^# \|^## Story' "$story_file" 2>/dev/null | sed 's/^#* *//' | head -c 60)
            fi
            local proof_file="$project_root/verification/${skey}-proof.md"
            local proof_info=""
            if [[ -f "$proof_file" ]]; then
              proof_info=" [proof: verification/${skey}-proof.md]"
            fi
            if [[ -n "$story_title" ]]; then
              log_status "SUCCESS" "Story ${skey}: DONE — ${story_title}${proof_info}"
            else
              log_status "SUCCESS" "Story ${skey}: DONE${proof_info}"
            fi
          done <<< "$CHANGED_STORIES"
        fi

        sleep 5 # Brief pause between iterations
        ;;
      2)
        # API limit — wait or exit
        log_status "WARN" "API usage limit reached. Waiting 60 minutes..."
        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "api_limit" "paused"
        sleep 3600
        ;;
      3)
        # Circuit breaker
        log_status "ERROR" "Circuit breaker opened — halting loop"
        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "circuit_breaker" "halted"
        break
        ;;
      4)
        # Transient API error — retry after brief pause, don't count against story
        consecutive_failures=0 # reset — this isn't the story's fault
        log_status "INFO" "Transient API error — retrying in 30s (not counting against story)"
        sleep 30
        ;;
      *)
        # Failure (timeout or crash) — increment retry for the story that was being worked on
        consecutive_failures=$((consecutive_failures + 1))

        # Increment retry for the first non-done, non-flagged story (the one that caused the timeout)
        local after_snap_fail
        after_snap_fail=$(snapshot_story_statuses)
        while IFS=: read -r fkey fstatus; do
          [[ -z "$fkey" ]] && continue
          [[ "$fstatus" == "done" ]] && continue
          if ! is_story_flagged "$fkey"; then
            local fail_retry
            fail_retry=$(increment_story_retry "$fkey")
            if [[ $fail_retry -ge $MAX_STORY_RETRIES ]]; then
              log_status "WARN" "Story ${fkey} exceeded retry limit (${fail_retry}) after timeout — flagging"
              flag_story "$fkey"
            else
              log_status "WARN" "Story ${fkey} — timeout retry ${fail_retry}/${MAX_STORY_RETRIES}"
            fi
            break
          fi
        done <<< "$after_snap_fail"

        if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
          log_status "ERROR" "$max_consecutive_failures consecutive failures — halting"
          update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "consecutive_failures" "halted"
          break
        fi

        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "failed" "error"
        log_status "WARN" "Iteration failed ($consecutive_failures/$max_consecutive_failures). Waiting 30s..."
        sleep 30
        ;;
    esac

    # Print progress summary after every iteration
    print_progress_summary

    # ── Show session issues and retro highlights ──
    print_iteration_insights

    log_status "LOOP" "=== End Iteration #$loop_count ==="
  done

  # Final summary — reads from sprint-status.yaml
  local counts
  counts=$(get_task_counts)
  local total=${counts%% *}
  local completed=${counts##* }

  local elapsed_total=$(( $(date +%s) - loop_start_time ))
  local elapsed_min=$(( elapsed_total / 60 ))

  log_status "SUCCESS" "Ralph loop finished"
  log_status "INFO" " Iterations: $loop_count"
  log_status "INFO" " Stories completed: $completed/$total"
  log_status "INFO" " Elapsed: ${elapsed_min}m"
  log_status "INFO" " API calls: $(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")"

  if [[ $completed -eq $total && $total -gt 0 ]]; then
    log_status "SUCCESS" "All stories complete. $total stories verified in $loop_count iterations."
  fi

  # Write final summary to status file
  update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "final_summary" \
    "$(if [[ $completed -eq $total && $total -gt 0 ]]; then echo "completed"; else echo "stopped"; fi)" \
    "completed:$completed/$total"

}
1323
-
1324
- # ─── CLI Parsing ─────────────────────────────────────────────────────────────
1325
-
1326
# Parse command-line options and start the loop, but only when this file is
# executed directly (not when it is sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then

  # Abort with usage when a value-taking option has no value after it.
  # Without this guard `shift 2` fails on a trailing option, shifts nothing,
  # and the parsing loop below never terminates.
  # Arguments: $1 - option name, $2 - number of remaining arguments ($#)
  _require_option_value() {
    if [[ "$2" -lt 2 ]]; then
      echo "Option $1 requires a value"
      show_help
      exit 1
    fi
  }

  while [[ $# -gt 0 ]]; do
    case $1 in
      -h|--help)
        show_help
        exit 0
        ;;
      --plugin-dir)
        _require_option_value "$1" "$#"
        PLUGIN_DIR="$2"
        shift 2
        ;;
      --max-iterations)
        _require_option_value "$1" "$#"
        MAX_ITERATIONS="$2"
        shift 2
        ;;
      --max-story-retries)
        _require_option_value "$1" "$#"
        MAX_STORY_RETRIES="$2"
        shift 2
        ;;
      --timeout)
        _require_option_value "$1" "$#"
        LOOP_TIMEOUT_SECONDS="$2"
        shift 2
        ;;
      --iteration-timeout)
        _require_option_value "$1" "$#"
        ITERATION_TIMEOUT_MINUTES="$2"
        shift 2
        ;;
      --calls)
        _require_option_value "$1" "$#"
        MAX_CALLS_PER_HOUR="$2"
        shift 2
        ;;
      --prompt)
        _require_option_value "$1" "$#"
        PROMPT_FILE="$2"
        shift 2
        ;;
      --progress)
        _require_option_value "$1" "$#"
        PROGRESS_FILE="$2"
        shift 2
        ;;
      --live)
        LIVE_OUTPUT=true
        shift
        ;;
      --reset)
        RESET_RETRIES=true
        shift
        ;;
      --reset-circuit)
        # Derive state paths so circuit breaker uses the correct directory
        HARNESS_STATE_DIR="$(pwd)/.claude"
        export HARNESS_STATE_DIR
        init_circuit_breaker
        reset_circuit_breaker "Manual reset via CLI"
        echo "Circuit breaker reset to CLOSED"
        exit 0
        ;;
      --status)
        _status_file="$(pwd)/ralph/status.json"
        if [[ -f "$_status_file" ]]; then
          # Pretty-print with jq; fall back to raw output if jq fails
          jq . "$_status_file" 2>/dev/null || cat "$_status_file"
        else
          echo "No status file found."
        fi
        exit 0
        ;;
      *)
        echo "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done

  main

fi