codeharness 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/ralph/ralph.sh ADDED
@@ -0,0 +1,1006 @@
1
#!/usr/bin/env bash
# codeharness Ralph Loop (vendored from snarktank/ralph)
#
# Autonomous execution loop: every iteration spawns a fresh Claude Code
# instance, with verification gates, crash recovery, rate limiting, and
# circuit breaker protection.
#
# Usage: ralph/ralph.sh --plugin-dir ./codeharness [OPTIONS]

set -o errexit

# Directory containing this script; helper libraries live in its lib/ folder.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

. "$SCRIPT_DIR/lib/date_utils.sh"
. "$SCRIPT_DIR/lib/timeout_utils.sh"
. "$SCRIPT_DIR/lib/circuit_breaker.sh"
14
+
15
# ─── Configuration ───────────────────────────────────────────────────────────

VERSION="0.1.0"

# Paths. All start empty: PLUGIN_DIR comes from --plugin-dir, the others are
# derived from the project root inside main().
PLUGIN_DIR=""          # plugin directory (required)
HARNESS_STATE_DIR=""   # harness state directory
PROGRESS_FILE=""       # legacy progress file, kept for backwards compat (optional)
SPRINT_STATUS_FILE=""  # primary task source, read by the /harness-run skill
PROMPT_FILE=""         # prompt handed to every iteration
LOG_DIR=""             # log directory

# Loop limits (each can be pre-set through the environment).
: "${MAX_ITERATIONS:=50}"
: "${MAX_STORY_RETRIES:=3}"
: "${LOOP_TIMEOUT_SECONDS:=14400}"   # 4 hours default
: "${ITERATION_TIMEOUT_MINUTES:=15}"

# Rate limiting.
: "${MAX_CALLS_PER_HOUR:=100}"
RATE_LIMIT_SLEEP=3600   # 1 hour

# Driver selection and CLI options.
: "${PLATFORM_DRIVER:=claude-code}"
: "${CLAUDE_OUTPUT_FORMAT:=json}"
: "${CLAUDE_ALLOWED_TOOLS:=}"
: "${CLAUDE_USE_CONTINUE:=false}"   # fresh context per iteration by default

# Live output streaming (switched on by --live).
LIVE_OUTPUT=false

# ANSI color escapes, rendered by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m'

# ─── Internal state ─────────────────────────────────────────────────────────

# Runtime file locations, assigned in main().
CALL_COUNT_FILE=""
TIMESTAMP_FILE=""
STATUS_FILE=""
LIVE_LOG_FILE=""
STORY_RETRY_FILE=""
FLAGGED_STORIES_FILE=""

# Global arrays for driver command building.
declare -a CLAUDE_CMD_ARGS=()
declare -a LIVE_CMD_ARGS=()
declare -a VALID_TOOL_PATTERNS=()

# Loop bookkeeping.
loop_count=0
loop_start_time=""
80
+
81
+ # ─── Logging ─────────────────────────────────────────────────────────────────
82
+
83
# Emit a timestamped log line.
# The colorized line goes to stderr; when LOG_DIR is set, the plain line is
# also appended to $LOG_DIR/ralph.log.
# Arguments: $1 - level (INFO, WARN, ERROR, SUCCESS or LOOP select a color)
#            $2 - message text
log_status() {
  local level=$1
  local message=$2
  local stamp=$(date '+%Y-%m-%d %H:%M:%S')
  local tint=""

  if [[ "$level" == "INFO" ]]; then
    tint=$BLUE
  elif [[ "$level" == "WARN" ]]; then
    tint=$YELLOW
  elif [[ "$level" == "ERROR" ]]; then
    tint=$RED
  elif [[ "$level" == "SUCCESS" ]]; then
    tint=$GREEN
  elif [[ "$level" == "LOOP" ]]; then
    tint=$PURPLE
  fi

  echo -e "${tint}[$stamp] [$level] $message${NC}" >&2

  # No log directory yet (e.g. before main() has resolved paths): stderr only.
  if [[ -z "$LOG_DIR" ]]; then
    return 0
  fi
  echo "[$stamp] [$level] $message" >> "$LOG_DIR/ralph.log"
}
102
+
103
+ # ─── Rate Limiting ───────────────────────────────────────────────────────────
104
+
105
# Reset the hourly call counter when the wall-clock hour has changed.
# Compares the current hour (YYYYMMDDHH) with the value stored in
# TIMESTAMP_FILE; on mismatch (or missing file) writes 0 to CALL_COUNT_FILE
# and records the current hour.
init_call_tracking() {
  local hour_now=$(date +%Y%m%d%H)
  local hour_recorded=""

  if [[ -f "$TIMESTAMP_FILE" ]]; then
    hour_recorded=$(cat "$TIMESTAMP_FILE")
  fi

  # Same hour as the last reset: keep the running count.
  if [[ "$hour_now" == "$hour_recorded" ]]; then
    return 0
  fi

  printf '%s\n' "0" > "$CALL_COUNT_FILE"
  printf '%s\n' "$hour_now" > "$TIMESTAMP_FILE"
}
118
+
119
# Tell whether another call fits in this hour's budget.
# Reads the count from CALL_COUNT_FILE (0 when the file is missing).
# Returns: 1 when the count has reached MAX_CALLS_PER_HOUR, 0 otherwise.
can_make_call() {
  local used=0

  [[ -f "$CALL_COUNT_FILE" ]] && used=$(cat "$CALL_COUNT_FILE")

  if [[ $used -ge $MAX_CALLS_PER_HOUR ]]; then
    return 1
  fi
  return 0
}
131
+
132
# Block until the top of the next wall-clock hour, then reset the call counter.
# Logs the current usage, sleeps for the remaining seconds of this hour,
# writes 0 to CALL_COUNT_FILE and the new hour (YYYYMMDDHH) to TIMESTAMP_FILE.
wait_for_reset() {
  local calls_made
  calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
  log_status "WARN" "Rate limit reached ($calls_made/$MAX_CALLS_PER_HOUR). Waiting for reset..."

  # Sample minute and second with a single date call so they cannot straddle
  # a minute boundary.
  local current_minute current_second
  read -r current_minute current_second <<< "$(date '+%M %S')"

  # date zero-pads its output; force base 10 so "08" and "09" are not parsed
  # as (invalid) octal literals by bash arithmetic.
  local wait_time=$(( (60 - 10#${current_minute:-0} - 1) * 60 + (60 - 10#${current_second:-0}) ))

  log_status "INFO" "Sleeping for $wait_time seconds until next hour..."
  sleep "$wait_time"

  echo "0" > "$CALL_COUNT_FILE"
  date +%Y%m%d%H > "$TIMESTAMP_FILE"
  log_status "SUCCESS" "Rate limit reset."
}
147
+
148
+ # ─── Progress Tracking ───────────────────────────────────────────────────────
149
+
150
# Write the loop's current state to STATUS_FILE as a JSON document.
# Arguments: $1 - loop count
#            $2 - calls made this hour
#            $3 - last action label
#            $4 - status label
#            $5 - exit reason (optional, defaults to empty)
# Globals read: STATUS_FILE, SPRINT_STATUS_FILE, FLAGGED_STORIES_FILE,
#               MAX_CALLS_PER_HOUR, MAX_ITERATIONS, VERSION, loop_start_time
# Does nothing when STATUS_FILE is empty. Returns jq's status on failure and
# leaves any previous status file untouched in that case.
update_status() {
  local loop_count=$1
  local calls_made=$2
  local last_action=$3
  local status=$4
  local exit_reason=${5:-""}

  if [[ -z "$STATUS_FILE" ]]; then
    return
  fi

  # jq --argjson requires valid JSON numbers. The call counter comes from a
  # file that may be empty or corrupt, so fall back to 0 instead of failing,
  # and strip leading zeros (not valid in JSON).
  if [[ "$loop_count" =~ ^[0-9]+$ ]]; then
    loop_count=$((10#$loop_count))
  else
    loop_count=0
  fi
  if [[ "$calls_made" =~ ^[0-9]+$ ]]; then
    calls_made=$((10#$calls_made))
  else
    calls_made=0
  fi

  # codeharness: Include sprint-status story counts in status JSON
  local stories_total=0
  local stories_completed=0
  if [[ -n "$SPRINT_STATUS_FILE" && -f "$SPRINT_STATUS_FILE" ]]; then
    local sprint_counts
    sprint_counts=$(get_task_counts)
    stories_total=${sprint_counts%% *}
    stories_completed=${sprint_counts##* }
    [[ "$stories_total" =~ ^[0-9]+$ ]] || stories_total=0
    [[ "$stories_completed" =~ ^[0-9]+$ ]] || stories_completed=0
  fi

  local stories_remaining=$((stories_total - stories_completed))
  local elapsed_seconds=0
  if [[ -n "$loop_start_time" ]]; then
    elapsed_seconds=$(( $(date +%s) - loop_start_time ))
  fi

  # Build flagged stories JSON array (one entry per non-empty line).
  local flagged_json="[]"
  if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
    flagged_json=$(jq -R -s 'split("\n") | map(select(length > 0))' < "$FLAGGED_STORIES_FILE") \
      || flagged_json="[]"
  fi

  # Write to a temp file and rename, so a failed or interrupted jq run never
  # leaves a truncated status file behind.
  local tmp_file="${STATUS_FILE}.tmp.$$"
  if jq -n \
    --arg timestamp "$(get_iso_timestamp)" \
    --argjson loop_count "$loop_count" \
    --argjson calls_made "$calls_made" \
    --argjson max_calls "$MAX_CALLS_PER_HOUR" \
    --argjson max_iterations "$MAX_ITERATIONS" \
    --arg last_action "$last_action" \
    --arg status "$status" \
    --arg exit_reason "$exit_reason" \
    --arg version "$VERSION" \
    --argjson stories_total "$stories_total" \
    --argjson stories_completed "$stories_completed" \
    --argjson stories_remaining "$stories_remaining" \
    --argjson elapsed_seconds "$elapsed_seconds" \
    --argjson flagged_stories "$flagged_json" \
    '{
      timestamp: $timestamp,
      version: $version,
      loop_count: $loop_count,
      calls_made_this_hour: $calls_made,
      max_calls_per_hour: $max_calls,
      max_iterations: $max_iterations,
      last_action: $last_action,
      status: $status,
      exit_reason: $exit_reason,
      stories_total: $stories_total,
      stories_completed: $stories_completed,
      stories_remaining: $stories_remaining,
      elapsed_seconds: $elapsed_seconds,
      flagged_stories: $flagged_stories
    }' > "$tmp_file"; then
    mv -f "$tmp_file" "$STATUS_FILE"
  else
    local rc=$?
    rm -f "$tmp_file"
    return "$rc"
  fi
}
215
+
216
# codeharness: Ralph does not choose tasks. The /harness-run skill picks one
# inside each Claude session; Ralph only spawns sessions and watches
# sprint-status.yaml for completion.
# Outputs: an empty line. Always returns 0.
get_current_task() {
  printf '\n'
}
223
+
224
# codeharness: Check if all stories in sprint-status.yaml are done.
# Scans SPRINT_STATUS_FILE for "key: value" lines whose key matches the story
# pattern N-N-slug (e.g. 5-1-ralph-loop-integration).
# Returns: 0 if at least one story exists and every story has status "done";
#          1 otherwise (including a missing file or no story entries).
check_sprint_complete() {
  if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
    return 1
  fi

  local total=0
  local done_count=0
  local key value

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline;
  # otherwise the last story would be silently ignored.
  while IFS=: read -r key value || [[ -n "$key$value" ]]; do
    # Trim surrounding whitespace without forking sed for every line.
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    # Skip comments and empty lines
    if [[ -z "$key" || "$key" == \#* ]]; then
      continue
    fi

    # Match story keys: N-N-slug
    if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
      total=$((total + 1))
      if [[ "$value" == "done" ]]; then
        done_count=$((done_count + 1))
      fi
    fi
  done < "$SPRINT_STATUS_FILE"

  if [[ $total -eq 0 ]]; then
    return 1
  fi

  [[ $done_count -eq $total ]]
}
258
+
259
# codeharness: completion is decided by sprint-status.yaml, so this simply
# forwards to check_sprint_complete and passes its exit status through.
all_tasks_complete() {
  check_sprint_complete
  return $?
}
263
+
264
# codeharness: Get story counts from sprint-status.yaml.
# Counts "key: value" lines in SPRINT_STATUS_FILE whose key matches the story
# pattern N-N-slug, and how many of those have the value "done".
# Outputs: "total completed" (space-separated); "0 0" when the file is missing.
get_task_counts() {
  if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
    echo "0 0"
    return
  fi

  local total=0
  local completed=0
  local key value

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS=: read -r key value || [[ -n "$key$value" ]]; do
    # Trim surrounding whitespace without forking sed for every line.
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    # Skip comments and empty lines
    if [[ -z "$key" || "$key" == \#* ]]; then
      continue
    fi

    if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
      total=$((total + 1))
      if [[ "$value" == "done" ]]; then
        completed=$((completed + 1))
      fi
    fi
  done < "$SPRINT_STATUS_FILE"

  echo "$total $completed"
}
291
+
292
+ # ─── Retry Tracking ─────────────────────────────────────────────────────────
293
+
294
# Increment the retry count for a story and persist it in STORY_RETRY_FILE
# (one "key count" pair per line).
# Arguments: $1 - story key
# Outputs:   the new count; "0" when STORY_RETRY_FILE is not configured.
increment_story_retry() {
  local story_key=$1

  if [[ -z "$STORY_RETRY_FILE" ]]; then
    echo "0"
    return
  fi

  local count=0
  local found=false
  local key val
  local -a entries=()

  # Single pass: copy every other entry, bump the matching one in place.
  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  if [[ -f "$STORY_RETRY_FILE" ]]; then
    while IFS=' ' read -r key val || [[ -n "$key" ]]; do
      if [[ -z "$key" ]]; then
        continue
      fi
      if [[ "$key" != "$story_key" ]]; then
        entries+=("$key $val")
        continue
      fi
      # A repeated key should not happen; keep only the first occurrence.
      if [[ "$found" == "true" ]]; then
        continue
      fi
      # Force base 10 so a zero-padded value is not read as octal; anything
      # non-numeric counts as 0.
      if [[ "$val" =~ ^[0-9]+$ ]]; then
        count=$((10#$val))
      fi
      count=$((count + 1))
      entries+=("$key $count")
      found=true
    done < "$STORY_RETRY_FILE"
  fi

  if [[ "$found" == "false" ]]; then
    count=1
    entries+=("$story_key $count")
  fi

  # Rewrite via temp file + mv so readers never see a half-written file.
  local temp_file="${STORY_RETRY_FILE}.tmp"
  printf '%s\n' "${entries[@]}" > "$temp_file"
  mv -f "$temp_file" "$STORY_RETRY_FILE"

  echo "$count"
}
342
+
343
# Look up how many retries have been recorded for a story.
# Arguments: $1 - story key
# Outputs:   the stored count, or "0" when STORY_RETRY_FILE is unset, missing,
#            or has no entry for the key.
get_story_retry_count() {
  local story_key=$1
  local name tries

  if [[ -n "$STORY_RETRY_FILE" && -f "$STORY_RETRY_FILE" ]]; then
    while IFS=' ' read -r name tries; do
      [[ "$name" == "$story_key" ]] || continue
      echo "$((tries + 0))"
      return
    done < "$STORY_RETRY_FILE"
  fi

  echo "0"
}
361
+
362
# Check if a story is flagged (exceeded retry limit).
# Arguments: $1 - story key
# Returns:   0 when FLAGGED_STORIES_FILE contains a line exactly equal to the
#            key; non-zero otherwise (including unset/missing file or an
#            empty key).
is_story_flagged() {
  local story_key=$1

  if [[ -z "$FLAGGED_STORIES_FILE" || ! -f "$FLAGGED_STORIES_FILE" ]]; then
    return 1
  fi

  # An empty key would match blank lines; it is never a real story.
  if [[ -z "$story_key" ]]; then
    return 1
  fi

  # -F: compare the key literally (a "." in a slug must not act as a regex
  # wildcard); -x: whole-line match; --: the key can never be read as an option.
  grep -Fqx -- "$story_key" "$FLAGGED_STORIES_FILE" 2>/dev/null
}
372
+
373
# Record a story as flagged (it exceeded the retry limit).
# Appends the key to FLAGGED_STORIES_FILE unless is_story_flagged already
# reports it; a no-op when FLAGGED_STORIES_FILE is not configured.
# Arguments: $1 - story key
flag_story() {
  local story_key=$1

  [[ -n "$FLAGGED_STORIES_FILE" ]] || return 0

  is_story_flagged "$story_key" || echo "$story_key" >> "$FLAGGED_STORIES_FILE"
}
385
+
386
# Print the flagged stories, one key per line.
# Outputs: the contents of FLAGGED_STORIES_FILE, or a single empty line when
#          the file is not configured or does not exist.
get_flagged_stories() {
  if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
    cat "$FLAGGED_STORIES_FILE"
  else
    echo ""
  fi
}
394
+
395
# Snapshot sprint-status.yaml story statuses as "key:status" lines.
# Only entries whose key matches the story pattern N-N-slug are emitted, with
# surrounding whitespace removed from key and status.
# Outputs: one "key:status" line per story; an empty line when
#          SPRINT_STATUS_FILE does not exist.
snapshot_story_statuses() {
  if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
    echo ""
    return
  fi

  local key value

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS=: read -r key value || [[ -n "$key$value" ]]; do
    # Trim surrounding whitespace without forking sed for every line.
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    # Skip comments and empty lines
    if [[ -z "$key" || "$key" == \#* ]]; then
      continue
    fi

    if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
      echo "$key:$value"
    fi
  done < "$SPRINT_STATUS_FILE"
}
411
+
412
# Diff two "key:status" snapshots (as produced by snapshot_story_statuses).
# Arguments: $1 - snapshot taken before the iteration
#            $2 - snapshot taken after the iteration
# Sets the globals (newline-terminated key lists):
#   CHANGED_STORIES   - stories that are "done" now but were not before
#   UNCHANGED_STORIES - stories that are still not "done"
detect_story_changes() {
  local before_snapshot=$1
  local after_snapshot=$2
  local nl=$'\n'
  local story now prev_key prev_state was

  CHANGED_STORIES=""
  UNCHANGED_STORIES=""

  while IFS=: read -r story now; do
    if [[ -z "$story" ]]; then
      continue
    fi

    # Look up this story's status in the earlier snapshot (empty if absent).
    was=""
    while IFS=: read -r prev_key prev_state; do
      if [[ "$prev_key" == "$story" ]]; then
        was=$prev_state
        break
      fi
    done <<< "$before_snapshot"

    if [[ "$now" != "done" ]]; then
      UNCHANGED_STORIES+="${story}${nl}"
    elif [[ "$was" != "done" ]]; then
      CHANGED_STORIES+="${story}${nl}"
    fi
  done <<< "$after_snapshot"
}
442
+
443
+ # ─── Progress Summary ───────────────────────────────────────────────────────
444
+
445
# Log a one-line progress report: completed/total stories (from
# get_task_counts), the iteration count, and the time elapsed since
# loop_start_time, formatted as "XhYm", "XmYs" or "Xs".
print_progress_summary() {
  local tally
  tally=$(get_task_counts)
  local n_total=${tally%% *}
  local n_done=${tally##* }
  local secs=$(( $(date +%s) - loop_start_time ))
  local pretty="${secs}s"

  if [[ $secs -ge 3600 ]]; then
    pretty="$((secs / 3600))h$((secs % 3600 / 60))m"
  elif [[ $secs -ge 60 ]]; then
    pretty="$((secs / 60))m$((secs % 60))s"
  fi

  log_status "INFO" "Progress: ${n_done}/${n_total} stories complete (iterations: ${loop_count}, elapsed: ${pretty})"
}
463
+
464
+ # ─── Driver Management ──────────────────────────────────────────────────────
465
+
466
# Source the driver script selected by PLATFORM_DRIVER from
# $SCRIPT_DIR/drivers/, call its driver_valid_tools hook, and log which
# driver is active. Exits with status 1 when the driver file is missing.
load_platform_driver() {
  local driver_path="$SCRIPT_DIR/drivers/${PLATFORM_DRIVER}.sh"

  [[ -f "$driver_path" ]] || {
    log_status "ERROR" "Platform driver not found: $driver_path"
    exit 1
  }

  # shellcheck source=/dev/null
  . "$driver_path"

  driver_valid_tools
  log_status "INFO" "Platform driver: $(driver_display_name) ($(driver_cli_binary))"
}
479
+
480
+ # ─── Execution ───────────────────────────────────────────────────────────────
481
+
482
# Run a single loop iteration: build the driver command, run it under a
# timeout, then classify the outcome.
# Arguments: $1 - iteration number
#            $2 - task id (may be empty; only used for logging and context)
# Globals read: LOG_DIR, CALL_COUNT_FILE, ITERATION_TIMEOUT_MINUTES,
#               PROMPT_FILE, PLUGIN_DIR, LIVE_OUTPUT, LIVE_LOG_FILE,
#               CLAUDE_CMD_ARGS (presumably filled by driver_build_command —
#               confirm against the driver), PURPLE, NC
# Returns: 0 - command succeeded and the circuit breaker stayed closed
#          1 - command build failure, timeout, or other execution failure
#          2 - the output looks like an API usage limit message
#          3 - record_loop_result reported that the circuit breaker opened
# All exit statuses are captured explicitly so the function behaves the same
# whether or not the caller runs under `set -e`.
execute_iteration() {
  local iteration=$1
  local task_id=$2
  local timestamp
  timestamp=$(date '+%Y-%m-%d_%H-%M-%S')
  local output_file="$LOG_DIR/claude_output_${timestamp}.log"

  local calls_made
  calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
  # Tolerate an empty or corrupt counter file.
  [[ "$calls_made" =~ ^[0-9]+$ ]] || calls_made=0
  calls_made=$((10#$calls_made + 1))

  # Is git usable in the current directory? Checked once, used twice below.
  local in_git_repo=false
  if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null; then
    in_git_repo=true
  fi

  # Capture git HEAD SHA at iteration start for progress detection
  local loop_start_sha=""
  if [[ "$in_git_repo" == "true" ]]; then
    loop_start_sha=$(git rev-parse HEAD 2>/dev/null || echo "")
  fi

  log_status "LOOP" "Iteration $iteration — Task: ${task_id:-'(reading from prompt)'}"
  local timeout_seconds=$((ITERATION_TIMEOUT_MINUTES * 60))

  # Build loop context
  local loop_context="Loop #${iteration}."
  if [[ -n "$task_id" ]]; then
    loop_context+=" Current task: $task_id."
  fi

  # Build the command via driver
  local session_id=""  # Fresh context per iteration
  if ! driver_build_command "$PROMPT_FILE" "$loop_context" "$session_id" "$PLUGIN_DIR"; then
    log_status "ERROR" "Failed to build CLI command"
    return 1
  fi

  log_status "INFO" "Starting $(driver_display_name) (timeout: ${ITERATION_TIMEOUT_MINUTES}m)..."

  # Execute with timeout
  local exit_code=0

  if [[ "$LIVE_OUTPUT" == "true" ]]; then
    # Live streaming mode
    echo -e "\n=== Iteration #$iteration — $(date '+%Y-%m-%d %H:%M:%S') ===" > "$LIVE_LOG_FILE"
    echo -e "${PURPLE}━━━━━━━━━━━━━ $(driver_display_name) Output ━━━━━━━━━━━━━${NC}"

    # pipefail makes the pipeline report a failing first stage; the status
    # is read in the `||` branch so errexit cannot abort the script first.
    # tee -a keeps the iteration header written just above.
    set -o pipefail
    portable_timeout "${timeout_seconds}s" "${CLAUDE_CMD_ARGS[@]}" \
      < /dev/null 2>&1 | tee "$output_file" | tee -a "$LIVE_LOG_FILE" \
      || exit_code=${PIPESTATUS[0]}
    set +o pipefail

    echo -e "${PURPLE}━━━━━━━━━━━━━ End of Output ━━━━━━━━━━━━━━━━━━━${NC}"
  else
    # Background mode with progress monitoring
    portable_timeout "${timeout_seconds}s" "${CLAUDE_CMD_ARGS[@]}" \
      < /dev/null > "$output_file" 2>&1 &

    local claude_pid=$!

    # Mirror the output into the live log every 10s while the child runs.
    # The copy is best-effort and must never abort the loop.
    while kill -0 "$claude_pid" 2>/dev/null; do
      if [[ -f "$output_file" && -s "$output_file" ]]; then
        cp "$output_file" "$LIVE_LOG_FILE" 2>/dev/null || true
      fi
      sleep 10
    done

    wait "$claude_pid" || exit_code=$?

    # Final sync so the live log also holds the last chunk of output.
    if [[ -f "$output_file" && -s "$output_file" ]]; then
      cp "$output_file" "$LIVE_LOG_FILE" 2>/dev/null || true
    fi
  fi

  if [[ $exit_code -eq 0 ]]; then
    echo "$calls_made" > "$CALL_COUNT_FILE"
    log_status "SUCCESS" "$(driver_display_name) iteration completed successfully"

    # Detect progress: check for file changes (committed or uncommitted)
    local files_changed=0
    if [[ "$in_git_repo" == "true" ]]; then
      local current_sha
      current_sha=$(git rev-parse HEAD 2>/dev/null || echo "")

      if [[ -n "$loop_start_sha" && -n "$current_sha" && "$loop_start_sha" != "$current_sha" ]]; then
        files_changed=$(
          {
            git diff --name-only "$loop_start_sha" "$current_sha" 2>/dev/null
            git diff --name-only HEAD 2>/dev/null
            git diff --name-only --cached 2>/dev/null
          } | sort -u | wc -l
        )
      else
        files_changed=$(
          {
            git diff --name-only 2>/dev/null
            git diff --name-only --cached 2>/dev/null
          } | sort -u | wc -l
        )
      fi
      # Some wc implementations pad their output; normalize to a bare integer.
      files_changed=$((files_changed + 0))
    fi

    # Look for error markers, ignoring lines that merely contain a JSON key
    # with "error" in its name.
    local has_errors="false"
    if grep -v '"[^"]*error[^"]*":' "$output_file" 2>/dev/null | \
      grep -qE '(^Error:|^ERROR:|^error:|\]: error|Error occurred|failed with error|[Ee]xception|Fatal|FATAL)'; then
      has_errors="true"
      log_status "WARN" "Errors detected in output"
    fi

    local output_length
    output_length=$(wc -c < "$output_file" 2>/dev/null || echo 0)
    output_length=$((output_length + 0))

    # Record in circuit breaker
    local circuit_result=0
    record_loop_result "$iteration" "$files_changed" "$has_errors" "$output_length" \
      || circuit_result=$?

    if [[ $circuit_result -ne 0 ]]; then
      log_status "WARN" "Circuit breaker opened — halting execution"
      return 3
    fi

    return 0
  elif [[ $exit_code -eq 124 ]]; then
    # 124 is treated as "timed out" (the exit status timeout(1) uses;
    # portable_timeout presumably follows it — confirm in timeout_utils.sh).
    log_status "WARN" "Iteration timed out after ${ITERATION_TIMEOUT_MINUTES}m"
    return 1
  else
    # Check for API limit
    if grep -qi "5.*hour.*limit\|limit.*reached.*try.*back\|usage.*limit.*reached" "$output_file" 2>/dev/null; then
      log_status "ERROR" "Claude API usage limit reached"
      return 2
    else
      log_status "ERROR" "$(driver_display_name) execution failed (exit code: $exit_code)"
      return 1
    fi
  fi
}
609
+
610
+ # ─── Cleanup ─────────────────────────────────────────────────────────────────
611
+
612
# Signal handler for SIGINT/SIGTERM: record the interruption in the status
# file, log a short progress summary when the loop had already started, and
# exit with status 0.
cleanup() {
  log_status "INFO" "Ralph loop interrupted. Cleaning up..."
  update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "interrupted" "stopped" "user_cancelled"

  # Nothing to summarize if the loop never started or paths are not set.
  if [[ -z "$loop_start_time" || -z "$SPRINT_STATUS_FILE" ]]; then
    exit 0
  fi

  local tally
  tally=$(get_task_counts)
  local n_total=${tally%% *}
  local n_done=${tally##* }
  local secs=$(( $(date +%s) - loop_start_time ))
  local mins=$(( secs / 60 ))

  log_status "INFO" " Iterations: $loop_count"
  log_status "INFO" " Stories completed: $n_done/$n_total"
  log_status "INFO" " Elapsed: ${mins}m"

  exit 0
}

trap cleanup INT TERM
634
+
635
+ # ─── Help ────────────────────────────────────────────────────────────────────
636
+
637
# Print the command-line usage text to stdout.
# The loop description reflects what this script does: completion is read
# from sprint-status.yaml and task picking happens inside the Claude session
# (/harness-run), not from the legacy progress file.
show_help() {
  cat << 'HELPEOF'
codeharness Ralph Loop — Autonomous execution with verification gates

Usage: ralph/ralph.sh --plugin-dir DIR [OPTIONS]

Required:
    --plugin-dir DIR          Path to codeharness plugin directory

Options:
    -h, --help                Show this help message
    --max-iterations NUM      Maximum loop iterations (default: 50)
    --max-story-retries NUM   Max retries per story before flagging (default: 3)
    --timeout SECONDS         Total loop timeout in seconds (default: 14400 = 4h)
    --iteration-timeout MIN   Per-iteration timeout in minutes (default: 15)
    --calls NUM               Max API calls per hour (default: 100)
    --prompt FILE             Prompt file for each iteration
    --progress FILE           Progress file (legacy tasks JSON, optional)
    --live                    Show live output streaming
    --reset-circuit           Reset circuit breaker and exit
    --status                  Show current status and exit

The loop:
    1. Checks sprint-status.yaml for unfinished stories (the /harness-run skill picks the task)
    2. Spawns fresh Claude Code instance with --plugin-dir
    3. Agent implements story (harness hooks enforce verification)
    4. Circuit breaker monitors for stagnation
    5. On completion or gate failure, picks next task or iterates
HELPEOF
}
667
+
668
+ # ─── Main ────────────────────────────────────────────────────────────────────
669
+
670
# Print the current hourly call count (0 when the counter file is unreadable).
_ralph_call_count() {
  cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0"
}

# Entry point of the Ralph loop.
# Validates --plugin-dir, derives all state paths from the current directory,
# checks dependencies, loads the platform driver, initializes the circuit
# breaker and rate limiting, then iterates until one of: iteration limit,
# total timeout, circuit breaker, all stories done, or too many consecutive
# failures. Afterwards it logs a summary, writes the final status, and runs
# retro.sh when present.
# Globals written: PLUGIN_DIR, HARNESS_STATE_DIR, LOG_DIR, STATUS_FILE,
#   LIVE_LOG_FILE, CALL_COUNT_FILE, TIMESTAMP_FILE, STORY_RETRY_FILE,
#   FLAGGED_STORIES_FILE, PROGRESS_FILE, SPRINT_STATUS_FILE, PROMPT_FILE,
#   loop_count, loop_start_time
main() {
  if [[ -z "$PLUGIN_DIR" ]]; then
    log_status "ERROR" "Missing required --plugin-dir argument"
    show_help
    exit 1
  fi

  # Resolve paths. Resolve into a scratch variable first: assigning straight
  # to PLUGIN_DIR would blank it before the error message could print it.
  local resolved_plugin_dir
  if ! resolved_plugin_dir="$(cd "$PLUGIN_DIR" 2>/dev/null && pwd)"; then
    log_status "ERROR" "Plugin directory does not exist: $PLUGIN_DIR"
    exit 1
  fi
  PLUGIN_DIR="$resolved_plugin_dir"

  # Derive state paths from project root (cwd)
  local project_root
  project_root="$(pwd)"

  HARNESS_STATE_DIR="${project_root}/.claude"
  LOG_DIR="${project_root}/ralph/logs"
  STATUS_FILE="${project_root}/ralph/status.json"
  LIVE_LOG_FILE="${project_root}/ralph/live.log"
  CALL_COUNT_FILE="${project_root}/ralph/.call_count"
  TIMESTAMP_FILE="${project_root}/ralph/.last_reset"
  STORY_RETRY_FILE="${project_root}/ralph/.story_retries"
  FLAGGED_STORIES_FILE="${project_root}/ralph/.flagged_stories"

  # Use progress file from argument or default (legacy, optional)
  PROGRESS_FILE="${PROGRESS_FILE:-${project_root}/ralph/progress.json}"

  # codeharness: Sprint status file is the primary task source
  SPRINT_STATUS_FILE="${project_root}/_bmad-output/implementation-artifacts/sprint-status.yaml"

  # Use prompt file from argument or default
  PROMPT_FILE="${PROMPT_FILE:-${project_root}/.ralph/PROMPT.md}"

  # Create directories
  mkdir -p "$LOG_DIR"

  # Check dependencies
  if ! command -v jq &>/dev/null; then
    log_status "ERROR" "Required dependency 'jq' is not installed"
    exit 1
  fi

  # Load platform driver
  load_platform_driver

  # Check CLI binary
  if ! driver_check_available; then
    log_status "ERROR" "$(driver_display_name) CLI not found: $(driver_cli_binary)"
    exit 1
  fi

  # Initialize circuit breaker
  export HARNESS_STATE_DIR
  init_circuit_breaker

  # Initialize rate limiting
  init_call_tracking

  # Crash recovery: detect if resuming from a previous run
  if [[ -f "$STATUS_FILE" ]]; then
    local prev_status
    prev_status=$(jq -r '.status // ""' "$STATUS_FILE" 2>/dev/null || echo "")
    if [[ -n "$prev_status" && "$prev_status" != "completed" ]]; then
      log_status "INFO" "Resuming from last completed story"
    fi
  fi

  # Preserve retry state across restarts (Task 5.3)
  # .story_retries and .flagged_stories are file-based — they persist automatically

  log_status "SUCCESS" "Ralph loop starting"
  log_status "INFO" "Plugin: $PLUGIN_DIR"
  log_status "INFO" "Max iterations: $MAX_ITERATIONS | Timeout: $((LOOP_TIMEOUT_SECONDS / 3600))h"
  log_status "INFO" "Prompt: $PROMPT_FILE"
  log_status "INFO" "Sprint status: $SPRINT_STATUS_FILE"
  log_status "INFO" "Max story retries: $MAX_STORY_RETRIES"

  # Record loop start time for timeout
  loop_start_time=$(date +%s)

  local consecutive_failures=0
  local max_consecutive_failures=3
  local counts total completed elapsed cb_no_progress
  local current_task before_snapshot after_snapshot
  local exec_result retry_count skey

  while true; do
    # loop_count holds the number of iterations actually started. It is
    # incremented only right before an iteration executes, so a limit check
    # or a rate-limit wait never consumes an iteration or inflates reports.

    # ── Check loop limits ──

    if [[ $loop_count -ge $MAX_ITERATIONS ]]; then
      update_status "$loop_count" "$(_ralph_call_count)" "max_iterations" "stopped" "max_iterations_reached"

      counts=$(get_task_counts)
      total=${counts%% *}
      completed=${counts##* }
      log_status "INFO" "Max iterations ($MAX_ITERATIONS) reached. ${completed}/${total} stories complete."
      break
    fi

    # Check total timeout
    elapsed=$(( $(date +%s) - loop_start_time ))
    if [[ $elapsed -ge $LOOP_TIMEOUT_SECONDS ]]; then
      log_status "WARN" "Loop timeout reached (${LOOP_TIMEOUT_SECONDS}s)"
      update_status "$loop_count" "$(_ralph_call_count)" "timeout" "stopped" "loop_timeout"
      break
    fi

    # ── Check circuit breaker ──

    if should_halt_execution; then
      cb_no_progress=0
      if [[ -f "$CB_STATE_FILE" ]]; then
        cb_no_progress=$(jq -r '.consecutive_no_progress // 0' "$CB_STATE_FILE" 2>/dev/null || echo "0")
      fi
      log_status "WARN" "Circuit breaker: no progress in ${cb_no_progress} iterations"
      update_status "$loop_count" "$(_ralph_call_count)" "circuit_breaker" "halted" "stagnation_detected"
      break
    fi

    # ── Check rate limit ──

    if ! can_make_call; then
      wait_for_reset
      continue
    fi

    # ── Check task completion ──

    if all_tasks_complete; then
      counts=$(get_task_counts)
      total=${counts%% *}

      update_status "$loop_count" "$(_ralph_call_count)" "all_complete" "completed" "all_tasks_done"
      log_status "SUCCESS" "All stories complete. ${total} stories verified in ${loop_count} iterations."
      break
    fi

    # ── Get current task ──

    loop_count=$((loop_count + 1))
    current_task=$(get_current_task)

    log_status "LOOP" "=== Iteration #$loop_count ==="
    update_status "$loop_count" "$(_ralph_call_count)" "executing" "running"

    # ── Snapshot story statuses before iteration ──
    before_snapshot=$(snapshot_story_statuses)

    # ── Execute ──

    # Capture the status in the `||` branch: a bare call followed by `$?`
    # would let `set -e` terminate the script on any non-zero return, making
    # the failure branches below unreachable.
    exec_result=0
    execute_iteration "$loop_count" "$current_task" || exec_result=$?

    case $exec_result in
      0)
        consecutive_failures=0
        update_status "$loop_count" "$(_ralph_call_count)" "completed" "success"

        # ── Retry tracking: compare sprint-status before/after ──
        after_snapshot=$(snapshot_story_statuses)
        detect_story_changes "$before_snapshot" "$after_snapshot"

        # For each non-done, non-flagged story, increment retry count
        if [[ -n "$UNCHANGED_STORIES" ]]; then
          while IFS= read -r skey; do
            if [[ -z "$skey" ]]; then
              continue
            fi
            # Skip already-flagged stories
            if is_story_flagged "$skey"; then
              continue
            fi
            retry_count=$(increment_story_retry "$skey")
            if [[ $retry_count -gt $MAX_STORY_RETRIES ]]; then
              log_status "WARN" "Story ${skey} exceeded retry limit (${retry_count}) — flagging and moving on"
              flag_story "$skey"
            else
              log_status "WARN" "Story ${skey} — retry ${retry_count}/${MAX_STORY_RETRIES}"
            fi
          done <<< "$UNCHANGED_STORIES"
        fi

        if [[ -n "$CHANGED_STORIES" ]]; then
          while IFS= read -r skey; do
            if [[ -z "$skey" ]]; then
              continue
            fi
            log_status "SUCCESS" "Story ${skey}: DONE"
          done <<< "$CHANGED_STORIES"
        fi

        sleep 5  # Brief pause between iterations
        ;;
      2)
        # API limit — wait before the next attempt
        log_status "WARN" "API usage limit reached. Waiting 60 minutes..."
        update_status "$loop_count" "$(_ralph_call_count)" "api_limit" "paused"
        sleep "${RATE_LIMIT_SLEEP:-3600}"
        ;;
      3)
        # Circuit breaker
        log_status "ERROR" "Circuit breaker opened — halting loop"
        update_status "$loop_count" "$(_ralph_call_count)" "circuit_breaker" "halted"
        break
        ;;
      *)
        # Failure — retry after a pause, up to max_consecutive_failures
        consecutive_failures=$((consecutive_failures + 1))
        if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
          log_status "ERROR" "$max_consecutive_failures consecutive failures — halting"
          update_status "$loop_count" "$(_ralph_call_count)" "consecutive_failures" "halted"
          break
        fi

        update_status "$loop_count" "$(_ralph_call_count)" "failed" "error"
        log_status "WARN" "Iteration failed ($consecutive_failures/$max_consecutive_failures). Waiting 30s..."
        sleep 30
        ;;
    esac

    # Print progress summary after every iteration
    print_progress_summary

    log_status "LOOP" "=== End Iteration #$loop_count ==="
  done

  # Final summary — reads from sprint-status.yaml
  counts=$(get_task_counts)
  total=${counts%% *}
  completed=${counts##* }

  local elapsed_total=$(( $(date +%s) - loop_start_time ))
  local elapsed_min=$(( elapsed_total / 60 ))

  log_status "SUCCESS" "Ralph loop finished"
  log_status "INFO" " Iterations: $loop_count"
  log_status "INFO" " Stories completed: $completed/$total"
  log_status "INFO" " Elapsed: ${elapsed_min}m"
  log_status "INFO" " API calls: $(_ralph_call_count)"

  local final_status="stopped"
  if [[ $completed -eq $total && $total -gt 0 ]]; then
    final_status="completed"
    log_status "SUCCESS" "All stories complete. $total stories verified in $loop_count iterations."
  fi

  # Write final summary to status file
  update_status "$loop_count" "$(_ralph_call_count)" "final_summary" \
    "$final_status" \
    "completed:$completed/$total"

  # Mandatory retrospective — cannot be skipped
  log_status "INFO" "Triggering mandatory sprint retrospective..."
  if [[ -f "$SCRIPT_DIR/retro.sh" ]]; then
    "$SCRIPT_DIR/retro.sh" --project-dir "$project_root" 2>&1 || \
      log_status "WARN" "Retro report generation failed"
  fi
}
931
+
932
+ # ─── CLI Parsing ─────────────────────────────────────────────────────────────
933
+
934
# Parse command-line options and start the loop, but only when this file is
# executed directly (not when it is sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then

  while [[ $# -gt 0 ]]; do
    # Options that take a value must be followed by one. Without this check
    # `shift 2` fails on a trailing option and `set -e` ends the script with
    # no explanation.
    case $1 in
      --plugin-dir|--max-iterations|--max-story-retries|--timeout|--iteration-timeout|--calls|--prompt|--progress)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value" >&2
          show_help
          exit 1
        fi
        ;;
    esac

    case $1 in
      -h|--help)
        show_help
        exit 0
        ;;
      --plugin-dir)
        PLUGIN_DIR="$2"
        shift 2
        ;;
      --max-iterations)
        MAX_ITERATIONS="$2"
        shift 2
        ;;
      --max-story-retries)
        MAX_STORY_RETRIES="$2"
        shift 2
        ;;
      --timeout)
        LOOP_TIMEOUT_SECONDS="$2"
        shift 2
        ;;
      --iteration-timeout)
        ITERATION_TIMEOUT_MINUTES="$2"
        shift 2
        ;;
      --calls)
        MAX_CALLS_PER_HOUR="$2"
        shift 2
        ;;
      --prompt)
        PROMPT_FILE="$2"
        shift 2
        ;;
      --progress)
        PROGRESS_FILE="$2"
        shift 2
        ;;
      --live)
        LIVE_OUTPUT=true
        shift
        ;;
      --reset-circuit)
        # Derive state paths so circuit breaker uses the correct directory
        HARNESS_STATE_DIR="$(pwd)/.claude"
        export HARNESS_STATE_DIR
        init_circuit_breaker
        reset_circuit_breaker "Manual reset via CLI"
        echo "Circuit breaker reset to CLOSED"
        exit 0
        ;;
      --status)
        # Pretty-print through jq; fall back to the raw file if jq fails.
        _status_file="$(pwd)/ralph/status.json"
        if [[ -f "$_status_file" ]]; then
          jq . "$_status_file" 2>/dev/null || cat "$_status_file"
        else
          echo "No status file found."
        fi
        exit 0
        ;;
      *)
        echo "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done

  main

fi