codeharness 0.26.5 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/ralph/ralph.sh DELETED
@@ -1,1407 +0,0 @@
1
#!/usr/bin/env bash
# codeharness Ralph Loop — Vendored from snarktank/ralph
# Autonomous execution loop that spawns fresh Claude Code instances per iteration
# with verification gates, crash recovery, rate limiting, and circuit breaker protection.
#
# Usage: ralph/ralph.sh --plugin-dir ./codeharness [OPTIONS]

# NOTE: set -e intentionally NOT used — it causes silent crashes in the main
# loop when grep/jq/sed return non-zero. The loop handles errors via exit codes.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Load helper libraries. A missing library is reported immediately instead of
# surfacing later as "command not found" in the middle of the loop. Only the
# file's presence is checked; the exit status of `source` is deliberately not
# tested because it reflects the last command inside the library.
for _ralph_lib in date_utils timeout_utils circuit_breaker; do
    if [[ ! -f "$SCRIPT_DIR/lib/${_ralph_lib}.sh" ]]; then
        echo "ralph: required library not found: $SCRIPT_DIR/lib/${_ralph_lib}.sh" >&2
        exit 1
    fi
    # shellcheck source=/dev/null
    source "$SCRIPT_DIR/lib/${_ralph_lib}.sh"
done
unset _ralph_lib

# ─── Configuration ───────────────────────────────────────────────────────────

VERSION="0.1.0"

# Plugin directory (required — set via --plugin-dir)
PLUGIN_DIR=""

# Harness state directory (derived from project root)
HARNESS_STATE_DIR=""

# Progress file (legacy — kept for backwards compat, optional)
PROGRESS_FILE=""

# Sprint status file (primary task source — read by /harness-run skill)
SPRINT_STATUS_FILE=""

# Prompt file for each iteration
PROMPT_FILE=""

# Logging
LOG_DIR=""

# Loop limits (each may be overridden from the environment)
MAX_ITERATIONS=${MAX_ITERATIONS:-50}
MAX_STORY_RETRIES=${MAX_STORY_RETRIES:-10}
LOOP_TIMEOUT_SECONDS=${LOOP_TIMEOUT_SECONDS:-43200} # 12 hours default
ITERATION_TIMEOUT_MINUTES=${ITERATION_TIMEOUT_MINUTES:-30}

# Rate limiting
MAX_CALLS_PER_HOUR=${MAX_CALLS_PER_HOUR:-100}
RATE_LIMIT_SLEEP=3600 # 1 hour

# Driver
PLATFORM_DRIVER="${PLATFORM_DRIVER:-claude-code}"
CLAUDE_OUTPUT_FORMAT="${CLAUDE_OUTPUT_FORMAT:-stream-json}"
CLAUDE_ALLOWED_TOOLS="${CLAUDE_ALLOWED_TOOLS:-}"
CLAUDE_USE_CONTINUE="${CLAUDE_USE_CONTINUE:-false}" # Fresh context per iteration by default

# Reset retry state on start
RESET_RETRIES=false

# Live output
LIVE_OUTPUT=false

# Colors (backslash sequences are expanded at print time)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m'

# ─── Internal state ─────────────────────────────────────────────────────────

CALL_COUNT_FILE=""
TIMESTAMP_FILE=""
STATUS_FILE=""
LIVE_LOG_FILE=""
STORY_RETRY_FILE=""
FLAGGED_STORIES_FILE=""

# Global arrays for driver command building
declare -a CLAUDE_CMD_ARGS=()
declare -a LIVE_CMD_ARGS=()
declare -a VALID_TOOL_PATTERNS=()

loop_count=0
loop_start_time=""
84
-
85
- # ─── Logging ─────────────────────────────────────────────────────────────────
86
-
87
# Write one timestamped log line.
# Arguments: $1 - level (INFO, WARN, ERROR, SUCCESS, LOOP or DEBUG)
#            $2 - message text (printed verbatim)
# Globals:   LOG_DIR (read) - when non-empty the line is appended to
#            $LOG_DIR/ralph.log; BLUE/YELLOW/RED/GREEN/PURPLE/NC (read)
# Outputs:   coloured line on stderr for every level except DEBUG
log_status() {
    local level=$1
    local message=$2
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    local color=""

    case "$level" in
        "INFO") color=$BLUE ;;
        "WARN") color=$YELLOW ;;
        "ERROR") color=$RED ;;
        "SUCCESS") color=$GREEN ;;
        "LOOP") color=$PURPLE ;;
    esac

    local line="[$timestamp] [$level] $message"

    # DEBUG level: log file only, no terminal output
    if [[ "$level" != "DEBUG" ]]; then
        # %b expands the escape sequences stored in the colour variables only;
        # the message goes through %s so backslashes in it stay literal.
        printf '%b%s%b\n' "$color" "$line" "$NC" >&2
    fi

    if [[ -n "$LOG_DIR" ]]; then
        printf '%s\n' "$line" >> "$LOG_DIR/ralph.log"
    fi
}
114
-
115
- # ─── Rate Limiting ───────────────────────────────────────────────────────────
116
-
117
# Start a fresh call counter when the clock hour has changed since the last
# reset, or when the counter file itself is missing.
# Globals: TIMESTAMP_FILE (read/written) - holds the hour stamp of the last reset
#          CALL_COUNT_FILE (written)     - reset to 0
init_call_tracking() {
    local current_hour
    current_hour=$(date +%Y%m%d%H)
    local last_reset_hour=""

    if [[ -f "$TIMESTAMP_FILE" ]]; then
        last_reset_hour=$(cat "$TIMESTAMP_FILE")
    fi

    # A current hour stamp without a counter file would otherwise leave the
    # counter uninitialised.
    if [[ "$current_hour" != "$last_reset_hour" || ! -f "$CALL_COUNT_FILE" ]]; then
        echo "0" > "$CALL_COUNT_FILE"
        echo "$current_hour" > "$TIMESTAMP_FILE"
    fi
}
130
-
131
# Report whether another call fits into this hour's budget.
# Globals: CALL_COUNT_FILE (read), MAX_CALLS_PER_HOUR (read)
# Returns: 0 when the recorded count is below the limit, 1 otherwise
can_make_call() {
    local calls_made=0
    if [[ -f "$CALL_COUNT_FILE" ]]; then
        calls_made=$(cat "$CALL_COUNT_FILE")
    fi

    # Anything that is not a plain non-negative integer counts as zero. The
    # 10# prefix forces base 10 so a zero-padded value is not read as octal.
    if [[ "$calls_made" =~ ^[0-9]+$ ]]; then
        calls_made=$((10#$calls_made))
    else
        calls_made=0
    fi

    (( calls_made < MAX_CALLS_PER_HOUR ))
}
143
-
144
# Sleep until the top of the next clock hour, then reset the call counter.
# Globals: CALL_COUNT_FILE (read/written), TIMESTAMP_FILE (written),
#          MAX_CALLS_PER_HOUR (read)
wait_for_reset() {
    local calls_made
    calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
    log_status "WARN" "Rate limit reached ($calls_made/$MAX_CALLS_PER_HOUR). Waiting for reset..."

    # Minute and second come from a single date call so they describe the same
    # instant.
    local current_minute current_second
    read -r current_minute current_second <<< "$(date '+%M %S')"

    # date zero-pads its output; without the 10# prefix "08" and "09" are
    # rejected by $(( )) as invalid octal numbers.
    local wait_time=$(( (60 - 10#${current_minute:-0} - 1) * 60 + (60 - 10#${current_second:-0}) ))

    log_status "INFO" "Sleeping for $wait_time seconds until next hour..."
    sleep "$wait_time"

    echo "0" > "$CALL_COUNT_FILE"
    date +%Y%m%d%H > "$TIMESTAMP_FILE"
    log_status "SUCCESS" "Rate limit reset."
}
159
-
160
- # ─── Progress Tracking ───────────────────────────────────────────────────────
161
-
162
# Write the loop's current state to STATUS_FILE as a JSON document.
# Arguments: $1 - loop count, $2 - calls made this hour, $3 - last action,
#            $4 - status, $5 - exit reason (optional)
# Globals:   STATUS_FILE (written), SPRINT_STATUS_FILE, FLAGGED_STORIES_FILE,
#            MAX_CALLS_PER_HOUR, MAX_ITERATIONS, VERSION, loop_start_time (read)
# Returns:   0 when STATUS_FILE is unset or was written; non-zero when the
#            document could not be produced (the previous file is then kept)
update_status() {
    local loop_count=$1
    local calls_made=$2
    local last_action=$3
    local status=$4
    local exit_reason=${5:-""}

    if [[ -z "$STATUS_FILE" ]]; then
        return 0
    fi

    # codeharness: Include sprint-status story counts in status JSON
    local stories_total=0
    local stories_completed=0
    if [[ -n "$SPRINT_STATUS_FILE" && -f "$SPRINT_STATUS_FILE" ]]; then
        local sprint_counts
        sprint_counts=$(get_task_counts)
        stories_total=${sprint_counts%% *}
        stories_completed=${sprint_counts##* }
    fi

    # --argjson rejects anything that is not valid JSON, including an empty
    # string and zero-padded numbers, so normalise every counter first.
    if [[ "$loop_count" =~ ^[0-9]+$ ]]; then loop_count=$((10#$loop_count)); else loop_count=0; fi
    if [[ "$calls_made" =~ ^[0-9]+$ ]]; then calls_made=$((10#$calls_made)); else calls_made=0; fi
    if [[ "$stories_total" =~ ^[0-9]+$ ]]; then stories_total=$((10#$stories_total)); else stories_total=0; fi
    if [[ "$stories_completed" =~ ^[0-9]+$ ]]; then stories_completed=$((10#$stories_completed)); else stories_completed=0; fi

    local stories_remaining=$((stories_total - stories_completed))
    local elapsed_seconds=0
    if [[ "$loop_start_time" =~ ^[0-9]+$ ]]; then
        elapsed_seconds=$(( $(date +%s) - loop_start_time ))
    fi

    # Build flagged stories JSON array (one element per non-empty line)
    local flagged_json="[]"
    if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
        flagged_json=$(jq -R -s 'split("\n") | map(select(length > 0))' < "$FLAGGED_STORIES_FILE") || flagged_json="[]"
        [[ -n "$flagged_json" ]] || flagged_json="[]"
    fi

    # Get current story key for status tracking
    local current_story
    current_story=$(get_current_task)

    # Render into a temporary file and rename it into place, so a jq failure
    # never leaves an empty or half-written status file behind.
    local tmp_file="${STATUS_FILE}.tmp.$$"
    if jq -n \
        --arg timestamp "$(get_iso_timestamp)" \
        --argjson loop_count "$loop_count" \
        --argjson calls_made "$calls_made" \
        --argjson max_calls "$MAX_CALLS_PER_HOUR" \
        --argjson max_iterations "$MAX_ITERATIONS" \
        --arg last_action "$last_action" \
        --arg status "$status" \
        --arg exit_reason "$exit_reason" \
        --arg version "$VERSION" \
        --arg story "${current_story:-}" \
        --argjson stories_total "$stories_total" \
        --argjson stories_completed "$stories_completed" \
        --argjson stories_remaining "$stories_remaining" \
        --argjson elapsed_seconds "$elapsed_seconds" \
        --argjson flagged_stories "$flagged_json" \
        '{
            timestamp: $timestamp,
            version: $version,
            loop_count: $loop_count,
            calls_made_this_hour: $calls_made,
            max_calls_per_hour: $max_calls,
            max_iterations: $max_iterations,
            last_action: $last_action,
            status: $status,
            story: $story,
            exit_reason: $exit_reason,
            stories_total: $stories_total,
            stories_completed: $stories_completed,
            stories_remaining: $stories_remaining,
            elapsed_seconds: $elapsed_seconds,
            flagged_stories: $flagged_stories
        }' > "$tmp_file"; then
        mv -f "$tmp_file" "$STATUS_FILE"
    else
        rm -f "$tmp_file"
        return 1
    fi
}
233
-
234
# codeharness: Task picking is handled by /harness-run skill inside each Claude session.
# Ralph only needs a story key for timeout reports and status tracking, and
# reads it from sprint-state.json in the current working directory.
# Outputs: the first story whose status is "in-progress", otherwise the first
#          "ready-for-dev" story, otherwise an empty line
# Returns: always 0
get_current_task() {
    local state_file="sprint-state.json"
    if [[ ! -f "$state_file" ]]; then
        echo ""
        return 0
    fi

    # One jq run covers both lookups: in-progress candidates are listed ahead of
    # ready-for-dev ones and the first element wins. The `?` skips story entries
    # that are not objects instead of aborting the whole query.
    local story_key
    story_key=$(jq -r '
        [(.stories // {}) | to_entries[]] as $entries
        | [ $entries[] | select(.value.status? == "in-progress") | .key ]
          + [ $entries[] | select(.value.status? == "ready-for-dev") | .key ]
        | .[0] // ""
    ' "$state_file" 2>/dev/null)

    echo "${story_key:-}"
    return 0
}
269
-
270
# codeharness: Check if all stories in sprint-status.yaml are finished.
# Story entries are "key: value" lines whose key matches N-N-slug.
# Globals: SPRINT_STATUS_FILE (read), FLAGGED_STORIES_FILE (read)
# Returns: 0 when at least one story exists and every story is either "done"
#          or listed in the flagged-stories file; 1 otherwise
check_sprint_complete() {
    if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
        return 1
    fi

    local total=0
    local done_count=0
    local flagged_count=0

    # Load flagged stories into a newline-separated string for lookup
    local flagged_list=""
    if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
        flagged_list=$(sed 's/^[[:space:]]*//;s/[[:space:]]*$//' "$FLAGGED_STORIES_FILE" | grep -v '^$')
    fi

    local key value
    # The `|| [[ -n … ]]` keeps a final line that has no trailing newline;
    # without it the last story would silently be left out of the totals.
    while IFS=: read -r key value || [[ -n "$key$value" ]]; do
        # Trim surrounding whitespace without forking sed for every line
        key="${key#"${key%%[![:space:]]*}"}"
        key="${key%"${key##*[![:space:]]}"}"
        value="${value#"${value%%[![:space:]]*}"}"
        value="${value%"${value##*[![:space:]]}"}"

        # Skip comments and empty lines
        [[ -z "$key" || "$key" == \#* ]] && continue

        # Match story keys: N-N-slug (e.g. 5-1-ralph-loop-integration)
        [[ "$key" =~ ^[0-9]+-[0-9]+- ]] || continue

        total=$((total + 1))
        if [[ "$value" == "done" ]]; then
            done_count=$((done_count + 1))
        elif [[ -n "$flagged_list" ]] && grep -qxF -- "$key" <<< "$flagged_list"; then
            # Retry-exhausted/flagged stories count as "effectively done"
            # — no autonomous work can be done on them
            flagged_count=$((flagged_count + 1))
        fi
    done < "$SPRINT_STATUS_FILE"

    if (( total == 0 )); then
        return 1
    fi

    # Sprint is complete if all stories are either done or flagged
    (( done_count + flagged_count == total ))
}
316
-
317
# codeharness: Alias kept under the all_tasks_complete() name; the answer
# comes from the sprint-status.yaml check.
# Returns: the exit status of check_sprint_complete
all_tasks_complete() {
    check_sprint_complete
    return $?
}
321
-
322
# codeharness: Get story counts from sprint-status.yaml.
# A story is a "key: value" line whose key matches N-N-slug; it counts as
# completed when its value is "done" or "failed".
# Globals: SPRINT_STATUS_FILE (read)
# Outputs: "total completed" (space-separated); "0 0" when the file is missing
get_task_counts() {
    if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
        echo "0 0"
        return 0
    fi

    local total=0
    local completed=0
    local key value

    # The `|| [[ -n … ]]` keeps a final line that has no trailing newline.
    while IFS=: read -r key value || [[ -n "$key$value" ]]; do
        # Trim surrounding whitespace without forking sed for every line
        key="${key#"${key%%[![:space:]]*}"}"
        key="${key%"${key##*[![:space:]]}"}"
        value="${value#"${value%%[![:space:]]*}"}"
        value="${value%"${value##*[![:space:]]}"}"

        [[ -z "$key" || "$key" == \#* ]] && continue

        if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
            total=$((total + 1))
            if [[ "$value" == "done" || "$value" == "failed" ]]; then
                completed=$((completed + 1))
            fi
        fi
    done < "$SPRINT_STATUS_FILE"

    echo "$total $completed"
}
349
-
350
- # ─── Retry Tracking ─────────────────────────────────────────────────────────
351
-
352
# Increment the retry count for a story and print the new count.
# The retry file holds one "story_key count" pair per line.
# Arguments: $1 - story key
# Globals:   STORY_RETRY_FILE (read/written)
# Outputs:   the new count, or "0" when retry tracking is not configured
increment_story_retry() {
    local story_key=$1

    if [[ -z "$STORY_RETRY_FILE" ]]; then
        echo "0"
        return 0
    fi

    # Read the file once, remembering every entry so it can be written back.
    local -a keys=() vals=()
    local key val
    local count=0
    if [[ -f "$STORY_RETRY_FILE" ]]; then
        while IFS=' ' read -r key val || [[ -n "$key" ]]; do
            [[ -n "$key" ]] || continue
            keys+=("$key")
            vals+=("$val")
            # Only plain digits are accepted; 10# keeps a zero-padded value
            # from being parsed as octal. Anything else leaves the count at 0.
            if [[ "$key" == "$story_key" && "$val" =~ ^[0-9]+$ ]]; then
                count=$((10#$val))
            fi
        done < "$STORY_RETRY_FILE"
    fi

    count=$((count + 1))

    # Rewrite the file with the updated count (temp file + mv, so readers never
    # see a partial file). The redirection truncates any stale temp file.
    local temp_file="${STORY_RETRY_FILE}.tmp"
    local found=false
    local i
    {
        for i in "${!keys[@]}"; do
            if [[ "${keys[i]}" == "$story_key" ]]; then
                printf '%s %s\n' "$story_key" "$count"
                found=true
            else
                printf '%s %s\n' "${keys[i]}" "${vals[i]}"
            fi
        done
        if [[ "$found" == "false" ]]; then
            printf '%s %s\n' "$story_key" "$count"
        fi
    } > "$temp_file" && mv -f "$temp_file" "$STORY_RETRY_FILE"

    echo "$count"
}
400
-
401
# Print the current retry count for a story (0 if not tracked).
# Arguments: $1 - story key
# Globals:   STORY_RETRY_FILE (read) - "story_key count" pairs, one per line
get_story_retry_count() {
    local story_key=$1

    if [[ -z "$STORY_RETRY_FILE" || ! -f "$STORY_RETRY_FILE" ]]; then
        echo "0"
        return 0
    fi

    local key val
    while IFS=' ' read -r key val || [[ -n "$key" ]]; do
        if [[ "$key" == "$story_key" ]]; then
            # The stored value is validated before being used as a number;
            # 10# keeps a zero-padded value from being parsed as octal.
            if [[ "$val" =~ ^[0-9]+$ ]]; then
                echo "$((10#$val))"
            else
                echo "0"
            fi
            return 0
        fi
    done < "$STORY_RETRY_FILE"

    echo "0"
}
419
-
420
# Check if a story is listed in the flagged-stories file.
# Arguments: $1 - story key
# Globals:   FLAGGED_STORIES_FILE (read)
# Returns:   0 when a line equals the key exactly, non-zero otherwise
is_story_flagged() {
    local story_key=$1

    if [[ -z "$FLAGGED_STORIES_FILE" || ! -f "$FLAGGED_STORIES_FILE" ]]; then
        return 1
    fi

    # -F compares the key as a fixed string, so characters such as "." are not
    # treated as regex operators; -- protects keys that begin with a dash.
    grep -qxF -- "$story_key" "$FLAGGED_STORIES_FILE" 2>/dev/null
}
430
-
431
# Flag a story that exceeded its retry limit.
# Appends the key to the flagged-stories file (once) and sets the story's
# entry in sprint-status.yaml to "failed".
# Arguments: $1 - story key
# Globals:   FLAGGED_STORIES_FILE (read/written), SPRINT_STATUS_FILE (rewritten)
# Returns:   always 0; the YAML update is best-effort
flag_story() {
    local story_key=$1

    if [[ -z "$FLAGGED_STORIES_FILE" ]]; then
        return 0
    fi

    if ! is_story_flagged "$story_key"; then
        echo "$story_key" >> "$FLAGGED_STORIES_FILE"
    fi

    # Also update sprint-status.yaml to 'failed' so reconciliation picks it up
    # and sprint-state.json stays consistent (prevents flagged stories stuck at 'review')
    local sprint_yaml="${SPRINT_STATUS_FILE:-}"
    if [[ -z "$sprint_yaml" || ! -f "$sprint_yaml" ]]; then
        return 0
    fi

    # The key is compared as a literal string (never interpolated into a regex)
    # and whatever indentation the line already has is kept.
    local tmp_yaml="${sprint_yaml}.tmp.$$"
    local line indent rest
    {
        while IFS= read -r line || [[ -n "$line" ]]; do
            indent="${line%%[![:space:]]*}"
            rest="${line#"$indent"}"
            if [[ "$rest" == "${story_key}: "* ]]; then
                printf '%s%s: failed\n' "$indent" "$story_key"
            else
                printf '%s\n' "$line"
            fi
        done < "$sprint_yaml"
    } > "$tmp_yaml" 2>/dev/null && mv -f "$tmp_yaml" "$sprint_yaml" 2>/dev/null
    rm -f "$tmp_yaml" 2>/dev/null

    return 0
}
451
-
452
# Print the flagged story keys, one per line. When the flagged-stories file is
# not configured or does not exist, a single empty line is printed instead.
# Globals: FLAGGED_STORIES_FILE (read)
get_flagged_stories() {
    if [[ -n "$FLAGGED_STORIES_FILE" && -f "$FLAGGED_STORIES_FILE" ]]; then
        cat "$FLAGGED_STORIES_FILE"
    else
        echo ""
    fi
}
460
-
461
# Snapshot sprint-status.yaml story statuses as "key:status" lines.
# Only entries whose key matches N-N-slug are emitted.
# Globals: SPRINT_STATUS_FILE (read)
# Outputs: one "key:status" line per story; a single empty line when the file
#          does not exist
snapshot_story_statuses() {
    if [[ ! -f "$SPRINT_STATUS_FILE" ]]; then
        echo ""
        return 0
    fi

    local key value
    # The `|| [[ -n … ]]` keeps a final line that has no trailing newline.
    while IFS=: read -r key value || [[ -n "$key$value" ]]; do
        # Trim surrounding whitespace without forking sed for every line
        key="${key#"${key%%[![:space:]]*}"}"
        key="${key%"${key##*[![:space:]]}"}"
        value="${value#"${value%%[![:space:]]*}"}"
        value="${value%"${value##*[![:space:]]}"}"

        [[ -z "$key" || "$key" == \#* ]] && continue
        if [[ "$key" =~ ^[0-9]+-[0-9]+- ]]; then
            echo "$key:$value"
        fi
    done < "$SPRINT_STATUS_FILE"
}
477
-
478
# Compare before/after snapshots ("key:status" lines) to detect story changes.
# Arguments: $1 - snapshot taken before the iteration
#            $2 - snapshot taken after the iteration
# Globals:   CHANGED_STORIES (written)   - keys that are "done" now and were
#                                          not "done" before, one per line
#            UNCHANGED_STORIES (written) - keys that are not "done" now
detect_story_changes() {
    local before_snapshot=$1
    local after_snapshot=$2

    # Loop variables are local so that same-named variables in a caller
    # (for example a `status` local) are not overwritten.
    local key status bkey bstatus before_status

    CHANGED_STORIES=""
    UNCHANGED_STORIES=""

    # Parse after snapshot
    while IFS=: read -r key status; do
        [[ -z "$key" ]] && continue

        # Find the same key in before snapshot (first match wins)
        before_status=""
        while IFS=: read -r bkey bstatus; do
            if [[ "$bkey" == "$key" ]]; then
                before_status="$bstatus"
                break
            fi
        done <<< "$before_snapshot"

        if [[ "$status" != "done" ]]; then
            UNCHANGED_STORIES+="${key}"$'\n'
        elif [[ "$before_status" != "done" ]]; then
            CHANGED_STORIES+="${key}"$'\n'
        fi
    done <<< "$after_snapshot"
}
508
-
509
- # ─── Sprint State Progress Polling ─────────────────────────────────────────
510
-
511
# Previous state tracking for change detection
PREV_STORY=""
PREV_PHASE=""
PREV_AC_PROGRESS=""
PREV_LAST_ACTION=""

# Poll sprint-state.json (current directory) for progress changes during
# background execution and log a line when a tracked field changed.
# Globals: PREV_STORY, PREV_PHASE, PREV_AC_PROGRESS, PREV_LAST_ACTION
#          (read/written)
# Returns: always 0; a missing or unreadable state file is not an error
poll_sprint_state_progress() {
    local state_file="sprint-state.json"
    [[ -f "$state_file" ]] || return 0

    # Single jq call to extract all fields (avoids 4 process spawns per poll cycle).
    # Fields are joined with the ASCII unit separator rather than a tab: tab is
    # IFS whitespace, so `read` would merge consecutive tabs and an empty field
    # would shift every later field one position to the left.
    local raw
    raw=$(jq -r '[.run.currentStory // "", .run.currentPhase // "", .run.lastAction // "", .run.acProgress // ""] | join("\u001f")' "$state_file" 2>/dev/null) || return 0
    [[ -n "$raw" ]] || return 0

    local cur_story cur_phase cur_action cur_ac
    IFS=$'\x1f' read -r cur_story cur_phase cur_action cur_ac <<< "$raw"

    # Nothing to report if no story is active
    [[ -z "$cur_story" ]] && return 0

    # Detect changes and print structured updates
    if [[ "$cur_story" != "$PREV_STORY" || "$cur_phase" != "$PREV_PHASE" ]]; then
        if [[ -n "$cur_action" && "$cur_action" != "null" ]]; then
            log_status "INFO" "Story ${cur_story}: ${cur_phase} (${cur_action})"
        else
            log_status "INFO" "Story ${cur_story}: ${cur_phase}"
        fi
    elif [[ "$cur_ac" != "$PREV_AC_PROGRESS" && -n "$cur_ac" && "$cur_ac" != "null" ]]; then
        log_status "INFO" "Story ${cur_story}: verify (AC ${cur_ac})"
    elif [[ "$cur_action" != "$PREV_LAST_ACTION" && -n "$cur_action" && "$cur_action" != "null" ]]; then
        log_status "INFO" "Story ${cur_story}: ${cur_phase} (${cur_action})"
    fi

    PREV_STORY="$cur_story"
    PREV_PHASE="$cur_phase"
    PREV_AC_PROGRESS="$cur_ac"
    PREV_LAST_ACTION="$cur_action"
}
552
-
553
# Forget everything the progress poller remembered, so the next iteration
# reports its first observation as a change.
# Globals: PREV_STORY, PREV_PHASE, PREV_AC_PROGRESS, PREV_LAST_ACTION (written)
reset_poll_state() {
    PREV_LAST_ACTION=""
    PREV_AC_PROGRESS=""
    PREV_PHASE=""
    PREV_STORY=""
}
560
-
561
- # ─── Progress Summary ───────────────────────────────────────────────────────
562
-
563
# Log a progress report: overall counts, completed stories, failed stories
# (from sprint-state.json), flagged stories, and the next story in line.
# Globals: SPRINT_STATUS_FILE, FLAGGED_STORIES_FILE, loop_count,
#          loop_start_time (read)
print_progress_summary() {
    local counts
    counts=$(get_task_counts)
    local total=${counts%% *}
    local completed=${counts##* }
    local remaining=$((total - completed))

    # Without a recorded start time the elapsed time is reported as 0s
    # instead of raising an arithmetic error.
    local elapsed=0
    if [[ -n "$loop_start_time" ]]; then
        elapsed=$(( $(date +%s) - loop_start_time ))
    fi

    local elapsed_fmt
    if (( elapsed >= 3600 )); then
        elapsed_fmt="$((elapsed / 3600))h$((elapsed % 3600 / 60))m"
    elif (( elapsed >= 60 )); then
        elapsed_fmt="$((elapsed / 60))m$((elapsed % 60))s"
    else
        elapsed_fmt="${elapsed}s"
    fi

    # Read cost and failed stories from sprint-state.json (single jq call):
    # first output line is the cost, the remaining lines are failed story keys.
    local cost=""
    local cost_fmt=""
    local failed_stories=""
    if [[ -f "sprint-state.json" ]]; then
        local state_data
        state_data=$(jq -r '(.run.cost // 0 | tostring) + "\n" + ((.run.failed // []) | join("\n"))' "sprint-state.json" 2>/dev/null) || state_data=""
        if [[ -n "$state_data" ]]; then
            cost=$(head -1 <<< "$state_data")
            failed_stories=$(tail -n +2 <<< "$state_data")
            if [[ -n "$cost" && "$cost" != "0" && "$cost" != "null" ]]; then
                cost_fmt=", cost: \$${cost}"
            fi
        fi
    fi

    log_status "INFO" "Progress: ${completed}/${total} done, ${remaining} remaining (iterations: ${loop_count}, elapsed: ${elapsed_fmt}${cost_fmt})"

    # One pass over the sprint file collects both the completed stories and the
    # next story in line (first non-done, non-flagged). The `|| [[ -n … ]]`
    # keeps a final line that has no trailing newline.
    local -a done_stories=()
    local next_story=""
    if [[ -f "$SPRINT_STATUS_FILE" ]]; then
        local key value
        while IFS=: read -r key value || [[ -n "$key$value" ]]; do
            key="${key#"${key%%[![:space:]]*}"}"
            key="${key%"${key##*[![:space:]]}"}"
            value="${value#"${value%%[![:space:]]*}"}"
            value="${value%"${value##*[![:space:]]}"}"

            [[ -z "$key" || "$key" == \#* ]] && continue
            [[ "$key" =~ ^[0-9]+-[0-9]+- ]] || continue

            if [[ "$value" == "done" ]]; then
                done_stories+=("$key")
            elif [[ -z "$next_story" ]] && ! is_story_flagged "$key"; then
                next_story="$key ($value)"
            fi
        done < "$SPRINT_STATUS_FILE"
    fi

    # Show completed stories with ✓
    local story
    for story in "${done_stories[@]}"; do
        log_status "SUCCESS" " ✓ ${story}"
    done

    # Show failed stories with ✗ from sprint-state.json
    if [[ -n "$failed_stories" ]]; then
        local fkey
        while IFS= read -r fkey; do
            [[ -z "$fkey" ]] && continue
            log_status "ERROR" " ✗ ${fkey}"
        done <<< "$failed_stories"
    fi

    # Show flagged/blocked stories with ✕
    if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
        local bkey
        while IFS= read -r bkey || [[ -n "$bkey" ]]; do
            [[ -z "$bkey" ]] && continue
            log_status "WARN" " ✕ ${bkey} (blocked)"
        done < "$FLAGGED_STORIES_FILE"
    fi

    if [[ -n "$next_story" ]]; then
        log_status "INFO" "Next up: ${next_story}"
    fi
}
645
-
646
- # ─── Iteration Insights ──────────────────────────────────────────────────────
647
-
648
# Print highlights from the session artifacts under
# _bmad-output/implementation-artifacts in the current directory: the last
# "### " block of .session-issues.md and, when present, today's retro file.
print_iteration_insights() {
    local project_root
    project_root="$(pwd)"
    local issues_file="$project_root/_bmad-output/implementation-artifacts/.session-issues.md"
    local today
    today=$(date +%Y-%m-%d)
    local retro_file="$project_root/_bmad-output/implementation-artifacts/session-retro-${today}.md"

    # Show session issues (the last "### " block, capped at 15 lines)
    if [[ -f "$issues_file" ]]; then
        local issue_count
        # grep -c prints "0" AND exits non-zero when nothing matches, so the
        # fallback must replace the value rather than append a second "0".
        issue_count=$(grep -c '^### ' "$issues_file" 2>/dev/null) || issue_count=0
        [[ "$issue_count" =~ ^[0-9]+$ ]] || issue_count=0
        if (( issue_count > 0 )); then
            echo ""
            log_status "INFO" "━━━ Session Issues ($issue_count entries) ━━━"
            # Print the last subagent's issues block
            awk '/^### /{block=""} {block=block $0 "\n"} END{printf "%s", block}' "$issues_file" | head -15
            echo ""
        fi
    fi

    # Show retro summary if generated
    if [[ -f "$retro_file" ]]; then
        log_status "INFO" "━━━ Session Retro ━━━"
        # Print action items section if present, otherwise first 10 lines.
        # The check is anchored like the sed range below, so a heading sed
        # cannot find falls through to the head fallback.
        if grep -qE '^## Action [Ii]tems' "$retro_file" 2>/dev/null; then
            sed -n '/^## Action [Ii]tems/,/^## /p' "$retro_file" | head -20
        else
            head -10 "$retro_file"
        fi
        echo ""
    fi
}
681
-
682
- # ─── Driver Management ──────────────────────────────────────────────────────
683
-
684
# Source the driver script for PLATFORM_DRIVER from $SCRIPT_DIR/drivers and
# let it register its tool patterns. Exits the script with status 1 when the
# driver file does not exist.
# Globals: SCRIPT_DIR, PLATFORM_DRIVER, VALID_TOOL_PATTERNS (read),
#          CLAUDE_ALLOWED_TOOLS (written when empty)
load_platform_driver() {
    local driver_file="$SCRIPT_DIR/drivers/${PLATFORM_DRIVER}.sh"

    if [[ -f "$driver_file" ]]; then
        # shellcheck source=/dev/null
        source "$driver_file"
    else
        log_status "ERROR" "Platform driver not found: $driver_file"
        exit 1
    fi

    driver_valid_tools

    # When no allow-list was supplied, use the driver's valid tool patterns
    # (comma-joined) so Ralph runs autonomously without permission prompts.
    if (( ${#VALID_TOOL_PATTERNS[@]} > 0 )) && [[ -z "$CLAUDE_ALLOWED_TOOLS" ]]; then
        CLAUDE_ALLOWED_TOOLS=$(IFS=','; echo "${VALID_TOOL_PATTERNS[*]}")
    fi

    local display_name cli_binary
    display_name=$(driver_display_name)
    cli_binary=$(driver_cli_binary)
    log_status "DEBUG" "Platform driver: ${display_name} (${cli_binary})"
}
704
-
705
- # ─── Execution ───────────────────────────────────────────────────────────────
706
-
707
# Run one iteration: build the driver command, execute it under a timeout,
# then classify the outcome.
# Arguments: $1 - iteration number
#            $2 - story/task key (may be empty)
# Globals:   LOG_DIR, CALL_COUNT_FILE, LIVE_LOG_FILE, LIVE_OUTPUT, PROMPT_FILE,
#            PLUGIN_DIR, ITERATION_TIMEOUT_MINUTES, CLAUDE_CMD_ARGS (read);
#            writes ralph/.iteration_deadline and ralph/.state-snapshot.json
#            relative to the current directory
# Returns:   0 - command succeeded and the circuit breaker stayed closed
#            1 - command could not be built, timed out, or failed
#            2 - output indicates the API usage limit was reached
#            3 - the circuit breaker opened
#            4 - output indicates a transient API error
execute_iteration() {
    local iteration=$1
    local task_id=$2
    local timestamp
    timestamp=$(date '+%Y-%m-%d_%H-%M-%S')
    local output_file="$LOG_DIR/claude_output_${timestamp}.log"
    local calls_made
    calls_made=$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")
    calls_made=$((calls_made + 1))

    # Capture git HEAD SHA at iteration start for progress detection
    local loop_start_sha=""
    if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null; then
        loop_start_sha=$(git rev-parse HEAD 2>/dev/null || echo "")
    fi

    # Snapshot sprint-state.json before iteration (for timeout delta capture)
    local state_snapshot_path="ralph/.state-snapshot.json"
    if [[ -f "sprint-state.json" ]]; then
        cp "sprint-state.json" "$state_snapshot_path" 2>/dev/null || true
    fi

    log_status "LOOP" "Iteration $iteration — Task: ${task_id:-'(reading from prompt)'}"
    local timeout_seconds=$((ITERATION_TIMEOUT_MINUTES * 60))

    # Build loop context — pass time budget so the session can prioritize retro
    local start_time
    start_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    local loop_context="Loop #${iteration}. Time budget: ${ITERATION_TIMEOUT_MINUTES} minutes (started: ${start_time}). Reserve the last 5 minutes for Step 8 (session retrospective) — do not start new story work if less than 10 minutes remain."
    if [[ -n "$task_id" ]]; then
        loop_context+=" Current task: $task_id."
    fi

    # Build the command via driver
    local session_id="" # Fresh context per iteration
    if ! driver_build_command "$PROMPT_FILE" "$loop_context" "$session_id" "$PLUGIN_DIR"; then
        log_status "ERROR" "Failed to build CLI command"
        return 1
    fi

    # Write deadline file for time-warning hook
    local deadline=$(( $(date +%s) + timeout_seconds ))
    echo "$deadline" > "ralph/.iteration_deadline"

    # DEBUG: log command (truncate prompt content to avoid dumping entire prompt to terminal)
    local cmd_summary="${CLAUDE_CMD_ARGS[*]}"
    if [[ ${#cmd_summary} -gt 200 ]]; then
        cmd_summary="${cmd_summary:0:200}... (truncated)"
    fi
    log_status "DEBUG" "Command: $cmd_summary"
    log_status "DEBUG" "Output file: $output_file"
    log_status "DEBUG" "LIVE_OUTPUT=$LIVE_OUTPUT, timeout=${timeout_seconds}s"

    log_status "INFO" "Starting $(driver_display_name) (timeout: ${ITERATION_TIMEOUT_MINUTES}m)..."

    # Execute with timeout
    local exit_code=0

    if [[ "$LIVE_OUTPUT" == "true" ]]; then
        # Live streaming mode
        echo -e "\n=== Iteration #$iteration — $(date '+%Y-%m-%d %H:%M:%S') ===" > "$LIVE_LOG_FILE"
        echo -e "${PURPLE}━━━━━━━━━━━━━ $(driver_display_name) Output ━━━━━━━━━━━━━${NC}"

        set -o pipefail
        # tee -a: the live log already holds the iteration header written
        # above; a plain tee would truncate the file and discard it.
        portable_timeout "${timeout_seconds}s" "${CLAUDE_CMD_ARGS[@]}" \
            < /dev/null 2>&1 | tee "$output_file" | tee -a "$LIVE_LOG_FILE"
        exit_code=${PIPESTATUS[0]}
        set +o pipefail

        echo -e "${PURPLE}━━━━━━━━━━━━━ End of Output ━━━━━━━━━━━━━━━━━━━${NC}"
    else
        # Background mode with progress monitoring
        portable_timeout "${timeout_seconds}s" "${CLAUDE_CMD_ARGS[@]}" \
            < /dev/null > "$output_file" 2>&1 &

        local claude_pid=$!

        log_status "DEBUG" "Background PID: $claude_pid"

        reset_poll_state
        # Every 10 seconds: mirror the output so far into the live log and
        # report sprint-state changes, until the process is gone.
        while kill -0 "$claude_pid" 2>/dev/null; do
            if [[ -f "$output_file" && -s "$output_file" ]]; then
                cp "$output_file" "$LIVE_LOG_FILE" 2>/dev/null
            fi
            poll_sprint_state_progress
            sleep 10
        done

        wait "$claude_pid"
        exit_code=$?
        log_status "DEBUG" "Claude exited with code: $exit_code, output size: $(wc -c < "$output_file" 2>/dev/null || echo 0) bytes"

        # If output is empty and exit code is non-zero, log diagnostic info
        if [[ ! -s "$output_file" && $exit_code -ne 0 ]]; then
            log_status "ERROR" "Claude produced no output and exited with code $exit_code"
            log_status "DEBUG" "Checking if claude binary is responsive..."
            if claude --version > /dev/null 2>&1; then
                log_status "DEBUG" "claude binary OK: $(claude --version 2>&1)"
            else
                log_status "ERROR" "claude binary not responding"
            fi
        fi
    fi

    if [[ $exit_code -eq 0 ]]; then
        echo "$calls_made" > "$CALL_COUNT_FILE"
        log_status "SUCCESS" "$(driver_display_name) iteration completed successfully"

        # Detect progress: check for file changes (committed or uncommitted)
        local files_changed=0
        if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null; then
            local current_sha
            current_sha=$(git rev-parse HEAD 2>/dev/null || echo "")

            if [[ -n "$loop_start_sha" && -n "$current_sha" && "$loop_start_sha" != "$current_sha" ]]; then
                files_changed=$(
                    {
                        git diff --name-only "$loop_start_sha" "$current_sha" 2>/dev/null
                        git diff --name-only HEAD 2>/dev/null
                        git diff --name-only --cached 2>/dev/null
                    } | sort -u | wc -l
                )
            else
                files_changed=$(
                    {
                        git diff --name-only 2>/dev/null
                        git diff --name-only --cached 2>/dev/null
                    } | sort -u | wc -l
                )
            fi
            # Some wc implementations pad the count with spaces; hand the
            # circuit breaker a bare integer.
            files_changed=${files_changed//[[:space:]]/}
        fi

        # If harness-run reported NO_WORK, don't count file changes as progress.
        # Writing session-issues.md with "NO_WORK" creates git diffs but is NOT real progress.
        # IMPORTANT: Only check non-JSON lines. The prompt text is echoed inside JSON objects
        # and contains these strings as instructions — those are false positives.
        if grep -v '^[[:space:]]*{' "$output_file" 2>/dev/null | grep -qE 'Result: NO_WORK'; then
            files_changed=0
            log_status "INFO" "NO_WORK detected — overriding files_changed to 0 for circuit breaker"
        fi

        local has_errors="false"
        # Only check non-JSON lines for errors. Stream-json output is NDJSON
        # (one JSON object per line), so any line starting with '{' is Claude
        # content — which naturally contains words like "error" and "Exception"
        # in code reviews, test output, and discussion. Grepping those produces
        # false positives that trip the circuit breaker.
        if grep -v '^[[:space:]]*{' "$output_file" 2>/dev/null | \
            grep -qE '(^Error:|^ERROR:|^error:|\]: error|Error occurred|failed with error|[Ee]xception|Fatal|FATAL)'; then
            has_errors="true"
            log_status "WARN" "Errors detected in output"
        fi

        local output_length
        output_length=$(wc -c < "$output_file" 2>/dev/null || echo 0)
        output_length=${output_length//[[:space:]]/}

        # Record in circuit breaker
        record_loop_result "$iteration" "$files_changed" "$has_errors" "$output_length"
        local circuit_result=$?

        if [[ $circuit_result -ne 0 ]]; then
            log_status "WARN" "Circuit breaker opened — halting execution"
            return 3
        fi

        return 0
    elif [[ $exit_code -eq 124 ]]; then
        log_status "WARN" "Iteration timed out after ${ITERATION_TIMEOUT_MINUTES}m"

        # Capture timeout report
        if command -v npx &>/dev/null; then
            log_status "INFO" "Capturing timeout report..."
            # Explicit if/else: with `a && b || c` the failure message would
            # also fire whenever the success log call itself returned non-zero.
            if npx codeharness timeout-report \
                --story "${task_id:-unknown}" \
                --iteration "$iteration" \
                --duration "$ITERATION_TIMEOUT_MINUTES" \
                --output-file "$output_file" \
                --state-snapshot "$state_snapshot_path" 2>/dev/null; then
                log_status "INFO" "Timeout report saved"
            else
                log_status "WARN" "Failed to capture timeout report"
            fi
        fi

        # Verify report file exists with non-zero content
        local report_file="ralph/logs/timeout-report-${iteration}-${task_id:-unknown}.md"
        if [[ -s "$report_file" ]]; then
            log_status "INFO" "Timeout report verified: $report_file"
        fi

        return 1
    else
        # Check for API limit
        if grep -qi "5.*hour.*limit\|limit.*reached.*try.*back\|usage.*limit.*reached" "$output_file" 2>/dev/null; then
            log_status "ERROR" "Claude API usage limit reached"
            return 2
        # Check for transient API errors (500, 529, overloaded) — don't count against story
        # Status code patterns exclude decimal prefixes (e.g., cost_usd=0.503 ≠ HTTP 503)
        elif grep -qiE 'Internal server error|api_error|overloaded|(^|[^0-9.])529([^0-9]|$)|(^|[^0-9.])503([^0-9]|$)' "$output_file" 2>/dev/null; then
            log_status "WARN" "Transient API error (not story's fault) — will retry"
            return 4
        else
            log_status "ERROR" "$(driver_display_name) execution failed (exit code: $exit_code)"
            return 1
        fi
    fi
}
910
-
911
- # ─── Cleanup ─────────────────────────────────────────────────────────────────
912
-
913
# Signal handler for SIGINT/SIGTERM.
# Records the interruption in the status file, prints a short progress
# summary when the main loop had already started, and exits with status 0.
# Globals read: loop_count, CALL_COUNT_FILE, loop_start_time, SPRINT_STATUS_FILE
cleanup() {
    log_status "INFO" "Ralph loop interrupted. Cleaning up..."

    local calls_so_far
    calls_so_far="$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")"
    update_status "$loop_count" "$calls_so_far" "interrupted" "stopped" "user_cancelled"

    # Without a start time or a sprint status path there is nothing
    # meaningful to summarise.
    if [[ -z "$loop_start_time" || -z "$SPRINT_STATUS_FILE" ]]; then
        exit 0
    fi

    # get_task_counts prints "<total> <completed>"
    local summary_counts stories_total stories_done elapsed_secs elapsed_mins
    summary_counts=$(get_task_counts)
    stories_total=${summary_counts%% *}
    stories_done=${summary_counts##* }
    elapsed_secs=$(( $(date +%s) - loop_start_time ))
    elapsed_mins=$(( elapsed_secs / 60 ))

    log_status "INFO" "  Iterations: $loop_count"
    log_status "INFO" "  Stories completed: $stories_done/$stories_total"
    log_status "INFO" "  Elapsed: ${elapsed_mins}m"

    exit 0
}

# Run cleanup when the user interrupts or the process is asked to terminate.
trap cleanup SIGINT SIGTERM
935
-
936
- # ─── Help ────────────────────────────────────────────────────────────────────
937
-
938
# Print usage information to stdout.
# The defaults quoted here must match the values set in the Configuration
# section at the top of the script (MAX_ITERATIONS, MAX_STORY_RETRIES,
# LOOP_TIMEOUT_SECONDS, ITERATION_TIMEOUT_MINUTES, MAX_CALLS_PER_HOUR).
show_help() {
    # Quoted delimiter: the text is emitted literally, nothing is expanded.
    cat << 'HELPEOF'
codeharness Ralph Loop — Autonomous execution with verification gates

Usage: ralph/ralph.sh --plugin-dir DIR [OPTIONS]

Required:
    --plugin-dir DIR          Path to codeharness plugin directory

Options:
    -h, --help                Show this help message
    --max-iterations NUM      Maximum loop iterations (default: 50)
    --max-story-retries NUM   Max retries per story before flagging (default: 10)
    --timeout SECONDS         Total loop timeout in seconds (default: 43200 = 12h)
    --iteration-timeout MIN   Per-iteration timeout in minutes (default: 30)
    --calls NUM               Max API calls per hour (default: 100)
    --prompt FILE             Prompt file for each iteration
    --progress FILE           Progress file (tasks JSON)
    --live                    Show live output streaming
    --reset                   Clear retry counters, flagged stories, and circuit breaker before starting
    --reset-circuit           Reset circuit breaker and exit
    --status                  Show current status and exit

The loop:
    1. Reads next task from progress file
    2. Spawns fresh Claude Code instance with --plugin-dir
    3. Agent implements story (harness hooks enforce verification)
    4. Circuit breaker monitors for stagnation
    5. On completion or gate failure, picks next task or iterates
HELPEOF
}
969
-
970
- # ─── Sprint Summary ──────────────────────────────────────────────────────────
971
-
972
# Print a compact sprint summary at startup.
# Logs "<completed>/<total> done, <remaining> remaining" and, when one can be
# found, the first story in the sprint status file that is neither "done"
# nor flagged, together with its current status.
# Globals read: SPRINT_STATUS_FILE
# Calls:        get_task_counts (prints "<total> <completed>"),
#               is_story_flagged, log_status
print_sprint_summary() {
    local counts
    counts=$(get_task_counts)
    local total=${counts%% *}
    local completed=${counts##* }
    local remaining=$((total - completed))

    # Find next story
    local next_story=""
    local next_status=""
    if [[ -f "$SPRINT_STATUS_FILE" ]]; then
        local key value
        # The "|| [[ -n ... ]]" keeps a final line that lacks a trailing
        # newline; plain `read` returns non-zero for it and would drop it.
        while IFS=: read -r key value || [[ -n "$key$value" ]]; do
            # Trim leading/trailing whitespace without forking sed per line.
            key="${key#"${key%%[![:space:]]*}"}"
            key="${key%"${key##*[![:space:]]}"}"
            value="${value#"${value%%[![:space:]]*}"}"
            value="${value%"${value##*[![:space:]]}"}"

            # Skip blank lines and comments
            [[ -z "$key" || "$key" == \#* ]] && continue

            # Story keys look like "<epic>-<story>-<slug>"
            if [[ "$key" =~ ^[0-9]+-[0-9]+- && "$value" != "done" ]]; then
                if ! is_story_flagged "$key"; then
                    next_story="$key"
                    next_status="$value"
                    break
                fi
            fi
        done < "$SPRINT_STATUS_FILE"
    fi

    if [[ -n "$next_story" ]]; then
        log_status "INFO" "Sprint: ${completed}/${total} done, ${remaining} remaining — next: ${next_story} (${next_status})"
    else
        log_status "INFO" "Sprint: ${completed}/${total} done, ${remaining} remaining"
    fi
}
1004
-
1005
- # ─── Main ────────────────────────────────────────────────────────────────────
1006
-
1007
# Entry point of the Ralph loop.
#
# Validates --plugin-dir, derives all state/log paths from the current
# working directory, checks dependencies, initialises the circuit breaker and
# rate limiting, then runs iterations until one of: max iterations, total
# timeout, circuit breaker halt, all stories complete, or too many
# consecutive failures. A final summary is logged and written to the status
# file.
#
# execute_iteration result codes handled by the loop:
#   0 - iteration succeeded
#   2 - API usage limit reached (sleep 60 minutes)
#   3 - circuit breaker opened (halt)
#   4 - transient API error (retry after 30s, not counted against the story)
#   * - failure such as a timeout or crash (counted against the story)
#
# Globals read:    PLUGIN_DIR, PROGRESS_FILE, PROMPT_FILE, RESET_RETRIES,
#                  MAX_ITERATIONS, MAX_STORY_RETRIES, LOOP_TIMEOUT_SECONDS,
#                  CB_STATE_FILE, UNCHANGED_STORIES, CHANGED_STORIES
# Globals written: PLUGIN_DIR, HARNESS_STATE_DIR, LOG_DIR, STATUS_FILE,
#                  LIVE_LOG_FILE, CALL_COUNT_FILE, TIMESTAMP_FILE,
#                  STORY_RETRY_FILE, FLAGGED_STORIES_FILE, PROGRESS_FILE,
#                  SPRINT_STATUS_FILE, PROMPT_FILE, loop_start_time, loop_count
# Exits 1 on a missing/invalid plugin directory or a missing dependency.
main() {
    if [[ -z "$PLUGIN_DIR" ]]; then
        log_status "ERROR" "Missing required --plugin-dir argument"
        show_help
        exit 1
    fi

    # Resolve paths.
    # Resolve into a temporary first: assigning a failed substitution straight
    # to PLUGIN_DIR would blank it before the error message could report it.
    local resolved_plugin_dir
    if ! resolved_plugin_dir="$(cd "$PLUGIN_DIR" 2>/dev/null && pwd)"; then
        log_status "ERROR" "Plugin directory does not exist: $PLUGIN_DIR"
        exit 1
    fi
    PLUGIN_DIR="$resolved_plugin_dir"

    # Derive state paths from project root (cwd)
    local project_root
    project_root="$(pwd)"

    HARNESS_STATE_DIR="${project_root}/.claude"
    LOG_DIR="${project_root}/ralph/logs"
    STATUS_FILE="${project_root}/ralph/status.json"
    LIVE_LOG_FILE="${project_root}/ralph/live.log"
    CALL_COUNT_FILE="${project_root}/ralph/.call_count"
    TIMESTAMP_FILE="${project_root}/ralph/.last_reset"
    STORY_RETRY_FILE="${project_root}/ralph/.story_retries"
    FLAGGED_STORIES_FILE="${project_root}/ralph/.flagged_stories"

    # Use progress file from argument or default (legacy, optional)
    PROGRESS_FILE="${PROGRESS_FILE:-${project_root}/ralph/progress.json}"

    # codeharness: Sprint status file is the primary task source
    SPRINT_STATUS_FILE="${project_root}/_bmad-output/implementation-artifacts/sprint-status.yaml"

    # Use prompt file from argument or default
    PROMPT_FILE="${PROMPT_FILE:-${project_root}/.ralph/PROMPT.md}"

    # Create directories
    mkdir -p "$LOG_DIR"

    # Check dependencies
    if ! command -v jq &>/dev/null; then
        log_status "ERROR" "Required dependency 'jq' is not installed"
        exit 1
    fi

    # Load platform driver
    load_platform_driver

    # Check CLI binary
    if ! driver_check_available; then
        log_status "ERROR" "$(driver_display_name) CLI not found: $(driver_cli_binary)"
        exit 1
    fi

    # Initialize circuit breaker
    export HARNESS_STATE_DIR
    init_circuit_breaker

    # Initialize rate limiting
    init_call_tracking

    # Crash recovery: detect if resuming from a previous run
    if [[ -f "$STATUS_FILE" ]]; then
        local prev_status
        prev_status=$(jq -r '.status // ""' "$STATUS_FILE" 2>/dev/null || echo "")
        if [[ -n "$prev_status" && "$prev_status" != "completed" ]]; then
            log_status "INFO" "Resuming from last completed story"
        fi
    fi

    # Reset retry state if --reset flag was passed
    if [[ "$RESET_RETRIES" == "true" ]]; then
        if [[ -f "$STORY_RETRY_FILE" ]]; then
            rm -f "$STORY_RETRY_FILE"
            log_status "INFO" "Cleared story retry counters"
        fi
        if [[ -f "$FLAGGED_STORIES_FILE" ]]; then
            rm -f "$FLAGGED_STORIES_FILE"
            log_status "INFO" "Cleared flagged stories"
        fi
        reset_circuit_breaker "Reset via --reset flag"
        log_status "INFO" "Circuit breaker reset to CLOSED"
    fi

    # .story_retries and .flagged_stories are file-based — they persist automatically

    log_status "SUCCESS" "Ralph loop starting"
    log_status "DEBUG" "Plugin: $PLUGIN_DIR"
    log_status "DEBUG" "Max iterations: $MAX_ITERATIONS | Timeout: $((LOOP_TIMEOUT_SECONDS / 3600))h"
    log_status "DEBUG" "Prompt: $PROMPT_FILE"
    log_status "DEBUG" "Sprint status: $SPRINT_STATUS_FILE"
    log_status "DEBUG" "Max story retries: $MAX_STORY_RETRIES"

    # Record loop start time for timeout
    loop_start_time=$(date +%s)

    print_sprint_summary

    local consecutive_failures=0
    local max_consecutive_failures=3

    while true; do
        loop_count=$((loop_count + 1))

        # ── Check loop limits ──

        if [[ $loop_count -gt $MAX_ITERATIONS ]]; then
            update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "max_iterations" "stopped" "max_iterations_reached"

            local counts
            counts=$(get_task_counts)
            local total=${counts%% *}
            local completed=${counts##* }
            log_status "INFO" "Max iterations ($MAX_ITERATIONS) reached. ${completed}/${total} stories complete."
            break
        fi

        # Check total timeout
        local elapsed=$(( $(date +%s) - loop_start_time ))
        if [[ $elapsed -ge $LOOP_TIMEOUT_SECONDS ]]; then
            log_status "WARN" "Loop timeout reached (${LOOP_TIMEOUT_SECONDS}s)"
            update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "timeout" "stopped" "loop_timeout"
            break
        fi

        # ── Check circuit breaker ──

        if should_halt_execution; then
            # Auto-reset: if there are actionable stories (sprint not complete),
            # the breaker was tripped by a previous session's no-ops. Reset and retry.
            if ! all_tasks_complete; then
                log_status "INFO" "Circuit breaker open but actionable stories exist — auto-resetting"
                reset_circuit_breaker "Auto-reset: actionable stories detected"
            else
                local cb_no_progress=0
                if [[ -f "$CB_STATE_FILE" ]]; then
                    cb_no_progress=$(jq -r '.consecutive_no_progress // 0' "$CB_STATE_FILE" 2>/dev/null || echo "0")
                fi
                log_status "WARN" "Circuit breaker: no progress in ${cb_no_progress} iterations"
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "circuit_breaker" "halted" "stagnation_detected"
                break
            fi
        fi

        # ── Check rate limit ──

        if ! can_make_call; then
            wait_for_reset
            continue
        fi

        # ── Check task completion ──

        if all_tasks_complete; then
            local counts
            counts=$(get_task_counts)
            local total=${counts%% *}

            update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "all_complete" "completed" "all_tasks_done"
            log_status "SUCCESS" "All stories complete. ${total} stories verified in ${loop_count} iterations."
            break
        fi

        # ── Get current task ──

        local current_task
        current_task=$(get_current_task)

        log_status "LOOP" "=== Iteration #$loop_count ==="
        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "executing" "running"

        # ── Snapshot story statuses before iteration ──
        local before_snapshot
        before_snapshot=$(snapshot_story_statuses)

        # ── Execute ──

        execute_iteration "$loop_count" "$current_task"
        local exec_result=$?

        case $exec_result in
            0)
                consecutive_failures=0
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "completed" "success"

                # ── Retry tracking: compare sprint-status before/after ──
                # detect_story_changes publishes its result through the
                # UNCHANGED_STORIES and CHANGED_STORIES globals read below.
                local after_snapshot
                after_snapshot=$(snapshot_story_statuses)
                detect_story_changes "$before_snapshot" "$after_snapshot"

                # Only increment retry for the FIRST non-done, non-flagged story
                # (the one harness-run would have picked up). Other stories were
                # never attempted — don't penalise them for not progressing.
                if [[ -n "$UNCHANGED_STORIES" ]]; then
                    while IFS= read -r skey; do
                        [[ -z "$skey" ]] && continue
                        if is_story_flagged "$skey"; then
                            continue
                        fi
                        local retry_count
                        retry_count=$(increment_story_retry "$skey")
                        if [[ $retry_count -ge $MAX_STORY_RETRIES ]]; then
                            log_status "WARN" "Story ${skey} exceeded retry limit (${retry_count}) — flagging and moving on"
                            flag_story "$skey"
                        else
                            log_status "WARN" "Story ${skey} — retry ${retry_count}/${MAX_STORY_RETRIES}"
                        fi
                        break  # only retry the first actionable story
                    done <<< "$UNCHANGED_STORIES"
                fi

                if [[ -n "$CHANGED_STORIES" ]]; then
                    while IFS= read -r skey; do
                        [[ -z "$skey" ]] && continue
                        # Extract story title from story file if available
                        local story_file="$project_root/_bmad-output/implementation-artifacts/${skey}.md"
                        local story_title=""
                        if [[ -f "$story_file" ]]; then
                            story_title=$(grep -m1 '^# \|^## Story' "$story_file" 2>/dev/null | sed 's/^#* *//' | head -c 60)
                        fi
                        local proof_file="$project_root/verification/${skey}-proof.md"
                        local proof_info=""
                        if [[ -f "$proof_file" ]]; then
                            proof_info=" [proof: verification/${skey}-proof.md]"
                        fi
                        if [[ -n "$story_title" ]]; then
                            log_status "SUCCESS" "Story ${skey}: DONE — ${story_title}${proof_info}"
                        else
                            log_status "SUCCESS" "Story ${skey}: DONE${proof_info}"
                        fi
                    done <<< "$CHANGED_STORIES"
                fi

                sleep 5  # Brief pause between iterations
                ;;
            2)
                # API limit — wait or exit
                log_status "WARN" "API usage limit reached. Waiting 60 minutes..."
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "api_limit" "paused"
                sleep 3600
                ;;
            3)
                # Circuit breaker
                log_status "ERROR" "Circuit breaker opened — halting loop"
                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "circuit_breaker" "halted"
                break
                ;;
            4)
                # Transient API error — retry after brief pause, don't count against story
                consecutive_failures=0  # reset — this isn't the story's fault
                log_status "INFO" "Transient API error — retrying in 30s (not counting against story)"
                sleep 30
                ;;
            *)
                # Failure (timeout or crash) — increment retry for the story that was being worked on
                consecutive_failures=$((consecutive_failures + 1))

                # Increment retry for the first non-done, non-flagged story (the one that caused the timeout)
                local after_snap_fail
                after_snap_fail=$(snapshot_story_statuses)
                while IFS=: read -r fkey fstatus; do
                    [[ -z "$fkey" ]] && continue
                    [[ "$fstatus" == "done" ]] && continue
                    if ! is_story_flagged "$fkey"; then
                        local fail_retry
                        fail_retry=$(increment_story_retry "$fkey")
                        if [[ $fail_retry -ge $MAX_STORY_RETRIES ]]; then
                            log_status "WARN" "Story ${fkey} exceeded retry limit (${fail_retry}) after timeout — flagging"
                            flag_story "$fkey"
                        else
                            log_status "WARN" "Story ${fkey} — timeout retry ${fail_retry}/${MAX_STORY_RETRIES}"
                        fi
                        break
                    fi
                done <<< "$after_snap_fail"

                if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
                    log_status "ERROR" "$max_consecutive_failures consecutive failures — halting"
                    update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "consecutive_failures" "halted"
                    break
                fi

                update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "failed" "error"
                log_status "WARN" "Iteration failed ($consecutive_failures/$max_consecutive_failures). Waiting 30s..."
                sleep 30
                ;;
        esac

        # Print progress summary after every iteration
        print_progress_summary

        # ── Show session issues and retro highlights ──
        print_iteration_insights

        log_status "LOOP" "=== End Iteration #$loop_count ==="
    done

    # Final summary — reads from sprint-status.yaml
    local counts
    counts=$(get_task_counts)
    local total=${counts%% *}
    local completed=${counts##* }

    local elapsed_total=$(( $(date +%s) - loop_start_time ))
    local elapsed_min=$(( elapsed_total / 60 ))

    log_status "SUCCESS" "Ralph loop finished"
    log_status "INFO" "  Iterations: $loop_count"
    log_status "INFO" "  Stories completed: $completed/$total"
    log_status "INFO" "  Elapsed: ${elapsed_min}m"
    log_status "INFO" "  API calls: $(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")"

    if [[ $completed -eq $total && $total -gt 0 ]]; then
        log_status "SUCCESS" "All stories complete. $total stories verified in $loop_count iterations."
    fi

    # Write final summary to status file
    update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "final_summary" \
        "$(if [[ $completed -eq $total && $total -gt 0 ]]; then echo "completed"; else echo "stopped"; fi)" \
        "completed:$completed/$total"

}
1328
-
1329
- # ─── CLI Parsing ─────────────────────────────────────────────────────────────
1330
-
1331
# Parse command-line options into the global configuration variables.
# Arguments: the script's command line ("$@").
# Exits 0 after --help, --reset-circuit and --status; exits 1 on an unknown
# option or when a value-taking option is missing its value.
_ralph_parse_cli_args() {
    while [[ $# -gt 0 ]]; do
        case $1 in
            -h|--help)
                show_help
                exit 0
                ;;
            --plugin-dir|--max-iterations|--max-story-retries|--timeout|--iteration-timeout|--calls|--prompt|--progress)
                # Without this guard a trailing option with no value makes
                # "shift 2" fail without shifting, so the loop never ends.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires a value"
                    show_help
                    exit 1
                fi
                case $1 in
                    --plugin-dir)        PLUGIN_DIR="$2" ;;
                    --max-iterations)    MAX_ITERATIONS="$2" ;;
                    --max-story-retries) MAX_STORY_RETRIES="$2" ;;
                    --timeout)           LOOP_TIMEOUT_SECONDS="$2" ;;
                    --iteration-timeout) ITERATION_TIMEOUT_MINUTES="$2" ;;
                    --calls)             MAX_CALLS_PER_HOUR="$2" ;;
                    --prompt)            PROMPT_FILE="$2" ;;
                    --progress)          PROGRESS_FILE="$2" ;;
                esac
                shift 2
                ;;
            --live)
                LIVE_OUTPUT=true
                shift
                ;;
            --reset)
                RESET_RETRIES=true
                shift
                ;;
            --reset-circuit)
                # Derive state paths so circuit breaker uses the correct directory
                HARNESS_STATE_DIR="$(pwd)/.claude"
                export HARNESS_STATE_DIR
                init_circuit_breaker
                reset_circuit_breaker "Manual reset via CLI"
                echo "Circuit breaker reset to CLOSED"
                exit 0
                ;;
            --status)
                _status_file="$(pwd)/ralph/status.json"
                if [[ -f "$_status_file" ]]; then
                    # Pretty-print when the file is valid JSON, otherwise dump it raw
                    jq . "$_status_file" 2>/dev/null || cat "$_status_file"
                else
                    echo "No status file found."
                fi
                exit 0
                ;;
            *)
                echo "Unknown option: $1"
                show_help
                exit 1
                ;;
        esac
    done
}

# Only parse arguments and start the loop when executed directly; sourcing
# the file just defines the functions.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then

    _ralph_parse_cli_args "$@"

    main

fi