agentic-loop 3.18.2 → 3.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/.claude/commands/tour.md +11 -7
  2. package/.claude/commands/vibe-help.md +5 -2
  3. package/.claude/commands/vibe-list.md +17 -2
  4. package/.claude/skills/prd/SKILL.md +21 -6
  5. package/.claude/skills/setup-review/SKILL.md +56 -0
  6. package/.claude/skills/tour/SKILL.md +11 -7
  7. package/.claude/skills/vibe-help/SKILL.md +2 -1
  8. package/.claude/skills/vibe-list/SKILL.md +5 -2
  9. package/.pre-commit-hooks.yaml +8 -0
  10. package/README.md +4 -0
  11. package/bin/agentic-loop.sh +7 -0
  12. package/bin/ralph.sh +35 -0
  13. package/dist/checks/check-signs-secrets.d.ts +9 -0
  14. package/dist/checks/check-signs-secrets.d.ts.map +1 -0
  15. package/dist/checks/check-signs-secrets.js +57 -0
  16. package/dist/checks/check-signs-secrets.js.map +1 -0
  17. package/dist/checks/index.d.ts +2 -5
  18. package/dist/checks/index.d.ts.map +1 -1
  19. package/dist/checks/index.js +4 -9
  20. package/dist/checks/index.js.map +1 -1
  21. package/dist/index.d.ts +1 -1
  22. package/dist/index.d.ts.map +1 -1
  23. package/dist/index.js +1 -1
  24. package/dist/index.js.map +1 -1
  25. package/package.json +2 -1
  26. package/ralph/hooks/common.sh +47 -0
  27. package/ralph/hooks/warn-debug.sh +12 -26
  28. package/ralph/hooks/warn-empty-catch.sh +21 -34
  29. package/ralph/hooks/warn-secrets.sh +39 -52
  30. package/ralph/hooks/warn-urls.sh +25 -45
  31. package/ralph/init.sh +60 -82
  32. package/ralph/loop.sh +533 -53
  33. package/ralph/prd-check.sh +177 -236
  34. package/ralph/prd.sh +5 -2
  35. package/ralph/setup/quick-setup.sh +2 -16
  36. package/ralph/setup.sh +68 -80
  37. package/ralph/signs.sh +8 -0
  38. package/ralph/uat.sh +2015 -0
  39. package/ralph/utils.sh +198 -69
  40. package/ralph/verify/tests.sh +65 -10
  41. package/templates/PROMPT.md +10 -4
  42. package/templates/UAT-PROMPT.md +197 -0
  43. package/templates/config/elixir.json +0 -2
  44. package/templates/config/fastmcp.json +0 -2
  45. package/templates/config/fullstack.json +2 -4
  46. package/templates/config/go.json +0 -2
  47. package/templates/config/minimal.json +0 -2
  48. package/templates/config/node.json +0 -2
  49. package/templates/config/python.json +0 -2
  50. package/templates/config/rust.json +0 -2
  51. package/templates/prd-example.json +6 -8
package/ralph/uat.sh ADDED
@@ -0,0 +1,2015 @@
1
+ #!/usr/bin/env bash
2
+ # shellcheck shell=bash
3
+ # uat.sh - UAT + Chaos Agent: Autonomous Testing Loops
4
+ #
5
+ # ============================================================================
6
+ # OVERVIEW
7
+ # ============================================================================
8
+ # Two commands share this file:
9
+ # uat — Acceptance testing team. "Does this work correctly?"
10
+ # chaos-agent — Chaos Agent red team. "Can we break it?"
11
+ #
12
+ # Both use Agent Teams for coordinated discovery, then strict TDD per test case:
13
+ # RED: Claude writes the test only (no app changes)
14
+ # GREEN: Claude fixes the app only (no test changes)
15
+ #
16
+ # 3-Phase Flow:
17
+ # Phase 1: DISCOVER + PLAN — Agent team explores app, generates plan
18
+ # Phase 2: EXECUTE LOOP — Per test case: RED (test) → GREEN (fix)
19
+ # Phase 3: REPORT — Summary of findings
20
+ #
21
+ # ============================================================================
22
+ # DEPENDENCIES: Requires utils.sh sourced first (get_config, print_*, etc.)
23
+ # ============================================================================
24
+
25
+ # UAT-specific directory variables (initialized by _init_uat_dirs)
26
+ UAT_MODE_DIR=""
27
+ UAT_PLAN_FILE=""
28
+ UAT_PROGRESS_FILE=""
29
+ UAT_FAILURE_FILE=""
30
+ UAT_SCREENSHOTS_DIR=""
31
+ UAT_MODE_LABEL=""
32
+ UAT_CONFIG_NS="" # config namespace: "uat" or "chaos"
33
+ UAT_CMD_NAME="" # CLI command name: "uat" or "chaos-agent"
34
+
35
+ # TDD phases
36
+ readonly UAT_PHASE_RED="RED"
37
+ readonly UAT_PHASE_GREEN="GREEN"
38
+
39
+ # Defaults (overridable via config)
40
+ readonly DEFAULT_UAT_MAX_ITERATIONS=20
41
+ readonly DEFAULT_UAT_MAX_SESSION_SECONDS=600
42
+ readonly DEFAULT_UAT_MAX_CASE_RETRIES=5
43
+
44
+ # Team mode timeouts (longer — Claude coordinates parallel agents)
45
+ readonly DEFAULT_UAT_SESSION_SECONDS=1800
46
+ readonly DEFAULT_CHAOS_SESSION_SECONDS=1800
47
+
48
+ # ============================================================================
49
+ # DIRECTORY INIT
50
+ # ============================================================================
51
+
52
+ _init_uat_dirs() {
53
+ local subdir="${1:-uat}"
54
+ local label="${2:-UAT}"
55
+ local cmd="${3:-$subdir}"
56
+ UAT_MODE_DIR="$RALPH_DIR/$subdir"
57
+ UAT_PLAN_FILE="$UAT_MODE_DIR/plan.json"
58
+ UAT_PROGRESS_FILE="$UAT_MODE_DIR/progress.txt"
59
+ UAT_FAILURE_FILE="$UAT_MODE_DIR/last_failure.txt"
60
+ UAT_SCREENSHOTS_DIR="$UAT_MODE_DIR/screenshots"
61
+ UAT_MODE_LABEL="$label"
62
+ UAT_CONFIG_NS="$subdir"
63
+ UAT_CMD_NAME="$cmd"
64
+ }
65
+
66
+ # ============================================================================
67
+ # SHARED ARG PARSING
68
+ # ============================================================================
69
+
70
+ # Sets: _ARG_FOCUS, _ARG_PLAN_ONLY, _ARG_FORCE_REVIEW, _ARG_NO_FIX,
71
+ # _ARG_MAX_ITERATIONS, _ARG_QUIET_MODE
72
+ _parse_uat_args() {
73
+ _ARG_FOCUS=""
74
+ _ARG_PLAN_ONLY=false
75
+ _ARG_FORCE_REVIEW=false
76
+ _ARG_NO_FIX=false
77
+ _ARG_MAX_ITERATIONS=""
78
+ _ARG_QUIET_MODE=$(get_config '.quiet' "false")
79
+
80
+ while [[ $# -gt 0 ]]; do
81
+ case "$1" in
82
+ --focus)
83
+ _ARG_FOCUS="$2"
84
+ shift 2
85
+ ;;
86
+ --plan-only)
87
+ _ARG_PLAN_ONLY=true
88
+ shift
89
+ ;;
90
+ --review)
91
+ _ARG_FORCE_REVIEW=true
92
+ shift
93
+ ;;
94
+ --no-fix)
95
+ _ARG_NO_FIX=true
96
+ shift
97
+ ;;
98
+ --max)
99
+ _ARG_MAX_ITERATIONS="$2"
100
+ shift 2
101
+ ;;
102
+ --quiet)
103
+ _ARG_QUIET_MODE=true
104
+ shift
105
+ ;;
106
+ *)
107
+ shift
108
+ ;;
109
+ esac
110
+ done
111
+ }
112
+
113
+ # ============================================================================
114
+ # ENTRY POINT
115
+ # ============================================================================
116
+
117
+ run_uat() {
118
+ _parse_uat_args "$@"
119
+
120
+ local focus="$_ARG_FOCUS"
121
+ local plan_only="$_ARG_PLAN_ONLY"
122
+ local force_review="$_ARG_FORCE_REVIEW"
123
+ local no_fix="$_ARG_NO_FIX"
124
+ local max_iterations="$_ARG_MAX_ITERATIONS"
125
+ local quiet_mode="$_ARG_QUIET_MODE"
126
+
127
+ # Initialize directories for UAT mode
128
+ _init_uat_dirs "uat" "UAT"
129
+
130
+ # Validate prerequisites
131
+ check_dependencies
132
+
133
+ # Concurrent execution guard
134
+ _acquire_uat_lock
135
+
136
+ # Ensure directory structure
137
+ mkdir -p "$UAT_MODE_DIR" "$UAT_SCREENSHOTS_DIR"
138
+
139
+ # Banner
140
+ _print_uat_banner
141
+
142
+ # Phase 1: Discover + Plan
143
+ if [[ ! -f "$UAT_PLAN_FILE" ]] || [[ "$force_review" == "true" ]] || [[ "$plan_only" == "true" ]]; then
144
+ if [[ -f "$UAT_PLAN_FILE" ]] && [[ "$force_review" == "true" ]]; then
145
+ print_info "Re-reviewing existing plan..."
146
+ else
147
+ echo ""
148
+ print_info "Phase 1: Exploring your app and building a test plan"
149
+ echo ""
150
+ if ! _discover_and_plan "$quiet_mode" "uat"; then
151
+ print_error "Something went wrong while exploring your app. See the progress log for details."
152
+ return 1
153
+ fi
154
+ fi
155
+
156
+ # Review the plan
157
+ if ! _review_plan; then
158
+ print_info "Plan review cancelled. No changes were made."
159
+ return 0
160
+ fi
161
+
162
+ if [[ "$plan_only" == "true" ]]; then
163
+ print_success "Plan generated. Run 'npx agentic-loop uat' to execute."
164
+ return 0
165
+ fi
166
+ else
167
+ local remaining
168
+ remaining=$(jq '[.testCases[] | select(.passes==false)] | length' "$UAT_PLAN_FILE" 2>/dev/null || echo "0")
169
+ print_info "Picking up where we left off ($remaining tests still to go)"
170
+ fi
171
+
172
+ # Phase 2: Execute Loop
173
+ echo ""
174
+ print_info "Phase 2: Running tests and fixing issues"
175
+ echo ""
176
+ _run_uat_loop "$focus" "$no_fix" "$max_iterations" "$quiet_mode"
177
+ local loop_exit=$?
178
+
179
+ # Phase 3: Report
180
+ _print_report
181
+
182
+ return $loop_exit
183
+ }
184
+
185
+ # ============================================================================
186
+ # CHAOS AGENT ENTRY POINT
187
+ # ============================================================================
188
+
189
+ run_chaos() {
190
+ _parse_uat_args "$@"
191
+
192
+ local focus="$_ARG_FOCUS"
193
+ local plan_only="$_ARG_PLAN_ONLY"
194
+ local force_review="$_ARG_FORCE_REVIEW"
195
+ local no_fix="$_ARG_NO_FIX"
196
+ local max_iterations="$_ARG_MAX_ITERATIONS"
197
+ local quiet_mode="$_ARG_QUIET_MODE"
198
+
199
+ # Initialize directories for chaos mode
200
+ _init_uat_dirs "chaos" "Chaos Agent" "chaos-agent"
201
+
202
+ # Validate prerequisites
203
+ check_dependencies
204
+
205
+ # Concurrent execution guard
206
+ _acquire_uat_lock
207
+
208
+ # Ensure directory structure
209
+ mkdir -p "$UAT_MODE_DIR" "$UAT_SCREENSHOTS_DIR"
210
+
211
+ # Banner
212
+ _print_chaos_banner
213
+
214
+ # Phase 1: Adversarial Discovery + Plan
215
+ if [[ ! -f "$UAT_PLAN_FILE" ]] || [[ "$force_review" == "true" ]] || [[ "$plan_only" == "true" ]]; then
216
+ if [[ -f "$UAT_PLAN_FILE" ]] && [[ "$force_review" == "true" ]]; then
217
+ print_info "Re-reviewing existing plan..."
218
+ else
219
+ echo ""
220
+ print_info "Phase 1: Red team exploring your app for vulnerabilities"
221
+ echo ""
222
+ if ! _discover_and_plan "$quiet_mode" "chaos"; then
223
+ print_error "Something went wrong during red team exploration. See the progress log for details."
224
+ return 1
225
+ fi
226
+ fi
227
+
228
+ # Review the plan
229
+ if ! _review_plan; then
230
+ print_info "Plan review cancelled. No changes were made."
231
+ return 0
232
+ fi
233
+
234
+ if [[ "$plan_only" == "true" ]]; then
235
+ print_success "Plan generated. Run 'npx agentic-loop chaos-agent' to execute."
236
+ return 0
237
+ fi
238
+ else
239
+ local remaining
240
+ remaining=$(jq '[.testCases[] | select(.passes==false)] | length' "$UAT_PLAN_FILE" 2>/dev/null || echo "0")
241
+ print_info "Picking up where we left off ($remaining tests still to go)"
242
+ fi
243
+
244
+ # Phase 2: Testing for vulnerabilities and fixing issues
245
+ echo ""
246
+ print_info "Phase 2: Running attack tests and fixing issues"
247
+ echo ""
248
+ _run_uat_loop "$focus" "$no_fix" "$max_iterations" "$quiet_mode"
249
+ local loop_exit=$?
250
+
251
+ # Phase 3: Report
252
+ _print_report
253
+
254
+ return $loop_exit
255
+ }
256
+
257
+ # ============================================================================
258
+ # CONCURRENT EXECUTION GUARD
259
+ # ============================================================================
260
+
261
+ _acquire_uat_lock() {
262
+ local lockfile="$RALPH_DIR/.lock"
263
+ if [[ -f "$lockfile" ]]; then
264
+ local pid
265
+ pid=$(cat "$lockfile")
266
+ if kill -0 "$pid" 2>/dev/null; then
267
+ print_error "Another $UAT_MODE_LABEL session is already running. Stop it first with 'npx agentic-loop stop'."
268
+ exit 1
269
+ fi
270
+ rm -f "$lockfile" # Stale lock
271
+ fi
272
+ echo $$ > "$lockfile"
273
+ # Chain cleanup: lock removal + kill child processes
274
+ # This replaces the trap from ralph.sh, so we handle both concerns
275
+ trap '_uat_cleanup' EXIT
276
+ trap '_uat_interrupt' INT TERM
277
+ }
278
+
279
+ _uat_cleanup() {
280
+ rm -f "$RALPH_DIR/.lock"
281
+ }
282
+
283
+ _uat_interrupt() {
284
+ echo ""
285
+ print_warning "Interrupted. Wrapping up $UAT_MODE_LABEL..."
286
+ # Kill all child processes (Claude sessions, test runners)
287
+ kill 0 2>/dev/null || true
288
+ _uat_cleanup
289
+ exit 130
290
+ }
291
+
292
+ # ============================================================================
293
+ # PHASE 1: DISCOVER + PLAN
294
+ # ============================================================================
295
+
296
+ _discover_and_plan() {
297
+ local quiet="${1:-false}"
298
+ local mode="${2:-uat}"
299
+ local prompt_file output_file
300
+ prompt_file=$(create_temp_file ".uat-discover-prompt.md")
301
+ output_file=$(create_temp_file ".uat-discover-output.log")
302
+
303
+ local timeout
304
+ if [[ "$mode" == "chaos" ]]; then
305
+ timeout=$(get_config '.chaos.sessionSeconds' "$DEFAULT_CHAOS_SESSION_SECONDS")
306
+ _build_chaos_agent_prompt "$prompt_file"
307
+ _log_uat "DISCOVER" "Starting Chaos Agent discovery (timeout: ${timeout}s)"
308
+ else
309
+ timeout=$(get_config '.uat.sessionSeconds' "$DEFAULT_UAT_SESSION_SECONDS")
310
+ _build_uat_team_prompt "$prompt_file"
311
+ _log_uat "DISCOVER" "Starting UAT team discovery (timeout: ${timeout}s)"
312
+ fi
313
+
314
+ # Run Claude with MCP exploration
315
+ local claude_exit=0
316
+ (
317
+ set -o pipefail
318
+ cat "$prompt_file" | run_with_timeout "$timeout" claude -p \
319
+ --dangerously-skip-permissions \
320
+ --verbose \
321
+ --output-format stream-json \
322
+ 2>&1 | tee "$output_file" | _parse_uat_activity "$quiet"
323
+ ) &
324
+ local pipeline_pid=$!
325
+ wait "$pipeline_pid" || claude_exit=$?
326
+
327
+ if [[ $claude_exit -ne 0 ]]; then
328
+ _log_uat "DISCOVER" "Claude session failed (exit $claude_exit)"
329
+ print_error "App exploration session failed"
330
+ if [[ -f "$output_file" ]]; then
331
+ echo " Last output:"
332
+ tail -10 "$output_file" | sed 's/^/ /'
333
+ fi
334
+ return 1
335
+ fi
336
+
337
+ # Validate plan was generated
338
+ if [[ ! -f "$UAT_PLAN_FILE" ]]; then
339
+ print_error "No test plan was created"
340
+ echo ""
341
+ echo " The exploration finished but didn't produce a plan."
342
+ echo " Check the output above for what went wrong."
343
+ return 1
344
+ fi
345
+
346
+ if ! _validate_plan; then
347
+ print_error "The generated plan has errors and can't be used"
348
+ return 1
349
+ fi
350
+
351
+ # Check if project-specific prompt was generated
352
+ if [[ ! -f "$UAT_MODE_DIR/UAT-PROMPT.md" ]]; then
353
+ print_warning "No project-specific test instructions were created."
354
+ echo " Tests will use generic patterns instead."
355
+ echo " For better results, re-run with 'npx agentic-loop $UAT_CMD_NAME --plan-only'."
356
+ fi
357
+
358
+ # Mark plan as generated
359
+ update_json "$UAT_PLAN_FILE" '.testSuite.status = "planned"'
360
+
361
+ local case_count
362
+ case_count=$(jq '.testCases | length' "$UAT_PLAN_FILE")
363
+ _log_uat "DISCOVER" "Plan generated with $case_count test cases"
364
+ print_success "Plan generated: $case_count test cases"
365
+
366
+ return 0
367
+ }
368
+
369
+ _build_uat_team_prompt() {
370
+ local prompt_file="$1"
371
+
372
+ # Start with UAT prompt template
373
+ cat "$RALPH_TEMPLATES/UAT-PROMPT.md" > "$prompt_file"
374
+
375
+ cat >> "$prompt_file" << 'PROMPT_SECTION'
376
+
377
+ ---
378
+
379
+ ## Phase: UAT Team Discovery + Plan Generation
380
+
381
+ You are the **team lead** of an acceptance testing team. Your job is to coordinate a team of
382
+ agents that explore a live app, verify features work correctly, and produce a comprehensive
383
+ UAT plan.
384
+
385
+ ### Step 1: Recon (~60 seconds)
386
+
387
+ Before spawning anyone, do a quick recon yourself:
388
+
389
+ 1. **Read `.ralph/config.json`** for URLs, auth config, and directories
390
+ 2. **Read `.ralph/prd.json`** if it exists — completed stories tell you what was built
391
+ 3. **Navigate the app** using Playwright MCP — click through nav, find pages, note the tech stack
392
+ 4. **Take 2-3 screenshots** of key pages (save to `.ralph/uat/screenshots/`)
393
+ 5. **Map the feature areas** — what exists? (auth, forms, API, navigation, etc.)
394
+
395
+ Don't go deep. Just map what's there. ~60 seconds max.
396
+
397
+ ### Step 2: Assemble the UAT Team
398
+
399
+ Create a team and spawn teammates:
400
+
401
+ ```
402
+ TeamCreate: "uat-team"
403
+ ```
404
+
405
+ Spawn these teammates using the Task tool with `team_name: "uat-team"`:
406
+
407
+ 1. **"recon"** (`subagent_type: "general-purpose"`) — Deep recon. Maps all routes/endpoints,
408
+ catalogs forms with selectors, identifies tech stack and auth. Shares intel with teammates
409
+ via SendMessage.
410
+
411
+ 2. **"happy-path-{area}"** (`subagent_type: "general-purpose"`) — One per feature area.
412
+ Completes primary user journeys, records correct behavior as ground truth assertions
413
+ (exact text, redirects, success messages).
414
+
415
+ 3. **"edge-cases"** (`subagent_type: "general-purpose"`) — Tests boundary conditions across
416
+ all areas. Empty fields, long input, required-field validation, back button after submit,
417
+ refresh mid-flow. Focus: does the app handle these gracefully?
418
+
419
+ **Only spawn agents for areas that exist.** If there are no forms, don't spawn a forms specialist.
420
+ If there's no auth, skip auth testing.
421
+
422
+ Mindset: **"Verify the app works correctly for real users."**
423
+
424
+ ### Agent Instructions Template
425
+
426
+ Every agent prompt MUST include:
427
+
428
+ 1. **Their role and focus area** (from above)
429
+ 2. **The recon intel** — pages, URLs, tech stack you discovered in Step 1
430
+ 3. **Browser tab isolation** — "Open your own browser tab via `browser_tabs(action: 'new')`
431
+ before navigating. Do NOT use the existing tab."
432
+ 4. **Communication** — "Share important discoveries with teammates via SendMessage.
433
+ Examples: 'Login redirects to /dashboard after success', 'Registration form has 4 required fields',
434
+ 'Profile page shows user email and name'. Read messages from teammates and adapt your testing."
435
+ 5. **Output format** — "When done, send your findings to the team lead via SendMessage.
436
+ Format each finding as a test case with: title, category, testFile path, targetFiles,
437
+ assertions (input/expected/strategy), and edgeCases."
438
+
439
+ ### Step 3: Coordinate
440
+
441
+ While your team works:
442
+
443
+ - **Monitor messages** from teammates as they report findings
444
+ - **Redirect effort** if needed — if recon discovers something important, message the
445
+ relevant specialist
446
+ - **Create tasks** in the shared task list for any new areas discovered
447
+
448
+ ### Step 4: Collect + Merge + Write Plan
449
+
450
+ After all teammates finish:
451
+
452
+ 1. Collect findings from all agent messages
453
+ 2. Dedup by test file path (keep the case with more assertions)
454
+ 3. Assign sequential IDs: `UAT-001`, `UAT-002`, ...
455
+ 4. Write `.ralph/uat/plan.json` (schema below)
456
+ 5. Write `.ralph/uat/UAT-PROMPT.md` (schema below)
457
+ 6. Shut down all teammates via SendMessage with `type: "shutdown_request"`
458
+ 7. Clean up with TeamDelete
459
+
460
+ ### plan.json Schema
461
+
462
+ Write `.ralph/uat/plan.json`:
463
+
464
+ ```json
465
+ {
466
+ "testSuite": {
467
+ "name": "UAT Loop",
468
+ "generatedAt": "<ISO timestamp>",
469
+ "status": "pending",
470
+ "discoveryMethod": "uat-team"
471
+ },
472
+ "testCases": [
473
+ {
474
+ "id": "UAT-001",
475
+ "title": "Feature area — what the test checks",
476
+ "category": "auth|forms|navigation|api|ui|data",
477
+ "type": "e2e|integration",
478
+ "userStory": "As a user, I...",
479
+ "testApproach": "What to test and how",
480
+ "testFile": "tests/e2e/feature/test-name.spec.ts",
481
+ "targetFiles": ["src/pages/feature.tsx"],
482
+ "edgeCases": ["Edge case 1", "Edge case 2"],
483
+ "assertions": [
484
+ {
485
+ "input": "Fill name='John', submit form",
486
+ "expected": "Shows 'Welcome, John'",
487
+ "strategy": "keyword"
488
+ }
489
+ ],
490
+ "passes": false,
491
+ "retryCount": 0,
492
+ "source": "uat-team:agent-name"
493
+ }
494
+ ]
495
+ }
496
+ ```
497
+
498
+ **Every test case MUST have at least 3 assertions** with concrete input/expected pairs:
499
+ 1. One happy-path assertion (correct input → correct output)
500
+ 2. One edge-case assertion (bad input → proper error handling)
501
+ 3. One content assertion (page shows the RIGHT data, not just that it loads)
502
+
503
+ ### UAT-PROMPT.md Schema
504
+
505
+ Write `.ralph/uat/UAT-PROMPT.md` — a project-specific testing guide based on what the
506
+ team ACTUALLY FOUND. Include:
507
+
508
+ ```markdown
509
+ # UAT Guide — [Project Name]
510
+
511
+ ## App Overview
512
+ - What the app does (1-2 sentences)
513
+ - Tech stack observed (framework, API patterns, auth method)
514
+ - Base URLs (frontend, API if applicable)
515
+
516
+ ## Pages & Routes Discovered
517
+ For each page:
518
+ - URL pattern and what it shows
519
+ - Key interactive elements (forms, buttons, links)
520
+ - Selectors that work (data-testid, roles, labels)
521
+
522
+ ## Auth Flow
523
+ - How login works (form fields, redirect after login)
524
+ - Test credentials if available (from config or .env)
525
+ - What pages require auth vs. public
526
+
527
+ ## Known Forms & Inputs
528
+ For each form:
529
+ - Fields with their labels/names/selectors
530
+ - Required vs optional fields
531
+ - Validation behavior observed
532
+
533
+ ## What "Correct" Looks Like
534
+ For each feature area:
535
+ - Expected behavior observed
536
+ - Specific text/numbers that should appear
537
+
538
+ ## Console & Network Observations
539
+ - Any existing console errors/warnings
540
+ - API endpoints observed
541
+ - Response patterns (JSON structure, status codes)
542
+ ```
543
+
544
+ This is NOT a copy of the template — it's ground truth from the team's exploration.
545
+
546
+ ### Rules
547
+
548
+ - Test auth flows FIRST (they gate everything else)
549
+ - One test case per feature area (not per edge case)
550
+ - Include edge cases as a list within each test case
551
+ - **Every test case MUST have assertions with input/expected pairs**
552
+ - `type: "e2e"` for anything involving browser interaction
553
+ - `type: "integration"` for API-only tests
554
+ - `targetFiles` should list the app source files the test covers
555
+ - `testFile` path should use the project's test directory conventions
556
+ - Aim for 5-15 test cases depending on app complexity
557
+ - Always clean up: shutdown teammates and delete team when done
558
+ PROMPT_SECTION
559
+
560
+ _inject_prompt_context "$prompt_file"
561
+ }
562
+
563
+ _validate_plan() {
564
+ # Check JSON is valid
565
+ if ! jq -e '.' "$UAT_PLAN_FILE" >/dev/null 2>&1; then
566
+ print_error "Test plan file is corrupted (not valid JSON)"
567
+ return 1
568
+ fi
569
+
570
+ # Check required structure
571
+ if ! jq -e '.testSuite and .testCases' "$UAT_PLAN_FILE" >/dev/null 2>&1; then
572
+ print_error "Test plan is incomplete — missing required sections"
573
+ return 1
574
+ fi
575
+
576
+ # Check test cases have required fields
577
+ local invalid_cases
578
+ invalid_cases=$(jq '[.testCases[] | select(.id == null or .title == null or .testFile == null)] | length' "$UAT_PLAN_FILE" 2>/dev/null)
579
+ if [[ "$invalid_cases" -gt 0 ]]; then
580
+ print_error "$invalid_cases test case(s) are incomplete — each needs an ID, title, and test file"
581
+ return 1
582
+ fi
583
+
584
+ # Check test cases have assertions (the eval contract)
585
+ local missing_assertions
586
+ missing_assertions=$(jq '[.testCases[] | select((.assertions // []) | length < 1)] | length' "$UAT_PLAN_FILE" 2>/dev/null)
587
+ if [[ "$missing_assertions" -gt 0 ]]; then
588
+ print_warning "$missing_assertions test case(s) have no expected results defined — tests may not catch real issues"
589
+ echo " Each test case should describe what to check (input and expected outcome)."
590
+ echo " Run 'npx agentic-loop $UAT_CMD_NAME --review' to edit the plan and add them."
591
+ # Warning only, not a hard failure — Claude may add assertions during execution
592
+ fi
593
+
594
+ return 0
595
+ }
596
+
597
+ # ============================================================================
598
+ # PLAN REVIEW
599
+ # ============================================================================
600
+
601
+ _review_plan() {
602
+ echo ""
603
+ echo " ┌──────────────────────────────────────────────────────┐"
604
+ printf " │ %-54s│\n" "$UAT_MODE_LABEL Test Plan"
605
+ echo " └──────────────────────────────────────────────────────┘"
606
+ echo ""
607
+
608
+ local total_cases
609
+ total_cases=$(jq '.testCases | length' "$UAT_PLAN_FILE")
610
+
611
+ # Print summary table
612
+ local idx=0
613
+ while IFS=$'\t' read -r id title category tc_type edge_count assert_count; do
614
+ idx=$((idx + 1))
615
+ local type_icon=""
616
+ case "$tc_type" in
617
+ e2e) type_icon="🌐" ;;
618
+ integration) type_icon="🔌" ;;
619
+ *) type_icon="📝" ;;
620
+ esac
621
+
622
+ # Truncate title
623
+ local display_title="$title"
624
+ [[ ${#display_title} -gt 40 ]] && display_title="${display_title:0:37}..."
625
+
626
+ printf " %s %-10s %-40s [%s edge cases, %s checks]\n" "$type_icon" "$id" "$display_title" "$edge_count" "$assert_count"
627
+ done < <(jq -r '.testCases[] | [.id, .title, .category, .type, (.edgeCases | length | tostring), ((.assertions // []) | length | tostring)] | @tsv' "$UAT_PLAN_FILE" 2>/dev/null)
628
+
629
+ echo ""
630
+ echo " Total: $total_cases test cases"
631
+ echo ""
632
+
633
+ # Prompt for review
634
+ local response
635
+ read -r -p " Execute this plan? [Y/n/e(dit)] " response
636
+
637
+ case "$response" in
638
+ [Nn])
639
+ return 1
640
+ ;;
641
+ [Ee])
642
+ local editor="${EDITOR:-vi}"
643
+ "$editor" "$UAT_PLAN_FILE"
644
+ # Re-validate after edit
645
+ if ! _validate_plan; then
646
+ print_error "Your edits made the plan invalid. Please fix and try again."
647
+ return 1
648
+ fi
649
+ # Mark as reviewed
650
+ update_json "$UAT_PLAN_FILE" \
651
+ --arg ts "$(date -Iseconds 2>/dev/null || date +%Y-%m-%dT%H:%M:%S)" \
652
+ '.testSuite.reviewedAt = $ts'
653
+ ;;
654
+ *)
655
+ # Mark as reviewed
656
+ update_json "$UAT_PLAN_FILE" \
657
+ --arg ts "$(date -Iseconds 2>/dev/null || date +%Y-%m-%dT%H:%M:%S)" \
658
+ '.testSuite.reviewedAt = $ts'
659
+ ;;
660
+ esac
661
+
662
+ return 0
663
+ }
664
+
665
+ # ============================================================================
666
+ # PHASE 2: EXECUTE LOOP
667
+ # ============================================================================
668
+
669
+ _run_uat_loop() {
670
+ local focus="$1"
671
+ local no_fix="$2"
672
+ local max_iterations_arg="$3"
673
+ local quiet="$4"
674
+
675
+ local max_iterations
676
+ max_iterations="${max_iterations_arg:-$(get_config ".$UAT_CONFIG_NS.maxIterations" "$DEFAULT_UAT_MAX_ITERATIONS")}"
677
+ local max_case_retries
678
+ max_case_retries=$(get_config ".$UAT_CONFIG_NS.maxCaseRetries" "$DEFAULT_UAT_MAX_CASE_RETRIES")
679
+ local timeout
680
+ timeout=$(get_config ".$UAT_CONFIG_NS.maxSessionSeconds" "$DEFAULT_UAT_MAX_SESSION_SECONDS")
681
+
682
+ local iteration=0
683
+
684
+ # Track results for report
685
+ UAT_TESTS_WRITTEN=0
686
+ UAT_BUGS_FOUND=0
687
+ UAT_BUGS_FIXED=0
688
+ UAT_CASES_PASSED=0
689
+ UAT_CASES_FAILED=0
690
+ UAT_CASES_SKIPPED=0
691
+ UAT_RED_ONLY_PASSED=0
692
+ UAT_GREEN_ATTEMPTS=0
693
+ UAT_FILES_FIXED=()
694
+ UAT_NEEDS_HUMAN=()
695
+
696
+ while [[ $iteration -lt $max_iterations ]]; do
697
+ # Check for stop signal
698
+ if [[ -f "$RALPH_DIR/.stop" ]]; then
699
+ rm -f "$RALPH_DIR/.stop"
700
+ print_warning "Stop requested. Finishing up..."
701
+ break
702
+ fi
703
+
704
+ iteration=$((iteration + 1))
705
+
706
+ # Pick next incomplete test case (with optional focus filter)
707
+ local case_id
708
+ if [[ -n "$focus" ]]; then
709
+ # Focus can be a case ID (UAT-003) or category (auth)
710
+ case_id=$(jq -r --arg f "$focus" '
711
+ .testCases[] |
712
+ select(.passes==false) |
713
+ select(.id==$f or .category==$f) |
714
+ .id
715
+ ' "$UAT_PLAN_FILE" | head -1)
716
+ else
717
+ case_id=$(jq -r '.testCases[] | select(.passes==false) | .id' "$UAT_PLAN_FILE" | head -1)
718
+ fi
719
+
720
+ # All done?
721
+ if [[ -z "$case_id" ]]; then
722
+ break
723
+ fi
724
+
725
+ # Get case details
726
+ local case_json case_title case_type
727
+ case_json=$(jq --arg id "$case_id" '.testCases[] | select(.id==$id)' "$UAT_PLAN_FILE")
728
+ case_title=$(echo "$case_json" | jq -r '.title')
729
+ case_type=$(echo "$case_json" | jq -r '.type // "e2e"')
730
+
731
+ # Read TDD phase state (null = start RED, "red" = resume GREEN)
732
+ local phase
733
+ phase=$(echo "$case_json" | jq -r '.phase // "null"')
734
+
735
+ # Compute per-phase retry counts (default 0 for old plan.json files)
736
+ local red_retries green_retries
737
+ red_retries=$(echo "$case_json" | jq -r '.redRetries // 0')
738
+ green_retries=$(echo "$case_json" | jq -r '.greenRetries // 0')
739
+
740
+ # Circuit breaker: combined red + green retries
741
+ local total_retries=$((red_retries + green_retries))
742
+ if [[ $total_retries -ge $max_case_retries ]]; then
743
+ print_warning "$case_id tried $max_case_retries times without success — skipping (needs manual review)"
744
+ _flag_for_human "$case_id" "Tried $max_case_retries times without success"
745
+ UAT_CASES_SKIPPED=$((UAT_CASES_SKIPPED + 1))
746
+ update_json "$UAT_PLAN_FILE" \
747
+ --arg id "$case_id" '(.testCases[] | select(.id==$id)) |= . + {passes: true, skipped: true}'
748
+ continue
749
+ fi
750
+
751
+ # Determine current phase
752
+ local current_phase="$UAT_PHASE_RED"
753
+ if [[ "$phase" == "red" ]]; then
754
+ current_phase="$UAT_PHASE_GREEN"
755
+ fi
756
+
757
+ # Display case banner with phase
758
+ local display_title="$case_title"
759
+ [[ ${#display_title} -gt 50 ]] && display_title="${display_title:0:47}..."
760
+
761
+ echo ""
762
+ echo "┌──────────────────────────────────────────────────────────┐"
763
+ printf "│ %-10s %-45s│\n" "$case_id" "$display_title"
764
+ local phase_label="Writing test"
765
+ [[ "$current_phase" == "$UAT_PHASE_GREEN" ]] && phase_label="Fixing app"
766
+ printf "│ %-14s Type: %-6s Attempt: %-3s │\n" "$phase_label" "$case_type" "$((total_retries + 1))"
767
+ echo "└──────────────────────────────────────────────────────────┘"
768
+ echo ""
769
+
770
+ # Git snapshot for rollback
771
+ _git_snapshot "$case_id"
772
+
773
+ local test_file
774
+ test_file=$(jq -r --arg id "$case_id" '.testCases[] | select(.id==$id) | .testFile' "$UAT_PLAN_FILE")
775
+
776
+ if [[ "$current_phase" == "$UAT_PHASE_RED" ]]; then
777
+ _run_red_phase "$case_id" "$case_type" "$test_file" "$no_fix" "$timeout" "$quiet"
778
+ else
779
+ _run_green_phase "$case_id" "$case_type" "$test_file" "$timeout" "$quiet"
780
+ fi
781
+
782
+ # Brief pause between iterations
783
+ sleep 1
784
+ done
785
+
786
+ # Update suite status
787
+ local all_passed
788
+ all_passed=$(jq '[.testCases[] | select(.passes==false)] | length' "$UAT_PLAN_FILE" 2>/dev/null)
789
+ if [[ "$all_passed" -eq 0 ]]; then
790
+ update_json "$UAT_PLAN_FILE" '.testSuite.status = "complete"'
791
+ else
792
+ update_json "$UAT_PLAN_FILE" '.testSuite.status = "partial"'
793
+ fi
794
+
795
+ [[ "$all_passed" -eq 0 ]] && return 0
796
+ return 1
797
+ }
798
+
799
+ # ============================================================================
800
+ # TDD PHASES: RED (test-only) and GREEN (fix-only)
801
+ # ============================================================================
802
+
803
# RED phase of TDD for one test case: a Claude session may ONLY write the
# test file. Enforces the no-app-changes rule, validates test quality, then
# runs the test. Outcomes:
#   - test passes       → app already correct, case marked done
#   - test-bug failure  → retry RED with saved failure context
#   - app-bug failure   → commit the failing test and transition to GREEN
#                         (or mark as a documented bug under --no-fix)
# Args: $1 case id, $2 case type (e2e|integration), $3 test file path,
#       $4 no_fix flag ("true" skips the GREEN fix), $5 session timeout,
#       $6 quiet flag for the activity feed.
_run_red_phase() {
    local case_id="$1"
    local case_type="$2"
    local test_file="$3"
    local no_fix="$4"
    local timeout="$5"
    local quiet="$6"

    local prompt_file output_file
    prompt_file=$(create_temp_file ".uat-red-prompt.md")
    output_file=$(create_temp_file ".uat-red-output.log")

    _build_red_prompt "$case_id" "$prompt_file"

    _log_uat "$case_id" "RED: Starting test-only session"

    # Run the Claude session in a background subshell so we can wait on it
    # and capture its exit status; pipefail inside the subshell makes a
    # claude failure visible through the tee/parse pipeline.
    local claude_exit=0
    (
        set -o pipefail
        cat "$prompt_file" | run_with_timeout "$timeout" claude -p \
            --dangerously-skip-permissions \
            --verbose \
            --output-format stream-json \
            2>&1 | tee "$output_file" | _parse_uat_activity "$quiet"
    ) &
    local pipeline_pid=$!
    wait "$pipeline_pid" || claude_exit=$?

    rm -f "$prompt_file"

    # 124 is the conventional timeout exit status — tolerated, not an error
    if [[ $claude_exit -ne 0 ]] && [[ $claude_exit -ne 124 ]]; then
        print_warning "Test-writing session ended unexpectedly — will retry"
        _log_uat "$case_id" "RED: Session failed (exit $claude_exit)"
        _increment_red_retry "$case_id"
        rm -f "$output_file"
        return
    fi

    # Check if test file was created
    if [[ ! -f "$test_file" ]]; then
        print_warning "$case_id: Test file was not created — will retry"
        _log_uat "$case_id" "RED: Test file not created"
        _increment_red_retry "$case_id"
        rm -f "$output_file"
        return
    fi

    # Enforce RED constraint: no app changes allowed — roll back and record
    # feedback so the retry prompt explains the violation
    if _has_app_changes "$test_file"; then
        print_warning "$case_id: App code was changed during test-writing (not allowed) — undoing changes"
        _log_uat "$case_id" "RED: App changes detected — rollback"
        _rollback_to_snapshot "$case_id"
        _save_red_violation_feedback "$case_id"
        _increment_red_retry "$case_id"
        rm -f "$output_file"
        return
    fi

    UAT_TESTS_WRITTEN=$((UAT_TESTS_WRITTEN + 1))

    # Validate test quality — reject shallow tests
    if ! _validate_test_quality "$test_file" "$case_id"; then
        print_warning "$case_id: Test doesn't check enough — will retry with better guidance"
        _save_shallow_test_feedback "$case_id" "$test_file"
        _increment_red_retry "$case_id"
        rm -f "$output_file"
        return
    fi

    # Run the test
    if _run_test "$test_file" "$case_type"; then
        # PASS in RED — app already correct, no fix needed
        print_success "$case_id: Test passes — app already works correctly"
        _mark_passed "$case_id"
        _commit_result "$case_id" "$test_file"
        UAT_CASES_PASSED=$((UAT_CASES_PASSED + 1))
        UAT_RED_ONLY_PASSED=$((UAT_RED_ONLY_PASSED + 1))
        _log_uat "$case_id" "RED: PASSED (app already correct)"
    else
        # FAIL — classify: test bug or app bug?
        local failure_type
        failure_type=$(_classify_red_failure "$test_file" "$case_id")

        if [[ "$failure_type" == "test_bug" ]]; then
            print_warning "$case_id: Test has errors — will retry"
            _save_failure_context "$case_id" "$output_file"
            _increment_red_retry "$case_id"
        else
            # App bug found — commit the RED test, transition to GREEN
            print_info "$case_id: Found an app bug — now fixing it"
            UAT_BUGS_FOUND=$((UAT_BUGS_FOUND + 1))

            if [[ "$no_fix" == "true" ]]; then
                # --no-fix mode: commit failing test as documented bug
                print_info "$case_id: Saving test as a documented bug (fix skipped with --no-fix)"
                _commit_red_test "$case_id" "$test_file"
                _mark_passed "$case_id"
                UAT_CASES_PASSED=$((UAT_CASES_PASSED + 1))
                _log_uat "$case_id" "RED: Documented bug (--no-fix mode)"
            else
                # Commit the RED test and transition to GREEN
                _commit_red_test "$case_id" "$test_file"
                _mark_phase "$case_id" "red"
                _save_failure_context "$case_id" "$output_file"
                _log_uat "$case_id" "RED: App bug found — transitioning to GREEN"
            fi
        fi
    fi

    rm -f "$output_file"
}
914
+
915
# GREEN phase of TDD: a Claude session may ONLY fix application code to
# make the previously-committed RED test pass. Enforces the no-test-edits
# rule, re-runs the case's test, and checks the rest of the suite for
# regressions before committing; a regression triggers a rollback and a
# human-attention flag.
# Args: $1 case id, $2 case type, $3 test file, $4 timeout, $5 quiet flag.
_run_green_phase() {
    local case_id="$1"
    local case_type="$2"
    local test_file="$3"
    local timeout="$4"
    local quiet="$5"

    UAT_GREEN_ATTEMPTS=$((UAT_GREEN_ATTEMPTS + 1))

    local prompt_file output_file
    prompt_file=$(create_temp_file ".uat-green-prompt.md")
    output_file=$(create_temp_file ".uat-green-output.log")

    _build_green_prompt "$case_id" "$test_file" "$prompt_file"

    _log_uat "$case_id" "GREEN: Starting fix-only session"

    # Same session pattern as the RED phase: background subshell + wait so
    # the pipeline's exit status can be inspected.
    local claude_exit=0
    (
        set -o pipefail
        cat "$prompt_file" | run_with_timeout "$timeout" claude -p \
            --dangerously-skip-permissions \
            --verbose \
            --output-format stream-json \
            2>&1 | tee "$output_file" | _parse_uat_activity "$quiet"
    ) &
    local pipeline_pid=$!
    wait "$pipeline_pid" || claude_exit=$?

    rm -f "$prompt_file"

    # 124 = timeout, tolerated; any other non-zero status is a retry
    if [[ $claude_exit -ne 0 ]] && [[ $claude_exit -ne 124 ]]; then
        print_warning "Fix session ended unexpectedly — will retry"
        _log_uat "$case_id" "GREEN: Session failed (exit $claude_exit)"
        _increment_green_retry "$case_id"
        rm -f "$output_file"
        return
    fi

    # Enforce GREEN constraint: no test file modifications — restore the
    # committed test instead of trusting an edited one
    if _test_file_modified "$test_file"; then
        print_warning "$case_id: Test file was changed during fix (not allowed) — restoring original"
        _restore_test_file "$test_file" "$case_id"
        _log_uat "$case_id" "GREEN: Test file restored after modification"
    fi

    # Run the test
    if _run_test "$test_file" "$case_type"; then
        # PASS — check for regressions before committing
        if _check_regressions; then
            print_success "$case_id: Fixed! Test passes and nothing else broke"
            _mark_passed "$case_id"
            _track_fixed_files "$case_id"
            UAT_BUGS_FIXED=$((UAT_BUGS_FIXED + 1))
            _commit_result "$case_id" "$test_file"
            UAT_CASES_PASSED=$((UAT_CASES_PASSED + 1))
            _log_uat "$case_id" "GREEN: PASSED"
        else
            # Regression detected — rollback
            print_error "$case_id: Fix broke other tests — undoing the change"
            _rollback_to_snapshot "$case_id"
            _flag_for_human "$case_id" "Fix broke other tests"
            _increment_green_retry "$case_id"
            _log_uat "$case_id" "GREEN: ROLLBACK — fix caused regression"
        fi
    else
        # FAIL — retry GREEN with fresh failure context
        print_warning "$case_id: Fix didn't work — test still fails, will retry"
        _save_failure_context "$case_id" "$output_file"
        _increment_green_retry "$case_id"
    fi

    rm -f "$output_file"
}
989
+
990
+ # ============================================================================
991
+ # TEST EXECUTION
992
+ # ============================================================================
993
+
994
# Run a single test file with an auto-detected runner.
# $1 test file, $2 type ("e2e" → Playwright; anything else → integration,
# detected from vitest/jest/pytest config files with a vitest fallback).
# Returns the test command's status. On failure the last 30 lines are
# echoed and the full log is preserved at $UAT_MODE_DIR/last_test_output.log
# for _classify_red_failure and the GREEN prompt.
_run_test() {
    local test_file="$1"
    local test_type="$2"
    local log_file
    log_file=$(create_temp_file ".uat-test.log")

    local test_cmd=""

    if [[ "$test_type" == "e2e" ]]; then
        # Playwright
        if [[ -f "playwright.config.ts" ]] || [[ -f "playwright.config.js" ]]; then
            test_cmd="npx playwright test $test_file"
        else
            # NOTE(review): this branch forces --config=playwright.config.ts even
            # though that file was just found NOT to exist — confirm intent.
            test_cmd="npx playwright test $test_file --config=playwright.config.ts"
        fi
    else
        # Integration — detect test runner
        if [[ -f "vitest.config.ts" ]] || [[ -f "vitest.config.js" ]] || [[ -f "vite.config.ts" ]]; then
            test_cmd="npx vitest run $test_file"
        elif [[ -f "jest.config.ts" ]] || [[ -f "jest.config.js" ]] || grep -q '"jest"' package.json 2>/dev/null; then
            test_cmd="npx jest $test_file"
        elif [[ -f "pytest.ini" ]] || [[ -f "pyproject.toml" ]]; then
            local py_runner
            py_runner=$(detect_python_runner ".")
            # Prefix pytest with the detected runner (if any); the
            # ${py_runner:+ } inserts a separating space only when non-empty
            test_cmd="${py_runner}${py_runner:+ }pytest $test_file -v"
        else
            test_cmd="npx vitest run $test_file"
        fi
    fi

    echo "  Running: $test_cmd"

    if safe_exec "$test_cmd" "$log_file"; then
        rm -f "$log_file"
        return 0
    else
        echo ""
        echo "  Test output (last 30 lines):"
        tail -30 "$log_file" | sed 's/^/  /'
        # Keep the full log for failure classification and prompt context
        cp "$log_file" "$UAT_MODE_DIR/last_test_output.log"
        rm -f "$log_file"
        return 1
    fi
}
1038
+
1039
+ # ============================================================================
1040
+ # TEST QUALITY VALIDATION
1041
+ # ============================================================================
1042
+
1043
# Reject tests that only check structure (page loads) without verifying content.
# A test that asserts "page has URL /dashboard" proves nothing about correctness.
# A test that asserts "page shows 'Welcome, John'" proves the right data rendered.
#
# Returns 0 when the test looks substantive, 1 when it is too shallow.
# Fix: the previous `grep -cE` counted LINES containing a match, so two
# expect() calls on one line counted once; `grep -oE | wc -l` counts every
# occurrence, matching the stated intent of counting assertion CALLS.
_validate_test_quality() {
    local test_file="$1"
    local case_id="$2"

    # Count total assertion calls (occurrences, not matching lines)
    local assertion_count
    assertion_count=$(grep -oE 'expect\(|assert\(|\.should\(' "$test_file" 2>/dev/null | wc -l | tr -d ' ')

    if [[ "$assertion_count" -lt 2 ]]; then
        _log_uat "$case_id" "SHALLOW: only $assertion_count assertion(s)"
        return 1
    fi

    # Count content assertions — these verify the RIGHT data, not just structure
    # Includes: toContain, toHaveText, toBe, toEqual, toMatch, textContent, innerText
    local content_assertions
    content_assertions=$(grep -oE 'toContain\(|toHaveText\(|toBe\(|toEqual\(|toMatch\(|textContent|innerText|toHaveValue\(' "$test_file" 2>/dev/null | wc -l | tr -d ' ')

    if [[ "$content_assertions" -eq 0 ]]; then
        _log_uat "$case_id" "SHALLOW: no content assertions (only structural checks)"
        return 1
    fi

    # Check for input→output test pattern: test fills data and checks the result
    # Look for fill/type followed by expect — proves the test verifies a response to input
    local has_input_output=false
    if grep -qE 'fill\(|type\(|press\(|click\(' "$test_file" 2>/dev/null; then
        if grep -qE 'toContain\(|toHaveText\(|toBe\(|toEqual\(|toMatch\(' "$test_file" 2>/dev/null; then
            has_input_output=true
        fi
    fi

    # For e2e tests, require at least one input→output pattern
    if [[ "$has_input_output" == "false" ]]; then
        # Check if it's an API/integration test (no browser interaction expected)
        if grep -qE 'page\.|browser\.|playwright' "$test_file" 2>/dev/null; then
            _log_uat "$case_id" "SHALLOW: e2e test has no input→output assertions"
            return 1
        fi
    fi

    _log_uat "$case_id" "Quality OK: $assertion_count assertions ($content_assertions content)"
    return 0
}
1090
+
1091
# Save feedback about shallow tests so Claude gets specific guidance on retry.
# $1 case id, $2 the rejected test file. Appends a diagnosis (assertion
# counts plus good/bad assertion examples) to $UAT_FAILURE_FILE, which
# _build_red_prompt injects into the next RED attempt.
_save_shallow_test_feedback() {
    local case_id="$1"
    local test_file="$2"

    # Same grep patterns as _validate_test_quality uses for its verdict
    local assertion_count content_assertions
    assertion_count=$(grep -cE 'expect\(|assert\(|\.should\(' "$test_file" 2>/dev/null || true)
    content_assertions=$(grep -cE 'toContain\(|toHaveText\(|toBe\(|toEqual\(|toMatch\(|textContent|innerText|toHaveValue\(' "$test_file" 2>/dev/null || true)

    {
        echo ""
        echo "=== Test quality check failed for $case_id ==="
        echo ""
        echo "Your test is too shallow. It checks structure but not correctness."
        echo ""
        echo "Stats: $assertion_count total assertions, $content_assertions content assertions"
        echo ""
        echo "What's wrong:"
        if [[ "$assertion_count" -lt 2 ]]; then
            echo "  - Only $assertion_count assertion(s). Every test needs at least 2."
        fi
        if [[ "$content_assertions" -eq 0 ]]; then
            echo "  - ZERO content assertions. You're only checking that pages load,"
            echo "    not that they show the RIGHT content."
            echo ""
            echo "    Bad:  await expect(page).toHaveURL('/dashboard');"
            echo "    Good: await expect(page.getByText('Welcome, John')).toBeVisible();"
            echo ""
            echo "    Bad:  await expect(form).toBeVisible();"
            echo "    Good: await expect(page.getByText('Email is required')).toBeVisible();"
        fi
        echo ""
        echo "Fix: Read the assertions in .ralph/$UAT_CONFIG_NS/plan.json for this test case."
        echo "Each assertion has an 'input' and 'expected' — encode THOSE as expect() calls."
        echo "---"
    } >> "$UAT_FAILURE_FILE"
}
1128
+
1129
+ # ============================================================================
1130
+ # FAILURE HANDLING
1131
+ # ============================================================================
1132
+
1133
# Append the latest failure details for a case to the shared failure file,
# which is injected into the next RED/GREEN prompt as context.
# $1 case id, $2 session output file — NOTE(review): $2 is accepted but
# never read here; the test log captured by _run_test is what gets quoted.
_save_failure_context() {
    local case_id="$1"
    local output_file="$2"

    local retry_count
    retry_count=$(jq -r --arg id "$case_id" '.testCases[] | select(.id==$id) | .retryCount // 0' "$UAT_PLAN_FILE")

    {
        echo ""
        echo "=== Attempt $((retry_count + 1)) failed for $case_id ==="
        echo ""
        if [[ -f "$UAT_MODE_DIR/last_test_output.log" ]]; then
            echo "--- Test Output ---"
            tail -50 "$UAT_MODE_DIR/last_test_output.log"
            echo ""
        fi
        echo "---"
    } >> "$UAT_FAILURE_FILE"

    # Cap at 200 lines so the prompt context stays bounded
    if [[ -f "$UAT_FAILURE_FILE" ]]; then
        local line_count
        line_count=$(wc -l < "$UAT_FAILURE_FILE" | tr -d ' ')
        if [[ $line_count -gt 200 ]]; then
            tail -200 "$UAT_FAILURE_FILE" > "$UAT_FAILURE_FILE.tmp" && mv "$UAT_FAILURE_FILE.tmp" "$UAT_FAILURE_FILE"
        fi
    fi
}
1161
+
1162
# Bump the RED-phase retry counter for one test case and refresh the
# combined retryCount (red + green attempts) in the plan file.
_increment_red_retry() {
    local id="$1"
    local jq_prog='(.testCases[] | select(.id==$id)) |= . + {
        redRetries: ((.redRetries // 0) + 1),
        retryCount: ((.redRetries // 0) + 1 + (.greenRetries // 0))
    }'
    update_json "$UAT_PLAN_FILE" --arg id "$id" "$jq_prog"
}
1171
+
1172
# Bump the GREEN-phase retry counter for one test case and refresh the
# combined retryCount (red + green attempts) in the plan file.
_increment_green_retry() {
    local id="$1"
    local jq_prog='(.testCases[] | select(.id==$id)) |= . + {
        greenRetries: ((.greenRetries // 0) + 1),
        retryCount: ((.redRetries // 0) + (.greenRetries // 0) + 1)
    }'
    update_json "$UAT_PLAN_FILE" --arg id "$id" "$jq_prog"
}
1181
+
1182
# Record which TDD phase a case is in. $2 is "red" while a case awaits a
# GREEN fix, or the literal string "null" to clear the marker (stored as
# JSON null, not the string "null").
_mark_phase() {
    local id="$1" phase="$2"
    case "$phase" in
        null)
            update_json "$UAT_PLAN_FILE" \
                --arg id "$id" \
                '(.testCases[] | select(.id==$id)) |= . + {phase: null}'
            ;;
        *)
            update_json "$UAT_PLAN_FILE" \
                --arg id "$id" \
                --arg phase "$phase" \
                '(.testCases[] | select(.id==$id)) |= . + {phase: $phase}'
            ;;
    esac
}
1196
+
1197
# Mark a test case as passing and reset all of its retry/phase bookkeeping.
_mark_passed() {
    local case_id="$1"
    update_json "$UAT_PLAN_FILE" \
        --arg id "$case_id" \
        '(.testCases[] | select(.id==$id)) |= . + {passes: true, retryCount: 0, phase: null, redRetries: 0, greenRetries: 0}'
    # Clear failure context for this case
    # NOTE(review): the failure file is shared across cases, so this removes
    # accumulated context for ALL cases — fine while cases run sequentially,
    # but confirm before parallelising the loop.
    rm -f "$UAT_FAILURE_FILE"
}
1205
+
1206
# Commit ONLY the failing RED test so the bug is documented in history
# before the GREEN phase starts changing app code. No-op without git.
# Retries around auto-fix pre-commit hooks and, as a last resort, commits
# with --no-verify so the RED test is never lost.
_commit_red_test() {
    local case_id="$1"
    local test_file="$2"

    if ! command -v git &>/dev/null || [[ ! -d ".git" ]]; then
        return 0
    fi

    git add "$test_file" 2>/dev/null || true

    # Nothing staged → nothing to commit
    if git diff --cached --quiet 2>/dev/null; then
        return 0
    fi

    local commit_log
    commit_log=$(mktemp)
    local success=false

    # Auto-fix hooks (formatters) modify the file and fail the commit;
    # re-stage and retry up to 3 times
    for attempt in 1 2 3; do
        if git commit -m "test($case_id): TDD red -- failing test identifies bug" > "$commit_log" 2>&1; then
            success=true
            break
        fi
        if grep -q "files were modified by this hook" "$commit_log" 2>/dev/null; then
            git add "$test_file"
            continue
        fi
        break
    done

    # Last resort: bypass hooks entirely
    if [[ "$success" != "true" ]]; then
        git add "$test_file"
        git commit -m "test($case_id): TDD red -- failing test identifies bug" --no-verify > "$commit_log" 2>&1 || true
    fi

    rm -f "$commit_log"
}
1243
+
1244
# Decide whether a failing RED test indicates a broken test ("test_bug")
# or a genuine application defect ("app_bug"). Echoes the verdict on stdout.
# $1 test file (currently unused in the decision), $2 case id for logging.
_classify_red_failure() {
    local test_file="$1"
    local case_id="$2"
    local out="$UAT_MODE_DIR/last_test_output.log"

    # Syntax errors, missing modules, and bad imports mean the test itself
    # is broken; everything else (assertion failures, element timeouts) is
    # treated as the app being wrong while the test is right.
    if [[ -f "$out" ]] && grep -qiE 'SyntaxError|Cannot find module|ModuleNotFoundError|ImportError|TypeError:.*is not a function|ReferenceError:.*is not defined|unexpected token' "$out" 2>/dev/null; then
        _log_uat "$case_id" "RED classify: test_bug (syntax/import error)"
        echo "test_bug"
        return
    fi

    _log_uat "$case_id" "RED classify: app_bug (assertion failure)"
    echo "app_bug"
}
1263
+
1264
# True (exit 0) when the given test file has uncommitted changes relative
# to HEAD. Without git (or outside a repo) there is nothing to compare,
# so the file is reported as unmodified.
_test_file_modified() {
    local tf="$1"
    command -v git &>/dev/null || return 1
    [[ -d ".git" ]] || return 1
    ! git diff --quiet HEAD -- "$tf" 2>/dev/null
}
1273
+
1274
# Revert a test file to its committed (HEAD) version after an illegal
# GREEN-phase edit. $2 is the case id used for logging (default "GREEN").
_restore_test_file() {
    local tf="$1"
    local cid="${2:-GREEN}"
    if command -v git &>/dev/null && [[ -d ".git" ]]; then
        git checkout HEAD -- "$tf" 2>/dev/null || true
        _log_uat "$cid" "GREEN: Restored test file: $tf"
    fi
}
1282
+
1283
# Append RED-constraint-violation guidance to the failure file so the next
# RED attempt for this case is explicitly told not to touch app code.
_save_red_violation_feedback() {
    local case_id="$1"
    {
        echo ""
        echo "=== RED PHASE VIOLATION for $case_id ==="
        echo ""
        echo "You modified application source files during the RED phase."
        echo "In the RED phase, you must ONLY write the test file."
        echo ""
        echo "DO NOT modify any files in src/, api/, app/, lib/, or similar directories."
        echo "Write ONLY the test file specified in plan.json."
        echo ""
        echo "If the app has a bug, let the test FAIL. A separate GREEN session will fix the app."
        echo "---"
    } >> "$UAT_FAILURE_FILE"
}
1299
+
1300
# Queue a case for human review (shown in the final report) and record the
# reason in the UAT log. $1 case id, $2 human-readable reason.
_flag_for_human() {
    local cid="$1"
    local why="$2"
    UAT_NEEDS_HUMAN+=("$cid: $why")
    _log_uat "$cid" "NEEDS_HUMAN: $why"
}
1306
+
1307
+ # ============================================================================
1308
+ # GIT OPERATIONS
1309
+ # ============================================================================
1310
+
1311
# Tag the current commit as the rollback point for a test case. Pending
# work is committed first because tags point at commits, not the working
# tree. No-op without git.
_git_snapshot() {
    local case_id="$1"
    if command -v git &>/dev/null && [[ -d ".git" ]]; then
        # Commit any pending changes so the tag captures a clean state
        # (tags point at commits, not the working tree)
        if ! git diff --quiet HEAD 2>/dev/null || ! git diff --cached --quiet 2>/dev/null; then
            git add -A 2>/dev/null || true
            git commit -m "$UAT_CONFIG_NS: snapshot before $case_id" --no-verify 2>/dev/null || true
        fi
        # -f moves the tag forward when a case is re-run
        git tag -f "uat-snapshot-${case_id}" 2>/dev/null || true
    fi
}
1323
+
1324
# Hard-reset the working tree to the pre-case snapshot tag, if one exists.
# Silently does nothing without git, outside a repo, or when no snapshot
# tag was ever created for this case.
_rollback_to_snapshot() {
    local cid="$1"
    command -v git &>/dev/null || return 0
    [[ -d ".git" ]] || return 0

    local tag="uat-snapshot-${cid}"
    git rev-parse "$tag" >/dev/null 2>&1 || return 0

    # Reset to the snapshot commit — undoes both staged and committed changes since
    git reset --hard "$tag" 2>/dev/null || true
    print_info "Reverted changes for $cid"
}
1335
+
1336
# True (exit 0) when files OTHER than the given test file (and .ralph/
# metadata) differ from HEAD — i.e. the session touched application code.
# Without git there is no way to tell, so report "no changes".
_has_app_changes() {
    local tf="$1"
    command -v git &>/dev/null || return 1
    [[ -d ".git" ]] || return 1

    local others
    others=$(git diff --name-only HEAD 2>/dev/null | grep -Fxv "$tf" | grep -v '\.ralph/' || true)
    [[ -n "$others" ]]
}
1347
+
1348
# Run the project's own test suite to verify a GREEN-phase fix didn't break
# anything else. Uses the configured test command, else auto-detects from
# the project manifest. Returns 0 on pass (or when no test command can be
# determined), 1 on regression.
_check_regressions() {
    echo "  Making sure other tests still pass..."

    # Run existing unit tests
    local test_cmd
    test_cmd=$(get_config '.checks.testCommand' "")

    if [[ -z "$test_cmd" ]]; then
        # Auto-detect from the project manifest
        if [[ -f "package.json" ]] && grep -q '"test"' package.json; then
            test_cmd="npm test"
        elif [[ -f "pytest.ini" ]] || [[ -f "pyproject.toml" ]]; then
            local py_runner
            py_runner=$(detect_python_runner ".")
            test_cmd="${py_runner}${py_runner:+ }pytest"
        elif [[ -f "Cargo.toml" ]]; then
            test_cmd="cargo test"
        elif [[ -f "go.mod" ]]; then
            test_cmd="go test ./..."
        else
            # No test command — can't check regressions, assume ok
            return 0
        fi
    fi

    local log_file
    log_file=$(create_temp_file ".uat-regression.log")

    if safe_exec "$test_cmd" "$log_file"; then
        print_success "  All other tests still pass"
        rm -f "$log_file"
        return 0
    else
        print_error "  Some other tests broke!"
        echo "  Output (last 20 lines):"
        tail -20 "$log_file" | sed 's/^/  /'
        rm -f "$log_file"
        return 1
    fi
}
1388
+
1389
# Commit the finished test (and any accompanying app fix) for a passing
# case. Retries around auto-fix pre-commit hooks, finally bypassing hooks
# with --no-verify, and drops the case's snapshot tag since rollback is no
# longer needed. No-op without git.
_commit_result() {
    local case_id="$1"
    local test_file="$2"

    if ! command -v git &>/dev/null || [[ ! -d ".git" ]]; then
        return 0
    fi

    # Stage the test file and any app fixes
    git add "$test_file" 2>/dev/null || true
    git add -A 2>/dev/null || true

    # Check if there's anything to commit
    if git diff --cached --quiet 2>/dev/null; then
        return 0
    fi

    # Message reflects whether app code changed alongside the test
    local commit_msg
    if _has_app_changes "$test_file"; then
        commit_msg="test+fix($case_id): TDD green -- test + app fix"
    else
        commit_msg="test($case_id): $UAT_CONFIG_NS test"
    fi

    # Try commit with retries for auto-fix hooks
    local commit_log
    commit_log=$(mktemp)
    local success=false

    for attempt in 1 2 3; do
        if git commit -m "$commit_msg" > "$commit_log" 2>&1; then
            success=true
            break
        fi
        # Formatter hooks modify files and fail the commit; re-stage and retry
        if grep -q "files were modified by this hook" "$commit_log" 2>/dev/null; then
            git add -A
            continue
        fi
        break
    done

    if [[ "$success" != "true" ]]; then
        # Try with --no-verify as last resort
        git add -A
        git commit -m "$commit_msg" --no-verify > "$commit_log" 2>&1 || true
    fi

    rm -f "$commit_log"

    # Clean up snapshot tag
    git tag -d "uat-snapshot-${case_id}" 2>/dev/null || true
}
1441
+
1442
# Record which application files the GREEN fix touched (for the final
# report). Diffs against HEAD~1 and excludes test files and .ralph/ metadata.
# Fix: the previous filter `grep -v 'test'` dropped ANY path containing the
# substring "test" (e.g. src/latest.ts), so real app fixes went unreported;
# match test files by path convention instead.
_track_fixed_files() {
    local case_id="$1"
    if command -v git &>/dev/null && [[ -d ".git" ]]; then
        local fixed
        fixed=$(git diff --name-only HEAD~1 2>/dev/null \
            | grep -vE '(^|/)(tests?|__tests__)(/|$)|\.(test|spec)\.' \
            | grep -v '\.ralph/' || true)
        while IFS= read -r f; do
            [[ -n "$f" ]] && UAT_FILES_FIXED+=("$f ($case_id)")
        done <<< "$fixed"
    fi
}
1452
+
1453
+ # ============================================================================
1454
+ # PROMPT BUILDING
1455
+ # ============================================================================
1456
+
1457
# Build the RED-phase prompt file: project UAT preamble (or the universal
# template), RED-phase rules and assertion guidance, prior failure context
# when retrying, a config pointer, and active signs.
# $1 case id, $2 destination prompt file.
_build_red_prompt() {
    local case_id="$1"
    local prompt_file="$2"

    # Prefer project-specific UAT prompt (generated during discovery),
    # fall back to the universal template
    local uat_prompt="$RALPH_TEMPLATES/UAT-PROMPT.md"
    if [[ -f "$UAT_MODE_DIR/UAT-PROMPT.md" ]]; then
        uat_prompt="$UAT_MODE_DIR/UAT-PROMPT.md"
    fi
    cat "$uat_prompt" > "$prompt_file"

    # Unquoted heredoc delimiter: $UAT_CONFIG_NS and $case_id expand here,
    # while backticks and \$ remain escaped to stay literal in the Markdown
    cat >> "$prompt_file" << PROMPT_SECTION

---

## Phase: RED — Write Test Only

You are in the **RED phase** of TDD. Your ONLY job is to write the test.

**CRITICAL: DO NOT modify any application source files. Test files ONLY.**

Your tasks:

1. **Read the test case** from \`.ralph/$UAT_CONFIG_NS/plan.json\` (case ID: $case_id)
2. **Explore the feature** using Playwright MCP — navigate to the relevant pages, interact with the UI
3. **Write the test file** at the path specified in the test case
4. **Encode every assertion** from the test case as an actual expect() call
5. **Include edge cases** listed in the test case

### Rules

- DO NOT modify any application source files (src/, api/, app/, etc.)
- Write the test to verify CORRECT behavior based on the plan's assertions
- If the app has a bug, the test WILL fail — that is the expected and correct outcome
- Ralph will detect and reject any app code changes in this phase

### Assertions are mandatory

The test case in plan.json has an \`assertions\` array. Each assertion has:
- \`input\`: what to do (fill form, click button, navigate to URL)
- \`expected\`: what should happen (text appears, redirect occurs, error shown)
- \`strategy\`: how to verify (keyword, structural, navigation, security, llm-judge)

**Every assertion MUST become an expect() call in your test.** This is how we verify
correctness, not just that the page loads. Ralph will reject tests that only check
structure without verifying content.

Example — assertion in plan.json:
\`\`\`json
{"input": "Fill name='John', submit", "expected": "Shows 'Welcome, John'", "strategy": "keyword"}
\`\`\`

Becomes in the test:
\`\`\`typescript
await page.getByLabel('Name').fill('John');
await page.getByRole('button', { name: 'Submit' }).click();
await expect(page.getByText('Welcome, John')).toBeVisible();
\`\`\`
PROMPT_SECTION

    # Inject failure context if retrying
    if [[ -f "$UAT_FAILURE_FILE" ]]; then
        echo "" >> "$prompt_file"
        echo "### Previous RED Attempt Failed" >> "$prompt_file"
        echo "" >> "$prompt_file"
        echo "Your previous test attempt had issues. Fix them:" >> "$prompt_file"
        echo '```' >> "$prompt_file"
        tail -50 "$UAT_FAILURE_FILE" >> "$prompt_file"
        echo '```' >> "$prompt_file"
    fi

    # Inject config context
    echo "" >> "$prompt_file"
    echo "### Config" >> "$prompt_file"
    echo "" >> "$prompt_file"
    echo "Read \`.ralph/config.json\` for URLs and directories." >> "$prompt_file"

    # Inject signs
    _inject_signs >> "$prompt_file"
}
1538
+
1539
# Build the GREEN-phase prompt file: fix-the-app instructions for one case
# plus the accumulated failure context and last test output that guide the
# fix. $1 case id, $2 committed test file (must not be edited), $3 destination.
_build_green_prompt() {
    local case_id="$1"
    local test_file="$2"
    local prompt_file="$3"

    # GREEN prompt is focused — no UAT-PROMPT.md preamble needed.
    # Unquoted heredoc delimiter so $test_file/$case_id/$UAT_CONFIG_NS
    # expand while escaped backticks stay literal Markdown.
    cat > "$prompt_file" << PROMPT_SECTION
# GREEN Phase — Fix Application Code

A test has been written that correctly identifies a bug. Your job is to fix the
APPLICATION CODE so the test passes.

**CRITICAL: DO NOT modify the test file (\`$test_file\`). Fix the app, not the test.**

## Case: $case_id

1. **Read the test file** at \`$test_file\` to understand what it checks
2. **Read the test case** from \`.ralph/$UAT_CONFIG_NS/plan.json\` (case ID: $case_id) for context
3. **Read the failure output** below to understand what went wrong
4. **Fix the APPLICATION CODE** — make the minimum change needed to pass the test
5. **DO NOT modify the test file** — Ralph will restore it if you do

### Rules

- Make the MINIMUM change needed to fix the bug
- Do NOT modify the test file — it has been validated and committed
- Do NOT add workarounds or hacks — fix the actual bug
- Read .ralph/config.json for project URLs and directories
PROMPT_SECTION

    # Inject failure context (critical for GREEN — this is what guides the fix)
    if [[ -f "$UAT_FAILURE_FILE" ]]; then
        echo "" >> "$prompt_file"
        echo "## Failure Output" >> "$prompt_file"
        echo "" >> "$prompt_file"
        echo '```' >> "$prompt_file"
        tail -80 "$UAT_FAILURE_FILE" >> "$prompt_file"
        echo '```' >> "$prompt_file"
    fi

    # Also include last test output if available
    if [[ -f "$UAT_MODE_DIR/last_test_output.log" ]]; then
        echo "" >> "$prompt_file"
        echo "## Last Test Output" >> "$prompt_file"
        echo "" >> "$prompt_file"
        echo '```' >> "$prompt_file"
        tail -80 "$UAT_MODE_DIR/last_test_output.log" >> "$prompt_file"
        echo '```' >> "$prompt_file"
    fi

    # Inject signs
    _inject_signs >> "$prompt_file"
}
1592
+
1593
+ # ============================================================================
1594
+ # ACTIVITY FEED (reuses pattern from loop.sh)
1595
+ # ============================================================================
1596
+
1597
# Render Claude's stream-json output as a compact human-readable activity
# feed. $1 quiet flag — "true" suppresses the feed but still passes through
# non-JSON lines. Reads stdin line by line: tool_use entries become
# "Reading/Editing/Creating/Running/Browser" rows; the final result message
# becomes a cost/duration summary.
_parse_uat_activity() {
    local quiet="${1:-false}"
    local dim=$'\033[2m' green=$'\033[0;32m' nc=$'\033[0m'
    local line
    while IFS= read -r line; do
        # Non-JSON lines — always pass through
        if [[ "$line" != "{"* ]]; then
            echo "$line"
            continue
        fi

        [[ "$quiet" == "true" ]] && continue

        # Cheap substring pre-filter before paying for a jq invocation
        if [[ "$line" != *'"assistant"'* && "$line" != *'"result"'* ]]; then
            continue
        fi

        local msg_type
        msg_type=$(jq -r '.type // empty' <<< "$line" 2>/dev/null) || continue

        if [[ "$msg_type" == "assistant" ]]; then
            # One "<tool-name>\t<input-json>" row per tool_use content entry
            local tool_entries
            tool_entries=$(jq -r '
                .message.content[]?
                | select(.type == "tool_use")
                | .name + "\t" + (.input | tostring)
            ' <<< "$line" 2>/dev/null) || continue

            while IFS=$'\t' read -r tool_name tool_input; do
                [[ -z "$tool_name" ]] && continue
                local label="" detail=""
                case "$tool_name" in
                    Read)
                        label="Reading"
                        detail=$(jq -r '.file_path // empty' <<< "$tool_input" 2>/dev/null)
                        # Show paths relative to the working directory
                        detail="${detail#"$PWD/"}"
                        ;;
                    Edit)
                        label="Editing"
                        detail=$(jq -r '.file_path // empty' <<< "$tool_input" 2>/dev/null)
                        detail="${detail#"$PWD/"}"
                        ;;
                    Write)
                        label="Creating"
                        detail=$(jq -r '.file_path // empty' <<< "$tool_input" 2>/dev/null)
                        detail="${detail#"$PWD/"}"
                        ;;
                    Bash)
                        label="Running"
                        detail=$(jq -r '.description // .command // empty' <<< "$tool_input" 2>/dev/null)
                        # Truncate so the feed stays one row per tool call
                        detail="${detail:0:60}"
                        ;;
                    mcp__playwright__*)
                        label="Browser"
                        local action="${tool_name#mcp__playwright__browser_}"
                        detail="$action"
                        ;;
                    *)
                        label="$tool_name"
                        ;;
                esac
                printf "  ${dim}⟳${nc} %-10s %s\n" "$label" "$detail"
            done <<< "$tool_entries"

        elif [[ "$msg_type" == "result" ]]; then
            # Final summary: session cost (USD) and wall-clock duration
            local cost duration_ms
            cost=$(jq -r '.total_cost_usd // empty' <<< "$line" 2>/dev/null)
            duration_ms=$(jq -r '.duration_ms // empty' <<< "$line" 2>/dev/null)
            local cost_str="" dur_str=""
            [[ -n "$cost" ]] && cost_str=$(printf '$%.2f' "$cost")
            if [[ -n "$duration_ms" ]]; then
                local total_secs=$(( duration_ms / 1000 ))
                if [[ $total_secs -ge 60 ]]; then
                    dur_str="$((total_secs / 60))m $((total_secs % 60))s"
                else
                    dur_str="${total_secs}s"
                fi
            fi
            echo ""
            if [[ -n "$cost_str" && -n "$dur_str" ]]; then
                echo -e "  ${green}✓ Done${nc} ${dim}(${cost_str}, ${dur_str})${nc}"
            elif [[ -n "$cost_str" ]]; then
                echo -e "  ${green}✓ Done${nc} ${dim}(${cost_str})${nc}"
            fi
        fi
    done
}
1684
+
1685
+ # ============================================================================
1686
+ # PHASE 3: REPORT
1687
+ # ============================================================================
1688
+
1689
_print_report() {
    # Print the end-of-run summary box for a UAT/chaos run, then fire a
    # desktop notification via send_notification (defined elsewhere).
    #
    # Reads:  $UAT_PLAN_FILE       — plan.json with the testCases array
    #         $UAT_MODE_LABEL      — "UAT" / "Chaos Agent" etc.
    #         counters UAT_BUGS_FOUND, UAT_BUGS_FIXED, UAT_RED_ONLY_PASSED,
    #         UAT_GREEN_ATTEMPTS, UAT_TESTS_WRITTEN
    #         arrays  UAT_FILES_FIXED, UAT_NEEDS_HUMAN
    # Output: formatted report box on stdout.
    local total_cases passed_cases failed_cases skipped_cases
    total_cases=$(jq '.testCases | length' "$UAT_PLAN_FILE" 2>/dev/null || echo "0")
    passed_cases=$(jq '[.testCases[] | select(.passes==true and .skipped!=true)] | length' "$UAT_PLAN_FILE" 2>/dev/null || echo "0")
    # BUG FIX: exclude skipped cases from the failed count — otherwise a
    # case with passes==false AND skipped==true is counted twice (as both
    # failed and skipped) and the columns no longer sum to the total.
    failed_cases=$(jq '[.testCases[] | select(.passes==false and .skipped!=true)] | length' "$UAT_PLAN_FILE" 2>/dev/null || echo "0")
    skipped_cases=$(jq '[.testCases[] | select(.skipped==true)] | length' "$UAT_PLAN_FILE" 2>/dev/null || echo "0")

    echo ""
    echo "╔══════════════════════════════════════════════════════════╗"
    printf "║ %-14s Results ║\n" "$UAT_MODE_LABEL"
    echo "╠══════════════════════════════════════════════════════════╣"
    printf "║ Test cases: %-3s total, %-3s passed, %-3s failed, %-3s skipped ║\n" \
        "$total_cases" "$passed_cases" "$failed_cases" "$skipped_cases"
    printf "║ App bugs found: %-3s Fixed: %-3s ║\n" \
        "$UAT_BUGS_FOUND" "$UAT_BUGS_FIXED"
    printf "║ Already working: %-3s Needed fixing: %-3s ║\n" \
        "$UAT_RED_ONLY_PASSED" "$UAT_GREEN_ATTEMPTS"
    echo "║ ║"

    # List test files written this run. The `jq | while` pipelines run the
    # loop body in a subshell, which is fine here — they only print.
    if [[ $UAT_TESTS_WRITTEN -gt 0 ]]; then
        echo "║ New test files: ║"
        jq -r '.testCases[] | select(.passes==true and .skipped!=true) | " " + .testFile + " ✅"' "$UAT_PLAN_FILE" 2>/dev/null | while IFS= read -r line; do
            printf "║ %-56s║\n" "$line"
        done
        # Same skipped-exclusion fix as the failed_cases count above, so
        # skipped cases are never shown with a ❌ mark.
        jq -r '.testCases[] | select(.passes==false and .skipped!=true) | " " + .testFile + " ❌"' "$UAT_PLAN_FILE" 2>/dev/null | while IFS= read -r line; do
            printf "║ %-56s║\n" "$line"
        done
    fi

    # List app files fixed during the run, truncating long paths to fit.
    if [[ ${#UAT_FILES_FIXED[@]} -gt 0 ]]; then
        echo "║ ║"
        echo "║ App files fixed: ║"
        for f in "${UAT_FILES_FIXED[@]}"; do
            local display="$f"
            [[ ${#display} -gt 54 ]] && display="${display:0:51}..."
            printf "║ %-54s║\n" "$display"
        done
    fi

    # List items flagged for a human decision, truncated the same way.
    if [[ ${#UAT_NEEDS_HUMAN[@]} -gt 0 ]]; then
        echo "║ ║"
        echo "║ Needs your attention: ║"
        for item in "${UAT_NEEDS_HUMAN[@]}"; do
            local display="$item"
            [[ ${#display} -gt 54 ]] && display="${display:0:51}..."
            printf "║ %-54s║\n" "$display"
        done
    fi

    echo "╚══════════════════════════════════════════════════════════╝"
    echo ""

    # Send notification
    send_notification "$UAT_MODE_LABEL: $passed_cases/$total_cases passed, $UAT_BUGS_FIXED bugs fixed"
}
1747
+
1748
+ # ============================================================================
1749
+ # BANNER
1750
+ # ============================================================================
1751
+
1752
_print_uat_banner() {
    # "UAT Loop" ASCII-art banner, shown when a UAT run starts.
    # Quoted heredoc delimiter keeps every backslash literal, so no
    # echo-style double escaping is needed.
    cat <<'BANNER'

  _   _   _  _____   _
 | | | | / \|_ _|  | |    ___   ___  _ __
 | | | |/ _ \ | |  | |   / _ \ / _ \| '_ \
 | |_| / ___ \| |  | |__| (_) | (_) | |_) |
  \___/_/ \_\_|    |_____\___/ \___/| .__/
                                    |_|

BANNER
}
1762
+
1763
_print_chaos_banner() {
    # "ChaosAgent" ASCII-art banner for red-team mode.
    # Quoted heredoc delimiter: backslashes and backticks are literal,
    # avoiding the \\ and \` escapes the echo form required.
    cat <<'BANNER'

   ____ _                      _                    _
  / ___| |__   __ _  ___  ___ / \   __ _  ___ _ __ | |_
 | |   | '_ \ / _` |/ _ \/ __|| _ \ / _` |/ _ \ '_ \| __|
 | |___| | | | (_| | (_) \__ \/ ___ \ (_| |  __/ | | | |_
  \____|_| |_|\__,_|\___/|___/_/   \_\__, |\___|_| |_|\__|
                                     |___/
  Red team mode — trying to break things

BANNER
}
1774
+
1775
+ # ============================================================================
1776
+ # CHAOS AGENT PROMPT
1777
+ # ============================================================================
1778
+
1779
_build_chaos_agent_prompt() {
    # Assemble the full chaos-agent (red-team) prompt into the file named
    # by $1. Starts from the generic UAT prompt template, appends the
    # literal chaos-agent phase instructions (quoted heredoc — no shell
    # expansion inside), then appends PRD/config/signs context via
    # _inject_prompt_context.
    #
    # Arguments: $1 - path of the prompt file to (over)write
    # Globals:   RALPH_TEMPLATES (read) — template directory
    local prompt_file="$1"

    # Start with UAT prompt template
    cat "$RALPH_TEMPLATES/UAT-PROMPT.md" > "$prompt_file"

    # Quoted delimiter ('PROMPT_SECTION'): content below is appended
    # verbatim — $, backticks and backslashes are NOT expanded.
    cat >> "$prompt_file" << 'PROMPT_SECTION'

---

## Phase: Chaos Agent Red Team Discovery

You are the **team lead** of a red team. Your job is to coordinate a team of adversarial
agents that attack a live app, share intel, and produce a battle-tested plan of
vulnerabilities to fix.

**Mindset: "You are a red team. Coordinate to find every vulnerability."**

### Step 1: Recon (~60 seconds)

Before spawning anyone, do a quick recon yourself:

1. **Read `.ralph/config.json`** for URLs, auth config, and directories
2. **Read `.ralph/prd.json`** if it exists — completed stories tell you what was built
3. **Navigate the app** using Playwright MCP — click through nav, find pages, note the tech stack
4. **Take 2-3 screenshots** of key pages (save to `.ralph/chaos/screenshots/`)
5. **Map the attack surface** — what feature areas exist? (auth, forms, API, navigation, etc.)

Don't go deep. Just map what's there. ~60 seconds max.

### Step 2: Assemble the Red Team

Create a team and spawn teammates:

```
TeamCreate: "chaos-agent"
```

Spawn these teammates using the Task tool with `team_name: "chaos-agent"`:

1. **"recon"** (`subagent_type: "general-purpose"`) — Attack surface mapping. Catalogs every
   input, form, API endpoint, auth mechanism. Shares intel with team: "login uses JWT in
   localStorage", "admin panel at /admin has no auth check".

2. **"chaos"** (`subagent_type: "general-purpose"`) — Chaos testing. For every input: empty
   strings, 10000-char payloads, special characters (`<>&"'/\`), unicode/emoji, null bytes.
   For every form: double-submit, missing fields, back button after submit. Rapid-fire
   interactions.

3. **"security"** (`subagent_type: "general-purpose"`) — Security testing. XSS in every
   input (`<script>alert(1)</script>`), SQL injection (`'; DROP TABLE users; --`), auth bypass
   via direct URL, IDOR via ID manipulation, sensitive data in localStorage/console/page source,
   missing CSRF tokens.

**Only spawn agents for areas that exist.** If there are no forms, don't spawn a forms specialist.
If there's no auth, skip auth testing.

Agents communicate via SendMessage — recon shares discoveries, security acts on them.

### Agent Instructions Template

Every agent prompt MUST include:

1. **Their role and focus area** (from above)
2. **The recon intel** — pages, URLs, tech stack you discovered in Step 1
3. **Browser tab isolation** — "Open your own browser tab via `browser_tabs(action: 'new')`
   before navigating. Do NOT use the existing tab."
4. **Communication** — "Share important discoveries with teammates via SendMessage.
   Examples: 'Auth uses JWT in localStorage', 'Found unprotected admin route at /admin',
   'Form at /profile has no CSRF token'. Read messages from teammates and adapt your testing."
5. **Output format** — "When done, send your findings to the team lead via SendMessage.
   Format each finding as a test case with: title, category, testFile path, targetFiles,
   assertions (input/expected/strategy), and edgeCases."

### Step 3: Coordinate

While your team works:

- **Monitor messages** from teammates as they report findings
- **Redirect effort** if needed — if recon discovers something important, message the
  relevant specialist ("recon found an admin panel at /admin — security, check it for auth bypass")
- **Create tasks** in the shared task list for any new areas discovered

### Step 4: Collect + Merge + Write Plan

After all teammates finish:

1. Collect findings from all agent messages
2. Dedup by test file path (keep the case with more assertions)
3. Assign sequential IDs: `UAT-001`, `UAT-002`, ...
4. Write `.ralph/chaos/plan.json` (schema below)
5. Write `.ralph/chaos/UAT-PROMPT.md` (schema below)
6. Shut down all teammates via SendMessage with `type: "shutdown_request"`
7. Clean up with TeamDelete

### plan.json Schema

Write `.ralph/chaos/plan.json`:

```json
{
  "testSuite": {
    "name": "Chaos Agent",
    "generatedAt": "<ISO timestamp>",
    "status": "pending",
    "discoveryMethod": "chaos-agent"
  },
  "testCases": [
    {
      "id": "UAT-001",
      "title": "Feature area — what the test checks",
      "category": "auth|forms|navigation|api|ui|data|security",
      "type": "e2e|integration",
      "userStory": "As a user, I...",
      "testApproach": "What to test and how",
      "testFile": "tests/e2e/feature/test-name.spec.ts",
      "targetFiles": ["src/pages/feature.tsx"],
      "edgeCases": ["Edge case 1", "Edge case 2"],
      "assertions": [
        {
          "input": "Fill name='<script>alert(1)</script>', submit form",
          "expected": "Name displayed as literal text, no script execution",
          "strategy": "security"
        }
      ],
      "passes": false,
      "retryCount": 0,
      "source": "chaos-agent:agent-name"
    }
  ]
}
```

**Every test case MUST have at least 3 assertions** with concrete input/expected pairs:
1. One happy-path assertion (correct input → correct output)
2. One edge-case assertion (bad input → proper error handling)
3. One content assertion (page shows the RIGHT data, not just that it loads)

### UAT-PROMPT.md Schema

Write `.ralph/chaos/UAT-PROMPT.md` — a project-specific testing guide based on what the
red team ACTUALLY FOUND. Include:

```markdown
# Chaos Agent Guide — [Project Name]

## App Overview
- What the app does (1-2 sentences)
- Tech stack observed (framework, API patterns, auth method)
- Base URLs (frontend, API if applicable)

## Pages & Routes Discovered
For each page:
- URL pattern and what it shows
- Key interactive elements (forms, buttons, links)
- Selectors that work (data-testid, roles, labels)

## Auth Flow
- How login works (form fields, redirect after login)
- Test credentials if available (from config or .env)
- What pages require auth vs. public

## Known Forms & Inputs
For each form:
- Fields with their labels/names/selectors
- Required vs optional fields
- Validation behavior observed

## What "Correct" Looks Like
For each feature area:
- Expected behavior observed
- Specific text/numbers that should appear

## Console & Network Observations
- Any existing console errors/warnings
- API endpoints observed
- Response patterns (JSON structure, status codes)

## Red Team Findings
- Vulnerabilities discovered (XSS, injection, auth bypass, etc.)
- Edge cases that broke the app
- Areas that need hardening
```

This is NOT a copy of the template — it's ground truth from the red team's exploration.

### Rules

- Test auth flows FIRST (they gate everything else)
- One test case per feature area per attack vector
- `type: "e2e"` for anything involving browser interaction
- `targetFiles` should list the app source files the test covers
- `testFile` path should use the project's test directory conventions
- Always clean up: shutdown teammates and delete team when done
PROMPT_SECTION

    # Append PRD stories, config pointer, and signs to the prompt.
    _inject_prompt_context "$prompt_file"
}
1977
+
1978
+ # ============================================================================
1979
+ # HELPERS
1980
+ # ============================================================================
1981
+
1982
_inject_prompt_context() {
    # Append project context to the prompt file named by $1:
    #   1. completed PRD stories (if .ralph/prd.json exists),
    #   2. a pointer to .ralph/config.json (if present),
    #   3. the signs section from _inject_signs.
    # Globals: RALPH_DIR (read) — the .ralph directory.
    local prompt_file="$1"

    # Completed stories: one grouped redirect instead of per-line appends.
    if [[ -f "$RALPH_DIR/prd.json" ]]; then
        {
            echo ""
            echo "### Completed Stories (from PRD)"
            echo ""
            echo "These features have been built and should be testable:"
            echo '```json'
            # jq errors are suppressed; on failure the fence is left empty.
            jq '[.stories[] | select(.passes==true) | {id, title, type, testUrl: .testUrl}]' \
                "$RALPH_DIR/prd.json" 2>/dev/null
            echo '```'
        } >> "$prompt_file"
    fi

    # Config pointer for the agent.
    if [[ -f "$RALPH_DIR/config.json" ]]; then
        {
            echo ""
            echo "### Project Config"
            echo ""
            echo "Read \`.ralph/config.json\` for URLs and directories."
        } >> "$prompt_file"
    fi

    # Signs section (helper defined elsewhere in this file).
    _inject_signs >> "$prompt_file"
}
2008
+
2009
+ _log_uat() {
2010
+ local id="$1"
2011
+ local msg="$2"
2012
+ local timestamp
2013
+ timestamp=$(date -Iseconds 2>/dev/null || date +%Y-%m-%dT%H:%M:%S)
2014
+ echo "[$timestamp] $id $msg" >> "$UAT_PROGRESS_FILE"
2015
+ }