ralph-code 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,972 @@
1
+ """Harness CLI integration for ralph-coding application.
2
+
3
+ This module provides the execution layer for running AI prompts through
4
+ CLI harness tools. It handles:
5
+
6
+ - Command building with harness-specific CLI flags
7
+ - Model name mapping between internal names and harness-specific names
8
+ - Rate limit detection and retry with exponential backoff
9
+ - Debug logging for troubleshooting
10
+
11
+ Key classes:
12
+ - HarnessRunner: Main class for executing prompts through the configured harness
13
+ - HarnessResponse: Dataclass representing the result of a harness execution
14
+
15
+ The module also provides backwards compatibility aliases (ClaudeRunner, ClaudeResponse)
16
+ for code that hasn't migrated to the new naming.
17
+
18
+ Example usage:
19
+ from ralph.harness_runner import HarnessRunner
20
+
21
+ runner = HarnessRunner(project_dir=Path("."))
22
+
23
+ # Run a simple prompt
24
+ response = runner.run("Explain this code")
25
+ if response.success:
26
+ print(response.output)
27
+
28
+ # Implement a story (allows file writes)
29
+ response = runner.implement_story(story_prompt, context)
30
+
31
+ See HARNESS_ARCHITECTURE.md for documentation on CLI flag patterns per harness type.
32
+ """
33
+
34
+ import json
35
+ import re
36
+ import subprocess
37
+ import time
38
+ from dataclasses import dataclass
39
+ from datetime import datetime
40
+ from pathlib import Path
41
+ from typing import Callable
42
+
43
+ from .config import get_config
44
+ from .harness import Harness, HarnessType
45
+ from .storage import get_project_logs_dir
46
+
47
+
48
@dataclass
class HarnessResponse:
    """Outcome of a single harness CLI invocation.

    Attributes:
        success: Whether the invocation counts as successful.
        output: Text captured from the harness process's standard output.
        error: Text captured from standard error, or a message produced by
            the runner itself when the command could not be executed.
        rate_limited: Set when the failure was attributed to rate limiting.
        cost: Estimated cost of the call; stays at 0.0 unless a caller
            supplies a value.
    """

    # Required fields.
    success: bool
    output: str

    # Optional fields with neutral defaults.
    error: str = ""
    rate_limited: bool = False
    cost: float = 0.0
65
+
66
+
67
# Per-harness translation table for model names.
#
# Outer key: harness type. Inner key: Ralph's internal model name (haiku,
# sonnet, opus). Inner value: the string handed to that harness's --model flag.
# Names missing from the inner table are passed through unchanged by the
# lookup code, so harness-native model names can be used directly.
#
# A new harness type needs its own entry here.
HARNESS_MODEL_MAPPING: dict[HarnessType, dict[str, str]] = {
    # Claude CLI uses the same model names internally.
    "claude": {name: name for name in ("haiku", "sonnet", "opus")},
    # Codex maps to OpenAI/Codex model names.
    "codex": dict(
        haiku="gpt-5.1-codex-mini",
        sonnet="gpt-5.2-codex",
        opus="gpt-5.1-codex-max",
    ),
    # Custom harnesses default to Claude-like names.
    "custom": {name: name for name in ("haiku", "sonnet", "opus")},
}
94
+
95
+
96
+ class HarnessRunner:
97
+ """Executes prompts through the configured AI harness CLI tool.
98
+
99
+ HarnessRunner is the main interface for running AI prompts in Ralph. It:
100
+ - Builds CLI commands with the correct flags for each harness type
101
+ - Maps internal model names to harness-specific model names
102
+ - Handles rate limiting with configurable retry and exponential backoff
103
+ - Provides logging for debugging harness interactions
104
+
105
+ The runner lazily loads the Harness object from configuration and caches it
106
+ for the lifetime of the runner instance.
107
+
108
+ Attributes:
109
+ project_dir: Working directory for command execution.
110
+ debug: If True, enables logging of commands and responses.
111
+ on_output: Optional callback for streaming status updates to UI.
112
+
113
+ Example:
114
+ runner = HarnessRunner(Path("/my/project"), debug=True)
115
+
116
+ # Simple prompt
117
+ result = runner.run("Explain the code in main.py")
118
+
119
+ # Implementation with file writes enabled
120
+ result = runner.implement_story(story_spec, context="Use async/await")
121
+
122
+ # PRD generation (uses lighter model)
123
+ prd = runner.create_prd("Add user authentication")
124
+
125
+ Note:
126
+ All high-level methods (create_prd, implement_story, etc.) are convenience
127
+ wrappers around run() with appropriate prompt templates and settings.
128
+ """
129
+
130
# Regex patterns for detecting rate limit errors in harness output.
# They are searched case-insensitively against stdout + stderr of a failed
# command; a match makes the runner retry with exponential backoff if
# configured.
RATE_LIMIT_PATTERNS = [
    r"rate.?limit",
    r"too many requests",
    r"quota exceeded",
    # HTTP status code often present in error messages. Word boundaries keep
    # digits embedded in longer numbers (byte counts, offsets, line numbers)
    # from being mistaken for a 429 response.
    r"\b429\b",
]
138
+
139
def __init__(
    self,
    project_dir: Path,
    debug: bool = False,
    on_output: Callable[[str], None] | None = None,
):
    """Create a runner bound to one project directory.

    Args:
        project_dir: Directory used as the working directory when harness
            commands are executed.
        debug: When true, a timestamped debug log file is selected right
            away and later commands/responses are appended to it.
        on_output: Optional callable that receives short status strings
            (for example the rate-limit wait notice) for UI feedback.
    """
    # Caller-supplied settings, stored as given.
    self.project_dir, self.debug, self.on_output = project_dir, debug, on_output

    # Internal state. The harness object is resolved on first use; the log
    # file stays None unless debug logging is switched on below.
    self._harness: Harness | None = None
    self._log_file: Path | None = None
    self._config = get_config()

    if self.debug:
        self._setup_logging()
164
+
165
def _get_harness(self) -> Harness:
    """Return the cached Harness, building it from config on first call."""
    cached = self._harness
    if cached is None:
        # First access: resolve from the configured harness setting and
        # remember the result for later calls.
        cached = self._harness = Harness.from_config(self._config.harness)
    return cached
170
+
171
def _setup_logging(self) -> None:
    """Pick the debug log file for this runner.

    The file lives in the directory returned by get_project_logs_dir() for
    the project and is named after the current local time
    (YYYY-MM-DD_HH-MM-SS.log). Nothing is written here; _log() appends later.
    """
    target_dir = get_project_logs_dir(self.project_dir)
    file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".log"
    self._log_file = target_dir / file_name
176
+
177
def _log(self, message: str) -> None:
    """Append one timestamped line to the debug log.

    Does nothing when no log file has been configured.

    Args:
        message: Text to record; it is written after an ISO-8601 timestamp
            in square brackets and followed by a newline.
    """
    log_path = self._log_file
    if not log_path:
        return

    with open(log_path, "a", encoding="utf-8") as handle:
        stamp = datetime.now().isoformat()
        handle.write(f"[{stamp}] {message}\n")
183
+
184
def _is_rate_limited(self, output: str, error: str) -> bool:
    """Report whether the combined stdout/stderr text looks rate limited.

    Args:
        output: Captured standard output.
        error: Captured standard error.

    Returns:
        True if any entry of RATE_LIMIT_PATTERNS is found in the
        concatenation of output and error, otherwise False.
    """
    haystack = (output + error).lower()
    return any(
        re.search(candidate, haystack, re.IGNORECASE)
        for candidate in self.RATE_LIMIT_PATTERNS
    )
191
+
192
def _map_model(self, model: str) -> str:
    """Translate an internal model name into the harness's own name.

    The table for the active harness type is used; harness types without a
    table fall back to the "custom" table. Names not present in the chosen
    table are returned unchanged.
    """
    harness_type = self._get_harness().type
    if harness_type in HARNESS_MODEL_MAPPING:
        table = HARNESS_MODEL_MAPPING[harness_type]
    else:
        table = HARNESS_MODEL_MAPPING["custom"]
    return table.get(model, model)
197
+
198
+ def _build_command(
199
+ self,
200
+ prompt: str,
201
+ model: str | None = None,
202
+ print_output: bool = True,
203
+ allow_writes: bool = False,
204
+ ) -> list[str]:
205
+ """Build the harness CLI command with harness-specific flags.
206
+
207
+ This method handles the differences in CLI interfaces between harness types:
208
+
209
+ Claude CLI:
210
+ claude --model <model> --print [--dangerously-skip-permissions] -p "<prompt>"
211
+
212
+ Codex CLI (non-interactive):
213
+ codex exec --model <model> --sandbox <mode> "<prompt>"
214
+
215
+ Custom (defaults to Claude-like):
216
+ custom --model <model> --print [--dangerously-skip-permissions] -p "<prompt>"
217
+
218
+ Args:
219
+ prompt: The prompt text to send to the harness.
220
+ model: Model name (internal name like 'sonnet', mapped automatically).
221
+ If None, uses the worker model from configuration.
222
+ print_output: If True, add flags for non-interactive output mode.
223
+ allow_writes: If True, add flags to permit file modifications.
224
+
225
+ Returns:
226
+ List of command arguments suitable for subprocess.run().
227
+ """
228
+ harness = self._get_harness()
229
+ model = model or self._config.worker_model
230
+ mapped_model = self._map_model(model)
231
+
232
+ cmd = [harness.path]
233
+
234
+ # Build command based on harness type
235
+ if harness.type == "claude":
236
+ # Claude CLI flags
237
+ cmd.extend(["--model", mapped_model])
238
+
239
+ if print_output:
240
+ cmd.append("--print")
241
+
242
+ if allow_writes:
243
+ cmd.append("--dangerously-skip-permissions")
244
+
245
+ cmd.extend(["-p", prompt])
246
+
247
+ elif harness.type == "codex":
248
+ # Codex CLI uses 'exec' subcommand for non-interactive execution
249
+ if print_output:
250
+ cmd.append("exec")
251
+
252
+ cmd.extend(["--model", mapped_model])
253
+
254
+ # Codex uses --sandbox for write permissions
255
+ if allow_writes:
256
+ cmd.extend(["--sandbox", "workspace-write", "--full-auto"])
257
+ else:
258
+ cmd.extend(["--sandbox", "read-only"])
259
+
260
+ # Codex uses positional argument for prompt
261
+ cmd.append(prompt)
262
+
263
+ else:
264
+ # Custom harness - use Claude-like flags as default
265
+ cmd.extend(["--model", mapped_model])
266
+
267
+ if print_output:
268
+ cmd.append("--print")
269
+
270
+ if allow_writes:
271
+ cmd.append("--dangerously-skip-permissions")
272
+
273
+ cmd.extend(["-p", prompt])
274
+
275
+ return cmd
276
+
277
def run(
    self,
    prompt: str,
    model: str | None = None,
    max_retries: int = 3,
    retry_delay: float = 60.0,
    allow_writes: bool = False,
) -> HarnessResponse:
    """
    Execute a prompt through the harness CLI, retrying when rate limited.

    The command is built once and run in the project directory with stdout
    and stderr captured as text. A failed run whose output matches a rate
    limit pattern is retried (if the configuration allows waiting) after a
    delay that doubles on every retry.

    Args:
        prompt: The prompt to send to the harness
        model: Model to use. Uses worker model config if not specified.
        max_retries: Maximum retries on rate limit. A negative value means
            the command is never run and a "Max retries exceeded" response
            is returned.
        retry_delay: Base delay in seconds between retries (doubles each retry)
        allow_writes: Whether to allow file writes (for implementation tasks)

    Returns:
        HarnessResponse with the result. Any exception raised while running
        is reported through the response's error field rather than raised.
    """
    argv = self._build_command(prompt, model, allow_writes=allow_writes)

    self._log(f"Command: {' '.join(argv)}")
    self._log(f"Prompt:\n{prompt}\n")

    pause = retry_delay
    for attempt in range(max_retries + 1):
        try:
            completed = subprocess.run(
                argv,
                cwd=self.project_dir,
                capture_output=True,
                text=True,
            )
            stdout_text, stderr_text = completed.stdout, completed.stderr

            self._log(f"Output:\n{stdout_text}\n")
            if stderr_text:
                self._log(f"Error:\n{stderr_text}\n")

            exited_cleanly = completed.returncode == 0

            # Rate limiting is only considered when the command failed.
            if exited_cleanly or not self._is_rate_limited(stdout_text, stderr_text):
                return HarnessResponse(
                    success=exited_cleanly,
                    output=stdout_text,
                    error=stderr_text,
                )

            # Rate limited: give up unless waiting is enabled and there are
            # retries left.
            if not (self._config.wait_on_rate_limit and attempt < max_retries):
                return HarnessResponse(
                    success=False,
                    output=stdout_text,
                    error=stderr_text,
                    rate_limited=True,
                )

            self._log(f"Rate limited. Waiting {pause}s before retry...")
            if self.on_output:
                self.on_output(f"Rate limited. Waiting {pause:.0f}s...")
            time.sleep(pause)
            pause *= 2  # Exponential backoff

        except FileNotFoundError:
            # The harness executable itself could not be launched.
            missing = f"Harness not found: {self._get_harness().path}"
            self._log(f"Error: {missing}")
            return HarnessResponse(
                success=False,
                output="",
                error=missing,
            )
        except Exception as exc:
            failure = str(exc)
            self._log(f"Exception: {failure}")
            return HarnessResponse(
                success=False,
                output="",
                error=failure,
            )

    return HarnessResponse(
        success=False,
        output="",
        error="Max retries exceeded",
        rate_limited=True,
    )
370
+
371
def create_prd(self, task_description: str, learnings: str = "") -> HarnessResponse:
    """
    Create a PRD (Product Requirements Document) for a task using the prd skill format.

    If skills/prd/SKILL.md exists under the project directory, its text is
    embedded in the prompt as a guide. The prompt is sent through run() with
    the configured summary model.

    Args:
        task_description: Brief description of the task/feature
        learnings: Optional learnings content to inform the PRD

    Returns:
        HarnessResponse containing the PRD in markdown format
    """
    # NOTE(review): whitespace inside the prompt literals is reproduced as it
    # appears in the reviewed view of this file - confirm against the package.

    # Read the PRD skill file for guidance
    skill_path = self.project_dir / "skills" / "prd" / "SKILL.md"
    skill_content = ""
    if skill_path.exists():
        # Decode explicitly as UTF-8 (matching how this module writes its own
        # files) instead of relying on the platform's locale encoding.
        skill_content = skill_path.read_text(encoding="utf-8")

    learnings_section = ""
    if learnings:
        learnings_section = f"""
PROJECT LEARNINGS (use these to inform your PRD):
{learnings}
"""

    prompt = f"""Create a Product Requirements Document (PRD) for the following feature/task.

{f"PRD SKILL GUIDE:{chr(10)}{skill_content}" if skill_content else ""}

TASK DESCRIPTION:
{task_description}
{learnings_section}
IMPORTANT: You MUST include a "## State" section in the PRD with one of these values:
- "Ready to Implement" - if the PRD is complete and has no ambiguities
- "Open Questions" - if there are questions that need answers before implementation

If there are open questions, list them one per line after the State section.
For multi-choice questions, include a JSON array of options at the end of the line.
Example:
## State
- Open Questions
How should authentication be handled? ["JWT", "Session cookies", "OAuth2"]
What database should be used? ["PostgreSQL", "SQLite", "MySQL"]

Output ONLY the PRD markdown content, nothing else. Start with "# PRD:"."""

    return self.run(prompt, model=self._config.summary_model)
417
+
418
def pick_next_task(self, tasks_summary: str) -> HarnessResponse:
    """
    Have the harness choose which task should be worked on next.

    Args:
        tasks_summary: Summary of available tasks, inserted verbatim into
            the prompt.

    Returns:
        HarnessResponse from run() using the summary model; the prompt asks
        for a "TASK_ID: ..." line followed by a "REASON: ..." line.
    """
    selection_request = f"""You are helping to prioritize tasks. Given the following list of available tasks, pick the most important one to work on next.

Consider:
- Dependencies (tasks that unblock other tasks are higher priority)
- Foundational work (core functionality before features)
- Complexity (start with simpler tasks to build momentum)

Available tasks:
{tasks_summary}

Respond with just the task ID of the task you recommend working on next, followed by a brief explanation.

Format:
TASK_ID: <uuid>
REASON: <brief explanation>"""

    chooser_model = self._config.summary_model
    return self.run(selection_request, model=chooser_model)
445
+
446
def implement_task(self, task_spec: str, context: str = "") -> HarnessResponse:
    """
    Send a task specification to the harness for implementation.

    The call goes through run() with file writes allowed and the default
    (worker) model.

    Args:
        task_spec: The full task specification
        context: Additional context about the project; when empty, the
            context line of the prompt is left blank.

    Returns:
        HarnessResponse with the implementation
    """
    context_line = f"Additional context: {context}" if context else ""

    request = f"""Please implement the following task:

{task_spec}

{context_line}

Implement this task completely. Create or modify files as needed. Run tests if applicable."""

    return self.run(request, allow_writes=True)
466
+
467
def write_tests(self, implementation_summary: str) -> HarnessResponse:
    """
    Ask the harness to add tests covering a finished implementation.

    The call goes through run() with file writes allowed and the default
    (worker) model.

    Args:
        implementation_summary: Summary of what was implemented

    Returns:
        HarnessResponse with the test implementation
    """
    coverage_points = ("Happy path scenarios", "Edge cases", "Error handling")
    bullet_list = "\n".join(f"- {point}" for point in coverage_points)

    request = f"""Please write tests for the following implementation:

{implementation_summary}

Create comprehensive tests that cover:
{bullet_list}

Place tests in the appropriate test directory following the project's conventions."""

    return self.run(request, allow_writes=True)
489
+
490
def verify_implementation(
    self, task_spec: str, git_diff: str, require_tests: bool = False
) -> HarnessResponse:
    """
    Have the harness review a change against the task's acceptance criteria.

    Args:
        task_spec: The task specification with acceptance criteria
        git_diff: The git diff of changes made; an empty diff is shown to
            the model as "(No changes detected)".
        require_tests: Whether tests are required for completion. Adds an
            explicit notice and swaps the third evaluation point.

    Returns:
        HarnessResponse from run() using the summary model; the prompt asks
        for "STATUS: COMPLETE or INCOMPLETE" plus a FEEDBACK line.
    """
    diff_text = git_diff or "(No changes detected)"

    if require_tests:
        tests_notice = "IMPORTANT: Tests are required for this task. Check that appropriate tests were created."
        third_check = "Includes appropriate tests"
    else:
        tests_notice = ""
        third_check = "Is functionally complete"

    review_request = f"""You are a code reviewer. Review the following implementation against the task requirements.

TASK SPECIFICATION:
{task_spec}

GIT DIFF OF CHANGES:
{diff_text}

{tests_notice}

Evaluate whether the implementation:
1. Addresses all acceptance criteria
2. Actually modifies/creates the necessary files (not just documentation)
3. {third_check}

Respond in this exact format:
STATUS: COMPLETE or INCOMPLETE
FEEDBACK: <If INCOMPLETE, explain what's missing or needs to be fixed. If COMPLETE, briefly confirm what was done.>"""

    return self.run(review_request, model=self._config.summary_model)
524
+
525
def generate_commit_message(self, changes_summary: str) -> HarnessResponse:
    """
    Have the harness draft a git commit message for a set of changes.

    Args:
        changes_summary: Summary of the changes made

    Returns:
        HarnessResponse from run() using the summary model; the prompt asks
        for the bare commit message without any git trailers.
    """
    message_request = f"""Generate a concise but descriptive git commit message for the following changes:

{changes_summary}

Follow conventional commit format if appropriate. Keep the first line under 72 characters.
Respond with just the commit message, nothing else.
Do NOT include Co-Authored-By, Signed-off-by, or any other git trailers."""

    writer_model = self._config.summary_model
    return self.run(message_request, model=writer_model)
544
+
545
def update_learnings(self, session_summary: str, current_learnings: str) -> HarnessResponse:
    """
    Have the harness propose an updated learnings file for this session.

    Args:
        session_summary: Summary of the current session
        current_learnings: Current contents of learnings.md; shown to the
            model as "(empty)" when blank.

    Returns:
        HarnessResponse from run() using the summary model; the prompt asks
        for the updated content, or "NO_UPDATES" when nothing is new.
    """
    existing_text = current_learnings or "(empty)"

    request = f"""Based on the following session, identify any new learnings or insights that should be documented.

Session summary:
{session_summary}

Current learnings.md content:
{existing_text}

If there are new insights worth documenting, provide the updated content for learnings.md.
If there's nothing new to add, respond with "NO_UPDATES".

Keep the format clean and organized with clear sections."""

    return self.run(request, model=self._config.summary_model)
570
+
571
def filter_learnings_after_prd(self, prd_name: str, current_learnings: str) -> HarnessResponse:
    """
    Filter learnings after a PRD is completed, removing PRD-scoped items.

    Builds a prompt that names the completed PRD, lists which kinds of
    learnings to remove and which to keep, and embeds the current learnings
    text. The prompt is sent through run() with the summary model.

    Args:
        prd_name: Name of the completed PRD
        current_learnings: Current contents of learnings.md

    Returns:
        HarnessResponse with filtered learnings content (only project-wide ones)
    """
    # NOTE(review): whitespace inside the prompt literal is kept exactly as it
    # appears in the reviewed view of this file - confirm against the package.
    prompt = f"""A PRD has been completed: "{prd_name}"

Review the current learnings and remove any that are scoped to this specific PRD.

REMOVE learnings about:
- Package/library choices for this specific feature
- Code style decisions specific to this implementation
- Implementation strategies or technical approaches for this PRD
- Design decisions that only apply to this feature

KEEP learnings about:
- Project-wide conventions or standards
- Development workflow preferences
- General architectural decisions
- User preferences that apply across features
- Things the user explicitly wants remembered for future work

Current learnings.md:
{current_learnings}

Return the filtered learnings.md content with only project-wide learnings.
If all learnings should be removed, return just the header:
# Project Learnings

Preserve the markdown format and date sections for any remaining learnings."""

    # Filtering is delegated to the summary model.
    return self.run(prompt, model=self._config.summary_model)
609
+
610
def update_progress(self, task_name: str, status: str, notes: str = "") -> HarnessResponse:
    """
    Have the harness write a short progress.md entry for a task.

    Args:
        task_name: Name of the task
        status: Current status
        notes: Additional notes; rendered as "None" when empty.

    Returns:
        HarnessResponse from run() using the summary model, containing the
        progress update content.
    """
    request_lines = [
        "Generate a brief progress update entry for the following:",
        "",
        f"Task: {task_name}",
        f"Status: {status}",
        f"Notes: {notes or 'None'}",
        "",
        "Format as a markdown list item with timestamp. Keep it concise.",
    ]
    return self.run("\n".join(request_lines), model=self._config.summary_model)
631
+
632
def convert_prd_to_tasks(
    self, prd_content: str, project_name: str = "Project", branch_prefix: str = "ralph"
) -> HarnessResponse:
    """
    Convert a PRD to tasks.json format using the ralph skill.

    If skills/ralph/SKILL.md exists under the project directory, its text is
    embedded in the prompt as a guide. The prompt ends with a JSON template
    pre-filled with the project name and branch prefix, and is sent through
    run() with the summary model.

    Args:
        prd_content: The full PRD markdown content
        project_name: Name of the project
        branch_prefix: Prefix for feature branch names (e.g., 'ralph' -> 'ralph/feature-name')

    Returns:
        HarnessResponse containing the tasks.json content
    """
    # NOTE(review): whitespace inside the prompt literal (including the JSON
    # template) is reproduced as it appears in the reviewed view of this file
    # - confirm against the package.

    # Read the ralph skill file for guidance
    skill_path = self.project_dir / "skills" / "ralph" / "SKILL.md"
    skill_content = ""
    if skill_path.exists():
        # Decode explicitly as UTF-8 (matching how this module writes its own
        # files) instead of relying on the platform's locale encoding.
        skill_content = skill_path.read_text(encoding="utf-8")

    # Doubled braces below are literal braces in the f-string output.
    prompt = f"""Convert the following PRD to tasks.json format for the Ralph autonomous agent system.

{f"RALPH SKILL GUIDE:{chr(10)}{skill_content}" if skill_content else ""}

PRD CONTENT:
{prd_content}

PROJECT NAME: {project_name}

IMPORTANT RULES:
1. Each user story MUST be small enough to implement in ONE iteration (one context window)
2. Stories are ordered by dependency (schema/database first, then backend, then UI)
3. Every story MUST have "Typecheck passes" in acceptance criteria
4. UI stories MUST have "Verify in browser using dev-browser skill" as criterion
5. Acceptance criteria must be specific and verifiable (not vague)
6. IDs must be in format US-001, US-002, etc.
7. Priority is execution order (1 = first)
8. All stories start with passes: false

Output ONLY valid JSON in the exact format below, nothing else:
{{
"project": "{project_name}",
"branchName": "{branch_prefix}/feature-name-here",
"description": "Feature description here",
"userStories": [
{{
"id": "US-001",
"title": "Story title",
"description": "As a [user], I want [feature] so that [benefit]",
"acceptanceCriteria": ["criterion 1", "criterion 2", "Typecheck passes"],
"priority": 1,
"passes": false,
"notes": ""
}}
]
}}"""

    return self.run(prompt, model=self._config.summary_model)
690
+
691
def implement_story(self, story_prompt: str, context: str = "") -> HarnessResponse:
    """
    Send one user story to the harness for implementation.

    The call goes through run() with file writes allowed and the default
    (worker) model.

    Args:
        story_prompt: The user story with acceptance criteria
        context: Additional context (progress notes, learnings, etc.); when
            empty, the context section of the prompt is left blank.

    Returns:
        HarnessResponse with the implementation result
    """
    context_block = f"CONTEXT:\n{context}" if context else ""

    request = f"""Implement the following user story completely:

{story_prompt}

{context_block}

IMPORTANT:
- Implement the story completely in one go
- Ensure all acceptance criteria are met
- Run typecheck/linting to verify
- If you encounter issues, document them clearly
- Do NOT leave work half-done

After implementation, verify each acceptance criterion is met."""

    return self.run(request, allow_writes=True)
718
+
719
def verify_story(self, story_prompt: str, git_diff: str) -> HarnessResponse:
    """
    Verify a user story implementation against its acceptance criteria.

    Builds a review prompt containing the story and the diff (an empty diff
    is shown as "(No changes detected)"), explains the three possible
    verdicts and how to treat irrelevant criteria, and prescribes the reply
    layout. The prompt is sent through run() with the summary model.

    Args:
        story_prompt: The user story with acceptance criteria
        git_diff: The git diff of changes made

    Returns:
        HarnessResponse with verification result (PASSES/FAILS/BLOCKED with feedback)
    """
    # NOTE(review): whitespace inside the prompt literal is kept exactly as it
    # appears in the reviewed view of this file - confirm against the package.
    prompt = f"""Verify the following user story implementation against its acceptance criteria.

{story_prompt}

GIT DIFF OF CHANGES:
{git_diff if git_diff else "(No changes detected)"}

Check each acceptance criterion and determine if it has been met.

IMPORTANT: Distinguish between these scenarios:
1. PASSES - All acceptance criteria are verifiably met (or irrelevant - see below)
2. FAILS - One or more criteria are NOT met (implementation is wrong/incomplete)
3. BLOCKED - Some criteria CANNOT BE VERIFIED due to external factors (permission issues,
pre-existing errors unrelated to this change, environment constraints, etc.)

HANDLING IRRELEVANT OR NONSENSICAL CRITERIA:
- If a criterion is based on an erroneous assumption (e.g., "update file X for library Y" but
file X doesn't use library Y), treat it as PASSED by virtue of being irrelevant.
- If a criterion seems nonsensical or impossible, question it and explain why it doesn't apply.
- These criteria may have been auto-generated by earlier processes and don't reflect reality.
- Mark such criteria as passed with a note explaining why they're not applicable.

Respond in this exact format:

STATUS: PASSES or FAILS or BLOCKED
NOTES:
**What Passed:**
- ✅ List each criterion that was clearly met
- ✅ List criteria that are N/A with explanation (e.g., "N/A - file doesn't use this library")

**What Failed:**
- ❌ List each criterion that was NOT met (implementation issue)

**What Could Not Be Verified:**
- ⚠️ List criteria that cannot be verified with reasons (e.g., "Tests execution - requires permission")

**Pre-existing Issues:**
- List any errors/issues that exist in the codebase but are NOT related to this change
(e.g., "30 type errors in unrelated files: git_manager.py, config.py")

**To Complete:**
- List specific actions needed to resolve blockers or failures

Be strict on PASSES - all applicable criteria must be verifiably met.
Use BLOCKED when the implementation looks correct but verification is impossible.
Use FAILS when the implementation itself is wrong or incomplete."""

    # Verification is delegated to the summary model; no allow_writes flag is passed.
    return self.run(prompt, model=self._config.summary_model)
778
+
779
def stage_story_changes(
    self, story_id: str, story_title: str, changed_files: list[str]
) -> HarnessResponse:
    """
    Ask the LLM to identify which changed files should be staged for commit.

    The LLM reviews the list of changed files and determines which are
    relevant to the story being implemented. This allows intelligent
    filtering of artifacts, generated files, and unrelated changes.

    This method only builds the prompt and returns the harness reply; it
    does not stage anything itself.

    Args:
        story_id: The story ID (e.g., "US-001").
        story_title: The story title for context.
        changed_files: List of file paths that have uncommitted changes.

    Returns:
        HarnessResponse with FILES_TO_STAGE section listing files to stage,
        one per line (or NONE when nothing should be staged).
    """
    # NOTE(review): whitespace inside the string literals below is kept exactly
    # as it appears in the reviewed view of this file - confirm against the package.

    # Render the changed files as a bulleted list, one path per line.
    files_list = "\n".join(f" - {f}" for f in changed_files)

    prompt = f"""You are reviewing uncommitted changes to determine which files should be committed for a user story.

STORY: {story_id} - {story_title}

CHANGED FILES (uncommitted):
{files_list}

Review these files and decide which should be committed as part of this story's implementation.
The goal is to commit files needed for the production codebase to function correctly.

INCLUDE files that are:
- Source code modules (.py, .js, .ts, etc.) that are imported/referenced by other code
- Functions, classes, and utilities needed for normal application operation
- Test files that are part of the permanent test suite
- Schema/migration files for database changes
- Configuration files with non-sensitive default values
- Constants files that are NOT user-configurable and contain NO secrets
- Long-term utility scripts intended as permanent tools
- Important .md documentation ONLY for significant features (be conservative)
- PNG files that appear to be assets, documentation images, or icon master files
(e.g., in assets/, images/, icons/, static/, docs/ directories)

EXCLUDE files that are:
- Binary files (.pdf, .jpg, .zip, .exe, .xlsx, .doc, etc.)
- Short-term test scripts or throwaway debugging code
- Generated/compiled files (.pyc, .class, node_modules/, dist/, build/)
- IDE/editor files (.idea/, .vscode/, *.swp)
- Temporary files (*.tmp, *.log, *.bak)
- Large data files, exports, or reports
- Unrelated changes that happened to be in the working directory
- Credentials, secrets, or API keys (.env, *.pem, credentials.*, *secret*)
- User-configurable settings files that should remain local
- Excessive .md files for minor/esoteric changes (too many = noise)

Respond with ONLY the files that should be staged, one per line, in this exact format:

FILES_TO_STAGE:
path/to/file1.py
path/to/file2.py

If no files should be staged, respond with:

FILES_TO_STAGE:
NONE"""

    # Analysis-only call: summary model, file writes explicitly disabled.
    return self.run(prompt, model=self._config.summary_model, allow_writes=False)
846
+
847
def detect_project_context(self) -> str:
    """
    Summarize project tooling by probing for well-known files.

    Looks in the project directory for markers of a test framework, build
    tools, a type checker, and a CLAUDE.md instructions file. For the test
    framework and the type checker only the first matching marker is
    reported; every matching build-tool marker is reported.

    Returns:
        One line per detected item joined with newlines, or
        "No specific project configuration detected." when nothing matches.
    """
    root = self.project_dir

    def present(name: str, must_be_dir: bool = False) -> bool:
        # True when the marker exists (or is a directory, if required).
        marker = root / name
        return marker.is_dir() if must_be_dir else marker.exists()

    def first_hit(options) -> list:
        # Report only the first marker that is present, mirroring elif.
        for name, must_be_dir, note in options:
            if present(name, must_be_dir):
                return [note]
        return []

    findings = []

    # Test framework (first match wins).
    findings += first_hit(
        (
            ("pytest.ini", False, "Test framework: pytest (pytest.ini found)"),
            ("pyproject.toml", False, "Test framework: likely pytest (pyproject.toml found)"),
            ("tests", True, "Test framework: tests/ directory found"),
        )
    )

    # Build tools (every match is reported).
    build_markers = (
        ("Pipfile", "Build tool: pipenv (Pipfile found)"),
        ("pyproject.toml", "Build tool: pyproject.toml found"),
        ("setup.py", "Build tool: setup.py found"),
        ("requirements.txt", "Dependencies: requirements.txt found"),
    )
    findings += [note for name, note in build_markers if present(name)]

    # Type checker (first match wins).
    findings += first_hit(
        (
            ("mypy.ini", False, "Type checker: mypy (mypy.ini found)"),
            (".mypy.ini", False, "Type checker: mypy (.mypy.ini found)"),
        )
    )

    # CLAUDE.md for project instructions.
    if present("CLAUDE.md"):
        findings.append("Project instructions: CLAUDE.md found")

    if findings:
        return "\n".join(findings)
    return "No specific project configuration detected."
888
+
889
def analyze_blocked_story(
    self,
    story_prompt: str,
    full_notes: str,
    git_diff: str,
    project_context: str,
) -> HarnessResponse:
    """
    Ask the harness whether a blocked story is worth one more attempt.

    Builds a single analysis prompt from the story, the accumulated notes,
    the current diff and the detected project context, then runs it with
    the sonnet model and file writes disabled.

    Args:
        story_prompt: The full user story prompt with acceptance criteria.
        full_notes: Complete notes from all previous attempts.
        git_diff: The current git diff of changes; when empty, the prompt
            shows "(No changes detected)" instead.
        project_context: Detected project configuration (from detect_project_context).

    Returns:
        HarnessResponse whose output is requested in this layout:
        - ACTION: RETRY or NEEDS_INTERVENTION
        - SUMMARY: Brief summary for notes replacement
        - ANALYSIS: Detailed failure breakdown
        - RECOMMENDATIONS: Steps for next retry attempt
        - INTERVENTION_REASON: If human intervention is needed
    """
    # Make an empty diff explicit so the prompt section is never blank.
    diff_section = git_diff or "(No changes detected)"

    analysis_prompt = f"""You are analyzing a blocked user story that has failed multiple implementation attempts.
Your job is to determine if it can succeed with one more attempt (with adjustments) or if it needs human intervention.

PROJECT CONTEXT:
{project_context}

USER STORY:
{story_prompt}

NOTES FROM PREVIOUS ATTEMPTS:
{full_notes}

GIT DIFF OF CURRENT STATE:
{diff_section}

Analyze the failure patterns and determine the best course of action.

Consider RETRY if:
- The failures are due to fixable issues (wrong approach, missing imports, typos)
- The acceptance criteria are achievable with a different strategy
- Pre-existing issues can be worked around
- The implementation is close but needs minor adjustments

Consider NEEDS_INTERVENTION if:
- There are fundamental blockers (missing dependencies, infrastructure issues)
- The acceptance criteria are ambiguous or impossible
- External factors prevent completion (permissions, environment)
- The same error keeps recurring despite different approaches
- Human judgment is needed to clarify requirements

Respond in this EXACT format:

ACTION: RETRY or NEEDS_INTERVENTION

SUMMARY:
<2-3 sentence summary of what happened and what the next approach should be>

ANALYSIS:
<Detailed breakdown of what failed and why>

RECOMMENDATIONS:
<If RETRY: Specific steps to try in the next attempt>
<If NEEDS_INTERVENTION: What the human should do to unblock>

INTERVENTION_REASON:
<Only if NEEDS_INTERVENTION: Why human help is required>"""

    # Analysis only: sonnet model for deeper reasoning, no file writes.
    return self.run(analysis_prompt, model="sonnet", allow_writes=False)
965
+
966
+
967
# Backwards compatibility aliases.
# These allow existing code using the old ClaudeRunner / ClaudeResponse names to
# continue working without modification. Each alias is bound to the very same
# class object (not a subclass), so isinstance checks and attribute access behave
# identically under either name. New code should use HarnessRunner and
# HarnessResponse.
# See HARNESS_ARCHITECTURE.md "Migration from claude_binary" section.
ClaudeRunner = HarnessRunner
ClaudeResponse = HarnessResponse