ralph-code 0.6.1__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {ralph_code-0.6.1 → ralph_code-0.6.2}/PKG-INFO +13 -4
  2. {ralph_code-0.6.1 → ralph_code-0.6.2}/README.md +12 -3
  3. {ralph_code-0.6.1 → ralph_code-0.6.2}/pyproject.toml +1 -1
  4. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/__init__.py +1 -1
  5. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/app.py +1 -1
  6. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/config.py +25 -3
  7. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/git_manager.py +5 -0
  8. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/harness.py +8 -2
  9. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/harness_runner.py +186 -40
  10. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/prd_manager.py +19 -3
  11. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/workflow.py +54 -17
  12. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph_code.egg-info/PKG-INFO +13 -4
  13. {ralph_code-0.6.1 → ralph_code-0.6.2}/setup.py +1 -1
  14. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_config.py +14 -2
  15. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_harness.py +25 -6
  16. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_harness_runner.py +105 -9
  17. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_prd_manager.py +26 -2
  18. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_workflow.py +117 -0
  19. {ralph_code-0.6.1 → ralph_code-0.6.2}/LICENSE +0 -0
  20. {ralph_code-0.6.1 → ralph_code-0.6.2}/MANIFEST.in +0 -0
  21. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/__main__.py +0 -0
  22. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/claude_runner.py +0 -0
  23. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/colors.py +0 -0
  24. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/schemas/ralph_tasks_schema.json +0 -0
  25. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/schemas/task_schema.json +0 -0
  26. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/spinner.py +0 -0
  27. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/storage.py +0 -0
  28. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/tasks.py +0 -0
  29. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph/user_stories.py +0 -0
  30. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph_code.egg-info/SOURCES.txt +0 -0
  31. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph_code.egg-info/dependency_links.txt +0 -0
  32. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph_code.egg-info/entry_points.txt +0 -0
  33. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph_code.egg-info/requires.txt +0 -0
  34. {ralph_code-0.6.1 → ralph_code-0.6.2}/ralph_code.egg-info/top_level.txt +0 -0
  35. {ralph_code-0.6.1 → ralph_code-0.6.2}/setup.cfg +0 -0
  36. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_app.py +0 -0
  37. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_app_integration.py +0 -0
  38. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_colors.py +0 -0
  39. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_git_manager.py +0 -0
  40. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_spinner.py +0 -0
  41. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_storage.py +0 -0
  42. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_tasks.py +0 -0
  43. {ralph_code-0.6.1 → ralph_code-0.6.2}/tests/test_user_stories.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ralph-code
3
- Version: 0.6.1
3
+ Version: 0.6.2
4
4
  Summary: Automated task implementation with Claude Code and Codex
5
5
  Author: Ralph Coding
6
6
  License: MIT
@@ -37,7 +37,7 @@ Dynamic: requires-python
37
37
 
38
38
  # ralph-code
39
39
 
40
- Automated task implementation with Claude Code and Codex for "Ralph Coding". What is [Ralph Coding](https://ghuntley.com/ralph/)? It's a method of coding where context rot is avoided by controlling the retention of information. This method involves re-invoking claude or codex for each task, and passing information about the requirements, acceptance testing, and any progress that's made (or roadblocks/challenges faced) through files, rather than retaining all prompts + thinking + response tokens. It tends to result in more requests, some duplicated token work, but fairly consistent performance, and best of all it can largely be done unattended. Recommend Claude Max account or codex equivalent, but be aware that GPT-5 - GPT5.2's slow reasoning and response makes this ponderous, it's fine overnight.
40
+ Automated task implementation with Claude Code and Codex for "Ralph Coding". What is [Ralph Coding](https://ghuntley.com/ralph/)? It's a method of coding where context rot is avoided by controlling the retention of information. This method involves re-invoking claude or codex for each task, and passing information about the requirements, acceptance testing, and any progress that's made (or roadblocks/challenges faced) through files, rather than retaining all prompts + thinking + response tokens. It tends to result in more requests, some duplicated token work, but fairly consistent performance, and best of all it can largely be done unattended. Ralph now defaults to continuing past PRD-to-task conversion instead of pausing there, and non-interactive harness calls are bounded by timeouts and turn caps so stuck agent runs fail fast instead of hanging forever. Recommend Claude Max account or codex equivalent, but be aware that GPT-5 - GPT5.2's slow reasoning and response makes this ponderous, it's fine overnight.
41
41
 
42
42
  Because LLMs are carrying out the work, we can specify a job of "Find all the python files in the project that directly or indirectly access sqlalchemy objects, and upgrade the code to work with sqlalchemy 2.*". This will probably result in a single-task project, but that one task might add 50 other tasks (one per file) to the backlog, which are then processed sequentially.
43
43
 
@@ -59,6 +59,15 @@ pip install ralph-code
59
59
  ralph [OPTIONS] [DIRECTORY]
60
60
  ```
61
61
 
62
+ ## Recent changes
63
+
64
+ Version `0.6.2` includes:
65
+ - Bounded non-interactive harness execution with timeout and turn limits
66
+ - Structured `tasks.json` generation for more reliable PRD conversion
67
+ - Automatic continuation after task generation by default
68
+ - `PRDs/` as the standard task directory, with legacy `PRD/` compatibility
69
+ - Refreshed model catalogs and current defaults
70
+
62
71
  ### Options
63
72
 
64
73
  - `--debug`: Enable debug logging, logs are saved into the .ralph subdirectory of the project
@@ -66,8 +75,8 @@ ralph [OPTIONS] [DIRECTORY]
66
75
 
67
76
  ## Usage
68
77
 
69
- First create a task, give a short name for the task (used for the branch commits will be added to), and then give a description.
70
- Then you run the ralph-coder, it will produce a .md file of the specifications, which will be broken into small tasks put into a tasks.json file. Each task will be worked on independently.
78
+ First create a task in `PRDs/`, give a short name for the task (used for the branch commits will be added to), and then give a description.
79
+ Then run `ralph`; it will produce a `.md` file of the specifications, which will be broken into small tasks and written to a `tasks.json` file. Each task will be worked on independently.
71
80
 
72
81
  ## Requirements
73
82
 
@@ -1,6 +1,6 @@
1
1
  # ralph-code
2
2
 
3
- Automated task implementation with Claude Code and Codex for "Ralph Coding". What is [Ralph Coding](https://ghuntley.com/ralph/)? It's a method of coding where context rot is avoided by controlling the retention of information. This method involves re-invoking claude or codex for each task, and passing information about the requirements, acceptance testing, and any progress that's made (or roadblocks/challenges faced) through files, rather than retaining all prompts + thinking + response tokens. It tends to result in more requests, some duplicated token work, but fairly consistent performance, and best of all it can largely be done unattended. Recommend Claude Max account or codex equivalent, but be aware that GPT-5 - GPT5.2's slow reasoning and response makes this ponderous, it's fine overnight.
3
+ Automated task implementation with Claude Code and Codex for "Ralph Coding". What is [Ralph Coding](https://ghuntley.com/ralph/)? It's a method of coding where context rot is avoided by controlling the retention of information. This method involves re-invoking claude or codex for each task, and passing information about the requirements, acceptance testing, and any progress that's made (or roadblocks/challenges faced) through files, rather than retaining all prompts + thinking + response tokens. It tends to result in more requests, some duplicated token work, but fairly consistent performance, and best of all it can largely be done unattended. Ralph now defaults to continuing past PRD-to-task conversion instead of pausing there, and non-interactive harness calls are bounded by timeouts and turn caps so stuck agent runs fail fast instead of hanging forever. Recommend Claude Max account or codex equivalent, but be aware that GPT-5 - GPT5.2's slow reasoning and response makes this ponderous, it's fine overnight.
4
4
 
5
5
  Because LLMs are carrying out the work, we can specify a job of "Find all the python files in the project that directly or indirectly access sqlalchemy objects, and upgrade the code to work with sqlalchemy 2.*". This will probably result in a single-task project, but that one task might add 50 other tasks (one per file) to the backlog, which are then processed sequentially.
6
6
 
@@ -22,6 +22,15 @@ pip install ralph-code
22
22
  ralph [OPTIONS] [DIRECTORY]
23
23
  ```
24
24
 
25
+ ## Recent changes
26
+
27
+ Version `0.6.2` includes:
28
+ - Bounded non-interactive harness execution with timeout and turn limits
29
+ - Structured `tasks.json` generation for more reliable PRD conversion
30
+ - Automatic continuation after task generation by default
31
+ - `PRDs/` as the standard task directory, with legacy `PRD/` compatibility
32
+ - Refreshed model catalogs and current defaults
33
+
25
34
  ### Options
26
35
 
27
36
  - `--debug`: Enable debug logging, logs are saved into the .ralph subdirectory of the project
@@ -29,8 +38,8 @@ ralph [OPTIONS] [DIRECTORY]
29
38
 
30
39
  ## Usage
31
40
 
32
- First create a task, give a short name for the task (used for the branch commits will be added to), and then give a description.
33
- Then you run the ralph-coder, it will produce a .md file of the specifications, which will be broken into small tasks put into a tasks.json file. Each task will be worked on independently.
41
+ First create a task in `PRDs/`, give a short name for the task (used for the branch commits will be added to), and then give a description.
42
+ Then run `ralph`; it will produce a `.md` file of the specifications, which will be broken into small tasks and written to a `tasks.json` file. Each task will be worked on independently.
34
43
 
35
44
  ## Requirements
36
45
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ralph-code"
7
- version = "0.6.1"
7
+ version = "0.6.2"
8
8
  description = "Automated task implementation with Claude Code and Codex"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -1,6 +1,6 @@
1
1
  """ralph-code: Automated task implementation with Claude Code and Codex."""
2
2
 
3
- __version__ = "0.1.0"
3
+ __version__ = "0.6.2"
4
4
  __author__ = "Ralph Coding"
5
5
 
6
6
  from .app import RalphApp, main
@@ -559,7 +559,7 @@ class RalphApp:
559
559
  choices: list[Choice] = []
560
560
 
561
561
  for model_name, label in supported_models:
562
- choices.append(Choice(title=model_name, value=model_name))
562
+ choices.append(Choice(title=f"{model_name} ({label})", value=model_name))
563
563
 
564
564
  # No models available - show alert and return
565
565
  if not choices:
@@ -14,11 +14,13 @@ DEFAULT_CONFIG = {
14
14
  "harness": "claude",
15
15
  "worker_model": "opus",
16
16
  "summary_model": "haiku",
17
+ "harness_timeout_seconds": 1800,
18
+ "non_interactive_max_turns": 12,
17
19
  "max_iterations": 10,
18
20
  "max_story_attempts": 3,
19
21
  "auto_spec_without_oversight": True,
20
22
  "pause_after_spec": False,
21
- "pause_after_tasks": True,
23
+ "pause_after_tasks": False,
22
24
  "wait_on_rate_limit": True,
23
25
  "pause_on_completion": True,
24
26
  "always_build_tests": False,
@@ -63,7 +65,7 @@ class Config:
63
65
  if "worker_model" not in self._config:
64
66
  harness = self._config.get("harness", DEFAULT_CONFIG["harness"])
65
67
  if harness == "codex":
66
- self._config["worker_model"] = "gpt-5.2-codex"
68
+ self._config["worker_model"] = "gpt-5.3-codex"
67
69
  else:
68
70
  self._config["worker_model"] = "opus"
69
71
  needs_save = True
@@ -72,7 +74,7 @@ class Config:
72
74
  if "summary_model" not in self._config:
73
75
  harness = self._config.get("harness", DEFAULT_CONFIG["harness"])
74
76
  if harness == "codex":
75
- self._config["summary_model"] = "gpt-5.2"
77
+ self._config["summary_model"] = "gpt-5.1-codex-mini"
76
78
  else:
77
79
  self._config["summary_model"] = "haiku"
78
80
  needs_save = True
@@ -124,6 +126,26 @@ class Config:
124
126
  self._config["summary_model"] = value
125
127
  self._save()
126
128
 
129
+ @property
130
+ def harness_timeout_seconds(self) -> int:
131
+ """Maximum seconds to wait for a harness subprocess before aborting."""
132
+ return int(self._config.get("harness_timeout_seconds", 1800))
133
+
134
+ @harness_timeout_seconds.setter
135
+ def harness_timeout_seconds(self, value: int) -> None:
136
+ self._config["harness_timeout_seconds"] = max(1, value)
137
+ self._save()
138
+
139
+ @property
140
+ def non_interactive_max_turns(self) -> int:
141
+ """Maximum agent turns for non-interactive harness runs."""
142
+ return int(self._config.get("non_interactive_max_turns", 12))
143
+
144
+ @non_interactive_max_turns.setter
145
+ def non_interactive_max_turns(self, value: int) -> None:
146
+ self._config["non_interactive_max_turns"] = max(1, value)
147
+ self._save()
148
+
127
149
  @property
128
150
  def max_iterations(self) -> int:
129
151
  """Maximum iterations for implementation loop."""
@@ -230,6 +230,11 @@ class GitManager:
230
230
 
231
231
  return self.commit(message)
232
232
 
233
+ def get_staged_files(self) -> list[str]:
234
+ """Get list of files currently staged for commit."""
235
+ result = self._run_git("diff", "--cached", "--name-only")
236
+ return [f for f in result.stdout.strip().split("\n") if f]
237
+
233
238
  def get_unstaged_files(self) -> list[str]:
234
239
  """Get list of files with unstaged changes (modified + untracked).
235
240
 
@@ -46,13 +46,19 @@ HarnessType = Literal["claude", "codex", "custom"]
46
46
  # These defaults are used when CLI model querying fails or isn't supported.
47
47
  DEFAULT_MODELS: dict[HarnessType, list[tuple[str, str]]] = {
48
48
  "claude": [
49
+ ("default", "Standard"),
49
50
  ("haiku", "Light"),
50
51
  ("sonnet", "Standard"),
51
52
  ("opus", "Standard"),
53
+ ("sonnet[1m]", "Extended"),
54
+ ("opusplan", "Planning"),
52
55
  ],
53
56
  "codex": [
54
57
  ("gpt-5.1-codex-mini", "Light"),
58
+ ("gpt-5.3-codex-spark", "Preview"),
59
+ ("gpt-5.3-codex", "Standard"),
55
60
  ("gpt-5.2-codex", "Standard"),
61
+ ("gpt-5.1-codex", "Standard"),
56
62
  ("gpt-5.1-codex-max", "Standard"),
57
63
  ("gpt-5.2", "Standard"),
58
64
  ],
@@ -61,13 +67,13 @@ DEFAULT_MODELS: dict[HarnessType, list[tuple[str, str]]] = {
61
67
 
62
68
  DEFAULT_WORKER_MODEL: dict[HarnessType, str] = {
63
69
  "claude": "opus",
64
- "codex": "gpt-5.2-codex",
70
+ "codex": "gpt-5.3-codex",
65
71
  "custom": "",
66
72
  }
67
73
 
68
74
  DEFAULT_SUMMARY_MODEL: dict[HarnessType, str] = {
69
75
  "claude": "haiku",
70
- "codex": "gpt-5.2",
76
+ "codex": "gpt-5.1-codex-mini",
71
77
  "custom": "",
72
78
  }
73
79
 
@@ -38,7 +38,7 @@ import time
38
38
  from dataclasses import dataclass
39
39
  from datetime import datetime
40
40
  from pathlib import Path
41
- from typing import Callable
41
+ from typing import Any, Callable
42
42
 
43
43
  from .config import get_config
44
44
  from .harness import Harness, HarnessType
@@ -80,7 +80,7 @@ HARNESS_MODEL_MAPPING: dict[HarnessType, dict[str, str]] = {
80
80
  "codex": {
81
81
  # Codex maps to OpenAI/Codex model names
82
82
  "haiku": "gpt-5.1-codex-mini",
83
- "sonnet": "gpt-5.2-codex",
83
+ "sonnet": "gpt-5.3-codex",
84
84
  "opus": "gpt-5.1-codex-max",
85
85
  },
86
86
  "custom": {
@@ -201,19 +201,22 @@ class HarnessRunner:
201
201
  model: str | None = None,
202
202
  print_output: bool = True,
203
203
  allow_writes: bool = False,
204
+ output_format: str | None = None,
205
+ json_schema: dict[str, Any] | None = None,
206
+ max_turns: int | None = None,
204
207
  ) -> list[str]:
205
208
  """Build the harness CLI command with harness-specific flags.
206
209
 
207
210
  This method handles the differences in CLI interfaces between harness types:
208
211
 
209
212
  Claude CLI:
210
- claude --model <model> --print [--dangerously-skip-permissions] -p "<prompt>"
213
+ claude --model <model> --print [--dangerously-skip-permissions] "<prompt>"
211
214
 
212
215
  Codex CLI (non-interactive):
213
216
  codex exec --model <model> --sandbox <mode> "<prompt>"
214
217
 
215
218
  Custom (defaults to Claude-like):
216
- custom --model <model> --print [--dangerously-skip-permissions] -p "<prompt>"
219
+ custom --model <model> --print [--dangerously-skip-permissions] "<prompt>"
217
220
 
218
221
  Args:
219
222
  prompt: The prompt text to send to the harness.
@@ -228,6 +231,7 @@ class HarnessRunner:
228
231
  harness = self._get_harness()
229
232
  model = model or self._config.worker_model
230
233
  mapped_model = self._map_model(model)
234
+ effective_max_turns = max_turns if max_turns is not None else self._config.non_interactive_max_turns
231
235
 
232
236
  cmd = [harness.path]
233
237
 
@@ -239,6 +243,14 @@ class HarnessRunner:
239
243
  if print_output:
240
244
  cmd.append("--print")
241
245
 
246
+ cmd.extend(["--max-turns", str(effective_max_turns)])
247
+
248
+ if output_format is not None:
249
+ cmd.extend(["--output-format", output_format])
250
+
251
+ if json_schema is not None:
252
+ cmd.extend(["--json-schema", json.dumps(json_schema)])
253
+
242
254
  if allow_writes:
243
255
  cmd.append("--dangerously-skip-permissions")
244
256
 
@@ -283,6 +295,9 @@ class HarnessRunner:
283
295
  max_retries: int = 3,
284
296
  retry_delay: float = 60.0,
285
297
  allow_writes: bool = False,
298
+ output_format: str | None = None,
299
+ json_schema: dict[str, Any] | None = None,
300
+ max_turns: int | None = None,
286
301
  ) -> HarnessResponse:
287
302
  """
288
303
  Run a prompt through the harness CLI.
@@ -297,7 +312,14 @@ class HarnessRunner:
297
312
  Returns:
298
313
  HarnessResponse with the result
299
314
  """
300
- cmd = self._build_command(prompt, model, allow_writes=allow_writes)
315
+ cmd = self._build_command(
316
+ prompt,
317
+ model,
318
+ allow_writes=allow_writes,
319
+ output_format=output_format,
320
+ json_schema=json_schema,
321
+ max_turns=max_turns if max_turns is not None else self._config.non_interactive_max_turns,
322
+ )
301
323
 
302
324
  self._log(f"Command: {' '.join(cmd)}")
303
325
  self._log(f"Prompt:\n{prompt}\n")
@@ -313,12 +335,16 @@ class HarnessRunner:
313
335
  cwd=self.project_dir,
314
336
  capture_output=True,
315
337
  text=True,
338
+ timeout=self._config.harness_timeout_seconds,
316
339
  # Don't catch KeyboardInterrupt - let it propagate
317
340
  )
318
341
 
319
342
  output = result.stdout
320
343
  error = result.stderr
321
344
 
345
+ if output_format == "json" and self._get_harness().type == "claude":
346
+ output, error = self._extract_claude_json_result(output, error, result.returncode)
347
+
322
348
  self._log(f"Output:\n{output}\n")
323
349
  if error:
324
350
  self._log(f"Error:\n{error}\n")
@@ -356,6 +382,15 @@ class HarnessRunner:
356
382
  output="",
357
383
  error=error_msg,
358
384
  )
385
+ except subprocess.TimeoutExpired:
386
+ timeout = self._config.harness_timeout_seconds
387
+ error_msg = f"Harness timed out after {timeout}s"
388
+ self._log(f"Error: {error_msg}")
389
+ return HarnessResponse(
390
+ success=False,
391
+ output="",
392
+ error=error_msg,
393
+ )
359
394
  except Exception as e:
360
395
  error_msg = str(e)
361
396
  self._log(f"Exception: {error_msg}")
@@ -372,6 +407,77 @@ class HarnessRunner:
372
407
  rate_limited=True,
373
408
  )
374
409
 
410
+ def _extract_claude_json_result(
411
+ self, output: str, error: str, returncode: int
412
+ ) -> tuple[str, str]:
413
+ """Extract the useful result from Claude's JSON envelope."""
414
+ stripped = output.strip()
415
+ if not stripped:
416
+ return output, error
417
+
418
+ try:
419
+ payload = json.loads(stripped)
420
+ except json.JSONDecodeError:
421
+ return output, error
422
+
423
+ if isinstance(payload, dict):
424
+ result = payload.get("result", payload)
425
+ if isinstance(result, str):
426
+ normalized_output = result
427
+ else:
428
+ normalized_output = json.dumps(result, ensure_ascii=False)
429
+
430
+ if returncode != 0 and not error:
431
+ error_value = payload.get("error") or payload.get("message") or normalized_output
432
+ error = str(error_value)
433
+
434
+ return normalized_output, error
435
+
436
+ return output, error
437
+
438
+ def run_structured(
439
+ self,
440
+ prompt: str,
441
+ json_schema: dict[str, Any],
442
+ model: str | None = None,
443
+ allow_writes: bool = False,
444
+ ) -> tuple[HarnessResponse, dict[str, Any] | None]:
445
+ """Run a prompt expecting a schema-validated JSON object."""
446
+ harness = self._get_harness()
447
+ response = self.run(
448
+ prompt,
449
+ model=model,
450
+ allow_writes=allow_writes,
451
+ output_format="json" if harness.type == "claude" else None,
452
+ json_schema=json_schema if harness.type == "claude" else None,
453
+ )
454
+ if not response.success:
455
+ return response, None
456
+
457
+ try:
458
+ parsed = json.loads(response.output)
459
+ except json.JSONDecodeError as exc:
460
+ return (
461
+ HarnessResponse(
462
+ success=False,
463
+ output=response.output,
464
+ error=f"Structured output was not valid JSON: {exc}",
465
+ ),
466
+ None,
467
+ )
468
+
469
+ if not isinstance(parsed, dict):
470
+ return (
471
+ HarnessResponse(
472
+ success=False,
473
+ output=response.output,
474
+ error="Structured output was not a JSON object",
475
+ ),
476
+ None,
477
+ )
478
+
479
+ return response, parsed
480
+
375
481
  def create_prd(self, task_description: str, learnings: str = "") -> HarnessResponse:
376
482
  """
377
483
  Create a PRD (Product Requirements Document) for a task using the prd skill format.
@@ -673,25 +779,66 @@ IMPORTANT RULES:
673
779
  8. All stories start with passes: false
674
780
  9. branchName MUST start with "{branch_prefix}/" (use lowercase letters, numbers, and hyphens only)
675
781
 
676
- Output ONLY valid JSON in the exact format below, nothing else:
677
- {{
678
- "project": "{project_name}",
679
- "branchName": "{branch_prefix}/feature-name-here",
680
- "description": "Feature description here",
681
- "userStories": [
682
- {{
683
- "id": "US-001",
684
- "title": "Story title",
685
- "description": "As a [user], I want [feature] so that [benefit]",
686
- "acceptanceCriteria": ["criterion 1", "criterion 2", "Typecheck passes"],
687
- "priority": 1,
688
- "passes": false,
689
- "notes": ""
690
- }}
691
- ]
692
- }}"""
782
+ Return a JSON object only. Do not wrap it in markdown fences."""
783
+
784
+ schema: dict[str, Any] = {
785
+ "type": "object",
786
+ "required": ["project", "branchName", "description", "userStories"],
787
+ "properties": {
788
+ "project": {"type": "string", "const": project_name},
789
+ "branchName": {
790
+ "type": "string",
791
+ "pattern": rf"^{re.escape(branch_prefix)}/[a-z0-9-]+$",
792
+ },
793
+ "description": {"type": "string", "minLength": 1},
794
+ "userStories": {
795
+ "type": "array",
796
+ "minItems": 1,
797
+ "items": {
798
+ "type": "object",
799
+ "required": [
800
+ "id",
801
+ "title",
802
+ "description",
803
+ "acceptanceCriteria",
804
+ "priority",
805
+ "passes",
806
+ ],
807
+ "properties": {
808
+ "id": {"type": "string", "pattern": r"^US-[0-9]{3}$"},
809
+ "title": {"type": "string", "minLength": 1},
810
+ "description": {"type": "string", "minLength": 1},
811
+ "acceptanceCriteria": {
812
+ "type": "array",
813
+ "minItems": 1,
814
+ "items": {"type": "string", "minLength": 1},
815
+ },
816
+ "priority": {"type": "integer", "minimum": 1},
817
+ "passes": {"type": "boolean", "const": False},
818
+ "notes": {"type": "string"},
819
+ },
820
+ "additionalProperties": False,
821
+ },
822
+ },
823
+ },
824
+ "additionalProperties": False,
825
+ }
826
+
827
+ response, parsed = self.run_structured(
828
+ prompt,
829
+ json_schema=schema,
830
+ model=self._config.summary_model,
831
+ )
832
+ if not response.success or parsed is None:
833
+ return response
693
834
 
694
- return self.run(prompt, model=self._config.summary_model)
835
+ return HarnessResponse(
836
+ success=True,
837
+ output=json.dumps(parsed, indent=2, ensure_ascii=False),
838
+ error=response.error,
839
+ rate_limited=response.rate_limited,
840
+ cost=response.cost,
841
+ )
695
842
 
696
843
  def implement_story(self, story_prompt: str, context: str = "") -> HarnessResponse:
697
844
  """
@@ -739,31 +886,30 @@ After implementation, verify each acceptance criterion is met."""
739
886
  GIT DIFF OF CHANGES:
740
887
  {git_diff if git_diff else "(No changes detected)"}
741
888
 
742
- Check each acceptance criterion and determine if it has been met.
889
+ Check each acceptance criterion and determine if it has been met.
743
890
 
744
- IMPORTANT: Distinguish between these scenarios:
745
- 1. PASSES - All acceptance criteria are verifiably met (or irrelevant - see below)
746
- 2. FAILS - One or more criteria are NOT met (implementation is wrong/incomplete)
747
- 3. BLOCKED - Some criteria CANNOT BE VERIFIED due to external factors (permission issues,
748
- pre-existing errors unrelated to this change, environment constraints, etc.)
891
+ IMPORTANT: Distinguish between these scenarios:
892
+ 1. PASSES - All acceptance criteria are verifiably met
893
+ 2. FAILS - One or more criteria are NOT met (implementation is wrong/incomplete)
894
+ 3. BLOCKED - Some criteria CANNOT BE VERIFIED due to external factors (permission issues,
895
+ pre-existing errors unrelated to this change, environment constraints, etc.)
749
896
 
750
- HANDLING IRRELEVANT OR NONSENSICAL CRITERIA:
751
- - If a criterion is based on an erroneous assumption (e.g., "update file X for library Y" but
752
- file X doesn't use library Y), treat it as PASSED by virtue of being irrelevant.
753
- - If a criterion seems nonsensical or impossible, question it and explain why it doesn't apply.
754
- - These criteria may have been auto-generated by earlier processes and don't reflect reality.
755
- - Mark such criteria as passed with a note explaining why they're not applicable.
897
+ HANDLING AMBIGUOUS OR NONSENSICAL CRITERIA:
898
+ - Do NOT auto-pass criteria as "N/A" or "irrelevant".
899
+ - If a criterion is based on a bad assumption, contradictory, or impossible to satisfy as written,
900
+ mark it as FAILED and explain precisely why.
901
+ - In "To Complete", include concrete guidance to correct that criterion/story definition.
902
+ - Use BLOCKED only when verification is impossible due to external constraints, not because criteria are poor.
756
903
 
757
904
  Respond in this exact format:
758
905
 
759
906
  STATUS: PASSES or FAILS or BLOCKED
760
907
  NOTES:
761
- **What Passed:**
762
- - ✅ List each criterion that was clearly met
763
- - ✅ List criteria that are N/A with explanation (e.g., "N/A - file doesn't use this library")
908
+ **What Passed:**
909
+ - ✅ List each criterion that was clearly met
764
910
 
765
- **What Failed:**
766
- - ❌ List each criterion that was NOT met (implementation issue)
911
+ **What Failed:**
912
+ - ❌ List each criterion that was NOT met, including malformed/incorrect criteria
767
913
 
768
914
  **What Could Not Be Verified:**
769
915
  - ⚠️ List criteria that cannot be verified with reasons (e.g., "Tests execution - requires permission")
@@ -38,6 +38,11 @@ def slugify(text: str) -> str:
38
38
  return slug[:50] # Limit length
39
39
 
40
40
 
41
+ def stable_prd_id(file_path: Path) -> str:
42
+ """Generate a deterministic PRD ID from file path."""
43
+ return str(uuid.uuid5(uuid.NAMESPACE_URL, str(file_path.resolve())))
44
+
45
+
41
46
  @dataclass
42
47
  class PRD:
43
48
  """Represents a Product Requirements Document."""
@@ -58,7 +63,7 @@ class PRD:
58
63
  content = file_path.read_text().strip()
59
64
  name = file_path.stem # filename without extension
60
65
  return cls(
61
- id=str(uuid.uuid4()),
66
+ id=stable_prd_id(file_path),
62
67
  name=name,
63
68
  file_path=file_path,
64
69
  is_specced=False,
@@ -119,7 +124,7 @@ class PRD:
119
124
  status = "errored"
120
125
 
121
126
  return cls(
122
- id=str(uuid.uuid4()),
127
+ id=stable_prd_id(file_path),
123
128
  name=name,
124
129
  file_path=file_path,
125
130
  is_specced=True,
@@ -155,11 +160,22 @@ class PRDManager:
155
160
 
156
161
  def __init__(self, project_dir: Path):
157
162
  self.project_dir = project_dir
158
- self.prd_dir = project_dir / "PRD"
163
+ self.prd_dir = self._resolve_prd_dir()
159
164
  self._ensure_prd_dir()
160
165
  self._prds: list[PRD] = []
161
166
  self._load()
162
167
 
168
+ def _resolve_prd_dir(self) -> Path:
169
+ """Resolve the active PRD directory, supporting the legacy singular path."""
170
+ plural_dir = self.project_dir / "PRDs"
171
+ legacy_dir = self.project_dir / "PRD"
172
+
173
+ if plural_dir.exists():
174
+ return plural_dir
175
+ if legacy_dir.exists():
176
+ return legacy_dir
177
+ return plural_dir
178
+
163
179
  def _ensure_prd_dir(self) -> None:
164
180
  """Ensure PRD directory exists."""
165
181
  self.prd_dir.mkdir(parents=True, exist_ok=True)