stellars-claude-code-plugins 0.8.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stellars_claude_code_plugins/__init__.py +3 -0
- stellars_claude_code_plugins/engine/__init__.py +35 -0
- stellars_claude_code_plugins/engine/fsm.py +258 -0
- stellars_claude_code_plugins/engine/model.py +376 -0
- stellars_claude_code_plugins/engine/orchestrator.py +2444 -0
- stellars_claude_code_plugins-0.8.44.dist-info/METADATA +126 -0
- stellars_claude_code_plugins-0.8.44.dist-info/RECORD +11 -0
- stellars_claude_code_plugins-0.8.44.dist-info/WHEEL +5 -0
- stellars_claude_code_plugins-0.8.44.dist-info/entry_points.txt +2 -0
- stellars_claude_code_plugins-0.8.44.dist-info/licenses/LICENSE +21 -0
- stellars_claude_code_plugins-0.8.44.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2444 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""YAML-driven declarative build iteration orchestrator.
|
|
3
|
+
|
|
4
|
+
All content loaded from YAML resources (phases, agents, workflow types,
|
|
5
|
+
guardian checklist, display strings). The engine is content-agnostic -
|
|
6
|
+
each plugin provides its own YAML resource files.
|
|
7
|
+
|
|
8
|
+
10-command CLI with 2 calls per phase (start + end).
|
|
9
|
+
Stateful phases, agent review, automated testing, independent gatekeeper.
|
|
10
|
+
|
|
11
|
+
State: <artifacts_dir>/state.yaml
|
|
12
|
+
Audit: <artifacts_dir>/log.yaml
|
|
13
|
+
Failures: <artifacts_dir>/failures.yaml
|
|
14
|
+
Hypotheses: <artifacts_dir>/hypotheses.yaml
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import collections
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
import os
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
import re
|
|
23
|
+
import subprocess
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
import yaml
|
|
27
|
+
|
|
28
|
+
from stellars_claude_code_plugins.engine.model import (
|
|
29
|
+
load_model,
|
|
30
|
+
validate_model,
|
|
31
|
+
_resolve_key,
|
|
32
|
+
_KNOWN_VARS as _KNOWN_TEMPLATE_VARS,
|
|
33
|
+
)
|
|
34
|
+
from stellars_claude_code_plugins.engine.fsm import (
|
|
35
|
+
resolve_phase_key,
|
|
36
|
+
build_phase_lifecycle_fsm,
|
|
37
|
+
State as FSMState,
|
|
38
|
+
Event as FSMEvent,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# ── Module-level state (set by _initialize) ────────────────────────

# Working directory at import time; all artifact paths hang off this.
PROJECT_ROOT = Path.cwd()

# Loaded YAML model (phases, agents, workflow types, display strings).
_MODEL = None
# Artifact file paths — all resolved under DEFAULT_ARTIFACTS_DIR by _initialize().
DEFAULT_ARTIFACTS_DIR = None
STATE_FILE = None
LOG_FILE = None
FAILURES_FILE = None
HYPOTHESES_FILE = None
CONTEXT_FILE = None
# CLI invocation string shown in user-facing messages (e.g. "python orchestrate.py").
CMD = None
# Display characters/widths for separator and header lines.
_SEP_CHAR = None
_SEP_WIDTH = None
_HDR_CHAR = None
_HDR_WIDTH = None
# Phase-lifecycle finite state machine and the set of its valid state values.
_PHASE_FSM = None
_FSM_STATE_VALUES = None
# Derived lookup tables built from the model in _initialize().
ITERATION_TYPES = {}
PHASE_AGENTS = {}
_PHASE_START = {}
_PHASE_END = {}

# Maps auto-action names from phases.yaml to handler callables.
_AUTO_ACTION_REGISTRY = {}

# Guards against use before _initialize() has run.
_initialized = False
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _initialize(resources_dir: Path) -> None:
    """Load model from YAML resources and set up all module-level state.

    Called once by main() before any command handler runs. This defers
    model loading so the engine module can be imported without requiring
    a specific resources directory.

    Args:
        resources_dir: directory containing the plugin's YAML resource
            files (phases, agents, workflow types, app config).
    """
    global _MODEL, DEFAULT_ARTIFACTS_DIR, STATE_FILE, LOG_FILE, FAILURES_FILE
    global HYPOTHESES_FILE, CONTEXT_FILE, CMD, _SEP_CHAR, _SEP_WIDTH
    global _HDR_CHAR, _HDR_WIDTH, _PHASE_FSM, _FSM_STATE_VALUES
    global ITERATION_TYPES, PHASE_AGENTS, _PHASE_START, _PHASE_END
    global _AUTO_ACTION_REGISTRY, _initialized

    _MODEL = load_model(resources_dir)

    # All artifact files live under the model-configured artifacts dir.
    DEFAULT_ARTIFACTS_DIR = PROJECT_ROOT / _MODEL.app.artifacts_dir
    STATE_FILE = DEFAULT_ARTIFACTS_DIR / "state.yaml"
    LOG_FILE = DEFAULT_ARTIFACTS_DIR / "log.yaml"
    FAILURES_FILE = DEFAULT_ARTIFACTS_DIR / "failures.yaml"
    HYPOTHESES_FILE = DEFAULT_ARTIFACTS_DIR / "hypotheses.yaml"
    CONTEXT_FILE = DEFAULT_ARTIFACTS_DIR / "context.yaml"
    CMD = _MODEL.app.cmd or "python orchestrate.py"
    _SEP_CHAR = _MODEL.app.display.separator
    _SEP_WIDTH = _MODEL.app.display.separator_width
    _HDR_CHAR = _MODEL.app.display.header_char
    _HDR_WIDTH = _MODEL.app.display.header_width

    _PHASE_FSM = build_phase_lifecycle_fsm()
    _FSM_STATE_VALUES = {s.value for s in FSMState}

    # Build ITERATION_TYPES from model.workflow_types.
    # clear()+update() (rather than rebinding) preserves identity for any
    # code that imported these dicts before _initialize ran.
    ITERATION_TYPES.clear()
    ITERATION_TYPES.update({
        name: {
            "description": wt.description,
            "phases": wt.phase_names,
            "required": wt.required,
            "skippable": wt.skippable,
        }
        for name, wt in _MODEL.workflow_types.items()
    })

    # Extract flat agent name lists from model.agents
    PHASE_AGENTS.clear()
    PHASE_AGENTS.update({
        phase: [a.name for a in agents]
        for phase, agents in _MODEL.agents.items()
    })

    # Populate _PHASE_START and _PHASE_END from model.phases.
    # Each entry is a zero-arg closure that renders the phase template.
    _PHASE_START.clear()
    _PHASE_END.clear()
    for phase_name in _MODEL.phases:
        _PHASE_START[phase_name] = _make_phase_callable(phase_name, "start")
        _PHASE_END[phase_name] = _make_phase_callable(phase_name, "end")

    # Auto-action registry: names referenced by phases.yaml auto_actions.
    _AUTO_ACTION_REGISTRY.clear()
    _AUTO_ACTION_REGISTRY.update({
        "hypothesis_autowrite": _action_hypothesis_autowrite,
        "hypothesis_gc": _action_hypothesis_gc,
        "plan_save": _action_plan_save,
        "iteration_summary": _action_iteration_summary,
        "iteration_advance": _action_iteration_advance,
    })

    _initialized = True
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ── FSM helpers ─────────────────────────────────────────────────────
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _fire_fsm(event: FSMEvent, state: dict) -> FSMState:
    """Fire an FSM event and keep the persisted phase_status in sync.

    The FSM is re-seeded from the state dict before the transition fires,
    and the resulting state is written back afterwards, so every
    phase_status mutation flows through this single function.
    """
    persisted = state.get("phase_status", "pending")
    if persisted in _FSM_STATE_VALUES:
        _PHASE_FSM.current_state = FSMState(persisted)
    else:
        # Unknown/corrupt status on disk: restart from PENDING.
        _PHASE_FSM.current_state = FSMState.PENDING
    result = _PHASE_FSM.fire(event)
    state["phase_status"] = result.value
    return result
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ── Display helpers ─────────────────────────────────────────────────
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _msg(key: str, **kwargs) -> str:
    """Render a message template from app.yaml with the given variables.

    This is the display-text abstraction layer: every user-facing CLI
    string is looked up here, keeping the Python engine content-agnostic.
    Rendering uses format_map over a defaultdict(str), so variables the
    template mentions but the caller omits become empty strings rather
    than raising KeyError. An unknown key renders the key itself.
    """
    base = {
        "cmd": CMD,
        "separator_line": _SEP_CHAR * _SEP_WIDTH,
        "header_line": _HDR_CHAR * _HDR_WIDTH,
        **kwargs,
    }
    template = _MODEL.app.messages.get(key, key)
    return template.format_map(collections.defaultdict(str, base))
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _cli(section: str, key: str) -> str:
    """Look up a CLI help string from app.yaml.

    Supplies argparse descriptions and help text from YAML so CLI
    documentation can be customised without touching Python code.
    Supports the top-level "description" and "epilog" sections plus
    nested command/argument help via (section, key). Unknown keys fall
    back to the key itself; "{cmd}" placeholders are substituted.
    """
    cli = _MODEL.app.cli
    if section == "description":
        return cli.description
    if section == "epilog":
        return cli.epilog.format_map(collections.defaultdict(str, {"cmd": CMD}))
    lookup = cli.commands if section == "commands" else cli.args
    text = lookup.get(key, key)
    if "{" in text:
        return text.format_map(collections.defaultdict(str, {"cmd": CMD}))
    return text
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ── Exposed data structures ────────────────────────────────────────
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _guardian_checklist() -> str:
    """Return the first non-empty guardian checklist found in the model.

    Scans every phase's agent list (in model order) for an agent named
    "guardian" that carries a checklist, and returns that checklist text.
    The result feeds the {{checklist}} template variable assembled in
    _build_context() and is used by guardian agents in both PLAN and
    REVIEW phases. Returns "" when no guardian checklist exists.
    """
    guardians = (
        agent
        for agent_list in _MODEL.agents.values()
        for agent in agent_list
        if agent.name == "guardian" and agent.checklist
    )
    match = next(guardians, None)
    return match.checklist if match else ""
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _current_workflow_type() -> str:
    """Return the workflow type recorded in state.yaml, defaulting to 'full'."""
    current = _load_state() or {}
    return current.get("type", "full")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _resolve_phase(phase: str) -> str:
    """Resolve a phase name to its namespaced key in phases.yaml.

    Delegates to resolve_phase_key() with the current workflow type,
    which applies the WORKFLOW::PHASE -> PHASE -> FULL::PHASE fallback.
    """
    return resolve_phase_key(_current_workflow_type(), phase, _MODEL.phases)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _resolve_agents(phase: str) -> str:
    """Resolve a phase name to its namespaced key in agents.yaml.

    Same fallback chain as _resolve_phase(), but against the agents map.
    """
    return resolve_phase_key(_current_workflow_type(), phase, _MODEL.agents)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _resolve_gate(phase: str, gate_type: str) -> str:
    """Resolve a gate key for a phase using the :: fallback chain.

    Gate keys are namespaced: FULL::RESEARCH::readback, FULL::TEST::gatekeeper.
    The phase prefix is resolved with the same WORKFLOW::PHASE -> PHASE ->
    FULL::PHASE chain, then the gate_type suffix is re-attached.
    """
    # Collect the phase prefixes of every gate key ending in this gate type.
    gate_phases = set()
    for gate_key in _MODEL.gates:
        if "::" not in gate_key:
            continue
        prefix, _, suffix = gate_key.rpartition("::")
        if suffix == gate_type:
            gate_phases.add(prefix)
    resolved = _resolve_key(_current_workflow_type(), phase, gate_phases)
    return f"{resolved}::{gate_type}"
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _build_agent_instructions(phase: str, ctx: dict | None = None) -> str:
    """Generate formatted agent instructions from model agents for a phase.

    Produces '### Agent N: DISPLAY_NAME' sections in the format the v1
    engine hardcoded in phase templates. An agent's checklist, when
    present, is appended to its prompt; template variables such as
    {checklist} inside prompts are then filled from *ctx* (missing
    variables become empty strings). Returns "" when the resolved phase
    has no agents. Called by _build_context() to populate the
    {{agents_instructions}} template variable.
    """
    # Resolve namespaced agent key (FULL::RESEARCH, etc.) with fallback.
    key = _resolve_agents(phase)
    agents = _MODEL.agents.get(key, [])
    if not agents:
        return ""

    sections: list[str] = []
    for agent in agents:
        text = agent.prompt
        if agent.checklist:
            # Checklist rides along at the bottom of the prompt.
            text = f"{text.rstrip()}\n\n{agent.checklist}"
        if ctx and "{" in text:
            text = text.format_map(collections.defaultdict(str, ctx))
        sections.append(f"### Agent {agent.number}: {agent.display_name}")
        sections.append(text.rstrip())
        sections.append("")
    return "\n".join(sections).rstrip()
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# ── Build context for template rendering ────────────────────────────
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _build_context(state: dict | None = None, phase: str = "", event: str = "") -> dict:
    """Compute all template variables from state for phase rendering.

    This is the central factory that every phase callable uses to
    assemble the context dict for str.format_map(). Computes dynamic
    content from iteration state: prior failures, hypothesis catalogue,
    benchmark info, iteration plan. Also generates spawn instructions
    and agent instructions from agents.yaml.

    Args:
        state: current iteration state from state.yaml
        phase: phase name for agent instruction lookup
        event: 'start' or 'end' to select correct agent set
            NOTE(review): `event` is not referenced anywhere in this body;
            either dead parameter or reserved for future use — confirm.
    """
    s = state or {}

    # Prior failures context: show at most the last 5 failure entries.
    prior_context = ""
    all_failures = _load_yaml_list(FAILURES_FILE)
    if all_failures:
        prior_context = f"\n**Prior failures** ({len(all_failures)} total):\n"
        for f in all_failures[-5:]:
            prior_context += (
                f" - [{f.get('mode', '?')}] "
                f"(iter {f.get('iteration', '?')}) "
                f"{f.get('description', '?')}\n"
            )

    # Plan context from iteration 0 (truncated to 300 chars for the template).
    plan_context = ""
    iteration_plan = s.get("iteration_plan", "")
    iteration = s.get("iteration", 1)
    if iteration_plan and iteration > 0:
        plan_context = (
            f"\n**Iteration plan** (from planning iteration 0):\n{iteration_plan[:300]}\n"
        )

    # Hypothesis catalogue summary (skipped when the catalogue is empty).
    prior_hyp = ""
    catalogue = _hypothesis_catalogue_summary()
    if catalogue and catalogue != "(no hypotheses yet)":
        prior_hyp = f"\n**Hypothesis catalogue** (rate, review, evolve this list):\n{catalogue}\n"

    # Benchmark info: wording differs for first run vs. subsequent runs.
    benchmark_info = ""
    benchmark_cmd = s.get("benchmark_cmd", "")
    if benchmark_cmd:
        scores = s.get("benchmark_scores", [])
        if scores:
            last = scores[-1]["score"]
            benchmark_info = f"""
**Benchmark**: `{benchmark_cmd}` (last score: {last})
The benchmark runs automatically after tests pass. Score is tracked across
iterations - lower is better. The trend is shown in the output."""
        else:
            benchmark_info = f"""
**Benchmark**: `{benchmark_cmd}` (no prior score - first run)
The benchmark runs automatically after tests pass. It must output a numeric
value. This score will be tracked across iterations - lower is better."""

    # Iteration purpose - explains what this iteration is about.
    # Dependency workflows get their banner; otherwise iterations > 0 with
    # a saved plan get the "iteration N of M" banner.
    total_iters = s.get("total_iterations", 1)
    itype = s.get("type", "full")
    wf_def = _MODEL.workflow_types.get(itype)
    if wf_def and wf_def.dependency:
        iteration_purpose = "\n" + _msg(
            "dependency_banner", description=wf_def.description
        ) + "\n"
    elif iteration > 0 and iteration_plan:
        iteration_purpose = "\n" + _msg(
            "iteration_n_banner", iteration=iteration, total=total_iters
        ) + "\n"
    else:
        iteration_purpose = ""

    ctx = {
        "CMD": CMD,
        "objective": s.get("objective", "not set"),
        "iteration": iteration,
        "iteration_purpose": iteration_purpose,
        "total": total_iters,
        "remaining": total_iters - iteration,
        "prior_context": prior_context,
        "plan_context": plan_context,
        "prior_hyp": prior_hyp,
        "checklist": _guardian_checklist(),
        "benchmark_info": benchmark_info,
    }
    # Agent instructions - resolve via :: namespace (FULL::PLAN has agents for end review)
    # NOTE(review): _build_agent_instructions() calls _resolve_agents() again
    # on this already-resolved key; presumably resolution is idempotent for
    # keys present in the agents map — confirm.
    agent_phase_key = _resolve_agents(phase or s.get("current_phase", ""))
    ctx["agents_instructions"] = _build_agent_instructions(agent_phase_key, ctx)

    # Spawn instruction - derived from agent count
    _NUM_WORDS = {1: "ONE", 2: "TWO", 3: "THREE", 4: "FOUR", 5: "FIVE", 6: "SIX"}
    agent_count = len(_MODEL.agents.get(agent_phase_key, []))
    spawn_mode = "PARALLEL"  # all agents spawn in parallel
    if agent_count > 0:
        word = _NUM_WORDS.get(agent_count, str(agent_count))
        ctx["spawn_instruction"] = (
            f"**MANDATORY: Spawn {word} SEPARATE agents IN {spawn_mode}** "
            f"(single message, {word} Agent tool calls)."
        )
    else:
        ctx["spawn_instruction"] = ""

    # PLAN end variant with "to review the plan" suffix
    if agent_count > 0:
        word = _NUM_WORDS.get(agent_count, str(agent_count))
        ctx["spawn_instruction_plan"] = (
            f"**MANDATORY: Spawn {word} SEPARATE agents IN {spawn_mode} "
            f"to review the plan** (single message, {word} Agent tool calls)."
        )
    else:
        ctx["spawn_instruction_plan"] = ""

    return ctx
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
# ── Phase instruction registry (YAML-driven) ────────────────────────
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _make_phase_callable(phase: str, event: str) -> object:
    """Create a zero-arg callable that loads state and renders a phase template.

    Registered in _PHASE_START/_PHASE_END, these closures bridge the YAML
    templates and the orchestrator. Each call loads the current state,
    builds the render context, picks the template attribute (handling the
    NEXT phase's continue/final variants), and renders it with format_map
    (missing variables become empty strings). Falls back to a plain
    "Phase <name> <event>" string when no template exists.
    """

    def _callable():
        """Render the model Phase template for this phase/event from fresh state."""
        state = _load_state()
        ctx = _build_context(state, phase=phase, event=event)
        template_key = event
        # NEXT carries remaining/final template variants per event.
        if phase == "NEXT":
            variant = "continue" if ctx["remaining"] > 0 else "final"
            if event == "start":
                template_key = f"start_{variant}"
            elif event == "end":
                template_key = f"end_{variant}"
        phase_obj = _MODEL.phases.get(_resolve_phase(phase))
        template = getattr(phase_obj, template_key, "") if phase_obj else ""
        if not template:
            template = f"Phase {phase} {event}"
        return template.format_map(collections.defaultdict(str, ctx))

    return _callable
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
# ── Auto-action handlers ──────────────────────────────────────────
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _action_hypothesis_autowrite(state: dict, phase: str):
    """Persist hypotheses parsed from this phase's recorded output, if any."""
    content = state.get("phase_outputs", {}).get(phase, "")
    if not content:
        return
    _auto_write_hypotheses(content, state.get("iteration", 0))
|
|
432
|
+
|
|
433
|
+
def _action_hypothesis_gc(state: dict, phase: str):
    """Print the GC banner, then run hypothesis garbage collection."""
    banner = (
        "\n" + _msg("auto_separator"),
        _msg("auto_hypothesis_gc"),
        _msg("auto_separator"),
    )
    for line in banner:
        print(line)
    _run_hypothesis_gc()
|
|
438
|
+
|
|
439
|
+
def _action_iteration_summary(state: dict, phase: str):
    """Print the iteration summary and, when NEXT follows, its start text."""
    print("\n" + _msg("auto_separator"))
    print(_msg("auto_summary"))
    print(_msg("auto_separator"))
    _run_summary(state)
    # When the workflow's next phase is NEXT, immediately surface its
    # start instructions so the run can proceed autonomously.
    if _next_phase(state) == "NEXT":
        print("\n" + _msg("auto_separator"))
        print(_msg("auto_next"))
        print(_msg("auto_autonomous"))
        print(_msg("auto_separator"))
        render_next = _PHASE_START.get("NEXT", lambda: "")
        print(render_next())
|
|
452
|
+
|
|
453
|
+
def _action_iteration_advance(state: dict, phase: str):
    """Advance to the next iteration, then signal early return.

    Returns the sentinel string "return" so _run_auto_actions() stops
    processing further on_complete actions for this phase.
    """
    _run_next_iteration(state)
    return "return"
|
|
456
|
+
|
|
457
|
+
def _action_plan_save(state: dict, phase: str):
    """Save PLAN output as plan.yaml for dependency workflows.

    No-op unless the current workflow type is a dependency workflow and
    the phase actually recorded output.
    """
    wf_def = _MODEL.workflow_types.get(state.get("type", ""))
    if wf_def is None or not wf_def.dependency:
        return
    content = state.get("phase_outputs", {}).get(phase, "")
    if not content:
        return
    target = DEFAULT_ARTIFACTS_DIR / "plan.yaml"
    payload = {
        "objective": state.get("objective", ""),
        "total_iterations": state.get("total_iterations", 1),
        "plan": content,
        "created_at": _now(),
    }
    target.write_text(_yaml_dump(payload))
    print(_msg("plan_saved", path=target))
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _run_auto_actions(phase: str, state: dict) -> bool:
    """Run auto_actions.on_complete for the resolved phase.

    Handlers are looked up in _AUTO_ACTION_REGISTRY by name; unknown
    names are silently skipped. Returns True as soon as a handler
    signals early return (by returning "return"), False otherwise.
    """
    phase_obj = _MODEL.phases.get(_resolve_phase(phase))
    if phase_obj is None or not phase_obj.auto_actions:
        return False
    for action_name in phase_obj.auto_actions.get("on_complete", []):
        handler = _AUTO_ACTION_REGISTRY.get(action_name)
        if handler is None:
            continue
        if handler(state, phase) == "return":
            return True
    return False
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
# ── Helper functions ─────────────────────────────────────────────────
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def _now() -> str:
|
|
496
|
+
"""Return current UTC timestamp as ISO 8601 string."""
|
|
497
|
+
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _load_state() -> dict | None:
    """Load iteration state from state.yaml, or None when the file is absent."""
    if not STATE_FILE.exists():
        return None
    return yaml.safe_load(STATE_FILE.read_text())
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def _yaml_dump(data: object) -> str:
    """Dump data to YAML with literal block style for readable output.

    Multiline strings — and long single-line strings, word-wrapped at 80
    chars — are tagged with a private LiteralStr marker type and emitted
    as literal block scalars (|) instead of quoted strings. This produces
    human-readable state.yaml and log.yaml files.

    Fixes over the previous version:
    - the representer is registered on a private Dumper subclass instead
      of mutating the shared global yaml.Dumper class on every call;
    - _wrap_long no longer emits a spurious empty first line when the
      first word alone exceeds the wrap width.
    """

    class LiteralStr(str):
        # Marker type: values of this type dump as | block scalars.
        pass

    class _LiteralDumper(yaml.Dumper):
        # Private subclass so the representer cannot leak into other
        # yaml.dump() calls elsewhere in the process.
        pass

    def _literal_representer(dumper, data):
        """YAML representer that outputs strings as literal block scalars."""
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")

    def _wrap_long(text: str, width: int = 80) -> str:
        """Wrap a long single-line string into multiple lines at word boundaries."""
        if len(text) <= width:
            return text
        lines = []
        current = ""
        for word in text.split():
            # Guard on `current` so a first word longer than the width
            # becomes its own line instead of pushing an empty one.
            if current and len(current) + len(word) + 1 > width:
                lines.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word
        if current:
            lines.append(current)
        return "\n".join(lines)

    def _prepare(obj):
        """Recursively convert long or multiline strings to LiteralStr."""
        if isinstance(obj, dict):
            return {k: _prepare(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [_prepare(v) for v in obj]
        if isinstance(obj, str) and ("\n" in obj or len(obj) > 80):
            text = _wrap_long(obj) if "\n" not in obj else obj
            if not text.endswith("\n"):
                # A trailing newline keeps PyYAML's | block output clean.
                text += "\n"
            return LiteralStr(text)
        return obj

    _LiteralDumper.add_representer(LiteralStr, _literal_representer)
    return yaml.dump(
        _prepare(data),
        Dumper=_LiteralDumper,
        default_flow_style=False,
        sort_keys=False,
    )
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def _save_state(state: dict) -> None:
    """Write current iteration state dict to state.yaml.

    Called after every state mutation (phase transitions, agent recording,
    gatekeeper results, rejections) to persist progress to disk.
    Serialises via _yaml_dump() for readable literal-block output.
    """
    STATE_FILE.write_text(_yaml_dump(state))
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def _save_objective(objective: str, iterations: int) -> None:
    """Write the objective and iteration count to objective.yaml."""
    payload = {
        "objective": objective,
        "iterations": iterations,
        "created_at": _now(),
    }
    target = DEFAULT_ARTIFACTS_DIR / "objective.yaml"
    target.write_text(_yaml_dump(payload))
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def _load_yaml_list(path: Path) -> list[dict]:
    """Load a YAML list file; return [] when missing or not a list."""
    if not path.exists():
        return []
    loaded = yaml.safe_load(path.read_text())
    if isinstance(loaded, list):
        return loaded
    return []
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def _append_yaml_entry(path: Path, entry: dict) -> None:
    """Append an entry to a YAML list file.

    Read-modify-write: loads the whole list, appends, rewrites the file.
    NOTE(review): not safe against concurrent writers — confirm single-
    process usage is guaranteed by the CLI.
    """
    entries = _load_yaml_list(path)
    entries.append(entry)
    path.write_text(_yaml_dump(entries))
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def _append_log(entry: dict) -> None:
    """Append a timestamped entry to the audit log (log.yaml).

    Mutates *entry* in place by stamping its "timestamp" key.
    """
    entry["timestamp"] = _now()
    _append_yaml_entry(LOG_FILE, entry)
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def _append_failure(entry: dict) -> None:
    """Append a timestamped failure entry to failures.yaml.

    Mutates *entry* in place by stamping its "timestamp" key.
    """
    entry["timestamp"] = _now()
    _append_yaml_entry(FAILURES_FILE, entry)
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def _append_hypothesis(entry: dict) -> None:
    """Add or update hypothesis in the catalogue (hypotheses.yaml).

    The catalogue is a persistent list of ALL hypotheses across iterations,
    not per-iteration snapshots. Each entry has: id, hypothesis, predict,
    evidence, risk, status, votes, avg_score.

    NOTE(review): despite "or update" in the summary, this only appends —
    no deduplication by id happens here; confirm _run_hypothesis_gc (not
    shown) handles duplicates. Mutates *entry* by stamping "timestamp".
    """
    entry["timestamp"] = _now()
    _append_yaml_entry(HYPOTHESES_FILE, entry)
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def _auto_write_hypotheses(output_content: str, iteration: int) -> None:
    """Extract structured hypothesis entries from HYPOTHESIS phase output.

    Splits the output on ID: boundaries and parses each block for
    structured fields (ID/HYPOTHESIS/PREDICT/EVIDENCE/RISK/STARS/
    WHAT TO DO/STATUS). Writes valid entries to hypotheses.yaml. Entries
    missing any required field are skipped with a warning message.

    Args:
        output_content: raw text recorded for the phase.
        iteration: iteration number stamped onto every written entry.
    """
    required_fields = {"id", "hypothesis", "predict", "evidence", "risk"}
    fields_to_parse = ["ID", "HYPOTHESIS", "PREDICT", "EVIDENCE", "RISK",
                       "STARS", "WHAT TO DO", "STATUS"]

    # Split on ID: boundaries to isolate each hypothesis block.
    # Lookahead keeps the "ID:" line inside each block.
    blocks = re.split(r"(?=^ID:\s)", output_content, flags=re.MULTILINE)

    entries = []
    for block in blocks:
        if not block.strip():
            continue
        entry: dict = {}
        for line in block.split("\n"):
            stripped = line.strip()
            # Field labels are matched case-insensitively via upper();
            # the value keeps its original case.
            for field in fields_to_parse:
                if stripped.upper().startswith(field + ":"):
                    value = stripped[len(field) + 1:].strip()
                    key = field.lower().replace(" ", "_")
                    if key == "stars":
                        # STARS like "4.5/5": numeric part -> avg_score,
                        # full text -> votes. Unparseable -> 0.0.
                        try:
                            entry["avg_score"] = float(value.split("/")[0])
                        except (ValueError, IndexError):
                            entry["avg_score"] = 0.0
                        entry["votes"] = value
                    else:
                        entry[key] = value
                    break
        if entry.get("id"):
            entries.append(entry)

    written = 0
    for entry in entries:
        missing = required_fields - set(entry.keys())
        if missing:
            print(_msg("auto_hypothesis_warn", hid=entry.get("id", "?"), missing=str(missing)))
            continue
        # Defaults for optional fields before persisting.
        entry.setdefault("status", "proposed")
        entry.setdefault("votes", "")
        entry.setdefault("avg_score", 0.0)
        entry["iteration"] = iteration
        _append_hypothesis(entry)
        written += 1

    if written:
        print(_msg("auto_hypothesis_wrote", count=written))
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
def _load_context() -> dict:
    """Load context messages from context.yaml.

    Returns a dict mapping phase names to message strings; an empty dict
    when the file does not exist (first run) or does not contain a mapping.
    """
    if not CONTEXT_FILE.exists():
        return {}
    loaded = yaml.safe_load(CONTEXT_FILE.read_text())
    if isinstance(loaded, dict):
        return loaded
    return {}
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def _save_context(ctx: dict) -> None:
    """Save context messages (phase name -> message) to context.yaml."""
    CONTEXT_FILE.write_text(_yaml_dump(ctx))
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
def _load_prior_hypotheses() -> list[dict]:
    """Load the full hypothesis catalogue for agents to review.

    Thin wrapper over _load_yaml_list on hypotheses.yaml; [] when absent.
    """
    return _load_yaml_list(HYPOTHESES_FILE)
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
def _hypothesis_catalogue_summary() -> str:
    """Format the hypothesis catalogue as one line per entry for agent context.

    Hypothesis text is truncated to 100 characters; returns the literal
    placeholder "(no hypotheses yet)" when the catalogue is empty.
    """
    catalogue = _load_prior_hypotheses()
    if not catalogue:
        return "(no hypotheses yet)"
    rows = [
        f" {h.get('id', '?')} ({h.get('avg_score', '?')}/5, "
        f"{h.get('status', '?')}): {h.get('hypothesis', '?')[:100]}"
        for h in catalogue
    ]
    return "\n".join(rows)
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def _phase_dir(state: dict) -> Path:
    """Get/create the phase artifacts subfolder: phase_NN_name/.

    The folder index is the 1-based position of the current phase in the
    workflow sequence (0 when the phase is not in the sequence).
    """
    phases = ITERATION_TYPES[state["type"]]["phases"]
    phase = state["current_phase"]
    if phase in phases:
        position = phases.index(phase) + 1
    else:
        position = 0
    folder = DEFAULT_ARTIFACTS_DIR / f"phase_{position:02d}_{phase.lower()}"
    folder.mkdir(parents=True, exist_ok=True)
    return folder
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def _next_phase(state: dict) -> str | None:
    """Return the phase following the current one, or None at the end.

    Looks up the current phase in the workflow's ordered phase list
    (from ITERATION_TYPES). Used by cmd_end to advance the state
    machine; also returns None when the current phase is unknown.
    """
    sequence = ITERATION_TYPES[state["type"]]["phases"]
    current = state["current_phase"]
    if current not in sequence:
        return None
    pos = sequence.index(current)
    return sequence[pos + 1] if pos + 1 < len(sequence) else None
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
def _prev_implementable(state: dict) -> str:
    """Find the phase to roll back to when a reviewer rejects.

    Walks backward from the current phase looking for IMPLEMENT; falls
    back to the workflow's first phase when no IMPLEMENT precedes it.
    Used by cmd_reject and the TEST auto-reject to pick the rollback
    target.

    Fix: the original called phases.index() unguarded, raising
    ValueError when the current phase is absent from the workflow
    sequence; the sibling _next_phase already guards this case, so
    this now does too (falling back to the first phase).
    """
    phases = ITERATION_TYPES[state["type"]]["phases"]
    try:
        idx = phases.index(state["current_phase"])
    except ValueError:
        return phases[0]
    # Any IMPLEMENT strictly before the current phase is the rollback target.
    if "IMPLEMENT" in phases[:idx]:
        return "IMPLEMENT"
    return phases[0]
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
def _count_iteration_failures(iteration: int) -> list[dict]:
    """Return failure-log entries recorded for the given iteration."""
    entries = _load_yaml_list(FAILURES_FILE)
    return [entry for entry in entries if entry.get("iteration") == iteration]
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
def _init_artifacts_dir(artifacts_dir: Path | None = None) -> None:
    """Create the artifacts directory and rebind the module-level paths.

    Mutates STATE_FILE, LOG_FILE, FAILURES_FILE, HYPOTHESES_FILE and
    CONTEXT_FILE so that every helper reads and writes inside the
    chosen directory. Called once in main() before any command handler
    runs.
    """
    global STATE_FILE, LOG_FILE, FAILURES_FILE, HYPOTHESES_FILE, CONTEXT_FILE  # noqa: PLW0603
    base = artifacts_dir if artifacts_dir is not None else DEFAULT_ARTIFACTS_DIR
    base.mkdir(parents=True, exist_ok=True)
    STATE_FILE = base / "state.yaml"
    LOG_FILE = base / "log.yaml"
    FAILURES_FILE = base / "failures.yaml"
    HYPOTHESES_FILE = base / "hypotheses.yaml"
    CONTEXT_FILE = base / "context.yaml"
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
def _read_last_iteration(artifacts_dir: Path | None = None) -> int:
    """Return the iteration number stored in state.yaml, or 0 if unknown.

    Read BEFORE cleaning the artifacts directory, since cleaning wipes
    the state file.
    """
    state_path = (artifacts_dir or DEFAULT_ARTIFACTS_DIR) / "state.yaml"
    if not state_path.exists():
        return 0
    try:
        # safe_load may yield None/non-dict; AttributeError is caught below.
        parsed = yaml.safe_load(state_path.read_text())
        return parsed.get("iteration", 0)
    except (yaml.YAMLError, KeyError, AttributeError):
        return 0
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def _clean_artifacts_dir(artifacts_dir: Path | None = None) -> None:
|
|
795
|
+
"""Clean artifacts directory for fresh run.
|
|
796
|
+
|
|
797
|
+
Preserves hypotheses*.yaml, hypotheses_archive.yaml, and context.yaml.
|
|
798
|
+
"""
|
|
799
|
+
d = artifacts_dir or DEFAULT_ARTIFACTS_DIR
|
|
800
|
+
if d.exists():
|
|
801
|
+
for f in d.iterdir():
|
|
802
|
+
if f.is_file():
|
|
803
|
+
# Preserve hypothesis and context files across clean
|
|
804
|
+
if f.name.startswith("hypotheses") or f.name == "context.yaml":
|
|
805
|
+
continue
|
|
806
|
+
f.unlink()
|
|
807
|
+
elif f.is_dir():
|
|
808
|
+
import shutil
|
|
809
|
+
|
|
810
|
+
shutil.rmtree(f)
|
|
811
|
+
d.mkdir(parents=True, exist_ok=True)
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
# ── Programmatic verification ────────────────────────────────────────
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
def _verify_test_phase(state: dict | None = None) -> tuple[bool, str]:
    """Run the automated test suite, linter, and optional benchmark step.

    Executed automatically during the TEST phase's cmd_end. Runs
    `make test` and `make lint` (120s timeout each) from PROJECT_ROOT.
    A failing or timed-out command short-circuits with (False, report),
    which auto-rejects back to the IMPLEMENT phase. When a benchmark
    instruction is configured on the state (via --benchmark on
    cmd_new), the generative-evaluation directions are appended to the
    report for the orchestrating agent.
    """
    report: list[str] = []
    for label, command in (("test", "make test"), ("lint", "make lint")):
        try:
            proc = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                timeout=120,
                cwd=str(PROJECT_ROOT),
            )
        except subprocess.TimeoutExpired:
            report.append(f"{label}: TIMEOUT (120s)")
            return False, "\n".join(report)
        except FileNotFoundError:
            # Missing target is tolerated; move on to the next command.
            report.append(f"{label}: Makefile target not found, skipping")
            continue
        if proc.returncode != 0:
            tail = proc.stdout[-500:] if proc.stdout else proc.stderr[-500:]
            report.append(f"{label}: FAIL\n{tail}")
            return False, "\n".join(report)
        report.append(f"{label}: PASS")

    # The benchmark is always a generative instruction - text telling the
    # orchestrating Claude what to evaluate. It is NOT a shell command:
    # Claude reads the referenced file, evaluates each checklist item,
    # updates the file, and reports the violation count.
    benchmark_cmd = (state or {}).get("benchmark_cmd", "")
    if benchmark_cmd:
        report.append(f"benchmark (generative): {benchmark_cmd}")
        report.append(
            " INSTRUCTION: Read the benchmark file, evaluate each [ ] item,")
        report.append(
            " update [ ] -> [x] for passing items, add new [ ] items if discovered,")
        report.append(
            " then report the count of remaining [ ] items as the violation score.")
        report.append(
            " The orchestrating agent MUST follow this instruction before proceeding.")

    return True, "\n".join(report)
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
# ── Claude evaluation ────────────────────────────────────────────────
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def _claude_evaluate(
    prompt: str,
    timeout: int = 60,
) -> tuple[bool, str]:
    """Run `claude -p` with a PASS/FAIL evaluation prompt.

    Independent validation path used by the readback and gatekeeper
    gates. The CLAUDECODE environment variable is stripped first:
    claude-agent-sdk detects it and enters a degraded mode that hangs
    the subprocess. Uses the sonnet model with max-turns 3 and the
    given timeout. Every prompt/response pair is logged under
    artifacts/logs/ for debugging.
    """
    child_env = dict(os.environ)
    child_env.pop("CLAUDECODE", None)

    try:
        proc = subprocess.run(
            ["claude", "-p", prompt, "--model", "sonnet", "--max-turns", "3"],
            capture_output=True,
            text=True,
            timeout=timeout,
            env=child_env,
            cwd=str(PROJECT_ROOT),
        )
        output = proc.stdout.strip()
        # The verdict is the first response line, stripped of markdown noise.
        verdict = output.split("\n")[0].strip("*#> ").strip().upper()
        passed = verdict.startswith("PASS")
    except FileNotFoundError:
        passed, output = False, "FAIL: claude CLI not found."
    except subprocess.TimeoutExpired:
        passed, output = False, f"FAIL: claude -p timed out ({timeout}s)."

    # Persist the exchange for post-hoc tracing.
    log_dir = DEFAULT_ARTIFACTS_DIR / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = _now().replace(":", "-")
    (log_dir / f"eval_{stamp}.log").write_text(
        f"PROMPT:\n{prompt}\n\nRESPONSE:\n{output}\n\nRESULT: {'PASS' if passed else 'FAIL'}\n",
        encoding="utf-8",
    )

    return passed, output
|
|
922
|
+
|
|
923
|
+
|
|
924
|
+
def _readback_validate(
    phase: str,
    understanding: str,
    instructions: str,
) -> tuple[bool, str]:
    """Blocking readback gate: validate agent understanding via claude -p.

    Runs at phase start. The agent states a brief understanding of what
    the phase requires; an independent Claude session judges whether it
    captures the essential requirements. On failure the phase stays
    PENDING until retried. Prompt template from agents.yaml
    gates.readback.
    """
    # Split the instructions at the first structural marker: text before
    # it becomes the objective line, the rest the action summary.
    markers = (
        "**Goal",
        "**MANDATORY",
        "**CRITICAL",
        "**Execution",
        "### Agent",
        "**Actions",
    )
    objective_line = ""
    action_text = instructions
    for marker in markers:
        pos = instructions.find(marker)
        if pos >= 0:
            objective_line = instructions[:pos].replace("\n", " ").strip()[:150]
            action_text = instructions[pos:]
            break
    abbreviated = action_text[:500].replace("\n", " ").strip()

    template = _MODEL.gates.get(_resolve_gate(phase, "readback"))
    # defaultdict(str) keeps unknown template placeholders from raising.
    fields = collections.defaultdict(str, {
        "phase": phase,
        "objective": objective_line,
        "instructions": abbreviated,
        "understanding": understanding,
    })
    prompt = (template.prompt if template else "").format_map(fields)
    return _claude_evaluate(prompt)
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
def _gatekeeper_validate(
    phase: str,
    state: dict,
    evidence: str = "",
) -> tuple[bool, str]:
    """Blocking gatekeeper gate: judge phase evidence against exit criteria.

    Runs at phase end via claude -p. An independent session decides
    whether the agent's evidence satisfies the phase's exit criteria.
    An ASK verdict is treated as BLOCK (not pass) - the agent must
    retry with better evidence. Prompt template from agents.yaml
    gates.gatekeeper.
    """
    recorded_agents = state.get("phase_agents", {}).get(phase, [])
    phase_output = state.get("phase_outputs", {}).get(phase, "")
    readback = state.get("readbacks", {}).get(phase, {})
    required_agents = PHASE_AGENTS.get(_resolve_agents(phase), [])

    exit_fn = _PHASE_END.get(phase)
    exit_criteria = exit_fn() if exit_fn else f"No exit criteria defined for {phase}"

    template = _MODEL.gates.get(_resolve_gate(phase, "gatekeeper"))
    fields = collections.defaultdict(str, {
        "phase": phase,
        "exit_criteria": exit_criteria[:400],
        "required_agents": ", ".join(required_agents) if required_agents else "none",
        "recorded_agents": ", ".join(recorded_agents) if recorded_agents else "NONE",
        "output_status": f"yes ({len(phase_output)} chars)" if phase_output else "no",
        "readback_status": "PASS" if readback.get("passed") else ("FAIL" if readback else "not done"),
        "benchmark_configured": "yes" if state.get("benchmark_cmd") else "no",
        "evidence": evidence if evidence else "(no report provided)",
    })
    prompt = (template.prompt if template else "").format_map(fields)
    passed, explanation = _claude_evaluate(prompt)

    # An ASK verdict on the first line blocks rather than passes.
    verdict = explanation.split("\n")[0].strip("*#> ").strip().upper() if explanation else ""
    if verdict.startswith("ASK"):
        print("\n" + _msg("gatekeeper_question", explanation=explanation))
        return False, f"ASK: {explanation}"

    return passed, explanation
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
def _gatekeeper_evaluate_skip(
    phase: str,
    reason: str,
    state: dict,
) -> tuple[bool, str]:
    """Ask the gatekeeper whether skipping a phase is justified.

    Builds the gatekeeper_skip prompt from the phase's purpose text,
    iteration context, and the caller's reason; approval requires an
    explicit APPROVE verdict on the first response line.

    Fix: the PASS/FAIL flag returned by _claude_evaluate was captured
    into an unused local (approval hinges on APPROVE, not PASS); it is
    now discarded explicitly.
    """
    objective = state.get("objective", "not set")
    iteration = state.get("iteration", "?")
    itype = state.get("type", "?")

    instructions_fn = _PHASE_START.get(phase)
    instructions = instructions_fn() if instructions_fn else f"Phase {phase}"
    abbrev = instructions[:300].replace("\n", " ").strip()

    gate_template = _MODEL.gates.get("gatekeeper_skip", None)
    prompt = (gate_template.prompt if gate_template else "").format_map(collections.defaultdict(str, {
        "phase": phase,
        "iteration": str(iteration),
        "itype": itype,
        "objective": objective[:150],
        "phase_purpose": abbrev,
        "reason": reason,
    }))
    _, output = _claude_evaluate(prompt)
    first_line = output.split("\n")[0].strip("*#> ").strip().upper() if output else ""
    approved = first_line.startswith("APPROVE")
    return approved, output
|
|
1036
|
+
|
|
1037
|
+
|
|
1038
|
+
def _gatekeeper_evaluate_force_skip(
    phase: str,
    reason: str,
    state: dict,
) -> tuple[bool, str]:
    """Very conservative gatekeeper for force-skipping REQUIRED phases.

    Required phases exist for a reason. Force-skip should only be
    approved when:
    - The iteration is being stopped early (all work done)
    - The phase was already executed in substance
    - External constraint makes the phase impossible

    Approval requires an explicit APPROVE verdict on the first line.

    Fix: the PASS/FAIL flag returned by _claude_evaluate was captured
    into an unused local (approval hinges on APPROVE, not PASS); it is
    now discarded explicitly.
    """
    iteration = state.get("iteration", "?")
    completed = state.get("completed_phases", [])

    gate_template = _MODEL.gates.get("gatekeeper_force_skip", None)
    prompt = (gate_template.prompt if gate_template else "").format_map(collections.defaultdict(str, {
        "phase": phase,
        "iteration": str(iteration),
        "completed_phases": ", ".join(completed) if completed else "none",
        "reason": reason,
    }))
    _, output = _claude_evaluate(prompt)
    first_line = output.split("\n")[0].strip("*#> ").strip().upper() if output else ""
    approved = first_line.startswith("APPROVE")
    return approved, output
|
|
1065
|
+
|
|
1066
|
+
|
|
1067
|
+
# ── Banner and footer ───────────────────────────────────────────────
|
|
1068
|
+
|
|
1069
|
+
|
|
1070
|
+
def _banner(phase: str, action: str, state: dict) -> str:
    """Render the phase header banner with iteration progress.

    Displays iteration number, phase position, objective, and a progress
    bar showing completed/current/pending phases. Template loaded from
    app.yaml banner.header. Called at the start of cmd_start and cmd_end.

    Args:
        phase: Name of the phase being entered or ended.
        action: Verb displayed in the banner (e.g. the start/end action).
        state: Current orchestrator state dict.

    Returns:
        The fully formatted banner string.
    """
    iteration = state.get("iteration", "?")
    itype = state.get("type", "?")
    phases = ITERATION_TYPES[itype]["phases"]
    # 1-based position of this phase in the workflow; 0 when unknown.
    phase_idx = phases.index(phase) + 1 if phase in phases else 0
    total = len(phases)

    _banner_tmpl = _MODEL.app.banner
    # Progress bar: the current phase uses the "current" template, phases
    # already completed use the "done" template, the rest appear bare.
    progress_parts = []
    for p in phases:
        if p == phase:
            progress_parts.append(_banner_tmpl.progress_current.format_map({"p": p}))
        elif p in state.get("completed_phases", []):
            progress_parts.append(_banner_tmpl.progress_done.format_map({"p": p}))
        else:
            progress_parts.append(p)
    progress = " -> ".join(progress_parts)

    rejected = state.get("rejected_count", 0)
    reject_info = f" | REJECTED {rejected}x" if rejected else ""
    objective = state.get("objective", "")
    total_iters = state.get("total_iterations", 1)
    wf_def = _MODEL.workflow_types.get(itype)
    # Label selection: dependency workflows show their name, multi-iteration
    # runs show "N/M", and single runs show the bare iteration number.
    if wf_def and wf_def.dependency:
        iter_label = itype.upper()
    elif total_iters > 1:
        iter_label = f"{iteration}/{total_iters}"
    else:
        iter_label = str(iteration)

    template = _banner_tmpl.header
    ctx = {
        "header_line": _HDR_CHAR * _HDR_WIDTH,
        "iter_label": iter_label,
        "itype": itype,
        "action": action,
        "phase_idx": phase_idx,
        "total": total,
        "phase": phase,
        "reject_info": reject_info,
        "objective": objective,
        "progress": progress,
    }
    # defaultdict(str) keeps unknown template placeholders from raising KeyError.
    return template.format_map(collections.defaultdict(str, ctx))
|
|
1120
|
+
|
|
1121
|
+
|
|
1122
|
+
def _footer(phase: str, status: str, state: dict) -> str:
    """Render the phase footer with next-step guidance.

    Three template variants loaded from app.yaml: 'start' reminds the
    agent of claw commands, 'end' directs it to the next phase, and
    'final' marks the last phase of the iteration. These command hints
    steer autonomous execution through the phase sequence.
    """
    tmpl = _MODEL.app.footer
    fields = {
        "separator_line": _SEP_CHAR * _SEP_WIDTH,
        "iteration": state.get("iteration", "?"),
        "itype": state.get("type", "?"),
        "phase": phase,
        "cmd": CMD,
    }

    if status == "start":
        return tmpl.start.format_map(collections.defaultdict(str, fields))
    upcoming = _next_phase(state)
    if upcoming is None:
        return tmpl.final.format_map(collections.defaultdict(str, fields))
    fields["nxt"] = upcoming
    return tmpl.end.format_map(collections.defaultdict(str, fields))
|
|
1150
|
+
|
|
1151
|
+
|
|
1152
|
+
# ── Auto-action helpers ──────────────────────────────────────────────
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
def _run_hypothesis_gc() -> None:
    """Archive DONE and REMOVED hypotheses after the HYPOTHESIS phase.

    Auto-action triggered when the HYPOTHESIS phase gatekeeper passes.
    Hypotheses with status DONE or REMOVED move from the active
    catalogue to hypotheses_archive.yaml, keeping the working list
    clean for future iterations.
    """
    catalogue = _load_yaml_list(HYPOTHESES_FILE)
    if not catalogue:
        print(_msg("hypothesis_gc_none"))
        return

    # Partition the catalogue by terminal status.
    archived = [h for h in catalogue if h.get("status", "").upper() in ("DONE", "REMOVED")]
    active = [h for h in catalogue if h.get("status", "").upper() not in ("DONE", "REMOVED")]

    if not archived:
        print(_msg("hypothesis_gc_no_archive", count=len(active)))
        return

    archive_path = DEFAULT_ARTIFACTS_DIR / "hypotheses_archive.yaml"
    archive_entries = _load_yaml_list(archive_path)
    archive_entries.extend(archived)
    archive_path.write_text(_yaml_dump(archive_entries))

    HYPOTHESES_FILE.write_text(_yaml_dump(active))

    print(_msg("hypothesis_gc_archived", count=len(archived), path=archive_path.name))
    print(_msg("hypothesis_gc_active", count=len(active)))
    for h in active:
        print(_msg("hypothesis_gc_item", hid=h.get("id", "?"), status=h.get("status", "?"), hyp=h.get("hypothesis", "?")[:80]))
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
def _run_summary(state: dict) -> None:
    """Write iteration_N.md executive summary to the artifacts directory.

    Auto-action triggered after the RECORD phase completes. Compiles
    research findings, hypotheses, plan, implementation/test/review
    evidence, execution metrics, and failures into a single markdown
    summary file for the iteration audit trail.

    Fix: the markdown section-formatting loop (plain prose lines get a
    <br> suffix; headings/bullets/table rows are kept verbatim) was
    duplicated four times; it is now a single _append_section helper.
    """
    iteration = state["iteration"]
    outputs = state.get("phase_outputs", {})
    agents = state.get("phase_agents", {})
    readbacks = state.get("readbacks", {})
    rejected = state.get("rejected_count", 0)
    objective = state.get("objective", "not set")
    completed = state.get("completed_phases", [])
    itype = state.get("type", "?")

    # Pull this iteration's scope line out of the overall plan, if present.
    iteration_plan = state.get("iteration_plan", "")
    scope = ""
    if iteration_plan:
        for line in iteration_plan.split("\n"):
            if f"ITERATION {iteration}:" in line.upper():
                scope = line.strip()
                break

    total_iters = state.get("total_iterations", 1)
    lines = [
        f"# Iteration {iteration}/{total_iters} - Executive Summary",
        "",
        f"**Scope**: {scope if scope else 'see plan'}<br>",
        f"**Objective**: {objective}<br>",
        f"**Type**: {itype}<br>",
        f"**Phases completed**: {', '.join(completed) if completed else 'none'}<br>",
        f"**Rejections**: {rejected}<br>",
        f"**Started**: {state.get('started_at', '?')}",
        "",
    ]

    def _append_section(title: str, text: str) -> None:
        # Append one markdown section; plain prose lines get a <br> suffix,
        # while headings, bullets, and table rows are kept verbatim.
        lines.append(f"## {title}")
        lines.append("")
        for line in text.split("\n"):
            if line.strip() and not line.startswith(("#", "-", "|")):
                lines.append(f"{line}<br>")
            else:
                lines.append(line)
        lines.append("")

    if "RESEARCH" in outputs:
        _append_section("Research Findings", outputs["RESEARCH"])
    if "HYPOTHESIS" in outputs:
        _append_section("Hypotheses", outputs["HYPOTHESIS"])
    if "PLAN" in outputs:
        _append_section("Plan", outputs["PLAN"])
    for phase_name in ["IMPLEMENT", "TEST", "REVIEW"]:
        if phase_name in outputs:
            _append_section(phase_name.title(), outputs[phase_name])

    lines.append("## Execution Metrics")
    lines.append("")
    if agents:
        total_agents = sum(len(v) for v in agents.values())
        lines.append(f"- {total_agents} agents spawned across {len(agents)} phases<br>")
        for p, agent_list in agents.items():
            lines.append(f" - **{p}**: {', '.join(agent_list)}<br>")
    if readbacks:
        passed = sum(1 for r in readbacks.values() if r.get("passed"))
        lines.append(f"- Readbacks: {passed}/{len(readbacks)} passed<br>")
    gatekeepers = state.get("gatekeepers", {})
    if gatekeepers:
        gk_passed = sum(1 for g in gatekeepers.values() if g.get("passed"))
        lines.append(f"- Gatekeepers: {gk_passed}/{len(gatekeepers)} passed<br>")
    if rejected:
        lines.append(f"- Rejections: {rejected}<br>")
    lines.append("")

    failures = _count_iteration_failures(iteration)
    if failures:
        lines.append("## Failures")
        lines.append("")
        for f in failures:
            lines.append(f"- [{f.get('mode', '?')}] {f.get('description', '?')}")
        lines.append("")

    summary_path = DEFAULT_ARTIFACTS_DIR / f"iteration_{iteration}.md"
    summary_path.write_text("\n".join(lines), encoding="utf-8")
    print(_msg("summary_written", path=summary_path))
|
|
1302
|
+
|
|
1303
|
+
|
|
1304
|
+
def _run_next_iteration(state: dict) -> None:
    """Advance to the next iteration after NEXT phase completes.

    Resets phase_outputs and phase_agents for the new iteration,
    preserves hypothesis catalogue and failure log, increments the
    iteration counter, and displays the new iteration info.
    If all requested iterations are done, reports completion.
    """
    total = state.get("total_iterations", 1)
    current = state["iteration"]
    remaining = total - current

    # All requested iterations done: report completion, don't advance.
    if remaining <= 0:
        print("\n" + _msg("iteration_complete", total=total))
        print(_msg("iteration_new_cmd", cmd=CMD, itype=state["type"]))
        return

    new_iteration = current + 1

    # Switch from dependency workflow to parent workflow after planning iteration completes
    parent = state.get("parent_type", "")
    if parent and parent != state["type"]:
        wf_def = _MODEL.workflow_types.get(state["type"])
        if wf_def and wf_def.dependency:
            state["type"] = parent
            state.pop("parent_type", None)

    itype_info = ITERATION_TYPES[state["type"]]
    first_phase = itype_info["phases"][0]

    # Preserve iteration_plan from iteration 0 (falls back to the PLAN
    # phase output of the iteration that just finished).
    iteration_plan = state.get("iteration_plan", "") or state.get("phase_outputs", {}).get(
        "PLAN", ""
    )

    # Reset per-iteration state; hypothesis catalogue and failure log
    # live in their own files and are untouched here.
    state["iteration"] = new_iteration
    state["current_phase"] = first_phase
    state["phase_status"] = "pending"
    state["completed_phases"] = []
    state["skipped_phases"] = []
    state["rejected_count"] = 0
    state["started_at"] = _now()
    # Reset phase_outputs AND phase_agents for new iteration
    state["phase_outputs"] = {}
    state["phase_agents"] = {}
    if iteration_plan:
        state["iteration_plan"] = iteration_plan
    _save_state(state)
    _append_log(
        {
            "iteration": new_iteration,
            "type": state["type"],
            "event": "next_iteration",
            "objective": state["objective"],
        }
    )

    # Announce the new iteration and surface the carried-over plan.
    label = f"{new_iteration}/{total}" if total > 1 else str(new_iteration)
    print("\n" + _msg("iteration_started_short", iter_label=label, itype=state["type"]))
    print(_msg("iteration_objective", objective=state["objective"]))
    print(_msg("iteration_remaining", remaining=total - new_iteration))
    if iteration_plan:
        print("\n" + _msg("iteration_plan_header"))
        print(_msg("iteration_plan_content", plan=iteration_plan[:200]))

    # Surface up to three failures recorded during the previous iteration.
    prior_failures = _count_iteration_failures(current)
    if prior_failures:
        print("\n" + _msg("prior_failures_header_short", count=len(prior_failures)))
        for f in prior_failures[-3:]:
            print(_msg("prior_failure_item", mode=f.get("mode", "?"), description=f.get("description", "?")))

    print("\n" + _msg("iteration_begin_short", cmd=CMD))
|
|
1376
|
+
|
|
1377
|
+
|
|
1378
|
+
# ── Command functions ───────────────────────────────────────────────
|
|
1379
|
+
|
|
1380
|
+
|
|
1381
|
+
def cmd_new(args) -> None:
    """Start a new iteration request.

    Creates initial state with objective, iteration count, type, and
    optional benchmark command. Auto-starts iteration 0 (planning)
    when multiple iterations are requested with 'full' type.
    Cleans prior artifacts by default (preserves hypotheses).

    Args:
        args: Parsed argparse namespace; reads .type, .objective and
            optionally .iterations, .dry_run, .clean, .benchmark.
    """
    itype = args.type
    if itype not in ITERATION_TYPES:
        print(
            f"Unknown type: {itype}. Choose: {', '.join(ITERATION_TYPES)}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Block dependency workflows from direct invocation
    wf_def = _MODEL.workflow_types.get(itype)
    if wf_def and wf_def.dependency:
        print(_msg("dependency_blocked", itype=itype), file=sys.stderr)
        sys.exit(1)

    total_iterations = getattr(args, "iterations", 1)

    # --dry-run: validate and print execution plan, no state files
    if getattr(args, "dry_run", False):
        _dry_run(itype, total_iterations)
        return

    # Read iteration counter BEFORE cleaning (clean wipes state file)
    last_iteration = _read_last_iteration()

    # Clean artifacts from prior runs (default: yes)
    if getattr(args, "clean", True):
        _clean_artifacts_dir()
        print(_msg("cleaned") + "\n")

    # Continue numbering from whichever source is further along: the
    # surviving state file or the counter read before the clean.
    old_state = _load_state()
    iteration = max(
        (old_state["iteration"] + 1) if old_state else 1,
        last_iteration + 1,
    )

    # Auto-run dependency workflow (iteration 0) when configured
    run_type = itype
    if wf_def and wf_def.depends_on and total_iterations > 1:
        dep_wf = _MODEL.workflow_types.get(wf_def.depends_on)
        if dep_wf:
            iteration = 0
            run_type = wf_def.depends_on

    type_info = ITERATION_TYPES[run_type]
    first_phase = type_info["phases"][0]

    objective = args.objective

    benchmark_cmd = getattr(args, "benchmark", "") or ""
    # Fresh per-run state; parent_type is only set when a dependency
    # workflow runs first, so the parent workflow can resume afterwards.
    state = {
        "iteration": iteration,
        "total_iterations": total_iterations,
        "type": run_type,
        "objective": objective,
        "benchmark_cmd": benchmark_cmd,
        "benchmark_scores": [],
        "current_phase": first_phase,
        "phase_status": "pending",
        "completed_phases": [],
        "skipped_phases": [],
        "rejected_count": 0,
        "started_at": _now(),
        "phase_outputs": {},
        "phase_agents": {},
        "parent_type": itype if run_type != itype else "",
    }
    _save_state(state)
    _save_objective(objective, total_iterations)
    _append_log(
        {
            "iteration": iteration,
            "type": run_type,
            "event": "new_iteration",
            "objective": objective,
        }
    )

    # Announce the run: dependency workflows get a named label,
    # multi-iteration runs "N of M", single runs the bare number.
    run_wf = _MODEL.workflow_types.get(run_type)
    if run_wf and run_wf.dependency:
        iter_label = f"{run_type.upper()} (before {total_iterations} iterations)"
    elif total_iterations > 1:
        iter_label = f"{iteration} of {total_iterations}"
    else:
        iter_label = str(iteration)
    print(_msg("iteration_started", iter_label=iter_label, itype=run_type, description=type_info["description"]))
    print("\n" + _msg("iteration_objective", objective=objective))
    if total_iterations > 1:
        print(_msg("iteration_requested", total=total_iterations))
    if run_wf and run_wf.dependency:
        print("\n" + _msg("dependency_purpose", description=run_wf.description))
    print("\n" + _msg("iteration_phases", phases=" -> ".join(type_info["phases"])))
    print(_msg("iteration_required", required=", ".join(type_info["required"])))
    if type_info["skippable"]:
        print(_msg("iteration_skippable", skippable=", ".join(type_info["skippable"])))

    # Show prior failures if any
    if old_state:
        prior_failures = _count_iteration_failures(old_state["iteration"])
        if prior_failures:
            print("\n" + _msg("prior_failures_header", count=len(prior_failures)))
            for f in prior_failures[-3:]:
                print(_msg("prior_failure_item_full", mode=f.get("mode", "?"), description=f.get("description", "?")))

    print("\n" + _msg("iteration_begin", cmd=CMD))
|
|
1493
|
+
|
|
1494
|
+
|
|
1495
|
+
def cmd_start(args) -> None:
    """Enter current phase with BLOCKING readback validation.

    Loads phase instructions from YAML, runs readback gate via claude -p
    to validate agent understanding, then displays the phase instructions
    with banner, agent definitions, and user context if provided.
    Phase stays PENDING if readback fails.

    Exits 1 when no cycle is active, when the phase is already
    in_progress, or when --understanding was not supplied.
    """
    state = _load_state()
    if not state:
        # No active cycle: tell the user how to start one.
        print(_msg("no_active_start"), file=sys.stderr)
        print(_msg("no_active_start_cmd", cmd=CMD), file=sys.stderr)
        sys.exit(1)

    phase = state["current_phase"]

    # FSM guards against starting from in_progress (raises ValueError)
    try:
        _fire_fsm(FSMEvent.START, state)  # pending -> readback
    except ValueError:
        print(_msg("phase_in_progress", phase=phase), file=sys.stderr)
        print(_msg("phase_in_progress_cmd", cmd=CMD), file=sys.stderr)
        sys.exit(1)

    # The agent must restate its understanding of the phase up front.
    understanding = getattr(args, "understanding", None)
    if not understanding:
        print(_msg("understanding_required"), file=sys.stderr)
        print(_msg("understanding_required_cmd", cmd=CMD), file=sys.stderr)
        sys.exit(1)

    # Get phase instructions for readback validation
    instructions_fn = _PHASE_START.get(phase)
    instructions = instructions_fn() if instructions_fn else f"Phase {phase}"

    # BLOCKING readback validation
    print(_msg("readback_separator"))
    print(_msg("readback_validating", phase=phase))
    print(_msg("readback_separator"))
    passed, explanation = _readback_validate(
        phase,
        understanding,
        instructions,
    )

    # Save readback artifact (pass or fail)
    pdir = _phase_dir(state)
    readback_file = pdir / "readback.md"
    readback_file.write_text(
        f"# Readback - {phase}\n\n"
        f"## Agent Understanding\n{understanding}\n\n"
        f"## Validation Result\n{'PASS' if passed else 'FAIL'}\n\n"
        f"## Explanation\n{explanation}\n",
        encoding="utf-8",
    )

    # Update state with readback result
    if "readbacks" not in state:
        state["readbacks"] = {}
    state["readbacks"][phase] = {"passed": passed, "at": _now()}

    _append_log(
        {
            "iteration": state["iteration"],
            "phase": phase,
            "event": "readback",
            "passed": passed,
        }
    )

    if not passed:
        # Readback failed - return to pending
        _fire_fsm(FSMEvent.READBACK_FAIL, state)  # readback -> pending
        _save_state(state)
        print("\n" + _msg("readback_fail", phase=phase))
        print(_msg("readback_fail_reason", reason=explanation[:200]))
        print("\n" + _msg("readback_retry", cmd=CMD))
        return

    print(_msg("readback_pass", phase=phase) + "\n")

    # Readback passed - advance to in_progress via FSM
    _fire_fsm(FSMEvent.READBACK_PASS, state)  # readback -> in_progress
    state["phase_started_at"] = _now()
    _save_state(state)
    _append_log(
        {
            "iteration": state["iteration"],
            "phase": phase,
            "event": "phase_start",
        }
    )

    header = _banner(phase, "ENTERING", state)

    # Inject ALL user context from context.yaml (broadcast to all phases)
    body = instructions
    all_ctx = _load_context()
    if all_ctx:
        count = len(all_ctx)
        body += f"\n\n{count} context message(s) active:\n"
        body += _msg("user_guidance_header_line") + "\n"
        body += _msg("user_guidance_header") + "\n"
        body += _msg("user_guidance_header_line") + "\n\n"
        for ctx_phase, ctx_msg in all_ctx.items():
            body += f"**[{ctx_phase}]**: {ctx_msg}\n\n"
        body += _msg("user_guidance_instruction")

    foot = _footer(phase, "start", state)
    print(header + body + foot)
|
|
1604
|
+
|
|
1605
|
+
|
|
1606
|
+
def cmd_end(args) -> None:
    """Complete current phase with gatekeeper validation.

    Validates --agents against required agents from agents.yaml,
    records output file content, runs TEST automation if in TEST phase,
    runs gatekeeper gate for quality validation, then advances to
    next phase. Auto-actions: hypothesis-gc after HYPOTHESIS,
    summary after RECORD, inline NEXT display after RECORD.

    Exits 1 when no cycle is active, the phase was never started,
    the --output-file is missing, or required agents are absent.
    """
    state = _load_state()
    if not state:
        print(_msg("no_active"), file=sys.stderr)
        sys.exit(1)

    phase = state["current_phase"]
    if state["phase_status"] != "in_progress":
        # end is only legal after a successful start (readback passed).
        print(_msg("phase_not_started", phase=phase), file=sys.stderr)
        print(_msg("phase_not_started_cmd", cmd=CMD), file=sys.stderr)
        sys.exit(1)

    # ── Fail-fast: validate ALL inputs at top ──
    evidence = getattr(args, "evidence", "") or ""
    agents_str = getattr(args, "agents", "") or ""
    output_file_str = getattr(args, "output_file", "") or ""

    # Resolve and validate --output-file
    output_file_path = None
    output_content = ""
    if output_file_str:
        output_file_path = Path(output_file_str).resolve()
        if not output_file_path.exists():
            print(_msg("output_file_missing", path=output_file_path), file=sys.stderr)
            sys.exit(1)
        output_content = output_file_path.read_text(encoding="utf-8")

    # Parse agents
    agents = [a.strip() for a in agents_str.split(",") if a.strip()] if agents_str else []

    # Check required agents - resolve via :: namespace
    required_key = _resolve_agents(phase)
    required_agents = PHASE_AGENTS.get(required_key, [])
    if required_agents and agents:
        missing = [r for r in required_agents if r not in agents]
        if missing:
            print(_msg("missing_agents", phase=phase, missing=", ".join(missing)), file=sys.stderr)
            print(_msg("missing_agents_required", required=", ".join(required_agents)), file=sys.stderr)
            sys.exit(1)
    elif required_agents and not agents:
        print(_msg("requires_agents", phase=phase, required=", ".join(required_agents)), file=sys.stderr)
        print(_msg("requires_agents_provide", required=",".join(required_agents)), file=sys.stderr)
        sys.exit(1)

    # ── Record agents BEFORE gatekeeper (so gatekeeper sees them) ──
    if agents:
        if "phase_agents" not in state:
            state["phase_agents"] = {}
        state["phase_agents"][phase] = agents

    # ── Record output-file (OVERWRITE phase_outputs) ──
    if output_file_path:
        if "phase_outputs" not in state:
            state["phase_outputs"] = {}
        state["phase_outputs"][phase] = output_content

        # Also save to phase subfolder
        pdir = _phase_dir(state)
        output_dest = pdir / "output.md"
        md_lines = []
        # Append <br> to prose lines so markdown keeps the line breaks;
        # headings (#), list items (-) and table rows (|) are left alone.
        for line in output_content.split("\n"):
            if (
                line.strip()
                and not line.startswith("#")
                and not line.startswith("-")
                and not line.startswith("|")
            ):
                md_lines.append(line + "<br>")
            else:
                md_lines.append(line)
        md_content = "\n".join(md_lines)
        output_dest.write_text(
            f"# {phase} Output\n\n{md_content}\n",
            encoding="utf-8",
        )
    elif evidence:
        # Evidence stored as gap-fill only if no --output-file
        if "phase_outputs" not in state:
            state["phase_outputs"] = {}
        if phase not in state["phase_outputs"]:
            state["phase_outputs"][phase] = evidence

    _save_state(state)

    header = _banner(phase, "COMPLETING", state)

    # ── TEST phase: run automated verification ──
    if phase == "TEST":
        print(header)
        body = _PHASE_END.get(phase, lambda: "")()
        print(body)

        passed, output = _verify_test_phase(state)
        print(output)

        if not passed:
            # Auto-reject: walk the FSM through its full failure path and
            # roll back to the previous implementable phase.
            target = _prev_implementable(state)
            _fire_fsm(FSMEvent.END, state)  # in_progress -> gatekeeper
            _fire_fsm(FSMEvent.GATE_FAIL, state)  # gatekeeper -> in_progress
            _fire_fsm(FSMEvent.REJECT, state)  # in_progress -> rejected
            _fire_fsm(FSMEvent.ADVANCE, state)  # rejected -> pending
            state["current_phase"] = target
            state["rejected_count"] = state.get("rejected_count", 0) + 1
            state.pop("phase_started_at", None)
            _save_state(state)
            _append_log(
                {
                    "iteration": state["iteration"],
                    "phase": phase,
                    "event": "auto_reject",
                    "reason": "tests/lint failed",
                    "target": target,
                }
            )
            _append_failure(
                {
                    "iteration": state["iteration"],
                    "phase": phase,
                    "mode": "FM-TEST-FAIL",
                    "description": output[:200],
                }
            )
            print("\n" + _msg("tests_fail", target=target))
            print(_msg("tests_fail_run", cmd=CMD))
            return

        print("\n" + _msg("tests_pass"))

    else:
        body = _PHASE_END.get(phase, lambda: "")()
        print(header + body)

    # ── Gatekeeper: per-phase generative validation ──
    _fire_fsm(FSMEvent.END, state)  # in_progress -> gatekeeper
    print("\n" + _msg("gatekeeper_separator"))
    print(_msg("gatekeeper_evaluating", phase=phase))
    print(_msg("gatekeeper_separator"))
    gk_passed, gk_output = _gatekeeper_validate(
        phase,
        state,
        evidence,
    )

    # Save gatekeeper result to phase subfolder
    pdir = _phase_dir(state)
    gk_file = pdir / "gatekeeper.md"
    gk_file.write_text(
        f"# Gatekeeper - {phase}\n\n"
        f"## Result\n{'PASS' if gk_passed else 'FAIL'}\n\n"
        f"## Evaluation\n{gk_output}\n",
        encoding="utf-8",
    )

    # Update state
    if "gatekeepers" not in state:
        state["gatekeepers"] = {}
    state["gatekeepers"][phase] = {
        "passed": gk_passed,
        "at": _now(),
    }
    _save_state(state)
    _append_log(
        {
            "iteration": state["iteration"],
            "phase": phase,
            "event": "gatekeeper",
            "passed": gk_passed,
        }
    )

    if not gk_passed:
        _fire_fsm(FSMEvent.GATE_FAIL, state)  # gatekeeper -> in_progress (retry)
        _save_state(state)
        print("\n" + _msg("gatekeeper_fail", phase=phase))
        print(_msg("gatekeeper_fail_reason", reason=gk_output[:300]))
        print("\n" + _msg("gatekeeper_fail_retry", cmd=CMD))
        return

    _fire_fsm(FSMEvent.GATE_PASS, state)  # gatekeeper -> complete
    print(_msg("gatekeeper_pass", phase=phase))

    # Mark phase complete and advance
    state["completed_phases"].append(phase)
    started_at = state.get("phase_started_at", "")

    nxt = _next_phase(state)
    if nxt:
        _fire_fsm(FSMEvent.ADVANCE, state)  # complete -> pending
        state["current_phase"] = nxt
    else:
        # Last phase of the sequence: the whole iteration is done.
        state["phase_status"] = "iteration_complete"

    state.pop("phase_started_at", None)
    _save_state(state)
    _append_log(
        {
            "iteration": state["iteration"],
            "phase": phase,
            "event": "phase_complete",
            "started_at": started_at,
        }
    )

    # Phase-end executive summary
    outputs = state.get("phase_outputs", {})
    agents_map = state.get("phase_agents", {})
    readbacks = state.get("readbacks", {})
    gatekeepers = state.get("gatekeepers", {})
    summary_lines = ["\n" + _msg("phase_complete", phase=phase)]
    if phase in outputs:
        out_text = outputs[phase]
        summary_lines.append(_msg("phase_output", output=out_text[:100]))
    if phase in agents_map:
        summary_lines.append(_msg("phase_agents", agents=", ".join(agents_map[phase])))
    if phase in readbacks:
        rb = readbacks[phase]
        summary_lines.append(_msg("phase_readback", status="PASS" if rb.get("passed") else "FAIL"))
    if phase in gatekeepers:
        gk = gatekeepers[phase]
        summary_lines.append(_msg("phase_gatekeeper", status="PASS" if gk.get("passed") else "FAIL"))
    print("\n".join(summary_lines))

    # ── Auto-actions from phases.yaml auto_actions.on_complete ──
    if _run_auto_actions(phase, state):
        return

    print(_footer(phase, "end", state))
|
|
1841
|
+
|
|
1842
|
+
|
|
1843
|
+
def cmd_status(args) -> None:
    """Show current iteration state with phase progress.

    Displays iteration info, phase checklist with completion markers,
    agents recorded per phase, failures logged, and next command hint.
    Useful for resuming work after context loss.
    """
    state = _load_state()
    if not state:
        # No cycle running: explain how to start one and list the types.
        print(_msg("no_active"))
        print("\n" + _msg("no_active_start_full", cmd=CMD))
        print("\n" + _msg("available_types"))
        for name, info in ITERATION_TYPES.items():
            print(_msg("available_type_item", name=name, description=info["description"]))
        return

    wf_type = state["type"]
    itype = ITERATION_TYPES[wf_type]
    phases = itype["phases"]
    total_iters = state.get("total_iterations", 1)
    iteration = state.get("iteration", "?")

    # Dependency workflows are labelled by name; multi-iteration runs as "i/N".
    wf_def = _MODEL.workflow_types.get(wf_type)
    if wf_def and wf_def.dependency:
        iter_label = wf_type.upper()
    elif total_iters > 1:
        iter_label = f"{iteration}/{total_iters}"
    else:
        iter_label = str(iteration)

    print(_msg("status_header", iter_label=iter_label, itype=wf_type))
    print(_msg("status_objective", objective=state.get("objective", "?")))
    print(_msg("status_started", started=state.get("started_at", "?")))
    print(_msg("status_current", phase=state["current_phase"], status=state["phase_status"]))
    rejected = state.get("rejected_count", 0)
    if rejected:
        print(_msg("status_rejections", count=rejected))
        # last_rejection may be absent (auto-rejects record only the count).
        lr = state.get("last_rejection", {})
        if lr:
            print(_msg("status_last_reject", from_phase=lr.get("from", "?"), reason=lr.get("reason", "?")))
    print()

    # Checklist markers: [x] done, [>] active, [-] skipped, [ ] pending.
    for p in phases:
        if p in state["completed_phases"]:
            marker = "[x]"
        elif p == state["current_phase"]:
            marker = "[>]" if state["phase_status"] == "in_progress" else "[ ]"
        elif any(s["phase"] == p for s in state.get("skipped_phases", [])):
            marker = "[-]"
        else:
            marker = "[ ]"
        req = "*" if p in itype["required"] else " "
        print(_msg("status_phase_item", marker=marker, p=p, req=req))

    # Show agents recorded per phase
    agents_map = state.get("phase_agents", {})
    if agents_map:
        print("\n" + _msg("status_agents_header"))
        for p, agent_list in agents_map.items():
            print(_msg("status_agent_item", phase=p, agents=", ".join(agent_list)))

    # Show failures for this iteration
    failures = _count_iteration_failures(state["iteration"])
    if failures:
        print("\n" + _msg("status_failures_header", count=len(failures)))
        for f in failures:
            print(_msg("status_failure_item", mode=f.get("mode", "?"), desc=f.get("description", "?")[:60]))

    print("\n" + _msg("status_required_note"))
    # Hint the next CLI call based on where the phase lifecycle stands.
    if state["phase_status"] == "pending":
        print("\n" + _msg("status_next_start", cmd=CMD))
    elif state["phase_status"] == "in_progress":
        print("\n" + _msg("status_next_end", cmd=CMD))
|
|
1916
|
+
|
|
1917
|
+
|
|
1918
|
+
def cmd_reject(args) -> None:
    """Critic rejects current phase, returning to an earlier phase.

    Rolls back to the most recent IMPLEMENT phase in the sequence,
    increments rejection count, and logs the rejection reason.
    Used when review agents find issues that need fixing.
    """
    state = _load_state()
    if not state:
        print(_msg("no_active"), file=sys.stderr)
        sys.exit(1)

    phase = state["current_phase"]
    reason = args.reason or "no reason given"

    # Check reject_to declaration on current phase
    resolved = _resolve_phase(phase)
    phase_obj = _MODEL.phases.get(resolved)
    if phase_obj and phase_obj.reject_to:
        # Phase declares an explicit rollback target; fall back to the
        # previous implementable phase if the declaration lacks "phase".
        target = phase_obj.reject_to.get("phase", _prev_implementable(state))
    else:
        target = _prev_implementable(state)

    # FSM: reject current phase and advance to target
    _fire_fsm(FSMEvent.REJECT, state)  # in_progress -> rejected
    _fire_fsm(FSMEvent.ADVANCE, state)  # rejected -> pending
    state["current_phase"] = target
    state["rejected_count"] = state.get("rejected_count", 0) + 1
    state["last_rejection"] = {
        "from": phase,
        "reason": reason,
        "at": _now(),
    }
    # Clear the start timestamp: the target phase must be re-started.
    state.pop("phase_started_at", None)
    _save_state(state)

    _append_log(
        {
            "iteration": state["iteration"],
            "phase": phase,
            "event": "rejected",
            "reason": reason,
            "target": target,
        }
    )

    print("\n" + _msg("reject_header", phase=phase, target=target))
    print(_msg("reject_reason", reason=reason))
    print(_msg("reject_count", count=state["rejected_count"]))
    print("\n" + _msg("reject_fix", cmd=CMD))
|
|
1968
|
+
|
|
1969
|
+
|
|
1970
|
+
def cmd_skip(args) -> None:
    """Skip an optional phase or force-skip a required one.

    Optional phases (skippable: true in workflow.yaml) can be
    skipped with gatekeeper approval. Required phases need --force
    flag and pass a conservative gatekeeper that defaults to DENY.
    """
    state = _load_state()
    if not state:
        print(_msg("no_active"), file=sys.stderr)
        sys.exit(1)

    phase = state["current_phase"]
    itype = ITERATION_TYPES[state["type"]]
    force = getattr(args, "force", False)

    # Required phases cannot be skipped without an explicit --force.
    if phase in itype["required"] and not force:
        print(_msg("skip_blocked", phase=phase, itype=state["type"]), file=sys.stderr)
        print(_msg("skip_blocked_required", required=", ".join(itype["required"])), file=sys.stderr)
        print("\n" + _msg("skip_blocked_force"), file=sys.stderr)
        sys.exit(1)

    reason = args.reason or "no reason given"
    is_required = phase in itype["required"]

    print(_msg("gatekeeper_skip_separator"))
    label = "FORCE-SKIP (required phase)" if is_required else "SKIP"
    print(_msg("gatekeeper_skip_evaluating", label=label, phase=phase))
    print(_msg("gatekeeper_skip_separator"))

    # Force-skips of required phases go through a stricter gatekeeper.
    if is_required:
        approved, explanation = _gatekeeper_evaluate_force_skip(
            phase,
            reason,
            state,
        )
    else:
        approved, explanation = _gatekeeper_evaluate_skip(
            phase,
            reason,
            state,
        )

    if not approved:
        print("\n" + _msg("gatekeeper_skip_denied", phase=phase))
        print(_msg("gatekeeper_skip_denied_reason", reason=explanation[:300]))
        print("\n" + _msg("gatekeeper_skip_denied_retry", cmd=CMD))
        _append_log(
            {
                "iteration": state["iteration"],
                "phase": phase,
                "event": "skip_denied",
                "reason": reason,
                "gatekeeper": explanation[:200],
            }
        )
        return

    print(_msg("gatekeeper_skip_approved", phase=phase))

    state["skipped_phases"].append({"phase": phase, "reason": reason})

    # FSM: skip and advance
    _fire_fsm(FSMEvent.SKIP, state)  # pending -> skipped
    nxt = _next_phase(state)
    if nxt:
        _fire_fsm(FSMEvent.ADVANCE, state)  # skipped -> pending
        state["current_phase"] = nxt
    else:
        # Skipping the last phase completes the iteration.
        state["phase_status"] = "iteration_complete"

    _save_state(state)
    _append_log(
        {
            "iteration": state["iteration"],
            "phase": phase,
            "event": "phase_skipped",
            "reason": reason,
            "gatekeeper": "approved",
        }
    )

    print(_msg("skip_approved_msg", phase=phase, reason=reason))
    if nxt:
        print("\n" + _msg("skip_next", nxt=nxt))
        print(_msg("skip_next_cmd", cmd=CMD))
    else:
        print("\n" + _msg("skip_iteration_complete"))
|
|
2058
|
+
|
|
2059
|
+
|
|
2060
|
+
def cmd_context(args) -> None:
    """Inject user guidance into a phase, broadcast to all agents.

    Stores the user's message in context.yaml (persistent across --clean).
    Displays as a prominent banner in phase instructions. All agents
    spawned in any phase receive the guidance. Can target a specific
    phase or the current one.
    """
    state = _load_state()
    if not state:
        print(_msg("no_active"), file=sys.stderr)
        sys.exit(1)

    # Target phase defaults to the current one; names are case-insensitive.
    target_phase = (getattr(args, "phase", "") or state["current_phase"]).upper()

    if getattr(args, "clear", False):
        # --clear: drop any stored guidance for the target phase.
        stored = _load_context()
        stored.pop(target_phase, None)
        _save_context(stored)
        print(_msg("context_cleared", phase=target_phase))
        return

    guidance = args.message
    if not guidance:
        # No message supplied: list what is currently stored instead.
        stored = _load_context()
        if not stored:
            print(_msg("context_none"))
        else:
            for stored_phase, text in stored.items():
                suffix = "..." if len(text) > 100 else ""
                print(_msg("context_item", phase=stored_phase, text=text[:100] + suffix))
        return

    # Persist the new guidance and record it in the audit log.
    stored = _load_context()
    stored[target_phase] = guidance
    _save_context(stored)
    _append_log(
        {
            "iteration": state["iteration"],
            "phase": target_phase,
            "event": "user_context",
            "message": guidance[:200],
        }
    )
    print(_msg("context_set", phase=target_phase))
    print(_msg("context_message", message=guidance))
    targeting_live_phase = (
        state["phase_status"] == "in_progress"
        and state["current_phase"] == target_phase
    )
    if targeting_live_phase:
        print("\n" + _msg("context_in_progress", cmd=CMD))
    else:
        print("\n" + _msg("context_will_show", phase=target_phase))
|
|
2113
|
+
|
|
2114
|
+
|
|
2115
|
+
def cmd_log_failure(args) -> None:
    """Log a failure mode found during the iteration.

    Appends to failures.yaml with mode ID, description, iteration,
    and phase. Failure modes accumulate across iterations and feed
    into RESEARCH phase context for the next iteration.
    """
    state = _load_state()
    # Without an active cycle, fall back to iteration 0 / unknown phase
    # so a failure can still be recorded.
    if state:
        current_iteration = state["iteration"]
        current_phase = state["current_phase"]
    else:
        current_iteration = 0
        current_phase = "unknown"

    entry = {
        "iteration": current_iteration,
        "phase": current_phase,
        "mode": args.mode,
        "description": args.desc,
    }
    _append_failure(entry)
    print(_msg("failure_logged", mode=args.mode, desc=args.desc))
|
|
2135
|
+
|
|
2136
|
+
|
|
2137
|
+
def cmd_failures(args) -> None:
    """Display the failure log grouped by iteration.

    Shows all logged failure modes with their mode ID, phase,
    description, and timestamp. Used to review what went wrong
    across iterations.
    """
    if not FAILURES_FILE.exists():
        print(_msg("no_failures"))
        return

    records = _load_yaml_list(FAILURES_FILE)
    if not records:
        print(_msg("no_failures"))
        return

    # Bucket the failure records by iteration number.
    grouped: dict[int, list] = collections.defaultdict(list)
    for record in records:
        grouped[record.get("iteration", 0)].append(record)

    for iteration_num in sorted(grouped):
        print("\n" + _msg("failure_iteration_header", iteration=iteration_num))
        for record in grouped[iteration_num]:
            print(_msg(
                "failure_item",
                mode=record.get("mode", "?"),
                phase=record.get("phase", "?"),
                desc=record.get("description", "?"),
                ts=record.get("timestamp", "?"),
            ))
|
|
2167
|
+
|
|
2168
|
+
|
|
2169
|
+
def cmd_hypotheses(args) -> None:
    """Display the hypothesis catalogue across all iterations.

    Shows hypothesis ID, star rating average, status, and text.
    The catalogue persists across iterations - hypotheses marked
    DONE or REMOVED are archived by hypothesis-gc.
    """
    catalogue = _load_prior_hypotheses()
    if not catalogue:
        print(_msg("no_hypotheses"))
        return

    for entry in catalogue:
        # Hypothesis text is truncated to 200 chars for display.
        print("\n" + _msg(
            "hypothesis_item",
            hid=entry.get("id", "?"),
            avg=entry.get("avg_score", "?"),
            status=entry.get("status", "?"),
            hyp=entry.get("hypothesis", "?")[:200],
            ts=entry.get("timestamp", "?"),
        ))
|
|
2188
|
+
|
|
2189
|
+
|
|
2190
|
+
def cmd_validate(args) -> None:
    """Run model validation and report any issues found.

    Loads the model from YAML resources, runs validate_model(), and prints
    each issue in human-readable format with file origin, location, and fix
    suggestion. Exits with code 0 if the model is valid, 1 if issues found.
    """
    problems = validate_model(_MODEL)
    if not problems:
        print(_msg("validate_success"))
        sys.exit(0)

    print(_msg("validate_issues", count=len(problems)))
    for idx, problem in enumerate(problems, start=1):
        print(_msg("validate_item", num=idx, issue=problem))
    sys.exit(1)
|
|
2205
|
+
|
|
2206
|
+
|
|
2207
|
+
def _dry_run_phase(workflow: str, phase_name: str) -> list[str]:
    """Print expected agents and gates for one phase. Returns list of issues.

    Resolves the phase, agent, and gate keys through the ``workflow::phase``
    namespace, prints a one-line plan summary (required/skippable tag, agent
    names, readback/gatekeeper availability, resolved key), then test-renders
    every phase template against dummy values so format errors surface before
    a real run.
    """
    issues: list[str] = []
    phase_key = _resolve_key(workflow, phase_name, set(_MODEL.phases.keys()))
    agent_key = _resolve_key(workflow, phase_name, set(_MODEL.agents.keys()))
    agents = _MODEL.agents.get(agent_key, [])

    # Gates are keyed "<phase>::<gate>"; collect phase prefixes that have gates.
    gate_phases = {k.rsplit("::", 1)[0] for k in _MODEL.gates if "::" in k}
    gate_key = _resolve_key(workflow, phase_name, gate_phases)
    has_rb = f"{gate_key}::readback" in _MODEL.gates
    has_gk = f"{gate_key}::gatekeeper" in _MODEL.gates

    skippable = any(
        p.get("skippable") for p in _MODEL.workflow_types[workflow].phases
        if p["name"] == phase_name
    )
    tag = "skip" if skippable else "req"
    agent_names = ", ".join(a.name for a in agents) if agents else "none"
    rb = "yes" if has_rb else "NO"
    gk = "yes" if has_gk else "NO"

    # Report resolution path. The previous conditional
    # (phase_key if phase_key != phase_name else phase_name) was redundant:
    # both branches evaluate to phase_key's value.
    print(_msg("dry_run_phase_line", phase=phase_name, tag=tag, agents=agent_names, rb=rb, gk=gk)
          + f" [{phase_key}]")

    # Test template rendering with dummy context (defaultdict tolerates
    # unknown variables; only genuinely malformed templates raise).
    phase_obj = _MODEL.phases.get(phase_key)
    if phase_obj:
        dummy_ctx = collections.defaultdict(str, {v: f"<{v}>" for v in _KNOWN_TEMPLATE_VARS})
        for attr in ("start", "end", "start_continue", "start_final", "end_continue", "end_final"):
            text = getattr(phase_obj, attr, "")
            if text:
                try:
                    text.format_map(dummy_ctx)
                except (KeyError, ValueError, IndexError) as exc:
                    issues.append(f"[phases.yaml] '{phase_key}.{attr}': template render error: {exc}")

    return issues
|
|
2246
|
+
|
|
2247
|
+
|
|
2248
|
+
def _dry_run(itype: str, total_iterations: int) -> None:
    """Print expected execution plan without creating state.

    Validates the model, prints the per-phase plan for the dependency
    workflow (when one is planned) and for each requested iteration,
    reports template-rendering problems, then simulates the phase FSM
    lifecycle. Exits 1 on the first problem found; otherwise prints a
    completion message.
    """
    issues = validate_model(_MODEL)
    if issues:
        for issue in issues:
            print(_msg("dry_run_error", issue=issue))
        sys.exit(1)
    print(_msg("dry_run_valid"))

    wf = _MODEL.workflow_types[itype]
    # A dependency workflow runs once before the main iterations.
    dep_wf = _MODEL.workflow_types.get(wf.depends_on) if wf.depends_on else None
    template_issues: list[str] = []

    if dep_wf and total_iterations > 1:
        print(_msg("dry_run_planning_iter", wtype=wf.depends_on))
        for p in dep_wf.phases:
            template_issues.extend(_dry_run_phase(wf.depends_on, p["name"]))

    for i in range(1, total_iterations + 1):
        print(_msg("dry_run_impl_iter", num=i, wtype=itype))
        for p in wf.phases:
            template_issues.extend(_dry_run_phase(itype, p["name"]))

    if template_issues:
        print("\n Template rendering issues:")
        for ti in template_issues:
            print(f" {ti}")
        sys.exit(1)

    def _abort_on_fsm_failure(phase_names: list) -> None:
        # Run the FSM lifecycle simulation for the given phases and exit
        # on the first invalid report (previously duplicated inline twice).
        for r in _PHASE_FSM.simulate(phase_names):
            if not r["valid"]:
                print(_msg("dry_run_error", issue=f"FSM simulation failed for {r['phase']}: {r.get('error', '')}"))
                sys.exit(1)

    # FSM lifecycle simulation - verify all transitions work
    if dep_wf and total_iterations > 1:
        _abort_on_fsm_failure([p["name"] for p in dep_wf.phases])
    _abort_on_fsm_failure([p["name"] for p in wf.phases])

    print(_msg("dry_run_complete"))
|
|
2292
|
+
|
|
2293
|
+
|
|
2294
|
+
def cmd_add_iteration(args) -> None:
    """Add iterations to an active cycle without restarting.

    Increments ``total_iterations`` in the persisted state by ``args.count``,
    optionally replacing the stored objective, then records the change in the
    audit log.

    Args:
        args: Parsed CLI namespace with ``count`` (int, must be positive) and
            an optional ``objective`` attribute.
    """
    state = _load_state()
    if not state:
        print(_msg("no_active_add_iteration"), file=sys.stderr)
        sys.exit(1)
    count = args.count
    if count < 1:
        # Guard: a zero/negative count would silently shrink the cycle,
        # possibly below the iteration already in progress.
        print("--count must be a positive integer", file=sys.stderr)
        sys.exit(1)
    old_total = state["total_iterations"]
    new_total = old_total + count
    state["total_iterations"] = new_total
    new_objective = getattr(args, "objective", "") or ""
    if new_objective:
        state["objective"] = new_objective
    _save_state(state)
    _append_log(
        {
            "iteration": state["iteration"],
            "event": "add_iteration",
            "count": count,
            "old_total": old_total,
            "new_total": new_total,
        }
    )
    print(_msg("add_iteration_success", count=count, old=old_total, new=new_total))
|
|
2317
|
+
|
|
2318
|
+
|
|
2319
|
+
# ── Main ─────────────────────────────────────────────────────────────
|
|
2320
|
+
|
|
2321
|
+
|
|
2322
|
+
def main(resources_dir: Path | None = None):
    """CLI entry point: parse arguments and dispatch to a command handler.

    Args:
        resources_dir: Directory holding the YAML resource files. When None,
            a --resources-dir flag on the command line wins; failing that,
            the 'resources' directory next to this file is used.
    """
    # Resolution precedence: explicit argument > CLI flag > packaged default.
    if resources_dir is None:
        # The flag is scanned by hand because _initialize() must load the
        # YAML resources before argparse help strings can be built.
        argv = sys.argv
        for idx, token in enumerate(argv[1:], start=1):
            if token == "--resources-dir" and idx + 1 < len(argv):
                resources_dir = Path(argv[idx + 1])
                break
            if token.startswith("--resources-dir="):
                resources_dir = Path(token.split("=", 1)[1])
                break
        else:
            resources_dir = Path(__file__).parent / "resources"

    _initialize(resources_dir)

    root = argparse.ArgumentParser(
        description=_cli("description", ""),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_cli("epilog", ""),
    )
    root.add_argument(
        "--resources-dir",
        default=str(resources_dir),
        help="Path to YAML resource files directory",
    )
    commands = root.add_subparsers(dest="command")

    # new: begin a fresh cycle.
    new_cmd = commands.add_parser("new", help=_cli("commands", "new"))
    new_cmd.add_argument("--type", required=True, choices=list(ITERATION_TYPES))
    new_cmd.add_argument("--objective", required=True, help=_cli("args", "objective"))
    new_cmd.add_argument("--iterations", type=int, default=1, help=_cli("args", "iterations"))
    new_cmd.add_argument("--benchmark", default="", help=_cli("args", "benchmark"))
    new_cmd.add_argument("--clean", action="store_true", default=True, help=_cli("args", "clean"))
    new_cmd.add_argument("--no-clean", action="store_false", dest="clean", help=_cli("args", "no_clean"))
    new_cmd.add_argument("--dry-run", action="store_true", default=False, help=_cli("args", "dry_run"))

    # start: open the current phase.
    start_cmd = commands.add_parser("start", help=_cli("commands", "start"))
    start_cmd.add_argument("--understanding", required=True, help=_cli("args", "understanding"))

    # end: close the current phase.
    end_cmd = commands.add_parser("end", help=_cli("commands", "end"))
    end_cmd.add_argument("--evidence", default="", help=_cli("args", "evidence"))
    end_cmd.add_argument("--agents", default="", help=_cli("args", "agents"))
    end_cmd.add_argument("--output-file", default="", help=_cli("args", "output_file"))

    # status: report-only, no options of its own.
    commands.add_parser("status", help=_cli("commands", "status"))

    # reject: refuse the current phase, reason mandatory.
    reject_cmd = commands.add_parser("reject", help=_cli("commands", "reject"))
    reject_cmd.add_argument("--reason", required=True, help=_cli("args", "reason"))

    # skip: bypass a skippable phase.
    skip_cmd = commands.add_parser("skip", help=_cli("commands", "skip"))
    skip_cmd.add_argument("--reason", default="", help=_cli("args", "skip_reason"))
    skip_cmd.add_argument("--force", action="store_true", default=False, help=_cli("args", "force"))

    # context: attach or clear contextual notes.
    ctx_cmd = commands.add_parser("context", help=_cli("commands", "context"))
    ctx_cmd.add_argument("--message", default="", help=_cli("args", "message"))
    ctx_cmd.add_argument("--phase", default="", help=_cli("args", "phase"))
    ctx_cmd.add_argument("--clear", action="store_true", default=False, help=_cli("args", "clear"))

    # log-failure: record a failure mode observation.
    fail_cmd = commands.add_parser("log-failure", help=_cli("commands", "log_failure"))
    fail_cmd.add_argument("--mode", required=True, help=_cli("args", "mode"))
    fail_cmd.add_argument("--desc", required=True, help=_cli("args", "desc"))

    # failures / hypotheses: report-only listings.
    commands.add_parser("failures", help=_cli("commands", "failures"))
    commands.add_parser("hypotheses", help=_cli("commands", "hypotheses"))

    # add-iteration: extend an active cycle.
    add_cmd = commands.add_parser("add-iteration", help=_cli("commands", "add_iteration"))
    add_cmd.add_argument("--count", type=int, required=True, help=_cli("args", "count"))
    add_cmd.add_argument("--objective", default="", help=_cli("args", "add_objective"))

    # validate: schema check of the loaded resources.
    commands.add_parser("validate", help="Validate YAML resources against the model schema")

    args = root.parse_args()
    if not args.command:
        root.print_help()
        sys.exit(1)

    # Make sure the artifacts directory exists before any handler runs.
    _init_artifacts_dir()

    handlers = {
        "new": cmd_new,
        "start": cmd_start,
        "end": cmd_end,
        "status": cmd_status,
        "reject": cmd_reject,
        "skip": cmd_skip,
        "context": cmd_context,
        "log-failure": cmd_log_failure,
        "failures": cmd_failures,
        "hypotheses": cmd_hypotheses,
        "add-iteration": cmd_add_iteration,
        "validate": cmd_validate,
    }
    handlers[args.command](args)
|
|
2441
|
+
|
|
2442
|
+
|
|
2443
|
+
# Script entry point: resolve the resources directory and run the CLI.
if __name__ == "__main__":
    main()
|