@lnilluv/pi-ralph-loop 0.1.4-dev.0 → 0.1.4-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -12
- package/package.json +1 -1
- package/src/index.ts +1034 -168
- package/src/ralph-draft-llm.ts +35 -7
- package/src/ralph-draft.ts +1 -1
- package/src/ralph.ts +708 -51
- package/src/runner-rpc.ts +434 -0
- package/src/runner-state.ts +822 -0
- package/src/runner.ts +957 -0
- package/tests/fixtures/parity/migrate/OPEN_QUESTIONS.md +3 -0
- package/tests/fixtures/parity/migrate/RALPH.md +27 -0
- package/tests/fixtures/parity/migrate/golden/MIGRATED.md +15 -0
- package/tests/fixtures/parity/migrate/legacy/source.md +6 -0
- package/tests/fixtures/parity/migrate/legacy/source.yaml +3 -0
- package/tests/fixtures/parity/migrate/scripts/show-legacy.sh +10 -0
- package/tests/fixtures/parity/migrate/scripts/verify.sh +15 -0
- package/tests/fixtures/parity/research/OPEN_QUESTIONS.md +3 -0
- package/tests/fixtures/parity/research/RALPH.md +45 -0
- package/tests/fixtures/parity/research/claim-evidence-checklist.md +15 -0
- package/tests/fixtures/parity/research/expected-outputs.md +22 -0
- package/tests/fixtures/parity/research/scripts/show-snapshots.sh +13 -0
- package/tests/fixtures/parity/research/scripts/verify.sh +55 -0
- package/tests/fixtures/parity/research/snapshots/app-factory-ai-cli.md +11 -0
- package/tests/fixtures/parity/research/snapshots/docs-factory-ai-cli-features-missions.md +11 -0
- package/tests/fixtures/parity/research/snapshots/factory-ai-news-missions.md +11 -0
- package/tests/fixtures/parity/research/source-manifest.md +20 -0
- package/tests/index.test.ts +3169 -104
- package/tests/parity/README.md +9 -0
- package/tests/parity/harness.py +526 -0
- package/tests/parity-harness.test.ts +42 -0
- package/tests/parity-research-fixture.test.ts +34 -0
- package/tests/ralph-draft-llm.test.ts +82 -9
- package/tests/ralph-draft.test.ts +1 -1
- package/tests/ralph.test.ts +1265 -36
- package/tests/runner-event-contract.test.ts +235 -0
- package/tests/runner-rpc.test.ts +358 -0
- package/tests/runner-state.test.ts +553 -0
- package/tests/runner.test.ts +1347 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Parity harness
|
|
2
|
+
|
|
3
|
+
Run from the repo root:
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
python3 tests/parity/harness.py --implementation pi-ralph-loop --fixture research --fixture migrate
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Fixtures live under `tests/fixtures/parity/`. The harness copies each fixture into a temporary task workspace, replays it, and prints the bundle root path to stdout when it finishes.
|
|
@@ -0,0 +1,526 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import shlex
|
|
9
|
+
import shutil
|
|
10
|
+
import subprocess
|
|
11
|
+
import tempfile
|
|
12
|
+
import threading
|
|
13
|
+
import time
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
REPO_ROOT = Path(__file__).resolve().parents[2]
|
|
18
|
+
FIXTURES_ROOT = REPO_ROOT / "tests" / "fixtures" / "parity"
|
|
19
|
+
DEFAULT_MODEL = os.environ.get("PI_RALPH_PARITY_MODEL")
|
|
20
|
+
DEFAULT_LOOP_RPC_COMMAND = os.environ.get("PI_RALPH_PARITY_LOOP_RPC_COMMAND")
|
|
21
|
+
DEFAULT_RALPHIFY_RPC_COMMAND = os.environ.get("PI_RALPH_PARITY_RALPHIFY_RPC_COMMAND", "")
|
|
22
|
+
DEFAULT_LOOP_PROMPT_TEMPLATE = os.environ.get("PI_RALPH_PARITY_LOOP_PROMPT_TEMPLATE", "/ralph --path {ralph_path}")
|
|
23
|
+
DEFAULT_RALPHIFY_PROMPT_TEMPLATE = os.environ.get(
|
|
24
|
+
"PI_RALPH_PARITY_RALPHIFY_PROMPT_TEMPLATE",
|
|
25
|
+
"/ralph --path {ralph_path}",
|
|
26
|
+
)
|
|
27
|
+
TERMINAL_STATUSES = {
|
|
28
|
+
"complete",
|
|
29
|
+
"max-iterations",
|
|
30
|
+
"no-progress-exhaustion",
|
|
31
|
+
"stopped",
|
|
32
|
+
"timeout",
|
|
33
|
+
"error",
|
|
34
|
+
"cancelled",
|
|
35
|
+
}
|
|
36
|
+
AGENT_FILE_NAMES = ["auth.json", "models.json", "agent-models.json"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 "Z"-suffixed timestamp."""
    now_utc = time.gmtime()
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", now_utc)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def read_text(path: Path) -> str:
    """Read *path* and return its contents decoded as UTF-8."""
    contents = path.read_text(encoding="utf-8")
    return contents
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def write_text(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating parent directories first."""
    parent = path.parent
    parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def write_json(path: Path, payload: Any) -> None:
    """Serialize *payload* as indented, key-sorted JSON (newline-terminated) to *path*."""
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(serialized + "\n", encoding="utf-8")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def inventory(path: Path) -> list[dict[str, Any]]:
    """Return one {path, size, sha256} row per file under *path*, sorted by path.

    Directories are skipped; a missing root yields an empty list.
    """
    if not path.exists():
        return []

    rows: list[dict[str, Any]] = []
    for entry in sorted(path.rglob("*")):
        if not entry.is_file():
            continue
        data = entry.read_bytes()
        rows.append(
            {
                "path": entry.relative_to(path).as_posix(),
                "size": len(data),
                "sha256": hashlib.sha256(data).hexdigest(),
            }
        )
    return rows
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def write_inventory_tsv(path: Path, rows: list[dict[str, Any]]) -> None:
    """Render *rows* as a TSV table with a path/size/sha256 header and write it to *path*."""
    lines = ["path\tsize\tsha256"]
    lines.extend(f"{row['path']}\t{row['size']}\t{row['sha256']}" for row in rows)
    # Inline write_text: create parents, then emit UTF-8 with a trailing newline.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def read_status(task_dir: Path, error_context: list[dict[str, str]] | None = None) -> dict[str, Any] | None:
|
|
83
|
+
status_path = task_dir / ".ralph-runner" / "status.json"
|
|
84
|
+
if not status_path.exists():
|
|
85
|
+
return None
|
|
86
|
+
try:
|
|
87
|
+
payload = json.loads(read_text(status_path))
|
|
88
|
+
except (OSError, json.JSONDecodeError, UnicodeDecodeError) as exc:
|
|
89
|
+
if error_context is not None:
|
|
90
|
+
entry = {"path": str(status_path), "error": f"{type(exc).__name__}: {exc}"}
|
|
91
|
+
if not error_context or error_context[-1] != entry:
|
|
92
|
+
error_context.append(entry)
|
|
93
|
+
return None
|
|
94
|
+
if isinstance(payload, dict):
|
|
95
|
+
return payload
|
|
96
|
+
return None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def ensure_agent_dir(bundle_root: Path) -> dict[str, Any]:
    """Create `<bundle_root>/agent` and copy agent credential/model files into it.

    The source directory is $PI_CODING_AGENT_DIR when set (and non-empty),
    otherwise `~/.pi/agent`. Only files named in AGENT_FILE_NAMES that actually
    exist are copied. Returns a summary of source, destination, and copied files.
    """
    agent_dir = bundle_root / "agent"
    agent_dir.mkdir(parents=True, exist_ok=True)

    override = os.environ.get("PI_CODING_AGENT_DIR")
    source = Path(override) if override else Path.home() / ".pi" / "agent"

    copied: list[str] = []
    for file_name in AGENT_FILE_NAMES:
        candidate = source / file_name
        if not candidate.exists():
            continue
        shutil.copy2(candidate, agent_dir / file_name)
        copied.append(file_name)

    return {
        "source": str(source),
        "destination": str(agent_dir),
        "copied_files": copied,
    }
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def create_bundle_root(explicit_root: str | None) -> Path:
|
|
124
|
+
if explicit_root:
|
|
125
|
+
root = Path(explicit_root).expanduser().resolve()
|
|
126
|
+
root.mkdir(parents=True, exist_ok=True)
|
|
127
|
+
return root
|
|
128
|
+
return Path(tempfile.mkdtemp(prefix="pi-ralph-parity-")).resolve()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def build_loop_rpc_command(model: str | None) -> list[str]:
    """Build the default `pi` RPC command for the loop, optionally pinning *model*."""
    entrypoint = REPO_ROOT / "src" / "index.ts"
    command = ["pi", "--mode", "rpc", "--no-extensions", "-e", str(entrypoint)]
    if model:
        command += ["--model", model]
    return command
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def run_git(args: list[str]) -> str:
    """Run `git <args>` in the repository root and return combined stdout+stderr.

    check=False means a failing git command (e.g. when the tree is not a git
    checkout) yields its error text instead of raising.
    """
    result = subprocess.run(
        ["git", *args],
        # NOTE(review): the previous comment here ("Run the RPC session inside
        # the copied task workspace...") described run_rpc_session, not this
        # function — git provenance commands intentionally run in the
        # repository checkout itself.
        cwd=REPO_ROOT,
        capture_output=True,
        text=True,
        check=False,
    )
    # Merge both streams so callers also capture error text from failed commands.
    output = (result.stdout + result.stderr).strip()
    return output
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def parse_command(text: str) -> list[str]:
    """Split a shell-style command string into argv tokens."""
    tokens = shlex.split(text)
    return tokens
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def task_prompt(prompt_template: str, ralph_path: Path) -> str:
    """Fill *prompt_template* with shell-quoted paths plus fixture/implementation names.

    Assumes the layout `.../<fixture>/<implementation>/task/RALPH.md`, so
    parents[2] is the fixture directory and parents[1] the implementation
    directory (matches run_fixture's `runs/<fixture>/<impl>/task` layout).
    """
    values = {
        "ralph_path": shlex.quote(str(ralph_path)),
        "task_dir": shlex.quote(str(ralph_path.parent)),
        "fixture": ralph_path.parents[2].name,
        "implementation": ralph_path.parents[1].name,
    }
    return prompt_template.format(**values)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def stream_reader(stream, file_obj, store: list[str], last_output: list[float]) -> None:
    """Pump lines from *stream* into *store* and *file_obj* until EOF.

    Every line refreshes last_output[0] so the caller's idle watchdog can tell
    the child process is still producing output. The stream is always closed on
    exit; close errors are deliberately ignored.
    """
    try:
        while True:
            line = stream.readline()
            if line == "":
                break
            last_output[0] = time.time()
            store.append(line)
            file_obj.write(line)
            file_obj.flush()  # keep the on-disk log current for live inspection
    finally:
        try:
            stream.close()
        except (OSError, ValueError):
            pass
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def run_rpc_session(
    command: list[str],
    prompt: str,
    cwd: Path,
    env: dict[str, str],
    stdout_path: Path,
    stderr_path: Path,
    task_dir: Path,
    run_timeout_seconds: int,
    quiet_kill_seconds: float,
) -> dict[str, Any]:
    """Run one RPC session: spawn *command*, send *prompt*, poll until done.

    Termination conditions, in priority order per poll tick:
      - the child exits on its own ("process-exited"),
      - status.json under *task_dir* reports a terminal status
        ("terminal-status:<value>"),
      - the child stays silent past the grace window without reporting an
        active status ("idle-timeout:no-status"),
      - the overall wall clock budget expires ("timeout", the initial value).

    Stdout/stderr are streamed to *stdout_path*/*stderr_path* as they arrive.
    Returns a summary dict; cleanup/status read errors are collected, never raised.
    """
    stdout_path.parent.mkdir(parents=True, exist_ok=True)
    stderr_path.parent.mkdir(parents=True, exist_ok=True)

    proc = subprocess.Popen(
        command,
        cwd=str(cwd),
        env=env,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,  # line-buffered so the reader threads see lines promptly
    )
    stdout_lines: list[str] = []
    stderr_lines: list[str] = []
    # Single-element list so reader threads can mutate the shared timestamp.
    last_output = [time.time()]
    termination_reason = "timeout"  # default if the loop runs out of budget
    cleanup_errors: list[dict[str, str]] = []
    status_errors: list[dict[str, str]] = []
    # Give slow-starting processes time before the quiet-kill check applies.
    startup_grace_seconds = max(quiet_kill_seconds * 10, 30.0)

    stdout_file = stdout_path.open("w", encoding="utf-8")
    stderr_file = stderr_path.open("w", encoding="utf-8")
    threads = [
        threading.Thread(target=stream_reader, args=(proc.stdout, stdout_file, stdout_lines, last_output), daemon=True),
        threading.Thread(target=stream_reader, args=(proc.stderr, stderr_file, stderr_lines, last_output), daemon=True),
    ]
    for thread in threads:
        thread.start()

    # Single RPC prompt message, newline-delimited JSON on the child's stdin.
    payload = json.dumps({"type": "prompt", "id": f"parity-{int(time.time() * 1000)}", "message": prompt}) + "\n"
    if proc.stdin is not None:
        proc.stdin.write(payload)
        proc.stdin.flush()

    started_at = time.time()
    status_snapshot: dict[str, Any] | None = None

    try:
        while time.time() - started_at < run_timeout_seconds:
            if proc.poll() is not None:
                termination_reason = "process-exited"
                break

            status_snapshot = read_status(task_dir, status_errors)
            status_value = status_snapshot.get("status") if status_snapshot else None
            if status_value in TERMINAL_STATUSES:
                termination_reason = f"terminal-status:{status_value}"
                # Brief grace so trailing output is flushed before teardown.
                time.sleep(1.0)
                break

            # A process actively reporting status may legitimately stay silent.
            status_is_active = isinstance(status_value, str) and status_value in {"initializing", "running"}
            if (
                quiet_kill_seconds > 0
                and time.time() - started_at >= startup_grace_seconds
                and time.time() - last_output[0] > quiet_kill_seconds
                and not status_is_active
            ):
                termination_reason = "idle-timeout:no-status"
                break

            time.sleep(0.25)
    finally:
        # Escalating teardown: terminate, then kill if it won't die.
        if proc.poll() is None:
            try:
                proc.terminate()
            except (ProcessLookupError, OSError) as exc:
                cleanup_errors.append({"action": "terminate", "error": f"{type(exc).__name__}: {exc}"})
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired as exc:
                cleanup_errors.append({"action": "wait-after-terminate", "error": f"{type(exc).__name__}: {exc}"})
                try:
                    proc.kill()
                except (ProcessLookupError, OSError) as kill_exc:
                    cleanup_errors.append({"action": "kill", "error": f"{type(kill_exc).__name__}: {kill_exc}"})
                try:
                    proc.wait(timeout=5)
                except subprocess.TimeoutExpired as wait_exc:
                    cleanup_errors.append({"action": "wait-after-kill", "error": f"{type(wait_exc).__name__}: {wait_exc}"})

        if proc.stdin is not None:
            try:
                proc.stdin.close()
            except (OSError, ValueError) as exc:
                cleanup_errors.append({"action": "close-stdin", "error": f"{type(exc).__name__}: {exc}"})

        # Reader threads are daemons; bounded join so teardown cannot hang.
        for thread in threads:
            thread.join(timeout=2)
        stdout_file.close()
        stderr_file.close()

    return {
        "returncode": proc.returncode,
        "stdout_lines": stdout_lines,
        "stderr_lines": stderr_lines,
        "status": status_snapshot,
        "termination_reason": termination_reason,
        "cleanup_errors": cleanup_errors,
        "status_errors": status_errors,
        "command": command,
        "prompt": prompt,
    }
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def run_verifier(task_dir: Path, env: dict[str, str], run_dir: Path) -> dict[str, Any]:
    """Execute `scripts/verify.sh` inside *task_dir* if present, archiving output.

    Returns {"skipped": True} when no verifier script exists. Otherwise writes
    the command, stdout, stderr, and a verify.json summary under *run_dir* and
    returns that summary (passed == return code 0).
    """
    verify_script = task_dir / "scripts" / "verify.sh"
    if not verify_script.exists():
        return {"skipped": True}

    verify_command = ["bash", str(verify_script)]
    result = subprocess.run(
        verify_command,
        cwd=str(task_dir),
        env=env,
        capture_output=True,
        text=True,
        check=False,
    )

    quoted_command = " ".join(shlex.quote(part) for part in verify_command)
    write_text(run_dir / "verify.command.txt", quoted_command + "\n")
    write_text(run_dir / "verify.stdout.log", result.stdout)
    write_text(run_dir / "verify.stderr.log", result.stderr)

    payload = {
        "returncode": result.returncode,
        "command": verify_command,
        "cwd": str(task_dir),
        "passed": result.returncode == 0,
    }
    write_json(run_dir / "verify.json", payload)
    return payload
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def implementation_plan(implementation: str, loop_command: list[str], loop_prompt_template: str, ralphify_command: list[str] | None) -> list[tuple[str, list[str], str]]:
    """Expand *implementation* into (name, rpc_command, prompt_template) run entries.

    Raises SystemExit when a Ralphify run is requested without a Ralphify RPC
    command, or when *implementation* is unrecognized.
    """
    loop_entry = ("pi-ralph-loop", loop_command, loop_prompt_template)
    if implementation == "pi-ralph-loop":
        return [loop_entry]
    if implementation in ("ralphify", "both"):
        if not ralphify_command:
            message = (
                "--implementation ralphify requires PI_RALPH_PARITY_RALPHIFY_RPC_COMMAND or --ralphify-rpc-command"
                if implementation == "ralphify"
                else "--implementation both requires a Ralphify RPC command"
            )
            raise SystemExit(message)
        ralphify_entry = ("ralphify", ralphify_command, DEFAULT_RALPHIFY_PROMPT_TEMPLATE)
        return [ralphify_entry] if implementation == "ralphify" else [loop_entry, ralphify_entry]
    raise SystemExit(f"Unknown implementation: {implementation}")
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def run_fixture(bundle_root: Path, fixture_name: str, implementation: str, rpc_command: list[str], prompt_template: str, env: dict[str, str], run_timeout_seconds: int, quiet_kill_seconds: float) -> dict[str, Any]:
    """Replay one fixture under one implementation and archive provenance.

    Copies the fixture into `<bundle_root>/runs/<fixture>/<implementation>/task`,
    runs the RPC session against it, re-inventories the workspace, runs the
    fixture's verifier (if any), and writes run-metadata.json. Returns the
    metadata dict. Raises SystemExit when the fixture directory is missing.
    """
    fixture_dir = FIXTURES_ROOT / fixture_name
    if not fixture_dir.exists():
        raise SystemExit(f"Missing fixture directory: {fixture_dir}")

    run_dir = bundle_root / "runs" / fixture_name / implementation
    task_dir = run_dir / "task"
    # A fresh run directory each time so stale artifacts can't leak in.
    if run_dir.exists():
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)

    shutil.copytree(fixture_dir, task_dir)

    # Snapshot the pristine fixture before the agent touches anything.
    before_rows = inventory(task_dir)
    write_inventory_tsv(run_dir / "inventory-before.tsv", before_rows)
    write_json(run_dir / "fixture-manifest.json", {
        "fixture": fixture_name,
        "implementation": implementation,
        "fixture_dir": str(fixture_dir),
        "task_dir": str(task_dir),
        "files": before_rows,
    })

    ralph_path = task_dir / "RALPH.md"
    prompt = task_prompt(prompt_template, ralph_path)
    # Record the exact command and prompt for later reproduction.
    write_text(run_dir / "command.txt", " ".join(shlex.quote(part) for part in rpc_command) + "\n")
    write_text(run_dir / "prompt.txt", prompt + "\n")

    started_at = utc_now()
    session_result = run_rpc_session(
        rpc_command,
        prompt,
        task_dir,  # cwd: keep the agent's writes inside the copied workspace
        env,
        run_dir / "top-level-rpc.jsonl",
        run_dir / "top-level-stderr.log",
        task_dir,  # also where status.json is polled
        run_timeout_seconds,
        quiet_kill_seconds,
    )

    # Post-run snapshot + fixture verifier.
    after_rows = inventory(task_dir)
    write_inventory_tsv(run_dir / "inventory-after.tsv", after_rows)
    verifier_result = run_verifier(task_dir, env, run_dir)

    metadata = {
        "fixture": fixture_name,
        "implementation": implementation,
        "command": rpc_command,
        "prompt": prompt,
        "task_dir": str(task_dir),
        "started_at": started_at,
        "finished_at": utc_now(),
        "termination_reason": session_result["termination_reason"],
        "session": {
            "returncode": session_result["returncode"],
            "status": session_result["status"],
            "termination_reason": session_result["termination_reason"],
            "cleanup_errors": session_result["cleanup_errors"],
            "status_errors": session_result["status_errors"],
            # Only line counts here; full transcripts live in the log files.
            "stdout_lines": len(session_result["stdout_lines"]),
            "stderr_lines": len(session_result["stderr_lines"]),
        },
        "verifier": verifier_result,
        "inventory": {
            "before_count": len(before_rows),
            "after_count": len(after_rows),
        },
    }
    write_json(run_dir / "run-metadata.json", metadata)

    return metadata
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def main() -> int:
    """CLI entry point: parse arguments, replay the selected fixtures, write manifest.json.

    Prints the bundle root path on success and returns 0.
    """
    parser = argparse.ArgumentParser(description="Replay parity fixtures and capture provenance bundles.")
    parser.add_argument(
        "--fixture",
        action="append",
        choices=("research", "migrate"),
        help="Fixture to run. May be provided more than once. Default: both fixtures.",
    )
    parser.add_argument(
        "--implementation",
        choices=("pi-ralph-loop", "ralphify", "both"),
        default="pi-ralph-loop",
        help="Which implementation to run.",
    )
    parser.add_argument(
        "--root",
        help="Reuse this artifact root instead of creating a fresh temp dir.",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="Model to pin for the built-in pi-ralph-loop command. If omitted, pi uses the currently active model.",
    )
    parser.add_argument(
        "--loop-rpc-command",
        default=DEFAULT_LOOP_RPC_COMMAND,
        help="Full RPC command override for pi-ralph-loop. When set, this replaces the built-in pi command entirely.",
    )
    parser.add_argument(
        "--ralphify-rpc-command",
        default=DEFAULT_RALPHIFY_RPC_COMMAND,
        help="RPC command used for Ralphify.",
    )
    parser.add_argument(
        "--loop-prompt-template",
        default=DEFAULT_LOOP_PROMPT_TEMPLATE,
        help="Prompt template used for pi-ralph-loop.",
    )
    parser.add_argument(
        "--ralphify-prompt-template",
        default=DEFAULT_RALPHIFY_PROMPT_TEMPLATE,
        help="Prompt template used for Ralphify.",
    )
    parser.add_argument(
        "--run-timeout-seconds",
        type=int,
        default=900,
        help="Maximum wall-clock time per run.",
    )
    parser.add_argument(
        "--quiet-kill-seconds",
        type=float,
        default=3.0,
        help="How long to wait with no output before a non-status-aware process is considered idle. Processes that report initializing/running status are allowed to stay silent.",
    )
    args = parser.parse_args()

    # No --fixture flags means: run every known fixture.
    fixtures = args.fixture if args.fixture else ["research", "migrate"]
    bundle_root = create_bundle_root(args.root or os.environ.get("PI_RALPH_PARITY_ROOT"))
    agent_info = ensure_agent_dir(bundle_root)

    # Point the agent at the bundle-local copy of its credential files.
    env = os.environ.copy()
    env["PI_CODING_AGENT_DIR"] = agent_info["destination"]

    # An explicit RPC command wins over the built-in pi command.
    loop_command = (
        parse_command(args.loop_rpc_command)
        if args.loop_rpc_command
        else build_loop_rpc_command(args.model)
    )
    ralphify_command = parse_command(args.ralphify_rpc_command) if args.ralphify_rpc_command else None

    runs: list[dict[str, Any]] = []
    for fixture_name in fixtures:
        for impl_name, rpc_command, prompt_template in implementation_plan(args.implementation, loop_command, args.loop_prompt_template, ralphify_command):
            metadata = run_fixture(
                bundle_root,
                fixture_name,
                impl_name,
                rpc_command,
                # Ralphify's template comes from its own CLI flag.
                prompt_template if impl_name == "pi-ralph-loop" else args.ralphify_prompt_template,
                env,
                args.run_timeout_seconds,
                args.quiet_kill_seconds,
            )
            runs.append(metadata)

    # Top-level provenance manifest: repo state plus every run's metadata.
    manifest = {
        "created_at": utc_now(),
        "repo_root": str(REPO_ROOT),
        "repo_head": run_git(["rev-parse", "HEAD"]),
        "repo_status": run_git(["status", "--short"]),
        "bundle_root": str(bundle_root),
        "agent": agent_info,
        "fixtures": fixtures,
        "implementation": args.implementation,
        "loop_rpc_command": loop_command,
        "ralphify_rpc_command": ralphify_command,
        "runs": runs,
    }
    write_json(bundle_root / "manifest.json", manifest)

    # The bundle root on stdout is the harness's machine-readable result.
    print(bundle_root)
    return 0
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import { execFile } from "node:child_process";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
import { promisify } from "node:util";
|
|
5
|
+
import test from "node:test";
|
|
6
|
+
|
|
7
|
+
const execFileAsync = promisify(execFile);
|
|
8
|
+
const repoRoot = fileURLToPath(new URL("..", import.meta.url));
|
|
9
|
+
|
|
10
|
+
function cleanEnv(): NodeJS.ProcessEnv {
|
|
11
|
+
const env = { ...process.env };
|
|
12
|
+
delete env.PI_RALPH_PARITY_MODEL;
|
|
13
|
+
delete env.PI_RALPH_PARITY_LOOP_RPC_COMMAND;
|
|
14
|
+
return env;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
test("parity harness builds a pi command without pinning a model by default", async () => {
|
|
18
|
+
const { stdout } = await execFileAsync(
|
|
19
|
+
"python3",
|
|
20
|
+
[
|
|
21
|
+
"-c",
|
|
22
|
+
"from tests.parity.harness import build_loop_rpc_command; print(' '.join(build_loop_rpc_command(None)))",
|
|
23
|
+
],
|
|
24
|
+
{ cwd: repoRoot, env: cleanEnv() },
|
|
25
|
+
);
|
|
26
|
+
|
|
27
|
+
assert.match(stdout.trim(), /^pi --mode rpc --no-extensions -e .*\/src\/index\.ts$/);
|
|
28
|
+
assert.equal(stdout.includes("--model"), false);
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
test("parity harness appends an explicit model when requested", async () => {
|
|
32
|
+
const { stdout } = await execFileAsync(
|
|
33
|
+
"python3",
|
|
34
|
+
[
|
|
35
|
+
"-c",
|
|
36
|
+
"from tests.parity.harness import build_loop_rpc_command; print(' '.join(build_loop_rpc_command('openai-codex/gpt-5.4-mini:high')))"
|
|
37
|
+
],
|
|
38
|
+
{ cwd: repoRoot, env: cleanEnv() },
|
|
39
|
+
);
|
|
40
|
+
|
|
41
|
+
assert.match(stdout.trim(), /^pi --mode rpc --no-extensions -e .*\/src\/index\.ts --model openai-codex\/gpt-5\.4-mini:high$/);
|
|
42
|
+
});
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import test from "node:test";
|
|
5
|
+
|
|
6
|
+
const fixtureDir = join(process.cwd(), "tests/fixtures/parity/research");
|
|
7
|
+
|
|
8
|
+
const generatedOutputs = [
|
|
9
|
+
"INSTALL_FLOW.md",
|
|
10
|
+
"MISSIONS_FINDINGS.md",
|
|
11
|
+
"evidence/INDEX.md",
|
|
12
|
+
"evidence/raw/app-factory-ai-cli.md",
|
|
13
|
+
"evidence/raw/docs-factory-ai-cli-features-missions.md",
|
|
14
|
+
"evidence/raw/factory-ai-news-missions.md",
|
|
15
|
+
];
|
|
16
|
+
|
|
17
|
+
test("research fixture does not include generated outputs", () => {
|
|
18
|
+
for (const rel of generatedOutputs) {
|
|
19
|
+
assert.equal(existsSync(join(fixtureDir, rel)), false, `${rel} should not be checked in`);
|
|
20
|
+
}
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
test("research fixture instructions name the helper scripts explicitly", () => {
|
|
24
|
+
const ralph = readFileSync(join(fixtureDir, "RALPH.md"), "utf8");
|
|
25
|
+
assert.match(ralph, /First, inspect `\.\/scripts\/show-snapshots\.sh`\./);
|
|
26
|
+
assert.match(ralph, /Before you finish, run `\.\/scripts\/verify\.sh`\./);
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
test("research checklist leaves generated outputs as pending work", () => {
|
|
30
|
+
const checklist = readFileSync(join(fixtureDir, "claim-evidence-checklist.md"), "utf8");
|
|
31
|
+
assert.match(checklist, /- \[ \] `INSTALL_FLOW\.md` must synthesize the shared installer claim across all three snapshots\./);
|
|
32
|
+
assert.match(checklist, /- \[ \] `MISSIONS_FINDINGS\.md` must cite each snapshot path directly\./);
|
|
33
|
+
assert.match(checklist, /- \[ \] `evidence\/INDEX\.md` must map each raw evidence file back to its snapshot\./);
|
|
34
|
+
});
|