@kestrel-agents/ruhroh 0.5.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +114 -0
  3. package/assets/ruhroh-badge.png +0 -0
  4. package/assets/ruhroh-logo.png +0 -0
  5. package/dist/adapters.d.ts +97 -0
  6. package/dist/adapters.d.ts.map +1 -0
  7. package/dist/adapters.js +21 -0
  8. package/dist/adapters.js.map +1 -0
  9. package/dist/builtin-scenarios.d.ts +8 -0
  10. package/dist/builtin-scenarios.d.ts.map +1 -0
  11. package/dist/builtin-scenarios.js +22 -0
  12. package/dist/builtin-scenarios.js.map +1 -0
  13. package/dist/cli.d.ts +30 -0
  14. package/dist/cli.d.ts.map +1 -0
  15. package/dist/cli.js +313 -0
  16. package/dist/cli.js.map +1 -0
  17. package/dist/env.d.ts +6 -0
  18. package/dist/env.d.ts.map +1 -0
  19. package/dist/env.js +66 -0
  20. package/dist/env.js.map +1 -0
  21. package/dist/generate.d.ts +32 -0
  22. package/dist/generate.d.ts.map +1 -0
  23. package/dist/generate.js +231 -0
  24. package/dist/generate.js.map +1 -0
  25. package/dist/harbor.d.ts +28 -0
  26. package/dist/harbor.d.ts.map +1 -0
  27. package/dist/harbor.js +47 -0
  28. package/dist/harbor.js.map +1 -0
  29. package/dist/index.d.ts +8 -0
  30. package/dist/index.d.ts.map +1 -0
  31. package/dist/index.js +8 -0
  32. package/dist/index.js.map +1 -0
  33. package/dist/results.d.ts +66 -0
  34. package/dist/results.d.ts.map +1 -0
  35. package/dist/results.js +31 -0
  36. package/dist/results.js.map +1 -0
  37. package/dist/scenarios.d.ts +61 -0
  38. package/dist/scenarios.d.ts.map +1 -0
  39. package/dist/scenarios.js +69 -0
  40. package/dist/scenarios.js.map +1 -0
  41. package/package.json +66 -0
  42. package/python/ruhroh/__init__.py +5 -0
  43. package/python/ruhroh/harbor_agent.py +345 -0
  44. package/python/ruhroh/loop_controller.py +783 -0
  45. package/python/ruhroh/setup.sh +12 -0
  46. package/scenarios/grocery-budget-planner/instruction.md +1 -0
  47. package/scenarios/grocery-budget-planner/scenario.json +44 -0
  48. package/scenarios/nextjs-task-board/instruction.md +1 -0
  49. package/scenarios/nextjs-task-board/scenario.json +45 -0
  50. package/scenarios/shift-coverage-planner/assets/prompt-assets/shift-coverage/coverage-rules.json +29 -0
  51. package/scenarios/shift-coverage-planner/assets/prompt-assets/shift-coverage/employees.csv +8 -0
  52. package/scenarios/shift-coverage-planner/assets/prompt-assets/shift-coverage/existing-schedule.csv +9 -0
  53. package/scenarios/shift-coverage-planner/assets/prompt-assets/shift-coverage/shift-requirements.csv +8 -0
  54. package/scenarios/shift-coverage-planner/assets/prompt-assets/shift-coverage/time-off-requests.csv +5 -0
  55. package/scenarios/shift-coverage-planner/instruction.md +1 -0
  56. package/scenarios/shift-coverage-planner/scenario.json +47 -0
  57. package/scenarios/simple-newsletter/instruction.md +1 -0
  58. package/scenarios/simple-newsletter/scenario.json +40 -0
  59. package/scenarios/vite-csv-reconciliation/assets/prompt-assets/csv-reconciliation-people/source-a.csv +9 -0
  60. package/scenarios/vite-csv-reconciliation/assets/prompt-assets/csv-reconciliation-people/source-b.csv +9 -0
  61. package/scenarios/vite-csv-reconciliation/instruction.md +1 -0
  62. package/scenarios/vite-csv-reconciliation/scenario.json +48 -0
  63. package/scenarios/vite-sprint-planner/instruction.md +1 -0
  64. package/scenarios/vite-sprint-planner/scenario.json +45 -0
@@ -0,0 +1,783 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import base64
5
+ import json
6
+ import os
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ import tarfile
11
+ import time
12
+ import uuid
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+
17
# Prefix written in front of the base64-encoded JSON result line printed by emit_result().
RESULT_MARKER_PREFIX = "RUHROH_RESULT_JSON_BASE64:"
# Value reported as the result "dataset" when RUHROH_RESULT_DATASET is unset or empty.
DEFAULT_DATASET = "ruhroh@local"
# Value reported as the result "adapter" when RUHROH_RESULT_ADAPTER is unset or empty.
DEFAULT_ADAPTER = "ruhroh-harbor"
# Iteration budget used when RUHROH_MAX_ITERATIONS is unset or not an integer.
DEFAULT_MAX_ITERATIONS = 3
# Directory/file names left out of the eval workspace copy and the workspace tarball.
SKIP_WORKSPACE_TAR_NAMES = {"node_modules", ".next", "dist", "build", ".git"}
# Result-file "status" values that CommandRunAgentAdapter.detect_completion treats as terminal failures.
COMPLETION_TERMINAL_FAILURE_REASONS = {"cannot_satisfy", "policy_blocked", "out_of_scope", "runtime_failure", "infra_failure"}
23
+
24
+
25
def main() -> int:
    """Run one Ruhroh trial from the command line and emit its result.

    Parses ``--instruction-base64``, ``--scenario-id`` and ``--max-iterations``,
    loads the optional env files, runs the trial, and prints the encoded result.

    Returns:
        0 when the trial result status is ``"completed"``, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--instruction-base64", required=True)
    parser.add_argument("--scenario-id", required=True)
    # The default is resolved only after the env files have been loaded; computing
    # it here would ignore a RUHROH_MAX_ITERATIONS supplied by those files.
    parser.add_argument("--max-iterations", type=int, default=None)
    args = parser.parse_args()

    load_run_env_file()
    load_repo_dotenv()
    max_iterations = args.max_iterations if args.max_iterations is not None else read_max_iterations()
    instruction = base64.b64decode(args.instruction_base64).decode("utf-8")
    scenario_id = safe_id(args.scenario_id)
    result = run_ruhroh_trial(
        instruction=instruction,
        scenario_id=scenario_id,
        max_iterations=max(1, max_iterations),
        workspace_root=Path(resolve_workspace_root()),
        installed_dir=Path("/installed-agent"),
    )
    emit_result(result)
    return 0 if result.get("status") == "completed" else 1
45
+
46
+
47
def run_ruhroh_trial(
    instruction: str,
    scenario_id: str,
    max_iterations: int,
    workspace_root: Path,
    installed_dir: Path,
) -> dict[str, Any]:
    """Drive the implementation loop, evaluate the workspace, and persist the result.

    The run-agent adapter is invoked for up to ``max_iterations`` turns, stopping
    early when it reports ``done`` or ``terminal_failure``. Afterwards the journey
    file is written, the workspace is copied and evaluated, artifact tarballs are
    produced, and the final result JSON is written under ``installed_dir``.

    Args:
        instruction: The user goal passed to the run agent.
        scenario_id: Identifier recorded in every output document.
        max_iterations: Upper bound on implementation turns.
        workspace_root: Directory the run agent works in (created if missing).
        installed_dir: Directory receiving all loop artifacts (created if missing).

    Returns:
        The final result dict. Any exception raised inside the loop is converted
        into a ``status == "failed"`` / ``failure_kind == "infra_failed"`` result
        rather than propagated. ``adapter.cleanup()`` always runs.
    """
    started_at = time.monotonic()
    installed_dir.mkdir(parents=True, exist_ok=True)
    workspace_root.mkdir(parents=True, exist_ok=True)
    run_root = installed_dir / "ruhroh-loop"
    run_root.mkdir(parents=True, exist_ok=True)
    runs_path = installed_dir / "ruhroh-loop-iterations.jsonl"
    journey_path = installed_dir / "ruhroh-loop-journey.json"
    eval_result_path = installed_dir / "ruhroh-loop-eval.json"
    result_path = installed_dir / "ruhroh-loop-result.json"
    workspace_tarball_path = installed_dir / "ruhroh-workspace.tar.gz"
    events_tarball_path = installed_dir / "ruhroh-loop-events.tar.gz"
    transcripts_tarball_path = installed_dir / "ruhroh-loop-transcripts.tar.gz"
    eval_workspace_root = run_root / "eval-workspace"

    implementation_runs: list[dict[str, Any]] = []
    # Stays "max_iterations" unless the adapter reports done / terminal failure.
    implementation_stopped_reason = "max_iterations"
    adapter = build_run_agent_adapter(
        adapter_id=read_run_agent_adapter(),
        scenario_id=scenario_id,
        workspace_root=workspace_root,
        installed_dir=installed_dir,
        run_root=run_root,
    )
    session_handle = "unstarted"
    # Placeholder manifest, used in the failure result if artifacts are never collected.
    run_agent_manifest: dict[str, Any] = {
        "adapterId": adapter.id,
        "continuityLevel": adapter.continuity_level,
        "sessionHandle": session_handle,
        "runIds": [],
        "transcriptPaths": [],
        "eventLogPaths": [],
        "artifactPaths": {},
    }
    try:
        adapter.prepare()
        session = adapter.start_session()
        session_handle = session["sessionHandle"]

        for iteration in range(1, max_iterations + 1):
            message = build_iteration_message(instruction, iteration, adapter.completion_instruction())
            turn_result = adapter.run_turn(iteration=iteration, message=message)
            completion_status = adapter.detect_completion(turn_result)
            implementation_run = build_implementation_run_record_from_turn(turn_result, completion_status)
            implementation_runs.append(implementation_run)
            append_jsonl(runs_path, implementation_run)

            if completion_status.get("state") == "done":
                implementation_stopped_reason = str(completion_status.get("reason") or "done")
                break
            if completion_status.get("state") == "terminal_failure":
                implementation_stopped_reason = str(completion_status.get("reason") or "terminal_failure")
                break

        run_agent_manifest = adapter.collect_artifacts()
        journey = {
            "version": "ruhroh_implementation_journey_v1",
            "scenarioId": scenario_id,
            "userPrompt": instruction,
            "implementationStoppedReason": implementation_stopped_reason,
            "implementationIterationsUsed": len(implementation_runs),
            "runAgent": run_agent_manifest,
            "runAgentAdapterId": adapter.id,
            "continuityLevel": adapter.continuity_level,
            "sessionHandle": session_handle,
            "workspacePath": str(workspace_root),
            "implementationRuns": implementation_runs,
        }
        journey.update(adapter.legacy_journey_fields())
        journey_path.write_text(json.dumps(journey, indent=2, sort_keys=True) + "\n", encoding="utf-8")

        # The eval agent works on a copy so it cannot disturb the delivered workspace.
        copy_workspace_for_eval(workspace_root, eval_workspace_root)
        eval_result = run_eval_agent(
            scenario_id=scenario_id,
            eval_workspace_root=eval_workspace_root,
            original_workspace_root=workspace_root,
            journey_path=journey_path,
            eval_output_path=eval_result_path,
        )
        write_workspace_tarball(workspace_root, workspace_tarball_path)
        adapter_artifact_paths = run_agent_manifest.get("artifactPaths") if isinstance(run_agent_manifest.get("artifactPaths"), dict) else {}
        event_log_dir = Path(str(adapter_artifact_paths.get("eventLogDir") or run_root / "events"))
        transcript_dir = Path(str(adapter_artifact_paths.get("transcriptDir") or run_root / "transcripts"))
        write_directory_tarball(event_log_dir, events_tarball_path)
        write_directory_tarball(transcript_dir, transcripts_tarball_path)

        verdict = derive_final_verdict(implementation_runs, eval_result)
        final_result = _common_result_fields(
            scenario_id=scenario_id,
            adapter=adapter,
            session_handle=session_handle,
            run_agent_manifest=run_agent_manifest,
            implementation_runs=implementation_runs,
            started_at=started_at,
        )
        final_result.update(
            {
                "status": verdict["status"],
                "failure_kind": verdict["failure_kind"],
                "failureBucket": verdict["failure_kind"],
                "score": verdict["score"],
                "implementationStoppedReason": implementation_stopped_reason,
                "stoppedReason": implementation_stopped_reason,
                "evalResult": eval_result,
                "artifactPaths": {
                    "result": str(result_path),
                    "implementationRuns": str(runs_path),
                    "journey": str(journey_path),
                    "evalResult": str(eval_result_path),
                    "bridgeLog": str(adapter_artifact_paths.get("bridgeLogPath", "")),
                    "workspaceTarball": str(workspace_tarball_path),
                    "eventsTarball": str(events_tarball_path),
                    "transcriptsTarball": str(transcripts_tarball_path),
                    "evalWorkspace": str(eval_workspace_root),
                },
            }
        )
        final_result.update(adapter.legacy_result_fields(run_agent_manifest))
        result_path.write_text(json.dumps(final_result, indent=2, sort_keys=True) + "\n", encoding="utf-8")
        return final_result
    except Exception as error:
        # Top-level boundary: any failure becomes an infra_failed result document.
        final_result = _common_result_fields(
            scenario_id=scenario_id,
            adapter=adapter,
            session_handle=session_handle,
            run_agent_manifest=run_agent_manifest,
            implementation_runs=implementation_runs,
            started_at=started_at,
        )
        final_result.update(
            {
                "status": "failed",
                "failure_kind": "infra_failed",
                "failureBucket": "infra_failed",
                "score": 0,
                "implementationStoppedReason": "exception",
                "stoppedReason": "exception",
                "failure_details": {"message": str(error), "type": type(error).__name__},
            }
        )
        final_result.update(adapter.legacy_result_fields(run_agent_manifest))
        result_path.write_text(json.dumps(final_result, indent=2, sort_keys=True) + "\n", encoding="utf-8")
        return final_result
    finally:
        adapter.cleanup()


def _common_result_fields(
    *,
    scenario_id: str,
    adapter: RunAgentAdapter,
    session_handle: str,
    run_agent_manifest: dict[str, Any],
    implementation_runs: list[dict[str, Any]],
    started_at: float,
) -> dict[str, Any]:
    """Return the result fields shared by the success and failure documents."""
    return {
        "version": "ruhroh_loop_result_v1",
        "adapter": result_adapter(),
        "dataset": result_dataset(),
        "scenarioId": scenario_id,
        "task_id": scenario_id,
        "iterationsUsed": len(implementation_runs),
        "implementationIterationsUsed": len(implementation_runs),
        "duration_ms": round((time.monotonic() - started_at) * 1000),
        "runAgent": run_agent_manifest,
        "runAgentAdapterId": adapter.id,
        "continuityLevel": adapter.continuity_level,
        "sessionHandle": session_handle,
        "runIds": run_agent_manifest.get("runIds", []),
        "implementationRuns": implementation_runs,
    }
206
+
207
+
208
class RunAgentAdapter:
    """Base interface for run-agent adapters used by the implementation loop.

    Subclasses must implement ``run_turn`` and ``detect_completion``; every other
    hook has a no-op or empty default.
    """

    id = "base"
    continuity_level = "workspace_only"

    def __init__(self, scenario_id: str, workspace_root: Path, installed_dir: Path, run_root: Path) -> None:
        """Store the run locations and mint a session handle with a random suffix."""
        self.scenario_id = scenario_id
        self.workspace_root = workspace_root
        self.installed_dir = installed_dir
        self.run_root = run_root
        random_suffix = uuid.uuid4().hex[:8]
        self.session_handle = f"{self.id}-{scenario_id}-{random_suffix}"
        # Turn dicts recorded by subclasses; read back by collect_artifacts().
        self.turns: list[dict[str, Any]] = []

    def prepare(self) -> dict[str, Any]:
        """Hook run before the session starts; the base returns no artifacts."""
        return {"artifactPaths": {}}

    def start_session(self) -> dict[str, Any]:
        """Return the session handle together with an empty artifact map."""
        return {"sessionHandle": self.session_handle, "artifactPaths": {}}

    def run_turn(self, *, iteration: int, message: str) -> dict[str, Any]:
        """Execute one agent turn; must be provided by subclasses."""
        raise NotImplementedError

    def detect_completion(self, turn_result: dict[str, Any]) -> dict[str, Any]:
        """Classify a turn result; must be provided by subclasses."""
        raise NotImplementedError

    def _turn_strings(self, key: str) -> list[str]:
        """Collect ``key`` from every recorded turn where its value is a string."""
        found: list[str] = []
        for turn in self.turns:
            if isinstance(turn.get(key), str):
                found.append(str(turn[key]))
        return found

    def collect_artifacts(self) -> dict[str, Any]:
        """Build the run-agent manifest from the turns recorded so far."""
        manifest: dict[str, Any] = {
            "adapterId": self.id,
            "continuityLevel": self.continuity_level,
            "sessionHandle": self.session_handle,
        }
        manifest["runIds"] = self._turn_strings("runId")
        manifest["transcriptPaths"] = self._turn_strings("transcriptPath")
        manifest["eventLogPaths"] = self._turn_strings("eventLogPath")
        manifest["artifactPaths"] = {}
        return manifest

    def cleanup(self) -> None:
        """Hook run when the trial ends; the base does nothing."""
        return None

    def completion_instruction(self) -> str:
        """Return the text telling the agent how to signal completion."""
        return "If the goal is complete, emit the adapter completion signal for goal_satisfied. If the goal is not complete, keep working in this same session."

    def legacy_journey_fields(self) -> dict[str, Any]:
        """Extra keys merged into the journey document; none by default."""
        return {}

    def legacy_result_fields(self, manifest: dict[str, Any]) -> dict[str, Any]:
        """Extra keys merged into the result document; none by default."""
        del manifest
        return {}
267
+
268
+
269
class CommandRunAgentAdapter(RunAgentAdapter):
    """Adapter that runs each turn as a shell command taken from the environment.

    The command string is read from the variable named by ``command_env_key`` and
    executed with ``shell=True`` in the workspace directory. The command receives
    the turn context through ``RUHROH_*`` environment variables and may report
    back by writing JSON to ``RUHROH_RESULT_PATH`` or by printing a final JSON line.
    """

    continuity_level = "workspace_only"

    def __init__(
        self,
        scenario_id: str,
        workspace_root: Path,
        installed_dir: Path,
        run_root: Path,
        adapter_id: str,
        command_env_key: str = "RUHROH_RUN_AGENT_COMMAND",
        completion_protocol_env_key: str = "RUHROH_RUN_AGENT_COMPLETION_PROTOCOL",
    ) -> None:
        """Record the adapter id and env-var names, then initialise the base class."""
        # ``id`` must be set first: the base initialiser embeds it in the session handle.
        self.id = adapter_id
        self.command_env_key = command_env_key
        self.completion_protocol_env_key = completion_protocol_env_key
        super().__init__(scenario_id, workspace_root, installed_dir, run_root)

    @staticmethod
    def _str_or_none(payload: dict[str, Any], key: str) -> str | None:
        """Return ``payload[key]`` when it is a string, otherwise ``None``."""
        value = payload.get(key)
        return value if isinstance(value, str) else None

    def run_turn(self, *, iteration: int, message: str) -> dict[str, Any]:
        """Run the configured command for one iteration and record the turn.

        Args:
            iteration: 1-based iteration number, used in artifact file names.
            message: Prompt text; written to the goal file and exported as
                ``RUHROH_MESSAGE``.

        Returns:
            The turn dict (also appended to ``self.turns``). ``status`` is
            ``"completed"`` for a zero exit code and ``"failed"`` otherwise.

        Raises:
            RuntimeError: If the command environment variable is unset or blank.
            subprocess.TimeoutExpired: If the command exceeds the iteration timeout.
        """
        command = os.environ.get(self.command_env_key)
        if command is None or command.strip() == "":
            raise RuntimeError(f"{self.command_env_key} is required for Ruhroh adapter {self.id}")
        transcript_path = self.run_root / "transcripts" / f"iteration-{iteration}.log"
        transcript_path.parent.mkdir(parents=True, exist_ok=True)
        goal_path = self.run_root / "custom-shell" / f"goal-{iteration}.md"
        result_path = self.run_root / "custom-shell" / f"result-{iteration}.json"
        goal_path.parent.mkdir(parents=True, exist_ok=True)
        goal_path.write_text(message, encoding="utf-8")
        # A result file left by an earlier run in the same run root must not be
        # mistaken for this turn's output. Best effort: an unreadable leftover is
        # already tolerated by read_json_file.
        try:
            result_path.unlink()
        except OSError:
            pass
        env = {
            **os.environ,
            "RUHROH_MESSAGE": message,
            "RUHROH_ITERATION": str(iteration),
            "RUHROH_WORKSPACE": str(self.workspace_root),
            "RUHROH_GOAL_PATH": str(goal_path),
            "RUHROH_MESSAGE_PATH": str(goal_path),
            "RUHROH_WORKSPACE_PATH": str(self.workspace_root),
            "RUHROH_RESULT_PATH": str(result_path),
            "RUHROH_SESSION_HANDLE": self.session_handle,
            "RUHROH_SCENARIO_ID": self.scenario_id,
            "RUHROH_RUN_ROOT": str(self.run_root),
            "RUHROH_ADAPTER_ID": self.id,
        }
        # NOTE: shell=True is deliberate; the command is an operator-supplied shell
        # string read from the environment, not data from the scenario.
        completed = subprocess.run(
            command,
            cwd=str(self.workspace_root),
            env=env,
            text=True,
            # Agent output is arbitrary bytes; never fail the turn on undecodable output.
            errors="replace",
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            timeout=read_iteration_timeout_sec(),
            shell=True,
        )
        transcript_path.write_text(completed.stdout, encoding="utf-8")
        parsed_result = read_json_file(result_path)
        if not isinstance(parsed_result, dict):
            parsed_result = {}
        status = "completed" if completed.returncode == 0 else "failed"
        reported_artifacts = parsed_result.get("artifacts")
        turn = {
            "version": "ruhroh_run_agent_turn_v1",
            "adapterId": self.id,
            "continuityLevel": self.continuity_level,
            "iteration": iteration,
            "status": status,
            "failureKind": "none" if status == "completed" else "custom_shell_failed",
            "sessionHandle": self.session_handle,
            # Fall back to a synthetic run id when the command did not report one.
            "runId": self._str_or_none(parsed_result, "runId") or f"{self.session_handle}-{iteration}"
            if not isinstance(parsed_result.get("runId"), str)
            else parsed_result["runId"],
            "threadId": self._str_or_none(parsed_result, "threadId"),
            "eventLogPath": self._str_or_none(parsed_result, "eventLogPath"),
            "jobInputPath": self._str_or_none(parsed_result, "jobInputPath"),
            "jobOutputPath": self._str_or_none(parsed_result, "jobOutputPath"),
            "finalizedPayload": parsed_result.get("finalizedPayload"),
            "returnCode": completed.returncode,
            "transcriptPath": str(transcript_path),
            "artifactPaths": {
                "goal": str(goal_path),
                "transcript": str(transcript_path),
                "result": str(result_path),
                "message": str(goal_path),
                # Command-reported artifacts may override the defaults above.
                **(reported_artifacts if isinstance(reported_artifacts, dict) else {}),
            },
            # Only the tail of the output is kept inline; the full text is in the transcript.
            "notes": completed.stdout[-2000:],
        }
        self.turns.append(turn)
        return turn

    def detect_completion(self, turn_result: dict[str, Any]) -> dict[str, Any]:
        """Decide whether a turn finished the goal, failed terminally, or should continue.

        Order of checks: a non-``completed`` turn is a terminal ``runtime_failure``;
        then the ``status`` in the turn's result file; then, under the
        ``json-final-line`` protocol (the default), the last JSON object line in
        the turn notes whose ``status`` is ``goal_satisfied``.

        Returns:
            A dict with ``state`` (``done`` / ``terminal_failure`` / ``not_done``),
            ``reason`` and ``evidenceRefs``.
        """
        evidence = completion_evidence_for_turn(turn_result)
        if turn_result.get("status") != "completed":
            return {"state": "terminal_failure", "reason": "runtime_failure", "evidenceRefs": evidence}
        protocol = os.environ.get(self.completion_protocol_env_key, "json-final-line")
        artifact_paths = turn_result.get("artifactPaths")
        result_path = artifact_paths.get("result") if isinstance(artifact_paths, dict) else None
        if isinstance(result_path, str):
            parsed_result = read_json_file(Path(result_path))
            if isinstance(parsed_result, dict):
                status = parsed_result.get("status")
                if status == "goal_satisfied":
                    return {
                        "state": "done",
                        "reason": "goal_satisfied",
                        "confidence": "adapter_inferred",
                        "evidenceRefs": evidence,
                    }
                # The str check keeps an unhashable JSON status (list/dict) from
                # raising TypeError in the set membership test.
                if isinstance(status, str) and status in COMPLETION_TERMINAL_FAILURE_REASONS:
                    return {"state": "terminal_failure", "reason": status, "evidenceRefs": evidence}
                if status == "continue":
                    return {"state": "not_done", "reason": "partial_progress", "evidenceRefs": evidence}
        notes = str(turn_result.get("notes") or "")
        if protocol == "json-final-line":
            for line in reversed(notes.splitlines()):
                try:
                    parsed = json.loads(line)
                except Exception:
                    # Non-JSON output lines are expected; keep scanning upwards.
                    continue
                if isinstance(parsed, dict) and parsed.get("status") == "goal_satisfied":
                    return {
                        "state": "done",
                        "reason": "goal_satisfied",
                        "confidence": "adapter_inferred",
                        "evidenceRefs": evidence,
                    }
        return {"state": "not_done", "reason": "missing_completion_signal", "evidenceRefs": evidence}

    def completion_instruction(self) -> str:
        """Return the prompt text asking the agent to finish with a JSON status line."""
        return (
            "If the goal is complete, end your response with one JSON line: "
            "{\"status\":\"goal_satisfied\"}. If the goal is not complete, keep working in this same workspace."
        )
397
+
398
+
399
class CustomShellRunAgentAdapter(CommandRunAgentAdapter):
    """Command adapter pinned to the ``custom-shell`` id and the default env keys."""

    def __init__(self, scenario_id: str, workspace_root: Path, installed_dir: Path, run_root: Path) -> None:
        """Initialise the command adapter with the fixed custom-shell configuration."""
        fixed_config = {
            "adapter_id": "custom-shell",
            "command_env_key": "RUHROH_RUN_AGENT_COMMAND",
            "completion_protocol_env_key": "RUHROH_RUN_AGENT_COMPLETION_PROTOCOL",
        }
        super().__init__(scenario_id, workspace_root, installed_dir, run_root, **fixed_config)

    def completion_instruction(self) -> str:
        """Use the command adapter's completion instruction unchanged."""
        return CommandRunAgentAdapter.completion_instruction(self)
413
+
414
+
415
def build_run_agent_adapter(
    *,
    adapter_id: str,
    scenario_id: str,
    workspace_root: Path,
    installed_dir: Path,
    run_root: Path,
) -> RunAgentAdapter:
    """Create the adapter for ``adapter_id``.

    ``"custom-shell"`` yields a ``CustomShellRunAgentAdapter``; any other id yields
    a generic ``CommandRunAgentAdapter`` carrying that id.
    """
    locations = (scenario_id, workspace_root, installed_dir, run_root)
    if adapter_id != "custom-shell":
        return CommandRunAgentAdapter(*locations, adapter_id=adapter_id)
    return CustomShellRunAgentAdapter(*locations)
426
+
427
+
428
def read_run_agent_adapter() -> str:
    """Return the adapter id from ``RUHROH_RUN_AGENT_ADAPTER``.

    Falls back to ``"custom-shell"`` when the variable is unset or empty.
    """
    # The original chained the very same lookup twice; one lookup is equivalent.
    return os.environ.get("RUHROH_RUN_AGENT_ADAPTER") or "custom-shell"
430
+
431
+
432
def completion_evidence_for_turn(turn_result: dict[str, Any]) -> list[dict[str, str]]:
    """List evidence references for the string-valued path fields of a turn.

    Considers the transcript, event log and job output paths, in that order, and
    skips any that are missing or not strings.
    """
    path_fields = {
        "transcript": "transcriptPath",
        "event_log": "eventLogPath",
        "job_output": "jobOutputPath",
    }
    return [
        {
            "kind": kind,
            "ref": turn_result.get(field),
            "summary": f"{kind} for iteration {turn_result.get('iteration')}",
        }
        for kind, field in path_fields.items()
        if isinstance(turn_result.get(field), str)
    ]
439
+
440
+
441
def build_implementation_run_record_from_turn(turn_result: dict[str, Any], completion_status: dict[str, Any]) -> dict[str, Any]:
    """Flatten a turn result and its completion status into one run record.

    Core fields are always present (``None`` when the turn lacks them); the
    optional identifier and path fields are copied only when not ``None``.
    """
    always_copied = (
        "iteration",
        "adapterId",
        "continuityLevel",
        "status",
        "failureKind",
        "sessionHandle",
        "returnCode",
    )
    copied_when_present = (
        "sessionId",
        "runId",
        "threadId",
        "finalizationStatus",
        "finalizedPayload",
        "jobInputPath",
        "jobOutputPath",
        "transcriptPath",
        "eventLogPath",
    )
    record: dict[str, Any] = {"version": "ruhroh_implementation_run_v1"}
    for field in always_copied:
        record[field] = turn_result.get(field)
    record["completionStatus"] = completion_status
    record["stopReason"] = completion_status.get("reason") or "not_done"
    record["artifactPaths"] = turn_result.get("artifactPaths", {})
    # Notes are capped to their last 2000 characters.
    record["notes"] = str(turn_result.get("notes") or "")[-2000:]
    record.update(
        {field: turn_result[field] for field in copied_when_present if turn_result.get(field) is not None}
    )
    return record
472
+
473
+
474
def build_iteration_message(
    instruction: str,
    iteration: int,
    completion_instruction: str = "If the goal is complete, emit the adapter completion signal for goal_satisfied. If the goal is not complete, keep working in this same session.",
    previous_eval: dict[str, Any] | None = None,
) -> str:
    """Compose the prompt for one implementation iteration.

    Iteration 1 gets the bare instruction. Every other iteration gets a
    continuation prompt that restates the goal and ends with
    ``completion_instruction``. ``previous_eval`` is accepted but unused.
    """
    del previous_eval
    if iteration != 1:
        segments = [
            "Continue the same app-development task in the existing workspace.\n\n",
            f"Original user goal:\n{instruction}\n\n",
            f"This is Ruhroh implementation continuation {iteration}. Do not restart or create a separate project. ",
            "Inspect the current workspace, continue any unfinished work, and verify the final delivered state. ",
            f"{completion_instruction}",
        ]
        return "".join(segments)
    return instruction
490
+
491
+
492
def copy_workspace_for_eval(workspace_root: Path, eval_workspace_root: Path) -> None:
    """Replace ``eval_workspace_root`` with a fresh copy of the workspace.

    Any existing eval directory is removed first. Entries named in
    ``SKIP_WORKSPACE_TAR_NAMES`` are not copied. When the workspace does not
    exist, an empty eval directory is created instead.
    """
    if eval_workspace_root.exists():
        shutil.rmtree(eval_workspace_root)
    if workspace_root.exists():
        skip_names = shutil.ignore_patterns(*SKIP_WORKSPACE_TAR_NAMES)
        shutil.copytree(workspace_root, eval_workspace_root, ignore=skip_names)
        return
    eval_workspace_root.mkdir(parents=True, exist_ok=True)
503
+
504
+
505
def run_eval_agent(
    scenario_id: str,
    eval_workspace_root: Path,
    original_workspace_root: Path,
    journey_path: Path,
    eval_output_path: Path,
) -> dict[str, Any]:
    """Produce the eval result for a finished implementation journey.

    Sources, in priority order:
      1. A fixture from ``read_eval_fixture()``; default artifact paths are
         filled in and the fixture is written to ``eval_output_path``.
      2. The shell command in ``RUHROH_EVAL_COMMAND``, run in the eval workspace.
         Its result is taken from ``eval_output_path`` or, failing that, from
         the last JSON object line it printed.
      3. Otherwise a synthetic ``infra_failed`` result.

    A non-zero exit, a timeout, or missing JSON output from the command also
    yields a synthetic ``infra_failed`` result.

    Returns:
        The eval result dict.
    """
    fixture = read_eval_fixture()
    if fixture is not None:
        fixture.setdefault("artifacts", {})
        if isinstance(fixture["artifacts"], dict):
            fixture["artifacts"].setdefault("workspacePath", str(eval_workspace_root))
            fixture["artifacts"].setdefault("originalWorkspacePath", str(original_workspace_root))
            fixture["artifacts"].setdefault("journeyPath", str(journey_path))
        eval_output_path.write_text(json.dumps(fixture, indent=2, sort_keys=True) + "\n", encoding="utf-8")
        return fixture

    command = os.environ.get("RUHROH_EVAL_COMMAND")
    if command is None or command.strip() == "":
        return synthetic_eval_infra_failure(
            scenario_id=scenario_id,
            eval_workspace_root=eval_workspace_root,
            eval_output_path=eval_output_path,
            diagnostics="Package-owned Ruhroh runtime requires RUHROH_EVAL_RESULT_FIXTURE, RUHROH_EVAL_RESULT_FIXTURE_PATH, or RUHROH_EVAL_COMMAND.",
        )

    # Tolerate a malformed timeout the same way read_iteration_timeout_sec does.
    try:
        timeout_sec = max(1, int(os.environ.get("RUHROH_EVAL_TIMEOUT_SEC", "300")))
    except ValueError:
        timeout_sec = 300

    env = {
        **os.environ,
        "RUHROH_EVAL_SCENARIO_ID": scenario_id,
        "RUHROH_EVAL_WORKSPACE_PATH": str(eval_workspace_root),
        "RUHROH_EVAL_ORIGINAL_WORKSPACE_PATH": str(original_workspace_root),
        "RUHROH_EVAL_JOURNEY_PATH": str(journey_path),
        "RUHROH_EVAL_OUTPUT_PATH": str(eval_output_path),
    }
    try:
        # NOTE: shell=True is deliberate; the command is an operator-supplied shell string.
        completed = subprocess.run(
            command,
            cwd=str(eval_workspace_root),
            env=env,
            text=True,
            # Never fail the eval on undecodable command output.
            errors="replace",
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            timeout=timeout_sec,
            shell=True,
        )
    except subprocess.TimeoutExpired as error:
        # A hung eval command is an eval infrastructure failure, not a reason to
        # abort the whole trial and lose its artifacts.
        partial = error.stdout or ""
        if isinstance(partial, bytes):
            partial = partial.decode("utf-8", errors="replace")
        return synthetic_eval_infra_failure(
            scenario_id=scenario_id,
            eval_workspace_root=eval_workspace_root,
            eval_output_path=eval_output_path,
            # The timeout notice goes last so it survives tail truncation.
            diagnostics=f"{partial[-3000:]}\nRUHROH_EVAL_COMMAND timed out after {timeout_sec} seconds.",
        )
    if completed.returncode != 0:
        return synthetic_eval_infra_failure(
            scenario_id=scenario_id,
            eval_workspace_root=eval_workspace_root,
            eval_output_path=eval_output_path,
            diagnostics=completed.stdout[-4000:],
        )
    parsed = read_json_file(eval_output_path)
    if isinstance(parsed, dict):
        return parsed
    # Fall back to the last printed line that parses as a JSON object.
    for line in reversed(completed.stdout.splitlines()):
        try:
            parsed_line = json.loads(line)
        except Exception:
            continue
        if isinstance(parsed_line, dict):
            eval_output_path.write_text(json.dumps(parsed_line, indent=2, sort_keys=True) + "\n", encoding="utf-8")
            return parsed_line
    return synthetic_eval_infra_failure(
        scenario_id=scenario_id,
        eval_workspace_root=eval_workspace_root,
        eval_output_path=eval_output_path,
        diagnostics="RUHROH_EVAL_COMMAND completed but did not write or print a JSON eval result.",
    )
571
+
572
+
573
def read_eval_fixture() -> dict[str, Any] | None:
    """Load a canned eval result from the environment, if one is configured.

    A non-empty ``RUHROH_EVAL_RESULT_FIXTURE`` (inline JSON) takes precedence and
    is the only source consulted when set; otherwise the file named by
    ``RUHROH_EVAL_RESULT_FIXTURE_PATH`` is read. Returns ``None`` when nothing is
    configured or the content is not a JSON object.
    """
    inline_json = os.environ.get("RUHROH_EVAL_RESULT_FIXTURE")
    if inline_json:
        try:
            candidate = json.loads(inline_json)
        except json.JSONDecodeError:
            return None
    else:
        fixture_path = os.environ.get("RUHROH_EVAL_RESULT_FIXTURE_PATH")
        if not fixture_path:
            return None
        candidate = read_json_file(Path(fixture_path))
    if isinstance(candidate, dict):
        return candidate
    return None
586
+
587
+
588
def synthetic_eval_infra_failure(
    scenario_id: str,
    eval_workspace_root: Path,
    eval_output_path: Path,
    diagnostics: str,
) -> dict[str, Any]:
    """Build, persist and return an ``infra_failed`` eval result.

    The last 1000 characters of ``diagnostics`` become the summary of the single
    evidence reference. The result is written to ``eval_output_path`` as JSON.
    """
    output_location = str(eval_output_path)
    evidence = {"kind": "environment", "ref": output_location, "summary": diagnostics[-1000:]}
    failure: dict[str, Any] = {
        "version": "ruhroh_eval_result_v1",
        "status": "infra_failed",
        "goalMet": False,
        "confidence": "high",
        "reasons": ["Eval-agent failed to produce a usable terminal judgment."],
        "unmetCriteria": ["Eval-agent failed."],
        "evidenceRefs": [evidence],
        "commandsRun": [],
        "artifacts": {"workspacePath": str(eval_workspace_root), "evalOutputPath": output_location},
        "finalSummary": f"Eval-agent failed for {scenario_id}.",
    }
    serialized = json.dumps(failure, indent=2, sort_keys=True)
    eval_output_path.write_text(serialized + "\n", encoding="utf-8")
    return failure
608
+
609
+
610
def derive_final_verdict(implementation_runs: list[dict[str, Any]], eval_result: dict[str, Any]) -> dict[str, Any]:
    """Combine the implementation runs and the eval result into a final verdict.

    The first run whose status is not ``"completed"`` decides the verdict
    (failed, with that run's ``failureKind`` or ``"runtime_failure"``). Otherwise
    the eval status decides: ``passed`` completes with score 1; ``review``,
    ``infra_failed`` and anything else fail with score 0.
    """
    for run in implementation_runs:
        if run.get("status") != "completed":
            return {
                "status": "failed",
                "failure_kind": run.get("failureKind") or "runtime_failure",
                "score": 0,
            }

    verdict_by_eval_status = {
        "passed": ("completed", "none", 1),
        "review": ("failed", "review_required", 0),
        "infra_failed": ("failed", "infra_failed", 0),
    }
    eval_status = eval_result.get("status")
    # Only strings can match; this also keeps unhashable values away from the lookup.
    lookup_key = eval_status if isinstance(eval_status, str) else None
    status, failure_kind, score = verdict_by_eval_status.get(lookup_key, ("failed", "goal_mismatch", 0))
    return {"status": status, "failure_kind": failure_kind, "score": score}
633
+
634
+
635
def write_workspace_tarball(workspace_root: Path, output_path: Path) -> None:
    """Archive the workspace into a gzip tarball, omitting skipped names.

    Any path with a component in ``SKIP_WORKSPACE_TAR_NAMES`` is left out. An
    empty archive is written when ``workspace_root`` does not exist.

    Args:
        workspace_root: Directory to archive; member names are relative to it.
        output_path: Destination ``.tar.gz`` file.
    """
    with tarfile.open(output_path, "w:gz") as tar:
        if not workspace_root.exists():
            return
        # Sorted so the member order is deterministic and parents precede children.
        for path in sorted(workspace_root.rglob("*")):
            relative = path.relative_to(workspace_root)
            if any(part in SKIP_WORKSPACE_TAR_NAMES for part in relative.parts):
                continue
            # rglob already visits every descendant. Letting tar.add recurse as well
            # would archive each child twice and pull skipped directories back in
            # through their parent.
            tar.add(path, arcname=str(relative), recursive=False)
643
+
644
+
645
def write_directory_tarball(directory: Path, output_path: Path) -> None:
    """Archive every entry under ``directory`` into a gzip tarball.

    An empty archive is written when ``directory`` does not exist.

    Args:
        directory: Directory to archive; member names are relative to it.
        output_path: Destination ``.tar.gz`` file.
    """
    with tarfile.open(output_path, "w:gz") as tar:
        if not directory.exists():
            return
        # Sorted so the member order is deterministic and parents precede children.
        for path in sorted(directory.rglob("*")):
            # rglob already yields every descendant; recursive adds would store
            # the contents of each sub-directory twice.
            tar.add(path, arcname=str(path.relative_to(directory)), recursive=False)
651
+
652
+
653
def append_jsonl(path: Path, value: dict[str, Any]) -> None:
    """Append ``value`` to ``path`` as one key-sorted JSON line."""
    serialized_line = json.dumps(value, sort_keys=True) + "\n"
    with open(path, "a", encoding="utf-8") as stream:
        stream.write(serialized_line)
656
+
657
+
658
def read_json_file(path: Path) -> Any | None:
    """Return the parsed JSON content of ``path``, or ``None`` on any problem.

    A missing file, an unreadable file and invalid JSON all yield ``None``; this
    reader is deliberately best-effort and never raises.
    """
    try:
        if path.exists():
            raw_text = path.read_text(encoding="utf-8")
            return json.loads(raw_text)
    except Exception:
        return None
    return None
665
+
666
+
667
def emit_result(result: dict[str, Any]) -> None:
    """Print the result as one line: the marker prefix plus base64-encoded JSON."""
    json_bytes = json.dumps(result, sort_keys=True).encode("utf-8")
    encoded = base64.b64encode(json_bytes).decode("ascii")
    print(f"{RESULT_MARKER_PREFIX}{encoded}", flush=True)
670
+
671
+
672
def resolve_workspace_root() -> str:
    """Pick the workspace directory for the trial.

    Preference order: the first non-empty value of ``RUHROH_WORKSPACE_ROOT`` /
    ``KESTREL_TBENCH_WORKSPACE_ROOT`` if it is an existing directory; ``/app``
    if it exists; the current directory unless it is ``/``; finally ``/app``.
    """
    env_values = (
        os.environ.get("RUHROH_WORKSPACE_ROOT"),
        os.environ.get("KESTREL_TBENCH_WORKSPACE_ROOT"),
    )
    configured = next((value for value in env_values if value), None)
    if configured and Path(configured).is_dir():
        return configured
    if not Path("/app").is_dir():
        current_dir = Path.cwd()
        if current_dir.is_dir() and str(current_dir) != "/":
            return str(current_dir)
    return "/app"
682
+
683
+
684
def read_max_iterations() -> int:
    """Return the iteration budget from ``RUHROH_MAX_ITERATIONS``.

    The value is clamped to at least 1. ``DEFAULT_MAX_ITERATIONS`` is used when
    the variable is unset or not an integer.
    """
    configured = os.environ.get("RUHROH_MAX_ITERATIONS")
    if configured is not None:
        try:
            return max(1, int(configured))
        except ValueError:
            pass
    return DEFAULT_MAX_ITERATIONS
692
+
693
+
694
def read_iteration_timeout_sec() -> int:
    """Return the per-iteration timeout from ``RUHROH_ITERATION_TIMEOUT_SEC``.

    The value is clamped to at least 1 second. 1200 is used when the variable is
    unset or not an integer.
    """
    fallback_seconds = 1200
    configured = os.environ.get("RUHROH_ITERATION_TIMEOUT_SEC")
    if configured is not None:
        try:
            return max(1, int(configured))
        except ValueError:
            pass
    return fallback_seconds
702
+
703
+
704
def load_repo_dotenv() -> None:
    """Load ``.env`` from the current directory without overriding existing variables."""
    dotenv_path = Path.cwd() / ".env"
    if not dotenv_path.exists():
        return
    parsed_pairs = parse_dotenv(dotenv_path.read_text(encoding="utf-8"))
    for name, value in parsed_pairs.items():
        # setdefault: variables already present in the environment win.
        os.environ.setdefault(name, value)
711
+
712
+
713
def load_run_env_file() -> None:
    """Apply the JSON env file named by ``RUHROH_ENV_FILE`` to ``os.environ``.

    The file must hold a JSON object; each string-to-string entry overwrites the
    corresponding environment variable. A missing variable, a missing file,
    unreadable content or a non-object payload are all ignored silently.
    """
    configured_path = os.environ.get("RUHROH_ENV_FILE")
    if not configured_path:
        return
    env_file = Path(configured_path)
    if not env_file.exists():
        return
    try:
        payload = json.loads(env_file.read_text(encoding="utf-8"))
    except Exception:
        return
    if not isinstance(payload, dict):
        return
    string_entries = (
        (name, value)
        for name, value in payload.items()
        if isinstance(name, str) and isinstance(value, str)
    )
    for name, value in string_entries:
        os.environ[name] = value
729
+
730
+
731
def parse_dotenv(content: str) -> dict[str, str]:
    """Parse dotenv text into a mapping; later assignments of a key win.

    Lines that ``parse_dotenv_line`` rejects are skipped.
    """
    candidates = (parse_dotenv_line(raw_line) for raw_line in content.splitlines())
    return dict(pair for pair in candidates if pair is not None)
739
+
740
+
741
def parse_dotenv_line(line: str) -> tuple[str, str] | None:
    """Parse one dotenv line into ``(key, value)``, or ``None`` if it is not an assignment.

    Blank lines, ``#`` comments, lines without ``=`` and invalid keys are
    rejected. A leading ``export `` is ignored. Keys must be non-empty, consist
    of alphanumerics and underscores, and not start with a digit.
    """
    text = line.strip()
    if text == "" or text.startswith("#"):
        return None
    export_prefix = "export "
    if text.startswith(export_prefix):
        text = text[len(export_prefix):].strip()
    name, separator, raw_value = text.partition("=")
    if separator == "":
        return None
    name = name.strip()
    if not name:
        return None
    if name[0].isdigit() or not name.replace("_", "").isalnum():
        return None
    return name, unquote_dotenv_value(raw_value.strip())
756
+
757
+
758
def unquote_dotenv_value(value: str) -> str:
    """Strip quoting from a dotenv value.

    * Double-quoted: quotes removed; ``\\n``, ``\\"`` and ``\\\\`` escapes decoded.
    * Single-quoted: quotes removed; content taken literally.
    * Unquoted: anything from the first ``" #"`` on is treated as a comment and
      dropped, then the value is stripped.

    A value needs at least two characters to count as quoted, so a lone quote
    character is returned as-is instead of collapsing to an empty string.
    """
    if len(value) >= 2 and value.startswith('"') and value.endswith('"'):
        return _decode_dotenv_escapes(value[1:-1])
    if len(value) >= 2 and value.startswith("'") and value.endswith("'"):
        return value[1:-1]
    marker = value.find(" #")
    return (value[:marker] if marker >= 0 else value).strip()


def _decode_dotenv_escapes(text: str) -> str:
    """Decode ``\\n``, ``\\"`` and ``\\\\`` in a single left-to-right pass."""
    # One pass is required: chained str.replace calls would turn an escaped
    # backslash followed by "n" into a backslash plus a newline.
    replacements = {"n": "\n", '"': '"', "\\": "\\"}
    decoded: list[str] = []
    index = 0
    length = len(text)
    while index < length:
        char = text[index]
        if char == "\\" and index + 1 < length and text[index + 1] in replacements:
            decoded.append(replacements[text[index + 1]])
            index += 2
        else:
            # Unknown escapes and a trailing lone backslash are kept verbatim.
            decoded.append(char)
            index += 1
    return "".join(decoded)
765
+
766
+
767
def result_adapter() -> str:
    """Return ``RUHROH_RESULT_ADAPTER`` when non-empty, else ``DEFAULT_ADAPTER``."""
    override = os.environ.get("RUHROH_RESULT_ADAPTER")
    return override if override else DEFAULT_ADAPTER
769
+
770
+
771
def result_dataset() -> str:
    """Return ``RUHROH_RESULT_DATASET`` when non-empty, else ``DEFAULT_DATASET``."""
    override = os.environ.get("RUHROH_RESULT_DATASET")
    return override if override else DEFAULT_DATASET
773
+
774
+
775
def safe_id(value: str) -> str:
    """Return the last ``/``-separated segment of ``value`` if it is a safe identifier.

    The segment is safe when, ignoring ``-``, ``_`` and ``.``, it is non-empty
    and purely alphanumeric.

    Raises:
        ValueError: If the segment contains any other character or is empty.
    """
    candidate = value.strip().rsplit("/", 1)[-1]
    remainder = candidate
    for allowed_punctuation in ("-", "_", "."):
        remainder = remainder.replace(allowed_punctuation, "")
    if remainder.isalnum():
        return candidate
    raise ValueError(f"Unsafe Ruhroh scenario id: {value}")
780
+
781
+
782
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())