ophar 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. cli/client.py +137 -0
  2. cli/commands/metrics.py +59 -0
  3. cli/commands/settings.py +25 -0
  4. cli/commands/system.py +76 -0
  5. cli/commands/tasks.py +104 -0
  6. cli/display/formatting.py +29 -0
  7. cli/main.py +19 -0
  8. ophar/__init__.py +7 -0
  9. ophar/_bundle/AGENTS.md +30 -0
  10. ophar/_bundle/CLAUDE.md +38 -0
  11. ophar/_bundle/harness/checkpoint.sh +106 -0
  12. ophar/_bundle/harness/dispatch.sh +194 -0
  13. ophar/_bundle/harness/ground-truth.sh +121 -0
  14. ophar/_bundle/harness/iterate.sh +137 -0
  15. ophar/_bundle/harness/land.sh +47 -0
  16. ophar/_bundle/harness/ledger.sh +39 -0
  17. ophar/_bundle/harness/lib/adapt-report.sh +37 -0
  18. ophar/_bundle/harness/lib/log-metrics.sh +71 -0
  19. ophar/_bundle/harness/lib/log-opus.sh +48 -0
  20. ophar/_bundle/harness/lib/mock-claude.sh +36 -0
  21. ophar/_bundle/harness/lib/mock-cursor-agent.sh +170 -0
  22. ophar/_bundle/harness/mcp_server.py +462 -0
  23. ophar/_bundle/harness/metrics-report.sh +175 -0
  24. ophar/_bundle/harness/orchestrate.sh +221 -0
  25. ophar/_bundle/harness/reconcile.sh +109 -0
  26. ophar/_bundle/harness/route-report.sh +111 -0
  27. ophar/_bundle/harness/run.sh +75 -0
  28. ophar/_bundle/harness/verdict.sh +91 -0
  29. ophar/_bundle/harness/verify-heldout.sh +126 -0
  30. ophar/_bundle/heldout/T-0002/manifest.json +8 -0
  31. ophar/_bundle/heldout/T-0002/test_heldout_signals.py +39 -0
  32. ophar/_bundle/heldout/T-1001/manifest.json +8 -0
  33. ophar/_bundle/heldout/T-1001/test_heldout_signals.py +55 -0
  34. ophar/_bundle/heldout/T-RESERVE-DEMO/manifest.json +12 -0
  35. ophar/_bundle/heldout/T-RESERVE-DEMO/test_place.py +15 -0
  36. ophar/_bundle/heldout/T-RESERVE-DEMO/test_reserve.py +16 -0
  37. ophar/_bundle/orchestrator-pipeline-plan.md +513 -0
  38. ophar/_bundle/state/STATE.md +77 -0
  39. ophar/_bundle/tasks/T-0001.json +12 -0
  40. ophar/_bundle/tasks/T-0002.json +13 -0
  41. ophar/_bundle/tasks/T-1002.json +13 -0
  42. ophar/bootstrap.py +84 -0
  43. ophar/mcp_entry.py +33 -0
  44. ophar/paths.py +51 -0
  45. ophar/setup_cmd.py +99 -0
  46. ophar-0.1.0.dist-info/METADATA +394 -0
  47. ophar-0.1.0.dist-info/RECORD +68 -0
  48. ophar-0.1.0.dist-info/WHEEL +5 -0
  49. ophar-0.1.0.dist-info/entry_points.txt +4 -0
  50. ophar-0.1.0.dist-info/licenses/LICENSE +21 -0
  51. ophar-0.1.0.dist-info/top_level.txt +3 -0
  52. server/__init__.py +0 -0
  53. server/config.py +83 -0
  54. server/main.py +59 -0
  55. server/models/__init__.py +85 -0
  56. server/routers/__init__.py +0 -0
  57. server/routers/ledger.py +36 -0
  58. server/routers/metrics.py +29 -0
  59. server/routers/settings.py +28 -0
  60. server/routers/state.py +21 -0
  61. server/routers/tasks.py +141 -0
  62. server/services/__init__.py +0 -0
  63. server/services/dispatch.py +175 -0
  64. server/services/metrics.py +85 -0
  65. server/services/registry.py +88 -0
  66. server/services/state.py +40 -0
  67. server/ws/__init__.py +0 -0
  68. server/ws/events.py +75 -0
cli/client.py ADDED
@@ -0,0 +1,137 @@
1
+ """HTTP/WS client with auto-spawn of the local API server (OpenCode-style).
2
+
3
+ Discovery: state/server/server.json lockfile {pid, port, started_at}.
4
+ If the server is alive (/health) -> connect.
5
+ Otherwise -> spawn uvicorn in the background, wait for /health, write lockfile.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import shlex
13
+ import subprocess
14
+ import sys
15
+ import time
16
+ from pathlib import Path
17
+
18
+ import httpx
19
+ import websockets
20
+
21
+ from ophar.paths import get_root
22
+
23
+ ROOT = get_root()
24
+ STATE_DIR = ROOT / "state" / "server"
25
+ LOCKFILE = STATE_DIR / "server.json"
26
+ PYTHON = sys.executable
27
+ DEFAULT_PORT = 8001
28
+
29
+
30
def _ensure_state_dir() -> None:
    """Create the server state directory, including missing parents, if needed."""
    STATE_DIR.mkdir(exist_ok=True, parents=True)
32
+
33
+
34
def _read_lockfile() -> dict | None:
    """Return the parsed lockfile, or None when it is missing or unreadable."""
    if LOCKFILE.exists():
        try:
            raw = LOCKFILE.read_text()
            return json.loads(raw)
        except (OSError, json.JSONDecodeError):
            # Unreadable or corrupt lockfile: treat it as if it were absent.
            pass
    return None
41
+
42
+
43
def _write_lockfile(data: dict) -> None:
    """Write *data* to the lockfile as indented JSON, creating its directory first."""
    _ensure_state_dir()
    payload = json.dumps(data, indent=2)
    LOCKFILE.write_text(payload)
46
+
47
+
48
def _is_alive(port: int) -> bool:
    """Check if the server at 127.0.0.1:<port> responds to /health."""
    url = f"http://127.0.0.1:{port}/health"
    try:
        reply = httpx.get(url, timeout=2)
        if reply.status_code != 200:
            return False
        return reply.json().get("status") == "ok"
    except Exception:
        # Any failure while probing (request error, bad payload) counts as "not alive".
        return False
55
+
56
+
57
def _pid_alive(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    Signal 0 makes the OS perform its existence and permission checks without
    delivering any signal.

    Args:
        pid: Process id to probe. Values <= 0 are rejected because they address
            process groups rather than a single process.

    Returns:
        True when the process exists (even if it belongs to another user),
        False otherwise.
    """
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM: the process exists, we just are not allowed to signal it.
        return True
    except OSError:
        return False
    return True
63
+
64
+
65
def get_base_url() -> str:
    """Get the base URL of the API server, spawning it if necessary.

    The server recorded in the lockfile is reused when its process exists and
    /health responds. Otherwise uvicorn is started in the background on
    OPUS_PORT (default DEFAULT_PORT), recorded in the lockfile, and polled for
    up to 15 seconds.

    Returns:
        The base URL (e.g. 'http://127.0.0.1:8001').

    Raises:
        RuntimeError: if the spawned server exits or never answers /health.
    """
    lock = _read_lockfile()
    if isinstance(lock, dict):
        # Tolerate a truncated or hand-edited lockfile instead of raising KeyError.
        port = lock.get("port")
        pid = lock.get("pid")
        if isinstance(pid, int) and isinstance(port, int) and _pid_alive(pid) and _is_alive(port):
            return f"http://127.0.0.1:{port}"

    # Spawn new server
    port = int(os.environ.get("OPUS_PORT", str(DEFAULT_PORT)))
    # The log file lives in STATE_DIR, which may not exist yet on a first run.
    _ensure_state_dir()
    log_path = STATE_DIR / "server.log"

    env = {**os.environ}
    cmd = [
        PYTHON, "-m", "uvicorn", "server.main:app",
        "--host", "127.0.0.1",
        "--port", str(port),
        "--log-level", "warning",
    ]
    with open(log_path, "a") as fh:
        proc = subprocess.Popen(
            cmd,
            stdout=fh,
            stderr=subprocess.STDOUT,
            env=env,
            # Run the server in its own session (setsid) so it outlives this CLI
            # process; this replaces the thread-unsafe preexec_fn=os.setsid.
            start_new_session=True,
        )

    _write_lockfile({
        "pid": proc.pid,
        "port": port,
        "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    })

    # Wait for /health (up to 15s)
    base = f"http://127.0.0.1:{port}"
    for _ in range(30):
        if _is_alive(port):
            return base
        if proc.poll() is not None:
            # The child already exited; waiting longer cannot help.
            break
        time.sleep(0.5)

    raise RuntimeError(f"Server did not start on port {port}. Check {log_path} for errors.")
111
+
112
+
113
def get_ws_url(path: str) -> str:
    """Return the ws:// URL for *path* on the API server (spawning it if needed)."""
    http_base = get_base_url()
    return f"{http_base.replace('http://', 'ws://')}{path}"
117
+
118
+
119
def api_get(path: str, **params) -> dict | list:
    """GET *path* from the API server and return the decoded JSON body.

    Keyword arguments are sent as query parameters. raise_for_status() is
    called on the response before the body is decoded.
    """
    url = f"{get_base_url()}{path}"
    response = httpx.get(url, params=params, timeout=30)
    response.raise_for_status()
    return response.json()
124
+
125
+
126
def api_post(path: str, body: dict | None = None) -> dict:
    """POST *body* as JSON to *path* and return the decoded JSON response.

    A missing or empty body is sent as an empty JSON object. The timeout is
    120 seconds; raise_for_status() is called before the body is decoded.
    """
    url = f"{get_base_url()}{path}"
    payload = body if body else {}
    response = httpx.post(url, json=payload, timeout=120)
    response.raise_for_status()
    return response.json()
131
+
132
+
133
def api_put(path: str, body: dict) -> dict:
    """PUT *body* as JSON to *path* and return the decoded JSON response.

    raise_for_status() is called on the response before the body is decoded.
    """
    url = f"{get_base_url()}{path}"
    response = httpx.put(url, json=body, timeout=30)
    response.raise_for_status()
    return response.json()
@@ -0,0 +1,59 @@
1
+ """Metrics commands."""
2
+
3
+ import time
4
+
5
+ import typer
6
+ from rich.console import Console
7
+ from rich.live import Live
8
+ from rich.table import Table
9
+ from rich.panel import Panel
10
+
11
+ from ..client import api_get
12
+
13
+ app = typer.Typer()
14
+ console = Console()
15
+
16
+
17
def _render_metrics_table(data: dict) -> Table:
    """Build the "Metrics Snapshot" table from a metrics payload.

    Args:
        data: Payload whose "raw" entry holds the metric values.

    Returns:
        A two-column rich Table. The wall-clock and Opus rows are added only
        when the corresponding section is present and non-empty.
    """
    raw = data["raw"]
    table = Table(title="Metrics Snapshot")
    table.add_column("Metric")
    table.add_column("Value")
    table.add_row("[bold]Runs[/]", str(raw.get("runs", "?")))
    table.add_row("[bold]Work OK Rate[/]", f"{(raw.get('work_ok_rate', 0) or 0) * 100:.1f}%")
    table.add_row("[bold]Overclaim Rate[/]", f"{(raw.get('overclaim_rate', 0) or 0) * 100:.1f}%")
    table.add_row("[bold]Composer Tokens[/]", str(raw.get("composer_tokens_total", "?")))
    # A section may be present but null; `or {}` treats that the same as absent
    # instead of failing on None.get(...).
    q = raw.get("quantiles") or {}
    wc = q.get("wall_clock_s") or {}
    if wc:
        table.add_row("[bold]Wall (p50/p95)[/]", f"{wc.get('p50', '?')} / {wc.get('p95', '?')}s")
    opus = raw.get("opus") or {}
    if opus:
        table.add_row("[bold]Opus Tokens Total[/]", str(opus.get("opus_tokens_total", "?")))
    return table
34
+
35
+
36
@app.command()
def show(
    json_: bool = typer.Option(False, "--json", help="Raw JSON output"),
    watch: bool = typer.Option(False, "--watch", help="Live refreshing dashboard"),
    classes: bool = typer.Option(False, "--classes", help="Breakdown by class"),
):
    """View metrics."""
    # Precedence when several flags are given: --classes, then --watch, then --json.
    if classes:
        console.print_json(data=api_get("/api/metrics/classes"))
        return
    if watch:
        with Live(refresh_per_second=0.3) as live:
            try:
                while True:
                    live.update(_render_metrics_table(api_get("/api/metrics")))
                    time.sleep(3)
            except KeyboardInterrupt:
                # Ctrl-C is the only way out of the dashboard; leave it quietly
                # instead of surfacing the interrupt as an abort.
                pass
        return
    data = api_get("/api/metrics")
    if json_:
        console.print_json(data=data)
    else:
        console.print(_render_metrics_table(data))
@@ -0,0 +1,25 @@
1
+ """Settings commands."""
2
+
3
+ import typer
4
+ from rich.console import Console
5
+
6
+ from ..client import api_get, api_put
7
+
8
+ console = Console()
9
+
10
+
11
def settings_get():
    """Show current settings."""
    settings = api_get("/api/settings")
    # Sorted by key so the listing is stable between calls.
    for name in sorted(settings):
        console.print(f"[bold]{name}[/] = {settings[name]}")
16
+
17
+
18
def settings_set(key: str = typer.Argument(...), value: str = typer.Argument(...)):
    """Set a setting (e.g. opctl settings-set MAX_ITERATIONS 5)."""
    # Values that parse as integers are sent as numbers; everything else is
    # sent unchanged as a string.
    try:
        payload = {key: int(value)}
    except ValueError:
        payload = {key: value}
    response = api_put("/api/settings", payload)
    console.print(f"[green]{key}[/] → {response.get(key)}")
cli/commands/system.py ADDED
@@ -0,0 +1,76 @@
1
+ """System commands: serve, stop, status, reconcile.
2
+
3
+ The orchestrator is NOT a CLI command - it is reached through the `ophar`
4
+ MCP server (instructions + pipeline:// resources + tools). Run `claude` and the
5
+ registered MCP server makes the session an orchestrator. There is no `opctl chat`.
6
+ """
7
+
8
+ import os
9
+ import signal
10
+
11
+ import typer
12
+ from rich.console import Console
13
+ from rich.table import Table
14
+
15
+ from ..client import get_base_url, api_get, api_post
16
+
17
+ app = typer.Typer()
18
+ console = Console()
19
+
20
+
21
@app.command()
def serve():
    """Start the API server explicitly (normally auto-spawned)."""
    # Resolving the base URL is what starts the server when none is running.
    console.print(f"[green]Server running at {get_base_url()}[/]")
26
+
27
+
28
@app.command()
def stop():
    """Stop the API server."""
    from ..client import _read_lockfile, LOCKFILE

    lock = _read_lockfile()
    if not lock:
        console.print("[dim]No server running[/]")
        return

    try:
        os.kill(lock["pid"], signal.SIGTERM)
        console.print(f"[yellow]Stopped server (pid {lock['pid']})[/]")
    except OSError:
        # The recorded process could not be signalled.
        console.print("[dim]Server already stopped[/]")
    # The lockfile is removed whether or not the signal was delivered.
    LOCKFILE.unlink(missing_ok=True)
42
+
43
+
44
@app.command()
def status():
    """Show pipeline status."""
    base = get_base_url()
    metrics = api_get("/api/metrics")
    tasks_data = api_get("/api/tasks")
    raw = metrics["raw"]
    # "Active" means not finished yet: queued or currently running.
    active_count = sum(1 for t in tasks_data if t["status"] in ("queued", "running"))

    table = Table(title="Ophar status")
    table.add_column("Key")
    table.add_column("Value")
    table.add_row("[bold]API server[/]", f"[green]running ({base})[/]")
    table.add_row("[bold]Active tasks[/]", f"{active_count}")
    table.add_row("[bold]Total runs[/]", str(raw.get("runs", "?")))
    table.add_row("[bold]Work OK[/]", f"{(raw.get('work_ok_rate', 0) or 0) * 100:.1f}%")
    table.add_row("[bold]Overclaim[/]", f"{(raw.get('overclaim_rate', 0) or 0) * 100:.1f}%")
    # Totals may be null; fall back to 0 like the rates above, because the
    # "," format spec raises TypeError on None.
    composer_tokens = raw.get("composer_tokens_total") or 0
    table.add_row("[bold]Composer tokens[/]", f"{composer_tokens:,}")
    opus = raw.get("opus") or {}
    if opus:
        opus_tokens = opus.get("opus_tokens_total") or 0
        table.add_row("[bold]Opus tokens[/]", f"{opus_tokens:,}")
    console.print(table)
66
+
67
+
68
@app.command()
def reconcile():
    """Run reconcile.sh against STATE.md claims."""
    report = api_post("/api/state/reconcile")
    checked = report.get("checked", "?")
    found = report.get("discrepancies", "?")
    # Only an explicit zero is shown in green; a missing count ("?") is red.
    style = "red" if found != 0 else "green"
    console.print(f"[{style}]Checked {checked} claims, {found} discrepancies[/]")
76
+
cli/commands/tasks.py ADDED
@@ -0,0 +1,104 @@
1
+ """Task commands: submit, list, show, cancel, logs."""
2
+
3
+ import json
4
+
5
+ import typer
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+
9
+ from ..client import api_get, api_post
10
+ from ..display.formatting import status_icon, status_color
11
+
12
+ app = typer.Typer()
13
+ console = Console()
14
+
15
+
16
@app.command()
def submit(spec_file: str = typer.Argument(..., help="Path to task spec JSON")):
    """Submit a task specification."""
    # Only the path is sent; the spec file itself is not read here.
    created = api_post("/api/tasks", {"spec_file": spec_file})
    task_id, state = created["task_id"], created["status"]
    console.print(f"[green]Submitted[/] {task_id} → {state}")
21
+
22
+
23
@app.command()
def list(
    status: str = typer.Option(None, help="Filter by status"),
    class_: str = typer.Option(None, "--class", help="Filter by class"),
):
    """List tasks."""
    # Only filters that were actually supplied become query parameters.
    filters = {
        name: value
        for name, value in (("status", status), ("class_", class_))
        if value
    }
    tasks = api_get("/api/tasks", **filters)

    table = Table(title="Tasks")
    for heading in ("Task ID", "Status", "Submitted"):
        table.add_column(heading)
    for task in tasks:
        state = task["status"]
        table.add_row(
            task["task_id"],
            f"[{status_color(state)}]{status_icon(state)} {state}[/]",
            task.get("submitted_at", "?"),
        )
    console.print(table)
47
+
48
+
49
@app.command()
def show(task_id: str = typer.Argument(...), diff: bool = typer.Option(False, "--diff")):
    """Show task details."""
    task = api_get(f"/api/tasks/{task_id}")
    console.print(f"[bold]Task:[/] {task['task_id']}")
    state = task["status"]
    console.print(f"[bold]Status:[/] [{status_color(state)}]{state}[/]")
    # Optional fields are printed only when present and non-empty.
    for label, field in (("Verdict", "verdict"), ("Landed", "landed_sha")):
        if task.get(field):
            console.print(f"[bold]{label}:[/] {task[field]}")
    console.print(f"[bold]Iterations:[/] {len(task['iterations'])}")
    if diff and task.get("diff"):
        console.print("\n[bold cyan]Diff:[/]")
        console.print(task["diff"])
63
+
64
+
65
@app.command()
def cancel(task_id: str = typer.Argument(...)):
    """Cancel a running or queued task."""
    reply = api_post(f"/api/tasks/{task_id}/cancel")
    # Echo the id reported back by the server rather than the one typed in.
    console.print(f"[yellow]Cancelled[/] {reply['task_id']}")
70
+
71
+
72
@app.command()
def logs(task_id: str = typer.Argument(...), follow: bool = typer.Option(False, "-f", help="Follow live (WebSocket)")):
    """View per-task log."""
    if not follow:
        # Read the log file directly
        from server.config import LOGS_DIR

        log_path = LOGS_DIR / f"{task_id}.log"
        if log_path.exists():
            console.print(log_path.read_text())
        else:
            console.print(f"[dim]No log for {task_id}[/]")
        return

    import asyncio

    import websockets

    from ..client import get_ws_url

    async def _follow():
        # Binary frames are raw log output. Text frames are either a JSON
        # control event (an object) or plain log text.
        url = get_ws_url(f"/ws/tasks/{task_id}")
        async with websockets.connect(url) as ws:
            async for msg in ws:
                if isinstance(msg, bytes):
                    # Never let a stray undecodable byte kill the stream.
                    console.print(msg.decode("utf-8", errors="replace"), end="")
                    continue
                try:
                    event = json.loads(msg)
                except json.JSONDecodeError:
                    console.print(msg, end="")
                    continue
                if not isinstance(event, dict):
                    # Log text such as "42" or "null" parses as JSON but is not
                    # a control event; show it as ordinary output.
                    console.print(msg, end="")
                    continue
                if event.get("done"):
                    console.print(f"\n[bold]Task finished: {event.get('status', '?')}[/]")
                    break

    asyncio.run(_follow())
@@ -0,0 +1,29 @@
1
+ """Display formatting helpers."""
2
+
3
# One row per task status: (icon, rich color). The two public lookup tables
# below are derived from it so they always cover the same statuses.
_STATUS_STYLES = {
    "queued": ("◌", "dim"),
    "running": ("●", "blue"),
    "accepted": ("✓", "green"),
    "rejected": ("✗", "red"),
    "blocked": ("⊘", "yellow"),
    "cancelled": ("○", "dim"),
    "infra_error": ("⚠", "red"),
}

STATUS_ICONS = {status: icon for status, (icon, _color) in _STATUS_STYLES.items()}

STATUS_COLORS = {status: color for status, (_icon, color) in _STATUS_STYLES.items()}


def status_icon(status: str) -> str:
    """Return the icon for *status*, or "?" for an unknown status."""
    return STATUS_ICONS.get(status, "?")


def status_color(status: str) -> str:
    """Return the color name for *status*, or "white" for an unknown status."""
    return STATUS_COLORS.get(status, "white")
cli/main.py ADDED
@@ -0,0 +1,19 @@
1
+ """CLI for Ophar — opctl."""
2
+
3
+ import typer
4
+
5
+ from .commands import tasks, metrics, system, settings
6
+
7
app = typer.Typer(help="Ophar CLI (opctl)")

# Command groups, mounted as `opctl <group> <command>`.
for _group, _name, _help in (
    (tasks.app, "tasks", "Submit and manage tasks"),
    (metrics.app, "metrics", "Metrics dashboard"),
    (system.app, "system", "Server lifecycle and reconcile"),
):
    app.add_typer(_group, name=_name, help=_help)

# Flat commands
# settings_get is registered without an explicit name, so Typer picks the
# command name itself; the other three are named explicitly.
app.command()(settings.settings_get)
app.command(name="settings-set")(settings.settings_set)
app.command(name="serve")(system.serve)
app.command(name="stop")(system.stop)

if __name__ == "__main__":
    app()
ophar/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Ophar — Opus · Composer · Harness pipeline."""
2
+
3
+ from ophar.paths import get_root
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ __all__ = ["__version__", "get_root"]
@@ -0,0 +1,30 @@
1
+ # Executor boundaries (Composer)
2
+
3
+ You are the **executor**. You run headless inside a git worktree of the target repo and
4
+ your output is **untrusted** — an independent verifier re-derives ground truth (git diff,
5
+ tests, typecheck/lint, hidden checks) and decides accept/iterate. Honesty and staying in
6
+ bounds are what make you useful; gaming the visible signal is always caught and counts
7
+ against you.
8
+
9
+ ## Scope
10
+ - Modify **only** files inside the task's allowed scope. Touching anything outside it fails
11
+ verification (the change is rejected and logged as a scope violation).
12
+ - Stay inside the worktree. Do not reach outside the workspace.
13
+
14
+ ## How to make tests pass
15
+ - Fix the **real cause**. Do **not** delete, skip, comment out, or weaken tests/assertions,
16
+ and do not special-case the exact inputs shown in the visible tests.
17
+ - Correctness is judged on **general behavior**, from a different angle than the visible
18
+ tests. A solution tuned to the shown cases fails the hidden checks.
19
+ - Do not search for, read, or try to reconstruct hidden/held-out checks. They are not in
20
+ your workspace during the run; looking for them is wasted effort and a trust red flag.
21
+
22
+ ## Forbidden / destructive
23
+ - No `rm -rf` outside the workspace, no `git push`, no `git reset --hard` on shared refs,
24
+ no rewriting git history, no network exfiltration, no editing CI/secrets.
25
+ - Prefer the smallest diff that satisfies the acceptance criterion.
26
+
27
+ ## Reporting
28
+ - Commit your work (or leave it staged) — the orchestrator snapshots the tree.
29
+ - Report **honestly**. If you are blocked or unsure, say so. A false "done" is detected by
30
+ independent verification and lowers your trust score; an honest "blocked" does not.
@@ -0,0 +1,38 @@
1
+ # Orchestrator delegation discipline (Opus)
2
+
3
+ You are **Opus, the orchestrator** of the Opus→Composer pipeline. Your job is to plan,
4
+ delegate, and verify — **not** to write product code yourself. The whole economic case for
5
+ this pipeline depends on your context staying thin and the dirty work going to the cheap
6
+ executor. Read `orchestrator-pipeline-plan.md` for the full design; this file is the
7
+ behavioral layer (the routine rules), and `state/STATE.md` is the live state.
8
+
9
+ ## Session start (before trusting anything)
10
+ - Run `harness/reconcile.sh` FIRST. It checks `state/STATE.md`'s machine-checkable claims
11
+ against git/tests/files/ledger. Until it reports 0 discrepancies, treat the prose as a
12
+ hint, not truth.
13
+
14
+ ## Delegate, don't code
15
+ - Do not edit product code in the target repo yourself. Write a task spec and dispatch the
16
+ executor. Your edits are limited to the harness, specs, and `state/`.
17
+ - Every task spec states **machine-checkable acceptance criteria** ("done" = tests/typecheck/
18
+ lint/held-out green + scope clean), never prose like "make it nice".
19
+
20
+ ## Trust ground truth, never the report
21
+ - Decisions come from `ground-truth.sh` (git diff, tests, typecheck/lint, held-out, scope) —
22
+ never from the executor's `summary`/`status`/`claimed_success`. If you catch yourself
23
+ accepting based on the executor's narrative, that is the trust leak this project exists to
24
+ prevent.
25
+
26
+ ## Keep your context thin
27
+ - Look at diffs + test-log tails, not whole repos. Do not read files wholesale.
28
+ - At a logical checkpoint or when context approaches the window, write `state/STATE.md` and
29
+ start a fresh session that rehydrates from disk + reconcile.
30
+
31
+ ## State authorship
32
+ - You are the sole author of `state/STATE.md` and the ledger. Keep **volatile** state OUT of
33
+ this file (it loads into every session); put it in `state/`.
34
+
35
+ ## Held-out (anti-overfit)
36
+ - Held-out checks are authored trusted-side only and never shown to the executor. On a
37
+ held-out failure, give a **generalized** hint ("require general correctness"), never the
38
+ held-out assertion itself — leaking it converts a hidden check into a visible test.
@@ -0,0 +1,106 @@
1
#!/usr/bin/env bash
#
# checkpoint.sh
#
# The DETERMINISTIC checkpoint step (§4 / §8.2) — the half the plan audit found missing.
# STATE.md is the soft state Opus AUTHORS (prose: plan, decisions, why). This is the glue
# half: a zero-Opus-token, machine-written checkpoint of what git/ledger/metrics already
# know, plus the §8.2 critical-point TRIGGERS held as data instead of "by feel". It is meant
# to run BEFORE a compaction / session switch, so a fresh session rehydrates from
# state/checkpoint.json + state/reconcile.json + STATE.md.
#
# It NEVER rewrites STATE.md (that is Opus's authored prose). It only emits a structured
# checkpoint and a recommendation.
#
# §8.2 triggers (any -> checkpoint + fresh session):
#   - context near threshold (proxy: max Opus brief_chars from opus-metrics.jsonl)
#   - logical milestone closed (last ledger event is accept/reject/block)
#   - quality degradation (a task hit >= DEGRADE_ITERS iterate rounds)
#   - several iterate/reject in a row (trailing run >= CONSECUTIVE_LIMIT — context may be dirty)
#
# Emits state/checkpoint.json (also echoed to stdout).
# Exit codes: 10 = checkpoint + fresh session recommended, 0 = continue,
#             2 = error (ledger missing, or the checkpoint could not be generated).
#
# Env:
#   LEDGER, OPUS_METRICS_LOG, CHECKPOINT_OUT — override the input/output paths (tests use this).
#   CONTEXT_PROXY_LIMIT (24000), DEGRADE_ITERS (3), CONSECUTIVE_LIMIT (2) — trigger thresholds.
#
set -uo pipefail
HARNESS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$HARNESS_DIR/.." && pwd)"

LEDGER="${LEDGER:-$ROOT/state/ledger.jsonl}"
OPUS_METRICS="${OPUS_METRICS_LOG:-$ROOT/runs/opus-metrics.jsonl}"
OUT="${CHECKPOINT_OUT:-$ROOT/state/checkpoint.json}"
CONTEXT_PROXY_LIMIT="${CONTEXT_PROXY_LIMIT:-24000}"
DEGRADE_ITERS="${DEGRADE_ITERS:-3}"
CONSECUTIVE_LIMIT="${CONSECUTIVE_LIMIT:-2}"

# Create the output directory before ANY write to $OUT, including the error path below.
mkdir -p "$(dirname "$OUT")"

[[ -f "$LEDGER" ]] || { echo "checkpoint: ledger not found: $LEDGER" >&2; jq -n '{error:"no ledger"}' | tee "$OUT" >/dev/null; exit 2; }

# /context-growth proxy: the largest brief we ever fed Opus (§3). A real /context number
# would be better; this is the deterministic stand-in until that is wired.
CTX=0
if [[ -f "$OPUS_METRICS" ]]; then
  CTX="$(jq -s 'map(.brief_chars // 0) | (max // 0)' "$OPUS_METRICS" 2>/dev/null || echo 0)"
fi
# Anything that is not a plain non-negative integer counts as "no data".
[[ "$CTX" =~ ^[0-9]+$ ]] || CTX=0

# With pipefail the pipeline fails if jq fails (malformed ledger, non-numeric threshold, ...).
# Without this check an empty checkpoint would be reported as "continue current session".
if ! jq -s \
  --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  --argjson ctx "$CTX" \
  --argjson ctxlim "$CONTEXT_PROXY_LIMIT" \
  --argjson degit "$DEGRADE_ITERS" \
  --argjson conlim "$CONSECUTIVE_LIMIT" '
  . as $ev
  | ($ev | length) as $n
  | (($ev[-1].event) // "") as $last
  | ($ev | map(select(.event=="iterate")) | group_by(.task_id)
      | map({task:.[0].task_id, iters:length})) as $iters
  | ($ev | group_by(.task_id)
      | map({task:.[0].task_id, latest:(sort_by(.ts)|last.event), events:length})) as $tasks
  # trailing run of iterate/reject (count from the end until a non-{iterate,reject} event)
  | ([ ($ev|reverse)[].event ]) as $rev
  | (reduce range(0; ($rev|length)) as $i ({stop:false,c:0};
      if .stop then .
      elif ($rev[$i]=="iterate" or $rev[$i]=="reject") then {stop:false,c:(.c+1)}
      else {stop:true,c:.c} end) | .c) as $consec
  | (($iters | map(.iters) | max) // 0) as $maxit
  | {
      ts: $ts,
      ledger_events: $n,
      last_event: $last,
      tasks: $tasks,
      iterations_by_task: $iters,
      ledger_tail: ($ev[-8:] | map({ts, event, task_id})),
      signals: {
        context_proxy_chars: $ctx,
        context_proxy_limit: $ctxlim,
        context_near_threshold: ($ctx >= $ctxlim),
        milestone_closed: ($last=="accept" or $last=="reject" or $last=="block"),
        consecutive_iterate_reject: $consec,
        consecutive_limit: $conlim,
        consecutive_trip: ($consec >= $conlim),
        max_iterations_on_a_task: $maxit,
        degrade_iters_limit: $degit,
        quality_degradation: ($maxit >= $degit)
      }
    }
  | .checkpoint_recommended = (
      .signals.context_near_threshold or .signals.milestone_closed
      or .signals.consecutive_trip or .signals.quality_degradation)
  | .reasons = [
      (if .signals.context_near_threshold then "context proxy \(.signals.context_proxy_chars) >= \(.signals.context_proxy_limit) chars (near window threshold)" else empty end),
      (if .signals.milestone_closed then "logical milestone closed (last event: \(.last_event))" else empty end),
      (if .signals.consecutive_trip then "\(.signals.consecutive_iterate_reject) consecutive iterate/reject (context may be polluted)" else empty end),
      (if .signals.quality_degradation then "a task reached \(.signals.max_iterations_on_a_task) iterate rounds (quality degradation)" else empty end)
    ]
' "$LEDGER" | tee "$OUT"; then
  echo "checkpoint: failed to generate checkpoint from $LEDGER" >&2
  # Leave a well-formed error marker instead of an empty or partial checkpoint.
  jq -n '{error:"checkpoint generation failed"}' > "$OUT"
  exit 2
fi

REC="$(jq -r '.checkpoint_recommended' "$OUT" 2>/dev/null)"
if [[ "$REC" == "true" ]]; then
  echo "checkpoint: RECOMMENDED — $(jq -rc '.reasons' "$OUT")" >&2
  exit 10
fi
echo "checkpoint: no critical-point trigger; continue current session" >&2
exit 0