atris 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +10 -0
  2. package/atris/experiments/README.md +118 -0
  3. package/atris/experiments/_examples/smoke-keep-revert/README.md +45 -0
  4. package/atris/experiments/_examples/smoke-keep-revert/candidate.py +8 -0
  5. package/atris/experiments/_examples/smoke-keep-revert/loop.py +129 -0
  6. package/atris/experiments/_examples/smoke-keep-revert/measure.py +47 -0
  7. package/atris/experiments/_examples/smoke-keep-revert/program.md +3 -0
  8. package/atris/experiments/_examples/smoke-keep-revert/proposals/bad_patch.py +19 -0
  9. package/atris/experiments/_examples/smoke-keep-revert/proposals/fix_patch.py +22 -0
  10. package/atris/experiments/_examples/smoke-keep-revert/reset.py +21 -0
  11. package/atris/experiments/_examples/smoke-keep-revert/results.tsv +5 -0
  12. package/atris/experiments/_examples/smoke-keep-revert/visual.svg +52 -0
  13. package/atris/experiments/_fixtures/invalid/BadName/loop.py +1 -0
  14. package/atris/experiments/_fixtures/invalid/BadName/program.md +3 -0
  15. package/atris/experiments/_fixtures/invalid/BadName/results.tsv +1 -0
  16. package/atris/experiments/_fixtures/invalid/bloated-context/loop.py +1 -0
  17. package/atris/experiments/_fixtures/invalid/bloated-context/measure.py +1 -0
  18. package/atris/experiments/_fixtures/invalid/bloated-context/program.md +6 -0
  19. package/atris/experiments/_fixtures/invalid/bloated-context/results.tsv +1 -0
  20. package/atris/experiments/_fixtures/valid/good-experiment/loop.py +1 -0
  21. package/atris/experiments/_fixtures/valid/good-experiment/measure.py +1 -0
  22. package/atris/experiments/_fixtures/valid/good-experiment/program.md +3 -0
  23. package/atris/experiments/_fixtures/valid/good-experiment/results.tsv +1 -0
  24. package/atris/experiments/_template/pack/loop.py +3 -0
  25. package/atris/experiments/_template/pack/measure.py +13 -0
  26. package/atris/experiments/_template/pack/program.md +3 -0
  27. package/atris/experiments/_template/pack/reset.py +3 -0
  28. package/atris/experiments/_template/pack/results.tsv +1 -0
  29. package/atris/experiments/benchmark_runtime.py +81 -0
  30. package/atris/experiments/benchmark_validate.py +70 -0
  31. package/atris/experiments/validate.py +92 -0
  32. package/atris/policies/atris-design.md +66 -0
  33. package/atris/skills/README.md +1 -0
  34. package/atris/skills/apps/SKILL.md +243 -0
  35. package/atris/skills/autoresearch/SKILL.md +63 -0
  36. package/atris/skills/create-app/SKILL.md +6 -0
  37. package/atris/skills/design/SKILL.md +15 -1
  38. package/atris/skills/drive/SKILL.md +335 -20
  39. package/atris/skills/ramp/SKILL.md +295 -0
  40. package/bin/atris.js +76 -5
  41. package/commands/business.js +132 -0
  42. package/commands/clean.js +113 -70
  43. package/commands/console.js +397 -0
  44. package/commands/experiments.js +216 -0
  45. package/commands/init.js +4 -0
  46. package/commands/pull.js +311 -0
  47. package/commands/push.js +170 -0
  48. package/commands/run.js +366 -0
  49. package/commands/status.js +21 -1
  50. package/package.json +2 -1
package/README.md CHANGED
@@ -41,6 +41,16 @@ Commands: `brainstorm` (optional) → `plan` → `do` → `review`
41
41
 
42
42
  Works with: Claude Code, Cursor, Windsurf, GitHub Copilot, any agent.
43
43
 
44
+ ## Experiments
45
+
46
+ Atris also supports Karpathy-style keep/revert loops inside `atris/experiments/`.
47
+
48
+ ```bash
49
+ atris experiments init self-heal
50
+ atris experiments validate
51
+ atris experiments benchmark
52
+ ```
53
+
44
54
  ## Update
45
55
 
46
56
  ```bash
@@ -0,0 +1,118 @@
1
+ # experiments
2
+
3
+ Karpathy-style experiment framework for Atris workspaces.
4
+
5
+ This folder defines the schema, validation rules, and benchmark harness for self-improvement loops.
6
+ Live experiment packs belong directly inside `atris/experiments/`.
7
+
8
+ ## What This Is
9
+
10
+ An experiment is not "the agent rewrote its prompt and said it improved."
11
+
12
+ An experiment is:
13
+
14
+ 1. one bounded target
15
+ 2. one external metric
16
+ 3. one keep/revert loop
17
+ 4. one append-only log
18
+
19
+ If the metric goes up, keep the change.
20
+ If it does not, revert it.
21
+
22
+ ## Schema
23
+
24
+ ```text
25
+ atris/experiments/
26
+ ├── README.md
27
+ ├── validate.py
28
+ ├── benchmark_validate.py
29
+ ├── benchmark_runtime.py
30
+ ├── _template/ # packaged scaffolds
31
+ ├── _examples/ # packaged smoke examples
32
+ ├── _fixtures/ # validator benchmark cases
33
+ └── <experiment-slug>/
34
+ ├── program.md
35
+ ├── measure.py
36
+ ├── loop.py
37
+ ├── results.tsv
38
+ ├── reset.py # preferred
39
+ ├── proposals/ # optional
40
+ └── <bounded-target> # candidate.py, system_prompt.txt, etc.
41
+ ```
42
+
43
+ ## Rules
44
+
45
+ 1. One bounded mutation target per experiment.
46
+ 2. `measure.py` must use an external metric the agent cannot fake.
47
+ 3. `loop.py` must keep only improvements and revert regressions.
48
+ 4. `program.md` stays short and task-specific.
49
+ 5. `results.tsv` stays append-only.
50
+
51
+ ## Repo Contents
52
+
53
+ - `_template/pack/` - starter files for a new experiment
54
+ - `validate.py` - structural and bloat checks
55
+ - `benchmark_validate.py` - validator benchmark on fixed good/bad fixtures
56
+ - `benchmark_runtime.py` - runtime benchmark on packaged example packs
57
+ - `_examples/` - tiny reference implementation
58
+
59
+ ## Example
60
+
61
+ Start with the smallest honest pack:
62
+
63
+ ```text
64
+ _examples/smoke-keep-revert/
65
+ ├── candidate.py
66
+ ├── measure.py
67
+ ├── loop.py
68
+ ├── reset.py
69
+ ├── results.tsv
70
+ └── proposals/
71
+ ├── bad_patch.py
72
+ └── fix_patch.py
73
+ ```
74
+
75
+ What it does:
76
+
77
+ - `candidate.py` starts broken on purpose
78
+ - `measure.py` scores it on a fixed word-count test
79
+ - `bad_patch.py` does not improve it
80
+ - `fix_patch.py` actually fixes it
81
+ - `loop.py` keeps only the fix
82
+
83
+ Run it:
84
+
85
+ ```bash
86
+ python _examples/smoke-keep-revert/reset.py
87
+ python _examples/smoke-keep-revert/loop.py \
88
+ --proposal _examples/smoke-keep-revert/proposals/bad_patch.py \
89
+ --proposal _examples/smoke-keep-revert/proposals/fix_patch.py
90
+ ```
91
+
92
+ Visual:
93
+
94
+ ```text
95
+ broken target
96
+
97
+ score = 0.2
98
+
99
+ bad patch
100
+
101
+ score = 0.2 (no improvement)
102
+
103
+ REVERT
104
+
105
+ good patch
106
+
107
+ score = 1.0
108
+
109
+ KEEP
110
+ ```
111
+
112
+ ## Commands
113
+
114
+ ```bash
115
+ python validate.py .
116
+ python benchmark_validate.py
117
+ python benchmark_runtime.py
118
+ ```
@@ -0,0 +1,45 @@
1
+ # smoke-keep-revert
2
+
3
+ Smallest honest example of the framework.
4
+
5
+ ![Smoke Keep/Revert Flow](./visual.svg)
6
+
7
+ ## Files
8
+
9
+ ```text
10
+ candidate.py -> bounded target
11
+ measure.py -> hard score
12
+ loop.py -> keep/revert engine
13
+ reset.py -> restore baseline
14
+ results.tsv -> trial log
15
+ proposals/ -> bad patch + good patch
16
+ ```
17
+
18
+ ## Flow
19
+
20
+ ```text
21
+ candidate.py is wrong
22
+
23
+ measure.py scores baseline
24
+
25
+ loop.py applies bad_patch.py
26
+
27
+ score does not improve
28
+
29
+ loop.py reverts the change
30
+
31
+ loop.py applies fix_patch.py
32
+
33
+ score improves
34
+
35
+ loop.py keeps the change
36
+ ```
37
+
38
+ ## Run
39
+
40
+ ```bash
41
+ python reset.py
42
+ python loop.py \
43
+ --proposal proposals/bad_patch.py \
44
+ --proposal proposals/fix_patch.py
45
+ ```
@@ -0,0 +1,8 @@
1
+ """Bounded mutation target for the smoke experiment."""
2
+
3
+
4
def count_words(text: str) -> int:
    """Return how many whitespace-separated words *text* contains.

    Empty or whitespace-only input yields 0.
    """
    # split() with no separator ignores leading/trailing whitespace and
    # collapses runs of whitespace, so blank input produces an empty list.
    words = text.split()
    return len(words)
@@ -0,0 +1,129 @@
1
+ """Shared keep/revert loop for a bounded local experiment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import csv
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ import shutil
11
+ import subprocess
12
+ import sys
13
+ from datetime import datetime, timezone
14
+
15
+
16
+ EXPERIMENT_DIR = Path(__file__).resolve().parent
17
+ DEFAULT_TARGET = EXPERIMENT_DIR / "candidate.py"
18
+ DEFAULT_MEASURE = EXPERIMENT_DIR / "measure.py"
19
+ DEFAULT_RESULTS = EXPERIMENT_DIR / "results.tsv"
20
+
21
+
22
def run_measure(measure_path: Path) -> dict:
    """Execute the measure script and return its decoded JSON report.

    The script runs under the current interpreter with the experiment
    directory as its working directory. A non-zero exit raises
    ``subprocess.CalledProcessError`` (``check=True``); stdout that is not
    valid JSON makes ``json.loads`` raise.
    """
    command = [sys.executable, str(measure_path)]
    completed = subprocess.run(
        command,
        cwd=str(EXPERIMENT_DIR),
        capture_output=True,
        text=True,
        check=True,
    )
    report_text = completed.stdout.strip()
    return json.loads(report_text)
31
+
32
+
33
def append_result(results_path: Path, row: dict) -> None:
    """Append one trial row to the tab-separated results log.

    A header line is written first when the log does not exist yet or is
    empty. ``row`` must only use the column names listed below; DictWriter
    rejects unknown keys.
    """
    columns = [
        "timestamp",
        "trial",
        "status",
        "old_score",
        "new_score",
        "proposal",
        "description",
    ]

    # Decide about the header before opening: append mode creates the file.
    needs_header = True
    if results_path.exists():
        needs_header = results_path.stat().st_size == 0

    # newline="" lets the csv module control line endings itself.
    with results_path.open("a", newline="", encoding="utf-8") as log_file:
        tsv = csv.DictWriter(log_file, fieldnames=columns, delimiter="\t")
        if needs_header:
            tsv.writeheader()
        tsv.writerow(row)
52
+
53
+
54
def restore_backup(backup_path: Path, target_path: Path) -> None:
    """Overwrite the target with its backup copy, then delete the backup."""
    source, destination = backup_path, target_path
    shutil.copy2(source, destination)
    # The backup is single-use; once the target is restored it is removed.
    source.unlink(missing_ok=True)
57
+
58
+
59
def main() -> int:
    """Run every ``--proposal`` script as one keep/revert trial.

    For each proposal the target file is backed up, the proposal script is
    executed with ``EXPERIMENT_TARGET`` pointing at the target, and the
    measure script is re-run. A strictly higher score keeps the change; any
    other outcome (equal or lower score, failing script, unreadable measure
    output) restores the backup. One row per trial is appended to the
    results log.

    Returns:
        0 once all proposals have been processed.

    Raises:
        subprocess.CalledProcessError: if the baseline or final measurement
            exits non-zero (trial failures are recorded, not raised).
    """
    parser = argparse.ArgumentParser(description="Run a bounded keep/revert experiment.")
    parser.add_argument("--proposal", action="append", default=[])
    args = parser.parse_args()

    target_path = DEFAULT_TARGET.resolve()
    measure_path = DEFAULT_MEASURE.resolve()
    results_path = DEFAULT_RESULTS.resolve()

    baseline = run_measure(measure_path)
    current_score = float(baseline["score"])
    print(f"BASELINE {current_score:.4f}")

    for trial_index, proposal in enumerate(args.proposal, start=1):
        proposal_path = Path(proposal).resolve()
        # Per-trial backup name, e.g. candidate.py.trial1.bak.
        backup_path = target_path.with_suffix(target_path.suffix + f".trial{trial_index}.bak")
        shutil.copy2(target_path, backup_path)

        status = "error"
        old_score = current_score
        new_score = current_score
        description = ""

        try:
            proc = subprocess.run(
                [sys.executable, str(proposal_path)],
                cwd=str(EXPERIMENT_DIR),
                capture_output=True,
                text=True,
                check=True,
                env={**os.environ, "EXPERIMENT_TARGET": str(target_path)},
            )
            if proc.stdout.strip():
                # The last stdout line of the proposal doubles as its description.
                description = proc.stdout.strip().splitlines()[-1][:200]

            measured = run_measure(measure_path)
            new_score = float(measured["score"])
            if new_score > current_score:
                status = "kept"
                current_score = new_score
                backup_path.unlink(missing_ok=True)
            else:
                # Ties count as "no improvement" and are reverted too.
                status = "reverted"
                restore_backup(backup_path, target_path)
        except subprocess.CalledProcessError as exc:
            # Proposal or measure script exited non-zero: undo the mutation.
            restore_backup(backup_path, target_path)
            stderr = (exc.stderr or exc.stdout or "").strip()
            description = (stderr.splitlines()[-1] if stderr else "proposal failed")[:200]
            status = "error"
        except (KeyError, TypeError, ValueError) as exc:
            # The measure script exited 0 but its output was not usable
            # (invalid JSON, missing or non-numeric "score"). Without this
            # branch the mutated target and the .bak file would be left
            # behind by the crash.
            restore_backup(backup_path, target_path)
            description = f"invalid measure output: {exc}"[:200]
            status = "error"

        append_result(
            results_path,
            {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "trial": trial_index,
                "status": status,
                "old_score": f"{old_score:.4f}",
                "new_score": f"{new_score:.4f}",
                "proposal": proposal_path.name,
                "description": description,
            },
        )
        print(f"TRIAL {trial_index} {status.upper()} score={new_score:.4f} proposal={proposal_path.name}")

    final_measure = run_measure(measure_path)
    # Coerce like the baseline so a non-float JSON value still formats.
    print(f"FINAL {float(final_measure['score']):.4f}")
    return 0
126
+
127
+
128
+ if __name__ == "__main__":
129
+ raise SystemExit(main())
@@ -0,0 +1,47 @@
1
+ """Objective metric for the smoke keep/revert example."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ import sys
8
+
9
+
10
+ EXPERIMENT_DIR = Path(__file__).resolve().parent
11
+ if str(EXPERIMENT_DIR) not in sys.path:
12
+ sys.path.insert(0, str(EXPERIMENT_DIR))
13
+
14
+ from candidate import count_words
15
+
16
+
17
+ CASES = [
18
+ ("", 0),
19
+ ("one", 1),
20
+ ("two words", 2),
21
+ (" three spaced words ", 3),
22
+ ("punctuation, still counts", 3),
23
+ ]
24
+
25
+
26
def main() -> int:
    """Score ``count_words`` on the fixed cases and print a JSON report.

    The report carries ``score`` (pass ratio rounded to 4 places),
    ``passed``, ``total`` and a ``status`` of "pass" only when every case
    matched. Always returns 0.
    """
    total = len(CASES)
    passed = sum(1 for text, expected in CASES if count_words(text) == expected)

    # Guard the division so an empty case list scores 0.0.
    score = passed / total if total else 0.0

    report = {
        "score": round(score, 4),
        "passed": passed,
        "total": total,
        "status": "pass" if passed == total else "fail",
    }
    print(json.dumps(report))
    return 0
44
+
45
+
46
+ if __name__ == "__main__":
47
+ raise SystemExit(main())
@@ -0,0 +1,3 @@
1
+ # Program
2
+
3
+ Prove the keep/revert loop in the smallest honest way.
@@ -0,0 +1,19 @@
1
+ """A deliberately bad mutation that should be reverted."""
2
+
3
+ from pathlib import Path
4
+ import os
5
+
6
+
7
+ TARGET = Path(os.environ["EXPERIMENT_TARGET"])
8
+
9
+ TARGET.write_text(
10
+ '''"""Bounded mutation target for the smoke experiment."""
11
+
12
+
13
+ def count_words(text: str) -> int:
14
+ return 0
15
+ ''',
16
+ encoding="utf-8",
17
+ )
18
+
19
+ print("applied bad proposal")
@@ -0,0 +1,22 @@
1
+ """A good mutation that should be kept."""
2
+
3
+ from pathlib import Path
4
+ import os
5
+
6
+
7
+ TARGET = Path(os.environ["EXPERIMENT_TARGET"])
8
+
9
+ TARGET.write_text(
10
+ '''"""Bounded mutation target for the smoke experiment."""
11
+
12
+
13
+ def count_words(text: str) -> int:
14
+ cleaned = text.strip()
15
+ if not cleaned:
16
+ return 0
17
+ return len(cleaned.split())
18
+ ''',
19
+ encoding="utf-8",
20
+ )
21
+
22
+ print("applied good proposal")
@@ -0,0 +1,21 @@
1
+ """Restore the smoke example to its baseline."""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ TARGET = Path(__file__).resolve().parent / "candidate.py"
7
+
8
+ TARGET.write_text(
9
+ '''"""Bounded mutation target for the smoke experiment."""
10
+
11
+
12
+ def count_words(text: str) -> int:
13
+ cleaned = text.strip()
14
+ if not cleaned:
15
+ return 0
16
+ return len(cleaned)
17
+ ''',
18
+ encoding="utf-8",
19
+ )
20
+
21
+ print("reset smoke-keep-revert to baseline")
@@ -0,0 +1,5 @@
1
+ timestamp trial status old_score new_score proposal description
2
+ 2026-03-11T11:05:17.887045+00:00 1 reverted 0.2000 0.2000 bad_patch.py applied bad proposal
3
+ 2026-03-11T11:05:17.920737+00:00 2 kept 0.2000 1.0000 fix_patch.py applied good proposal
4
+ 2026-03-11T11:05:40.063680+00:00 1 reverted 0.2000 0.2000 bad_patch.py applied bad proposal
5
+ 2026-03-11T11:05:40.097842+00:00 2 kept 0.2000 1.0000 fix_patch.py applied good proposal
@@ -0,0 +1,52 @@
1
+ <svg width="980" height="260" viewBox="0 0 980 260" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <rect width="980" height="260" fill="#F7F5EF"/>
3
+ <text x="40" y="42" font-family="Helvetica, Arial, sans-serif" font-size="28" font-weight="700" fill="#111111">Smoke Keep/Revert</text>
4
+ <text x="40" y="68" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#4B5563">One bounded target. One hard metric. Reject the loser. Keep the winner.</text>
5
+
6
+ <rect x="40" y="110" width="150" height="88" rx="16" fill="#FFF7ED" stroke="#C2410C" stroke-width="2"/>
7
+ <text x="115" y="140" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#9A3412">Broken Target</text>
8
+ <text x="115" y="166" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#7C2D12">candidate.py</text>
9
+ <text x="115" y="186" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#7C2D12">buggy on purpose</text>
10
+
11
+ <rect x="220" y="110" width="150" height="88" rx="16" fill="#EFF6FF" stroke="#1D4ED8" stroke-width="2"/>
12
+ <text x="295" y="140" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#1E3A8A">Measure</text>
13
+ <text x="295" y="166" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#1D4ED8">score = 0.2</text>
14
+ <text x="295" y="186" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#1D4ED8">baseline truth</text>
15
+
16
+ <rect x="400" y="38" width="160" height="72" rx="16" fill="#FEF2F2" stroke="#DC2626" stroke-width="2"/>
17
+ <text x="480" y="66" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#991B1B">Bad Patch</text>
18
+ <text x="480" y="92" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#B91C1C">score stays at 0.2</text>
19
+
20
+ <rect x="400" y="150" width="160" height="72" rx="16" fill="#ECFDF5" stroke="#059669" stroke-width="2"/>
21
+ <text x="480" y="178" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#065F46">Good Patch</text>
22
+ <text x="480" y="204" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#047857">score rises to 1.0</text>
23
+
24
+ <rect x="610" y="38" width="150" height="72" rx="16" fill="#FEE2E2" stroke="#DC2626" stroke-width="2"/>
25
+ <text x="685" y="66" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#991B1B">REVERT</text>
26
+ <text x="685" y="92" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#B91C1C">reject loser</text>
27
+
28
+ <rect x="610" y="150" width="150" height="72" rx="16" fill="#DCFCE7" stroke="#16A34A" stroke-width="2"/>
29
+ <text x="685" y="178" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#166534">KEEP</text>
30
+ <text x="685" y="204" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#15803D">accept winner</text>
31
+
32
+ <rect x="800" y="110" width="140" height="88" rx="16" fill="#F0FDF4" stroke="#16A34A" stroke-width="2"/>
33
+ <text x="870" y="140" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#166534">Final State</text>
34
+ <text x="870" y="166" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#15803D">fixed target</text>
35
+ <text x="870" y="186" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#15803D">score = 1.0</text>
36
+
37
+ <path d="M190 154H220" stroke="#6B7280" stroke-width="3"/>
38
+ <path d="M365 154H385" stroke="#6B7280" stroke-width="3"/>
39
+ <path d="M560 74H600" stroke="#DC2626" stroke-width="3"/>
40
+ <path d="M560 186H600" stroke="#16A34A" stroke-width="3"/>
41
+ <path d="M760 154H790" stroke="#6B7280" stroke-width="3"/>
42
+
43
+ <path d="M480 110V138" stroke="#6B7280" stroke-width="3" stroke-dasharray="8 8"/>
44
+ <path d="M295 154C340 154 350 74 400 74" stroke="#DC2626" stroke-width="3" fill="none"/>
45
+ <path d="M295 154C340 154 350 186 400 186" stroke="#16A34A" stroke-width="3" fill="none"/>
46
+
47
+ <polygon points="220,154 210,148 210,160" fill="#6B7280"/>
48
+ <polygon points="385,154 375,148 375,160" fill="#6B7280"/>
49
+ <polygon points="600,74 590,68 590,80" fill="#DC2626"/>
50
+ <polygon points="600,186 590,180 590,192" fill="#16A34A"/>
51
+ <polygon points="790,154 780,148 780,160" fill="#6B7280"/>
52
+ </svg>
@@ -0,0 +1 @@
1
+ print("ok")
@@ -0,0 +1,3 @@
1
+ # Program
2
+
3
+ This pack is invalid because the folder name is bad and files are missing.
@@ -0,0 +1 @@
1
+ timestamp trial status old_score new_score proposal description
@@ -0,0 +1,6 @@
1
+ # Program
2
+
3
+ This program is intentionally too long.
4
+
5
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
6
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
@@ -0,0 +1 @@
1
+ timestamp trial status old_score new_score proposal description
@@ -0,0 +1,3 @@
1
+ # Program
2
+
3
+ Keep this experiment small and measurable.
@@ -0,0 +1 @@
1
+ timestamp trial status old_score new_score proposal description
@@ -0,0 +1,3 @@
1
+ """Minimal keep/revert loop template."""
2
+
3
+ print("Replace this template with a bounded keep/revert loop.")
@@ -0,0 +1,13 @@
1
+ """Return a machine-readable score for the experiment."""
2
+
3
+ import json
4
+
5
+
6
+ payload = {
7
+ "score": 0.0,
8
+ "passed": 0,
9
+ "total": 0,
10
+ "status": "fail",
11
+ }
12
+
13
+ print(json.dumps(payload))
@@ -0,0 +1,3 @@
1
+ # Program
2
+
3
+ State the outcome you want to improve in one short paragraph.
@@ -0,0 +1,3 @@
1
+ """Restore the experiment pack to baseline."""
2
+
3
+ print("Implement baseline reset here.")
@@ -0,0 +1 @@
1
+ timestamp trial status old_score new_score proposal description
@@ -0,0 +1,81 @@
1
+ """Runtime benchmark for example experiment packs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ import subprocess
8
+ import sys
9
+
10
+
11
+ ROOT = Path(__file__).resolve().parent
12
+ EXAMPLES_DIR = ROOT / "_examples"
13
+
14
+ CASES = [
15
+ {
16
+ "name": "smoke-keep-revert",
17
+ "baseline_below": 1.0,
18
+ "expected_final": 1.0,
19
+ "proposals": ["proposals/bad_patch.py", "proposals/fix_patch.py"],
20
+ },
21
+ ]
22
+
23
+
24
def run_python(script: Path, *args: str) -> subprocess.CompletedProcess[str]:
    """Run *script* with the current interpreter from its own directory.

    Extra positional ``args`` are passed through to the script. Output is
    captured as text; a non-zero exit raises ``subprocess.CalledProcessError``.
    """
    command = [sys.executable, str(script)]
    command.extend(args)
    working_dir = str(script.parent)
    return subprocess.run(
        command,
        cwd=working_dir,
        capture_output=True,
        text=True,
        check=True,
    )
32
+
33
+
34
def run_measure(exp_dir: Path) -> dict:
    """Run ``measure.py`` inside *exp_dir* and return its parsed JSON output."""
    measure_script = exp_dir / "measure.py"
    completed = run_python(measure_script)
    report_text = completed.stdout.strip()
    return json.loads(report_text)
37
+
38
+
39
def main() -> int:
    """Run every packaged example through reset -> loop and check its score.

    For each entry in ``CASES`` the pack is reset, its baseline is measured
    (it must be below ``baseline_below``), the keep/revert loop is run with
    the listed proposals, and the final score must equal ``expected_final``.

    A script that exits non-zero or a measure report that cannot be read is
    recorded as a failure for that case instead of aborting the benchmark,
    so the SCORE summary is always printed.

    Returns:
        0 when every case passes, 1 otherwise.
    """
    passed = 0
    failures = []

    for case in CASES:
        exp_dir = EXAMPLES_DIR / case["name"]

        try:
            run_python(exp_dir / "reset.py")
            baseline = run_measure(exp_dir)

            # The example must start broken, otherwise the loop proves nothing.
            if float(baseline["score"]) >= case["baseline_below"]:
                failures.append(f"{case['name']}: baseline too high ({baseline['score']})")
                continue

            proposal_args: list[str] = []
            for proposal in case["proposals"]:
                proposal_args.extend(["--proposal", str(exp_dir / proposal)])

            run_python(exp_dir / "loop.py", *proposal_args)
            final = run_measure(exp_dir)

            if float(final["score"]) != case["expected_final"]:
                failures.append(
                    f"{case['name']}: final score {final['score']} != {case['expected_final']}"
                )
                continue
        except subprocess.CalledProcessError as exc:
            # run_python uses check=True; report the failing script's last
            # stderr line (or its exit code) rather than crashing.
            detail = (exc.stderr or "").strip().splitlines()
            reason = detail[-1] if detail else f"exit code {exc.returncode}"
            failures.append(f"{case['name']}: script failed ({reason})")
            continue
        except (KeyError, TypeError, ValueError) as exc:
            # measure.py exited 0 but printed invalid JSON or no usable "score".
            failures.append(f"{case['name']}: invalid measure output ({exc})")
            continue

        passed += 1

    total = len(CASES)
    score = passed / total if total else 0.0
    print(f"SCORE {score:.4f} ({passed}/{total})")

    if failures:
        for failure in failures:
            print(f"FAIL {failure}")
        return 1

    print("PASS benchmark_runtime")
    return 0
78
+
79
+
80
+ if __name__ == "__main__":
81
+ raise SystemExit(main())