atris 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/atris/experiments/README.md +118 -0
- package/atris/experiments/_examples/smoke-keep-revert/README.md +45 -0
- package/atris/experiments/_examples/smoke-keep-revert/candidate.py +8 -0
- package/atris/experiments/_examples/smoke-keep-revert/loop.py +129 -0
- package/atris/experiments/_examples/smoke-keep-revert/measure.py +47 -0
- package/atris/experiments/_examples/smoke-keep-revert/program.md +3 -0
- package/atris/experiments/_examples/smoke-keep-revert/proposals/bad_patch.py +19 -0
- package/atris/experiments/_examples/smoke-keep-revert/proposals/fix_patch.py +22 -0
- package/atris/experiments/_examples/smoke-keep-revert/reset.py +21 -0
- package/atris/experiments/_examples/smoke-keep-revert/results.tsv +5 -0
- package/atris/experiments/_examples/smoke-keep-revert/visual.svg +52 -0
- package/atris/experiments/_fixtures/invalid/BadName/loop.py +1 -0
- package/atris/experiments/_fixtures/invalid/BadName/program.md +3 -0
- package/atris/experiments/_fixtures/invalid/BadName/results.tsv +1 -0
- package/atris/experiments/_fixtures/invalid/bloated-context/loop.py +1 -0
- package/atris/experiments/_fixtures/invalid/bloated-context/measure.py +1 -0
- package/atris/experiments/_fixtures/invalid/bloated-context/program.md +6 -0
- package/atris/experiments/_fixtures/invalid/bloated-context/results.tsv +1 -0
- package/atris/experiments/_fixtures/valid/good-experiment/loop.py +1 -0
- package/atris/experiments/_fixtures/valid/good-experiment/measure.py +1 -0
- package/atris/experiments/_fixtures/valid/good-experiment/program.md +3 -0
- package/atris/experiments/_fixtures/valid/good-experiment/results.tsv +1 -0
- package/atris/experiments/_template/pack/loop.py +3 -0
- package/atris/experiments/_template/pack/measure.py +13 -0
- package/atris/experiments/_template/pack/program.md +3 -0
- package/atris/experiments/_template/pack/reset.py +3 -0
- package/atris/experiments/_template/pack/results.tsv +1 -0
- package/atris/experiments/benchmark_runtime.py +81 -0
- package/atris/experiments/benchmark_validate.py +70 -0
- package/atris/experiments/validate.py +92 -0
- package/atris/policies/atris-design.md +66 -0
- package/atris/skills/README.md +1 -0
- package/atris/skills/apps/SKILL.md +243 -0
- package/atris/skills/autoresearch/SKILL.md +63 -0
- package/atris/skills/create-app/SKILL.md +6 -0
- package/atris/skills/design/SKILL.md +15 -1
- package/atris/skills/drive/SKILL.md +335 -20
- package/atris/skills/ramp/SKILL.md +295 -0
- package/bin/atris.js +76 -5
- package/commands/business.js +132 -0
- package/commands/clean.js +113 -70
- package/commands/console.js +397 -0
- package/commands/experiments.js +216 -0
- package/commands/init.js +4 -0
- package/commands/pull.js +311 -0
- package/commands/push.js +170 -0
- package/commands/run.js +366 -0
- package/commands/status.js +21 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -41,6 +41,16 @@ Commands: `brainstorm` (optional) → `plan` → `do` → `review`
|
|
|
41
41
|
|
|
42
42
|
Works with: Claude Code, Cursor, Windsurf, GitHub Copilot, any agent.
|
|
43
43
|
|
|
44
|
+
## Experiments
|
|
45
|
+
|
|
46
|
+
Atris also supports Karpathy-style keep/revert loops inside `atris/experiments/`.
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
atris experiments init self-heal
|
|
50
|
+
atris experiments validate
|
|
51
|
+
atris experiments benchmark
|
|
52
|
+
```
|
|
53
|
+
|
|
44
54
|
## Update
|
|
45
55
|
|
|
46
56
|
```bash
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# experiments
|
|
2
|
+
|
|
3
|
+
Karpathy-style experiment framework for Atris workspaces.
|
|
4
|
+
|
|
5
|
+
This folder defines the schema, validation rules, and benchmark harness for self-improvement loops.
|
|
6
|
+
Live experiment packs belong directly inside `atris/experiments/`.
|
|
7
|
+
|
|
8
|
+
## What This Is
|
|
9
|
+
|
|
10
|
+
An experiment is not "the agent rewrote its prompt and said it improved."
|
|
11
|
+
|
|
12
|
+
An experiment is:
|
|
13
|
+
|
|
14
|
+
1. one bounded target
|
|
15
|
+
2. one external metric
|
|
16
|
+
3. one keep/revert loop
|
|
17
|
+
4. one append-only log
|
|
18
|
+
|
|
19
|
+
If the metric goes up, keep the change.
|
|
20
|
+
If it does not, revert it.
|
|
21
|
+
|
|
22
|
+
## Schema
|
|
23
|
+
|
|
24
|
+
```text
|
|
25
|
+
atris/experiments/
|
|
26
|
+
├── README.md
|
|
27
|
+
├── validate.py
|
|
28
|
+
├── benchmark_validate.py
|
|
29
|
+
├── benchmark_runtime.py
|
|
30
|
+
├── _template/ # packaged scaffolds
|
|
31
|
+
├── _examples/ # packaged smoke examples
|
|
32
|
+
├── _fixtures/ # validator benchmark cases
|
|
33
|
+
└── <experiment-slug>/
|
|
34
|
+
├── program.md
|
|
35
|
+
├── measure.py
|
|
36
|
+
├── loop.py
|
|
37
|
+
├── results.tsv
|
|
38
|
+
├── reset.py # preferred
|
|
39
|
+
├── proposals/ # optional
|
|
40
|
+
└── <bounded-target> # candidate.py, system_prompt.txt, etc.
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Rules
|
|
44
|
+
|
|
45
|
+
1. One bounded mutation target per experiment.
|
|
46
|
+
2. `measure.py` must use an external metric the agent cannot fake.
|
|
47
|
+
3. `loop.py` must keep only improvements and revert regressions.
|
|
48
|
+
4. `program.md` stays short and task-specific.
|
|
49
|
+
5. `results.tsv` stays append-only.
|
|
50
|
+
|
|
51
|
+
## Repo Contents
|
|
52
|
+
|
|
53
|
+
- `_template/pack/` - starter files for a new experiment
|
|
54
|
+
- `validate.py` - structural and bloat checks
|
|
55
|
+
- `benchmark_validate.py` - validator benchmark on fixed good/bad fixtures
|
|
56
|
+
- `benchmark_runtime.py` - runtime benchmark on packaged example packs
|
|
57
|
+
- `_examples/` - tiny reference implementation
|
|
58
|
+
|
|
59
|
+
## Example
|
|
60
|
+
|
|
61
|
+
Start with the smallest honest pack:
|
|
62
|
+
|
|
63
|
+
```text
|
|
64
|
+
_examples/smoke-keep-revert/
|
|
65
|
+
├── candidate.py
|
|
66
|
+
├── measure.py
|
|
67
|
+
├── loop.py
|
|
68
|
+
├── reset.py
|
|
69
|
+
├── results.tsv
|
|
70
|
+
└── proposals/
|
|
71
|
+
├── bad_patch.py
|
|
72
|
+
└── fix_patch.py
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
What it does:
|
|
76
|
+
|
|
77
|
+
- `candidate.py` starts broken on purpose
|
|
78
|
+
- `measure.py` scores it on a fixed word-count test
|
|
79
|
+
- `bad_patch.py` fails to improve it (the score stays at 0.2)
|
|
80
|
+
- `fix_patch.py` actually fixes it
|
|
81
|
+
- `loop.py` keeps only the fix
|
|
82
|
+
|
|
83
|
+
Run it:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
python _examples/smoke-keep-revert/reset.py
|
|
87
|
+
python _examples/smoke-keep-revert/loop.py \
|
|
88
|
+
--proposal _examples/smoke-keep-revert/proposals/bad_patch.py \
|
|
89
|
+
--proposal _examples/smoke-keep-revert/proposals/fix_patch.py
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Visual:
|
|
93
|
+
|
|
94
|
+
```text
|
|
95
|
+
broken target
|
|
96
|
+
↓
|
|
97
|
+
score = 0.2
|
|
98
|
+
↓
|
|
99
|
+
bad patch
|
|
100
|
+
↓
|
|
101
|
+
score = 0.2 (no improvement)
|
|
102
|
+
↓
|
|
103
|
+
REVERT
|
|
104
|
+
↓
|
|
105
|
+
good patch
|
|
106
|
+
↓
|
|
107
|
+
score = 1.0
|
|
108
|
+
↓
|
|
109
|
+
KEEP
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Commands
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
python validate.py .
|
|
116
|
+
python benchmark_validate.py
|
|
117
|
+
python benchmark_runtime.py
|
|
118
|
+
```
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# smoke-keep-revert
|
|
2
|
+
|
|
3
|
+
Smallest honest example of the framework.
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+
|
|
7
|
+
## Files
|
|
8
|
+
|
|
9
|
+
```text
|
|
10
|
+
candidate.py -> bounded target
|
|
11
|
+
measure.py -> hard score
|
|
12
|
+
loop.py -> keep/revert engine
|
|
13
|
+
reset.py -> restore baseline
|
|
14
|
+
results.tsv -> trial log
|
|
15
|
+
proposals/ -> bad patch + good patch
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Flow
|
|
19
|
+
|
|
20
|
+
```text
|
|
21
|
+
candidate.py is wrong
|
|
22
|
+
↓
|
|
23
|
+
measure.py scores baseline
|
|
24
|
+
↓
|
|
25
|
+
loop.py applies bad_patch.py
|
|
26
|
+
↓
|
|
27
|
+
score does not improve
|
|
28
|
+
↓
|
|
29
|
+
loop.py reverts the change
|
|
30
|
+
↓
|
|
31
|
+
loop.py applies fix_patch.py
|
|
32
|
+
↓
|
|
33
|
+
score improves
|
|
34
|
+
↓
|
|
35
|
+
loop.py keeps the change
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Run
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
python reset.py
|
|
42
|
+
python loop.py \
|
|
43
|
+
--proposal proposals/bad_patch.py \
|
|
44
|
+
--proposal proposals/fix_patch.py
|
|
45
|
+
```
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Shared keep/revert loop for a bounded local experiment."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import csv
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import shutil
|
|
11
|
+
import subprocess
|
|
12
|
+
import sys
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
EXPERIMENT_DIR = Path(__file__).resolve().parent
|
|
17
|
+
DEFAULT_TARGET = EXPERIMENT_DIR / "candidate.py"
|
|
18
|
+
DEFAULT_MEASURE = EXPERIMENT_DIR / "measure.py"
|
|
19
|
+
DEFAULT_RESULTS = EXPERIMENT_DIR / "results.tsv"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def run_measure(measure_path: Path) -> dict:
    """Execute the measure script and return its parsed JSON report.

    The script is launched with the current interpreter, using the
    experiment directory as its working directory. Everything it prints to
    stdout (after stripping surrounding whitespace) is parsed as one JSON
    document.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
        json.JSONDecodeError: if stdout is not valid JSON.
    """
    command = [sys.executable, str(measure_path)]
    completed = subprocess.run(
        command,
        cwd=str(EXPERIMENT_DIR),
        capture_output=True,
        text=True,
        check=True,
    )
    raw_report = completed.stdout.strip()
    return json.loads(raw_report)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def append_result(results_path: Path, row: dict) -> None:
    """Append one trial record to the tab-separated results log.

    The file is opened in append mode, so earlier rows are never rewritten.
    A header row is emitted first when the file does not exist yet or is
    empty.
    """
    columns = [
        "timestamp",
        "trial",
        "status",
        "old_score",
        "new_score",
        "proposal",
        "description",
    ]

    # Decide before opening: append mode creates the file if it is missing.
    needs_header = True
    if results_path.exists():
        needs_header = results_path.stat().st_size == 0

    with results_path.open("a", newline="", encoding="utf-8") as log_file:
        tsv = csv.DictWriter(log_file, fieldnames=columns, delimiter="\t")
        if needs_header:
            tsv.writeheader()
        tsv.writerow(row)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def restore_backup(backup_path: Path, target_path: Path) -> None:
    """Put the backed-up contents back over the target, then drop the backup."""
    # Copy first so the backup still exists if the copy itself fails.
    shutil.copy2(src=backup_path, dst=target_path)
    # The backup has served its purpose; a missing file is not an error here.
    backup_path.unlink(missing_ok=True)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def main() -> int:
    """Run every ``--proposal`` script as one keep/revert trial.

    The baseline is measured once. For each proposal, in command-line order:
    the target file is backed up, the proposal script is executed (it
    receives the target path in the ``EXPERIMENT_TARGET`` environment
    variable), and the target is re-measured. The change is kept only when
    the new score is strictly greater than the current best; otherwise the
    backup is restored. One row per trial is appended to the results log.

    A trial is logged with status ``error`` and reverted when the proposal
    or the measure script exits non-zero, or when the measure output cannot
    be turned into a numeric score.

    Returns:
        0 once all trials have run and the final score has been printed.

    Raises:
        subprocess.CalledProcessError: if the baseline or final measurement
            itself fails (nothing to revert to in that case).
    """
    parser = argparse.ArgumentParser(description="Run a bounded keep/revert experiment.")
    parser.add_argument("--proposal", action="append", default=[])
    args = parser.parse_args()

    target_path = DEFAULT_TARGET.resolve()
    measure_path = DEFAULT_MEASURE.resolve()
    results_path = DEFAULT_RESULTS.resolve()

    baseline = run_measure(measure_path)
    current_score = float(baseline["score"])
    print(f"BASELINE {current_score:.4f}")

    for trial_index, proposal in enumerate(args.proposal, start=1):
        proposal_path = Path(proposal).resolve()
        backup_path = target_path.with_suffix(target_path.suffix + f".trial{trial_index}.bak")
        shutil.copy2(target_path, backup_path)

        # Defaults describe a failed trial; they are overwritten on success.
        status = "error"
        old_score = current_score
        new_score = current_score
        description = ""

        try:
            proc = subprocess.run(
                [sys.executable, str(proposal_path)],
                cwd=str(EXPERIMENT_DIR),
                capture_output=True,
                text=True,
                check=True,
                env={**os.environ, "EXPERIMENT_TARGET": str(target_path)},
            )
            if proc.stdout.strip():
                # The proposal's last stdout line doubles as its description.
                description = proc.stdout.strip().splitlines()[-1][:200]

            measured = run_measure(measure_path)
            candidate_score = float(measured["score"])
        except subprocess.CalledProcessError as exc:
            # Proposal or measure script exited non-zero: undo the mutation.
            restore_backup(backup_path, target_path)
            stderr = (exc.stderr or exc.stdout or "").strip()
            description = (stderr.splitlines()[-1] if stderr else "proposal failed")[:200]
        except (KeyError, TypeError, ValueError) as exc:
            # The measure ran but its output was not usable (invalid JSON is a
            # ValueError subclass; a missing or non-numeric "score" lands here
            # too). Without a trustworthy score the mutation must not survive.
            restore_backup(backup_path, target_path)
            detail = " ".join(str(exc).split())
            description = f"{type(exc).__name__}: {detail}"[:200]
        else:
            new_score = candidate_score
            if new_score > current_score:
                status = "kept"
                current_score = new_score
                backup_path.unlink(missing_ok=True)
            else:
                status = "reverted"
                restore_backup(backup_path, target_path)

        append_result(
            results_path,
            {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "trial": trial_index,
                "status": status,
                "old_score": f"{old_score:.4f}",
                "new_score": f"{new_score:.4f}",
                "proposal": proposal_path.name,
                "description": description,
            },
        )
        print(f"TRIAL {trial_index} {status.upper()} score={new_score:.4f} proposal={proposal_path.name}")

    final_measure = run_measure(measure_path)
    # Convert like every other score read so a non-float JSON value formats cleanly.
    print(f"FINAL {float(final_measure['score']):.4f}")
    return 0
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
if __name__ == "__main__":
|
|
129
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Objective metric for the smoke keep/revert example."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
EXPERIMENT_DIR = Path(__file__).resolve().parent
|
|
11
|
+
if str(EXPERIMENT_DIR) not in sys.path:
|
|
12
|
+
sys.path.insert(0, str(EXPERIMENT_DIR))
|
|
13
|
+
|
|
14
|
+
from candidate import count_words
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
CASES = [
|
|
18
|
+
("", 0),
|
|
19
|
+
("one", 1),
|
|
20
|
+
("two words", 2),
|
|
21
|
+
(" three spaced words ", 3),
|
|
22
|
+
("punctuation, still counts", 3),
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def main() -> int:
    """Score ``count_words`` against the fixed cases and print a JSON report.

    The report holds the pass ratio rounded to four decimals, the raw
    pass/total counts, and an overall ``pass``/``fail`` status.

    Returns:
        0 after the report has been printed.
    """
    total = len(CASES)
    passed = sum(1 for text, expected in CASES if count_words(text) == expected)

    if total:
        score = passed / total
    else:
        score = 0.0

    all_passed = passed == total
    report = {
        "score": round(score, 4),
        "passed": passed,
        "total": total,
        "status": "pass" if all_passed else "fail",
    }
    print(json.dumps(report))
    return 0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
if __name__ == "__main__":
|
|
47
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""A deliberately bad mutation that should be reverted."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
TARGET = Path(os.environ["EXPERIMENT_TARGET"])
|
|
8
|
+
|
|
9
|
+
TARGET.write_text(
|
|
10
|
+
'''"""Bounded mutation target for the smoke experiment."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def count_words(text: str) -> int:
|
|
14
|
+
return 0
|
|
15
|
+
''',
|
|
16
|
+
encoding="utf-8",
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
print("applied bad proposal")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""A good mutation that should be kept."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
TARGET = Path(os.environ["EXPERIMENT_TARGET"])
|
|
8
|
+
|
|
9
|
+
TARGET.write_text(
|
|
10
|
+
'''"""Bounded mutation target for the smoke experiment."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def count_words(text: str) -> int:
|
|
14
|
+
cleaned = text.strip()
|
|
15
|
+
if not cleaned:
|
|
16
|
+
return 0
|
|
17
|
+
return len(cleaned.split())
|
|
18
|
+
''',
|
|
19
|
+
encoding="utf-8",
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
print("applied good proposal")
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Restore the smoke example to its baseline."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
TARGET = Path(__file__).resolve().parent / "candidate.py"
|
|
7
|
+
|
|
8
|
+
TARGET.write_text(
|
|
9
|
+
'''"""Bounded mutation target for the smoke experiment."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def count_words(text: str) -> int:
|
|
13
|
+
cleaned = text.strip()
|
|
14
|
+
if not cleaned:
|
|
15
|
+
return 0
|
|
16
|
+
return len(cleaned)
|
|
17
|
+
''',
|
|
18
|
+
encoding="utf-8",
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
print("reset smoke-keep-revert to baseline")
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
timestamp trial status old_score new_score proposal description
|
|
2
|
+
2026-03-11T11:05:17.887045+00:00 1 reverted 0.2000 0.2000 bad_patch.py applied bad proposal
|
|
3
|
+
2026-03-11T11:05:17.920737+00:00 2 kept 0.2000 1.0000 fix_patch.py applied good proposal
|
|
4
|
+
2026-03-11T11:05:40.063680+00:00 1 reverted 0.2000 0.2000 bad_patch.py applied bad proposal
|
|
5
|
+
2026-03-11T11:05:40.097842+00:00 2 kept 0.2000 1.0000 fix_patch.py applied good proposal
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
<svg width="980" height="260" viewBox="0 0 980 260" fill="none" xmlns="http://www.w3.org/2000/svg">
|
|
2
|
+
<rect width="980" height="260" fill="#F7F5EF"/>
|
|
3
|
+
<text x="40" y="42" font-family="Helvetica, Arial, sans-serif" font-size="28" font-weight="700" fill="#111111">Smoke Keep/Revert</text>
|
|
4
|
+
<text x="40" y="68" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#4B5563">One bounded target. One hard metric. Reject the loser. Keep the winner.</text>
|
|
5
|
+
|
|
6
|
+
<rect x="40" y="110" width="150" height="88" rx="16" fill="#FFF7ED" stroke="#C2410C" stroke-width="2"/>
|
|
7
|
+
<text x="115" y="140" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#9A3412">Broken Target</text>
|
|
8
|
+
<text x="115" y="166" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#7C2D12">candidate.py</text>
|
|
9
|
+
<text x="115" y="186" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#7C2D12">buggy on purpose</text>
|
|
10
|
+
|
|
11
|
+
<rect x="220" y="110" width="150" height="88" rx="16" fill="#EFF6FF" stroke="#1D4ED8" stroke-width="2"/>
|
|
12
|
+
<text x="295" y="140" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#1E3A8A">Measure</text>
|
|
13
|
+
<text x="295" y="166" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#1D4ED8">score = 0.2</text>
|
|
14
|
+
<text x="295" y="186" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#1D4ED8">baseline truth</text>
|
|
15
|
+
|
|
16
|
+
<rect x="400" y="38" width="160" height="72" rx="16" fill="#FEF2F2" stroke="#DC2626" stroke-width="2"/>
|
|
17
|
+
<text x="480" y="66" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#991B1B">Bad Patch</text>
|
|
18
|
+
<text x="480" y="92" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#B91C1C">score stays at 0.2</text>
|
|
19
|
+
|
|
20
|
+
<rect x="400" y="150" width="160" height="72" rx="16" fill="#ECFDF5" stroke="#059669" stroke-width="2"/>
|
|
21
|
+
<text x="480" y="178" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#065F46">Good Patch</text>
|
|
22
|
+
<text x="480" y="204" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#047857">score rises to 1.0</text>
|
|
23
|
+
|
|
24
|
+
<rect x="610" y="38" width="150" height="72" rx="16" fill="#FEE2E2" stroke="#DC2626" stroke-width="2"/>
|
|
25
|
+
<text x="685" y="66" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#991B1B">REVERT</text>
|
|
26
|
+
<text x="685" y="92" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#B91C1C">reject loser</text>
|
|
27
|
+
|
|
28
|
+
<rect x="610" y="150" width="150" height="72" rx="16" fill="#DCFCE7" stroke="#16A34A" stroke-width="2"/>
|
|
29
|
+
<text x="685" y="178" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#166534">KEEP</text>
|
|
30
|
+
<text x="685" y="204" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#15803D">accept winner</text>
|
|
31
|
+
|
|
32
|
+
<rect x="800" y="110" width="140" height="88" rx="16" fill="#F0FDF4" stroke="#16A34A" stroke-width="2"/>
|
|
33
|
+
<text x="870" y="140" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="18" font-weight="700" fill="#166534">Final State</text>
|
|
34
|
+
<text x="870" y="166" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#15803D">fixed target</text>
|
|
35
|
+
<text x="870" y="186" text-anchor="middle" font-family="Helvetica, Arial, sans-serif" font-size="14" fill="#15803D">score = 1.0</text>
|
|
36
|
+
|
|
37
|
+
<path d="M190 154H220" stroke="#6B7280" stroke-width="3"/>
|
|
38
|
+
<path d="M365 154H385" stroke="#6B7280" stroke-width="3"/>
|
|
39
|
+
<path d="M560 74H600" stroke="#DC2626" stroke-width="3"/>
|
|
40
|
+
<path d="M560 186H600" stroke="#16A34A" stroke-width="3"/>
|
|
41
|
+
<path d="M760 154H790" stroke="#6B7280" stroke-width="3"/>
|
|
42
|
+
|
|
43
|
+
<path d="M480 110V138" stroke="#6B7280" stroke-width="3" stroke-dasharray="8 8"/>
|
|
44
|
+
<path d="M295 154C340 154 350 74 400 74" stroke="#DC2626" stroke-width="3" fill="none"/>
|
|
45
|
+
<path d="M295 154C340 154 350 186 400 186" stroke="#16A34A" stroke-width="3" fill="none"/>
|
|
46
|
+
|
|
47
|
+
<polygon points="220,154 210,148 210,160" fill="#6B7280"/>
|
|
48
|
+
<polygon points="385,154 375,148 375,160" fill="#6B7280"/>
|
|
49
|
+
<polygon points="600,74 590,68 590,80" fill="#DC2626"/>
|
|
50
|
+
<polygon points="600,186 590,180 590,192" fill="#16A34A"/>
|
|
51
|
+
<polygon points="790,154 780,148 780,160" fill="#6B7280"/>
|
|
52
|
+
</svg>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
print("ok")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
timestamp trial status old_score new_score proposal description
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
print("ok")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
print("ok")
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
# Program
|
|
2
|
+
|
|
3
|
+
This program is intentionally too long.
|
|
4
|
+
|
|
5
|
+
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
|
6
|
+
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
timestamp trial status old_score new_score proposal description
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
print("ok")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
print("ok")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
timestamp trial status old_score new_score proposal description
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
timestamp trial status old_score new_score proposal description
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Runtime benchmark for example experiment packs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
ROOT = Path(__file__).resolve().parent
|
|
12
|
+
EXAMPLES_DIR = ROOT / "_examples"
|
|
13
|
+
|
|
14
|
+
CASES = [
|
|
15
|
+
{
|
|
16
|
+
"name": "smoke-keep-revert",
|
|
17
|
+
"baseline_below": 1.0,
|
|
18
|
+
"expected_final": 1.0,
|
|
19
|
+
"proposals": ["proposals/bad_patch.py", "proposals/fix_patch.py"],
|
|
20
|
+
},
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def run_python(script: Path, *args: str) -> subprocess.CompletedProcess[str]:
    """Run ``script`` with the current interpreter from the script's own folder.

    Extra positional ``args`` are forwarded to the script unchanged. Output
    is captured as text on the returned ``CompletedProcess``.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    command = [sys.executable, str(script)]
    command.extend(args)
    working_dir = str(script.parent)
    return subprocess.run(
        command,
        cwd=working_dir,
        capture_output=True,
        text=True,
        check=True,
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def run_measure(exp_dir: Path) -> dict:
    """Run the pack's ``measure.py`` and return its stdout parsed as JSON."""
    measure_script = exp_dir / "measure.py"
    raw_report = run_python(measure_script).stdout
    return json.loads(raw_report.strip())
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def main() -> int:
    """Run every packaged example through reset -> measure -> loop -> measure.

    A case passes when its baseline score is below ``baseline_below`` and,
    after ``loop.py`` has been run with the case's proposals, its final
    score equals ``expected_final``. A script that exits non-zero, or a
    measure report that cannot be read, is recorded as a failure of that
    case instead of aborting the whole benchmark, so the SCORE/FAIL summary
    is always printed.

    Returns:
        0 when every case passes, 1 otherwise.
    """
    passed = 0
    failures = []

    for case in CASES:
        name = case["name"]
        exp_dir = EXAMPLES_DIR / name

        try:
            run_python(exp_dir / "reset.py")
            baseline = run_measure(exp_dir)

            if float(baseline["score"]) >= case["baseline_below"]:
                failures.append(f"{name}: baseline too high ({baseline['score']})")
                continue

            proposal_args: list[str] = []
            for proposal in case["proposals"]:
                proposal_args.extend(["--proposal", str(exp_dir / proposal)])

            run_python(exp_dir / "loop.py", *proposal_args)
            final = run_measure(exp_dir)
            final_score = float(final["score"])
        except subprocess.CalledProcessError as exc:
            # run_python uses check=True; surface the failing script and the
            # last line it printed rather than crashing with a traceback.
            output = (exc.stderr or exc.stdout or "").strip()
            detail = output.splitlines()[-1] if output else f"exit code {exc.returncode}"
            failures.append(f"{name}: {Path(exc.cmd[1]).name} failed: {detail}")
            continue
        except (KeyError, TypeError, ValueError) as exc:
            # Invalid JSON (a ValueError subclass) or a missing/non-numeric score.
            failures.append(f"{name}: unreadable measure output ({type(exc).__name__}: {exc})")
            continue

        if final_score != case["expected_final"]:
            failures.append(
                f"{name}: final score {final['score']} != {case['expected_final']}"
            )
            continue

        passed += 1

    total = len(CASES)
    score = passed / total if total else 0.0
    print(f"SCORE {score:.4f} ({passed}/{total})")

    if failures:
        for failure in failures:
            print(f"FAIL {failure}")
        return 1

    print("PASS benchmark_runtime")
    return 0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
if __name__ == "__main__":
|
|
81
|
+
raise SystemExit(main())
|