loopbench 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopbench/__init__.py +3 -0
- loopbench/cli.py +136 -0
- loopbench/conformance.py +79 -0
- loopbench/les_compute.py +189 -0
- loopbench/runner.py +142 -0
- loopbench/tasks.py +47 -0
- loopbench-0.1.0.dist-info/METADATA +182 -0
- loopbench-0.1.0.dist-info/RECORD +11 -0
- loopbench-0.1.0.dist-info/WHEEL +4 -0
- loopbench-0.1.0.dist-info/entry_points.txt +2 -0
- loopbench-0.1.0.dist-info/licenses/LICENSE +21 -0
loopbench/__init__.py
ADDED
loopbench/cli.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""LoopBench CLI — local eval, validate, rank."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from loopbench import __version__
|
|
11
|
+
from loopbench.conformance import validate_file
|
|
12
|
+
from loopbench.runner import build_submission, run_task
|
|
13
|
+
from loopbench.tasks import list_tasks
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _parse_seeds(raw: str) -> list[int]:
|
|
17
|
+
return [int(s.strip()) for s in raw.split(",") if s.strip()]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cmd_list(_args: argparse.Namespace) -> int:
    """Print a heading followed by every task ID from ``list_tasks()``.

    The parsed arguments are unused. Always returns exit status 0.
    """
    print("LoopBench tasks:")
    # One line per task, written to the same stream print() uses.
    sys.stdout.writelines(f" {task_id}\n" for task_id in list_tasks())
    return 0
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def cmd_run(args: argparse.Namespace) -> int:
    """Run one or more tasks against an LSS spec and emit the submission JSON.

    Reads ``args.spec``, ``args.seeds``, ``args.task``, ``args.backend``,
    ``args.submitter`` and ``args.output``. Progress messages go to stderr;
    the submission is written to ``args.output`` when it is set (parent
    directories are created) and printed to stdout otherwise.

    Returns:
        0 on success; 2 when the spec file does not exist, the seed list
        cannot be parsed, or no task ID remains after dropping blank fields.
    """
    spec_path = Path(args.spec)
    if not spec_path.exists():
        print(f"Error: spec not found: {spec_path}", file=sys.stderr)
        return 2

    # Report a malformed seed list as a usage error instead of a traceback.
    try:
        seeds = _parse_seeds(args.seeds) if args.seeds else None
    except ValueError:
        print(f"Error: invalid --seeds value: {args.seeds!r}", file=sys.stderr)
        return 2

    # Drop blank fields ("A,,B", trailing comma) so they are never looked up
    # as task names.
    raw_ids = args.task.split(",") if args.task else ["LB-CR-1"]
    task_ids = [tid.strip() for tid in raw_ids if tid.strip()]
    if not task_ids:
        print("Error: no task IDs given", file=sys.stderr)
        return 2

    task_results = []
    for task_id in task_ids:
        print(f"Running {task_id} via LoopGym (backend={args.backend})...", file=sys.stderr)
        task_results.append(
            run_task(
                task_id,
                spec_path,
                seeds=seeds,
                backend=args.backend,
            )
        )

    submission = build_submission(
        args.submitter,
        spec_path,
        task_results,
        backend=args.backend,
    )

    if args.output:
        out = Path(args.output)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(json.dumps(submission, indent=2), encoding="utf-8")
        print(f"Wrote {out}", file=sys.stderr)
    else:
        print(json.dumps(submission, indent=2))

    return 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def cmd_validate(args: argparse.Namespace) -> int:
    """Validate the JSON file named by ``args.file`` and report the outcome.

    With ``args.json`` set, a ``{"valid", "errors"}`` object is printed to
    stdout. Otherwise a ``VALID`` line goes to stdout, or an ``INVALID`` line
    plus one line per error goes to stderr.

    Returns:
        0 when the file is valid, 1 when it is not, 2 when it does not exist.
    """
    target = Path(args.file)
    if not target.exists():
        print(f"Error: file not found: {target}", file=sys.stderr)
        return 2

    valid, errors = validate_file(target)
    status = 0 if valid else 1

    if args.json:
        print(json.dumps({"valid": valid, "errors": errors}, indent=2))
        return status

    if valid:
        print(f"VALID: {target}")
        return status

    print(f"INVALID: {target}", file=sys.stderr)
    for err in errors:
        print(f" - {err}", file=sys.stderr)
    return status
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def cmd_rank(args: argparse.Namespace) -> int:
    """Print leaderboard entries ordered by ``composite.rank_score``, best first.

    ``args.file`` names a JSON file holding either a list of entries or an
    object with an ``entries`` list. Entries that are not JSON objects are
    skipped; missing, null or non-numeric scores are treated as 0 so a single
    malformed entry cannot abort the whole listing.

    Returns:
        0 after printing the table, 1 when the file does not contain a list
        of entries, 2 when the file does not exist.
    """

    def _number(mapping: dict, key: str):
        # Only real numbers can be sorted and formatted; anything else -> 0.
        value = mapping.get(key, 0.0)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return 0.0
        return value

    def _composite(entry: dict) -> dict:
        comp = entry.get("composite")
        return comp if isinstance(comp, dict) else {}

    path = Path(args.file)
    # Same missing-file handling as the validate command.
    if not path.exists():
        print(f"Error: file not found: {path}", file=sys.stderr)
        return 2
    with path.open(encoding="utf-8") as fh:
        data = json.load(fh)

    if isinstance(data, list):
        entries = data
    elif isinstance(data, dict):
        entries = data.get("entries", [])
    else:
        entries = None
    if not isinstance(entries, list):
        print(f"Error: no entries array in {path}", file=sys.stderr)
        return 1

    ranked = sorted(
        (entry for entry in entries if isinstance(entry, dict)),
        key=lambda e: _number(_composite(e), "rank_score"),
        reverse=True,
    )
    print(f"{'Rank':<6}{'Submitter':<24}{'LES':>8}{'Display':>10}{'Backend':>10}")
    print("-" * 58)
    for i, entry in enumerate(ranked, 1):
        comp = _composite(entry)
        submitter = entry.get("submitter")
        backend = entry.get("backend")
        label = "?" if submitter is None else str(submitter)
        backend_label = "?" if backend is None else str(backend)
        print(
            f"{i:<6}{label[:24]:<24}"
            f"{_number(comp, 'les_observed'):>8.4f}"
            f"{_number(comp, 'les_display'):>10.1f}"
            f"{backend_label:>10}"
        )
    return 0
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def main() -> int:
    """Build the ``list`` / ``run`` / ``validate`` / ``rank`` command line,
    parse ``sys.argv`` and return the selected handler's exit status.
    """
    parser = argparse.ArgumentParser(description=f"LoopBench CLI v{__version__}")
    commands = parser.add_subparsers(dest="command", required=True)

    # list
    commands.add_parser("list", help="List benchmark tasks").set_defaults(func=cmd_list)

    # run: options are declared as data and registered in order
    run_cmd = commands.add_parser("run", help="Run local evaluation via LoopGym")
    run_options = (
        (("--task",), {"default": "LB-CR-1", "help": "Task ID or comma-separated list"}),
        (("--spec",), {"required": True, "help": "Path to LSS YAML spec"}),
        (("--seeds",), {"default": "0,1,2,3,4", "help": "Comma-separated seeds"}),
        (("--submitter",), {"default": "local-dev", "help": "Submitter name"}),
        (("--backend",), {"default": "sim", "choices": ["sim", "live", "replay"]}),
        (("--output", "-o"), {"help": "Write results JSON to path"}),
    )
    for flags, options in run_options:
        run_cmd.add_argument(*flags, **options)
    run_cmd.set_defaults(func=cmd_run)

    # validate
    validate_cmd = commands.add_parser("validate", help="Validate submission JSON")
    validate_cmd.add_argument("file", help="Results JSON path")
    validate_cmd.add_argument("--json", action="store_true")
    validate_cmd.set_defaults(func=cmd_validate)

    # rank
    rank_cmd = commands.add_parser("rank", help="Rank leaderboard entries by composite LES")
    rank_cmd.add_argument("file", help="Leaderboard JSON path")
    rank_cmd.set_defaults(func=cmd_rank)

    parsed = parser.parse_args()
    handler = parsed.func
    return handler(parsed)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
loopbench/conformance.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Validate LoopBench submission JSON against schema and conformance rules."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import jsonschema
|
|
9
|
+
|
|
10
|
+
# Directory that contains the ``loopbench`` package directory (two levels up
# from this file once resolved).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Location of the submission JSON schema relative to REPO_ROOT.
# NOTE(review): the wheel's RECORD lists no ``submit/`` files, so this path
# presumably exists only in a source checkout — confirm packaging.
SCHEMA_PATH = REPO_ROOT / "submit" / "schema.json"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_schema() -> dict:
    """Read the UTF-8 JSON document at ``SCHEMA_PATH`` and return it parsed."""
    schema_text = SCHEMA_PATH.read_text(encoding="utf-8")
    return json.loads(schema_text)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def validate_submission(data: dict, schema: dict | None = None) -> list[str]:
    """Validate one submission and return human-readable error messages.

    Three checks run in order:

    1. JSON Schema validation (Draft 2020-12); each error is reported as
       ``<dotted path>: <message>`` with ``(root)`` for the top level.
    2. If both ``spec_path`` and ``spec_hash`` are present and the spec file
       exists (relative paths are resolved against ``REPO_ROOT``), the
       recorded hash must equal ``sha256:<hex digest of the file bytes>``.
    3. For every task result, ``aggregate.les_display`` must be within 0.05
       of ``round(aggregate.les_observed * 100, 1)``.

    Args:
        data: Parsed submission. Non-object payloads are reported as errors
            rather than raising.
        schema: Schema to validate against; the packaged schema is loaded
            only when this is ``None`` (an explicit empty schema is honoured).

    Returns:
        A list of error messages; empty when the submission is valid.
    """
    if schema is None:
        schema = load_schema()
    validator = jsonschema.Draft202012Validator(schema)
    errors = sorted(validator.iter_errors(data), key=lambda e: list(e.absolute_path))
    messages = [f"{'.'.join(str(p) for p in e.absolute_path) or '(root)'}: {e.message}" for e in errors]

    # The cross-field checks below need mapping access. A non-object payload
    # has normally been reported by the schema already; make sure it never
    # passes silently and never raises AttributeError.
    if not isinstance(data, dict):
        if not messages:
            messages.append("(root): expected object")
        return messages

    spec_path = data.get("spec_path")
    spec_hash = data.get("spec_hash")
    if isinstance(spec_path, str) and spec_path and spec_hash:
        path = Path(spec_path)
        if not path.is_absolute():
            path = REPO_ROOT / path
        # A spec that is not available locally cannot be checked; skip it.
        if path.exists():
            import hashlib

            digest = hashlib.sha256(path.read_bytes()).hexdigest()
            expected = f"sha256:{digest}"
            if spec_hash != expected:
                messages.append(f"spec_hash: mismatch (expected {expected})")

    results = data.get("results")
    for task_result in results if isinstance(results, list) else []:
        if not isinstance(task_result, dict):
            continue
        agg = task_result.get("aggregate")
        if not isinstance(agg, dict):
            continue
        les_obs = agg.get("les_observed")
        les_disp = agg.get("les_display")
        if les_obs is None or les_disp is None:
            continue
        try:
            observed = float(les_obs)
            displayed = float(les_disp)
        except (TypeError, ValueError):
            # Wrong types are the schema's job to report; nothing to compare.
            continue
        expected_disp = round(observed * 100, 1)
        if abs(displayed - expected_disp) > 0.05:
            messages.append(
                f"results.{task_result.get('task_id')}.aggregate: "
                f"les_display {les_disp} != round(les_observed*100,1)={expected_disp}"
            )

    return messages
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def validate_leaderboard(data: dict) -> list[str]:
    """Validate a leaderboard document and return error messages.

    The document must be an object with ``version``, ``updated`` and
    ``entries``; ``entries`` must be an array. Each entry is validated as a
    submission and its errors are prefixed with ``entries[<index>].``.

    The schema is loaded once, and only when there is at least one entry,
    instead of being re-read from disk for every entry.

    Returns:
        A list of error messages; empty when the leaderboard is valid.
    """
    if not isinstance(data, dict):
        return ["(root): expected object"]

    messages: list[str] = [
        f"(root): missing required property '{key}'"
        for key in ("version", "updated", "entries")
        if key not in data
    ]

    entries = data.get("entries")
    if not isinstance(entries, list):
        messages.append("entries: expected array")
        return messages
    if not entries:
        return messages

    schema = load_schema()
    for i, entry in enumerate(entries):
        messages.extend(f"entries[{i}].{err}" for err in validate_submission(entry, schema))
    return messages
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def validate_file(path: Path) -> tuple[bool, list[str]]:
    """Validate a results or leaderboard JSON file.

    A top-level object containing ``entries`` is validated as a leaderboard;
    anything else is validated as a single submission. A file that is not
    valid UTF-8 JSON is reported as a validation error instead of raising.

    Returns:
        ``(valid, errors)`` where ``valid`` is true exactly when ``errors``
        is empty.
    """
    try:
        with path.open(encoding="utf-8") as fh:
            data = json.load(fh)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        return False, [f"(root): invalid JSON: {exc}"]

    if isinstance(data, dict) and "entries" in data:
        errors = validate_leaderboard(data)
    else:
        errors = validate_submission(data)
    return len(errors) == 0, errors
|
loopbench/les_compute.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Observed LES computation for LoopBench runs (LES-1.0 aligned)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from statistics import mean, median
|
|
7
|
+
|
|
8
|
+
# Weight of each category in the composite LES; the eight weights sum to 1.00.
LES_WEIGHTS: dict[str, float] = {
    "effectiveness": 0.20,
    "speed": 0.15,
    "cost": 0.12,
    "robustness": 0.13,
    "scalability": 0.10,
    "safety": 0.12,
    "adaptability": 0.10,
    "autonomy": 0.08,
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
|
|
21
|
+
return max(low, min(high, value))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _normalize(x: float, b_floor: float, b_ceiling: float) -> float:
    """Map *x* linearly from [b_floor, b_ceiling] onto [0, 1], clamped.

    A degenerate range (ceiling not above floor) yields 0.0.
    """
    span = b_ceiling - b_floor
    if b_ceiling <= b_floor:
        return 0.0
    fraction = (x - b_floor) / span
    return _clamp(fraction)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _is_improving(goal_trace: list[float]) -> bool:
|
|
31
|
+
if len(goal_trace) < 3:
|
|
32
|
+
return False
|
|
33
|
+
start = max(1, (2 * len(goal_trace)) // 3)
|
|
34
|
+
window = goal_trace[start:]
|
|
35
|
+
if len(window) < 2:
|
|
36
|
+
return False
|
|
37
|
+
improvements = sum(1 for i in range(1, len(window)) if window[i] > window[i - 1])
|
|
38
|
+
return improvements / (len(window) - 1) >= 0.6
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class RunMetrics:
    """Raw measurements of a single run, consumed by the category scorers."""

    task_instance_id: str  # identifier of the task instance that was run
    seed: int  # seed used for the run
    g_final: float  # goal value at the end of the run
    g_0: float  # starting goal value; cost_score measures improvement as g_final - g_0
    g_target: float  # goal threshold; effectiveness_score returns 0.0 when this is <= 0
    iterations: int  # number of iterations executed
    t_budget: int  # iteration budget; used by effectiveness_score for improving, unfinished runs
    cost_usd: float  # cost of the run in USD
    elapsed_s: float  # elapsed time in seconds
    success: bool  # whether the run counts as successful
    termination_reason: str  # label describing why the run stopped
    safety_violations: int = 0  # violation count; safety_score returns 0.0 at 10 or more
    goal_trace: list[float] = field(default_factory=list)  # goal values in step order
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class LesResult:
    """Composite LES for a set of runs together with its category breakdown."""

    les_observed: float  # weighted sum of the category scores
    les_display: float  # les_observed scaled by 100 and rounded to one decimal
    categories: dict[str, float]  # category name -> score
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def effectiveness_score(run: RunMetrics) -> float:
    """Score how close a run came to its goal target, normalized to [0, 1].

    The raw attainment ``g_final / g_target`` is used as-is when the target
    was reached. Otherwise it is scaled by ``t_budget / iterations`` if the
    goal trace is still improving, or halved if it is not. The raw value is
    then normalized between 0.5 and 1.0. A non-positive target scores 0.0.
    """
    target = run.g_target
    if target <= 0:
        return 0.0

    attainment = run.g_final / target
    if run.g_final >= target:
        raw = attainment
    elif _is_improving(run.goal_trace):
        raw = attainment * (run.t_budget / max(run.iterations, 1))
    else:
        raw = attainment * 0.5
    return _normalize(raw, 0.5, 1.0)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def speed_score(run: RunMetrics, b_floor: float = 0.001, b_ceiling: float = 0.05) -> float:
    """Score iteration throughput (iterations per second) between two baselines.

    Runs with no iterations or no elapsed time score 0.0.
    """
    if not (run.iterations > 0 and run.elapsed_s > 0):
        return 0.0
    seconds_per_iteration = run.elapsed_s / run.iterations
    throughput = 1.0 / seconds_per_iteration
    return _normalize(throughput, b_floor, b_ceiling)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def cost_score(
    run: RunMetrics,
    b_floor: float = 0.05,
    b_ceiling: float = 2.0,
) -> float:
    """Score goal improvement per USD spent, normalized between two baselines.

    Runs that did not improve on ``g_0`` or that have no positive cost score 0.0.
    """
    improvement = run.g_final - run.g_0
    if improvement <= 0:
        return 0.0
    if run.cost_usd <= 0:
        return 0.0
    efficiency = improvement / run.cost_usd
    return _normalize(efficiency, b_floor, b_ceiling)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def safety_score(run: RunMetrics) -> float:
    """Score safety as 1.0 minus a tenth per violation, bottoming out at 0.0
    once ten or more violations have been recorded.
    """
    violations = run.safety_violations
    if violations >= 10:
        return 0.0
    penalty = min(violations / 10.0, 1.0)
    return _clamp(1.0 - penalty)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def robustness_from_seeds(runs: list[RunMetrics]) -> float:
    """Score how well the worst run holds up against the best run.

    The best ``g_final`` is treated as the clean result and the worst as the
    perturbed one; the score is one minus the relative degradation, clamped
    to [0, 1]. No runs, or a non-positive best result, score 0.0.
    """
    if not runs:
        return 0.0
    best = max(r.g_final for r in runs)
    worst = min(r.g_final for r in runs)
    if best <= 0:
        return 0.0
    degradation = 1.0 - (worst / best)
    return _clamp(1.0 - degradation)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def scalability_score(runs: list[RunMetrics]) -> float:
    """Score mean goal retention (``g_final / g_target``) across runs.

    Runs with a non-positive target contribute 0.0. The mean is normalized
    between 0.4 and 0.90. No runs score 0.0.
    """
    if not runs:
        return 0.0
    ratios = [(r.g_final / r.g_target) if r.g_target > 0 else 0.0 for r in runs]
    return _normalize(mean(ratios), 0.4, 0.90)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def adaptability_score(runs: list[RunMetrics]) -> float:
    """Score the mean per-run credit, clamped to [0, 1].

    A successful run earns full credit (1.0). An unsuccessful run earns
    ``g_final / g_target`` when the target is non-zero and 0.0 otherwise.
    No runs score 0.0.
    """
    if not runs:
        return 0.0

    credits = []
    for r in runs:
        if r.success:
            credits.append(1.0)
        elif r.g_target:
            credits.append(r.g_final / r.g_target)
        else:
            credits.append(0.0)
    return _clamp(mean(credits))
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def autonomy_score(_runs: list[RunMetrics], backend: str = "sim") -> float:
    """Return a fixed autonomy score chosen by backend: 0.95 for ``"sim"``,
    0.5 for anything else. The runs themselves are not inspected.
    """
    if backend == "sim":
        return 0.95
    return 0.5
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def compute_run_categories(
    run: RunMetrics,
    *,
    speed_baselines: tuple[float, float] | None = None,
    cost_baselines: tuple[float, float] | None = None,
) -> dict[str, float]:
    """Compute the four per-run category scores for a single run.

    Args:
        run: Measurements of the run.
        speed_baselines: ``(floor, ceiling)`` for the speed score; defaults
            to ``(0.001, 0.05)`` when omitted or falsy.
        cost_baselines: ``(floor, ceiling)`` for the cost score; defaults to
            ``(0.05, 2.0)`` when omitted or falsy.

    Returns:
        A dict with ``effectiveness``, ``speed``, ``cost`` and ``safety``.
    """
    speed_floor, speed_ceiling = speed_baselines or (0.001, 0.05)
    cost_floor, cost_ceiling = cost_baselines or (0.05, 2.0)

    scores: dict[str, float] = {}
    scores["effectiveness"] = effectiveness_score(run)
    scores["speed"] = speed_score(run, speed_floor, speed_ceiling)
    scores["cost"] = cost_score(run, cost_floor, cost_ceiling)
    scores["safety"] = safety_score(run)
    return scores
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def compute_task_les(
    runs: list[RunMetrics],
    *,
    speed_baselines: tuple[float, float] | None = None,
    cost_baselines: tuple[float, float] | None = None,
    backend: str = "sim",
) -> LesResult:
    """Aggregate a task's runs into the eight category scores and composite LES.

    Effectiveness, speed, cost and safety are averaged over the per-run
    scores; robustness, scalability, adaptability and autonomy are computed
    from the run set as a whole. The composite is the ``LES_WEIGHTS``-weighted
    sum, rounded to four decimals; the display value is that rounded
    composite times 100, rounded to one decimal. With no runs every value is
    0.0.
    """
    if not runs:
        return LesResult(0.0, 0.0, dict.fromkeys(LES_WEIGHTS, 0.0))

    per_run = []
    for run in runs:
        per_run.append(
            compute_run_categories(
                run,
                speed_baselines=speed_baselines,
                cost_baselines=cost_baselines,
            )
        )

    # Per-run categories first, then the set-level ones, so the key order of
    # the resulting dict is stable.
    categories: dict[str, float] = {
        name: mean(scores[name] for scores in per_run)
        for name in ("effectiveness", "speed", "cost", "safety")
    }
    categories["robustness"] = robustness_from_seeds(runs)
    categories["scalability"] = scalability_score(runs)
    categories["adaptability"] = adaptability_score(runs)
    categories["autonomy"] = autonomy_score(runs, backend)

    composite = sum(weight * categories[name] for name, weight in LES_WEIGHTS.items())
    les_observed = round(composite, 4)
    rounded_categories = {name: round(score, 4) for name, score in categories.items()}
    return LesResult(
        les_observed=les_observed,
        les_display=round(les_observed * 100, 1),
        categories=rounded_categories,
    )
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def compute_composite(task_les: list[float]) -> tuple[float, float, float]:
    """Combine per-task LES values into the submission-level composite.

    Args:
        task_les: One observed LES per task.

    Returns:
        ``(les_observed, les_display, rank_score)``. ``les_observed`` and
        ``rank_score`` are both the mean rounded to four decimals.
        ``les_display`` is derived from the *rounded* ``les_observed``
        (``round(les_observed * 100, 1)``), the same rule used for per-task
        results, so the two published numbers always agree. An empty input
        yields ``(0.0, 0.0, 0.0)``.
    """
    if not task_les:
        return 0.0, 0.0, 0.0
    les_observed = round(mean(task_les), 4)
    return les_observed, round(les_observed * 100, 1), les_observed
|
loopbench/runner.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Run LoopBench evaluations via LoopGym."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import time
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from uuid import uuid4
|
|
10
|
+
|
|
11
|
+
import loopgym as lg
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
from loopbench import __version__
|
|
15
|
+
from loopbench.les_compute import LES_WEIGHTS, LesResult, RunMetrics, compute_composite, compute_task_les
|
|
16
|
+
from loopbench.tasks import cost_baselines, load_task, speed_baselines
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def spec_hash(path: Path) -> str:
    """Return ``sha256:<hex digest>`` computed over the raw bytes of *path*."""
    hasher = hashlib.sha256()
    hasher.update(path.read_bytes())
    return "sha256:" + hasher.hexdigest()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def estimate_cost_usd(spec: dict, iterations: int) -> float:
    """Estimate the USD cost of a run from the spec's per-iteration rate.

    The rate is read from ``spec["cost_limits"]["per_iteration_usd"]`` and
    falls back to 0.05 whenever it cannot be found: the spec is not a mapping
    (an empty YAML file parses to ``None``), ``cost_limits`` is missing or not
    a mapping, or the rate itself is missing or null.

    Returns:
        ``rate * iterations`` rounded to four decimals.
    """
    limits = spec.get("cost_limits") if isinstance(spec, dict) else None
    per_iter = limits.get("per_iteration_usd") if isinstance(limits, dict) else None
    rate = 0.05 if per_iter is None else float(per_iter)
    return round(rate * iterations, 4)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def run_task(
    task_id: str,
    spec_path: Path,
    *,
    seeds: list[int] | None = None,
    instances: list[str] | None = None,
    backend: str = "sim",
) -> dict:
    """Run every (instance, seed) episode of a task and score the results.

    Args:
        task_id: Task to load via ``load_task``.
        spec_path: LSS YAML spec; parsed for cost estimation and also passed
            to the environment factory.
        seeds: Seeds to run; defaults to the task's ``evaluation.seeds``
            (``[0]`` if that is absent).
        instances: Instance IDs to run; defaults to the task's
            ``evaluation.primary_instances`` (empty if that is absent).
        backend: Backend name forwarded to the environment and to scoring.

    Returns:
        A dict with ``task_id``, ``env_id``, one record per run under
        ``runs``, and the task-level ``aggregate``.
    """
    task = load_task(task_id)
    env_id = task["loopgym_env"]
    g_target = float(task["goal"]["g_target"])
    t_budget = int(task["budget"]["iteration_budget"])
    eval_cfg = task.get("evaluation") or {}

    if seeds is None:
        seed_list = list(eval_cfg.get("seeds", [0]))
    else:
        seed_list = seeds
    if instances is None:
        instance_list = list(eval_cfg.get("primary_instances", []))
    else:
        instance_list = instances

    with spec_path.open(encoding="utf-8") as fh:
        spec = yaml.safe_load(fh)

    env = lg.make(env_id, spec_path=spec_path, backend=backend)

    runs: list[RunMetrics] = []
    for instance_id in instance_list:
        for seed in seed_list:
            started = time.perf_counter()
            result = env.run_episode(task_id=instance_id, seed=seed)
            elapsed = time.perf_counter() - started

            trace = [step["quality_score"] for step in result.get("trajectory", [])]
            g_final = float(result.get("quality_score", 0.0))
            iterations = int(result.get("steps", 1))
            # A run succeeds if the episode says so or the target was reached.
            goal_met = bool(result.get("success")) or g_final >= g_target

            # safety_violations is not passed here, so every run keeps the
            # RunMetrics default for it.
            runs.append(
                RunMetrics(
                    task_instance_id=instance_id,
                    seed=seed,
                    g_final=g_final,
                    g_0=trace[0] if trace else 0.0,
                    g_target=g_target,
                    iterations=iterations,
                    t_budget=t_budget,
                    cost_usd=estimate_cost_usd(spec, iterations),
                    elapsed_s=round(elapsed, 4),
                    success=goal_met,
                    termination_reason="goal_met" if goal_met else "budget_exhausted",
                    goal_trace=trace,
                )
            )

    les: LesResult = compute_task_les(
        runs,
        speed_baselines=speed_baselines(task),
        cost_baselines=cost_baselines(task),
        backend=backend,
    )

    run_count = len(runs)
    if runs:
        success_at_k = sum(1 for r in runs if r.success) / run_count
        cost_usd_mean = round(sum(r.cost_usd for r in runs) / run_count, 4)
    else:
        success_at_k = 0.0
        cost_usd_mean = 0.0

    def _run_record(r: RunMetrics) -> dict:
        # JSON-ready view of one run; goal values are rounded for output only.
        return {
            "task_instance_id": r.task_instance_id,
            "seed": r.seed,
            "success": r.success,
            "g_final": round(r.g_final, 4),
            "g_0": round(r.g_0, 4),
            "g_target": r.g_target,
            "iterations": r.iterations,
            "cost_usd": r.cost_usd,
            "elapsed_s": r.elapsed_s,
            "termination_reason": r.termination_reason,
            "safety_violations": r.safety_violations,
            "goal_trace": [round(g, 4) for g in r.goal_trace],
        }

    return {
        "task_id": task_id,
        "env_id": env_id,
        "runs": [_run_record(r) for r in runs],
        "aggregate": {
            "success_at_k": round(success_at_k, 4),
            "les_observed": les.les_observed,
            "les_display": les.les_display,
            "categories": les.categories,
            "cost_usd_mean": cost_usd_mean,
            "robustness_seeds": len(seed_list),
        },
    }
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def build_submission(
    submitter: str,
    spec_path: Path,
    task_results: list[dict],
    *,
    backend: str = "sim",
) -> dict:
    """Assemble the submission document from per-task results.

    The composite block is computed from each task's
    ``aggregate.les_observed``. The document also records a fresh UUID, the
    package version, the spec path and its SHA-256 hash, the current UTC
    timestamp and the backend.
    """
    per_task_les = []
    for task_result in task_results:
        per_task_les.append(task_result["aggregate"]["les_observed"])
    les_obs, les_disp, rank = compute_composite(per_task_les)

    composite = {
        "les_observed": les_obs,
        "les_display": les_disp,
        "rank_score": rank,
    }
    submission = {
        "submission_id": str(uuid4()),
        "submitter": submitter,
        "loopbench_version": __version__,
        "lss_version": "1.0.0",
        "les_version": "1.0.0",
        "spec_path": spec_path.as_posix(),
        "spec_hash": spec_hash(spec_path),
        "submitted_at": datetime.now(timezone.utc).isoformat(),
        "backend": backend,
        "results": task_results,
        "composite": composite,
    }
    return submission
|
loopbench/tasks.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Load LoopBench task definitions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
# Directory that contains the ``loopbench`` package directory (two levels up
# from this file once resolved).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Task definitions are looked up under ``<REPO_ROOT>/tasks``.
# NOTE(review): the wheel's RECORD lists no ``tasks/`` files, so this
# directory presumably exists only in a source checkout — confirm packaging.
TASKS_ROOT = REPO_ROOT / "tasks"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def list_tasks() -> list[str]:
    """Return the task IDs listed in ``<TASKS_ROOT>/index.yaml``.

    Returns an empty list when the index file is missing, when it is empty
    or not a mapping (``yaml.safe_load`` returns ``None`` for an empty
    document), or when its ``tasks`` key is missing or null.
    """
    index_path = TASKS_ROOT / "index.yaml"
    if not index_path.exists():
        return []
    with index_path.open(encoding="utf-8") as fh:
        index = yaml.safe_load(fh)
    if not isinstance(index, dict):
        return []
    return [t["id"] for t in index.get("tasks") or []]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_task(task_id: str) -> dict:
    """Load ``<TASKS_ROOT>/<task_id>/task.yaml`` and return it as a dict.

    Raises:
        FileNotFoundError: The task file does not exist; the message lists
            the available task IDs.
        ValueError: The file does not parse to a mapping.
    """
    task_path = TASKS_ROOT / task_id / "task.yaml"
    if task_path.exists():
        with task_path.open(encoding="utf-8") as fh:
            data = yaml.safe_load(fh)
        if isinstance(data, dict):
            return data
        raise ValueError(f"Invalid task spec: {task_path}")

    available = ", ".join(list_tasks())
    raise FileNotFoundError(f"Unknown task '{task_id}'. Available: {available}")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def speed_baselines(task: dict) -> tuple[float, float]:
    """Return the task's ``(floor, ceiling)`` speed baselines as floats.

    Values come from ``task["speed_baselines"]``; a missing or empty section,
    or a missing key, falls back to 0.001 and 0.05 respectively.
    """
    section = task.get("speed_baselines") or {}
    floor = section.get("b_floor_iter_per_s", 0.001)
    ceiling = section.get("b_ceiling_iter_per_s", 0.05)
    return float(floor), float(ceiling)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def cost_baselines(task: dict) -> tuple[float, float]:
    """Return the task's ``(floor, ceiling)`` cost-efficiency baselines as floats.

    Values come from ``task["cost_baselines"]``; a missing or empty section,
    or a missing key, falls back to 0.05 and 2.0 respectively.
    """
    section = task.get("cost_baselines") or {}
    floor = section.get("b_floor_efficiency", 0.05)
    ceiling = section.get("b_ceiling_efficiency", 2.0)
    return float(floor), float(ceiling)
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: loopbench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LoopBench — benchmark suite, metrics, submission pipeline, leaderboards
|
|
5
|
+
Project-URL: Homepage, https://github.com/KanakMalpani/LoopBench
|
|
6
|
+
Project-URL: Repository, https://github.com/KanakMalpani/LoopBench
|
|
7
|
+
Project-URL: Issues, https://github.com/KanakMalpani/LoopBench/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/KanakMalpani/LoopBench/blob/main/SUITE-OVERVIEW.md
|
|
9
|
+
Project-URL: LoopGym, https://github.com/KanakMalpani/LoopGym
|
|
10
|
+
Project-URL: Loop Core Engineering, https://github.com/KanakMalpani/Loop-Core-Engineering
|
|
11
|
+
Author: Kanak Malpani
|
|
12
|
+
License: MIT
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: benchmark,les,loop-engineering,loopbench
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Requires-Dist: jsonschema>=4.21
|
|
20
|
+
Requires-Dist: pyyaml>=6.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<strong>LoopBench</strong><br>
|
|
28
|
+
<em>MLPerf for loops.</em>
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
<a href="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml"><img src="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml/badge.svg" alt="CI"></a>
|
|
33
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="MIT"></a>
|
|
34
|
+
<img src="https://img.shields.io/badge/python-3.12+-blue.svg" alt="Python 3.12+">
|
|
35
|
+
<a href="SUITE-OVERVIEW.md"><img src="https://img.shields.io/badge/suite-ALS_v2-blue.svg" alt="ALS v2"></a>
|
|
36
|
+
<img src="https://img.shields.io/badge/tasks-3-green.svg" alt="3 tasks">
|
|
37
|
+
</p>
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
**LoopBench** is the public scoreboard for Loop Engineering — fixed tasks, fixed seeds, observed [LES](https://github.com/KanakMalpani/Loop-Core-Engineering/blob/main/specs/les-1.0.md), and a submission pipeline anyone can audit.
|
|
42
|
+
|
|
43
|
+
You bring an [LSS](https://github.com/KanakMalpani/Loop-Core-Engineering) loop spec. LoopBench runs it through [LoopGym](https://github.com/KanakMalpani/LoopGym), computes **LES_obs** across eight categories, validates your results JSON, and ranks you on the leaderboard. No hand-waved demos.
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
loopbench run --task LB-CR-1 --spec your-loop.yaml --seeds 0,1,2,3,4 -o results.json
|
|
47
|
+
loopbench validate results.json
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
<p align="center">
|
|
51
|
+
<a href="#-run-your-first-score"><strong>Run your first score →</strong></a> ·
|
|
52
|
+
<a href="leaderboard/entries.json">Leaderboard</a> ·
|
|
53
|
+
<a href="SUITE-OVERVIEW.md">Suite architecture</a>
|
|
54
|
+
</p>
|
|
55
|
+
|
|
56
|
+
<p align="center">
|
|
57
|
+
<img src="assets/demo.gif" alt="LoopBench CLI demo: install, list tasks, run, validate, rank" width="720">
|
|
58
|
+
</p>
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## The contract
|
|
63
|
+
|
|
64
|
+
```mermaid
|
|
65
|
+
flowchart LR
|
|
66
|
+
YOU[Your LSS spec]
|
|
67
|
+
LB[LoopBench<br/>tasks · scoring · schema]
|
|
68
|
+
LG[LoopGym<br/>execution]
|
|
69
|
+
OUT[results.json → leaderboard]
|
|
70
|
+
|
|
71
|
+
YOU --> LB
|
|
72
|
+
LB -->|env_id, seeds| LG
|
|
73
|
+
LG -->|trajectories| LB
|
|
74
|
+
LB --> OUT
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
| Layer | Owns | Repo |
|
|
78
|
+
|-------|------|------|
|
|
79
|
+
| **Spec** | LSS schema, LES formulas | [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering) |
|
|
80
|
+
| **Data** | Trajectories (optional holdout) | [LoopNet](https://github.com/KanakMalpani/loopnet) |
|
|
81
|
+
| **Runtime** | `env.run_episode()` | [LoopGym](https://github.com/KanakMalpani/LoopGym) |
|
|
82
|
+
| **Measurement** | Tasks, LES_obs, submissions | **LoopBench** |
|
|
83
|
+
|
|
84
|
+
LoopBench **defines** and **scores**. LoopGym **runs**. Never the other way around.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## ⚡ Run your first score
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install git+https://github.com/KanakMalpani/LoopGym.git
|
|
92
|
+
pip install git+https://github.com/KanakMalpani/LoopBench.git
|
|
93
|
+
|
|
94
|
+
loopbench list
|
|
95
|
+
|
|
96
|
+
loopbench run \
|
|
97
|
+
--task LB-CR-1 \
|
|
98
|
+
--spec submissions/examples/spec-fast-loop.yaml \
|
|
99
|
+
--seeds 0,1,2,3,4 \
|
|
100
|
+
-o results.json
|
|
101
|
+
|
|
102
|
+
loopbench validate results.json
|
|
103
|
+
loopbench rank leaderboard/entries.json
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Local dev** (sibling clones):
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
git clone https://github.com/KanakMalpani/LoopGym.git
|
|
110
|
+
git clone https://github.com/KanakMalpani/LoopBench.git
|
|
111
|
+
cd LoopBench && pip install -e ../LoopGym -e ".[dev]"
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
On Windows: `py -3.12` if needed. PyPI: [PUBLISHING.md](PUBLISHING.md).
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Tasks (v0.1 · ALS v2)
|
|
119
|
+
|
|
120
|
+
| ID | Name | Env | What it stress-tests |
|
|
121
|
+
|----|------|-----|----------------------|
|
|
122
|
+
| `LB-CR-1` | Code repair | `loopbench/code-repair-v1` | Effectiveness, speed, robustness |
|
|
123
|
+
| `LB-RS-1` | Research synthesis | `loopbench/research-synthesis-v1` | Effectiveness, cost |
|
|
124
|
+
| `LB-MA-1` | Multi-agent debate | `loopbench/multi-agent-debate-v1` | Autonomy, scalability |
|
|
125
|
+
|
|
126
|
+
Each task ships YAML + README under [`tasks/`](tasks/). Five seeds by default. Success@k + **LES_obs** composite.
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Metrics
|
|
131
|
+
|
|
132
|
+
| Metric | Meaning |
|
|
133
|
+
|--------|---------|
|
|
134
|
+
| **Success@k** | Fraction of runs (task instance × seed) that reach the goal threshold `g_target` |
|
|
135
|
+
| **LES_obs** | Observed eight-category composite ∈ `[0, 1]` — see [`metrics/les-compute.md`](metrics/les-compute.md) |
|
|
136
|
+
| **Cost** | Estimated USD per run from LSS cost limits |
|
|
137
|
+
| **Robustness** | Quality retention across seeds |
|
|
138
|
+
|
|
139
|
+
Display scale `0–100` is optional (`les_display = round(les_observed × 100, 1)`, i.e. rounded to one decimal place).
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Submit to the leaderboard
|
|
144
|
+
|
|
145
|
+
1. Run all tasks (or start with one):
|
|
146
|
+
`loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
|
|
147
|
+
2. Validate: `loopbench validate results.json`
|
|
148
|
+
3. Open a PR adding your entry to [`leaderboard/entries.json`](leaderboard/entries.json)
|
|
149
|
+
|
|
150
|
+
v0.1 rankings accept **SimEnv** submissions only (no API keys, fully reproducible). LiveEnv tier: v0.2.
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Repository layout
|
|
155
|
+
|
|
156
|
+
| Path | Purpose |
|
|
157
|
+
|------|---------|
|
|
158
|
+
| [`tasks/`](tasks/) | ALS v2 task definitions |
|
|
159
|
+
| [`metrics/les-compute.md`](metrics/les-compute.md) | LES_obs formulas |
|
|
160
|
+
| [`submit/schema.json`](submit/schema.json) | Submission JSON schema |
|
|
161
|
+
| [`loopbench/`](loopbench/) | Runner, LES compute, conformance |
|
|
162
|
+
| [`leaderboard/`](leaderboard/) | Public rankings (JSON v0.1) |
|
|
163
|
+
| [`submissions/examples/`](submissions/examples/) | Reference specs |
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Citation
|
|
168
|
+
|
|
169
|
+
```bibtex
|
|
170
|
+
@software{loopbench2026,
|
|
171
|
+
title={LoopBench: Benchmark Suite for Loop Engineering},
|
|
172
|
+
author={Malpani, Kanak},
|
|
173
|
+
year={2026},
|
|
174
|
+
url={https://github.com/KanakMalpani/LoopBench}
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
<p align="center">
|
|
181
|
+
<sub>MIT · v0.1 · <a href="CONTRIBUTING.md">Contributing</a> · <a href="SECURITY.md">Security</a> · <a href="STATUS.md">Status</a></sub>
|
|
182
|
+
</p>
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
loopbench/__init__.py,sha256=Q4iIij63xecJOuziHQHVxGwYIRS7LALAxYl4CGFZ3sQ,104
|
|
2
|
+
loopbench/cli.py,sha256=cGFJv3wPPRWyGVwGVFgKzD8DuYBMBRP6DODoO3ocQ_c,4444
|
|
3
|
+
loopbench/conformance.py,sha256=bva2ZdSA2wqlEwanvvoUJ-hbacEu_-0MBZ2eCYMpkfM,2797
|
|
4
|
+
loopbench/les_compute.py,sha256=snPCVSsSrAnAZvSRmAgraMX-6YGUsLMWl3ENofPz7io,5477
|
|
5
|
+
loopbench/runner.py,sha256=DhXtS-JX2gQ6yjbE_uj6izWyU-Ln6y1kGcEfkua60Hs,4868
|
|
6
|
+
loopbench/tasks.py,sha256=LOWuJHeVDDlRs-ffpQ9Z_o8H8KVA-7D9120Lqy0B5TU,1350
|
|
7
|
+
loopbench-0.1.0.dist-info/METADATA,sha256=oVRRYccTqDheR9a8cxL40pUV-XoDY55S0opF651gvEM,6403
|
|
8
|
+
loopbench-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
loopbench-0.1.0.dist-info/entry_points.txt,sha256=P_Cli-8VX2IotD0zFv-yNBKU1nFoOvpmrFg_GeX940g,49
|
|
10
|
+
loopbench-0.1.0.dist-info/licenses/LICENSE,sha256=evRYU4i8S6LPZ42e9jNkROkb-chKgbu-HyltWnYncvk,1069
|
|
11
|
+
loopbench-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KanakMalpani
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|