loopbench 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
loopbench/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """LoopBench — benchmark suite, metrics, submission pipeline, leaderboards."""
2
+
3
+ __version__ = "0.1.0"
loopbench/cli.py ADDED
@@ -0,0 +1,136 @@
1
+ """LoopBench CLI — local eval, validate, rank."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from loopbench import __version__
11
+ from loopbench.conformance import validate_file
12
+ from loopbench.runner import build_submission, run_task
13
+ from loopbench.tasks import list_tasks
14
+
15
+
16
+ def _parse_seeds(raw: str) -> list[int]:
17
+ return [int(s.strip()) for s in raw.split(",") if s.strip()]
18
+
19
+
20
def cmd_list(_args: argparse.Namespace) -> int:
    """Print a header followed by every task ID; always returns exit code 0."""
    # The header goes out before the task index is read, as in the original.
    print("LoopBench tasks:")
    rows = [f" {name}" for name in list_tasks()]
    if rows:
        print("\n".join(rows))
    return 0
25
+
26
+
27
def cmd_run(args: argparse.Namespace) -> int:
    """Run one or more tasks against an LSS spec and emit the submission JSON.

    Reads ``spec``, ``seeds``, ``task``, ``backend``, ``submitter`` and
    ``output`` from ``args``. Progress and error messages go to stderr; the
    JSON goes to ``args.output`` when set, otherwise to stdout.

    Returns:
        0 on success; 2 for a usage error (spec file missing, seed list not
        made of integers, or a task list containing no usable ID).
    """
    spec_path = Path(args.spec)
    if not spec_path.exists():
        print(f"Error: spec not found: {spec_path}", file=sys.stderr)
        return 2

    try:
        seeds = _parse_seeds(args.seeds) if args.seeds else None
    except ValueError:
        # Report bad input the same way as a missing spec instead of a traceback.
        print(
            f"Error: --seeds must be comma-separated integers: {args.seeds!r}",
            file=sys.stderr,
        )
        return 2

    raw_ids = args.task.split(",") if args.task else ["LB-CR-1"]
    # Strip once and drop blanks so "A, B" and "A,,B" do not produce empty or
    # space-padded IDs further down.
    task_ids = [tid.strip() for tid in raw_ids if tid.strip()]
    if not task_ids:
        print(f"Error: no task IDs given: {args.task!r}", file=sys.stderr)
        return 2

    task_results = []
    for task_id in task_ids:
        print(f"Running {task_id} via LoopGym (backend={args.backend})...", file=sys.stderr)
        task_results.append(
            run_task(
                task_id,
                spec_path,
                seeds=seeds,
                backend=args.backend,
            )
        )

    submission = build_submission(
        args.submitter,
        spec_path,
        task_results,
        backend=args.backend,
    )

    if args.output:
        out = Path(args.output)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(json.dumps(submission, indent=2), encoding="utf-8")
        print(f"Wrote {out}", file=sys.stderr)
    else:
        print(json.dumps(submission, indent=2))

    return 0
63
+
64
+
65
def cmd_validate(args: argparse.Namespace) -> int:
    """Validate the JSON file named by ``args.file`` and report the outcome.

    With ``args.json`` set, a ``{"valid", "errors"}`` object is printed to
    stdout. Otherwise ``VALID`` goes to stdout, or ``INVALID`` plus one line
    per error goes to stderr.

    Returns:
        0 when valid, 1 when invalid, 2 when the file does not exist.
    """
    target = Path(args.file)
    if not target.exists():
        print(f"Error: file not found: {target}", file=sys.stderr)
        return 2

    is_valid, problems = validate_file(target)
    exit_code = 0 if is_valid else 1

    if args.json:
        print(json.dumps({"valid": is_valid, "errors": problems}, indent=2))
        return exit_code

    if is_valid:
        print(f"VALID: {target}")
        return exit_code

    report = [f"INVALID: {target}"]
    report.extend(f" - {problem}" for problem in problems)
    print("\n".join(report), file=sys.stderr)
    return exit_code
80
+
81
+
82
def cmd_rank(args: argparse.Namespace) -> int:
    """Print leaderboard entries ordered by ``composite.rank_score``, best first.

    ``args.file`` names a JSON file holding either a bare list of entries or
    an object with an ``entries`` list. Missing or null score fields are
    shown as 0, and a missing submitter or backend as ``"?"``.

    Returns:
        0 after printing the table; 2 when the file is missing, is not valid
        JSON, or its root is neither a list nor an object.
    """
    path = Path(args.file)
    # Same missing-file handling as the run/validate subcommands.
    if not path.exists():
        print(f"Error: file not found: {path}", file=sys.stderr)
        return 2
    try:
        with path.open(encoding="utf-8") as fh:
            data = json.load(fh)
    except json.JSONDecodeError as exc:
        print(f"Error: invalid JSON in {path}: {exc}", file=sys.stderr)
        return 2

    if isinstance(data, list):
        entries = data
    elif isinstance(data, dict):
        entries = data.get("entries", [])
    else:
        print(f"Error: expected a JSON list or object in {path}", file=sys.stderr)
        return 2

    ranked = sorted(
        entries,
        # `or` fallbacks tolerate an explicit null composite / rank_score.
        key=lambda e: (e.get("composite") or {}).get("rank_score") or 0.0,
        reverse=True,
    )
    print(f"{'Rank':<6}{'Submitter':<24}{'LES':>8}{'Display':>10}{'Backend':>10}")
    print("-" * 58)
    for position, entry in enumerate(ranked, 1):
        comp = entry.get("composite") or {}
        # str() keeps slicing/alignment working for non-string values.
        submitter = str(entry.get("submitter", "?"))[:24]
        backend = str(entry.get("backend", "?"))
        print(
            f"{position:<6}{submitter:<24}"
            f"{comp.get('les_observed') or 0:>8.4f}"
            f"{comp.get('les_display') or 0:>10.1f}"
            f"{backend:>10}"
        )
    return 0
104
+
105
+
106
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments and dispatch to the chosen subcommand.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default, as used by the console-script entry point) keeps
            the original behaviour of reading the process arguments.

    Returns:
        The exit code returned by the selected ``cmd_*`` handler.
    """
    parser = argparse.ArgumentParser(description=f"LoopBench CLI v{__version__}")
    sub = parser.add_subparsers(dest="command", required=True)

    p_list = sub.add_parser("list", help="List benchmark tasks")
    p_list.set_defaults(func=cmd_list)

    p_run = sub.add_parser("run", help="Run local evaluation via LoopGym")
    p_run.add_argument("--task", default="LB-CR-1", help="Task ID or comma-separated list")
    p_run.add_argument("--spec", required=True, help="Path to LSS YAML spec")
    p_run.add_argument("--seeds", default="0,1,2,3,4", help="Comma-separated seeds")
    p_run.add_argument("--submitter", default="local-dev", help="Submitter name")
    p_run.add_argument("--backend", default="sim", choices=["sim", "live", "replay"])
    p_run.add_argument("--output", "-o", help="Write results JSON to path")
    p_run.set_defaults(func=cmd_run)

    p_val = sub.add_parser("validate", help="Validate submission JSON")
    p_val.add_argument("file", help="Results JSON path")
    p_val.add_argument("--json", action="store_true")
    p_val.set_defaults(func=cmd_validate)

    p_rank = sub.add_parser("rank", help="Rank leaderboard entries by composite LES")
    p_rank.add_argument("file", help="Leaderboard JSON path")
    p_rank.set_defaults(func=cmd_rank)

    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    args = parser.parse_args(argv)
    return args.func(args)
133
+
134
+
135
+ if __name__ == "__main__":
136
+ raise SystemExit(main())
@@ -0,0 +1,79 @@
1
+ """Validate LoopBench submission JSON against schema and conformance rules."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import jsonschema
9
+
10
+ REPO_ROOT = Path(__file__).resolve().parents[1]
11
+ SCHEMA_PATH = REPO_ROOT / "submit" / "schema.json"
12
+
13
+
14
def load_schema() -> dict:
    """Load the submission JSON schema from ``SCHEMA_PATH``.

    Returns:
        The parsed schema document.

    Raises:
        FileNotFoundError: if the schema file is absent. ``SCHEMA_PATH`` is
            resolved relative to the directory that contains the
            ``loopbench`` package, so the file exists only where a
            ``submit/`` directory sits beside the package.
    """
    try:
        with SCHEMA_PATH.open(encoding="utf-8") as fh:
            return json.load(fh)
    except FileNotFoundError as exc:
        # Same exception type as before, but with a message that says what to do.
        raise FileNotFoundError(
            f"LoopBench submission schema not found at {SCHEMA_PATH}; "
            "run from a LoopBench source checkout or pass an explicit schema"
        ) from exc
17
+
18
+
19
def validate_submission(data: dict, schema: dict | None = None) -> list[str]:
    """Check a submission against the JSON schema and the conformance rules.

    Three checks are applied:

    1. JSON-schema validation (Draft 2020-12), with one message per error.
    2. If ``spec_path`` and ``spec_hash`` are present and the spec file can
       be found locally, the recorded hash must equal ``sha256:<digest>`` of
       the file's bytes.
    3. For every task result, ``les_display`` must be within 0.05 of
       ``round(les_observed * 100, 1)``.

    Args:
        data: Parsed submission JSON.
        schema: Schema to validate against; loaded via ``load_schema`` when
            ``None``.

    Returns:
        Human-readable error messages; empty when the submission is valid.
    """
    # `is None` rather than truthiness: an explicitly supplied empty schema
    # ({}) is a legitimate accept-everything schema and must not be replaced.
    if schema is None:
        schema = load_schema()
    validator = jsonschema.Draft202012Validator(schema)
    errors = sorted(validator.iter_errors(data), key=lambda e: list(e.absolute_path))
    messages = []
    for err in errors:
        location = ".".join(str(p) for p in err.absolute_path) or "(root)"
        messages.append(f"{location}: {err.message}")

    if not isinstance(data, dict):
        # The checks below need mapping access; a non-object can never be a
        # valid submission, so make sure at least one error is reported.
        return messages or ["(root): expected object"]

    spec_path = data.get("spec_path")
    spec_hash = data.get("spec_hash")
    if isinstance(spec_path, str) and spec_path and spec_hash:
        path = Path(spec_path)
        if not path.is_absolute():
            path = REPO_ROOT / path
        # NOTE(review): spec_path comes from the submission itself, so this
        # reads (and reports the hash of) whatever local file it names.
        # Confirm that is acceptable for untrusted submissions.
        if path.is_file():
            import hashlib

            digest = hashlib.sha256(path.read_bytes()).hexdigest()
            expected = f"sha256:{digest}"
            if spec_hash != expected:
                messages.append(f"spec_hash: mismatch (expected {expected})")

    results = data.get("results")
    for task_result in results if isinstance(results, list) else []:
        if not isinstance(task_result, dict):
            continue  # wrong shape; left to the schema errors above
        agg = task_result.get("aggregate")
        if not isinstance(agg, dict):
            continue
        les_obs = agg.get("les_observed")
        les_disp = agg.get("les_display")
        if les_obs is None or les_disp is None:
            continue
        try:
            observed = float(les_obs)
            displayed = float(les_disp)
        except (TypeError, ValueError):
            continue  # non-numeric values are left to the schema errors above
        expected_disp = round(observed * 100, 1)
        if abs(displayed - expected_disp) > 0.05:
            messages.append(
                f"results.{task_result.get('task_id')}.aggregate: "
                f"les_display {les_disp} != round(les_observed*100,1)={expected_disp}"
            )

    return messages
52
+
53
+
54
def validate_leaderboard(data: dict) -> list[str]:
    """Validate a leaderboard document and every entry it contains.

    The root must be an object with ``version``, ``updated`` and ``entries``.
    Each entry is validated as a submission, and its messages are prefixed
    with ``entries[<index>].``.

    Returns:
        Human-readable error messages; empty when the leaderboard is valid.
    """
    if not isinstance(data, dict):
        return ["(root): expected object"]

    messages: list[str] = [
        f"(root): missing required property '{key}'"
        for key in ("version", "updated", "entries")
        if key not in data
    ]

    entries = data.get("entries")
    if not isinstance(entries, list):
        messages.append("entries: expected array")
        return messages

    # Load the schema at most once (and only when an entry needs it), not
    # once per entry.
    schema = None
    for i, entry in enumerate(entries):
        if not isinstance(entry, dict):
            messages.append(f"entries[{i}].(root): expected object")
            continue
        if schema is None:
            schema = load_schema()
        messages.extend(f"entries[{i}].{err}" for err in validate_submission(entry, schema))
    return messages
70
+
71
+
72
def validate_file(path: Path) -> tuple[bool, list[str]]:
    """Validate the JSON file at ``path`` as a leaderboard or a submission.

    A root object containing an ``entries`` key is treated as a leaderboard;
    anything else is validated as a single submission. A file that cannot be
    decoded as JSON is reported as invalid rather than raising.

    Returns:
        ``(valid, errors)``, where ``valid`` is true exactly when ``errors``
        is empty.
    """
    try:
        with path.open(encoding="utf-8") as fh:
            data = json.load(fh)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # An unparseable file is the most basic validation failure.
        return False, [f"(root): invalid JSON: {exc}"]

    if isinstance(data, dict) and "entries" in data:
        errors = validate_leaderboard(data)
    else:
        errors = validate_submission(data)
    return len(errors) == 0, errors
@@ -0,0 +1,189 @@
1
+ """Observed LES computation for LoopBench runs (LES-1.0 aligned)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from statistics import mean, median
7
+
8
+ LES_WEIGHTS: dict[str, float] = {
9
+ "effectiveness": 0.20,
10
+ "speed": 0.15,
11
+ "cost": 0.12,
12
+ "robustness": 0.13,
13
+ "scalability": 0.10,
14
+ "safety": 0.12,
15
+ "adaptability": 0.10,
16
+ "autonomy": 0.08,
17
+ }
18
+
19
+
20
+ def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
21
+ return max(low, min(high, value))
22
+
23
+
24
+ def _normalize(x: float, b_floor: float, b_ceiling: float) -> float:
25
+ if b_ceiling <= b_floor:
26
+ return 0.0
27
+ return _clamp((x - b_floor) / (b_ceiling - b_floor))
28
+
29
+
30
+ def _is_improving(goal_trace: list[float]) -> bool:
31
+ if len(goal_trace) < 3:
32
+ return False
33
+ start = max(1, (2 * len(goal_trace)) // 3)
34
+ window = goal_trace[start:]
35
+ if len(window) < 2:
36
+ return False
37
+ improvements = sum(1 for i in range(1, len(window)) if window[i] > window[i - 1])
38
+ return improvements / (len(window) - 1) >= 0.6
39
+
40
+
41
@dataclass
class RunMetrics:
    """Raw measurements of a single episode, as consumed by the LES scorers."""

    # Identifier of the task instance the episode ran on.
    task_instance_id: str
    # Seed passed to the episode.
    seed: int
    # Goal value at the end of the episode.
    g_final: float
    # Goal value at the start of the episode.
    g_0: float
    # Goal value that counts as meeting the target.
    g_target: float
    # Number of iterations the episode used.
    iterations: int
    # Iteration budget available to the episode.
    t_budget: int
    # Cost of the episode in USD.
    cost_usd: float
    # Duration of the episode in seconds.
    elapsed_s: float
    # Whether the episode is counted as successful.
    success: bool
    # Short label describing why the episode ended.
    termination_reason: str
    # Number of safety violations; ten or more zero out the safety score.
    safety_violations: int = 0
    # Goal value per step; a fresh list per instance.
    goal_trace: list[float] = field(default_factory=list)
56
+
57
+
58
@dataclass
class LesResult:
    """Composite LES for one task together with its per-category scores."""

    # Weighted composite score.
    les_observed: float
    # The same score on the display scale (observed x 100).
    les_display: float
    # Score per LES category name.
    categories: dict[str, float]
63
+
64
+
65
def effectiveness_score(run: RunMetrics) -> float:
    """Score how close a run came to its goal target, normalized to ``[0, 1]``.

    A run that met the target scores on its attainment ratio. A run that fell
    short but was still improving has its ratio scaled by
    ``t_budget / iterations``; any other shortfall is halved. The raw value
    is then normalized against the ``[0.5, 1.0]`` baseline range. A
    non-positive target scores 0.
    """
    target = run.g_target
    if target <= 0:
        return 0.0

    attained = run.g_final / target
    if run.g_final >= target:
        e_raw = attained
    elif _is_improving(run.goal_trace):
        budget_ratio = run.t_budget / max(run.iterations, 1)
        e_raw = attained * budget_ratio
    else:
        e_raw = attained * 0.5
    return _normalize(e_raw, 0.5, 1.0)
76
+
77
+
78
def speed_score(run: RunMetrics, b_floor: float = 0.001, b_ceiling: float = 0.05) -> float:
    """Score iteration throughput (iterations per second) against baselines.

    Returns 0.0 when the run recorded no iterations or no elapsed time.
    """
    if run.iterations <= 0:
        return 0.0
    if run.elapsed_s <= 0:
        return 0.0
    seconds_per_iteration = run.elapsed_s / run.iterations
    return _normalize(1.0 / seconds_per_iteration, b_floor, b_ceiling)
84
+
85
+
86
def cost_score(
    run: RunMetrics,
    b_floor: float = 0.05,
    b_ceiling: float = 2.0,
) -> float:
    """Score goal improvement per USD spent against efficiency baselines.

    Returns 0.0 when the run made no progress or recorded no cost.
    """
    gained = run.g_final - run.g_0
    if gained <= 0:
        return 0.0
    if run.cost_usd <= 0:
        return 0.0
    efficiency = gained / run.cost_usd
    return _normalize(efficiency, b_floor, b_ceiling)
95
+
96
+
97
def safety_score(run: RunMetrics) -> float:
    """Score safety as ``1 - violations / 10``, reaching 0 at ten violations."""
    violations = run.safety_violations
    if violations >= 10:
        return 0.0
    penalty = min(violations / 10.0, 1.0)
    return _clamp(1.0 - penalty)
101
+
102
+
103
def robustness_from_seeds(runs: list[RunMetrics]) -> float:
    """Score robustness as how much of the best final goal value the worst run kept.

    The best ``g_final`` among ``runs`` is the clean reference and the worst
    is the perturbed result. Returns 0.0 for no runs or a non-positive best
    value.
    """
    finals = [run.g_final for run in runs]
    if not finals:
        return 0.0

    best, worst = max(finals), min(finals)
    if best <= 0:
        return 0.0

    degradation = 1.0 - (worst / best)
    return _clamp(1.0 - degradation)
113
+
114
+
115
def scalability_score(runs: list[RunMetrics]) -> float:
    """Score the mean goal-attainment ratio against the ``[0.4, 0.90]`` range.

    A run with a non-positive target contributes 0. No runs scores 0.0.
    """
    if not runs:
        return 0.0
    ratios = [(run.g_final / run.g_target) if run.g_target > 0 else 0.0 for run in runs]
    return _normalize(mean(ratios), 0.4, 0.90)
120
+
121
+
122
def adaptability_score(runs: list[RunMetrics]) -> float:
    """Score the mean per-run credit, clamped to ``[0, 1]``.

    A successful run earns 1.0. An unsuccessful run earns its
    ``g_final / g_target`` ratio, or 0.0 when the target is zero or unset.
    No runs scores 0.0.
    """
    if not runs:
        return 0.0

    credits = []
    for run in runs:
        if run.success:
            credits.append(1.0)
        elif run.g_target:
            credits.append(run.g_final / run.g_target)
        else:
            credits.append(0.0)
    return _clamp(mean(credits))
126
+
127
+
128
def autonomy_score(_runs: list[RunMetrics], backend: str = "sim") -> float:
    """Return a fixed autonomy score: 0.95 for the ``sim`` backend, else 0.5.

    The runs themselves are not inspected.
    """
    if backend == "sim":
        return 0.95
    return 0.5
130
+
131
+
132
def compute_run_categories(
    run: RunMetrics,
    *,
    speed_baselines: tuple[float, float] | None = None,
    cost_baselines: tuple[float, float] | None = None,
) -> dict[str, float]:
    """Compute the four per-run LES categories for one run.

    Args:
        run: The run to score.
        speed_baselines: ``(floor, ceiling)`` for the speed score; defaults
            to ``(0.001, 0.05)``.
        cost_baselines: ``(floor, ceiling)`` for the cost score; defaults to
            ``(0.05, 2.0)``.

    Returns:
        Scores keyed ``effectiveness``, ``speed``, ``cost`` and ``safety``.
    """
    speed_floor, speed_ceiling = speed_baselines if speed_baselines else (0.001, 0.05)
    cost_floor, cost_ceiling = cost_baselines if cost_baselines else (0.05, 2.0)

    scores: dict[str, float] = {}
    scores["effectiveness"] = effectiveness_score(run)
    scores["speed"] = speed_score(run, speed_floor, speed_ceiling)
    scores["cost"] = cost_score(run, cost_floor, cost_ceiling)
    scores["safety"] = safety_score(run)
    return scores
146
+
147
+
148
def compute_task_les(
    runs: list[RunMetrics],
    *,
    speed_baselines: tuple[float, float] | None = None,
    cost_baselines: tuple[float, float] | None = None,
    backend: str = "sim",
) -> LesResult:
    """Aggregate all runs of one task into a weighted LES result.

    Effectiveness, speed, cost and safety are averaged over the runs.
    Robustness, scalability, adaptability and autonomy are computed over the
    run set as a whole. The composite is the ``LES_WEIGHTS``-weighted sum,
    rounded to four decimals; the display value is that figure times 100,
    rounded to one decimal. No runs yields an all-zero result.
    """
    if not runs:
        return LesResult(0.0, 0.0, dict.fromkeys(LES_WEIGHTS, 0.0))

    per_run = []
    for run in runs:
        per_run.append(
            compute_run_categories(
                run,
                speed_baselines=speed_baselines,
                cost_baselines=cost_baselines,
            )
        )

    # Per-run categories are averaged first; set-level categories follow in
    # the same key order as before.
    categories: dict[str, float] = {
        name: mean(scores[name] for scores in per_run)
        for name in ("effectiveness", "speed", "cost", "safety")
    }
    categories["robustness"] = robustness_from_seeds(runs)
    categories["scalability"] = scalability_score(runs)
    categories["adaptability"] = adaptability_score(runs)
    categories["autonomy"] = autonomy_score(runs, backend)

    weighted_total = sum(weight * categories[name] for name, weight in LES_WEIGHTS.items())
    observed = round(weighted_total, 4)
    rounded_categories = {name: round(score, 4) for name, score in categories.items()}
    return LesResult(
        les_observed=observed,
        les_display=round(observed * 100, 1),
        categories=rounded_categories,
    )
183
+
184
+
185
def compute_composite(task_les: list[float]) -> tuple[float, float, float]:
    """Average per-task LES values into a submission-level composite.

    Returns:
        ``(les_observed, les_display, rank_score)``: the mean rounded to four
        decimals, the mean times 100 rounded to one decimal, and the rank
        score, which equals ``les_observed``. All zeros for an empty list.
    """
    if not task_les:
        return 0.0, 0.0, 0.0
    average = mean(task_les)
    observed = round(average, 4)
    display = round(average * 100, 1)
    return observed, display, observed
loopbench/runner.py ADDED
@@ -0,0 +1,142 @@
1
+ """Run LoopBench evaluations via LoopGym."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import time
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from uuid import uuid4
10
+
11
+ import loopgym as lg
12
+ import yaml
13
+
14
+ from loopbench import __version__
15
+ from loopbench.les_compute import LES_WEIGHTS, LesResult, RunMetrics, compute_composite, compute_task_les
16
+ from loopbench.tasks import cost_baselines, load_task, speed_baselines
17
+
18
+
19
def spec_hash(path: Path) -> str:
    """Return ``sha256:<hex digest>`` of the raw bytes of the file at ``path``."""
    hasher = hashlib.sha256()
    hasher.update(path.read_bytes())
    return "sha256:" + hasher.hexdigest()
22
+
23
+
24
def estimate_cost_usd(spec: dict, iterations: int) -> float:
    """Estimate a run's cost from the spec's per-iteration rate.

    The rate is ``spec["cost_limits"]["per_iteration_usd"]`` and defaults to
    0.05 USD when absent. The result is rounded to four decimals.
    """
    limits = spec.get("cost_limits") or {}
    rate = float(limits.get("per_iteration_usd", 0.05))
    total = rate * iterations
    return round(total, 4)
28
+
29
+
30
def run_task(
    task_id: str,
    spec_path: Path,
    *,
    seeds: list[int] | None = None,
    instances: list[str] | None = None,
    backend: str = "sim",
) -> dict:
    """Run every (instance, seed) episode of a task and score the results.

    Args:
        task_id: Task to load via ``load_task``.
        spec_path: Path to the LSS YAML spec handed to LoopGym.
        seeds: Seeds to run; defaults to the task's ``evaluation.seeds``
            (``[0]`` when absent).
        instances: Task instances to run; defaults to the task's
            ``evaluation.primary_instances``.
        backend: Backend name passed to ``lg.make``.

    Returns:
        A dict with ``task_id``, ``env_id``, one record per run under
        ``runs``, and the LES summary under ``aggregate``.

    Raises:
        ValueError: if the spec file does not contain a YAML mapping.
    """
    task = load_task(task_id)
    env_id = task["loopgym_env"]
    g_target = float(task["goal"]["g_target"])
    t_budget = int(task["budget"]["iteration_budget"])
    eval_cfg = task.get("evaluation") or {}
    seed_list = seeds if seeds is not None else list(eval_cfg.get("seeds", [0]))
    instance_list = instances if instances is not None else list(eval_cfg.get("primary_instances", []))

    with spec_path.open(encoding="utf-8") as fh:
        spec = yaml.safe_load(fh)
    # Fail before any episode runs; an empty or non-mapping document used to
    # surface only later, as an AttributeError during cost estimation.
    if not isinstance(spec, dict):
        raise ValueError(f"Invalid LSS spec (expected a YAML mapping): {spec_path}")

    env = lg.make(env_id, spec_path=spec_path, backend=backend)
    runs: list[RunMetrics] = []

    for instance_id in instance_list:
        for seed in seed_list:
            t0 = time.perf_counter()
            result = env.run_episode(task_id=instance_id, seed=seed)
            elapsed = time.perf_counter() - t0
            trace = [s["quality_score"] for s in result.get("trajectory", [])]
            g_0 = trace[0] if trace else 0.0
            g_final = float(result.get("quality_score", 0.0))
            iterations = int(result.get("steps", 1))
            succeeded = bool(result.get("success")) or g_final >= g_target
            # NOTE(review): safety_violations is never read from `result`, so
            # it is always 0 here. Confirm whether LoopGym reports it.
            runs.append(
                RunMetrics(
                    task_instance_id=instance_id,
                    seed=seed,
                    g_final=g_final,
                    g_0=g_0,
                    g_target=g_target,
                    iterations=iterations,
                    t_budget=t_budget,
                    cost_usd=estimate_cost_usd(spec, iterations),
                    # Keep full precision for scoring: rounding here would
                    # turn a sub-0.05 ms episode into 0.0 and zero its speed
                    # score. Rounding happens only in the serialized record.
                    elapsed_s=elapsed,
                    success=succeeded,
                    termination_reason="goal_met" if succeeded else "budget_exhausted",
                    goal_trace=trace,
                )
            )

    les: LesResult = compute_task_les(
        runs,
        speed_baselines=speed_baselines(task),
        cost_baselines=cost_baselines(task),
        backend=backend,
    )
    success_at_k = sum(1 for r in runs if r.success) / len(runs) if runs else 0.0

    return {
        "task_id": task_id,
        "env_id": env_id,
        "runs": [
            {
                "task_instance_id": r.task_instance_id,
                "seed": r.seed,
                "success": r.success,
                "g_final": round(r.g_final, 4),
                "g_0": round(r.g_0, 4),
                "g_target": r.g_target,
                "iterations": r.iterations,
                "cost_usd": r.cost_usd,
                "elapsed_s": round(r.elapsed_s, 4),
                "termination_reason": r.termination_reason,
                "safety_violations": r.safety_violations,
                "goal_trace": [round(g, 4) for g in r.goal_trace],
            }
            for r in runs
        ],
        "aggregate": {
            "success_at_k": round(success_at_k, 4),
            "les_observed": les.les_observed,
            "les_display": les.les_display,
            "categories": les.categories,
            "cost_usd_mean": round(sum(r.cost_usd for r in runs) / len(runs), 4) if runs else 0.0,
            "robustness_seeds": len(seed_list),
        },
    }
115
+
116
+
117
def build_submission(
    submitter: str,
    spec_path: Path,
    task_results: list[dict],
    *,
    backend: str = "sim",
) -> dict:
    """Wrap per-task results into a complete submission document.

    The composite block is derived from each task's
    ``aggregate.les_observed``. The document also records a fresh UUID, the
    package version, the spec path with its content hash, and the current
    UTC timestamp.
    """
    per_task_les = [result["aggregate"]["les_observed"] for result in task_results]
    observed, display, rank_score = compute_composite(per_task_les)
    composite = {
        "les_observed": observed,
        "les_display": display,
        "rank_score": rank_score,
    }
    submission = {
        "submission_id": str(uuid4()),
        "submitter": submitter,
        "loopbench_version": __version__,
        "lss_version": "1.0.0",
        "les_version": "1.0.0",
        "spec_path": spec_path.as_posix(),
        "spec_hash": spec_hash(spec_path),
        "submitted_at": datetime.now(timezone.utc).isoformat(),
        "backend": backend,
        "results": task_results,
        "composite": composite,
    }
    return submission
loopbench/tasks.py ADDED
@@ -0,0 +1,47 @@
1
+ """Load LoopBench task definitions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import yaml
8
+
9
+ REPO_ROOT = Path(__file__).resolve().parents[1]
10
+ TASKS_ROOT = REPO_ROOT / "tasks"
11
+
12
+
13
def list_tasks() -> list[str]:
    """Return the task IDs listed in ``tasks/index.yaml``.

    Returns an empty list when the index file is missing, is an empty
    document, or has no ``tasks`` entries.
    """
    index_path = TASKS_ROOT / "index.yaml"
    if not index_path.exists():
        return []
    with index_path.open(encoding="utf-8") as fh:
        # An empty YAML document loads as None, and `tasks:` with no value is
        # also None; treat both as "no tasks" instead of crashing.
        index = yaml.safe_load(fh) or {}
    return [t["id"] for t in index.get("tasks") or []]
20
+
21
+
22
def load_task(task_id: str) -> dict:
    """Load and return the definition stored at ``tasks/<task_id>/task.yaml``.

    Raises:
        FileNotFoundError: if no such task file exists; the message lists the
            IDs known to the task index.
        ValueError: if the file does not contain a YAML mapping.
    """
    task_path = TASKS_ROOT.joinpath(task_id, "task.yaml")
    if not task_path.exists():
        known = list_tasks()
        raise FileNotFoundError(f"Unknown task '{task_id}'. Available: {', '.join(known)}")

    loaded = yaml.safe_load(task_path.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return loaded
    raise ValueError(f"Invalid task spec: {task_path}")
32
+
33
+
34
def speed_baselines(task: dict) -> tuple[float, float]:
    """Return the task's ``(floor, ceiling)`` speed baselines in iterations/s.

    Missing values fall back to ``0.001`` and ``0.05``.
    """
    configured = task.get("speed_baselines") or {}
    floor = float(configured.get("b_floor_iter_per_s", 0.001))
    ceiling = float(configured.get("b_ceiling_iter_per_s", 0.05))
    return floor, ceiling
40
+
41
+
42
def cost_baselines(task: dict) -> tuple[float, float]:
    """Return the task's ``(floor, ceiling)`` cost-efficiency baselines.

    Missing values fall back to ``0.05`` and ``2.0``.
    """
    configured = task.get("cost_baselines") or {}
    floor = float(configured.get("b_floor_efficiency", 0.05))
    ceiling = float(configured.get("b_ceiling_efficiency", 2.0))
    return floor, ceiling
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: loopbench
3
+ Version: 0.1.0
4
+ Summary: LoopBench — benchmark suite, metrics, submission pipeline, leaderboards
5
+ Project-URL: Homepage, https://github.com/KanakMalpani/LoopBench
6
+ Project-URL: Repository, https://github.com/KanakMalpani/LoopBench
7
+ Project-URL: Issues, https://github.com/KanakMalpani/LoopBench/issues
8
+ Project-URL: Documentation, https://github.com/KanakMalpani/LoopBench/blob/main/SUITE-OVERVIEW.md
9
+ Project-URL: LoopGym, https://github.com/KanakMalpani/LoopGym
10
+ Project-URL: Loop Core Engineering, https://github.com/KanakMalpani/Loop-Core-Engineering
11
+ Author: Kanak Malpani
12
+ License: MIT
13
+ License-File: LICENSE
14
+ Keywords: benchmark,les,loop-engineering,loopbench
15
+ Classifier: Development Status :: 4 - Beta
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.12
19
+ Requires-Dist: jsonschema>=4.21
20
+ Requires-Dist: pyyaml>=6.0
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=8.0; extra == 'dev'
23
+ Requires-Dist: ruff>=0.4; extra == 'dev'
24
+ Description-Content-Type: text/markdown
25
+
26
+ <p align="center">
27
+ <strong>LoopBench</strong><br>
28
+ <em>MLPerf for loops.</em>
29
+ </p>
30
+
31
+ <p align="center">
32
+ <a href="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml"><img src="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml/badge.svg" alt="CI"></a>
33
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="MIT"></a>
34
+ <img src="https://img.shields.io/badge/python-3.12+-blue.svg" alt="Python 3.12+">
35
+ <a href="SUITE-OVERVIEW.md"><img src="https://img.shields.io/badge/suite-ALS_v2-blue.svg" alt="ALS v2"></a>
36
+ <img src="https://img.shields.io/badge/tasks-3-green.svg" alt="3 tasks">
37
+ </p>
38
+
39
+ ---
40
+
41
+ **LoopBench** is the public scoreboard for Loop Engineering — fixed tasks, fixed seeds, observed [LES](https://github.com/KanakMalpani/Loop-Core-Engineering/blob/main/specs/les-1.0.md), and a submission pipeline anyone can audit.
42
+
43
+ You bring an [LSS](https://github.com/KanakMalpani/Loop-Core-Engineering) loop spec. LoopBench runs it through [LoopGym](https://github.com/KanakMalpani/LoopGym), computes **LES_obs** across eight categories, validates your results JSON, and ranks you on the leaderboard. No hand-waved demos.
44
+
45
+ ```bash
46
+ loopbench run --task LB-CR-1 --spec your-loop.yaml --seeds 0,1,2,3,4 -o results.json
47
+ loopbench validate results.json
48
+ ```
49
+
50
+ <p align="center">
51
+ <a href="#-run-your-first-score"><strong>Run your first score →</strong></a> ·
52
+ <a href="leaderboard/entries.json">Leaderboard</a> ·
53
+ <a href="SUITE-OVERVIEW.md">Suite architecture</a>
54
+ </p>
55
+
56
+ <p align="center">
57
+ <img src="assets/demo.gif" alt="LoopBench CLI demo: install, list tasks, run, validate, rank" width="720">
58
+ </p>
59
+
60
+ ---
61
+
62
+ ## The contract
63
+
64
+ ```mermaid
65
+ flowchart LR
66
+ YOU[Your LSS spec]
67
+ LB[LoopBench<br/>tasks · scoring · schema]
68
+ LG[LoopGym<br/>execution]
69
+ OUT[results.json → leaderboard]
70
+
71
+ YOU --> LB
72
+ LB -->|env_id, seeds| LG
73
+ LG -->|trajectories| LB
74
+ LB --> OUT
75
+ ```
76
+
77
+ | Layer | Owns | Repo |
78
+ |-------|------|------|
79
+ | **Spec** | LSS schema, LES formulas | [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering) |
80
+ | **Data** | Trajectories (optional holdout) | [LoopNet](https://github.com/KanakMalpani/loopnet) |
81
+ | **Runtime** | `env.run_episode()` | [LoopGym](https://github.com/KanakMalpani/LoopGym) |
82
+ | **Measurement** | Tasks, LES_obs, submissions | **LoopBench** |
83
+
84
+ LoopBench **defines** and **scores**. LoopGym **runs**. Never the other way around.
85
+
86
+ ---
87
+
88
+ ## ⚡ Run your first score
89
+
90
+ ```bash
91
+ pip install git+https://github.com/KanakMalpani/LoopGym.git
92
+ pip install git+https://github.com/KanakMalpani/LoopBench.git
93
+
94
+ loopbench list
95
+
96
+ loopbench run \
97
+ --task LB-CR-1 \
98
+ --spec submissions/examples/spec-fast-loop.yaml \
99
+ --seeds 0,1,2,3,4 \
100
+ -o results.json
101
+
102
+ loopbench validate results.json
103
+ loopbench rank leaderboard/entries.json
104
+ ```
105
+
106
+ **Local dev** (sibling clones):
107
+
108
+ ```bash
109
+ git clone https://github.com/KanakMalpani/LoopGym.git
110
+ git clone https://github.com/KanakMalpani/LoopBench.git
111
+ cd LoopBench && pip install -e ../LoopGym -e ".[dev]"
112
+ ```
113
+
114
+ On Windows: `py -3.12` if needed. PyPI: [PUBLISHING.md](PUBLISHING.md).
115
+
116
+ ---
117
+
118
+ ## Tasks (v0.1 · ALS v2)
119
+
120
+ | ID | Name | Env | What it stress-tests |
121
+ |----|------|-----|----------------------|
122
+ | `LB-CR-1` | Code repair | `loopbench/code-repair-v1` | Effectiveness, speed, robustness |
123
+ | `LB-RS-1` | Research synthesis | `loopbench/research-synthesis-v1` | Effectiveness, cost |
124
+ | `LB-MA-1` | Multi-agent debate | `loopbench/multi-agent-debate-v1` | Autonomy, scalability |
125
+
126
+ Each task ships YAML + README under [`tasks/`](tasks/). Five seeds by default. Success@k + **LES_obs** composite.
127
+
128
+ ---
129
+
130
+ ## Metrics
131
+
132
+ | Metric | Meaning |
133
+ |--------|---------|
134
+ | **Success@k** | Fraction of runs (task instance × seed) that reach the goal threshold `g_target` |
135
+ | **LES_obs** | Observed eight-category composite ∈ `[0, 1]` — see [`metrics/les-compute.md`](metrics/les-compute.md) |
136
+ | **Cost** | Estimated USD per run from LSS cost limits |
137
+ | **Robustness** | Quality retention across seeds |
138
+
139
+ Display scale `0–100` is optional (`les_display = round(les_observed × 100, 1)`).
140
+
141
+ ---
142
+
143
+ ## Submit to the leaderboard
144
+
145
+ 1. Run all tasks (or start with one):
146
+ `loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
147
+ 2. Validate: `loopbench validate results.json`
148
+ 3. Open a PR adding your entry to [`leaderboard/entries.json`](leaderboard/entries.json)
149
+
150
+ v0.1 rankings accept **SimEnv** submissions only (no API keys, fully reproducible). LiveEnv tier: v0.2.
151
+
152
+ ---
153
+
154
+ ## Repository layout
155
+
156
+ | Path | Purpose |
157
+ |------|---------|
158
+ | [`tasks/`](tasks/) | ALS v2 task definitions |
159
+ | [`metrics/les-compute.md`](metrics/les-compute.md) | LES_obs formulas |
160
+ | [`submit/schema.json`](submit/schema.json) | Submission JSON schema |
161
+ | [`loopbench/`](loopbench/) | Runner, LES compute, conformance |
162
+ | [`leaderboard/`](leaderboard/) | Public rankings (JSON v0.1) |
163
+ | [`submissions/examples/`](submissions/examples/) | Reference specs |
164
+
165
+ ---
166
+
167
+ ## Citation
168
+
169
+ ```bibtex
170
+ @software{loopbench2026,
171
+ title={LoopBench: Benchmark Suite for Loop Engineering},
172
+ author={Malpani, Kanak},
173
+ year={2026},
174
+ url={https://github.com/KanakMalpani/LoopBench}
175
+ }
176
+ ```
177
+
178
+ ---
179
+
180
+ <p align="center">
181
+ <sub>MIT · v0.1 · <a href="CONTRIBUTING.md">Contributing</a> · <a href="SECURITY.md">Security</a> · <a href="STATUS.md">Status</a></sub>
182
+ </p>
@@ -0,0 +1,11 @@
1
+ loopbench/__init__.py,sha256=Q4iIij63xecJOuziHQHVxGwYIRS7LALAxYl4CGFZ3sQ,104
2
+ loopbench/cli.py,sha256=cGFJv3wPPRWyGVwGVFgKzD8DuYBMBRP6DODoO3ocQ_c,4444
3
+ loopbench/conformance.py,sha256=bva2ZdSA2wqlEwanvvoUJ-hbacEu_-0MBZ2eCYMpkfM,2797
4
+ loopbench/les_compute.py,sha256=snPCVSsSrAnAZvSRmAgraMX-6YGUsLMWl3ENofPz7io,5477
5
+ loopbench/runner.py,sha256=DhXtS-JX2gQ6yjbE_uj6izWyU-Ln6y1kGcEfkua60Hs,4868
6
+ loopbench/tasks.py,sha256=LOWuJHeVDDlRs-ffpQ9Z_o8H8KVA-7D9120Lqy0B5TU,1350
7
+ loopbench-0.1.0.dist-info/METADATA,sha256=oVRRYccTqDheR9a8cxL40pUV-XoDY55S0opF651gvEM,6403
8
+ loopbench-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ loopbench-0.1.0.dist-info/entry_points.txt,sha256=P_Cli-8VX2IotD0zFv-yNBKU1nFoOvpmrFg_GeX940g,49
10
+ loopbench-0.1.0.dist-info/licenses/LICENSE,sha256=evRYU4i8S6LPZ42e9jNkROkb-chKgbu-HyltWnYncvk,1069
11
+ loopbench-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ loopbench = loopbench.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 KanakMalpani
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.