loopbench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopbench-0.1.0/.github/workflows/publish.yml +33 -0
- loopbench-0.1.0/.github/workflows/test.yml +51 -0
- loopbench-0.1.0/.gitignore +17 -0
- loopbench-0.1.0/CONTRIBUTING.md +33 -0
- loopbench-0.1.0/LICENSE +21 -0
- loopbench-0.1.0/PKG-INFO +182 -0
- loopbench-0.1.0/PLAN.md +56 -0
- loopbench-0.1.0/PUBLISHING.md +42 -0
- loopbench-0.1.0/README.md +157 -0
- loopbench-0.1.0/SECURITY.md +15 -0
- loopbench-0.1.0/STATUS.md +31 -0
- loopbench-0.1.0/SUITE-OVERVIEW.md +65 -0
- loopbench-0.1.0/SYNC.md +36 -0
- loopbench-0.1.0/assets/demo.gif +0 -0
- loopbench-0.1.0/cli/loopbench.py +7 -0
- loopbench-0.1.0/leaderboard/README.md +31 -0
- loopbench-0.1.0/leaderboard/entries.json +253 -0
- loopbench-0.1.0/loopbench/__init__.py +3 -0
- loopbench-0.1.0/loopbench/cli.py +136 -0
- loopbench-0.1.0/loopbench/conformance.py +79 -0
- loopbench-0.1.0/loopbench/les_compute.py +189 -0
- loopbench-0.1.0/loopbench/runner.py +142 -0
- loopbench-0.1.0/loopbench/tasks.py +47 -0
- loopbench-0.1.0/metrics/les-compute.md +142 -0
- loopbench-0.1.0/pyproject.toml +49 -0
- loopbench-0.1.0/requirements.txt +5 -0
- loopbench-0.1.0/scripts/generate_demo_gif.py +96 -0
- loopbench-0.1.0/scripts/prepare-commit-msg +7 -0
- loopbench-0.1.0/submissions/examples/spec-fast-loop.yaml +110 -0
- loopbench-0.1.0/submissions/examples/spec-thorough-loop.yaml +146 -0
- loopbench-0.1.0/submit/schema.json +168 -0
- loopbench-0.1.0/tasks/LB-CR-1/README.md +36 -0
- loopbench-0.1.0/tasks/LB-CR-1/task.yaml +73 -0
- loopbench-0.1.0/tasks/LB-MA-1/README.md +30 -0
- loopbench-0.1.0/tasks/LB-MA-1/task.yaml +65 -0
- loopbench-0.1.0/tasks/LB-RS-1/README.md +30 -0
- loopbench-0.1.0/tasks/LB-RS-1/task.yaml +66 -0
- loopbench-0.1.0/tasks/index.yaml +10 -0
- loopbench-0.1.0/tests/test_golden_submission.py +66 -0
- loopbench-0.1.0/tests/test_lss_and_tasks.py +57 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
publish:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.12"
|
|
20
|
+
|
|
21
|
+
- name: Install build tools
|
|
22
|
+
run: pip install hatchling build
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
# Uses PYPI_API_TOKEN when set; otherwise OIDC trusted publishing.
|
|
28
|
+
- name: Publish to PyPI
|
|
29
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
30
|
+
with:
|
|
31
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
32
|
+
skip-existing: true
|
|
33
|
+
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: test
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, master]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main, master]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
with:
|
|
17
|
+
repository: KanakMalpani/LoopGym
|
|
18
|
+
path: deps/LoopGym
|
|
19
|
+
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
with:
|
|
22
|
+
repository: KanakMalpani/Loop-Core-Engineering
|
|
23
|
+
path: deps/loop-core
|
|
24
|
+
|
|
25
|
+
- uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.12"
|
|
28
|
+
|
|
29
|
+
- name: Install loopgym
|
|
30
|
+
run: pip install -e deps/LoopGym
|
|
31
|
+
|
|
32
|
+
- name: Install loopbench
|
|
33
|
+
run: pip install -e ".[dev]"
|
|
34
|
+
|
|
35
|
+
- name: Validate submission LSS specs (Loop Core Engineering)
|
|
36
|
+
run: |
|
|
37
|
+
pip install -r deps/loop-core/requirements.txt
|
|
38
|
+
python deps/loop-core/tools/validate_lss.py submissions/examples/spec-fast-loop.yaml
|
|
39
|
+
python deps/loop-core/tools/validate_lss.py submissions/examples/spec-thorough-loop.yaml
|
|
40
|
+
|
|
41
|
+
- name: Run tests
|
|
42
|
+
run: pytest tests/ -q
|
|
43
|
+
|
|
44
|
+
- name: Validate leaderboard
|
|
45
|
+
run: loopbench validate leaderboard/entries.json
|
|
46
|
+
|
|
47
|
+
- name: Golden submission CLI
|
|
48
|
+
run: |
|
|
49
|
+
loopbench run --task LB-CR-1 --spec submissions/examples/spec-fast-loop.yaml \
|
|
50
|
+
--seeds 0,1 --submitter ci-fast -o /tmp/fast.json
|
|
51
|
+
loopbench validate /tmp/fast.json
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Contributing to LoopBench
|
|
2
|
+
|
|
3
|
+
## What belongs here
|
|
4
|
+
|
|
5
|
+
- Benchmark task definitions (ALS v2 YAML)
|
|
6
|
+
- Observed LES computation and submission schema
|
|
7
|
+
- CLI, conformance validation, leaderboard entries
|
|
8
|
+
- Golden submission tests
|
|
9
|
+
|
|
10
|
+
## What does not belong here
|
|
11
|
+
|
|
12
|
+
- LSS schema — [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering)
|
|
13
|
+
- Environment runtime — [LoopGym](https://github.com/KanakMalpani/LoopGym)
|
|
14
|
+
- Dataset records — [LoopNet](https://github.com/KanakMalpani/loopnet)
|
|
15
|
+
|
|
16
|
+
## Before opening a PR
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -e ../LoopGym # or deps/LoopGym in CI layout
|
|
20
|
+
pip install -e ".[dev]"
|
|
21
|
+
pytest tests/ -q
|
|
22
|
+
loopbench validate leaderboard/entries.json
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Submitting benchmark results
|
|
26
|
+
|
|
27
|
+
1. Run all tasks: `loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
|
|
28
|
+
2. Validate: `loopbench validate results.json`
|
|
29
|
+
3. Open a PR adding your entry to `leaderboard/entries.json`
|
|
30
|
+
|
|
31
|
+
## License
|
|
32
|
+
|
|
33
|
+
MIT — see [LICENSE](LICENSE).
|
loopbench-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KanakMalpani
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
loopbench-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: loopbench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LoopBench — benchmark suite, metrics, submission pipeline, leaderboards
|
|
5
|
+
Project-URL: Homepage, https://github.com/KanakMalpani/LoopBench
|
|
6
|
+
Project-URL: Repository, https://github.com/KanakMalpani/LoopBench
|
|
7
|
+
Project-URL: Issues, https://github.com/KanakMalpani/LoopBench/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/KanakMalpani/LoopBench/blob/main/SUITE-OVERVIEW.md
|
|
9
|
+
Project-URL: LoopGym, https://github.com/KanakMalpani/LoopGym
|
|
10
|
+
Project-URL: Loop Core Engineering, https://github.com/KanakMalpani/Loop-Core-Engineering
|
|
11
|
+
Author: Kanak Malpani
|
|
12
|
+
License: MIT
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: benchmark,les,loop-engineering,loopbench
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Requires-Dist: jsonschema>=4.21
|
|
20
|
+
Requires-Dist: pyyaml>=6.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<strong>LoopBench</strong><br>
|
|
28
|
+
<em>MLPerf for loops.</em>
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
<a href="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml"><img src="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml/badge.svg" alt="CI"></a>
|
|
33
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="MIT"></a>
|
|
34
|
+
<img src="https://img.shields.io/badge/python-3.12+-blue.svg" alt="Python 3.12+">
|
|
35
|
+
<a href="SUITE-OVERVIEW.md"><img src="https://img.shields.io/badge/suite-ALS_v2-blue.svg" alt="ALS v2"></a>
|
|
36
|
+
<img src="https://img.shields.io/badge/tasks-3-green.svg" alt="3 tasks">
|
|
37
|
+
</p>
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
**LoopBench** is the public scoreboard for Loop Engineering — fixed tasks, fixed seeds, observed [LES](https://github.com/KanakMalpani/Loop-Core-Engineering/blob/main/specs/les-1.0.md), and a submission pipeline anyone can audit.
|
|
42
|
+
|
|
43
|
+
You bring an [LSS](https://github.com/KanakMalpani/Loop-Core-Engineering) loop spec. LoopBench runs it through [LoopGym](https://github.com/KanakMalpani/LoopGym), computes **LES_obs** across eight categories, validates your results JSON, and ranks you on the leaderboard. No hand-waved demos.
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
loopbench run --task LB-CR-1 --spec your-loop.yaml --seeds 0,1,2,3,4 -o results.json
|
|
47
|
+
loopbench validate results.json
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
<p align="center">
|
|
51
|
+
<a href="#-run-your-first-score"><strong>Run your first score →</strong></a> ·
|
|
52
|
+
<a href="leaderboard/entries.json">Leaderboard</a> ·
|
|
53
|
+
<a href="SUITE-OVERVIEW.md">Suite architecture</a>
|
|
54
|
+
</p>
|
|
55
|
+
|
|
56
|
+
<p align="center">
|
|
57
|
+
<img src="assets/demo.gif" alt="LoopBench CLI demo: install, list tasks, run, validate, rank" width="720">
|
|
58
|
+
</p>
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## The contract
|
|
63
|
+
|
|
64
|
+
```mermaid
|
|
65
|
+
flowchart LR
|
|
66
|
+
YOU[Your LSS spec]
|
|
67
|
+
LB[LoopBench<br/>tasks · scoring · schema]
|
|
68
|
+
LG[LoopGym<br/>execution]
|
|
69
|
+
OUT[results.json → leaderboard]
|
|
70
|
+
|
|
71
|
+
YOU --> LB
|
|
72
|
+
LB -->|env_id, seeds| LG
|
|
73
|
+
LG -->|trajectories| LB
|
|
74
|
+
LB --> OUT
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
| Layer | Owns | Repo |
|
|
78
|
+
|-------|------|------|
|
|
79
|
+
| **Spec** | LSS schema, LES formulas | [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering) |
|
|
80
|
+
| **Data** | Trajectories (optional holdout) | [LoopNet](https://github.com/KanakMalpani/loopnet) |
|
|
81
|
+
| **Runtime** | `env.run_episode()` | [LoopGym](https://github.com/KanakMalpani/LoopGym) |
|
|
82
|
+
| **Measurement** | Tasks, LES_obs, submissions | **LoopBench** |
|
|
83
|
+
|
|
84
|
+
LoopBench **defines** and **scores**. LoopGym **runs**. Never the other way around.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## ⚡ Run your first score
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install git+https://github.com/KanakMalpani/LoopGym.git
|
|
92
|
+
pip install git+https://github.com/KanakMalpani/LoopBench.git
|
|
93
|
+
|
|
94
|
+
loopbench list
|
|
95
|
+
|
|
96
|
+
loopbench run \
|
|
97
|
+
--task LB-CR-1 \
|
|
98
|
+
--spec submissions/examples/spec-fast-loop.yaml \
|
|
99
|
+
--seeds 0,1,2,3,4 \
|
|
100
|
+
-o results.json
|
|
101
|
+
|
|
102
|
+
loopbench validate results.json
|
|
103
|
+
loopbench rank leaderboard/entries.json
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Local dev** (sibling clones):
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
git clone https://github.com/KanakMalpani/LoopGym.git
|
|
110
|
+
git clone https://github.com/KanakMalpani/LoopBench.git
|
|
111
|
+
cd LoopBench && pip install -e ../LoopGym -e ".[dev]"
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
On Windows: `py -3.12` if needed. PyPI: [PUBLISHING.md](PUBLISHING.md).
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Tasks (v0.1 · ALS v2)
|
|
119
|
+
|
|
120
|
+
| ID | Name | Env | What it stress-tests |
|
|
121
|
+
|----|------|-----|----------------------|
|
|
122
|
+
| `LB-CR-1` | Code repair | `loopbench/code-repair-v1` | Effectiveness, speed, robustness |
|
|
123
|
+
| `LB-RS-1` | Research synthesis | `loopbench/research-synthesis-v1` | Effectiveness, cost |
|
|
124
|
+
| `LB-MA-1` | Multi-agent debate | `loopbench/multi-agent-debate-v1` | Autonomy, scalability |
|
|
125
|
+
|
|
126
|
+
Each task ships YAML + README under [`tasks/`](tasks/). Five seeds by default. Success@k + **LES_obs** composite.
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Metrics
|
|
131
|
+
|
|
132
|
+
| Metric | Meaning |
|
|
133
|
+
|--------|---------|
|
|
134
|
+
| **Success@k** | Fraction of instances reaching goal threshold `g_target` |
|
|
135
|
+
| **LES_obs** | Observed eight-category composite ∈ `[0, 1]` — see [`metrics/les-compute.md`](metrics/les-compute.md) |
|
|
136
|
+
| **Cost** | Estimated USD per run from LSS cost limits |
|
|
137
|
+
| **Robustness** | Quality retention across seeds |
|
|
138
|
+
|
|
139
|
+
Display scale `0–100` is optional (`les_display = les_observed × 100`).
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Submit to the leaderboard
|
|
144
|
+
|
|
145
|
+
1. Run all tasks (or start with one):
|
|
146
|
+
`loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
|
|
147
|
+
2. Validate: `loopbench validate results.json`
|
|
148
|
+
3. Open a PR adding your entry to [`leaderboard/entries.json`](leaderboard/entries.json)
|
|
149
|
+
|
|
150
|
+
v0.1 rankings accept **SimEnv** submissions only (no API keys, fully reproducible). LiveEnv tier: v0.2.
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Repository layout
|
|
155
|
+
|
|
156
|
+
| Path | Purpose |
|
|
157
|
+
|------|---------|
|
|
158
|
+
| [`tasks/`](tasks/) | ALS v2 task definitions |
|
|
159
|
+
| [`metrics/les-compute.md`](metrics/les-compute.md) | LES_obs formulas |
|
|
160
|
+
| [`submit/schema.json`](submit/schema.json) | Submission JSON schema |
|
|
161
|
+
| [`loopbench/`](loopbench/) | Runner, LES compute, conformance |
|
|
162
|
+
| [`leaderboard/`](leaderboard/) | Public rankings (JSON v0.1) |
|
|
163
|
+
| [`submissions/examples/`](submissions/examples/) | Reference specs |
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Citation
|
|
168
|
+
|
|
169
|
+
```bibtex
|
|
170
|
+
@software{loopbench2026,
|
|
171
|
+
title={LoopBench: Benchmark Suite for Loop Engineering},
|
|
172
|
+
author={Malpani, Kanak},
|
|
173
|
+
year={2026},
|
|
174
|
+
url={https://github.com/KanakMalpani/LoopBench}
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
<p align="center">
|
|
181
|
+
<sub>MIT · v0.1 · <a href="CONTRIBUTING.md">Contributing</a> · <a href="SECURITY.md">Security</a> · <a href="STATUS.md">Status</a></sub>
|
|
182
|
+
</p>
|
loopbench-0.1.0/PLAN.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# 06 — loopbench
|
|
2
|
+
|
|
3
|
+
## One-line purpose
|
|
4
|
+
|
|
5
|
+
**LoopBench** — the world's definitive loop benchmark suite, metrics, submission pipeline, and leaderboards.
|
|
6
|
+
|
|
7
|
+
## Why this repo exists
|
|
8
|
+
|
|
9
|
+
Fields need **comparable scores**. LoopBench is MLPerf for loops — tasks, rules, anti-gaming, public ranks.
|
|
10
|
+
|
|
11
|
+
## Scope (in scope)
|
|
12
|
+
|
|
13
|
+
- Task definitions (ALS v2): code-repair, research-synthesis, multi-agent-debate (+ expand)
|
|
14
|
+
- Oracle specifications per task
|
|
15
|
+
- Metrics: Success@k, LES-obs, cost, robustness (5 seeds), safety
|
|
16
|
+
- Submission format: container + LSS + results JSON
|
|
17
|
+
- Local runner CLI (`loopbench run`)
|
|
18
|
+
- Leaderboard schema (static JSON v0.1 → web v0.2)
|
|
19
|
+
- Conformance tests: "does submission follow rules?"
|
|
20
|
+
|
|
21
|
+
## Scope (out of scope)
|
|
22
|
+
|
|
23
|
+
- Gym implementation → `05-loopgym` (LoopBench *defines*, LoopGym *runs*)
|
|
24
|
+
|
|
25
|
+
## Deliverables v0.1
|
|
26
|
+
|
|
27
|
+
- [x] `tasks/` — 3 task specs (YAML + README each)
|
|
28
|
+
- [x] `metrics/les-compute.md` — how observed LES is computed on bench
|
|
29
|
+
- [x] `submit/schema.json` — results file format
|
|
30
|
+
- [x] `cli/loopbench.py` — local eval
|
|
31
|
+
- [x] `leaderboard/` — README + example entries JSON
|
|
32
|
+
|
|
33
|
+
## Task IDs (initial)
|
|
34
|
+
|
|
35
|
+
| ID | Name |
|
|
36
|
+
|----|------|
|
|
37
|
+
| `LB-CR-1` | Code repair |
|
|
38
|
+
| `LB-RS-1` | Research synthesis |
|
|
39
|
+
| `LB-MA-1` | Multi-agent debate |
|
|
40
|
+
|
|
41
|
+
## Dependencies
|
|
42
|
+
|
|
43
|
+
- **01-loop-engineering-core** — LES, LSS
|
|
44
|
+
- **05-loopgym** — execution engine
|
|
45
|
+
|
|
46
|
+
## Success criteria
|
|
47
|
+
|
|
48
|
+
Two different LSS specs submitted locally; leaderboard JSON ranks by composite; CI runs golden submission.
|
|
49
|
+
|
|
50
|
+
## Agent instructions
|
|
51
|
+
|
|
52
|
+
Align with the existing `benchmarks/` directory in the `Loop Engineering` repo, then **move** the canonical tasks here. Use fixed seeds and pinned deps.
|
|
53
|
+
|
|
54
|
+
## Status
|
|
55
|
+
|
|
56
|
+
✅ v0.1 shipped (2026-06-13) — see [STATUS.md](./STATUS.md)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Publishing LoopBench to PyPI
|
|
2
|
+
|
|
3
|
+
**Prerequisite:** [LoopGym](https://pypi.org/project/loopgym/) must be published first.
|
|
4
|
+
|
|
5
|
+
## One-time setup
|
|
6
|
+
|
|
7
|
+
1. Create a PyPI project named **`loopbench`**.
|
|
8
|
+
2. **Preferred:** configure [trusted publishing](https://docs.pypi.org/trusted-publishers/) on the PyPI project:
|
|
9
|
+
- **PyPI project name:** `loopbench`
|
|
10
|
+
- **Owner:** `KanakMalpani`
|
|
11
|
+
- **Repository name:** `LoopBench`
|
|
12
|
+
- **Workflow name:** `publish.yml`
|
|
13
|
+
- **Environment name:** *(leave blank)*
|
|
14
|
+
|
|
15
|
+
3. **Fallback:** add **`PYPI_API_TOKEN`** (upload scope) to this repo's Actions secrets.
|
|
16
|
+
|
|
17
|
+
## Publish
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
git tag v0.1.0
|
|
21
|
+
git push origin v0.1.0
|
|
22
|
+
gh release create v0.1.0 --title "v0.1.0" --notes "Initial public release"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or trigger **Actions → Publish to PyPI** manually.
|
|
26
|
+
|
|
27
|
+
## Install
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install loopbench loopgym
|
|
31
|
+
loopbench list
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Verify locally
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install -e ../LoopGym
|
|
38
|
+
pip install build
|
|
39
|
+
python -m build
|
|
40
|
+
pip install dist/loopbench-*.whl
|
|
41
|
+
pytest tests/ -q
|
|
42
|
+
```
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<strong>LoopBench</strong><br>
|
|
3
|
+
<em>MLPerf for loops.</em>
|
|
4
|
+
</p>
|
|
5
|
+
|
|
6
|
+
<p align="center">
|
|
7
|
+
<a href="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml"><img src="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml/badge.svg" alt="CI"></a>
|
|
8
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="MIT"></a>
|
|
9
|
+
<img src="https://img.shields.io/badge/python-3.12+-blue.svg" alt="Python 3.12+">
|
|
10
|
+
<a href="SUITE-OVERVIEW.md"><img src="https://img.shields.io/badge/suite-ALS_v2-blue.svg" alt="ALS v2"></a>
|
|
11
|
+
<img src="https://img.shields.io/badge/tasks-3-green.svg" alt="3 tasks">
|
|
12
|
+
</p>
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
**LoopBench** is the public scoreboard for Loop Engineering — fixed tasks, fixed seeds, observed [LES](https://github.com/KanakMalpani/Loop-Core-Engineering/blob/main/specs/les-1.0.md), and a submission pipeline anyone can audit.
|
|
17
|
+
|
|
18
|
+
You bring an [LSS](https://github.com/KanakMalpani/Loop-Core-Engineering) loop spec. LoopBench runs it through [LoopGym](https://github.com/KanakMalpani/LoopGym), computes **LES_obs** across eight categories, validates your results JSON, and ranks you on the leaderboard. No hand-waved demos.
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
loopbench run --task LB-CR-1 --spec your-loop.yaml --seeds 0,1,2,3,4 -o results.json
|
|
22
|
+
loopbench validate results.json
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
<p align="center">
|
|
26
|
+
<a href="#-run-your-first-score"><strong>Run your first score →</strong></a> ·
|
|
27
|
+
<a href="leaderboard/entries.json">Leaderboard</a> ·
|
|
28
|
+
<a href="SUITE-OVERVIEW.md">Suite architecture</a>
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
<img src="assets/demo.gif" alt="LoopBench CLI demo: install, list tasks, run, validate, rank" width="720">
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## The contract
|
|
38
|
+
|
|
39
|
+
```mermaid
|
|
40
|
+
flowchart LR
|
|
41
|
+
YOU[Your LSS spec]
|
|
42
|
+
LB[LoopBench<br/>tasks · scoring · schema]
|
|
43
|
+
LG[LoopGym<br/>execution]
|
|
44
|
+
OUT[results.json → leaderboard]
|
|
45
|
+
|
|
46
|
+
YOU --> LB
|
|
47
|
+
LB -->|env_id, seeds| LG
|
|
48
|
+
LG -->|trajectories| LB
|
|
49
|
+
LB --> OUT
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
| Layer | Owns | Repo |
|
|
53
|
+
|-------|------|------|
|
|
54
|
+
| **Spec** | LSS schema, LES formulas | [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering) |
|
|
55
|
+
| **Data** | Trajectories (optional holdout) | [LoopNet](https://github.com/KanakMalpani/loopnet) |
|
|
56
|
+
| **Runtime** | `env.run_episode()` | [LoopGym](https://github.com/KanakMalpani/LoopGym) |
|
|
57
|
+
| **Measurement** | Tasks, LES_obs, submissions | **LoopBench** |
|
|
58
|
+
|
|
59
|
+
LoopBench **defines** and **scores**. LoopGym **runs**. Never the other way around.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## ⚡ Run your first score
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install git+https://github.com/KanakMalpani/LoopGym.git
|
|
67
|
+
pip install git+https://github.com/KanakMalpani/LoopBench.git
|
|
68
|
+
|
|
69
|
+
loopbench list
|
|
70
|
+
|
|
71
|
+
loopbench run \
|
|
72
|
+
--task LB-CR-1 \
|
|
73
|
+
--spec submissions/examples/spec-fast-loop.yaml \
|
|
74
|
+
--seeds 0,1,2,3,4 \
|
|
75
|
+
-o results.json
|
|
76
|
+
|
|
77
|
+
loopbench validate results.json
|
|
78
|
+
loopbench rank leaderboard/entries.json
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Local dev** (sibling clones):
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
git clone https://github.com/KanakMalpani/LoopGym.git
|
|
85
|
+
git clone https://github.com/KanakMalpani/LoopBench.git
|
|
86
|
+
cd LoopBench && pip install -e ../LoopGym -e ".[dev]"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
On Windows: `py -3.12` if needed. PyPI: [PUBLISHING.md](PUBLISHING.md).
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Tasks (v0.1 · ALS v2)
|
|
94
|
+
|
|
95
|
+
| ID | Name | Env | What it stress-tests |
|
|
96
|
+
|----|------|-----|----------------------|
|
|
97
|
+
| `LB-CR-1` | Code repair | `loopbench/code-repair-v1` | Effectiveness, speed, robustness |
|
|
98
|
+
| `LB-RS-1` | Research synthesis | `loopbench/research-synthesis-v1` | Effectiveness, cost |
|
|
99
|
+
| `LB-MA-1` | Multi-agent debate | `loopbench/multi-agent-debate-v1` | Autonomy, scalability |
|
|
100
|
+
|
|
101
|
+
Each task ships YAML + README under [`tasks/`](tasks/). Five seeds by default. Success@k + **LES_obs** composite.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Metrics
|
|
106
|
+
|
|
107
|
+
| Metric | Meaning |
|
|
108
|
+
|--------|---------|
|
|
109
|
+
| **Success@k** | Fraction of instances reaching goal threshold `g_target` |
|
|
110
|
+
| **LES_obs** | Observed eight-category composite ∈ `[0, 1]` — see [`metrics/les-compute.md`](metrics/les-compute.md) |
|
|
111
|
+
| **Cost** | Estimated USD per run from LSS cost limits |
|
|
112
|
+
| **Robustness** | Quality retention across seeds |
|
|
113
|
+
|
|
114
|
+
Display scale `0–100` is optional (`les_display = les_observed × 100`).
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Submit to the leaderboard
|
|
119
|
+
|
|
120
|
+
1. Run all tasks (or start with one):
|
|
121
|
+
`loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
|
|
122
|
+
2. Validate: `loopbench validate results.json`
|
|
123
|
+
3. Open a PR adding your entry to [`leaderboard/entries.json`](leaderboard/entries.json)
|
|
124
|
+
|
|
125
|
+
v0.1 rankings accept **SimEnv** submissions only (no API keys, fully reproducible). LiveEnv tier: v0.2.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Repository layout
|
|
130
|
+
|
|
131
|
+
| Path | Purpose |
|
|
132
|
+
|------|---------|
|
|
133
|
+
| [`tasks/`](tasks/) | ALS v2 task definitions |
|
|
134
|
+
| [`metrics/les-compute.md`](metrics/les-compute.md) | LES_obs formulas |
|
|
135
|
+
| [`submit/schema.json`](submit/schema.json) | Submission JSON schema |
|
|
136
|
+
| [`loopbench/`](loopbench/) | Runner, LES compute, conformance |
|
|
137
|
+
| [`leaderboard/`](leaderboard/) | Public rankings (JSON v0.1) |
|
|
138
|
+
| [`submissions/examples/`](submissions/examples/) | Reference specs |
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## Citation
|
|
143
|
+
|
|
144
|
+
```bibtex
|
|
145
|
+
@software{loopbench2026,
|
|
146
|
+
title={LoopBench: Benchmark Suite for Loop Engineering},
|
|
147
|
+
author={Malpani, Kanak},
|
|
148
|
+
year={2026},
|
|
149
|
+
url={https://github.com/KanakMalpani/LoopBench}
|
|
150
|
+
}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
<p align="center">
|
|
156
|
+
<sub>MIT · v0.1 · <a href="CONTRIBUTING.md">Contributing</a> · <a href="SECURITY.md">Security</a> · <a href="STATUS.md">Status</a></sub>
|
|
157
|
+
</p>
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Security Policy
|
|
2
|
+
|
|
3
|
+
## Supported versions
|
|
4
|
+
|
|
5
|
+
| Version | Supported |
|
|
6
|
+
|---------|-----------|
|
|
7
|
+
| 0.1.x | Yes |
|
|
8
|
+
|
|
9
|
+
## Reporting
|
|
10
|
+
|
|
11
|
+
Report privately via [GitHub Security Advisories](https://github.com/KanakMalpani/LoopBench/security/advisories/new).
|
|
12
|
+
|
|
13
|
+
## Submissions
|
|
14
|
+
|
|
15
|
+
Validate all results with `loopbench validate` before opening leaderboard PRs. For public rankings, v0.1 accepts SimEnv submissions only.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Status
|
|
2
|
+
|
|
3
|
+
| Field | Value |
|
|
4
|
+
|-------|-------|
|
|
5
|
+
| **Phase** | v0.1 shipped |
|
|
6
|
+
| **Symbol** | ✅ |
|
|
7
|
+
| **Started** | 2026-06-13 |
|
|
8
|
+
| **Shipped** | 2026-06-13 |
|
|
9
|
+
| **Owner** | — |
|
|
10
|
+
| **Blockers** | — |
|
|
11
|
+
| **Notes** | Published at https://github.com/KanakMalpani/LoopBench; LiveEnv submissions v0.2 |
|
|
12
|
+
|
|
13
|
+
## Completion checklist
|
|
14
|
+
|
|
15
|
+
- [x] `tasks/` — 3 task specs (YAML + README each)
|
|
16
|
+
- [x] `metrics/les-compute.md` — observed LES computation
|
|
17
|
+
- [x] `submit/schema.json` — results file format
|
|
18
|
+
- [x] `cli/loopbench.py` — local eval CLI
|
|
19
|
+
- [x] `leaderboard/` — README + example entries JSON
|
|
20
|
+
- [x] Golden submission tests + CI
|
|
21
|
+
- [x] `SYNC.md`, `LICENSE`, `SUITE-OVERVIEW.md`
|
|
22
|
+
- [x] LSS validation in CI (01-core)
|
|
23
|
+
- [x] Smoke tests for all 3 tasks
|
|
24
|
+
- [ ] PyPI publish (`pip install loopbench` — pending trusted publisher or token on PyPI)
|
|
25
|
+
|
|
26
|
+
## Links
|
|
27
|
+
|
|
28
|
+
- Parent workspace: [../README.md](../README.md)
|
|
29
|
+
- LSS/LES: [../01-loop-engineering-core](../01-loop-engineering-core/)
|
|
30
|
+
- Runtime: [../05-loopgym](../05-loopgym/)
|
|
31
|
+
- Agent brief: [../AGENT-BRIEF.md](../AGENT-BRIEF.md)
|