loopbench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. loopbench-0.1.0/.github/workflows/publish.yml +33 -0
  2. loopbench-0.1.0/.github/workflows/test.yml +51 -0
  3. loopbench-0.1.0/.gitignore +17 -0
  4. loopbench-0.1.0/CONTRIBUTING.md +33 -0
  5. loopbench-0.1.0/LICENSE +21 -0
  6. loopbench-0.1.0/PKG-INFO +182 -0
  7. loopbench-0.1.0/PLAN.md +56 -0
  8. loopbench-0.1.0/PUBLISHING.md +42 -0
  9. loopbench-0.1.0/README.md +157 -0
  10. loopbench-0.1.0/SECURITY.md +15 -0
  11. loopbench-0.1.0/STATUS.md +31 -0
  12. loopbench-0.1.0/SUITE-OVERVIEW.md +65 -0
  13. loopbench-0.1.0/SYNC.md +36 -0
  14. loopbench-0.1.0/assets/demo.gif +0 -0
  15. loopbench-0.1.0/cli/loopbench.py +7 -0
  16. loopbench-0.1.0/leaderboard/README.md +31 -0
  17. loopbench-0.1.0/leaderboard/entries.json +253 -0
  18. loopbench-0.1.0/loopbench/__init__.py +3 -0
  19. loopbench-0.1.0/loopbench/cli.py +136 -0
  20. loopbench-0.1.0/loopbench/conformance.py +79 -0
  21. loopbench-0.1.0/loopbench/les_compute.py +189 -0
  22. loopbench-0.1.0/loopbench/runner.py +142 -0
  23. loopbench-0.1.0/loopbench/tasks.py +47 -0
  24. loopbench-0.1.0/metrics/les-compute.md +142 -0
  25. loopbench-0.1.0/pyproject.toml +49 -0
  26. loopbench-0.1.0/requirements.txt +5 -0
  27. loopbench-0.1.0/scripts/generate_demo_gif.py +96 -0
  28. loopbench-0.1.0/scripts/prepare-commit-msg +7 -0
  29. loopbench-0.1.0/submissions/examples/spec-fast-loop.yaml +110 -0
  30. loopbench-0.1.0/submissions/examples/spec-thorough-loop.yaml +146 -0
  31. loopbench-0.1.0/submit/schema.json +168 -0
  32. loopbench-0.1.0/tasks/LB-CR-1/README.md +36 -0
  33. loopbench-0.1.0/tasks/LB-CR-1/task.yaml +73 -0
  34. loopbench-0.1.0/tasks/LB-MA-1/README.md +30 -0
  35. loopbench-0.1.0/tasks/LB-MA-1/task.yaml +65 -0
  36. loopbench-0.1.0/tasks/LB-RS-1/README.md +30 -0
  37. loopbench-0.1.0/tasks/LB-RS-1/task.yaml +66 -0
  38. loopbench-0.1.0/tasks/index.yaml +10 -0
  39. loopbench-0.1.0/tests/test_golden_submission.py +66 -0
  40. loopbench-0.1.0/tests/test_lss_and_tasks.py +57 -0
@@ -0,0 +1,33 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ id-token: write
10
+
11
+ jobs:
12
+ publish:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.12"
20
+
21
+ - name: Install build tools
22
+ run: pip install hatchling build
23
+
24
+ - name: Build package
25
+ run: python -m build
26
+
27
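+ # If the PYPI_API_TOKEN secret is set it is used as the upload password; if it is unset, `password` is empty and the action falls back to OIDC trusted publishing (which needs the `id-token: write` permission above).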
28
+ - name: Publish to PyPI
29
+ uses: pypa/gh-action-pypi-publish@release/v1
30
+ with:
31
+ password: ${{ secrets.PYPI_API_TOKEN }}
32
+ skip-existing: true
33
+
@@ -0,0 +1,51 @@
1
+ name: test
2
+
3
+ on:
4
+ push:
5
+ branches: [main, master]
6
+ pull_request:
7
+ branches: [main, master]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - uses: actions/checkout@v4
16
+ with:
17
+ repository: KanakMalpani/LoopGym
18
+ path: deps/LoopGym
19
+
20
+ - uses: actions/checkout@v4
21
+ with:
22
+ repository: KanakMalpani/Loop-Core-Engineering
23
+ path: deps/loop-core
24
+
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.12"
28
+
29
+ - name: Install loopgym
30
+ run: pip install -e deps/LoopGym
31
+
32
+ - name: Install loopbench
33
+ run: pip install -e ".[dev]"
34
+
35
+ - name: Validate submission LSS specs (Loop Core Engineering)
36
+ run: |
37
+ pip install -r deps/loop-core/requirements.txt
38
+ python deps/loop-core/tools/validate_lss.py submissions/examples/spec-fast-loop.yaml
39
+ python deps/loop-core/tools/validate_lss.py submissions/examples/spec-thorough-loop.yaml
40
+
41
+ - name: Run tests
42
+ run: pytest tests/ -q
43
+
44
+ - name: Validate leaderboard
45
+ run: loopbench validate leaderboard/entries.json
46
+
47
+ - name: Golden submission CLI
48
+ run: |
49
+ loopbench run --task LB-CR-1 --spec submissions/examples/spec-fast-loop.yaml \
50
+ --seeds 0,1 --submitter ci-fast -o /tmp/fast.json
51
+ loopbench validate /tmp/fast.json
@@ -0,0 +1,17 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ venv/
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .DS_Store
11
+ .env
12
+ .env.*
13
+ !.env.example
14
+ *.pem
15
+ credentials.json
16
+ /tmp/
17
+ results*.json
@@ -0,0 +1,33 @@
1
+ # Contributing to LoopBench
2
+
3
+ ## What belongs here
4
+
5
+ - Benchmark task definitions (ALS v2 YAML)
6
+ - Observed LES computation and submission schema
7
+ - CLI, conformance validation, leaderboard entries
8
+ - Golden submission tests
9
+
10
+ ## What does not belong here
11
+
12
+ - LSS schema — [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering)
13
+ - Environment runtime — [LoopGym](https://github.com/KanakMalpani/LoopGym)
14
+ - Dataset records — [LoopNet](https://github.com/KanakMalpani/loopnet)
15
+
16
+ ## Before opening a PR
17
+
18
+ ```bash
19
+ pip install -e ../LoopGym # or deps/LoopGym in CI layout
20
+ pip install -e ".[dev]"
21
+ pytest tests/ -q
22
+ loopbench validate leaderboard/entries.json
23
+ ```
24
+
25
+ ## Submitting benchmark results
26
+
27
+ 1. Run all tasks: `loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
28
+ 2. Validate: `loopbench validate results.json`
29
+ 3. Open a PR adding your entry to `leaderboard/entries.json`
30
+
31
+ ## License
32
+
33
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 KanakMalpani
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: loopbench
3
+ Version: 0.1.0
4
+ Summary: LoopBench — benchmark suite, metrics, submission pipeline, leaderboards
5
+ Project-URL: Homepage, https://github.com/KanakMalpani/LoopBench
6
+ Project-URL: Repository, https://github.com/KanakMalpani/LoopBench
7
+ Project-URL: Issues, https://github.com/KanakMalpani/LoopBench/issues
8
+ Project-URL: Documentation, https://github.com/KanakMalpani/LoopBench/blob/main/SUITE-OVERVIEW.md
9
+ Project-URL: LoopGym, https://github.com/KanakMalpani/LoopGym
10
+ Project-URL: Loop Core Engineering, https://github.com/KanakMalpani/Loop-Core-Engineering
11
+ Author: Kanak Malpani
12
+ License: MIT
13
+ License-File: LICENSE
14
+ Keywords: benchmark,les,loop-engineering,loopbench
15
+ Classifier: Development Status :: 4 - Beta
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.12
19
+ Requires-Dist: jsonschema>=4.21
20
+ Requires-Dist: pyyaml>=6.0
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=8.0; extra == 'dev'
23
+ Requires-Dist: ruff>=0.4; extra == 'dev'
24
+ Description-Content-Type: text/markdown
25
+
26
+ <p align="center">
27
+ <strong>LoopBench</strong><br>
28
+ <em>MLPerf for loops.</em>
29
+ </p>
30
+
31
+ <p align="center">
32
+ <a href="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml"><img src="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml/badge.svg" alt="CI"></a>
33
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="MIT"></a>
34
+ <img src="https://img.shields.io/badge/python-3.12+-blue.svg" alt="Python 3.12+">
35
+ <a href="SUITE-OVERVIEW.md"><img src="https://img.shields.io/badge/suite-ALS_v2-blue.svg" alt="ALS v2"></a>
36
+ <img src="https://img.shields.io/badge/tasks-3-green.svg" alt="3 tasks">
37
+ </p>
38
+
39
+ ---
40
+
41
+ **LoopBench** is the public scoreboard for Loop Engineering — fixed tasks, fixed seeds, observed [LES](https://github.com/KanakMalpani/Loop-Core-Engineering/blob/main/specs/les-1.0.md), and a submission pipeline anyone can audit.
42
+
43
+ You bring an [LSS](https://github.com/KanakMalpani/Loop-Core-Engineering) loop spec. LoopBench runs it through [LoopGym](https://github.com/KanakMalpani/LoopGym), computes **LES_obs** across eight categories, validates your results JSON, and ranks you on the leaderboard. No hand-waved demos.
44
+
45
+ ```bash
46
+ loopbench run --task LB-CR-1 --spec your-loop.yaml --seeds 0,1,2,3,4 -o results.json
47
+ loopbench validate results.json
48
+ ```
49
+
50
+ <p align="center">
51
+ <a href="#-run-your-first-score"><strong>Run your first score →</strong></a> ·
52
+ <a href="leaderboard/entries.json">Leaderboard</a> ·
53
+ <a href="SUITE-OVERVIEW.md">Suite architecture</a>
54
+ </p>
55
+
56
+ <p align="center">
57
+ <img src="assets/demo.gif" alt="LoopBench CLI demo: install, list tasks, run, validate, rank" width="720">
58
+ </p>
59
+
60
+ ---
61
+
62
+ ## The contract
63
+
64
+ ```mermaid
65
+ flowchart LR
66
+ YOU[Your LSS spec]
67
+ LB[LoopBench<br/>tasks · scoring · schema]
68
+ LG[LoopGym<br/>execution]
69
+ OUT[results.json → leaderboard]
70
+
71
+ YOU --> LB
72
+ LB -->|env_id, seeds| LG
73
+ LG -->|trajectories| LB
74
+ LB --> OUT
75
+ ```
76
+
77
+ | Layer | Owns | Repo |
78
+ |-------|------|------|
79
+ | **Spec** | LSS schema, LES formulas | [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering) |
80
+ | **Data** | Trajectories (optional holdout) | [LoopNet](https://github.com/KanakMalpani/loopnet) |
81
+ | **Runtime** | `env.run_episode()` | [LoopGym](https://github.com/KanakMalpani/LoopGym) |
82
+ | **Measurement** | Tasks, LES_obs, submissions | **LoopBench** |
83
+
84
+ LoopBench **defines** and **scores**. LoopGym **runs**. Never the other way around.
85
+
86
+ ---
87
+
88
+ ## ⚡ Run your first score
89
+
90
+ ```bash
91
+ pip install git+https://github.com/KanakMalpani/LoopGym.git
92
+ pip install git+https://github.com/KanakMalpani/LoopBench.git
93
+
94
+ loopbench list
95
+
96
+ loopbench run \
97
+ --task LB-CR-1 \
98
+ --spec submissions/examples/spec-fast-loop.yaml \
99
+ --seeds 0,1,2,3,4 \
100
+ -o results.json
101
+
102
+ loopbench validate results.json
103
+ loopbench rank leaderboard/entries.json
104
+ ```
105
+
106
+ **Local dev** (sibling clones):
107
+
108
+ ```bash
109
+ git clone https://github.com/KanakMalpani/LoopGym.git
110
+ git clone https://github.com/KanakMalpani/LoopBench.git
111
+ cd LoopBench && pip install -e ../LoopGym -e ".[dev]"
112
+ ```
113
+
114
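+ On Windows, run the commands through the Python launcher (for example `py -3.12 -m pip install ...`) if `pip` is not bound to Python 3.12. PyPI: [PUBLISHING.md](PUBLISHING.md).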
115
+
116
+ ---
117
+
118
+ ## Tasks (v0.1 · ALS v2)
119
+
120
+ | ID | Name | Env | What it stress-tests |
121
+ |----|------|-----|----------------------|
122
+ | `LB-CR-1` | Code repair | `loopbench/code-repair-v1` | Effectiveness, speed, robustness |
123
+ | `LB-RS-1` | Research synthesis | `loopbench/research-synthesis-v1` | Effectiveness, cost |
124
+ | `LB-MA-1` | Multi-agent debate | `loopbench/multi-agent-debate-v1` | Autonomy, scalability |
125
+
126
+ Each task ships YAML + README under [`tasks/`](tasks/). Five seeds by default. Success@k + **LES_obs** composite.
127
+
128
+ ---
129
+
130
+ ## Metrics
131
+
132
+ | Metric | Meaning |
133
+ |--------|---------|
134
+ | **Success@k** | Fraction of instances reaching goal threshold `g_target` |
135
+ | **LES_obs** | Observed eight-category composite ∈ `[0, 1]` — see [`metrics/les-compute.md`](metrics/les-compute.md) |
136
+ | **Cost** | Estimated USD per run from LSS cost limits |
137
+ | **Robustness** | Quality retention across seeds |
138
+
139
+ Display scale `0–100` is optional (`les_display = les_observed × 100`).
140
+
141
+ ---
142
+
143
+ ## Submit to the leaderboard
144
+
145
+ 1. Run all tasks (or start with one):
146
+ `loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
147
+ 2. Validate: `loopbench validate results.json`
148
+ 3. Open a PR adding your entry to [`leaderboard/entries.json`](leaderboard/entries.json)
149
+
150
+ v0.1 rankings accept **SimEnv** submissions only (no API keys, fully reproducible). LiveEnv tier: v0.2.
151
+
152
+ ---
153
+
154
+ ## Repository layout
155
+
156
+ | Path | Purpose |
157
+ |------|---------|
158
+ | [`tasks/`](tasks/) | ALS v2 task definitions |
159
+ | [`metrics/les-compute.md`](metrics/les-compute.md) | LES_obs formulas |
160
+ | [`submit/schema.json`](submit/schema.json) | Submission JSON schema |
161
+ | [`loopbench/`](loopbench/) | Runner, LES compute, conformance |
162
+ | [`leaderboard/`](leaderboard/) | Public rankings (JSON v0.1) |
163
+ | [`submissions/examples/`](submissions/examples/) | Reference specs |
164
+
165
+ ---
166
+
167
+ ## Citation
168
+
169
+ ```bibtex
170
+ @software{loopbench2026,
171
+ title={LoopBench: Benchmark Suite for Loop Engineering},
172
+ author={Malpani, Kanak},
173
+ year={2026},
174
+ url={https://github.com/KanakMalpani/LoopBench}
175
+ }
176
+ ```
177
+
178
+ ---
179
+
180
+ <p align="center">
181
+ <sub>MIT · v0.1 · <a href="CONTRIBUTING.md">Contributing</a> · <a href="SECURITY.md">Security</a> · <a href="STATUS.md">Status</a></sub>
182
+ </p>
@@ -0,0 +1,56 @@
1
+ # 06 — loopbench
2
+
3
+ ## One-line purpose
4
+
5
+ **LoopBench** — the world's definitive loop benchmark suite, metrics, submission pipeline, and leaderboards.
6
+
7
+ ## Why this repo exists
8
+
9
+ Fields need **comparable scores**. LoopBench is MLPerf for loops — tasks, rules, anti-gaming, public ranks.
10
+
11
+ ## Scope (in scope)
12
+
13
+ - Task definitions (ALS v2): code-repair, research-synthesis, multi-agent-debate (+ expand)
14
+ - Oracle specifications per task
15
+ - Metrics: Success@k, LES_obs, cost, robustness (5 seeds), safety
16
+ - Submission format: container + LSS + results JSON
17
+ - Local runner CLI (`loopbench run`)
18
+ - Leaderboard schema (static JSON v0.1 → web v0.2)
19
+ - Conformance tests: "does submission follow rules?"
20
+
21
+ ## Scope (out of scope)
22
+
23
+ - Gym implementation → `05-loopgym` (LoopBench *defines*, LoopGym *runs*)
24
+
25
+ ## Deliverables v0.1
26
+
27
+ - [x] `tasks/` — 3 task specs (YAML + README each)
28
+ - [x] `metrics/les-compute.md` — how observed LES is computed on bench
29
+ - [x] `submit/schema.json` — results file format
30
+ - [x] `cli/loopbench.py` — local eval
31
+ - [x] `leaderboard/` — README + example entries JSON
32
+
33
+ ## Task IDs (initial)
34
+
35
+ | ID | Name |
36
+ |----|------|
37
+ | `LB-CR-1` | Code repair |
38
+ | `LB-RS-1` | Research synthesis |
39
+ | `LB-MA-1` | Multi-agent debate |
40
+
41
+ ## Dependencies
42
+
43
+ - **01-loop-engineering-core** — LES, LSS
44
+ - **05-loopgym** — execution engine
45
+
46
+ ## Success criteria
47
+
48
+ Two different LSS specs submitted locally; leaderboard JSON ranks by composite; CI runs golden submission.
49
+
50
+ ## Agent instructions
51
+
52
+ Align with the existing `benchmarks/` directory in the `Loop Engineering` repo, then **move** the canonical tasks here. Use fixed seeds and pinned dependencies.
53
+
54
+ ## Status
55
+
56
+ ✅ v0.1 shipped (2026-06-13) — see [STATUS.md](./STATUS.md)
@@ -0,0 +1,42 @@
1
+ # Publishing LoopBench to PyPI
2
+
3
+ **Prerequisite:** [LoopGym](https://pypi.org/project/loopgym/) must be published first.
4
+
5
+ ## One-time setup
6
+
7
+ 1. Create a PyPI project named **`loopbench`**.
8
+ 2. **Preferred:** configure [trusted publishing](https://docs.pypi.org/trusted-publishers/) on the PyPI project:
9
+ - **PyPI project name:** `loopbench`
10
+ - **Owner:** `KanakMalpani`
11
+ - **Repository name:** `LoopBench`
12
+ - **Workflow name:** `publish.yml`
13
+ - **Environment name:** *(leave blank)*
14
+
15
+ 3. **Fallback:** add **`PYPI_API_TOKEN`** (upload scope) to this repo's Actions secrets.
16
+
17
+ ## Publish
18
+
19
+ ```bash
20
+ git tag v0.1.0
21
+ git push origin v0.1.0
22
+ gh release create v0.1.0 --title "v0.1.0" --notes "Initial public release"
23
+ ```
24
+
25
+ Or trigger **Actions → Publish to PyPI** manually.
26
+
27
+ ## Install
28
+
29
+ ```bash
30
+ pip install loopbench loopgym
31
+ loopbench list
32
+ ```
33
+
34
+ ## Verify locally
35
+
36
+ ```bash
37
+ pip install -e ../LoopGym
38
+ pip install build
39
+ python -m build
40
+ pip install dist/loopbench-*.whl
41
+ pytest tests/ -q
42
+ ```
@@ -0,0 +1,157 @@
1
+ <p align="center">
2
+ <strong>LoopBench</strong><br>
3
+ <em>MLPerf for loops.</em>
4
+ </p>
5
+
6
+ <p align="center">
7
+ <a href="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml"><img src="https://github.com/KanakMalpani/LoopBench/actions/workflows/test.yml/badge.svg" alt="CI"></a>
8
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="MIT"></a>
9
+ <img src="https://img.shields.io/badge/python-3.12+-blue.svg" alt="Python 3.12+">
10
+ <a href="SUITE-OVERVIEW.md"><img src="https://img.shields.io/badge/suite-ALS_v2-blue.svg" alt="ALS v2"></a>
11
+ <img src="https://img.shields.io/badge/tasks-3-green.svg" alt="3 tasks">
12
+ </p>
13
+
14
+ ---
15
+
16
+ **LoopBench** is the public scoreboard for Loop Engineering — fixed tasks, fixed seeds, observed [LES](https://github.com/KanakMalpani/Loop-Core-Engineering/blob/main/specs/les-1.0.md), and a submission pipeline anyone can audit.
17
+
18
+ You bring an [LSS](https://github.com/KanakMalpani/Loop-Core-Engineering) loop spec. LoopBench runs it through [LoopGym](https://github.com/KanakMalpani/LoopGym), computes **LES_obs** across eight categories, validates your results JSON, and ranks you on the leaderboard. No hand-waved demos.
19
+
20
+ ```bash
21
+ loopbench run --task LB-CR-1 --spec your-loop.yaml --seeds 0,1,2,3,4 -o results.json
22
+ loopbench validate results.json
23
+ ```
24
+
25
+ <p align="center">
26
+ <a href="#-run-your-first-score"><strong>Run your first score →</strong></a> ·
27
+ <a href="leaderboard/entries.json">Leaderboard</a> ·
28
+ <a href="SUITE-OVERVIEW.md">Suite architecture</a>
29
+ </p>
30
+
31
+ <p align="center">
32
+ <img src="assets/demo.gif" alt="LoopBench CLI demo: install, list tasks, run, validate, rank" width="720">
33
+ </p>
34
+
35
+ ---
36
+
37
+ ## The contract
38
+
39
+ ```mermaid
40
+ flowchart LR
41
+ YOU[Your LSS spec]
42
+ LB[LoopBench<br/>tasks · scoring · schema]
43
+ LG[LoopGym<br/>execution]
44
+ OUT[results.json → leaderboard]
45
+
46
+ YOU --> LB
47
+ LB -->|env_id, seeds| LG
48
+ LG -->|trajectories| LB
49
+ LB --> OUT
50
+ ```
51
+
52
+ | Layer | Owns | Repo |
53
+ |-------|------|------|
54
+ | **Spec** | LSS schema, LES formulas | [Loop Core Engineering](https://github.com/KanakMalpani/Loop-Core-Engineering) |
55
+ | **Data** | Trajectories (optional holdout) | [LoopNet](https://github.com/KanakMalpani/loopnet) |
56
+ | **Runtime** | `env.run_episode()` | [LoopGym](https://github.com/KanakMalpani/LoopGym) |
57
+ | **Measurement** | Tasks, LES_obs, submissions | **LoopBench** |
58
+
59
+ LoopBench **defines** and **scores**. LoopGym **runs**. Never the other way around.
60
+
61
+ ---
62
+
63
+ ## ⚡ Run your first score
64
+
65
+ ```bash
66
+ pip install git+https://github.com/KanakMalpani/LoopGym.git
67
+ pip install git+https://github.com/KanakMalpani/LoopBench.git
68
+
69
+ loopbench list
70
+
71
+ loopbench run \
72
+ --task LB-CR-1 \
73
+ --spec submissions/examples/spec-fast-loop.yaml \
74
+ --seeds 0,1,2,3,4 \
75
+ -o results.json
76
+
77
+ loopbench validate results.json
78
+ loopbench rank leaderboard/entries.json
79
+ ```
80
+
81
+ **Local dev** (sibling clones):
82
+
83
+ ```bash
84
+ git clone https://github.com/KanakMalpani/LoopGym.git
85
+ git clone https://github.com/KanakMalpani/LoopBench.git
86
+ cd LoopBench && pip install -e ../LoopGym -e ".[dev]"
87
+ ```
88
+
89
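+ On Windows, run the commands through the Python launcher (for example `py -3.12 -m pip install ...`) if `pip` is not bound to Python 3.12. PyPI: [PUBLISHING.md](PUBLISHING.md).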
90
+
91
+ ---
92
+
93
+ ## Tasks (v0.1 · ALS v2)
94
+
95
+ | ID | Name | Env | What it stress-tests |
96
+ |----|------|-----|----------------------|
97
+ | `LB-CR-1` | Code repair | `loopbench/code-repair-v1` | Effectiveness, speed, robustness |
98
+ | `LB-RS-1` | Research synthesis | `loopbench/research-synthesis-v1` | Effectiveness, cost |
99
+ | `LB-MA-1` | Multi-agent debate | `loopbench/multi-agent-debate-v1` | Autonomy, scalability |
100
+
101
+ Each task ships YAML + README under [`tasks/`](tasks/). Five seeds by default. Success@k + **LES_obs** composite.
102
+
103
+ ---
104
+
105
+ ## Metrics
106
+
107
+ | Metric | Meaning |
108
+ |--------|---------|
109
+ | **Success@k** | Fraction of instances reaching goal threshold `g_target` |
110
+ | **LES_obs** | Observed eight-category composite ∈ `[0, 1]` — see [`metrics/les-compute.md`](metrics/les-compute.md) |
111
+ | **Cost** | Estimated USD per run from LSS cost limits |
112
+ | **Robustness** | Quality retention across seeds |
113
+
114
+ Display scale `0–100` is optional (`les_display = les_observed × 100`).
115
+
116
+ ---
117
+
118
+ ## Submit to the leaderboard
119
+
120
+ 1. Run all tasks (or start with one):
121
+ `loopbench run --task LB-CR-1,LB-RS-1,LB-MA-1 --spec your-loop.yaml -o results.json`
122
+ 2. Validate: `loopbench validate results.json`
123
+ 3. Open a PR adding your entry to [`leaderboard/entries.json`](leaderboard/entries.json)
124
+
125
+ v0.1 rankings accept **SimEnv** submissions only (no API keys, fully reproducible). LiveEnv tier: v0.2.
126
+
127
+ ---
128
+
129
+ ## Repository layout
130
+
131
+ | Path | Purpose |
132
+ |------|---------|
133
+ | [`tasks/`](tasks/) | ALS v2 task definitions |
134
+ | [`metrics/les-compute.md`](metrics/les-compute.md) | LES_obs formulas |
135
+ | [`submit/schema.json`](submit/schema.json) | Submission JSON schema |
136
+ | [`loopbench/`](loopbench/) | Runner, LES compute, conformance |
137
+ | [`leaderboard/`](leaderboard/) | Public rankings (JSON v0.1) |
138
+ | [`submissions/examples/`](submissions/examples/) | Reference specs |
139
+
140
+ ---
141
+
142
+ ## Citation
143
+
144
+ ```bibtex
145
+ @software{loopbench2026,
146
+ title={LoopBench: Benchmark Suite for Loop Engineering},
147
+ author={Malpani, Kanak},
148
+ year={2026},
149
+ url={https://github.com/KanakMalpani/LoopBench}
150
+ }
151
+ ```
152
+
153
+ ---
154
+
155
+ <p align="center">
156
+ <sub>MIT · v0.1 · <a href="CONTRIBUTING.md">Contributing</a> · <a href="SECURITY.md">Security</a> · <a href="STATUS.md">Status</a></sub>
157
+ </p>
@@ -0,0 +1,15 @@
1
+ # Security Policy
2
+
3
+ ## Supported versions
4
+
5
+ | Version | Supported |
6
+ |---------|-----------|
7
+ | 0.1.x | Yes |
8
+
9
+ ## Reporting
10
+
11
+ Report privately via [GitHub Security Advisories](https://github.com/KanakMalpani/LoopBench/security/advisories/new).
12
+
13
+ ## Submissions
14
+
15
+ Validate all results with `loopbench validate` before opening leaderboard PRs. v0.1 accepts SimEnv only for public rankings.
@@ -0,0 +1,31 @@
1
+ # Status
2
+
3
+ | Field | Value |
4
+ |-------|-------|
5
+ | **Phase** | v0.1 shipped |
6
+ | **Symbol** | ✅ |
7
+ | **Started** | 2026-06-13 |
8
+ | **Shipped** | 2026-06-13 |
9
+ | **Owner** | — |
10
+ | **Blockers** | — |
11
+ | **Notes** | Published at https://github.com/KanakMalpani/LoopBench; LiveEnv submissions are planned for v0.2 |
12
+
13
+ ## Completion checklist
14
+
15
+ - [x] `tasks/` — 3 task specs (YAML + README each)
16
+ - [x] `metrics/les-compute.md` — observed LES computation
17
+ - [x] `submit/schema.json` — results file format
18
+ - [x] `cli/loopbench.py` — local eval CLI
19
+ - [x] `leaderboard/` — README + example entries JSON
20
+ - [x] Golden submission tests + CI
21
+ - [x] `SYNC.md`, `LICENSE`, `SUITE-OVERVIEW.md`
22
+ - [x] LSS validation in CI (01-core)
23
+ - [x] Smoke tests for all 3 tasks
24
+ - [ ] PyPI publish (`pip install loopbench` — pending trusted publisher or token on PyPI)
25
+
26
+ ## Links
27
+
28
+ - Parent workspace: [../README.md](../README.md)
29
+ - LSS/LES: [../01-loop-engineering-core](../01-loop-engineering-core/)
30
+ - Runtime: [../05-loopgym](../05-loopgym/)
31
+ - Agent brief: [../AGENT-BRIEF.md](../AGENT-BRIEF.md)