reasonbench 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reasonbench-0.0.1/LICENSE +21 -0
- reasonbench-0.0.1/PKG-INFO +242 -0
- reasonbench-0.0.1/README.md +211 -0
- reasonbench-0.0.1/pyproject.toml +50 -0
- reasonbench-0.0.1/reasonbench/__init__.py +126 -0
- reasonbench-0.0.1/reasonbench/datasets.py +53 -0
- reasonbench-0.0.1/reasonbench/methods/__init__.py +9 -0
- reasonbench-0.0.1/reasonbench/methods/cot.py +55 -0
- reasonbench-0.0.1/reasonbench/methods/cot_sc.py +54 -0
- reasonbench-0.0.1/reasonbench/methods/foa.py +127 -0
- reasonbench-0.0.1/reasonbench/methods/got.py +123 -0
- reasonbench-0.0.1/reasonbench/methods/io.py +63 -0
- reasonbench-0.0.1/reasonbench/methods/rap.py +214 -0
- reasonbench-0.0.1/reasonbench/methods/react.py +54 -0
- reasonbench-0.0.1/reasonbench/methods/tot_bfs.py +93 -0
- reasonbench-0.0.1/reasonbench/methods/tot_dfs.py +169 -0
- reasonbench-0.0.1/reasonbench/models/__init__.py +4 -0
- reasonbench-0.0.1/reasonbench/models/anthropic.py +58 -0
- reasonbench-0.0.1/reasonbench/models/api.py +176 -0
- reasonbench-0.0.1/reasonbench/models/online.py +228 -0
- reasonbench-0.0.1/reasonbench/models/qroq.py +132 -0
- reasonbench-0.0.1/reasonbench/tasks/__init__.py +8 -0
- reasonbench-0.0.1/reasonbench/tasks/game24/__init__.py +3 -0
- reasonbench-0.0.1/reasonbench/tasks/game24/agents.py +440 -0
- reasonbench-0.0.1/reasonbench/tasks/game24/benchmark.py +51 -0
- reasonbench-0.0.1/reasonbench/tasks/game24/environment.py +85 -0
- reasonbench-0.0.1/reasonbench/tasks/game24/prompts.py +393 -0
- reasonbench-0.0.1/reasonbench/tasks/game24/state.py +49 -0
- reasonbench-0.0.1/reasonbench/tasks/hle/__init__.py +3 -0
- reasonbench-0.0.1/reasonbench/tasks/hle/agents.py +332 -0
- reasonbench-0.0.1/reasonbench/tasks/hle/benchmark.py +128 -0
- reasonbench-0.0.1/reasonbench/tasks/hle/environment.py +188 -0
- reasonbench-0.0.1/reasonbench/tasks/hle/prompts.py +200 -0
- reasonbench-0.0.1/reasonbench/tasks/hle/state.py +80 -0
- reasonbench-0.0.1/reasonbench/tasks/hotpotqa/__init__.py +3 -0
- reasonbench-0.0.1/reasonbench/tasks/hotpotqa/agents.py +411 -0
- reasonbench-0.0.1/reasonbench/tasks/hotpotqa/benchmark.py +104 -0
- reasonbench-0.0.1/reasonbench/tasks/hotpotqa/environment.py +123 -0
- reasonbench-0.0.1/reasonbench/tasks/hotpotqa/prompts.py +489 -0
- reasonbench-0.0.1/reasonbench/tasks/hotpotqa/state.py +61 -0
- reasonbench-0.0.1/reasonbench/tasks/humaneval/__init__.py +3 -0
- reasonbench-0.0.1/reasonbench/tasks/humaneval/agents.py +361 -0
- reasonbench-0.0.1/reasonbench/tasks/humaneval/benchmark.py +53 -0
- reasonbench-0.0.1/reasonbench/tasks/humaneval/environment.py +503 -0
- reasonbench-0.0.1/reasonbench/tasks/humaneval/prompts.py +152 -0
- reasonbench-0.0.1/reasonbench/tasks/humaneval/state.py +62 -0
- reasonbench-0.0.1/reasonbench/tasks/logiqa/__init__.py +3 -0
- reasonbench-0.0.1/reasonbench/tasks/logiqa/agents.py +258 -0
- reasonbench-0.0.1/reasonbench/tasks/logiqa/benchmark.py +81 -0
- reasonbench-0.0.1/reasonbench/tasks/logiqa/environment.py +78 -0
- reasonbench-0.0.1/reasonbench/tasks/logiqa/prompts.py +211 -0
- reasonbench-0.0.1/reasonbench/tasks/logiqa/state.py +66 -0
- reasonbench-0.0.1/reasonbench/tasks/matharena/__init__.py +0 -0
- reasonbench-0.0.1/reasonbench/tasks/matharena/agents.py +154 -0
- reasonbench-0.0.1/reasonbench/tasks/matharena/benchmark.py +105 -0
- reasonbench-0.0.1/reasonbench/tasks/matharena/environment.py +110 -0
- reasonbench-0.0.1/reasonbench/tasks/matharena/prompts.py +79 -0
- reasonbench-0.0.1/reasonbench/tasks/matharena/state.py +66 -0
- reasonbench-0.0.1/reasonbench/tasks/scibench/__init__.py +3 -0
- reasonbench-0.0.1/reasonbench/tasks/scibench/agents.py +460 -0
- reasonbench-0.0.1/reasonbench/tasks/scibench/benchmark.py +89 -0
- reasonbench-0.0.1/reasonbench/tasks/scibench/environment.py +103 -0
- reasonbench-0.0.1/reasonbench/tasks/scibench/prompts.py +242 -0
- reasonbench-0.0.1/reasonbench/tasks/scibench/state.py +61 -0
- reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/__init__.py +3 -0
- reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/agents.py +338 -0
- reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/benchmark.py +92 -0
- reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/environment.py +302 -0
- reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/prompts.py +382 -0
- reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/state.py +52 -0
- reasonbench-0.0.1/reasonbench/typedefs.py +154 -0
- reasonbench-0.0.1/reasonbench/utils.py +430 -0
- reasonbench-0.0.1/reasonbench.egg-info/PKG-INFO +242 -0
- reasonbench-0.0.1/reasonbench.egg-info/SOURCES.txt +76 -0
- reasonbench-0.0.1/reasonbench.egg-info/dependency_links.txt +1 -0
- reasonbench-0.0.1/reasonbench.egg-info/requires.txt +19 -0
- reasonbench-0.0.1/reasonbench.egg-info/top_level.txt +1 -0
- reasonbench-0.0.1/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) AU-CLAN
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: reasonbench
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Benchmark for evaluating the stability of LLM reasoning strategies
|
|
5
|
+
Author: AU-CLAN
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/au-clan/ReasonBench
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: diskcache
|
|
13
|
+
Requires-Dist: omegaconf
|
|
14
|
+
Requires-Dist: together
|
|
15
|
+
Requires-Dist: openai
|
|
16
|
+
Requires-Dist: sympy
|
|
17
|
+
Requires-Dist: torch
|
|
18
|
+
Requires-Dist: langchain
|
|
19
|
+
Requires-Dist: langchain-community
|
|
20
|
+
Requires-Dist: wikipedia
|
|
21
|
+
Requires-Dist: joblib
|
|
22
|
+
Requires-Dist: pyphen
|
|
23
|
+
Requires-Dist: syllables
|
|
24
|
+
Requires-Dist: pronouncing
|
|
25
|
+
Requires-Dist: groq
|
|
26
|
+
Requires-Dist: lazykey
|
|
27
|
+
Requires-Dist: python-dotenv
|
|
28
|
+
Requires-Dist: cachesaver
|
|
29
|
+
Requires-Dist: huggingface-hub
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning
|
|
33
|
+
|
|
34
|
+
**ReasonBENCH** is a benchmark suite and open-source library for controlled multi-run evaluation of LLM reasoning. It measures both the quality and stability of reasoning strategies by running repeated independent trials and reporting variance-aware metrics — including confidence intervals, run deviation, and global noise — rather than relying on single-run averages.
|
|
35
|
+
|
|
36
|
+
> *Preliminary work. Under review at the International Conference on Machine Learning (ICML).*
|
|
37
|
+
|
|
38
|
+
<!-- Leaderboard: http://reasonbench.github.io -->
|
|
39
|
+
|
|
40
|
+
## Motivation
|
|
41
|
+
|
|
42
|
+
LLM reasoning is typically evaluated using single runs, masking how much performance can vary across repeated executions. This practice obscures both reliability and cost, and can lead to misleading comparisons between methods and models. ReasonBENCH addresses this by repeating every model-strategy-task configuration with 10 independent trials and reporting distributional metrics alongside averages.
|
|
43
|
+
|
|
44
|
+
Key findings from our evaluation:
|
|
45
|
+
- **Run-to-run variability is substantial** — often large enough to change model/method rankings relative to single-run averages
|
|
46
|
+
- **Quality and cost stability decouple** — the most accurate strategy is not necessarily the most stable, and vice versa
|
|
47
|
+
- **Model scaling improves both quality and stability** — larger models within a family yield tighter distributions
|
|
48
|
+
- **Prompt refinements improve quality but not stability** — clarifying prompts and parsers boosts accuracy without reducing run-to-run variance
|
|
49
|
+
- **Reasoning effort scales cost, not quality** — increasing test-time reasoning effort primarily raises cost with limited and statistically insignificant quality gains
|
|
50
|
+
|
|
51
|
+
## Reasoning Strategies
|
|
52
|
+
|
|
53
|
+
We implement 10 representative reasoning strategies using a standardized interface:
|
|
54
|
+
|
|
55
|
+
| Strategy | Type | Reference |
|
|
56
|
+
|----------|------|-----------|
|
|
57
|
+
| **IO** | Direct | — |
|
|
58
|
+
| **CoT** | Direct | Wei et al., 2022 |
|
|
59
|
+
| **CoT-SC** | Direct | Wang et al., 2023 |
|
|
60
|
+
| **ReAct** | Adaptive | Yao et al., 2023b |
|
|
61
|
+
| **Reflexion** | Adaptive | Shinn et al., 2023 |
|
|
62
|
+
| **ToT-BFS** | Structured | Yao et al., 2023a |
|
|
63
|
+
| **ToT-DFS** | Structured | Yao et al., 2023a |
|
|
64
|
+
| **GoT** | Structured | Besta et al., 2024 |
|
|
65
|
+
| **RAP** | Planning | Hao et al., 2023 |
|
|
66
|
+
| **FoA** | Evolutionary | Klein et al., 2025 |
|
|
67
|
+
|
|
68
|
+
## Benchmarks
|
|
69
|
+
|
|
70
|
+
6 tasks spanning diverse reasoning domains:
|
|
71
|
+
|
|
72
|
+
| Task | Domain | Metric | Size |
|
|
73
|
+
|------|--------|--------|------|
|
|
74
|
+
| **Game of 24** | Mathematical reasoning | Accuracy | 100 |
|
|
75
|
+
| **SciBench** | Scientific reasoning | Accuracy (exact match) | 109 |
|
|
76
|
+
| **HumanEval** | Code generation | pass@1 | 100 |
|
|
77
|
+
| **HotPotQA** | Multi-hop QA | Exact match | 100 |
|
|
78
|
+
| **Sonnet Writing** | Creative writing | Accuracy (rhyme + words) | 50 |
|
|
79
|
+
| **HLE** | General reasoning (Humanity's Last Exam) | Accuracy | 50 |
|
|
80
|
+
|
|
81
|
+
## Evaluated Models
|
|
82
|
+
|
|
83
|
+
10 contemporary reasoning models from 6 providers:
|
|
84
|
+
|
|
85
|
+
| Model | Provider |
|
|
86
|
+
|-------|----------|
|
|
87
|
+
| GPT-4.1 Nano, GPT-4.1 Mini | OpenAI |
|
|
88
|
+
| GPT-5 Nano, GPT-5 Mini | OpenAI |
|
|
89
|
+
| GPT-OSS 120B | Together AI |
|
|
90
|
+
| DeepSeek R1 | Together AI |
|
|
91
|
+
| Llama 4 Maverick | Together AI |
|
|
92
|
+
| Qwen3-235B Thinking | Together AI |
|
|
93
|
+
| Claude Haiku 4.5 | Anthropic |
|
|
94
|
+
| Gemini 3 Flash | Google |
|
|
95
|
+
|
|
96
|
+
## Setup
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install -r requirements.txt
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
You also need [CacheSaver](https://github.com/au-clan/cachesaver) — a client-side inference optimization framework for efficient, affordable, and reproducible LLM inference:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
pip install cachesaver
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Set your API keys as environment variables:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
export OPENAI_API_KEY="sk-..."
|
|
112
|
+
# and/or other provider keys
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Quick Start
|
|
116
|
+
|
|
117
|
+
The simplest way to run an experiment is via the shell script:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
bash scripts/simple/simple.sh
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Edit the variables at the top of `scripts/simple/simple.sh` to change the benchmark, method, model, and split.
|
|
124
|
+
|
|
125
|
+
For direct invocation:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
python scripts/simple/simple.py \
|
|
129
|
+
--benchmark game24 \
|
|
130
|
+
--method tot_bfs \
|
|
131
|
+
--split mini \
|
|
132
|
+
--provider openai \
|
|
133
|
+
--api_key OPENAI_API_KEY \
|
|
134
|
+
--model gpt-4.1-nano \
|
|
135
|
+
--temperature 1.0 \
|
|
136
|
+
--max_completion_tokens 10000 \
|
|
137
|
+
--top_p 1.0 \
|
|
138
|
+
--batch_size 1 \
|
|
139
|
+
--timeout 2.0 \
|
|
140
|
+
--correctness 1 \
|
|
141
|
+
--allow_batch_overflow 1 \
|
|
142
|
+
--ns_ratio 0.0 \
|
|
143
|
+
--value_cache
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Key arguments
|
|
147
|
+
|
|
148
|
+
| Argument | Description |
|
|
149
|
+
|----------|-------------|
|
|
150
|
+
| `--benchmark` | Task name: `game24`, `humaneval`, `hotpotqa`, `scibench`, `hle`, `sonnetwriting` |
|
|
151
|
+
| `--method` | Reasoning method: `io`, `cot`, `cot_sc`, `foa`, `tot_bfs`, `tot_dfs`, `got`, `react`, `rap` |
|
|
152
|
+
| `--split` | Dataset split: `train`, `validation`, `test`, `mini` |
|
|
153
|
+
| `--provider` | LLM provider: `openai`, `gemini`, `anthropic`, `groq`, `together` |
|
|
154
|
+
| `--model` | Model identifier (e.g., `gpt-4.1-nano`, `claude-haiku-4-5`) |
|
|
155
|
+
| `--ns_ratio` | Namespace ratio (0.0–1.0) for controlling parallel execution |
|
|
156
|
+
|
|
157
|
+
## Evaluation Metrics
|
|
158
|
+
|
|
159
|
+
For each model-strategy-task configuration, we report metrics along two dimensions:
|
|
160
|
+
|
|
161
|
+
**Quality:**
|
|
162
|
+
- **Average** — stratified bootstrap mean over runs; benchmarks treated as strata
|
|
163
|
+
- **Run Deviation** — typical run-to-run deviation from the strategy mean per benchmark
|
|
164
|
+
- **Noise (Global)** — variance of z-scored outcomes across all benchmarks
|
|
165
|
+
- **Noise (Run)** — average within-benchmark z-score variance
|
|
166
|
+
|
|
167
|
+
**Cost:**
|
|
168
|
+
- Same four metrics computed over token usage and wall-clock time, expressed in USD
|
|
169
|
+
|
|
170
|
+
## Configuration
|
|
171
|
+
|
|
172
|
+
Method hyperparameters are defined per task in YAML files under `scripts/configs/`:
|
|
173
|
+
|
|
174
|
+
```yaml
|
|
175
|
+
# scripts/configs/game24.yaml
|
|
176
|
+
tot_bfs:
|
|
177
|
+
num_selections: 3
|
|
178
|
+
num_steps: 4
|
|
179
|
+
num_evaluations: 3
|
|
180
|
+
|
|
181
|
+
got:
|
|
182
|
+
num_selections: 5
|
|
183
|
+
num_steps: 4
|
|
184
|
+
num_generate: 10
|
|
185
|
+
num_evaluations: 3
|
|
186
|
+
num_best: 2
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Decoding parameters (temperature, top_p, max tokens) are sourced from `scripts/configs/<task>.env`.
|
|
190
|
+
|
|
191
|
+
## Architecture
|
|
192
|
+
|
|
193
|
+
ReasonBENCH is organized around four core abstractions:
|
|
194
|
+
|
|
195
|
+
- **Method** — specifies the reasoning strategy independently of the model or task. Integrates agents, the environment, and the model, and exposes a standard `solve()` interface.
|
|
196
|
+
- **Environment** — formalizes task-specific dynamics: state transitions, action validation, terminal conditions, and evaluation.
|
|
197
|
+
- **Agent** — defines the interface between methods, models, and states. Agents construct prompts, issue queries, and parse responses into actions.
|
|
198
|
+
- **Model** — uniform interface for LLM providers, supporting async execution and integrated with CacheSaver for response caching and deduplication.
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
reasonbench/
|
|
202
|
+
├── models/ # LLM provider adapters (OpenAI, Anthropic, Groq, Together, Gemini)
|
|
203
|
+
├── methods/ # Reasoning strategy implementations
|
|
204
|
+
├── tasks/ # Task definitions (state, environment, agents, prompts)
|
|
205
|
+
│ ├── game24/
|
|
206
|
+
│ ├── humaneval/
|
|
207
|
+
│ ├── hotpotqa/
|
|
208
|
+
│ └── ...
|
|
209
|
+
├── __init__.py # Factory registrations
|
|
210
|
+
├── typedefs.py # Core ABCs and type definitions
|
|
211
|
+
└── utils.py # Logging and utility functions
|
|
212
|
+
|
|
213
|
+
scripts/
|
|
214
|
+
├── simple/ # Single-run experiment scripts
|
|
215
|
+
├── repeats/ # Batch/repeated experiment scripts
|
|
216
|
+
├── cached/ # Cached inference scripts
|
|
217
|
+
└── configs/ # YAML and .env configuration files
|
|
218
|
+
|
|
219
|
+
datasets/ # Gzip-compressed task datasets
|
|
220
|
+
tests/ # Pytest test suite
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Tests
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
pytest # run all tests
|
|
227
|
+
pytest tests/got/test_game24.py # single file
|
|
228
|
+
pytest tests/got/test_game24.py -k "test_x" # single test
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Tests use async fixtures and require valid API keys (Groq/OpenAI) for the mock LLM clients.
|
|
232
|
+
|
|
233
|
+
## Citation
|
|
234
|
+
|
|
235
|
+
```bibtex
|
|
236
|
+
@inproceedings{reasonbench2025,
|
|
237
|
+
title={ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning},
|
|
238
|
+
author={Anonymous},
|
|
239
|
+
year={2025},
|
|
240
|
+
note={Under review at ICML}
|
|
241
|
+
}
|
|
242
|
+
```
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning
|
|
2
|
+
|
|
3
|
+
**ReasonBENCH** is a benchmark suite and open-source library for controlled multi-run evaluation of LLM reasoning. It measures both the quality and stability of reasoning strategies by running repeated independent trials and reporting variance-aware metrics — including confidence intervals, run deviation, and global noise — rather than relying on single-run averages.
|
|
4
|
+
|
|
5
|
+
> *Preliminary work. Under review at the International Conference on Machine Learning (ICML).*
|
|
6
|
+
|
|
7
|
+
<!-- Leaderboard: http://reasonbench.github.io -->
|
|
8
|
+
|
|
9
|
+
## Motivation
|
|
10
|
+
|
|
11
|
+
LLM reasoning is typically evaluated using single runs, masking how much performance can vary across repeated executions. This practice obscures both reliability and cost, and can lead to misleading comparisons between methods and models. ReasonBENCH addresses this by repeating every model-strategy-task configuration with 10 independent trials and reporting distributional metrics alongside averages.
|
|
12
|
+
|
|
13
|
+
Key findings from our evaluation:
|
|
14
|
+
- **Run-to-run variability is substantial** — often large enough to change model/method rankings relative to single-run averages
|
|
15
|
+
- **Quality and cost stability decouple** — the most accurate strategy is not necessarily the most stable, and vice versa
|
|
16
|
+
- **Model scaling improves both quality and stability** — larger models within a family yield tighter distributions
|
|
17
|
+
- **Prompt refinements improve quality but not stability** — clarifying prompts and parsers boosts accuracy without reducing run-to-run variance
|
|
18
|
+
- **Reasoning effort scales cost, not quality** — increasing test-time reasoning effort primarily raises cost with limited and statistically insignificant quality gains
|
|
19
|
+
|
|
20
|
+
## Reasoning Strategies
|
|
21
|
+
|
|
22
|
+
We implement 10 representative reasoning strategies using a standardized interface:
|
|
23
|
+
|
|
24
|
+
| Strategy | Type | Reference |
|
|
25
|
+
|----------|------|-----------|
|
|
26
|
+
| **IO** | Direct | — |
|
|
27
|
+
| **CoT** | Direct | Wei et al., 2022 |
|
|
28
|
+
| **CoT-SC** | Direct | Wang et al., 2023 |
|
|
29
|
+
| **ReAct** | Adaptive | Yao et al., 2023b |
|
|
30
|
+
| **Reflexion** | Adaptive | Shinn et al., 2023 |
|
|
31
|
+
| **ToT-BFS** | Structured | Yao et al., 2023a |
|
|
32
|
+
| **ToT-DFS** | Structured | Yao et al., 2023a |
|
|
33
|
+
| **GoT** | Structured | Besta et al., 2024 |
|
|
34
|
+
| **RAP** | Planning | Hao et al., 2023 |
|
|
35
|
+
| **FoA** | Evolutionary | Klein et al., 2025 |
|
|
36
|
+
|
|
37
|
+
## Benchmarks
|
|
38
|
+
|
|
39
|
+
6 tasks spanning diverse reasoning domains:
|
|
40
|
+
|
|
41
|
+
| Task | Domain | Metric | Size |
|
|
42
|
+
|------|--------|--------|------|
|
|
43
|
+
| **Game of 24** | Mathematical reasoning | Accuracy | 100 |
|
|
44
|
+
| **SciBench** | Scientific reasoning | Accuracy (exact match) | 109 |
|
|
45
|
+
| **HumanEval** | Code generation | pass@1 | 100 |
|
|
46
|
+
| **HotPotQA** | Multi-hop QA | Exact match | 100 |
|
|
47
|
+
| **Sonnet Writing** | Creative writing | Accuracy (rhyme + words) | 50 |
|
|
48
|
+
| **HLE** | General reasoning (Humanity's Last Exam) | Accuracy | 50 |
|
|
49
|
+
|
|
50
|
+
## Evaluated Models
|
|
51
|
+
|
|
52
|
+
10 contemporary reasoning models from 6 providers:
|
|
53
|
+
|
|
54
|
+
| Model | Provider |
|
|
55
|
+
|-------|----------|
|
|
56
|
+
| GPT-4.1 Nano, GPT-4.1 Mini | OpenAI |
|
|
57
|
+
| GPT-5 Nano, GPT-5 Mini | OpenAI |
|
|
58
|
+
| GPT-OSS 120B | Together AI |
|
|
59
|
+
| DeepSeek R1 | Together AI |
|
|
60
|
+
| Llama 4 Maverick | Together AI |
|
|
61
|
+
| Qwen3-235B Thinking | Together AI |
|
|
62
|
+
| Claude Haiku 4.5 | Anthropic |
|
|
63
|
+
| Gemini 3 Flash | Google |
|
|
64
|
+
|
|
65
|
+
## Setup
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install -r requirements.txt
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
You also need [CacheSaver](https://github.com/au-clan/cachesaver) — a client-side inference optimization framework for efficient, affordable, and reproducible LLM inference:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install cachesaver
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Set your API keys as environment variables:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
export OPENAI_API_KEY="sk-..."
|
|
81
|
+
# and/or other provider keys
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Quick Start
|
|
85
|
+
|
|
86
|
+
The simplest way to run an experiment is via the shell script:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
bash scripts/simple/simple.sh
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Edit the variables at the top of `scripts/simple/simple.sh` to change the benchmark, method, model, and split.
|
|
93
|
+
|
|
94
|
+
For direct invocation:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
python scripts/simple/simple.py \
|
|
98
|
+
--benchmark game24 \
|
|
99
|
+
--method tot_bfs \
|
|
100
|
+
--split mini \
|
|
101
|
+
--provider openai \
|
|
102
|
+
--api_key OPENAI_API_KEY \
|
|
103
|
+
--model gpt-4.1-nano \
|
|
104
|
+
--temperature 1.0 \
|
|
105
|
+
--max_completion_tokens 10000 \
|
|
106
|
+
--top_p 1.0 \
|
|
107
|
+
--batch_size 1 \
|
|
108
|
+
--timeout 2.0 \
|
|
109
|
+
--correctness 1 \
|
|
110
|
+
--allow_batch_overflow 1 \
|
|
111
|
+
--ns_ratio 0.0 \
|
|
112
|
+
--value_cache
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Key arguments
|
|
116
|
+
|
|
117
|
+
| Argument | Description |
|
|
118
|
+
|----------|-------------|
|
|
119
|
+
| `--benchmark` | Task name: `game24`, `humaneval`, `hotpotqa`, `scibench`, `hle`, `sonnetwriting` |
|
|
120
|
+
| `--method` | Reasoning method: `io`, `cot`, `cot_sc`, `foa`, `tot_bfs`, `tot_dfs`, `got`, `react`, `rap` |
|
|
121
|
+
| `--split` | Dataset split: `train`, `validation`, `test`, `mini` |
|
|
122
|
+
| `--provider` | LLM provider: `openai`, `gemini`, `anthropic`, `groq`, `together` |
|
|
123
|
+
| `--model` | Model identifier (e.g., `gpt-4.1-nano`, `claude-haiku-4-5`) |
|
|
124
|
+
| `--ns_ratio` | Namespace ratio (0.0–1.0) for controlling parallel execution |
|
|
125
|
+
|
|
126
|
+
## Evaluation Metrics
|
|
127
|
+
|
|
128
|
+
For each model-strategy-task configuration, we report metrics along two dimensions:
|
|
129
|
+
|
|
130
|
+
**Quality:**
|
|
131
|
+
- **Average** — stratified bootstrap mean over runs; benchmarks treated as strata
|
|
132
|
+
- **Run Deviation** — typical run-to-run deviation from the strategy mean per benchmark
|
|
133
|
+
- **Noise (Global)** — variance of z-scored outcomes across all benchmarks
|
|
134
|
+
- **Noise (Run)** — average within-benchmark z-score variance
|
|
135
|
+
|
|
136
|
+
**Cost:**
|
|
137
|
+
- Same four metrics computed over token usage and wall-clock time, expressed in USD
|
|
138
|
+
|
|
139
|
+
## Configuration
|
|
140
|
+
|
|
141
|
+
Method hyperparameters are defined per task in YAML files under `scripts/configs/`:
|
|
142
|
+
|
|
143
|
+
```yaml
|
|
144
|
+
# scripts/configs/game24.yaml
|
|
145
|
+
tot_bfs:
|
|
146
|
+
num_selections: 3
|
|
147
|
+
num_steps: 4
|
|
148
|
+
num_evaluations: 3
|
|
149
|
+
|
|
150
|
+
got:
|
|
151
|
+
num_selections: 5
|
|
152
|
+
num_steps: 4
|
|
153
|
+
num_generate: 10
|
|
154
|
+
num_evaluations: 3
|
|
155
|
+
num_best: 2
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Decoding parameters (temperature, top_p, max tokens) are sourced from `scripts/configs/<task>.env`.
|
|
159
|
+
|
|
160
|
+
## Architecture
|
|
161
|
+
|
|
162
|
+
ReasonBENCH is organized around four core abstractions:
|
|
163
|
+
|
|
164
|
+
- **Method** — specifies the reasoning strategy independently of the model or task. Integrates agents, the environment, and the model, and exposes a standard `solve()` interface.
|
|
165
|
+
- **Environment** — formalizes task-specific dynamics: state transitions, action validation, terminal conditions, and evaluation.
|
|
166
|
+
- **Agent** — defines the interface between methods, models, and states. Agents construct prompts, issue queries, and parse responses into actions.
|
|
167
|
+
- **Model** — uniform interface for LLM providers, supporting async execution and integrated with CacheSaver for response caching and deduplication.
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
reasonbench/
|
|
171
|
+
├── models/ # LLM provider adapters (OpenAI, Anthropic, Groq, Together, Gemini)
|
|
172
|
+
├── methods/ # Reasoning strategy implementations
|
|
173
|
+
├── tasks/ # Task definitions (state, environment, agents, prompts)
|
|
174
|
+
│ ├── game24/
|
|
175
|
+
│ ├── humaneval/
|
|
176
|
+
│ ├── hotpotqa/
|
|
177
|
+
│ └── ...
|
|
178
|
+
├── __init__.py # Factory registrations
|
|
179
|
+
├── typedefs.py # Core ABCs and type definitions
|
|
180
|
+
└── utils.py # Logging and utility functions
|
|
181
|
+
|
|
182
|
+
scripts/
|
|
183
|
+
├── simple/ # Single-run experiment scripts
|
|
184
|
+
├── repeats/ # Batch/repeated experiment scripts
|
|
185
|
+
├── cached/ # Cached inference scripts
|
|
186
|
+
└── configs/ # YAML and .env configuration files
|
|
187
|
+
|
|
188
|
+
datasets/ # Gzip-compressed task datasets
|
|
189
|
+
tests/ # Pytest test suite
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Tests
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
pytest # run all tests
|
|
196
|
+
pytest tests/got/test_game24.py # single file
|
|
197
|
+
pytest tests/got/test_game24.py -k "test_x" # single test
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Tests use async fixtures and require valid API keys (Groq/OpenAI) for the mock LLM clients.
|
|
201
|
+
|
|
202
|
+
## Citation
|
|
203
|
+
|
|
204
|
+
```bibtex
|
|
205
|
+
@inproceedings{reasonbench2025,
|
|
206
|
+
title={ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning},
|
|
207
|
+
author={Anonymous},
|
|
208
|
+
year={2025},
|
|
209
|
+
note={Under review at ICML}
|
|
210
|
+
}
|
|
211
|
+
```
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "reasonbench"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "Benchmark for evaluating the stability of LLM reasoning strategies"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "AU-CLAN"},
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"pandas",
|
|
17
|
+
"diskcache",
|
|
18
|
+
"omegaconf",
|
|
19
|
+
"together",
|
|
20
|
+
"openai",
|
|
21
|
+
"sympy",
|
|
22
|
+
"torch",
|
|
23
|
+
"langchain",
|
|
24
|
+
"langchain-community",
|
|
25
|
+
"wikipedia",
|
|
26
|
+
"joblib",
|
|
27
|
+
"pyphen",
|
|
28
|
+
"syllables",
|
|
29
|
+
"pronouncing",
|
|
30
|
+
"groq",
|
|
31
|
+
"lazykey",
|
|
32
|
+
"python-dotenv",
|
|
33
|
+
"cachesaver",
|
|
34
|
+
"huggingface-hub",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Repository = "https://github.com/au-clan/ReasonBench"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
include = ["reasonbench*"]
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.package-data]
|
|
44
|
+
reasonbench = ["configs/*.yaml", "configs/*.env"]
|
|
45
|
+
|
|
46
|
+
[tool.pytest.ini_options]
|
|
47
|
+
pythonpath = ["."]
|
|
48
|
+
asyncio_mode = "auto"
|
|
49
|
+
asyncio_default_test_loop_scope = "class"
|
|
50
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from typing import TypedDict
|
|
2
|
+
from .typedefs import DecodingParameters
|
|
3
|
+
from .datasets import get_dataset_path
|
|
4
|
+
|
|
5
|
+
class BenchmarkFactory:
    """Registry that maps task names to benchmark classes.

    Classes are stored under their lowercased class name, and ``get`` looks
    them up under ``"benchmark" + task`` (lowercased); a class named
    ``BenchmarkGame24`` is therefore returned for ``task="game24"``.
    """

    registry = {}

    @classmethod
    def register(cls, benchmark_cls):
        """Store ``benchmark_cls`` in the registry and return it unchanged.

        Returning the class allows this method to be used as a decorator.
        """
        cls.registry[benchmark_cls.__name__.lower()] = benchmark_cls
        return benchmark_cls

    @classmethod
    def get(cls, task: str, *args, **kwargs):
        """Instantiate the benchmark registered for ``task``.

        Args:
            task: Task name; matched case-insensitively against the registry.
            *args: Positional arguments forwarded to the benchmark constructor.
            **kwargs: Keyword arguments forwarded to the benchmark constructor.

        Returns:
            A new instance of the registered benchmark class, constructed with
            ``path=get_dataset_path(task)`` in addition to the forwarded
            arguments.

        Raises:
            ValueError: If no benchmark class is registered for ``task``.
        """
        key = f"benchmark{task}".lower()
        # Only the registry lookup is guarded: a KeyError raised while
        # resolving the dataset path or inside the constructor is a different
        # failure and must not be reported as "No benchmark found".
        try:
            benchmark_cls = cls.registry[key]
        except KeyError:
            raise ValueError(f"No benchmark found for task={task}") from None
        path = get_dataset_path(task)
        return benchmark_cls(*args, path=path, **kwargs)
|
|
21
|
+
|
|
22
|
+
class EnvironmentFactory:
    """Registry that maps task names to environment classes.

    Classes are stored under their lowercased class name, and ``get`` looks
    them up under ``"environment" + task`` (lowercased).
    """

    registry = {}

    @classmethod
    def register(cls, env_cls):
        """Store ``env_cls`` in the registry and return it unchanged.

        Returning the class allows this method to be used as a decorator.
        """
        cls.registry[env_cls.__name__.lower()] = env_cls
        return env_cls

    @classmethod
    def get(cls, task: str, *args, **kwargs):
        """Instantiate the environment registered for ``task``.

        Args:
            task: Task name; matched case-insensitively against the registry.
            *args: Positional arguments forwarded to the environment constructor.
            **kwargs: Keyword arguments forwarded to the environment constructor.

        Returns:
            A new instance of the registered environment class.

        Raises:
            ValueError: If no environment class is registered for ``task``.
        """
        key = f"environment{task}".lower()
        # Guard only the lookup so that a KeyError raised by the constructor
        # itself propagates instead of being reported as a missing environment.
        try:
            env_cls = cls.registry[key]
        except KeyError:
            raise ValueError(f"No environment found for task={task}") from None
        return env_cls(*args, **kwargs)
|
|
37
|
+
|
|
38
|
+
class AgentFactory:
    """Registry that maps (agent type, benchmark) pairs to agent classes.

    Classes are stored under their lowercased class name, and ``get`` looks
    them up under ``"agent" + agent_type + benchmark`` (lowercased).
    """

    registry = {}

    @classmethod
    def register(cls, agent_cls):
        """Store ``agent_cls`` in the registry and return it unchanged."""
        registry_name = agent_cls.__name__.lower()
        cls.registry[registry_name] = agent_cls
        return agent_cls

    @classmethod
    def get(cls, agent_type: str, benchmark: str, *args, **kwargs):
        """Return the agent class registered for ``agent_type`` and ``benchmark``.

        The class itself is returned, not an instance; ``*args`` and
        ``**kwargs`` are accepted but not used.

        Raises:
            ValueError: If no matching agent class is registered.
        """
        lookup_key = f"agent{agent_type}{benchmark}".lower()
        try:
            agent_cls = cls.registry[lookup_key]
        except KeyError:
            raise ValueError(f"No agent found for type={agent_type}, benchmark={benchmark}")
        else:
            # Deliberately not instantiated here.
            return agent_cls
|
|
53
|
+
|
|
54
|
+
class AgentDictFactory:
    """Registry that maps method names to agent-dict classes.

    Classes are stored under their lowercased class name, and ``get`` looks
    them up under ``"agentdict" + method`` (lowercased).
    """

    registry = {}

    @classmethod
    def register(cls, agent_dict_cls):
        """Store ``agent_dict_cls`` in the registry and return it unchanged.

        Returning the class allows this method to be used as a decorator.
        """
        cls.registry[agent_dict_cls.__name__.lower()] = agent_dict_cls
        return agent_dict_cls

    @classmethod
    def get(cls, method: str, *args, **kwargs):
        """Instantiate the agent-dict class registered for ``method``.

        Args:
            method: Method name; matched case-insensitively against the registry.
            *args: Positional arguments forwarded to the constructor.
            **kwargs: Keyword arguments forwarded to the constructor.

        Returns:
            A new instance of the registered agent-dict class.

        Raises:
            ValueError: If no agent-dict class is registered for ``method``.
        """
        key = f"agentdict{method}".lower()
        # Guard only the lookup so that a KeyError raised by the constructor
        # itself propagates instead of being reported as a missing entry.
        try:
            agent_dict_cls = cls.registry[key]
        except KeyError:
            raise ValueError(f"No agent dict found for method={method}") from None
        return agent_dict_cls(*args, **kwargs)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class MethodFactory:
    """Registry that maps method names to reasoning-method classes.

    Classes are stored under their lowercased class name, and ``get`` looks
    them up under ``"method" + method`` (lowercased). ``get`` also resolves
    the agent classes each method needs through ``AgentFactory``.
    """

    registry = {}

    # For each supported method: role name -> agent type passed to
    # AgentFactory.get. Insertion order is the order in which agents are
    # resolved and in which they appear in the ``agents`` dict.
    _AGENT_TYPES = {
        "io": {"step": "io"},
        "cot": {"step": "cot"},
        "cot_sc": {"step": "cot"},
        "foa": {"step": "act", "evaluate": "evaluate"},
        "tot_bfs": {"step": "bfs", "evaluate": "evaluate"},
        "tot_dfs": {"step": "bfs", "evaluate": "evaluate"},
        "got": {"step": "act", "aggregate": "aggregate", "evaluate": "evaluate"},
        "rap": {"step": "react", "evaluate": "selfevaluate"},
        "react": {"step": "react"},
    }

    @classmethod
    def register(cls, method_cls):
        """Store ``method_cls`` in the registry and return it unchanged.

        Returning the class allows this method to be used as a decorator.
        """
        cls.registry[method_cls.__name__.lower()] = method_cls
        return method_cls

    @classmethod
    def get(cls, method: str, benchmark: str, params: DecodingParameters, *args, **kwargs):
        """Instantiate the reasoning method registered for ``method``.

        Args:
            method: Method name. The agent selection matches it exactly
                (case-sensitive); the registry lookup is case-insensitive.
            benchmark: Benchmark name used to resolve each agent class.
            params: Decoding parameters shared by every agent of the method.
            *args: Positional arguments forwarded to the method constructor.
            **kwargs: Keyword arguments forwarded to the method constructor.

        Returns:
            A new instance of the registered method class, constructed with
            ``agents=`` set to a dict holding one agent class per role plus a
            ``"<role>_params"`` entry per role.

        Raises:
            NotImplementedError: If ``method`` has no agent configuration.
            ValueError: If a required agent class or the method class is not
                registered.
        """
        key = f"method{method}".lower()

        roles = cls._AGENT_TYPES.get(method)
        if roles is None:
            raise NotImplementedError(f"Method {method} is not implemented yet.")

        agents = {
            role: AgentFactory.get(agent_type, benchmark)
            for role, agent_type in roles.items()
        }

        # For the moment, only supporting same params for all agents
        agents.update({f"{role}_params": params for role in roles})

        # Guard only the lookup so that a KeyError raised by the method
        # constructor propagates instead of being reported as a missing method.
        try:
            method_cls = cls.registry[key]
        except KeyError:
            raise ValueError(f"No method found for name={method}") from None
        return method_cls(*args, agents=agents, **kwargs)
|