reasonbench 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. reasonbench-0.0.1/LICENSE +21 -0
  2. reasonbench-0.0.1/PKG-INFO +242 -0
  3. reasonbench-0.0.1/README.md +211 -0
  4. reasonbench-0.0.1/pyproject.toml +50 -0
  5. reasonbench-0.0.1/reasonbench/__init__.py +126 -0
  6. reasonbench-0.0.1/reasonbench/datasets.py +53 -0
  7. reasonbench-0.0.1/reasonbench/methods/__init__.py +9 -0
  8. reasonbench-0.0.1/reasonbench/methods/cot.py +55 -0
  9. reasonbench-0.0.1/reasonbench/methods/cot_sc.py +54 -0
  10. reasonbench-0.0.1/reasonbench/methods/foa.py +127 -0
  11. reasonbench-0.0.1/reasonbench/methods/got.py +123 -0
  12. reasonbench-0.0.1/reasonbench/methods/io.py +63 -0
  13. reasonbench-0.0.1/reasonbench/methods/rap.py +214 -0
  14. reasonbench-0.0.1/reasonbench/methods/react.py +54 -0
  15. reasonbench-0.0.1/reasonbench/methods/tot_bfs.py +93 -0
  16. reasonbench-0.0.1/reasonbench/methods/tot_dfs.py +169 -0
  17. reasonbench-0.0.1/reasonbench/models/__init__.py +4 -0
  18. reasonbench-0.0.1/reasonbench/models/anthropic.py +58 -0
  19. reasonbench-0.0.1/reasonbench/models/api.py +176 -0
  20. reasonbench-0.0.1/reasonbench/models/online.py +228 -0
  21. reasonbench-0.0.1/reasonbench/models/qroq.py +132 -0
  22. reasonbench-0.0.1/reasonbench/tasks/__init__.py +8 -0
  23. reasonbench-0.0.1/reasonbench/tasks/game24/__init__.py +3 -0
  24. reasonbench-0.0.1/reasonbench/tasks/game24/agents.py +440 -0
  25. reasonbench-0.0.1/reasonbench/tasks/game24/benchmark.py +51 -0
  26. reasonbench-0.0.1/reasonbench/tasks/game24/environment.py +85 -0
  27. reasonbench-0.0.1/reasonbench/tasks/game24/prompts.py +393 -0
  28. reasonbench-0.0.1/reasonbench/tasks/game24/state.py +49 -0
  29. reasonbench-0.0.1/reasonbench/tasks/hle/__init__.py +3 -0
  30. reasonbench-0.0.1/reasonbench/tasks/hle/agents.py +332 -0
  31. reasonbench-0.0.1/reasonbench/tasks/hle/benchmark.py +128 -0
  32. reasonbench-0.0.1/reasonbench/tasks/hle/environment.py +188 -0
  33. reasonbench-0.0.1/reasonbench/tasks/hle/prompts.py +200 -0
  34. reasonbench-0.0.1/reasonbench/tasks/hle/state.py +80 -0
  35. reasonbench-0.0.1/reasonbench/tasks/hotpotqa/__init__.py +3 -0
  36. reasonbench-0.0.1/reasonbench/tasks/hotpotqa/agents.py +411 -0
  37. reasonbench-0.0.1/reasonbench/tasks/hotpotqa/benchmark.py +104 -0
  38. reasonbench-0.0.1/reasonbench/tasks/hotpotqa/environment.py +123 -0
  39. reasonbench-0.0.1/reasonbench/tasks/hotpotqa/prompts.py +489 -0
  40. reasonbench-0.0.1/reasonbench/tasks/hotpotqa/state.py +61 -0
  41. reasonbench-0.0.1/reasonbench/tasks/humaneval/__init__.py +3 -0
  42. reasonbench-0.0.1/reasonbench/tasks/humaneval/agents.py +361 -0
  43. reasonbench-0.0.1/reasonbench/tasks/humaneval/benchmark.py +53 -0
  44. reasonbench-0.0.1/reasonbench/tasks/humaneval/environment.py +503 -0
  45. reasonbench-0.0.1/reasonbench/tasks/humaneval/prompts.py +152 -0
  46. reasonbench-0.0.1/reasonbench/tasks/humaneval/state.py +62 -0
  47. reasonbench-0.0.1/reasonbench/tasks/logiqa/__init__.py +3 -0
  48. reasonbench-0.0.1/reasonbench/tasks/logiqa/agents.py +258 -0
  49. reasonbench-0.0.1/reasonbench/tasks/logiqa/benchmark.py +81 -0
  50. reasonbench-0.0.1/reasonbench/tasks/logiqa/environment.py +78 -0
  51. reasonbench-0.0.1/reasonbench/tasks/logiqa/prompts.py +211 -0
  52. reasonbench-0.0.1/reasonbench/tasks/logiqa/state.py +66 -0
  53. reasonbench-0.0.1/reasonbench/tasks/matharena/__init__.py +0 -0
  54. reasonbench-0.0.1/reasonbench/tasks/matharena/agents.py +154 -0
  55. reasonbench-0.0.1/reasonbench/tasks/matharena/benchmark.py +105 -0
  56. reasonbench-0.0.1/reasonbench/tasks/matharena/environment.py +110 -0
  57. reasonbench-0.0.1/reasonbench/tasks/matharena/prompts.py +79 -0
  58. reasonbench-0.0.1/reasonbench/tasks/matharena/state.py +66 -0
  59. reasonbench-0.0.1/reasonbench/tasks/scibench/__init__.py +3 -0
  60. reasonbench-0.0.1/reasonbench/tasks/scibench/agents.py +460 -0
  61. reasonbench-0.0.1/reasonbench/tasks/scibench/benchmark.py +89 -0
  62. reasonbench-0.0.1/reasonbench/tasks/scibench/environment.py +103 -0
  63. reasonbench-0.0.1/reasonbench/tasks/scibench/prompts.py +242 -0
  64. reasonbench-0.0.1/reasonbench/tasks/scibench/state.py +61 -0
  65. reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/__init__.py +3 -0
  66. reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/agents.py +338 -0
  67. reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/benchmark.py +92 -0
  68. reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/environment.py +302 -0
  69. reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/prompts.py +382 -0
  70. reasonbench-0.0.1/reasonbench/tasks/sonnetwriting/state.py +52 -0
  71. reasonbench-0.0.1/reasonbench/typedefs.py +154 -0
  72. reasonbench-0.0.1/reasonbench/utils.py +430 -0
  73. reasonbench-0.0.1/reasonbench.egg-info/PKG-INFO +242 -0
  74. reasonbench-0.0.1/reasonbench.egg-info/SOURCES.txt +76 -0
  75. reasonbench-0.0.1/reasonbench.egg-info/dependency_links.txt +1 -0
  76. reasonbench-0.0.1/reasonbench.egg-info/requires.txt +19 -0
  77. reasonbench-0.0.1/reasonbench.egg-info/top_level.txt +1 -0
  78. reasonbench-0.0.1/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 AU-CLAN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,242 @@
1
+ Metadata-Version: 2.4
2
+ Name: reasonbench
3
+ Version: 0.0.1
4
+ Summary: Benchmark for evaluating the stability of LLM reasoning strategies
5
+ Author: AU-CLAN
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/au-clan/ReasonBench
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: pandas
12
+ Requires-Dist: diskcache
13
+ Requires-Dist: omegaconf
14
+ Requires-Dist: together
15
+ Requires-Dist: openai
16
+ Requires-Dist: sympy
17
+ Requires-Dist: torch
18
+ Requires-Dist: langchain
19
+ Requires-Dist: langchain-community
20
+ Requires-Dist: wikipedia
21
+ Requires-Dist: joblib
22
+ Requires-Dist: pyphen
23
+ Requires-Dist: syllables
24
+ Requires-Dist: pronouncing
25
+ Requires-Dist: groq
26
+ Requires-Dist: lazykey
27
+ Requires-Dist: python-dotenv
28
+ Requires-Dist: cachesaver
29
+ Requires-Dist: huggingface-hub
30
+ Dynamic: license-file
31
+
32
+ # ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning
33
+
34
+ **ReasonBENCH** is a benchmark suite and open-source library for controlled multi-run evaluation of LLM reasoning. It measures both the quality and stability of reasoning strategies by running repeated independent trials and reporting variance-aware metrics — including confidence intervals, run deviation, and global noise — rather than relying on single-run averages.
35
+
36
+ > *Preliminary work. Under review at the International Conference on Machine Learning (ICML).*
37
+
38
+ <!-- Leaderboard: http://reasonbench.github.io -->
39
+
40
+ ## Motivation
41
+
42
+ LLM reasoning is typically evaluated using single runs, masking how much performance can vary across repeated executions. This practice obscures both reliability and cost, and can lead to misleading comparisons between methods and models. ReasonBENCH addresses this by repeating every model-strategy-task configuration with 10 independent trials and reporting distributional metrics alongside averages.
43
+
44
+ Key findings from our evaluation:
45
+ - **Run-to-run variability is substantial** — often large enough to change model/method rankings relative to single-run averages
46
+ - **Quality and cost stability decouple** — the most accurate strategy is not necessarily the most stable, and vice versa
47
+ - **Model scaling improves both quality and stability** — larger models within a family yield tighter distributions
48
+ - **Prompt refinements improve quality but not stability** — clarifying prompts and parsers boosts accuracy without reducing run-to-run variance
49
+ - **Reasoning effort scales cost, not quality** — increasing test-time reasoning effort primarily raises cost with limited and statistically insignificant quality gains
50
+
51
+ ## Reasoning Strategies
52
+
53
+ We implement 10 representative reasoning strategies using a standardized interface:
54
+
55
+ | Strategy | Type | Reference |
56
+ |----------|------|-----------|
57
+ | **IO** | Direct | — |
58
+ | **CoT** | Direct | Wei et al., 2022 |
59
+ | **CoT-SC** | Direct | Wang et al., 2023 |
60
+ | **ReAct** | Adaptive | Yao et al., 2023b |
61
+ | **Reflexion** | Adaptive | Shinn et al., 2023 |
62
+ | **ToT-BFS** | Structured | Yao et al., 2023a |
63
+ | **ToT-DFS** | Structured | Yao et al., 2023a |
64
+ | **GoT** | Structured | Besta et al., 2024 |
65
+ | **RAP** | Planning | Hao et al., 2023 |
66
+ | **FoA** | Evolutionary | Klein et al., 2025 |
67
+
68
+ ## Benchmarks
69
+
70
+ 6 tasks spanning diverse reasoning domains:
71
+
72
+ | Task | Domain | Metric | Size |
73
+ |------|--------|--------|------|
74
+ | **Game of 24** | Mathematical reasoning | Accuracy | 100 |
75
+ | **SciBench** | Scientific reasoning | Accuracy (exact match) | 109 |
76
+ | **HumanEval** | Code generation | pass@1 | 100 |
77
+ | **HotPotQA** | Multi-hop QA | Exact match | 100 |
78
+ | **Sonnet Writing** | Creative writing | Accuracy (rhyme + words) | 50 |
79
+ | **HLE** | General reasoning (Humanity's Last Exam) | Accuracy | 50 |
80
+
81
+ ## Evaluated Models
82
+
83
+ 10 contemporary reasoning models from 4 providers:
84
+
85
+ | Model | Provider |
86
+ |-------|----------|
87
+ | GPT-4.1 Nano, GPT-4.1 Mini | OpenAI |
88
+ | GPT-5 Nano, GPT-5 Mini | OpenAI |
89
+ | GPT-OSS 120B | Together AI |
90
+ | DeepSeek R1 | Together AI |
91
+ | Llama 4 Maverick | Together AI |
92
+ | Qwen3-235B Thinking | Together AI |
93
+ | Claude Haiku 4.5 | Anthropic |
94
+ | Gemini 3 Flash | Google |
95
+
96
+ ## Setup
97
+
98
+ ```bash
99
+ pip install -r requirements.txt
100
+ ```
101
+
102
+ You also need [CacheSaver](https://github.com/au-clan/cachesaver) — a client-side inference optimization framework for efficient, affordable, and reproducible LLM inference:
103
+
104
+ ```bash
105
+ pip install cachesaver
106
+ ```
107
+
108
+ Set your API keys as environment variables:
109
+
110
+ ```bash
111
+ export OPENAI_API_KEY="sk-..."
112
+ # and/or other provider keys
113
+ ```
114
+
115
+ ## Quick Start
116
+
117
+ The simplest way to run an experiment is via the shell script:
118
+
119
+ ```bash
120
+ bash scripts/simple/simple.sh
121
+ ```
122
+
123
+ Edit the variables at the top of `scripts/simple/simple.sh` to change the benchmark, method, model, and split.
124
+
125
+ For direct invocation:
126
+
127
+ ```bash
128
+ python scripts/simple/simple.py \
129
+ --benchmark game24 \
130
+ --method tot_bfs \
131
+ --split mini \
132
+ --provider openai \
133
+ --api_key OPENAI_API_KEY \
134
+ --model gpt-4.1-nano \
135
+ --temperature 1.0 \
136
+ --max_completion_tokens 10000 \
137
+ --top_p 1.0 \
138
+ --batch_size 1 \
139
+ --timeout 2.0 \
140
+ --correctness 1 \
141
+ --allow_batch_overflow 1 \
142
+ --ns_ratio 0.0 \
143
+ --value_cache
144
+ ```
145
+
146
+ ### Key arguments
147
+
148
+ | Argument | Description |
149
+ |----------|-------------|
150
+ | `--benchmark` | Task name: `game24`, `humaneval`, `hotpotqa`, `scibench`, `hle`, `sonnetwriting` |
151
+ | `--method` | Reasoning method: `io`, `cot`, `cot_sc`, `foa`, `tot_bfs`, `tot_dfs`, `got`, `react`, `rap` |
152
+ | `--split` | Dataset split: `train`, `validation`, `test`, `mini` |
153
+ | `--provider` | LLM provider: `openai`, `gemini`, `anthropic`, `groq`, `together` |
154
+ | `--model` | Model identifier (e.g., `gpt-4.1-nano`, `claude-haiku-4-5`) |
155
+ | `--ns_ratio` | Namespace ratio (0.0–1.0) for controlling parallel execution |
156
+
157
+ ## Evaluation Metrics
158
+
159
+ For each model-strategy-task configuration, we report metrics along two dimensions:
160
+
161
+ **Quality:**
162
+ - **Average** — stratified bootstrap mean over runs; benchmarks treated as strata
163
+ - **Run Deviation** — typical run-to-run deviation from the strategy mean per benchmark
164
+ - **Noise (Global)** — variance of z-scored outcomes across all benchmarks
165
+ - **Noise (Run)** — average within-benchmark z-score variance
166
+
167
+ **Cost:**
168
+ - Same four metrics computed over token usage and wall-clock time, expressed in USD
169
+
170
+ ## Configuration
171
+
172
+ Method hyperparameters are defined per task in YAML files under `scripts/configs/`:
173
+
174
+ ```yaml
175
+ # scripts/configs/game24.yaml
176
+ tot_bfs:
177
+ num_selections: 3
178
+ num_steps: 4
179
+ num_evaluations: 3
180
+
181
+ got:
182
+ num_selections: 5
183
+ num_steps: 4
184
+ num_generate: 10
185
+ num_evaluations: 3
186
+ num_best: 2
187
+ ```
188
+
189
+ Decoding parameters (temperature, top_p, max tokens) are sourced from `scripts/configs/<task>.env`.
190
+
191
+ ## Architecture
192
+
193
+ ReasonBENCH is organized around four core abstractions:
194
+
195
+ - **Method** — specifies the reasoning strategy independently of the model or task. Integrates agents, the environment, and the model, and exposes a standard `solve()` interface.
196
+ - **Environment** — formalizes task-specific dynamics: state transitions, action validation, terminal conditions, and evaluation.
197
+ - **Agent** — defines the interface between methods, models, and states. Agents construct prompts, issue queries, and parse responses into actions.
198
+ - **Model** — uniform interface for LLM providers, supporting async execution and integrated with CacheSaver for response caching and deduplication.
199
+
200
+ ```
201
+ src/
202
+ ├── models/ # LLM provider adapters (OpenAI, Anthropic, Groq, Together, Gemini)
203
+ ├── methods/ # Reasoning strategy implementations
204
+ ├── tasks/ # Task definitions (state, environment, agents, prompts)
205
+ │ ├── game24/
206
+ │ ├── humaneval/
207
+ │ ├── hotpotqa/
208
+ │ └── ...
209
+ ├── __init__.py # Factory registrations
210
+ ├── typedefs.py # Core ABCs and type definitions
211
+ └── utils.py # Logging and utility functions
212
+
213
+ scripts/
214
+ ├── simple/ # Single-run experiment scripts
215
+ ├── repeats/ # Batch/repeated experiment scripts
216
+ ├── cached/ # Cached inference scripts
217
+ └── configs/ # YAML and .env configuration files
218
+
219
+ datasets/ # Gzip-compressed task datasets
220
+ tests/ # Pytest test suite
221
+ ```
222
+
223
+ ## Tests
224
+
225
+ ```bash
226
+ pytest # run all tests
227
+ pytest tests/got/test_game24.py # single file
228
+ pytest tests/got/test_game24.py -k "test_x" # single test
229
+ ```
230
+
231
+ Tests use async fixtures and require valid API keys (Groq/OpenAI) for the mock LLM clients.
232
+
233
+ ## Citation
234
+
235
+ ```bibtex
236
+ @inproceedings{reasonbench2025,
237
+ title={ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning},
238
+ author={Anonymous},
239
+ year={2025},
240
+ note={Under review at ICML}
241
+ }
242
+ ```
@@ -0,0 +1,211 @@
1
+ # ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning
2
+
3
+ **ReasonBENCH** is a benchmark suite and open-source library for controlled multi-run evaluation of LLM reasoning. It measures both the quality and stability of reasoning strategies by running repeated independent trials and reporting variance-aware metrics — including confidence intervals, run deviation, and global noise — rather than relying on single-run averages.
4
+
5
+ > *Preliminary work. Under review at the International Conference on Machine Learning (ICML).*
6
+
7
+ <!-- Leaderboard: http://reasonbench.github.io -->
8
+
9
+ ## Motivation
10
+
11
+ LLM reasoning is typically evaluated using single runs, masking how much performance can vary across repeated executions. This practice obscures both reliability and cost, and can lead to misleading comparisons between methods and models. ReasonBENCH addresses this by repeating every model-strategy-task configuration with 10 independent trials and reporting distributional metrics alongside averages.
12
+
13
+ Key findings from our evaluation:
14
+ - **Run-to-run variability is substantial** — often large enough to change model/method rankings relative to single-run averages
15
+ - **Quality and cost stability decouple** — the most accurate strategy is not necessarily the most stable, and vice versa
16
+ - **Model scaling improves both quality and stability** — larger models within a family yield tighter distributions
17
+ - **Prompt refinements improve quality but not stability** — clarifying prompts and parsers boosts accuracy without reducing run-to-run variance
18
+ - **Reasoning effort scales cost, not quality** — increasing test-time reasoning effort primarily raises cost with limited and statistically insignificant quality gains
19
+
20
+ ## Reasoning Strategies
21
+
22
+ We implement 10 representative reasoning strategies using a standardized interface:
23
+
24
+ | Strategy | Type | Reference |
25
+ |----------|------|-----------|
26
+ | **IO** | Direct | — |
27
+ | **CoT** | Direct | Wei et al., 2022 |
28
+ | **CoT-SC** | Direct | Wang et al., 2023 |
29
+ | **ReAct** | Adaptive | Yao et al., 2023b |
30
+ | **Reflexion** | Adaptive | Shinn et al., 2023 |
31
+ | **ToT-BFS** | Structured | Yao et al., 2023a |
32
+ | **ToT-DFS** | Structured | Yao et al., 2023a |
33
+ | **GoT** | Structured | Besta et al., 2024 |
34
+ | **RAP** | Planning | Hao et al., 2023 |
35
+ | **FoA** | Evolutionary | Klein et al., 2025 |
36
+
37
+ ## Benchmarks
38
+
39
+ 6 tasks spanning diverse reasoning domains:
40
+
41
+ | Task | Domain | Metric | Size |
42
+ |------|--------|--------|------|
43
+ | **Game of 24** | Mathematical reasoning | Accuracy | 100 |
44
+ | **SciBench** | Scientific reasoning | Accuracy (exact match) | 109 |
45
+ | **HumanEval** | Code generation | pass@1 | 100 |
46
+ | **HotPotQA** | Multi-hop QA | Exact match | 100 |
47
+ | **Sonnet Writing** | Creative writing | Accuracy (rhyme + words) | 50 |
48
+ | **HLE** | General reasoning (Humanity's Last Exam) | Accuracy | 50 |
49
+
50
+ ## Evaluated Models
51
+
52
+ 10 contemporary reasoning models from 4 providers:
53
+
54
+ | Model | Provider |
55
+ |-------|----------|
56
+ | GPT-4.1 Nano, GPT-4.1 Mini | OpenAI |
57
+ | GPT-5 Nano, GPT-5 Mini | OpenAI |
58
+ | GPT-OSS 120B | Together AI |
59
+ | DeepSeek R1 | Together AI |
60
+ | Llama 4 Maverick | Together AI |
61
+ | Qwen3-235B Thinking | Together AI |
62
+ | Claude Haiku 4.5 | Anthropic |
63
+ | Gemini 3 Flash | Google |
64
+
65
+ ## Setup
66
+
67
+ ```bash
68
+ pip install -r requirements.txt
69
+ ```
70
+
71
+ You also need [CacheSaver](https://github.com/au-clan/cachesaver) — a client-side inference optimization framework for efficient, affordable, and reproducible LLM inference:
72
+
73
+ ```bash
74
+ pip install cachesaver
75
+ ```
76
+
77
+ Set your API keys as environment variables:
78
+
79
+ ```bash
80
+ export OPENAI_API_KEY="sk-..."
81
+ # and/or other provider keys
82
+ ```
83
+
84
+ ## Quick Start
85
+
86
+ The simplest way to run an experiment is via the shell script:
87
+
88
+ ```bash
89
+ bash scripts/simple/simple.sh
90
+ ```
91
+
92
+ Edit the variables at the top of `scripts/simple/simple.sh` to change the benchmark, method, model, and split.
93
+
94
+ For direct invocation:
95
+
96
+ ```bash
97
+ python scripts/simple/simple.py \
98
+ --benchmark game24 \
99
+ --method tot_bfs \
100
+ --split mini \
101
+ --provider openai \
102
+ --api_key OPENAI_API_KEY \
103
+ --model gpt-4.1-nano \
104
+ --temperature 1.0 \
105
+ --max_completion_tokens 10000 \
106
+ --top_p 1.0 \
107
+ --batch_size 1 \
108
+ --timeout 2.0 \
109
+ --correctness 1 \
110
+ --allow_batch_overflow 1 \
111
+ --ns_ratio 0.0 \
112
+ --value_cache
113
+ ```
114
+
115
+ ### Key arguments
116
+
117
+ | Argument | Description |
118
+ |----------|-------------|
119
+ | `--benchmark` | Task name: `game24`, `humaneval`, `hotpotqa`, `scibench`, `hle`, `sonnetwriting` |
120
+ | `--method` | Reasoning method: `io`, `cot`, `cot_sc`, `foa`, `tot_bfs`, `tot_dfs`, `got`, `react`, `rap` |
121
+ | `--split` | Dataset split: `train`, `validation`, `test`, `mini` |
122
+ | `--provider` | LLM provider: `openai`, `gemini`, `anthropic`, `groq`, `together` |
123
+ | `--model` | Model identifier (e.g., `gpt-4.1-nano`, `claude-haiku-4-5`) |
124
+ | `--ns_ratio` | Namespace ratio (0.0–1.0) for controlling parallel execution |
125
+
126
+ ## Evaluation Metrics
127
+
128
+ For each model-strategy-task configuration, we report metrics along two dimensions:
129
+
130
+ **Quality:**
131
+ - **Average** — stratified bootstrap mean over runs; benchmarks treated as strata
132
+ - **Run Deviation** — typical run-to-run deviation from the strategy mean per benchmark
133
+ - **Noise (Global)** — variance of z-scored outcomes across all benchmarks
134
+ - **Noise (Run)** — average within-benchmark z-score variance
135
+
136
+ **Cost:**
137
+ - Same four metrics computed over token usage and wall-clock time, expressed in USD
138
+
139
+ ## Configuration
140
+
141
+ Method hyperparameters are defined per task in YAML files under `scripts/configs/`:
142
+
143
+ ```yaml
144
+ # scripts/configs/game24.yaml
145
+ tot_bfs:
146
+ num_selections: 3
147
+ num_steps: 4
148
+ num_evaluations: 3
149
+
150
+ got:
151
+ num_selections: 5
152
+ num_steps: 4
153
+ num_generate: 10
154
+ num_evaluations: 3
155
+ num_best: 2
156
+ ```
157
+
158
+ Decoding parameters (temperature, top_p, max tokens) are sourced from `scripts/configs/<task>.env`.
159
+
160
+ ## Architecture
161
+
162
+ ReasonBENCH is organized around four core abstractions:
163
+
164
+ - **Method** — specifies the reasoning strategy independently of the model or task. Integrates agents, the environment, and the model, and exposes a standard `solve()` interface.
165
+ - **Environment** — formalizes task-specific dynamics: state transitions, action validation, terminal conditions, and evaluation.
166
+ - **Agent** — defines the interface between methods, models, and states. Agents construct prompts, issue queries, and parse responses into actions.
167
+ - **Model** — uniform interface for LLM providers, supporting async execution and integrated with CacheSaver for response caching and deduplication.
168
+
169
+ ```
170
+ src/
171
+ ├── models/ # LLM provider adapters (OpenAI, Anthropic, Groq, Together, Gemini)
172
+ ├── methods/ # Reasoning strategy implementations
173
+ ├── tasks/ # Task definitions (state, environment, agents, prompts)
174
+ │ ├── game24/
175
+ │ ├── humaneval/
176
+ │ ├── hotpotqa/
177
+ │ └── ...
178
+ ├── __init__.py # Factory registrations
179
+ ├── typedefs.py # Core ABCs and type definitions
180
+ └── utils.py # Logging and utility functions
181
+
182
+ scripts/
183
+ ├── simple/ # Single-run experiment scripts
184
+ ├── repeats/ # Batch/repeated experiment scripts
185
+ ├── cached/ # Cached inference scripts
186
+ └── configs/ # YAML and .env configuration files
187
+
188
+ datasets/ # Gzip-compressed task datasets
189
+ tests/ # Pytest test suite
190
+ ```
191
+
192
+ ## Tests
193
+
194
+ ```bash
195
+ pytest # run all tests
196
+ pytest tests/got/test_game24.py # single file
197
+ pytest tests/got/test_game24.py -k "test_x" # single test
198
+ ```
199
+
200
+ Tests use async fixtures and require valid API keys (Groq/OpenAI) for the mock LLM clients.
201
+
202
+ ## Citation
203
+
204
+ ```bibtex
205
+ @inproceedings{reasonbench2025,
206
+ title={ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning},
207
+ author={Anonymous},
208
+ year={2025},
209
+ note={Under review at ICML}
210
+ }
211
+ ```
@@ -0,0 +1,50 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "reasonbench"
7
+ version = "0.0.1"
8
+ description = "Benchmark for evaluating the stability of LLM reasoning strategies"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "AU-CLAN"},
14
+ ]
15
+ dependencies = [
16
+ "pandas",
17
+ "diskcache",
18
+ "omegaconf",
19
+ "together",
20
+ "openai",
21
+ "sympy",
22
+ "torch",
23
+ "langchain",
24
+ "langchain-community",
25
+ "wikipedia",
26
+ "joblib",
27
+ "pyphen",
28
+ "syllables",
29
+ "pronouncing",
30
+ "groq",
31
+ "lazykey",
32
+ "python-dotenv",
33
+ "cachesaver",
34
+ "huggingface-hub",
35
+ ]
36
+
37
+ [project.urls]
38
+ Repository = "https://github.com/au-clan/ReasonBench"
39
+
40
+ [tool.setuptools.packages.find]
41
+ include = ["reasonbench*"]
42
+
43
+ [tool.setuptools.package-data]
44
+ reasonbench = ["configs/*.yaml", "configs/*.env"]
45
+
46
+ [tool.pytest.ini_options]
47
+ pythonpath = ["."]
48
+ asyncio_mode = "auto"
49
+ asyncio_default_test_loop_scope = "class"
50
+ asyncio_default_fixture_loop_scope = "function"
@@ -0,0 +1,126 @@
1
+ from typing import TypedDict
2
+ from .typedefs import DecodingParameters
3
+ from .datasets import get_dataset_path
4
+
5
class BenchmarkFactory:
    """Registry mapping task names to benchmark classes.

    Benchmark classes register themselves via the ``register`` decorator;
    ``get`` instantiates the benchmark for a task, injecting the dataset path.
    """

    # Maps lowercased class names (e.g. "benchmarkgame24") to benchmark classes.
    registry = {}

    @classmethod
    def register(cls, benchmark_cls):
        """Class decorator: register *benchmark_cls* under its lowercased name."""
        cls.registry[benchmark_cls.__name__.lower()] = benchmark_cls
        return benchmark_cls

    @classmethod
    def get(cls, task: str, *args, **kwargs):
        """Instantiate the benchmark registered for *task*.

        Extra positional and keyword arguments are forwarded to the
        benchmark constructor, together with the resolved dataset ``path``.

        Raises:
            ValueError: if no benchmark class is registered for *task*.
        """
        key = f"benchmark{task}".lower()
        try:
            benchmark_cls = cls.registry[key]
        except KeyError as exc:
            raise ValueError(f"No benchmark found for task={task}") from exc
        # Resolve the dataset path and construct outside the try-block so a
        # KeyError raised by get_dataset_path or by the benchmark constructor
        # is not misreported as a missing registration.
        path = get_dataset_path(task)
        return benchmark_cls(*args, path=path, **kwargs)
21
+
22
class EnvironmentFactory:
    """Registry mapping task names to environment classes.

    Environment classes register themselves via the ``register`` decorator;
    ``get`` instantiates the environment for a task.
    """

    # Maps lowercased class names (e.g. "environmentgame24") to classes.
    registry = {}

    @classmethod
    def register(cls, env_cls):
        """Class decorator: register *env_cls* under its lowercased name."""
        cls.registry[env_cls.__name__.lower()] = env_cls
        return env_cls

    @classmethod
    def get(cls, task: str, *args, **kwargs):
        """Instantiate the environment registered for *task*.

        Raises:
            ValueError: if no environment class is registered for *task*.
        """
        key = f"environment{task}".lower()
        try:
            env_cls = cls.registry[key]
        except KeyError as exc:
            raise ValueError(f"No environment found for task={task}") from exc
        # Construct outside the try-block so a KeyError raised inside the
        # environment's __init__ is not misreported as a missing registration.
        return env_cls(*args, **kwargs)
37
+
38
class AgentFactory:
    """Registry mapping (agent type, benchmark) pairs to agent classes.

    Note: ``get`` returns the registered agent *class* itself, not an
    instance; callers are responsible for instantiation. The ``*args`` and
    ``**kwargs`` parameters are accepted for interface compatibility with the
    other factories but are currently ignored.
    """

    # Maps lowercased class names (e.g. "agentiogame24") to agent classes.
    registry = {}

    @classmethod
    def register(cls, agent_cls):
        """Class decorator: register *agent_cls* under its lowercased name."""
        cls.registry[agent_cls.__name__.lower()] = agent_cls
        return agent_cls

    @classmethod
    def get(cls, agent_type: str, benchmark: str, *args, **kwargs):
        """Return the agent class registered for *agent_type* and *benchmark*.

        Raises:
            ValueError: if no matching agent class is registered.
        """
        key = f"agent{agent_type}{benchmark}".lower()
        try:
            # Returned uninitialized by design: callers instantiate later.
            return cls.registry[key]
        except KeyError as exc:
            raise ValueError(
                f"No agent found for type={agent_type}, benchmark={benchmark}"
            ) from exc
53
+
54
class AgentDictFactory:
    """Registry mapping method names to agent-dictionary classes.

    Agent-dict classes register themselves via the ``register`` decorator;
    ``get`` instantiates the agent dictionary for a method.
    """

    # Maps lowercased class names (e.g. "agentdictfoa") to classes.
    registry = {}

    @classmethod
    def register(cls, agent_dict_cls):
        """Class decorator: register *agent_dict_cls* under its lowercased name."""
        cls.registry[agent_dict_cls.__name__.lower()] = agent_dict_cls
        return agent_dict_cls

    @classmethod
    def get(cls, method: str, *args, **kwargs):
        """Instantiate the agent dictionary registered for *method*.

        Raises:
            ValueError: if no agent-dict class is registered for *method*.
        """
        key = f"agentdict{method}".lower()
        try:
            agent_dict_cls = cls.registry[key]
        except KeyError as exc:
            raise ValueError(f"No agent dict found for method={method}") from exc
        # Construct outside the try-block so a KeyError raised inside
        # __init__ is not misreported as a missing registration.
        return agent_dict_cls(*args, **kwargs)
69
+
70
+
71
class MethodFactory:
    """Registry mapping method names to reasoning-strategy (method) classes.

    Method classes register themselves via the ``register`` decorator;
    ``get`` wires up the agents a method needs for a benchmark and
    instantiates the method class.
    """

    # Maps lowercased class names (e.g. "methodfoa") to method classes.
    registry = {}

    # For each supported method: the agent roles it needs, mapped to the
    # agent type used to fill that role. Agent classes are looked up per
    # call because the lookup depends on the benchmark.
    _AGENT_SPECS = {
        "io": {"step": "io"},
        "cot": {"step": "cot"},
        "cot_sc": {"step": "cot"},
        "foa": {"step": "act", "evaluate": "evaluate"},
        "tot_bfs": {"step": "bfs", "evaluate": "evaluate"},
        "tot_dfs": {"step": "bfs", "evaluate": "evaluate"},
        "got": {"step": "act", "aggregate": "aggregate", "evaluate": "evaluate"},
        "rap": {"step": "react", "evaluate": "selfevaluate"},
        "react": {"step": "react"},
    }

    @classmethod
    def register(cls, method_cls):
        """Class decorator: register *method_cls* under its lowercased name."""
        cls.registry[method_cls.__name__.lower()] = method_cls
        return method_cls

    @classmethod
    def get(cls, method: str, benchmark: str, params: DecodingParameters, *args, **kwargs):
        """Instantiate the method *method* for *benchmark*.

        Builds the agent dictionary the method requires (agent classes plus
        a ``<role>_params`` entry per agent) and forwards it, along with any
        extra arguments, to the registered method class.

        Raises:
            NotImplementedError: if *method* is not a supported method name.
            ValueError: if *method* is supported but no class is registered.
        """
        try:
            spec = cls._AGENT_SPECS[method]
        except KeyError as exc:
            raise NotImplementedError(f"Method {method} is not implemented yet.") from exc

        # Validate the registration before building agents so a missing
        # registration fails fast, and so a KeyError raised inside the
        # method constructor is not misreported as "no method found".
        key = f"method{method}".lower()
        try:
            method_cls = cls.registry[key]
        except KeyError as exc:
            raise ValueError(f"No method found for name={method}") from exc

        agents = {
            role: AgentFactory.get(agent_type, benchmark)
            for role, agent_type in spec.items()
        }
        # For the moment, only supporting same params for all agents.
        agents.update({f"{role}_params": params for role in spec})

        return method_cls(agents=agents, *args, **kwargs)