thinkbooster-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. thinkbooster-0.1.0/LICENSE +22 -0
  2. thinkbooster-0.1.0/PKG-INFO +288 -0
  3. thinkbooster-0.1.0/README.md +229 -0
  4. thinkbooster-0.1.0/llm_tts/datasets/__init__.py +46 -0
  5. thinkbooster-0.1.0/llm_tts/datasets/gsm8k.py +168 -0
  6. thinkbooster-0.1.0/llm_tts/datasets/human_eval_plus.py +266 -0
  7. thinkbooster-0.1.0/llm_tts/datasets/kernelbench.py +238 -0
  8. thinkbooster-0.1.0/llm_tts/datasets/mbpp_plus.py +283 -0
  9. thinkbooster-0.1.0/llm_tts/early_stopping.py +295 -0
  10. thinkbooster-0.1.0/llm_tts/evaluation/__init__.py +13 -0
  11. thinkbooster-0.1.0/llm_tts/evaluation/alignscore.py +86 -0
  12. thinkbooster-0.1.0/llm_tts/evaluation/exact_match.py +258 -0
  13. thinkbooster-0.1.0/llm_tts/evaluation/grader.py +399 -0
  14. thinkbooster-0.1.0/llm_tts/evaluation/human_eval_plus_evaluator.py +277 -0
  15. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/__init__.py +8 -0
  16. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/asciimath_printer.py +50 -0
  17. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/gen/PSLexer.py +1692 -0
  18. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/gen/PSListener.py +579 -0
  19. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/gen/PSParser.py +7502 -0
  20. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/gen/PSVisitor.py +328 -0
  21. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/gen/__init__.py +0 -0
  22. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/latex2sympy2.py +1157 -0
  23. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/linalg_equations.py +10 -0
  24. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/linalg_span.py +19 -0
  25. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/matrix.py +46 -0
  26. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/matrix_placeholders.py +65 -0
  27. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/sandbox.py +23 -0
  28. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/sandbox_equality.py +75 -0
  29. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/sectan.py +51 -0
  30. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/sandbox/vector.py +75 -0
  31. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/setup.py +45 -0
  32. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/__init__.py +0 -0
  33. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/abs_test.py +19 -0
  34. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/all_bad_test.py +70 -0
  35. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/all_good_test.py +284 -0
  36. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/atom_expr_test.py +58 -0
  37. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/binomial_test.py +36 -0
  38. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/ceil_test.py +29 -0
  39. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/complex_test.py +21 -0
  40. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/context.py +84 -0
  41. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/exp_test.py +57 -0
  42. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/floor_test.py +29 -0
  43. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/gcd_test.py +161 -0
  44. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/greek_test.py +19 -0
  45. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/grouping_test.py +52 -0
  46. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/lcm_test.py +161 -0
  47. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/left_right_cdot_test.py +9 -0
  48. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/linalg_test.py +15 -0
  49. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/max_test.py +79 -0
  50. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/min_test.py +79 -0
  51. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/mod_test.py +70 -0
  52. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/overline_test.py +9 -0
  53. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/pi_test.py +15 -0
  54. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/trig_test.py +21 -0
  55. thinkbooster-0.1.0/llm_tts/evaluation/latex2sympy/tests/variable_test.py +92 -0
  56. thinkbooster-0.1.0/llm_tts/evaluation/llm_as_a_judge.py +309 -0
  57. thinkbooster-0.1.0/llm_tts/evaluation/math_normalize.py +417 -0
  58. thinkbooster-0.1.0/llm_tts/evaluation/mbpp_plus_evaluator.py +277 -0
  59. thinkbooster-0.1.0/llm_tts/evaluation/parser.py +770 -0
  60. thinkbooster-0.1.0/llm_tts/generators/__init__.py +66 -0
  61. thinkbooster-0.1.0/llm_tts/generators/api.py +1249 -0
  62. thinkbooster-0.1.0/llm_tts/generators/base.py +430 -0
  63. thinkbooster-0.1.0/llm_tts/generators/huggingface.py +728 -0
  64. thinkbooster-0.1.0/llm_tts/generators/vllm.py +1394 -0
  65. thinkbooster-0.1.0/llm_tts/integrations/__init__.py +17 -0
  66. thinkbooster-0.1.0/llm_tts/integrations/langchain_chat_model.py +168 -0
  67. thinkbooster-0.1.0/llm_tts/models/__init__.py +8 -0
  68. thinkbooster-0.1.0/llm_tts/models/base.py +62 -0
  69. thinkbooster-0.1.0/llm_tts/models/blackboxmodel_with_streaming.py +392 -0
  70. thinkbooster-0.1.0/llm_tts/scale_discriminator.py +127 -0
  71. thinkbooster-0.1.0/llm_tts/scorers/__init__.py +14 -0
  72. thinkbooster-0.1.0/llm_tts/scorers/estimator_uncertainty_pd.py +48 -0
  73. thinkbooster-0.1.0/llm_tts/scorers/majority_voting.py +236 -0
  74. thinkbooster-0.1.0/llm_tts/scorers/multi_scorer.py +236 -0
  75. thinkbooster-0.1.0/llm_tts/scorers/step_scorer_base.py +153 -0
  76. thinkbooster-0.1.0/llm_tts/scorers/step_scorer_confidence.py +47 -0
  77. thinkbooster-0.1.0/llm_tts/scorers/step_scorer_llm_critic.py +947 -0
  78. thinkbooster-0.1.0/llm_tts/scorers/step_scorer_prm.py +1002 -0
  79. thinkbooster-0.1.0/llm_tts/scorers/step_scorer_reward_base.py +47 -0
  80. thinkbooster-0.1.0/llm_tts/scorers/step_scorer_uncertainty.py +48 -0
  81. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/__init__.py +65 -0
  82. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/base.py +23 -0
  83. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/non_thinking/__init__.py +12 -0
  84. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/non_thinking/structured.py +169 -0
  85. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/__init__.py +39 -0
  86. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/huggingface/__init__.py +18 -0
  87. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/marker.py +662 -0
  88. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/offline/__init__.py +18 -0
  89. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/offline/hybrid.py +308 -0
  90. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/offline/llm.py +384 -0
  91. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/offline/sentence.py +138 -0
  92. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/vllm/__init__.py +31 -0
  93. thinkbooster-0.1.0/llm_tts/step_boundary_detectors/thinking/vllm/stop_tokens.py +480 -0
  94. thinkbooster-0.1.0/llm_tts/strategies/__init__.py +35 -0
  95. thinkbooster-0.1.0/llm_tts/strategies/adaptive_scaling_best_of_n.py +679 -0
  96. thinkbooster-0.1.0/llm_tts/strategies/deepconf/__init__.py +9 -0
  97. thinkbooster-0.1.0/llm_tts/strategies/deepconf/strategy.py +1364 -0
  98. thinkbooster-0.1.0/llm_tts/strategies/deepconf/utils.py +312 -0
  99. thinkbooster-0.1.0/llm_tts/strategies/metadata_builder.py +222 -0
  100. thinkbooster-0.1.0/llm_tts/strategies/phi.py +228 -0
  101. thinkbooster-0.1.0/llm_tts/strategies/strategy_base.py +183 -0
  102. thinkbooster-0.1.0/llm_tts/strategies/strategy_baseline.py +399 -0
  103. thinkbooster-0.1.0/llm_tts/strategies/strategy_beam_search.py +1168 -0
  104. thinkbooster-0.1.0/llm_tts/strategies/strategy_chain_of_thought.py +119 -0
  105. thinkbooster-0.1.0/llm_tts/strategies/strategy_extended_thinking.py +386 -0
  106. thinkbooster-0.1.0/llm_tts/strategies/strategy_offline_best_of_n.py +969 -0
  107. thinkbooster-0.1.0/llm_tts/strategies/strategy_online_best_of_n.py +1101 -0
  108. thinkbooster-0.1.0/llm_tts/strategies/strategy_self_consistency.py +512 -0
  109. thinkbooster-0.1.0/llm_tts/strategies/strategy_uncertainty_cot.py +343 -0
  110. thinkbooster-0.1.0/llm_tts/utils/__init__.py +15 -0
  111. thinkbooster-0.1.0/llm_tts/utils/answer_extraction.py +141 -0
  112. thinkbooster-0.1.0/llm_tts/utils/flops.py +295 -0
  113. thinkbooster-0.1.0/llm_tts/utils/parallel.py +82 -0
  114. thinkbooster-0.1.0/llm_tts/utils/telegram.py +154 -0
  115. thinkbooster-0.1.0/llm_tts/utils/telegram_bot.py +83 -0
  116. thinkbooster-0.1.0/llm_tts/utils/torch_dtype.py +25 -0
  117. thinkbooster-0.1.0/pyproject.toml +178 -0
  118. thinkbooster-0.1.0/service_app/__init__.py +0 -0
  119. thinkbooster-0.1.0/service_app/api/__init__.py +0 -0
  120. thinkbooster-0.1.0/service_app/api/models/__init__.py +0 -0
  121. thinkbooster-0.1.0/service_app/api/models/openai_compat.py +238 -0
  122. thinkbooster-0.1.0/service_app/api/routes/__init__.py +1 -0
  123. thinkbooster-0.1.0/service_app/api/routes/chat.py +514 -0
  124. thinkbooster-0.1.0/service_app/api/routes/debugger.py +103 -0
  125. thinkbooster-0.1.0/service_app/api/routes/models.py +71 -0
  126. thinkbooster-0.1.0/service_app/core/__init__.py +0 -0
  127. thinkbooster-0.1.0/service_app/core/config.py +95 -0
  128. thinkbooster-0.1.0/service_app/core/debugger_events.py +1035 -0
  129. thinkbooster-0.1.0/service_app/core/logging_config.py +83 -0
  130. thinkbooster-0.1.0/service_app/core/prm_scorer_factory.py +86 -0
  131. thinkbooster-0.1.0/service_app/core/strategy_manager.py +687 -0
  132. thinkbooster-0.1.0/service_app/core/visual_debugger_demo.py +689 -0
  133. thinkbooster-0.1.0/service_app/main.py +314 -0
  134. thinkbooster-0.1.0/setup.cfg +4 -0
  135. thinkbooster-0.1.0/tests/test_config_naming.py +221 -0
  136. thinkbooster-0.1.0/thinkbooster.egg-info/PKG-INFO +288 -0
  137. thinkbooster-0.1.0/thinkbooster.egg-info/SOURCES.txt +138 -0
  138. thinkbooster-0.1.0/thinkbooster.egg-info/dependency_links.txt +1 -0
  139. thinkbooster-0.1.0/thinkbooster.egg-info/requires.txt +40 -0
  140. thinkbooster-0.1.0/thinkbooster.egg-info/top_level.txt +2 -0
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 IINemo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
@@ -0,0 +1,288 @@
1
+ Metadata-Version: 2.4
2
+ Name: thinkbooster
3
+ Version: 0.1.0
4
+ Summary: ThinkBooster: a unified framework for test-time compute scaling of LLM reasoning
5
+ Author-email: "List of contributors: https://github.com/IINemo/thinkbooster/graphs/contributors" <artemshelmanov@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/IINemo/thinkbooster
8
+ Project-URL: Repository, https://github.com/IINemo/thinkbooster
9
+ Project-URL: Issues, https://github.com/IINemo/thinkbooster/issues
10
+ Keywords: llm,reasoning,test-time-scaling,best-of-n,reasoning-evaluation
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: torch>=1.9.0
22
+ Requires-Dist: transformers>=4.56.0
23
+ Requires-Dist: datasets>=2.14.0
24
+ Requires-Dist: numpy>=1.23.5
25
+ Requires-Dist: tqdm>=4.64.0
26
+ Requires-Dist: parse>=1.19.0
27
+ Requires-Dist: hydra-core>=1.2.0
28
+ Requires-Dist: omegaconf>=2.2.0
29
+ Requires-Dist: python-dotenv>=0.19.0
30
+ Requires-Dist: lm-polygraph>=0.6.0
31
+ Requires-Dist: pylatexenc>=2.10
32
+ Requires-Dist: sympy>=1.12
33
+ Requires-Dist: regex>=2023.0.0
34
+ Requires-Dist: plotly>=5.0.0
35
+ Requires-Dist: word2number>=1.1
36
+ Requires-Dist: wandb>=0.15.0
37
+ Requires-Dist: evalplus>=0.3.1
38
+ Requires-Dist: vllm<0.13.0,>=0.12.0
39
+ Provides-Extra: dev
40
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
41
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
42
+ Requires-Dist: black>=24.0.0; extra == "dev"
43
+ Requires-Dist: isort==7.0.0; extra == "dev"
44
+ Requires-Dist: flake8>=7.0.0; extra == "dev"
45
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
46
+ Provides-Extra: docs
47
+ Requires-Dist: sphinx>=5.0.0; extra == "docs"
48
+ Requires-Dist: sphinx-rtd-theme>=1.0.0; extra == "docs"
49
+ Requires-Dist: myst-parser>=0.18.0; extra == "docs"
50
+ Provides-Extra: service
51
+ Requires-Dist: fastapi>=0.104.0; extra == "service"
52
+ Requires-Dist: uvicorn[standard]>=0.24.0; extra == "service"
53
+ Requires-Dist: pydantic>=2.0.0; extra == "service"
54
+ Requires-Dist: pydantic-settings>=2.0.0; extra == "service"
55
+ Requires-Dist: httpx>=0.25.0; extra == "service"
56
+ Requires-Dist: python-multipart>=0.0.6; extra == "service"
57
+ Requires-Dist: python-json-logger>=2.0.7; extra == "service"
58
+ Dynamic: license-file
59
+
60
+ <div align="center">
61
+ <img src="assets/logo.png" alt="ThinkBooster logo" width="140" />
62
+ <h1>ThinkBooster: A Unified Framework for Seamless Test-Time Scaling of LLM Reasoning</h1>
63
+ </div>
64
+
65
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
66
+ [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3110/)
67
+ [![arXiv](https://img.shields.io/badge/arXiv-b31b1b.svg)](https://thinkbooster.s3.us-east-1.amazonaws.com/thinkbooster.pdf)
68
+
69
+ [Quick Start](#quick-start) | [Key Features](#key-features) | [Strategies](#supported-strategies) | [Visual Debugger](#visual-debugger) | [Documentation](#documentation)
70
+
71
+ ThinkBooster is an open-source framework for **test-time compute scaling** of large language models. It implements nine state-of-the-art scaling strategies — beam search, best-of-N, self-consistency, DeepConf, MUR, phi-decoding, and more — scored by process reward models (PRMs), uncertainty estimators, LLM-as-a-critic, and ReProbes. The framework includes an evaluation pipeline for math, science, and coding benchmarks, an OpenAI-compatible endpoint gateway, and an interactive visual debugger for inspecting strategy behavior step by step.
72
+
73
+ ---
74
+
75
+ ## Key Features
76
+
77
+ - **9 scaling strategies** — beam search, best-of-N, self-consistency, DeepConf, MUR, phi-decoding, extended thinking, uncertainty CoT, and adaptive scaling (online and offline)
78
+ - **4 scorer families** — process reward models (PRMs), uncertainty/confidence scores, LLM-as-a-critic, and ReProbes; with configurable aggregation (min, mean, max, product) and sliding window
79
+ - **OpenAI-compatible endpoint gateway** — drop-in replacement for any OpenAI SDK; select strategy and scorer via URL path; enables "Pro reasoning mode" for any LLM deployment
80
+ - **Visual debugger** — interactive web UI for comparing strategies, inspecting step-by-step reasoning traces and confidence signals
81
+ - **Evaluation pipeline** — math (MATH-500, OlympiadBench, GaoKao, AIME), science (GPQA-Diamond), and coding (HumanEval+, MBPP+, KernelBench) with crash-resistant resume
82
+
83
+ ---
84
+
85
+ ## Quick Start
86
+
87
+ ### Installation
88
+
89
+ ```bash
90
+ # Clone the repository
91
+ git clone https://github.com/IINemo/thinkbooster.git
92
+ cd thinkbooster
93
+
94
+ # Create conda environment
95
+ conda create -n thinkbooster python=3.11 -y
96
+ conda activate thinkbooster
97
+
98
+ # Install dependencies
99
+ ./setup.sh
100
+
101
+ # Configure API keys
102
+ cp .env.example .env
103
+ # Edit .env and add your OPENROUTER_API_KEY
104
+ ```
105
+
106
+ ### REST API
107
+
108
+ ```bash
109
+ pip install -e ".[service]"
110
+ python service_app/main.py # starts on http://localhost:8001
111
+ ```
112
+
113
+ Use with any OpenAI SDK:
114
+
115
+ ```python
116
+ from openai import OpenAI
117
+
118
+ client = OpenAI(
119
+ base_url="http://localhost:8001/v1/beam_search/prm",
120
+ api_key="<YOUR_API_KEY>",
121
+ )
122
+ response = client.chat.completions.create(
123
+ model="Qwen/Qwen3-30B-A3B",
124
+ messages=[{"role": "user", "content":
125
+ "Find the number of ordered pairs (x, y) of "
126
+ "positive integers satisfying x + 2y = 2xy."}],
127
+ extra_body={
128
+ "max_tokens": 8192, "tts_beam_size": 4,
129
+ },
130
+ )
131
+ print(response.choices[0].message.content)
132
+ ```
133
+
134
+ The `base_url` encodes the scaling strategy and scorer (`beam_search/prm`). To switch strategies, just change the URL — no other code changes needed.
135
+
136
+ See [Service API Guide](docs/service/api_guide.md) for the full reference.
137
+
138
+ ### Run an Experiment
139
+
140
+ ```bash
141
+ # Beam search on GSM8K (3 samples for quick verification)
142
+ python scripts/run_tts_eval.py \
143
+ --config-name experiments/beam_search/gsm8k/window_all/mean/beam_search_vllm_qwen25_math_7b_instruct_gsm8k_prm \
144
+ dataset.subset=3
145
+ ```
146
+
147
+ Results are saved to `outputs/` with full config snapshots for reproducibility. Add `--resume` to continue interrupted runs.
148
+
149
+ ---
150
+
151
+ ## Visual Debugger
152
+
153
+ The interactive debugger lets you compare multiple TTS strategies side by side on the same problem. Inspect per-step decisions (escalate, stop, prune, select), view confidence and uncertainty signals, and drill into sampled candidates and tree expansions.
154
+
155
+ <table border="0">
156
+ <tr>
157
+ <td width="40%"><img src="https://github.com/user-attachments/assets/e1fec504-d6f7-49d8-85e3-bf42d4e7baec" alt="Visual Debugger — main interface" width="100%" /></td>
158
+ <td valign="middle"><b>Main interface.</b> Select a cached example or enter a custom math/science/coding problem. Choose any strategy (beam search, best-of-N, MUR, …) and scorer (PRM, uncertainty, LLM-as-a-critic) and run it directly from the browser.</td>
159
+ </tr>
160
+ <tr><td colspan="2"><br/></td></tr>
161
+ <tr>
162
+ <td width="40%"><img src="https://github.com/user-attachments/assets/21c7fc24-7507-46e3-9ce3-34cb6a37d7b5" alt="Step-by-step reasoning inspector" width="100%" /></td>
163
+ <td valign="middle"><b>Step inspector.</b> Replay the strategy execution step by step. Each entry in the reasoning timeline shows the operation (select, prune, escalate), the candidates considered, their scores, and the full text of the chosen step.</td>
164
+ </tr>
165
+ <tr><td colspan="2"><br/></td></tr>
166
+ <tr>
167
+ <td width="40%"><img src="https://github.com/user-attachments/assets/df03cc3e-a933-4b6c-aa96-f35ab3e9b986" alt="Trajectory tree visualization" width="100%" /></td>
168
+ <td valign="middle"><b>Trajectory tree.</b> Global branching view of the entire strategy run. Nodes represent reasoning steps; the orange path highlights the final selected trajectory. Useful for understanding how beam search or tree-of-thought explores and prunes the search space.</td>
169
+ </tr>
170
+ </table>
171
+
172
+ After starting the REST API service, open:
173
+
174
+ ```
175
+ http://localhost:8001/debugger
176
+ ```
177
+
178
+ See [service_app/README.md](service_app/README.md) for details on cached examples and custom input modes.
179
+
180
+ ---
181
+
182
+ ## Supported Strategies
183
+
184
+ | Strategy | Online/Offline | LLM Access | Prefill | Description |
185
+ |---|---|---|---|---|
186
+ | Best-of-N | Offline | Black-box | No | Sample N solutions, select best by scorer |
187
+ | Majority Voting | Offline | Black-box | No | Sample N solutions, select answer by majority vote |
188
+ | Beam Search (ToT) | Online | Black-box | Yes | Explore tree of reasoning paths, prune by score |
189
+ | Extended Thinking | Online | Black-box | Yes | Control reasoning budget to force longer CoT |
190
+ | MUR | Online | White-box | Yes | Allocate more compute only on uncertain steps |
191
+ | DeepConf Online | Online | White-box | Yes | Steer generation toward high-confidence tokens |
192
+ | DeepConf Offline | Offline | White-box | No | Rerank candidates by model confidence scores |
193
+ | Phi-decoding | Online | White-box | Yes | Foresight sampling and adaptive pruning |
194
+ | Uncertainty CoT | Online | White-box | Yes | Generate multiple trajectories when uncertain |
195
+
196
+ ---
197
+
198
+ ## Project Structure
199
+
200
+ ```
201
+ thinkbooster/
202
+ ├── llm_tts/ # Core library
203
+ │ ├── strategies/ # TTS strategy implementations
204
+ │ ├── models/ # Model wrappers (vLLM, HuggingFace, API)
205
+ │ ├── scorers/ # Step scoring (PRM, uncertainty, voting)
206
+ │ ├── evaluation/ # Correctness evaluation (exact match, LLM judge)
207
+ │ └── datasets/ # Dataset loaders and utilities
208
+ ├── config/ # Hydra configuration system
209
+ ├── scripts/ # Evaluation scripts (run_tts_eval.py)
210
+ ├── service_app/ # REST API service + visual debugger
211
+ ├── tests/ # Test suite with strategy registry
212
+ ├── docs/ # Documentation
213
+ └── lm-polygraph/ # Submodule: uncertainty estimation
214
+ ```
215
+
216
+ See [Project Structure](docs/getting_started/project_structure.md) for a detailed architecture overview.
217
+
218
+ ---
219
+
220
+ ## Documentation
221
+
222
+ - [Project Structure](docs/getting_started/project_structure.md) — architecture and component descriptions
223
+ - [Evaluation Protocol](docs/evaluation/README.md) — datasets, metrics (accuracy, tokens, FLOPs), and reporting
224
+ - [Strategy Registration](docs/core/strategy_registration.md) — how to add new strategies with tests
225
+ - [Service API Guide](docs/service/api_guide.md) — REST API reference and configuration
226
+ - [DeepConf Guide](docs/strategies/deepconf.md) — confidence-based test-time scaling
227
+
228
+ ---
229
+
230
+ ## Contributing
231
+
232
+ We welcome contributions! Whether it's a new strategy, scorer, dataset, or bug fix — see the [Contributing Guide](docs/getting_started/contributing.md) for setup instructions, development workflow, and coding standards.
233
+
234
+ ---
235
+
236
+ ## Citation
237
+
238
+ If you use ThinkBooster in your research, please cite:
239
+
240
+ ```bibtex
241
+ @misc{thinkbooster2026,
242
+ title = {ThinkBooster: A Unified Framework for Seamless Test-Time Scaling of LLM Reasoning},
243
+ author = {Smirnov, Vladislav and Nguyen, Chieu and Senichev, Sergey and Ta, Minh Ngoc and Fadeeva, Ekaterina and Vazhentsev, Artem and Galimzianova, Daria and Rozanov, Nikolai and Mazanov, Viktor and Ni, Jingwei and Wu, Tianyi and Kiselev, Igor and Sachan, Mrinmaya and Gurevych, Iryna and Nakov, Preslav and Baldwin, Timothy and Shelmanov, Artem},
244
+ note = {Preprint},
245
+ year = {2026},
246
+ url = {https://thinkbooster.s3.us-east-1.amazonaws.com/thinkbooster.pdf}
247
+ }
248
+ ```
249
+
250
+ ---
251
+
252
+ ## Troubleshooting
253
+
254
+ <details>
255
+ <summary>vLLM engine fails to start</summary>
256
+
257
+ **Corrupted torch compile cache:** If you see `RuntimeError: Engine core initialization failed`:
258
+
259
+ ```bash
260
+ rm -rf ~/.cache/vllm/torch_compile_cache/
261
+ ```
262
+
263
+ **Missing C compiler:** If Triton can't find `gcc`:
264
+
265
+ ```bash
266
+ conda install -c conda-forge gcc_linux-64 gxx_linux-64 -y
267
+ ln -s $CONDA_PREFIX/bin/x86_64-conda-linux-gnu-gcc $CONDA_PREFIX/bin/gcc
268
+ ln -s $CONDA_PREFIX/bin/x86_64-conda-linux-gnu-g++ $CONDA_PREFIX/bin/g++
269
+ ```
270
+
271
+ </details>
272
+
273
+ <details>
274
+ <summary>ANTLR version mismatch warnings</summary>
275
+
276
+ ```
277
+ ANTLR runtime and generated code versions disagree: 4.9.3!=4.7.2
278
+ ```
279
+
280
+ This is expected — Hydra uses ANTLR 4.9.3, while latex2sympy2 was built with 4.7.2. Both work correctly.
281
+
282
+ </details>
283
+
284
+ ---
285
+
286
+ ## License
287
+
288
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,229 @@
1
+ <div align="center">
2
+ <img src="assets/logo.png" alt="ThinkBooster logo" width="140" />
3
+ <h1>ThinkBooster: A Unified Framework for Seamless Test-Time Scaling of LLM Reasoning</h1>
4
+ </div>
5
+
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3110/)
8
+ [![arXiv](https://img.shields.io/badge/arXiv-b31b1b.svg)](https://thinkbooster.s3.us-east-1.amazonaws.com/thinkbooster.pdf)
9
+
10
+ [Quick Start](#quick-start) | [Key Features](#key-features) | [Strategies](#supported-strategies) | [Visual Debugger](#visual-debugger) | [Documentation](#documentation)
11
+
12
+ ThinkBooster is an open-source framework for **test-time compute scaling** of large language models. It implements nine state-of-the-art scaling strategies — beam search, best-of-N, self-consistency, DeepConf, MUR, phi-decoding, and more — scored by process reward models (PRMs), uncertainty estimators, LLM-as-a-critic, and ReProbes. The framework includes an evaluation pipeline for math, science, and coding benchmarks, an OpenAI-compatible endpoint gateway, and an interactive visual debugger for inspecting strategy behavior step by step.
13
+
14
+ ---
15
+
16
+ ## Key Features
17
+
18
+ - **9 scaling strategies** — beam search, best-of-N, self-consistency, DeepConf, MUR, phi-decoding, extended thinking, uncertainty CoT, and adaptive scaling (online and offline)
19
+ - **4 scorer families** — process reward models (PRMs), uncertainty/confidence scores, LLM-as-a-critic, and ReProbes; with configurable aggregation (min, mean, max, product) and sliding window
20
+ - **OpenAI-compatible endpoint gateway** — drop-in replacement for any OpenAI SDK; select strategy and scorer via URL path; enables "Pro reasoning mode" for any LLM deployment
21
+ - **Visual debugger** — interactive web UI for comparing strategies, inspecting step-by-step reasoning traces and confidence signals
22
+ - **Evaluation pipeline** — math (MATH-500, OlympiadBench, GaoKao, AIME), science (GPQA-Diamond), and coding (HumanEval+, MBPP+, KernelBench) with crash-resistant resume
23
+
24
+ ---
25
+
26
+ ## Quick Start
27
+
28
+ ### Installation
29
+
30
+ ```bash
31
+ # Clone the repository
32
+ git clone https://github.com/IINemo/thinkbooster.git
33
+ cd thinkbooster
34
+
35
+ # Create conda environment
36
+ conda create -n thinkbooster python=3.11 -y
37
+ conda activate thinkbooster
38
+
39
+ # Install dependencies
40
+ ./setup.sh
41
+
42
+ # Configure API keys
43
+ cp .env.example .env
44
+ # Edit .env and add your OPENROUTER_API_KEY
45
+ ```
46
+
47
+ ### REST API
48
+
49
+ ```bash
50
+ pip install -e ".[service]"
51
+ python service_app/main.py # starts on http://localhost:8001
52
+ ```
53
+
54
+ Use with any OpenAI SDK:
55
+
56
+ ```python
57
+ from openai import OpenAI
58
+
59
+ client = OpenAI(
60
+ base_url="http://localhost:8001/v1/beam_search/prm",
61
+ api_key="<YOUR_API_KEY>",
62
+ )
63
+ response = client.chat.completions.create(
64
+ model="Qwen/Qwen3-30B-A3B",
65
+ messages=[{"role": "user", "content":
66
+ "Find the number of ordered pairs (x, y) of "
67
+ "positive integers satisfying x + 2y = 2xy."}],
68
+ extra_body={
69
+ "max_tokens": 8192, "tts_beam_size": 4,
70
+ },
71
+ )
72
+ print(response.choices[0].message.content)
73
+ ```
74
+
75
+ The `base_url` encodes the scaling strategy and scorer (`beam_search/prm`). To switch strategies, just change the URL — no other code changes needed.
76
+
77
+ See [Service API Guide](docs/service/api_guide.md) for the full reference.
78
+
79
+ ### Run an Experiment
80
+
81
+ ```bash
82
+ # Beam search on GSM8K (3 samples for quick verification)
83
+ python scripts/run_tts_eval.py \
84
+ --config-name experiments/beam_search/gsm8k/window_all/mean/beam_search_vllm_qwen25_math_7b_instruct_gsm8k_prm \
85
+ dataset.subset=3
86
+ ```
87
+
88
+ Results are saved to `outputs/` with full config snapshots for reproducibility. Add `--resume` to continue interrupted runs.
89
+
90
+ ---
91
+
92
+ ## Visual Debugger
93
+
94
+ The interactive debugger lets you compare multiple TTS strategies side by side on the same problem. Inspect per-step decisions (escalate, stop, prune, select), view confidence and uncertainty signals, and drill into sampled candidates and tree expansions.
95
+
96
+ <table border="0">
97
+ <tr>
98
+ <td width="40%"><img src="https://github.com/user-attachments/assets/e1fec504-d6f7-49d8-85e3-bf42d4e7baec" alt="Visual Debugger — main interface" width="100%" /></td>
99
+ <td valign="middle"><b>Main interface.</b> Select a cached example or enter a custom math/science/coding problem. Choose any strategy (beam search, best-of-N, MUR, …) and scorer (PRM, uncertainty, LLM-as-a-critic) and run it directly from the browser.</td>
100
+ </tr>
101
+ <tr><td colspan="2"><br/></td></tr>
102
+ <tr>
103
+ <td width="40%"><img src="https://github.com/user-attachments/assets/21c7fc24-7507-46e3-9ce3-34cb6a37d7b5" alt="Step-by-step reasoning inspector" width="100%" /></td>
104
+ <td valign="middle"><b>Step inspector.</b> Replay the strategy execution step by step. Each entry in the reasoning timeline shows the operation (select, prune, escalate), the candidates considered, their scores, and the full text of the chosen step.</td>
105
+ </tr>
106
+ <tr><td colspan="2"><br/></td></tr>
107
+ <tr>
108
+ <td width="40%"><img src="https://github.com/user-attachments/assets/df03cc3e-a933-4b6c-aa96-f35ab3e9b986" alt="Trajectory tree visualization" width="100%" /></td>
109
+ <td valign="middle"><b>Trajectory tree.</b> Global branching view of the entire strategy run. Nodes represent reasoning steps; the orange path highlights the final selected trajectory. Useful for understanding how beam search or tree-of-thought explores and prunes the search space.</td>
110
+ </tr>
111
+ </table>
112
+
113
+ After starting the REST API service, open:
114
+
115
+ ```
116
+ http://localhost:8001/debugger
117
+ ```
118
+
119
+ See [service_app/README.md](service_app/README.md) for details on cached examples and custom input modes.
120
+
121
+ ---
122
+
123
+ ## Supported Strategies
124
+
125
+ | Strategy | Online/Offline | LLM Access | Prefill | Description |
126
+ |---|---|---|---|---|
127
+ | Best-of-N | Offline | Black-box | No | Sample N solutions, select best by scorer |
128
+ | Majority Voting | Offline | Black-box | No | Sample N solutions, select answer by majority vote |
129
+ | Beam Search (ToT) | Online | Black-box | Yes | Explore tree of reasoning paths, prune by score |
130
+ | Extended Thinking | Online | Black-box | Yes | Control reasoning budget to force longer CoT |
131
+ | MUR | Online | White-box | Yes | Allocate more compute only on uncertain steps |
132
+ | DeepConf Online | Online | White-box | Yes | Steer generation toward high-confidence tokens |
133
+ | DeepConf Offline | Offline | White-box | No | Rerank candidates by model confidence scores |
134
+ | Phi-decoding | Online | White-box | Yes | Foresight sampling and adaptive pruning |
135
+ | Uncertainty CoT | Online | White-box | Yes | Generate multiple trajectories when uncertain |
136
+
137
+ ---
138
+
139
+ ## Project Structure
140
+
141
+ ```
142
+ thinkbooster/
143
+ ├── llm_tts/ # Core library
144
+ │ ├── strategies/ # TTS strategy implementations
145
+ │ ├── models/ # Model wrappers (vLLM, HuggingFace, API)
146
+ │ ├── scorers/ # Step scoring (PRM, uncertainty, voting)
147
+ │ ├── evaluation/ # Correctness evaluation (exact match, LLM judge)
148
+ │ └── datasets/ # Dataset loaders and utilities
149
+ ├── config/ # Hydra configuration system
150
+ ├── scripts/ # Evaluation scripts (run_tts_eval.py)
151
+ ├── service_app/ # REST API service + visual debugger
152
+ ├── tests/ # Test suite with strategy registry
153
+ ├── docs/ # Documentation
154
+ └── lm-polygraph/ # Submodule: uncertainty estimation
155
+ ```
156
+
157
+ See [Project Structure](docs/getting_started/project_structure.md) for a detailed architecture overview.
158
+
159
+ ---
160
+
161
+ ## Documentation
162
+
163
+ - [Project Structure](docs/getting_started/project_structure.md) — architecture and component descriptions
164
+ - [Evaluation Protocol](docs/evaluation/README.md) — datasets, metrics (accuracy, tokens, FLOPs), and reporting
165
+ - [Strategy Registration](docs/core/strategy_registration.md) — how to add new strategies with tests
166
+ - [Service API Guide](docs/service/api_guide.md) — REST API reference and configuration
167
+ - [DeepConf Guide](docs/strategies/deepconf.md) — confidence-based test-time scaling
168
+
169
+ ---
170
+
171
+ ## Contributing
172
+
173
+ We welcome contributions! Whether it's a new strategy, scorer, dataset, or bug fix — see the [Contributing Guide](docs/getting_started/contributing.md) for setup instructions, development workflow, and coding standards.
174
+
175
+ ---
176
+
177
+ ## Citation
178
+
179
+ If you use ThinkBooster in your research, please cite:
180
+
181
+ ```bibtex
182
+ @misc{thinkbooster2026,
183
+ title = {ThinkBooster: A Unified Framework for Seamless Test-Time Scaling of LLM Reasoning},
184
+ author = {Smirnov, Vladislav and Nguyen, Chieu and Senichev, Sergey and Ta, Minh Ngoc and Fadeeva, Ekaterina and Vazhentsev, Artem and Galimzianova, Daria and Rozanov, Nikolai and Mazanov, Viktor and Ni, Jingwei and Wu, Tianyi and Kiselev, Igor and Sachan, Mrinmaya and Gurevych, Iryna and Nakov, Preslav and Baldwin, Timothy and Shelmanov, Artem},
185
+ howpublished = {Preprint},
186
+ year = {2026},
187
+ url = {https://thinkbooster.s3.us-east-1.amazonaws.com/thinkbooster.pdf}
188
+ }
189
+ ```
190
+
191
+ ---
192
+
193
+ ## Troubleshooting
194
+
195
+ <details>
196
+ <summary>vLLM engine fails to start</summary>
197
+
198
+ **Corrupted torch compile cache:** If you see `RuntimeError: Engine core initialization failed`:
199
+
200
+ ```bash
201
+ rm -rf ~/.cache/vllm/torch_compile_cache/
202
+ ```
203
+
204
+ **Missing C compiler:** If Triton can't find `gcc`:
205
+
206
+ ```bash
207
+ conda install -c conda-forge gcc_linux-64 gxx_linux-64 -y
208
+ ln -s $CONDA_PREFIX/bin/x86_64-conda-linux-gnu-gcc $CONDA_PREFIX/bin/gcc
209
+ ln -s $CONDA_PREFIX/bin/x86_64-conda-linux-gnu-g++ $CONDA_PREFIX/bin/g++
210
+ ```
211
+
212
+ </details>
213
+
214
+ <details>
215
+ <summary>ANTLR version mismatch warnings</summary>
216
+
217
+ ```
218
+ ANTLR runtime and generated code versions disagree: 4.9.3!=4.7.2
219
+ ```
220
+
221
+ This is expected — Hydra uses ANTLR 4.9.3, latex2sympy2 was built with 4.7.2. Both work correctly.
222
+
223
+ </details>
224
+
225
+ ---
226
+
227
+ ## License
228
+
229
+ This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,46 @@
1
+ """Dataset loaders for various benchmarks."""
2
+
3
+ from .gsm8k import (
4
+ evaluate_gsm8k_answer,
5
+ extract_answer_from_gsm8k,
6
+ format_gsm8k_for_deepconf,
7
+ load_gsm8k,
8
+ )
9
+ from .human_eval_plus import create_evalplus_samples as create_human_eval_plus_samples
10
+ from .human_eval_plus import (
11
+ extract_code_from_response as extract_code_from_response_human_eval,
12
+ )
13
+ from .human_eval_plus import (
14
+ format_human_eval_prompt,
15
+ )
16
+ from .human_eval_plus import load_evalplus_samples as load_human_eval_plus_samples
17
+ from .human_eval_plus import (
18
+ load_human_eval_plus,
19
+ )
20
+ from .mbpp_plus import (
21
+ create_evalplus_samples,
22
+ extract_code_from_response,
23
+ format_mbpp_prompt,
24
+ load_evalplus_samples,
25
+ load_mbpp_plus,
26
+ )
27
+
28
+ __all__ = [
29
+ # GSM8K
30
+ "load_gsm8k",
31
+ "evaluate_gsm8k_answer",
32
+ "extract_answer_from_gsm8k",
33
+ "format_gsm8k_for_deepconf",
34
+ # MBPP+
35
+ "load_mbpp_plus",
36
+ "extract_code_from_response",
37
+ "format_mbpp_prompt",
38
+ "create_evalplus_samples",
39
+ "load_evalplus_samples",
40
+ # HumanEval+
41
+ "load_human_eval_plus",
42
+ "extract_code_from_response_human_eval",
43
+ "format_human_eval_prompt",
44
+ "create_human_eval_plus_samples",
45
+ "load_human_eval_plus_samples",
46
+ ]