konash 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. konash-0.1.0/PKG-INFO +21 -0
  2. konash-0.1.0/README.md +153 -0
  3. konash-0.1.0/konash/__init__.py +6 -0
  4. konash-0.1.0/konash/agent.py +315 -0
  5. konash-0.1.0/konash/api.py +706 -0
  6. konash-0.1.0/konash/cli.py +166 -0
  7. konash-0.1.0/konash/corpora/__init__.py +0 -0
  8. konash-0.1.0/konash/corpora/policies.py +71 -0
  9. konash-0.1.0/konash/corpus.py +232 -0
  10. konash-0.1.0/konash/eval/__init__.py +0 -0
  11. konash-0.1.0/konash/eval/ablations.py +91 -0
  12. konash-0.1.0/konash/eval/benchmarks.py +75 -0
  13. konash-0.1.0/konash/eval/experiments.py +189 -0
  14. konash-0.1.0/konash/eval/metrics.py +78 -0
  15. konash-0.1.0/konash/eval/nuggets.py +470 -0
  16. konash-0.1.0/konash/eval/runner.py +296 -0
  17. konash-0.1.0/konash/eval/stats.py +79 -0
  18. konash-0.1.0/konash/eval/ttc.py +117 -0
  19. konash-0.1.0/konash/harness/__init__.py +0 -0
  20. konash-0.1.0/konash/harness/dispatcher.py +166 -0
  21. konash-0.1.0/konash/harness/environment.py +291 -0
  22. konash-0.1.0/konash/harness/runtime.py +94 -0
  23. konash-0.1.0/konash/harness/strategy.py +370 -0
  24. konash-0.1.0/konash/inference/__init__.py +0 -0
  25. konash-0.1.0/konash/inference/aggregation.py +206 -0
  26. konash-0.1.0/konash/inference/config.py +14 -0
  27. konash-0.1.0/konash/inference/local.py +503 -0
  28. konash-0.1.0/konash/inference/parallel.py +250 -0
  29. konash-0.1.0/konash/inference/value_model.py +777 -0
  30. konash-0.1.0/konash/inference/value_search.py +393 -0
  31. konash-0.1.0/konash/plugins/__init__.py +0 -0
  32. konash-0.1.0/konash/plugins/base.py +129 -0
  33. konash-0.1.0/konash/plugins/compression.py +423 -0
  34. konash-0.1.0/konash/plugins/control.py +184 -0
  35. konash-0.1.0/konash/prompts/__init__.py +0 -0
  36. konash-0.1.0/konash/prompts/registry.py +293 -0
  37. konash-0.1.0/konash/retrieval/__init__.py +0 -0
  38. konash-0.1.0/konash/retrieval/vector_search.py +515 -0
  39. konash-0.1.0/konash/rewards/__init__.py +103 -0
  40. konash-0.1.0/konash/rewards/base.py +25 -0
  41. konash-0.1.0/konash/rewards/nugget.py +66 -0
  42. konash-0.1.0/konash/rewards/tasks.py +45 -0
  43. konash-0.1.0/konash/synthesis/__init__.py +0 -0
  44. konash-0.1.0/konash/synthesis/config.py +94 -0
  45. konash-0.1.0/konash/synthesis/dedup.py +873 -0
  46. konash-0.1.0/konash/synthesis/filters.py +714 -0
  47. konash-0.1.0/konash/synthesis/pipeline.py +298 -0
  48. konash-0.1.0/konash/synthesis/qa.py +591 -0
  49. konash-0.1.0/konash/synthesis/rollouts.py +756 -0
  50. konash-0.1.0/konash/training/__init__.py +0 -0
  51. konash-0.1.0/konash/training/dataset.py +140 -0
  52. konash-0.1.0/konash/training/iteration.py +426 -0
  53. konash-0.1.0/konash/training/multitask.py +149 -0
  54. konash-0.1.0/konash/training/oapl.py +634 -0
  55. konash-0.1.0/konash/training/segmentation.py +163 -0
  56. konash-0.1.0/konash/training/stats.py +37 -0
  57. konash-0.1.0/konash.egg-info/PKG-INFO +21 -0
  58. konash-0.1.0/konash.egg-info/SOURCES.txt +64 -0
  59. konash-0.1.0/konash.egg-info/dependency_links.txt +1 -0
  60. konash-0.1.0/konash.egg-info/entry_points.txt +2 -0
  61. konash-0.1.0/konash.egg-info/requires.txt +20 -0
  62. konash-0.1.0/konash.egg-info/top_level.txt +1 -0
  63. konash-0.1.0/pyproject.toml +43 -0
  64. konash-0.1.0/setup.cfg +4 -0
  65. konash-0.1.0/tests/test_api_integration.py +169 -0
  66. konash-0.1.0/tests/test_rewards_runtime.py +49 -0
konash-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: konash
3
+ Version: 0.1.0
4
+ Summary: Train knowledge agents that search, retrieve, compress, and reason — on a single GPU.
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: numpy>=1.24
7
+ Provides-Extra: dev
8
+ Requires-Dist: pytest>=7.4; extra == "dev"
9
+ Provides-Extra: train
10
+ Requires-Dist: torch>=2.1; extra == "train"
11
+ Requires-Dist: transformers>=4.36; extra == "train"
12
+ Requires-Dist: peft>=0.7; extra == "train"
13
+ Requires-Dist: accelerate>=0.25; extra == "train"
14
+ Provides-Extra: quant
15
+ Requires-Dist: bitsandbytes>=0.41; extra == "quant"
16
+ Provides-Extra: all
17
+ Requires-Dist: torch>=2.1; extra == "all"
18
+ Requires-Dist: transformers>=4.36; extra == "all"
19
+ Requires-Dist: peft>=0.7; extra == "all"
20
+ Requires-Dist: accelerate>=0.25; extra == "all"
21
+ Requires-Dist: bitsandbytes>=0.41; extra == "all"
konash-0.1.0/README.md ADDED
@@ -0,0 +1,153 @@
1
+ <div align="center">
2
+
3
+ # KONASH
4
+
5
+ **Knowledge-grounded Off-policy Networks for Agentic System Harnesses**
6
+
7
+ <p>
8
+ Train knowledge agents that search, retrieve, compress, and reason — on a single GPU.
9
+ </p>
10
+
11
+ [![PRs-Welcome](https://img.shields.io/badge/PRs-welcome-blue.svg)](CONTRIBUTING.md)
12
+ [![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](LICENSE)
13
+
14
+ [![Join Discord](https://img.shields.io/badge/Join%20Discord-5865F2?style=plastic&logo=discord&logoColor=white)](#)
15
+ [![Documentation](https://img.shields.io/badge/Documentation-orange?style=plastic&logo=gitbook&logoColor=white)](#)
16
+
17
+ </div>
18
+
19
+ KONASH uses reinforcement learning to train knowledge agents that match or exceed frontier models on grounded reasoning tasks — at a fraction of the cost. **Single-GPU training, open-source models, 1/100th the compute.**
20
+
21
+ ---
22
+
23
+ ## Key Benefits
24
+
25
+ - **100x cheaper training** — Single GPU replaces multi-node clusters. ~$100–500 per iteration instead of ~$10K–50K.
26
+ - **Higher quality** — RL-trained agents search more efficiently, retrieve more diversely, and reason more accurately than their base models. The gains are algorithmic, not scale-dependent.
27
+ - **Consistent results** — Parallel thinking (N=10–20 rollouts + aggregation) turns probabilistic search into near-deterministic accuracy. Cheap rollouts on a small model mean you can afford this on every query.
28
+ - **Zero lock-in** — Your model, your weights, your infrastructure. Deploy anywhere with vLLM and LoRA hot-swapping.
29
+
30
+ ```python
31
+ # Before: Static RAG — one query, hope for the best
32
+ docs = retriever.search(query, top_k=10)
33
+ answer = llm.generate(f"Answer based on: {docs}\n\n{query}")
34
+
35
+ # After: KONASH — RL-trained agent that searches iteratively
36
+ agent = konash.Agent("./checkpoints/iter2", corpus="./my_docs")
37
+ answer = agent.solve(query, parallel_rollouts=10)
38
+ ```
39
+
40
+ [Learn more about KONASH](https://kona.sh)
41
+
42
+ ---
43
+
44
+ ## KONASH Overview
45
+
46
+ KONASH is an open-source RL framework that improves agent reliability by training knowledge agents to search, retrieve, compress, and reason over evidence — all on a single GPU using off-policy RL. For a quick hands-on introduction, run one of the notebooks below. When you're ready to learn more, check out the [docs](https://kona.sh).
47
+
48
+ ### Notebooks
49
+
50
+ | Agent Task | Example Notebook | Description | Comparative Performance |
51
+ |---|---|---|---|
52
+ | **Trivia Night** | [Train agent](notebooks/trivia_night.ipynb) | Qwen 3.5 7B learns to answer multi-constraint trivia by searching Wikipedia | [Link coming soon] |
53
+ | **20 Questions** | [Train agent](#) | Qwen 3.5 7B learns to identify a mystery entity in 20 yes/no searches | [Link coming soon] |
54
+ | **GeoGuessr** | [Train agent](#) | Qwen 3.5 7B learns to pinpoint locations from landmark and terrain descriptions | [Link coming soon] |
55
+
56
+ ## KONASH News
57
+
58
+ Explore our latest research and updates on building SOTA knowledge agents.
59
+
60
+ - **[OAPL: Off-Policy RL That Actually Works on One GPU](#)** — Train knowledge agents without multi-node clusters using large-batch iterative off-policy reinforcement learning.
61
+ - **[Agentic Data Synthesis: Let Your Model Write Its Own Curriculum](#)** — Generate diverse, grounded training data from any corpus — no manual annotation required.
62
+ - **[Parallel Thinking: How a 7B Model Beats Frontier Single-Shot](#)** — Scale quality at inference time with N parallel rollouts and generative aggregation.
63
+ - **[Compression as an RL Skill: Teaching Models What to Remember](#)** — Train context compression end-to-end with task reward, not as a separate summarization step.
64
+
65
+ [See all blog posts](https://kona.sh/blog)
66
+
67
+ ## Why KONASH?
68
+
69
+ - KONASH provides a complete pipeline for training knowledge agents on **existing corpora**. We abstract the training, synthesis, and serving into a modular system whose internals your code never needs to touch.
70
+ - **Train from anywhere.** Run the KONASH client on your laptop and let the server kick off training on a single GPU — local or cloud. No multi-node clusters required.
71
+ - Integrations with hosted platforms like W&B and Langfuse provide flexible observability and **simplify debugging** across the full synthesis-train-eval loop.
72
+ - KONASH is customizable with **intelligent defaults**. You can configure OAPL hyperparameters, compression thresholds, and inference engine settings to meet specific needs, or take advantage of defaults optimized for single-GPU training efficiency and stability.
73
+
74
+ ## Installation
75
+
76
+ KONASH agents can be trained from any client machine that runs Python. To add to an existing project, run this command:
77
+
78
+ ```
79
+ pip install konash
80
+ ```
81
+
82
+ ---
83
+
84
+ ## Training Loop Overview
85
+
86
+ KONASH uses **large-batch iterative off-policy RL** — unlike online RL frameworks, all data is generated upfront and training happens in a single offline pass. Each iteration improves the model, which then generates better data for the next iteration.
87
+
88
+ 1. **Data Synthesis**
89
+
90
+ 1. KONASH generates training questions from your corpus using an agentic synthesis pipeline — the model explores documents via vector search and proposes grounded QA pairs.
91
+ 2. A deduplication step ensures no overlap with your evaluation set.
92
+ 3. On later iterations, the improved model synthesizes its own curriculum — harder, more diverse questions.
93
+
94
+ 2. **Rollout Generation**
95
+
96
+ 1. The model (or latest checkpoint) generates multiple rollouts per question, interacting with vector search and compression tools.
97
+ 2. Each rollout is a full multi-step agent trajectory: search queries, retrieved documents, context compression, and a final answer.
98
+ 3. Rewards are computed automatically from answer correctness against ground truth.
99
+ 4. Pass-rate filtering keeps questions at the learning frontier — not too easy, not too hard.
100
+
101
+ 3. **Training**
102
+
103
+ 1. The full set of trajectories becomes a large offline dataset. Training runs in a single batch — no interleaving with inference.
104
+ 2. The server trains your model using OAPL with QLoRA. Long trajectories are segmented at compression boundaries, and tool outputs are masked from log-prob computation.
105
+ 3. The newly trained LoRA is saved and becomes the starting point for the next iteration.
106
+
107
+ 4. **Iterate**
108
+
109
+ 1. The trained checkpoint becomes the new reference policy.
110
+ 2. All rollouts are **regenerated from scratch** with the improved model — this is what makes each iteration progressively better.
111
+ 3. Training runs again on the fresh data. 2–3 iterations yield the best results.
112
+
113
+ ## Supported Models
114
+
115
+ KONASH should work with most vLLM/HuggingFace-transformers compatible causal language models, or at least the ones supported by [Unsloth](https://docs.unsloth.ai/get-started/all-our-models). If any model isn't working for you, please let us know on [Discord](#) or open an issue on [GitHub](https://github.com/konaequity/openkona/issues)!
116
+
117
+ ---
118
+
119
+ ## Contributing
120
+
121
+ KONASH is in active development and contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more information.
122
+
123
+ ---
124
+
125
+ ## Citation
126
+
127
+ ```bibtex
128
+ @misc{konaequity2026konash,
129
+ author = {Kona Equity},
130
+ title = {KONASH: Knowledge-grounded Off-policy Networks for Agentic System Harnesses},
131
+ year = {2026},
132
+ publisher = {GitHub},
133
+ journal = {GitHub repository},
134
+ howpublished = {\url{https://github.com/konaequity/openkona}}
135
+ }
136
+ ```
137
+
138
+ ---
139
+
140
+ ## License
141
+
142
+ This repository's source code is available under the [Apache-2.0 License](LICENSE).
143
+
144
+ ---
145
+
146
+ ## Credits
147
+
148
+ KONASH builds directly on the research and open-source work of:
149
+
150
+ - [OAPL](https://arxiv.org/abs/2503.01735) — Ritter et al., 2026 (the RL algorithm)
151
+ - [Unsloth](https://github.com/unslothai/unsloth) — Parameter-efficient training backend
152
+ - [vLLM](https://github.com/vllm-project/vllm) — High-throughput inference engine
153
+ - [FAISS](https://github.com/facebookresearch/faiss) — Vector search
@@ -0,0 +1,6 @@
1
+ """KONASH: Knowledge-grounded Off-policy Networks for Agentic System Harnesses."""
2
+
3
+ from konash.api import Agent
4
+ from konash.corpus import Corpus
5
+
6
+ __all__ = ["Agent", "Corpus"]
@@ -0,0 +1,315 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from typing import Any, Callable, Dict, List, Optional
5
+
6
+
7
class Agent:
    """Multi-step rollout driver built around an injected LLM client.

    The client may be any object exposing ``generate(messages, **kwargs)``
    that returns an assistant message dict (minimally
    ``{"role": "assistant", "content": "..."}``).  Tool calls, LoRA
    adapters, and history compression live at this layer so the harness
    environment never needs to know which LLM backend is in play.
    """

    # Class-level default so the attribute exists even before __init__ runs.
    llm_client = None

    def __init__(
        self,
        llm_client: Any = None,
        *,
        system_prompt: str | None = None,
        max_steps: int = 20,
        stop_sequences: List[str] | None = None,
    ) -> None:
        self.llm_client = llm_client
        self.system_prompt = system_prompt
        self.max_steps = max_steps
        self.stop_sequences = stop_sequences or []
        self._active_adapters: List[Dict[str, Any]] = []

    # ------------------------------------------------------------------
    # Core generation
    # ------------------------------------------------------------------

    def generate(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
        """Forward *messages* (plus any extra kwargs) to the LLM client.

        Building the message list — including any system prompt — is the
        caller's responsibility.

        Raises:
            RuntimeError: if no client was injected.
        """
        client = self.llm_client
        if client is None:
            raise RuntimeError("No llm_client configured on this Agent.")
        return client.generate(messages, **kwargs)

    # ------------------------------------------------------------------
    # Step-level generation
    # ------------------------------------------------------------------

    def generate_step(
        self,
        conversation_history: List[Dict[str, str]],
        available_tools: List[Dict[str, Any]] | None = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Run a single LLM turn and return the assistant message.

        The returned dict carries at least ``role``/``content`` and may also
        include ``tool_calls`` when the model requested a tool.
        """
        options: Dict[str, Any] = dict(kwargs)
        if available_tools:
            options["tools"] = available_tools
        # Caller-supplied "stop" always wins over the configured default.
        if self.stop_sequences and "stop" not in options:
            options["stop"] = self.stop_sequences
        return self.generate(self._build_messages(conversation_history), **options)

    # ------------------------------------------------------------------
    # Full rollout
    # ------------------------------------------------------------------

    def generate_rollout(
        self,
        prompt: str,
        environment: Any = None,
        *,
        max_steps: int | None = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Run one complete episode and return its trajectory.

        When *environment* is supplied, its ``reset``/``run_episode`` contract
        drives the loop; otherwise a plain generate-until-terminal loop is
        used.
        """
        budget = max_steps or self.max_steps

        if environment is not None:
            environment.reset(prompt=prompt)
            return environment.run_episode(agent=self, max_steps=budget, **kwargs)

        # No environment: iterate until the model stops requesting tools.
        transcript: List[Dict[str, str]] = [{"role": "user", "content": prompt}]
        turns: List[Dict[str, Any]] = []
        for _ in range(budget):
            reply = self.generate_step(transcript)
            turns.append(reply)
            transcript.append(reply)
            if self._is_terminal(reply):
                break

        return {
            "prompt": prompt,
            "history": transcript,
            "trajectory": turns,
            "final_answer": self.extract_final_answer(transcript),
        }

    # ------------------------------------------------------------------
    # History compression
    # ------------------------------------------------------------------

    def compress_history(
        self,
        conversation_history: List[Dict[str, str]],
        *,
        target_tokens: int | None = None,
        **kwargs,
    ) -> List[Dict[str, str]]:
        """Return a shortened replacement for *conversation_history*.

        The LLM is asked for a summary that keeps the critical facts; the
        result comes back as a single system message.
        """
        if not conversation_history:
            return []

        rendered = [
            f"[{m.get('role', 'unknown')}]: {m.get('content', '')}"
            for m in conversation_history
        ]
        request = (
            "Compress the following conversation into a concise summary that "
            "preserves all critical facts, tool results, and reasoning steps. "
            "Return ONLY the summary.\n\n" + "\n".join(rendered)
        )
        if target_tokens is not None:
            request += f"\n\nTarget length: roughly {target_tokens} tokens."

        reply = self.generate([{"role": "user", "content": request}], **kwargs)
        if isinstance(reply, dict):
            summary = reply.get("content", "")
        else:
            summary = str(reply)
        return [{"role": "system", "content": f"[Compressed history] {summary}"}]

    # ------------------------------------------------------------------
    # Answer extraction
    # ------------------------------------------------------------------

    def extract_final_answer(
        self,
        conversation_history: List[Dict[str, str]],
        **kwargs,
    ) -> str | None:
        """Return the last substantive assistant message, or ``None``.

        Scans backwards, skipping non-assistant turns and pure tool-call
        turns that carry no text.
        """
        for msg in reversed(conversation_history):
            if msg.get("role") != "assistant":
                continue
            if msg.get("tool_calls") and not msg.get("content"):
                continue
            text = msg.get("content", "")
            if text:
                return text
        return None

    # ------------------------------------------------------------------
    # LoRA adapter management
    # ------------------------------------------------------------------

    def load_adapter(
        self,
        adapter_path: str,
        *,
        adapter_name: str | None = None,
        weight: float = 1.0,
        **kwargs,
    ) -> None:
        """Record a LoRA adapter and, when supported, load it on the client."""
        # Default the name to the final path component.
        name = adapter_name or adapter_path.rstrip("/").split("/")[-1]
        self._active_adapters.append(
            {"path": adapter_path, "name": name, "weight": weight, **kwargs}
        )

        # Only clients that implement adapter loading are notified.
        loader = getattr(self.llm_client, "load_adapter", None)
        if loader is not None:
            loader(adapter_path, name=name, weight=weight, **kwargs)

    def unload_adapter(
        self,
        adapter_name: str | None = None,
        **kwargs,
    ) -> None:
        """Drop one named adapter, or every adapter when no name is given."""
        if adapter_name is None:
            dropped = list(self._active_adapters)
            self._active_adapters.clear()
        else:
            dropped, kept = [], []
            for record in self._active_adapters:
                (dropped if record["name"] == adapter_name else kept).append(record)
            self._active_adapters = kept

        # Only clients that implement adapter unloading are notified.
        unloader = getattr(self.llm_client, "unload_adapter", None)
        if unloader is not None:
            for record in dropped:
                unloader(record["name"], **kwargs)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_messages(
        self, conversation_history: List[Dict[str, str]]
    ) -> List[Dict[str, str]]:
        """Return a new list: the system prompt (if any) plus the history."""
        prefix: List[Dict[str, str]] = (
            [{"role": "system", "content": self.system_prompt}]
            if self.system_prompt
            else []
        )
        return prefix + list(conversation_history)

    @staticmethod
    def _is_terminal(response: Dict[str, Any]) -> bool:
        """A turn is terminal when it carries text but requests no tools."""
        return bool(response.get("content")) and not bool(response.get("tool_calls"))
248
+
249
+
250
class ValueGuidedAgent:
    """Best-of-N stepper: sample several candidate turns, keep the best-scored one.

    Exposes the same ``generate_step`` interface as :class:`Agent`, so it can
    be swapped into any environment or strategy that expects that contract.
    """

    # Class-level defaults, mirrored onto the instance in __init__.
    candidate_width = 2
    value_model = None

    def __init__(
        self,
        llm_client: Any = None,
        *,
        candidate_width: int = 2,
        value_model: Any = None,
        system_prompt: str | None = None,
        max_steps: int = 20,
        stop_sequences: List[str] | None = None,
    ) -> None:
        # All actual text generation is delegated to a composed Agent.
        self._agent = Agent(
            llm_client=llm_client,
            system_prompt=system_prompt,
            max_steps=max_steps,
            stop_sequences=stop_sequences,
        )
        self.candidate_width = candidate_width
        self.value_model = value_model
        self.llm_client = llm_client

    def generate_step(
        self,
        conversation_history: List[Dict[str, str]],
        available_tools: List[Dict[str, Any]] | None = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Sample ``candidate_width`` turns and return the value-maximising one.

        Raises:
            RuntimeError: if no candidates were produced (width <= 0).
        """
        drafts = [
            self._agent.generate_step(
                conversation_history,
                available_tools=available_tools,
                **kwargs,
            )
            for _ in range(self.candidate_width)
        ]
        if not drafts:
            raise RuntimeError("No candidates generated.")

        scorer = self.value_model
        if scorer is None:
            # No critic available: behave like a plain single-sample step.
            return drafts[0]

        # max() returns the earliest maximal draft, matching a strict
        # greater-than sweep (ties resolve to the first candidate).
        return max(
            drafts,
            key=lambda draft: scorer.score_partial_rollout(
                list(conversation_history) + [draft]
            ),
        )