konash-0.1.0.tar.gz
- konash-0.1.0/PKG-INFO +21 -0
- konash-0.1.0/README.md +153 -0
- konash-0.1.0/konash/__init__.py +6 -0
- konash-0.1.0/konash/agent.py +315 -0
- konash-0.1.0/konash/api.py +706 -0
- konash-0.1.0/konash/cli.py +166 -0
- konash-0.1.0/konash/corpora/__init__.py +0 -0
- konash-0.1.0/konash/corpora/policies.py +71 -0
- konash-0.1.0/konash/corpus.py +232 -0
- konash-0.1.0/konash/eval/__init__.py +0 -0
- konash-0.1.0/konash/eval/ablations.py +91 -0
- konash-0.1.0/konash/eval/benchmarks.py +75 -0
- konash-0.1.0/konash/eval/experiments.py +189 -0
- konash-0.1.0/konash/eval/metrics.py +78 -0
- konash-0.1.0/konash/eval/nuggets.py +470 -0
- konash-0.1.0/konash/eval/runner.py +296 -0
- konash-0.1.0/konash/eval/stats.py +79 -0
- konash-0.1.0/konash/eval/ttc.py +117 -0
- konash-0.1.0/konash/harness/__init__.py +0 -0
- konash-0.1.0/konash/harness/dispatcher.py +166 -0
- konash-0.1.0/konash/harness/environment.py +291 -0
- konash-0.1.0/konash/harness/runtime.py +94 -0
- konash-0.1.0/konash/harness/strategy.py +370 -0
- konash-0.1.0/konash/inference/__init__.py +0 -0
- konash-0.1.0/konash/inference/aggregation.py +206 -0
- konash-0.1.0/konash/inference/config.py +14 -0
- konash-0.1.0/konash/inference/local.py +503 -0
- konash-0.1.0/konash/inference/parallel.py +250 -0
- konash-0.1.0/konash/inference/value_model.py +777 -0
- konash-0.1.0/konash/inference/value_search.py +393 -0
- konash-0.1.0/konash/plugins/__init__.py +0 -0
- konash-0.1.0/konash/plugins/base.py +129 -0
- konash-0.1.0/konash/plugins/compression.py +423 -0
- konash-0.1.0/konash/plugins/control.py +184 -0
- konash-0.1.0/konash/prompts/__init__.py +0 -0
- konash-0.1.0/konash/prompts/registry.py +293 -0
- konash-0.1.0/konash/retrieval/__init__.py +0 -0
- konash-0.1.0/konash/retrieval/vector_search.py +515 -0
- konash-0.1.0/konash/rewards/__init__.py +103 -0
- konash-0.1.0/konash/rewards/base.py +25 -0
- konash-0.1.0/konash/rewards/nugget.py +66 -0
- konash-0.1.0/konash/rewards/tasks.py +45 -0
- konash-0.1.0/konash/synthesis/__init__.py +0 -0
- konash-0.1.0/konash/synthesis/config.py +94 -0
- konash-0.1.0/konash/synthesis/dedup.py +873 -0
- konash-0.1.0/konash/synthesis/filters.py +714 -0
- konash-0.1.0/konash/synthesis/pipeline.py +298 -0
- konash-0.1.0/konash/synthesis/qa.py +591 -0
- konash-0.1.0/konash/synthesis/rollouts.py +756 -0
- konash-0.1.0/konash/training/__init__.py +0 -0
- konash-0.1.0/konash/training/dataset.py +140 -0
- konash-0.1.0/konash/training/iteration.py +426 -0
- konash-0.1.0/konash/training/multitask.py +149 -0
- konash-0.1.0/konash/training/oapl.py +634 -0
- konash-0.1.0/konash/training/segmentation.py +163 -0
- konash-0.1.0/konash/training/stats.py +37 -0
- konash-0.1.0/konash.egg-info/PKG-INFO +21 -0
- konash-0.1.0/konash.egg-info/SOURCES.txt +64 -0
- konash-0.1.0/konash.egg-info/dependency_links.txt +1 -0
- konash-0.1.0/konash.egg-info/entry_points.txt +2 -0
- konash-0.1.0/konash.egg-info/requires.txt +20 -0
- konash-0.1.0/konash.egg-info/top_level.txt +1 -0
- konash-0.1.0/pyproject.toml +43 -0
- konash-0.1.0/setup.cfg +4 -0
- konash-0.1.0/tests/test_api_integration.py +169 -0
- konash-0.1.0/tests/test_rewards_runtime.py +49 -0
konash-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,21 @@
+Metadata-Version: 2.4
+Name: konash
+Version: 0.1.0
+Summary: Train knowledge agents that search, retrieve, compress, and reason — on a single GPU.
+Requires-Python: >=3.11
+Requires-Dist: numpy>=1.24
+Provides-Extra: dev
+Requires-Dist: pytest>=7.4; extra == "dev"
+Provides-Extra: train
+Requires-Dist: torch>=2.1; extra == "train"
+Requires-Dist: transformers>=4.36; extra == "train"
+Requires-Dist: peft>=0.7; extra == "train"
+Requires-Dist: accelerate>=0.25; extra == "train"
+Provides-Extra: quant
+Requires-Dist: bitsandbytes>=0.41; extra == "quant"
+Provides-Extra: all
+Requires-Dist: torch>=2.1; extra == "all"
+Requires-Dist: transformers>=4.36; extra == "all"
+Requires-Dist: peft>=0.7; extra == "all"
+Requires-Dist: accelerate>=0.25; extra == "all"
+Requires-Dist: bitsandbytes>=0.41; extra == "all"
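A note on the metadata above: the `Provides-Extra` groups follow standard pip optional-dependency syntax, so `pip install "konash[train]"` pulls in the training stack (torch, transformers, peft, accelerate) and `pip install "konash[all]"` adds bitsandbytes on top. The extra names come directly from the metadata; the install syntax is standard pip.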
konash-0.1.0/README.md
ADDED
@@ -0,0 +1,153 @@
+<div align="center">
+
+# KONASH
+
+**Knowledge-grounded Off-policy Networks for Agentic System Harnesses**
+
+<p>
+Train knowledge agents that search, retrieve, compress, and reason — on a single GPU.
+</p>
+
+[](CONTRIBUTING.md)
+[](LICENSE)
+
+[](#)
+[](#)
+
+</div>
+
+KONASH uses reinforcement learning to train knowledge agents that match or exceed frontier models on grounded reasoning tasks — at a fraction of the cost. **Single-GPU training, open-source models, 1/100th the compute.**
+
+---
+
+## Key Benefits
+
+- **100x cheaper training** — A single GPU replaces multi-node clusters: roughly $100–500 per iteration instead of $10K–50K.
+- **Higher quality** — RL-trained agents search more efficiently, retrieve more diversely, and reason more accurately than their base models. The gains are algorithmic, not scale-dependent.
+- **Consistent results** — Parallel thinking (N=10–20 rollouts plus aggregation) turns probabilistic search into near-deterministic accuracy. Cheap rollouts on a small model mean you can afford this on every query.
+- **Zero lock-in** — Your model, your weights, your infrastructure. Deploy anywhere with vLLM and LoRA hot-swapping.
+
+```python
+# Before: Static RAG — one query, hope for the best
+docs = retriever.search(query, top_k=10)
+answer = llm.generate(f"Answer based on: {docs}\n\n{query}")
+
+# After: KONASH — an RL-trained agent that searches iteratively
+agent = konash.Agent("./checkpoints/iter2", corpus="./my_docs")
+answer = agent.solve(query, parallel_rollouts=10)
+```
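The `parallel_rollouts=10` call above is the "parallel thinking" pattern from the benefits list: sample N independent trajectories, then aggregate their final answers. Below is a minimal sketch of the idea using plain majority voting; `aggregate` and `rollout_answers` are illustrative stand-ins, and konash/inference/aggregation.py implements a generative aggregator rather than a vote.

```python
# Simplified stand-in for parallel-thinking aggregation: majority vote over
# the final answers of N rollouts. Illustrative only; KONASH's
# konash/inference/aggregation.py uses generative aggregation instead.
from collections import Counter

def aggregate(answers: list[str]) -> str:
    # Normalize, drop empty answers, and return the most common one.
    normalized = [a.strip().lower() for a in answers if a and a.strip()]
    winner, _count = Counter(normalized).most_common(1)[0]
    return winner

rollout_answers = ["Paris", "paris", "Lyon", "Paris ", "paris"]
print(aggregate(rollout_answers))  # -> "paris"
```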
+
+[Learn more about KONASH](https://kona.sh)
+
+---
+
+## KONASH Overview
+
+KONASH is an open-source RL framework that improves agent reliability by training knowledge agents to search, retrieve, compress, and reason over evidence — all on a single GPU using off-policy RL. For a quick hands-on introduction, run one of the notebooks below. When you're ready to learn more, check out the [docs](https://kona.sh).
+
+### Notebooks
+
+| Agent Task | Example Notebook | Description | Comparative Performance |
+|---|---|---|---|
+| **Trivia Night** | [Train agent](notebooks/trivia_night.ipynb) | Qwen 3.5 7B learns to answer multi-constraint trivia by searching Wikipedia | [Link coming soon] |
+| **20 Questions** | [Train agent](#) | Qwen 3.5 7B learns to identify a mystery entity in 20 yes/no searches | [Link coming soon] |
+| **GeoGuessr** | [Train agent](#) | Qwen 3.5 7B learns to pinpoint locations from landmark and terrain descriptions | [Link coming soon] |
+
+## KONASH News
+
+Explore our latest research and updates on building SOTA knowledge agents.
+
+- **[OAPL: Off-Policy RL That Actually Works on One GPU](#)** — Train knowledge agents without multi-node clusters using large-batch iterative off-policy reinforcement learning.
+- **[Agentic Data Synthesis: Let Your Model Write Its Own Curriculum](#)** — Generate diverse, grounded training data from any corpus — no manual annotation required.
+- **[Parallel Thinking: How a 7B Model Beats Frontier Single-Shot](#)** — Scale quality at inference time with N parallel rollouts and generative aggregation.
+- **[Compression as an RL Skill: Teaching Models What to Remember](#)** — Train context compression end-to-end with task reward, not as a separate summarization step.
+
+[See all blog posts](https://kona.sh/blog)
+
+## Why KONASH?
+
+- KONASH provides a complete pipeline for training knowledge agents on **existing corpora**, abstracting synthesis, training, and serving into a modular system so your code never has to touch the internals.
+- **Train from anywhere.** Run the KONASH client on your laptop and let the server kick off training on a single GPU — local or cloud. No multi-node clusters required.
+- Integrations with hosted platforms like W&B and Langfuse provide flexible observability and **simplify debugging** across the full synthesis-train-eval loop.
+- KONASH is customizable with **intelligent defaults**. You can configure OAPL hyperparameters, compression thresholds, and inference engine settings to meet specific needs (a sketch of such a config override follows this list), or take advantage of defaults optimized for single-GPU training efficiency and stability.
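As referenced in the last bullet, here is a sketch of what overriding a default might look like. Every field name below is a hypothetical stand-in; the real schemas live in konash/synthesis/config.py and konash/inference/config.py in this release.

```python
# Hypothetical config sketch; field names are illustrative, not the actual
# schema (see konash/synthesis/config.py and konash/inference/config.py).
from dataclasses import dataclass

@dataclass
class RunConfig:
    oapl_learning_rate: float = 1e-5           # OAPL hyperparameter
    compression_threshold_tokens: int = 4096   # when to trigger compression
    parallel_rollouts: int = 10                # inference-time rollout width

cfg = RunConfig(parallel_rollouts=20)  # override a single default, keep the rest
```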
+
+## Installation
+
+KONASH agents can be trained from any client machine that runs Python. To add it to an existing project, run this command:
+
+```
+pip install konash
+```
+
+---
+
+## Training Loop Overview
+
+KONASH uses **large-batch iterative off-policy RL** — unlike online RL frameworks, all data is generated upfront and training happens in a single offline pass. Each iteration improves the model, which then generates better data for the next iteration. (A minimal end-to-end sketch of the loop follows the numbered steps below.)
+
+1. **Data Synthesis**
+
+   1. KONASH generates training questions from your corpus using an agentic synthesis pipeline — the model explores documents via vector search and proposes grounded QA pairs.
+   2. A deduplication step ensures no overlap with your evaluation set.
+   3. On later iterations, the improved model synthesizes its own curriculum — harder, more diverse questions.
+
+2. **Rollout Generation**
+
+   1. The model (or latest checkpoint) generates multiple rollouts per question, interacting with vector search and compression tools.
+   2. Each rollout is a full multi-step agent trajectory: search queries, retrieved documents, context compression, and a final answer.
+   3. Rewards are computed automatically from answer correctness against ground truth.
+   4. Pass-rate filtering keeps questions at the learning frontier — not too easy, not too hard.
+
+3. **Training**
+
+   1. The full set of trajectories becomes a large offline dataset. Training runs in a single batch — no interleaving with inference.
+   2. The server trains your model using OAPL with QLoRA. Long trajectories are segmented at compression boundaries, and tool outputs are masked from log-prob computation.
+   3. The newly trained LoRA is saved and becomes the starting point for the next iteration.
+
+4. **Iterate**
+
+   1. The trained checkpoint becomes the new reference policy.
+   2. All rollouts are **regenerated from scratch** with the improved model — this is what makes each iteration progressively better.
+   3. Training runs again on the fresh data. 2–3 iterations typically yield the best results.
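As promised above, a minimal end-to-end sketch of the loop. Every function here is a toy stand-in so the example runs on its own; none of these names are konash APIs (the real implementations live under konash/synthesis/ and konash/training/).

```python
# Toy, runnable sketch of large-batch iterative off-policy RL.
# All functions are stand-ins, not konash APIs.
import random

def synthesize_questions(corpus: str, model: str) -> list[str]:
    return [f"q{i} from {corpus} ({model})" for i in range(4)]   # step 1

def generate_rollouts(questions: list[str], model: str, n: int) -> dict:
    # step 2: n rollouts per question with a binary correctness reward
    return {q: [random.random() < 0.5 for _ in range(n)] for q in questions}

def at_frontier(rewards: list[bool]) -> bool:
    rate = sum(rewards) / len(rewards)
    return 0.0 < rate < 1.0      # pass-rate filter: not too easy, not too hard

def train(rollouts: dict, init: str) -> str:
    return init + "+1"           # step 3: pretend offline pass, new checkpoint

checkpoint = "base"
for iteration in range(3):       # step 4: 2-3 iterations typically suffice
    qs = synthesize_questions("./my_docs", checkpoint)
    rollouts = generate_rollouts(qs, checkpoint, n=8)  # regenerated from scratch
    rollouts = {q: r for q, r in rollouts.items() if at_frontier(r)}
    checkpoint = train(rollouts, init=checkpoint)
print(checkpoint)                # "base+1+1+1"
```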
+
+## Supported Models
+
+KONASH should work with most vLLM- and HuggingFace Transformers-compatible causal language models, or at least the ones supported by [Unsloth](https://docs.unsloth.ai/get-started/all-our-models). If any model isn't working for you, please let us know on [Discord](#) or open an issue on [GitHub](https://github.com/konaequity/openkona/issues)!
+
+---
+
+## Contributing
+
+KONASH is in active development and contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more information.
+
+---
+
+## Citation
+
+```bibtex
+@misc{konaequity2026konash,
+  author       = {Kona Equity},
+  title        = {KONASH: Knowledge-grounded Off-policy Networks for Agentic System Harnesses},
+  year         = {2026},
+  publisher    = {GitHub},
+  journal      = {GitHub repository},
+  howpublished = {\url{https://github.com/konaequity/openkona}}
+}
+```
+
+---
+
+## License
+
+This repository's source code is available under the [Apache-2.0 License](LICENSE).
+
+---
+
+## Credits
+
+KONASH builds directly on the research and open-source work of:
+
+- [OAPL](https://arxiv.org/abs/2503.01735) — Ritter et al., 2026 (the RL algorithm)
+- [Unsloth](https://github.com/unslothai/unsloth) — Parameter-efficient training backend
+- [vLLM](https://github.com/vllm-project/vllm) — High-throughput inference engine
+- [FAISS](https://github.com/facebookresearch/faiss) — Vector search
konash-0.1.0/konash/agent.py
ADDED
@@ -0,0 +1,315 @@
+from __future__ import annotations
+
+import copy
+from typing import Any, Callable, Dict, List, Optional
+
+
+class Agent:
+    """Core agent that wraps an injected LLM client to drive multi-step rollouts.
+
+    The LLM client is any object that exposes a ``generate(messages, **kwargs)``
+    method returning an assistant message dict (at minimum ``{"role": "assistant",
+    "content": "..."}``). Tool calls, adapters, and history compression are
+    handled at this layer so that the harness environment stays LLM-agnostic.
+    """
+
+    llm_client = None
+
+    def __init__(
+        self,
+        llm_client: Any = None,
+        *,
+        system_prompt: str | None = None,
+        max_steps: int = 20,
+        stop_sequences: List[str] | None = None,
+    ) -> None:
+        self.llm_client = llm_client
+        self.system_prompt = system_prompt
+        self.max_steps = max_steps
+        self.stop_sequences = stop_sequences or []
+        self._active_adapters: List[Dict[str, Any]] = []
+
+    # ------------------------------------------------------------------
+    # Core generation
+    # ------------------------------------------------------------------
+
+    def generate(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
+        """Send *messages* to the LLM client and return the raw response.
+
+        The caller is responsible for building the message list (including any
+        system prompt). Extra ``kwargs`` are forwarded to the client.
+        """
+        if self.llm_client is None:
+            raise RuntimeError("No llm_client configured on this Agent.")
+        return self.llm_client.generate(messages, **kwargs)
+
+    # ------------------------------------------------------------------
+    # Step-level generation
+    # ------------------------------------------------------------------
+
+    def generate_step(
+        self,
+        conversation_history: List[Dict[str, str]],
+        available_tools: List[Dict[str, Any]] | None = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Produce a single agent step (one LLM turn).
+
+        Returns a dict with at least ``{"role": "assistant", "content": ...}``.
+        If the model emits a tool-call the dict may also contain a
+        ``"tool_calls"`` key.
+        """
+        messages = self._build_messages(conversation_history)
+        gen_kwargs: Dict[str, Any] = dict(kwargs)
+        if available_tools:
+            gen_kwargs["tools"] = available_tools
+        if self.stop_sequences:
+            gen_kwargs.setdefault("stop", self.stop_sequences)
+
+        response = self.generate(messages, **gen_kwargs)
+        return response
+
+    # ------------------------------------------------------------------
+    # Full rollout
+    # ------------------------------------------------------------------
+
+    def generate_rollout(
+        self,
+        prompt: str,
+        environment: Any = None,
+        *,
+        max_steps: int | None = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Run a complete episode loop and return the trajectory.
+
+        If an *environment* is provided its ``step`` / ``run_episode`` contract
+        is used; otherwise we fall back to a simple generate-until-done loop.
+        """
+        steps = max_steps or self.max_steps
+
+        if environment is not None:
+            environment.reset(prompt=prompt)
+            result = environment.run_episode(agent=self, max_steps=steps, **kwargs)
+            return result
+
+        # Standalone loop (no environment)
+        history: List[Dict[str, str]] = [{"role": "user", "content": prompt}]
+        trajectory: List[Dict[str, Any]] = []
+
+        for _step_idx in range(steps):
+            response = self.generate_step(history)
+            trajectory.append(response)
+            history.append(response)
+
+            # Check for natural termination
+            if self._is_terminal(response):
+                break
+
+        return {
+            "prompt": prompt,
+            "history": history,
+            "trajectory": trajectory,
+            "final_answer": self.extract_final_answer(history),
+        }
+
+    # ------------------------------------------------------------------
+    # History compression
+    # ------------------------------------------------------------------
+
+    def compress_history(
+        self,
+        conversation_history: List[Dict[str, str]],
+        *,
+        target_tokens: int | None = None,
+        **kwargs,
+    ) -> List[Dict[str, str]]:
+        """Ask the LLM to produce a shorter version of *conversation_history*.
+
+        Returns a new message list whose semantic content is preserved but
+        whose token footprint is reduced.
+        """
+        if not conversation_history:
+            return []
+
+        # Build a compression prompt
+        serialized = "\n".join(
+            f"[{m.get('role', 'unknown')}]: {m.get('content', '')}"
+            for m in conversation_history
+        )
+        compression_prompt = (
+            "Compress the following conversation into a concise summary that "
+            "preserves all critical facts, tool results, and reasoning steps. "
+            "Return ONLY the summary.\n\n" + serialized
+        )
+        if target_tokens is not None:
+            compression_prompt += f"\n\nTarget length: roughly {target_tokens} tokens."
+
+        messages = [{"role": "user", "content": compression_prompt}]
+        response = self.generate(messages, **kwargs)
+        summary_content = response.get("content", "") if isinstance(response, dict) else str(response)
+
+        compressed: List[Dict[str, str]] = [
+            {"role": "system", "content": f"[Compressed history] {summary_content}"},
+        ]
+        return compressed
+
+    # ------------------------------------------------------------------
+    # Answer extraction
+    # ------------------------------------------------------------------
+
+    def extract_final_answer(
+        self,
+        conversation_history: List[Dict[str, str]],
+        **kwargs,
+    ) -> str | None:
+        """Extract the final answer from the conversation history.
+
+        Walks the history backwards looking for the last assistant message
+        that does not appear to be a tool call.
+        """
+        for message in reversed(conversation_history):
+            if message.get("role") != "assistant":
+                continue
+            # Skip messages that are purely tool calls with no textual content
+            if message.get("tool_calls") and not message.get("content"):
+                continue
+            content = message.get("content", "")
+            if content:
+                return content
+        return None
+
+    # ------------------------------------------------------------------
+    # LoRA adapter management
+    # ------------------------------------------------------------------
+
+    def load_adapter(
+        self,
+        adapter_path: str,
+        *,
+        adapter_name: str | None = None,
+        weight: float = 1.0,
+        **kwargs,
+    ) -> None:
+        """Load a LoRA adapter and merge it into the active set."""
+        name = adapter_name or adapter_path.rstrip("/").split("/")[-1]
+        adapter_record = {
+            "path": adapter_path,
+            "name": name,
+            "weight": weight,
+            **kwargs,
+        }
+        self._active_adapters.append(adapter_record)
+
+        # Delegate to the LLM client if it supports adapter loading
+        if hasattr(self.llm_client, "load_adapter"):
+            self.llm_client.load_adapter(adapter_path, name=name, weight=weight, **kwargs)
+
+    def unload_adapter(
+        self,
+        adapter_name: str | None = None,
+        **kwargs,
+    ) -> None:
+        """Unload a LoRA adapter (or all adapters if *adapter_name* is ``None``)."""
+        if adapter_name is None:
+            removed = list(self._active_adapters)
+            self._active_adapters.clear()
+        else:
+            removed = [a for a in self._active_adapters if a["name"] == adapter_name]
+            self._active_adapters = [
+                a for a in self._active_adapters if a["name"] != adapter_name
+            ]
+
+        # Delegate to the LLM client if it supports adapter unloading
+        if hasattr(self.llm_client, "unload_adapter"):
+            for adapter in removed:
+                self.llm_client.unload_adapter(adapter["name"], **kwargs)
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _build_messages(
+        self, conversation_history: List[Dict[str, str]]
+    ) -> List[Dict[str, str]]:
+        """Prepend the system prompt (if any) to the conversation history."""
+        messages: List[Dict[str, str]] = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.extend(conversation_history)
+        return messages
+
+    @staticmethod
+    def _is_terminal(response: Dict[str, Any]) -> bool:
+        """Heuristic: a response is terminal if it has content and no tool calls."""
+        has_tool_calls = bool(response.get("tool_calls"))
+        has_content = bool(response.get("content"))
+        return has_content and not has_tool_calls
+
+
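Before the second class, one observation worth making concrete: the ``generate(messages, **kwargs)`` contract in ``Agent``'s docstring is the only thing a client has to satisfy, which makes the class easy to exercise. A minimal sketch; `EchoClient` is my stub, not part of the package.

```python
# Minimal stub satisfying the documented llm_client contract.
# EchoClient is illustrative only and not part of konash.
from konash.agent import Agent

class EchoClient:
    def generate(self, messages, **kwargs):
        return {"role": "assistant", "content": f"echo: {messages[-1]['content']}"}

agent = Agent(EchoClient(), system_prompt="You are a test agent.")
result = agent.generate_rollout("hello")
# The stub reply has content and no tool_calls, so _is_terminal fires after
# one step and the rollout ends immediately.
print(result["final_answer"])  # -> "echo: hello"
```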
+class ValueGuidedAgent:
+    """Agent variant that generates multiple candidate continuations per step,
+    scores each with a value model, and selects the best one.
+
+    Drop-in replacement for ``Agent.generate_step`` when plugged into an
+    environment or strategy that expects the same interface.
+    """
+
+    candidate_width = 2
+    value_model = None
+
+    def __init__(
+        self,
+        llm_client: Any = None,
+        *,
+        candidate_width: int = 2,
+        value_model: Any = None,
+        system_prompt: str | None = None,
+        max_steps: int = 20,
+        stop_sequences: List[str] | None = None,
+    ) -> None:
+        # Compose an inner Agent for actual generation
+        self._agent = Agent(
+            llm_client=llm_client,
+            system_prompt=system_prompt,
+            max_steps=max_steps,
+            stop_sequences=stop_sequences,
+        )
+        self.candidate_width = candidate_width
+        self.value_model = value_model
+        self.llm_client = llm_client
+
+    def generate_step(
+        self,
+        conversation_history: List[Dict[str, str]],
+        available_tools: List[Dict[str, Any]] | None = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Generate *candidate_width* candidates, score them, pick the best."""
+        candidates: List[Dict[str, Any]] = []
+        for _ in range(self.candidate_width):
+            candidate = self._agent.generate_step(
+                conversation_history,
+                available_tools=available_tools,
+                **kwargs,
+            )
+            candidates.append(candidate)
+
+        if not candidates:
+            raise RuntimeError("No candidates generated.")
+
+        # If no value model is available, fall back to first candidate
+        if self.value_model is None:
+            return candidates[0]
+
+        # Score each candidate
+        best_candidate = candidates[0]
+        best_score = float("-inf")
+        for candidate in candidates:
+            augmented_history = list(conversation_history) + [candidate]
+            score = self.value_model.score_partial_rollout(augmented_history)
+            if score > best_score:
+                best_score = score
+                best_candidate = candidate
+
+        return best_candidate
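A matching sketch for the value-guided variant: the value model only needs a `score_partial_rollout(history)` method, so a toy scorer is enough to see the selection logic work. Both stubs below are illustrative; konash/inference/value_model.py in this release provides the real value model.

```python
# Toy value model that prefers longer candidate answers; illustrative only.
# The real scorer lives in konash/inference/value_model.py.
import random
from konash.agent import ValueGuidedAgent

class RandomClient:
    def generate(self, messages, **kwargs):
        return {"role": "assistant", "content": "word " * random.randint(1, 5)}

class LengthValueModel:
    def score_partial_rollout(self, history):
        return len(history[-1]["content"])   # longer answer = higher score

vg = ValueGuidedAgent(
    RandomClient(), candidate_width=4, value_model=LengthValueModel()
)
step = vg.generate_step([{"role": "user", "content": "Q?"}])
print(step["content"])  # the longest of the 4 sampled candidates
```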