agentforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge/__init__.py +12 -0
- agentforge/cli.py +175 -0
- agentforge/core/__init__.py +15 -0
- agentforge/core/agent.py +186 -0
- agentforge/core/parser.py +85 -0
- agentforge/core/prompts.py +59 -0
- agentforge/eval/__init__.py +19 -0
- agentforge/eval/metrics.py +105 -0
- agentforge/eval/report.py +55 -0
- agentforge/llm/__init__.py +7 -0
- agentforge/llm/base.py +16 -0
- agentforge/llm/hf.py +83 -0
- agentforge/llm/quantized.py +39 -0
- agentforge/memory/__init__.py +7 -0
- agentforge/memory/base.py +23 -0
- agentforge/memory/conversation.py +30 -0
- agentforge/memory/persistent.py +80 -0
- agentforge/serve/__init__.py +5 -0
- agentforge/serve/app.py +83 -0
- agentforge/tools/__init__.py +18 -0
- agentforge/tools/base.py +55 -0
- agentforge/tools/calculator.py +115 -0
- agentforge/tools/python_repl.py +143 -0
- agentforge/tools/rag.py +54 -0
- agentforge/tools/sql.py +64 -0
- agentforge/tools/web_search.py +48 -0
- agentforge/utils.py +35 -0
- agentforge_ml-0.1.0.dist-info/METADATA +242 -0
- agentforge_ml-0.1.0.dist-info/RECORD +32 -0
- agentforge_ml-0.1.0.dist-info/WHEEL +4 -0
- agentforge_ml-0.1.0.dist-info/entry_points.txt +3 -0
- agentforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
agentforge/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""AgentForge — ReAct agents on open-weight LLMs with tools and an eval harness."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
from agentforge.core.agent import Agent, AgentResult, Step
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
__version__ = version("agentforge-ml")
|
|
9
|
+
except PackageNotFoundError:
|
|
10
|
+
__version__ = "0.0.0+local"
|
|
11
|
+
|
|
12
|
+
__all__ = ["Agent", "AgentResult", "Step", "__version__"]
|
agentforge/cli.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Command-line interface — ``agentforge`` / ``af``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Annotated
|
|
9
|
+
|
|
10
|
+
import typer
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.panel import Panel
|
|
13
|
+
from rich.table import Table
|
|
14
|
+
|
|
15
|
+
app = typer.Typer(
|
|
16
|
+
name="agentforge",
|
|
17
|
+
help="ReAct agents on open-weight LLMs with tools and an eval harness.",
|
|
18
|
+
no_args_is_help=True,
|
|
19
|
+
rich_markup_mode="rich",
|
|
20
|
+
)
|
|
21
|
+
console = Console()
|
|
22
|
+
|
|
23
|
+
_TOOL_FACTORY = {
|
|
24
|
+
"calculator": lambda: _import("agentforge.tools.Calculator")(),
|
|
25
|
+
"python_repl": lambda: _import("agentforge.tools.PythonREPL")(),
|
|
26
|
+
"web_search": lambda: _import("agentforge.tools.WebSearch")(),
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _import(dotted: str):
|
|
31
|
+
mod_name, cls_name = dotted.rsplit(".", 1)
|
|
32
|
+
import importlib
|
|
33
|
+
|
|
34
|
+
return getattr(importlib.import_module(mod_name), cls_name)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _build_tools(spec: str):
|
|
38
|
+
if spec.lower() in ("all", "*"):
|
|
39
|
+
spec = ",".join(_TOOL_FACTORY)
|
|
40
|
+
tools = []
|
|
41
|
+
for name in (x.strip() for x in spec.split(",")):
|
|
42
|
+
if not name:
|
|
43
|
+
continue
|
|
44
|
+
if name not in _TOOL_FACTORY:
|
|
45
|
+
console.print(f"[red]unknown tool: {name}[/] (available: {', '.join(_TOOL_FACTORY)})")
|
|
46
|
+
raise typer.Exit(2)
|
|
47
|
+
tools.append(_TOOL_FACTORY[name]())
|
|
48
|
+
return tools
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@app.command()
|
|
52
|
+
def ask(
|
|
53
|
+
question: Annotated[str, typer.Argument()],
|
|
54
|
+
model_id: Annotated[str, typer.Option()] = "Qwen/Qwen2.5-3B-Instruct",
|
|
55
|
+
tools: Annotated[str, typer.Option(help="Comma-separated or 'all'")] = "calculator,python_repl",
|
|
56
|
+
max_steps: Annotated[int, typer.Option()] = 6,
|
|
57
|
+
max_new_tokens: Annotated[int, typer.Option()] = 256,
|
|
58
|
+
quantize: Annotated[str | None, typer.Option(help="Optional turboquant-ml method")] = None,
|
|
59
|
+
verbose: Annotated[bool, typer.Option()] = False,
|
|
60
|
+
) -> None:
|
|
61
|
+
"""Run the ReAct agent on one question."""
|
|
62
|
+
from agentforge import Agent
|
|
63
|
+
|
|
64
|
+
console.print(Panel.fit(f"[bold teal]agentforge ask[/] [dim]{question}[/]"))
|
|
65
|
+
tool_list = _build_tools(tools)
|
|
66
|
+
agent = Agent.from_defaults(
|
|
67
|
+
model_id=model_id, tools=tool_list, max_steps=max_steps, verbose=verbose, quantize=quantize
|
|
68
|
+
)
|
|
69
|
+
result = agent.run(question, max_new_tokens=max_new_tokens)
|
|
70
|
+
|
|
71
|
+
if verbose or not result.success:
|
|
72
|
+
table = Table(title="Steps", show_header=True)
|
|
73
|
+
table.add_column("#", style="dim")
|
|
74
|
+
table.add_column("tool")
|
|
75
|
+
table.add_column("input")
|
|
76
|
+
table.add_column("observation")
|
|
77
|
+
for i, s in enumerate(result.steps, 1):
|
|
78
|
+
table.add_row(
|
|
79
|
+
str(i), s.tool or "-", (s.action_input or "")[:60], (s.observation or "")[:80]
|
|
80
|
+
)
|
|
81
|
+
console.print(table)
|
|
82
|
+
|
|
83
|
+
console.print(
|
|
84
|
+
f"\n[bold]Final answer[/] [dim]({result.latency_ms:.0f} ms, {result.n_steps} steps)[/]"
|
|
85
|
+
)
|
|
86
|
+
console.print(result.final_answer)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@app.command()
|
|
90
|
+
def eval(
|
|
91
|
+
dataset: Annotated[
|
|
92
|
+
Path, typer.Argument(help="JSONL with {question, ground_truth, expected_tools?}")
|
|
93
|
+
],
|
|
94
|
+
model_id: Annotated[str, typer.Option()] = "Qwen/Qwen2.5-3B-Instruct",
|
|
95
|
+
tools: Annotated[str, typer.Option()] = "all",
|
|
96
|
+
max_steps: Annotated[int, typer.Option()] = 6,
|
|
97
|
+
out: Annotated[Path | None, typer.Option()] = None,
|
|
98
|
+
limit: Annotated[int | None, typer.Option()] = None,
|
|
99
|
+
) -> None:
|
|
100
|
+
"""Run the eval harness over a JSONL dataset."""
|
|
101
|
+
from agentforge import Agent
|
|
102
|
+
from agentforge.eval import evaluate
|
|
103
|
+
from agentforge.eval.report import EvalReport
|
|
104
|
+
|
|
105
|
+
samples = _read_jsonl(dataset, limit=limit)
|
|
106
|
+
console.print(Panel.fit(f"[bold teal]agentforge eval[/] n={len(samples)}"))
|
|
107
|
+
|
|
108
|
+
tool_list = _build_tools(tools)
|
|
109
|
+
agent = Agent.from_defaults(model_id=model_id, tools=tool_list, max_steps=max_steps)
|
|
110
|
+
|
|
111
|
+
results = []
|
|
112
|
+
latencies: list[float] = []
|
|
113
|
+
for s in samples:
|
|
114
|
+
t0 = time.perf_counter()
|
|
115
|
+
r = agent.run(s["question"])
|
|
116
|
+
latencies.append((time.perf_counter() - t0) * 1000)
|
|
117
|
+
results.append(r)
|
|
118
|
+
|
|
119
|
+
res = evaluate(samples, results)
|
|
120
|
+
report = EvalReport(
|
|
121
|
+
n=res["n"], means=res["means"], per_sample=res["per_sample"], latencies_ms=latencies
|
|
122
|
+
)
|
|
123
|
+
console.print(report.as_table())
|
|
124
|
+
if out:
|
|
125
|
+
report.save(out)
|
|
126
|
+
console.print(f"[green]ok[/] saved {out}")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@app.command()
|
|
130
|
+
def serve(
|
|
131
|
+
model_id: Annotated[str, typer.Option()] = "Qwen/Qwen2.5-3B-Instruct",
|
|
132
|
+
tools: Annotated[str, typer.Option()] = "calculator,python_repl",
|
|
133
|
+
max_steps: Annotated[int, typer.Option()] = 6,
|
|
134
|
+
host: Annotated[str, typer.Option()] = "127.0.0.1",
|
|
135
|
+
port: Annotated[int, typer.Option()] = 8000,
|
|
136
|
+
) -> None:
|
|
137
|
+
"""Start the FastAPI agent server."""
|
|
138
|
+
import uvicorn
|
|
139
|
+
|
|
140
|
+
from agentforge import Agent
|
|
141
|
+
from agentforge.serve import build_app
|
|
142
|
+
|
|
143
|
+
tool_list = _build_tools(tools)
|
|
144
|
+
agent = Agent.from_defaults(model_id=model_id, tools=tool_list, max_steps=max_steps)
|
|
145
|
+
app_ = build_app(agent)
|
|
146
|
+
uvicorn.run(app_, host=host, port=port, log_level="info")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@app.command(name="tools")
|
|
150
|
+
def list_tools_cmd() -> None:
|
|
151
|
+
"""List the built-in tools."""
|
|
152
|
+
table = Table(title="Built-in tools")
|
|
153
|
+
table.add_column("name")
|
|
154
|
+
table.add_column("description")
|
|
155
|
+
for name, factory in _TOOL_FACTORY.items():
|
|
156
|
+
t = factory()
|
|
157
|
+
table.add_row(name, t.description)
|
|
158
|
+
console.print(table)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _read_jsonl(path: Path, *, limit: int | None) -> list[dict]:
|
|
162
|
+
rows: list[dict] = []
|
|
163
|
+
with path.open(encoding="utf-8") as f:
|
|
164
|
+
for line in f:
|
|
165
|
+
s = line.strip()
|
|
166
|
+
if not s:
|
|
167
|
+
continue
|
|
168
|
+
rows.append(json.loads(s))
|
|
169
|
+
if limit and len(rows) >= limit:
|
|
170
|
+
break
|
|
171
|
+
return rows
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
if __name__ == "__main__":
|
|
175
|
+
app()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""ReAct loop + parser + prompts."""
|
|
2
|
+
|
|
3
|
+
from agentforge.core.agent import Agent, AgentResult, Step
|
|
4
|
+
from agentforge.core.parser import ParsedStep, parse_step
|
|
5
|
+
from agentforge.core.prompts import REACT_SYSTEM_PROMPT, build_user_prompt
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"REACT_SYSTEM_PROMPT",
|
|
9
|
+
"Agent",
|
|
10
|
+
"AgentResult",
|
|
11
|
+
"ParsedStep",
|
|
12
|
+
"Step",
|
|
13
|
+
"build_user_prompt",
|
|
14
|
+
"parse_step",
|
|
15
|
+
]
|
agentforge/core/agent.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""ReAct agent — the loop.
|
|
2
|
+
|
|
3
|
+
while step < max_steps:
|
|
4
|
+
thought, action, action_input = LLM(question + scratchpad)
|
|
5
|
+
if final_answer: return
|
|
6
|
+
observation = tool(action_input)
|
|
7
|
+
scratchpad += step
|
|
8
|
+
|
|
9
|
+
That is the whole idea. Everything else is just plumbing.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import time
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from agentforge.core.parser import parse_step
|
|
20
|
+
from agentforge.core.prompts import build_system_prompt, build_user_prompt, format_scratchpad
|
|
21
|
+
from agentforge.llm import LLM
|
|
22
|
+
from agentforge.tools import Tool, ToolRegistry
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger("agentforge")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
|
|
28
|
+
class Step:
|
|
29
|
+
thought: str
|
|
30
|
+
tool: str | None
|
|
31
|
+
action_input: str | None
|
|
32
|
+
observation: str | None
|
|
33
|
+
elapsed_ms: float
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
|
|
37
|
+
class AgentResult:
|
|
38
|
+
question: str
|
|
39
|
+
final_answer: str
|
|
40
|
+
steps: list[Step]
|
|
41
|
+
n_steps: int
|
|
42
|
+
success: bool
|
|
43
|
+
latency_ms: float
|
|
44
|
+
extras: dict[str, Any] = field(default_factory=dict)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Agent:
|
|
48
|
+
"""ReAct agent with a tool registry and a configurable LLM."""
|
|
49
|
+
|
|
50
|
+
def __init__(
|
|
51
|
+
self,
|
|
52
|
+
llm: LLM,
|
|
53
|
+
tools: list[Tool] | ToolRegistry,
|
|
54
|
+
*,
|
|
55
|
+
max_steps: int = 6,
|
|
56
|
+
stop: tuple[str, ...] = ("\nObservation:",),
|
|
57
|
+
verbose: bool = False,
|
|
58
|
+
) -> None:
|
|
59
|
+
self.llm = llm
|
|
60
|
+
self.tools = tools if isinstance(tools, ToolRegistry) else ToolRegistry(tools)
|
|
61
|
+
self.max_steps = max_steps
|
|
62
|
+
self.stop = stop
|
|
63
|
+
self.verbose = verbose
|
|
64
|
+
|
|
65
|
+
@classmethod
|
|
66
|
+
def from_defaults(
|
|
67
|
+
cls,
|
|
68
|
+
model_id: str = "Qwen/Qwen2.5-3B-Instruct",
|
|
69
|
+
*,
|
|
70
|
+
tools: list[Tool] | None = None,
|
|
71
|
+
max_steps: int = 6,
|
|
72
|
+
verbose: bool = False,
|
|
73
|
+
quantize: str | None = None,
|
|
74
|
+
device_map: str | dict | None = "auto",
|
|
75
|
+
) -> Agent:
|
|
76
|
+
if quantize:
|
|
77
|
+
from agentforge.llm import QuantizedHFLLM
|
|
78
|
+
|
|
79
|
+
llm = QuantizedHFLLM(model_id, method=quantize, device_map=device_map)
|
|
80
|
+
else:
|
|
81
|
+
from agentforge.llm import HFLLM
|
|
82
|
+
|
|
83
|
+
llm = HFLLM(model_id, device_map=device_map)
|
|
84
|
+
|
|
85
|
+
return cls(llm=llm, tools=tools or [], max_steps=max_steps, verbose=verbose)
|
|
86
|
+
|
|
87
|
+
# ------------------------------------------------------------------ public
|
|
88
|
+
|
|
89
|
+
def run(
|
|
90
|
+
self,
|
|
91
|
+
question: str,
|
|
92
|
+
*,
|
|
93
|
+
max_steps: int | None = None,
|
|
94
|
+
max_new_tokens: int = 256,
|
|
95
|
+
) -> AgentResult:
|
|
96
|
+
max_steps = max_steps or self.max_steps
|
|
97
|
+
sys_prompt = build_system_prompt(list(self.tools))
|
|
98
|
+
steps: list[Step] = []
|
|
99
|
+
t0 = time.perf_counter()
|
|
100
|
+
|
|
101
|
+
for _ in range(max_steps):
|
|
102
|
+
scratchpad = format_scratchpad(steps)
|
|
103
|
+
user_prompt = build_user_prompt(question, scratchpad)
|
|
104
|
+
full_prompt = f"{sys_prompt}\n\n{user_prompt}"
|
|
105
|
+
|
|
106
|
+
llm_t0 = time.perf_counter()
|
|
107
|
+
raw = self.llm.generate(
|
|
108
|
+
full_prompt,
|
|
109
|
+
max_new_tokens=max_new_tokens,
|
|
110
|
+
stop=list(self.stop),
|
|
111
|
+
)
|
|
112
|
+
elapsed = (time.perf_counter() - llm_t0) * 1000
|
|
113
|
+
|
|
114
|
+
parsed = parse_step(raw)
|
|
115
|
+
if self.verbose:
|
|
116
|
+
logger.info("step parse: %s", parsed)
|
|
117
|
+
|
|
118
|
+
if parsed.is_final:
|
|
119
|
+
steps.append(
|
|
120
|
+
Step(
|
|
121
|
+
thought=parsed.thought,
|
|
122
|
+
tool=None,
|
|
123
|
+
action_input=None,
|
|
124
|
+
observation=None,
|
|
125
|
+
elapsed_ms=elapsed,
|
|
126
|
+
)
|
|
127
|
+
)
|
|
128
|
+
return _success(question, parsed.final_answer or "", steps, t0)
|
|
129
|
+
|
|
130
|
+
# Need a tool to continue. If the LLM didn't emit one, treat the
|
|
131
|
+
# raw response as the final answer (graceful degradation).
|
|
132
|
+
if not parsed.tool:
|
|
133
|
+
return _success(question, raw.strip(), steps, t0, success=False)
|
|
134
|
+
|
|
135
|
+
tool = self.tools.get(parsed.tool)
|
|
136
|
+
if tool is None:
|
|
137
|
+
observation = (
|
|
138
|
+
f"Error: unknown tool '{parsed.tool}'. "
|
|
139
|
+
f"Available: {', '.join(t.name for t in self.tools)}"
|
|
140
|
+
)
|
|
141
|
+
else:
|
|
142
|
+
try:
|
|
143
|
+
observation = tool.run(parsed.action_input or "")
|
|
144
|
+
except Exception as e:
|
|
145
|
+
observation = f"Error running {parsed.tool}: {type(e).__name__}: {e}"
|
|
146
|
+
|
|
147
|
+
steps.append(
|
|
148
|
+
Step(
|
|
149
|
+
thought=parsed.thought,
|
|
150
|
+
tool=parsed.tool,
|
|
151
|
+
action_input=parsed.action_input,
|
|
152
|
+
observation=observation,
|
|
153
|
+
elapsed_ms=elapsed,
|
|
154
|
+
)
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Out of steps.
|
|
158
|
+
return _success(
|
|
159
|
+
question,
|
|
160
|
+
steps[-1].observation if steps and steps[-1].observation else "",
|
|
161
|
+
steps,
|
|
162
|
+
t0,
|
|
163
|
+
success=False,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _success(
|
|
168
|
+
question: str,
|
|
169
|
+
answer: str,
|
|
170
|
+
steps: list[Step],
|
|
171
|
+
t0: float,
|
|
172
|
+
*,
|
|
173
|
+
success: bool = True,
|
|
174
|
+
) -> AgentResult:
|
|
175
|
+
latency_ms = (time.perf_counter() - t0) * 1000
|
|
176
|
+
return AgentResult(
|
|
177
|
+
question=question,
|
|
178
|
+
final_answer=answer,
|
|
179
|
+
steps=steps,
|
|
180
|
+
n_steps=len(steps),
|
|
181
|
+
success=success,
|
|
182
|
+
latency_ms=round(latency_ms, 2),
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
__all__ = ["Agent", "AgentResult", "Step"]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""ReAct output parser.
|
|
2
|
+
|
|
3
|
+
The LLM emits free-form text shaped roughly like:
|
|
4
|
+
|
|
5
|
+
Thought: ...
|
|
6
|
+
Action: tool_name
|
|
7
|
+
Action Input: ...
|
|
8
|
+
|
|
9
|
+
or:
|
|
10
|
+
|
|
11
|
+
Thought: ...
|
|
12
|
+
Final Answer: ...
|
|
13
|
+
|
|
14
|
+
This parser is intentionally **forgiving** — small open models often forget
|
|
15
|
+
whitespace, mix cases, or fail to close the action input. We grab the most
|
|
16
|
+
recent plausible step from the buffer and stop at the first unambiguous
|
|
17
|
+
boundary.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
|
|
25
|
+
_FINAL = re.compile(r"final\s*answer\s*:\s*(.*)", re.IGNORECASE | re.DOTALL)
|
|
26
|
+
_THOUGHT = re.compile(
|
|
27
|
+
r"thought\s*:\s*(.*?)(?=\n\s*(?:action|final\s*answer)\s*:|$)", re.IGNORECASE | re.DOTALL
|
|
28
|
+
)
|
|
29
|
+
_ACTION = re.compile(r"action\s*:\s*([^\n]+)", re.IGNORECASE)
|
|
30
|
+
_INPUT = re.compile(
|
|
31
|
+
r"action\s*input\s*:\s*(.*?)(?=\n\s*(?:observation|thought|action|final\s*answer)\s*:|$)",
|
|
32
|
+
re.IGNORECASE | re.DOTALL,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
|
|
37
|
+
class ParsedStep:
|
|
38
|
+
thought: str = ""
|
|
39
|
+
tool: str | None = None
|
|
40
|
+
action_input: str | None = None
|
|
41
|
+
final_answer: str | None = None
|
|
42
|
+
|
|
43
|
+
@property
|
|
44
|
+
def is_final(self) -> bool:
|
|
45
|
+
return self.final_answer is not None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_step(text: str) -> ParsedStep:
|
|
49
|
+
"""Pull the next Thought/Action/Action Input *or* Final Answer out of ``text``."""
|
|
50
|
+
text = text.strip()
|
|
51
|
+
|
|
52
|
+
final = _FINAL.search(text)
|
|
53
|
+
if final:
|
|
54
|
+
# If the model emitted a Final Answer, anything before it is the thought.
|
|
55
|
+
thought = _extract_thought(text[: final.start()])
|
|
56
|
+
ans = final.group(1).strip()
|
|
57
|
+
# Trim a possible trailing block ("Observation: ...") in case the model went past.
|
|
58
|
+
ans = _trim_after(ans, ("\nThought:", "\nObservation:", "\nAction:"))
|
|
59
|
+
return ParsedStep(thought=thought, final_answer=ans)
|
|
60
|
+
|
|
61
|
+
thought = _extract_thought(text)
|
|
62
|
+
action = _ACTION.search(text)
|
|
63
|
+
inp = _INPUT.search(text)
|
|
64
|
+
tool = action.group(1).strip() if action else None
|
|
65
|
+
action_input = inp.group(1).strip() if inp else None
|
|
66
|
+
if action_input is not None:
|
|
67
|
+
action_input = _trim_after(action_input, ("\nThought:", "\nObservation:", "\nAction:"))
|
|
68
|
+
return ParsedStep(thought=thought, tool=tool, action_input=action_input)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _extract_thought(text: str) -> str:
|
|
72
|
+
m = _THOUGHT.search(text)
|
|
73
|
+
if not m:
|
|
74
|
+
return ""
|
|
75
|
+
return m.group(1).strip()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _trim_after(s: str, stops: tuple[str, ...]) -> str:
|
|
79
|
+
earliest = len(s)
|
|
80
|
+
lower = s.lower()
|
|
81
|
+
for stop in stops:
|
|
82
|
+
idx = lower.find(stop.lower())
|
|
83
|
+
if idx != -1:
|
|
84
|
+
earliest = min(earliest, idx)
|
|
85
|
+
return s[:earliest].strip()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""ReAct prompt templates.
|
|
2
|
+
|
|
3
|
+
The system prompt explains the loop format and lists the available tools with
|
|
4
|
+
their docstrings. We keep it short and concrete — long prompts confuse small
|
|
5
|
+
open models more than they help.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
REACT_SYSTEM_PROMPT = """You are a helpful assistant that solves problems step by step.
|
|
11
|
+
|
|
12
|
+
You have access to the following tools:
|
|
13
|
+
|
|
14
|
+
{tool_block}
|
|
15
|
+
|
|
16
|
+
Use this exact format, one Thought/Action/Action Input per step:
|
|
17
|
+
|
|
18
|
+
Thought: <your reasoning about what to do next>
|
|
19
|
+
Action: <one of: {tool_names}>
|
|
20
|
+
Action Input: <the input to the tool>
|
|
21
|
+
|
|
22
|
+
After each step, you will receive an Observation: with the tool's output.
|
|
23
|
+
Continue until you can answer, then write:
|
|
24
|
+
|
|
25
|
+
Thought: <reasoning summarizing the result>
|
|
26
|
+
Final Answer: <your final, concise answer>
|
|
27
|
+
|
|
28
|
+
Important rules:
|
|
29
|
+
- Use only one Action per step. Do not write multiple actions at once.
|
|
30
|
+
- If the answer is straightforward and no tool is needed, you may go directly to "Final Answer:".
|
|
31
|
+
- Do not invent observations — wait for the system to provide them.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def build_system_prompt(tools: list) -> str:
|
|
36
|
+
"""Render the tool registry into the system prompt."""
|
|
37
|
+
tool_block = "\n".join(f"- {t.name}: {t.description}" for t in tools)
|
|
38
|
+
tool_names = ", ".join(t.name for t in tools) if tools else "(none)"
|
|
39
|
+
return REACT_SYSTEM_PROMPT.format(tool_block=tool_block, tool_names=tool_names)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def build_user_prompt(question: str, scratchpad: str = "") -> str:
|
|
43
|
+
"""Render the user turn: the question + the running scratchpad of prior steps."""
|
|
44
|
+
if not scratchpad:
|
|
45
|
+
return f"Question: {question}\n\nThought:"
|
|
46
|
+
return f"Question: {question}\n\n{scratchpad}\nThought:"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def format_scratchpad(steps: list) -> str:
|
|
50
|
+
"""Concatenate completed steps back into the prompt for the next iteration."""
|
|
51
|
+
parts = []
|
|
52
|
+
for s in steps:
|
|
53
|
+
parts.append(
|
|
54
|
+
f"Thought: {s.thought}\n"
|
|
55
|
+
f"Action: {s.tool}\n"
|
|
56
|
+
f"Action Input: {s.action_input}\n"
|
|
57
|
+
f"Observation: {s.observation}"
|
|
58
|
+
)
|
|
59
|
+
return "\n\n".join(parts)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Evaluation harness for ReAct agents."""
|
|
2
|
+
|
|
3
|
+
from agentforge.eval.metrics import (
|
|
4
|
+
evaluate,
|
|
5
|
+
final_answer_match,
|
|
6
|
+
step_efficiency,
|
|
7
|
+
task_completion,
|
|
8
|
+
tool_accuracy,
|
|
9
|
+
)
|
|
10
|
+
from agentforge.eval.report import EvalReport
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"EvalReport",
|
|
14
|
+
"evaluate",
|
|
15
|
+
"final_answer_match",
|
|
16
|
+
"step_efficiency",
|
|
17
|
+
"task_completion",
|
|
18
|
+
"tool_accuracy",
|
|
19
|
+
]
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Agent-quality metrics.
|
|
2
|
+
|
|
3
|
+
Pure Python, no judge model. The metrics are intentionally simple so the
|
|
4
|
+
relationship between "the agent did X" and "the metric says Y" is auditable
|
|
5
|
+
from the code alone.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from agentforge.core.agent import AgentResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def task_completion(result: AgentResult, sample: dict) -> float:
|
|
17
|
+
"""Did the agent produce *any* final answer? (Did the loop terminate cleanly?)"""
|
|
18
|
+
return 1.0 if result.success and result.final_answer.strip() else 0.0
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def final_answer_match(result: AgentResult, sample: dict) -> float:
|
|
22
|
+
"""Does the final answer contain the ground-truth string (case-folded)?
|
|
23
|
+
|
|
24
|
+
Substring rather than exact match because LLMs tend to wrap answers in
|
|
25
|
+
natural-language framing ("The answer is 42."). For numeric questions,
|
|
26
|
+
pass the bare number as ``ground_truth`` and it will be matched.
|
|
27
|
+
"""
|
|
28
|
+
gt = sample.get("ground_truth")
|
|
29
|
+
if not gt:
|
|
30
|
+
return 0.0
|
|
31
|
+
return 1.0 if str(gt).strip().lower() in result.final_answer.lower() else 0.0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def tool_accuracy(result: AgentResult, sample: dict) -> float:
|
|
35
|
+
"""Fraction of tool calls that match the expected tool(s).
|
|
36
|
+
|
|
37
|
+
``expected_tools`` in the sample can be:
|
|
38
|
+
- a string (one tool expected at any step)
|
|
39
|
+
- a list of strings (each step's expected tool, in order, with "any" wildcard)
|
|
40
|
+
- missing (returns 1.0 for trivia-style samples)
|
|
41
|
+
"""
|
|
42
|
+
expected = sample.get("expected_tools")
|
|
43
|
+
if expected is None:
|
|
44
|
+
return 1.0
|
|
45
|
+
|
|
46
|
+
actual = [s.tool for s in result.steps if s.tool]
|
|
47
|
+
if isinstance(expected, str):
|
|
48
|
+
return 1.0 if expected in actual else 0.0
|
|
49
|
+
|
|
50
|
+
if not isinstance(expected, list):
|
|
51
|
+
return 0.0
|
|
52
|
+
if not actual:
|
|
53
|
+
return 0.0
|
|
54
|
+
|
|
55
|
+
# Index-wise comparison up to the shorter sequence.
|
|
56
|
+
n = min(len(actual), len(expected))
|
|
57
|
+
if n == 0:
|
|
58
|
+
return 0.0
|
|
59
|
+
matched = sum(1 for i in range(n) if expected[i] in ("*", "any", actual[i]))
|
|
60
|
+
return matched / max(len(expected), 1)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def step_efficiency(result: AgentResult, sample: dict) -> float:
|
|
64
|
+
"""``ground_truth_steps / actual_steps`` clipped to [0, 1].
|
|
65
|
+
|
|
66
|
+
Encourages reaching the answer in as few steps as possible. If the sample
|
|
67
|
+
omits ``ground_truth_steps``, defaults to 1 step.
|
|
68
|
+
"""
|
|
69
|
+
if not result.success or result.n_steps == 0:
|
|
70
|
+
return 0.0
|
|
71
|
+
gt_steps = max(1, int(sample.get("ground_truth_steps", 1)))
|
|
72
|
+
return min(1.0, gt_steps / result.n_steps)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
_REGISTRY = {
|
|
76
|
+
"task_completion": task_completion,
|
|
77
|
+
"final_answer_match": final_answer_match,
|
|
78
|
+
"tool_accuracy": tool_accuracy,
|
|
79
|
+
"step_efficiency": step_efficiency,
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def evaluate(
|
|
84
|
+
samples: list[dict[str, Any]],
|
|
85
|
+
results: list[AgentResult],
|
|
86
|
+
*,
|
|
87
|
+
metrics: Iterable[str] = (
|
|
88
|
+
"task_completion",
|
|
89
|
+
"final_answer_match",
|
|
90
|
+
"tool_accuracy",
|
|
91
|
+
"step_efficiency",
|
|
92
|
+
),
|
|
93
|
+
) -> dict:
|
|
94
|
+
"""Score a parallel list of samples + results across one or more metrics."""
|
|
95
|
+
if len(samples) != len(results):
|
|
96
|
+
raise ValueError(f"len(samples)={len(samples)} != len(results)={len(results)}")
|
|
97
|
+
|
|
98
|
+
by_metric: dict[str, list[float]] = {m: [] for m in metrics}
|
|
99
|
+
for sample, result in zip(samples, results, strict=True):
|
|
100
|
+
for m in metrics:
|
|
101
|
+
fn = _REGISTRY[m]
|
|
102
|
+
by_metric[m].append(float(fn(result, sample)))
|
|
103
|
+
|
|
104
|
+
means = {m: (sum(v) / len(v) if v else 0.0) for m, v in by_metric.items()}
|
|
105
|
+
return {"n": len(samples), "means": means, "per_sample": by_metric}
|