agentforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agentforge/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """AgentForge — ReAct agents on open-weight LLMs with tools and an eval harness."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from agentforge.core.agent import Agent, AgentResult, Step
6
+
7
# Resolve the version of the installed "agentforge-ml" distribution. When no
# package metadata is found (PackageNotFoundError), fall back to a local
# placeholder version string instead of failing at import time.
try:
    __version__ = version("agentforge-ml")
except PackageNotFoundError:
    __version__ = "0.0.0+local"

# Public API of the top-level package.
__all__ = ["Agent", "AgentResult", "Step", "__version__"]
agentforge/cli.py ADDED
@@ -0,0 +1,175 @@
1
+ """Command-line interface — ``agentforge`` / ``af``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Annotated
9
+
10
+ import typer
11
+ from rich.console import Console
12
+ from rich.panel import Panel
13
+ from rich.table import Table
14
+
15
# Root Typer application; the ``@app.command`` functions below register on it.
# With ``no_args_is_help=True`` the help text is shown when no command is given.
app = typer.Typer(
    name="agentforge",
    help="ReAct agents on open-weight LLMs with tools and an eval harness.",
    no_args_is_help=True,
    rich_markup_mode="rich",
)
# Shared Rich console used for all output of this module.
console = Console()
22
+
23
# CLI tool name -> zero-argument factory returning a fresh tool instance.
# Each factory resolves its class through ``_import`` only when called, so the
# tool classes are not imported until a tool is actually requested.
_TOOL_FACTORY = {
    "calculator": lambda: _import("agentforge.tools.Calculator")(),
    "python_repl": lambda: _import("agentforge.tools.PythonREPL")(),
    "web_search": lambda: _import("agentforge.tools.WebSearch")(),
}
28
+
29
+
30
def _import(dotted: str):
    """Resolve a dotted ``package.module.Attr`` path to the attribute it names.

    The text after the last dot is the attribute; everything before it is the
    module to import. A path without any dot raises ``ValueError``.
    """
    from importlib import import_module

    module_path, attr_name = dotted.rsplit(".", 1)
    module = import_module(module_path)
    return getattr(module, attr_name)
35
+
36
+
37
def _build_tools(spec: str):
    """Turn a comma-separated tool spec into a list of tool instances.

    ``"all"`` or ``"*"`` (case-insensitive) selects every entry of
    ``_TOOL_FACTORY``. Empty items are ignored. An unknown name prints an error
    listing the available tools and exits with status 2.
    """
    if spec.lower() in {"all", "*"}:
        spec = ",".join(_TOOL_FACTORY)

    built = []
    for raw_name in spec.split(","):
        tool_name = raw_name.strip()
        if not tool_name:
            continue
        factory = _TOOL_FACTORY.get(tool_name)
        if factory is None:
            console.print(
                f"[red]unknown tool: {tool_name}[/] (available: {', '.join(_TOOL_FACTORY)})"
            )
            raise typer.Exit(2)
        built.append(factory())
    return built
49
+
50
+
51
@app.command()
def ask(
    question: Annotated[str, typer.Argument()],
    model_id: Annotated[str, typer.Option()] = "Qwen/Qwen2.5-3B-Instruct",
    tools: Annotated[str, typer.Option(help="Comma-separated or 'all'")] = "calculator,python_repl",
    max_steps: Annotated[int, typer.Option()] = 6,
    max_new_tokens: Annotated[int, typer.Option()] = 256,
    quantize: Annotated[str | None, typer.Option(help="Optional turboquant-ml method")] = None,
    verbose: Annotated[bool, typer.Option()] = False,
) -> None:
    """Run the ReAct agent on one question."""
    # NOTE: the docstring above doubles as the CLI help text, so details live
    # in comments instead.
    #
    # Flow: build the requested tools, construct an Agent, run it once, then
    # print the step table (only when ``verbose`` is set or the run failed)
    # followed by the final answer with latency and step count.
    from rich.markup import escape

    from agentforge import Agent

    # The question, tool inputs/observations and the model's answer are
    # arbitrary text. Rich would interpret any "[...]" in them as markup
    # (restyling, dropping text or raising MarkupError), so they are escaped
    # or printed with markup disabled.
    console.print(Panel.fit(f"[bold teal]agentforge ask[/] [dim]{escape(question)}[/]"))
    tool_list = _build_tools(tools)
    agent = Agent.from_defaults(
        model_id=model_id, tools=tool_list, max_steps=max_steps, verbose=verbose, quantize=quantize
    )
    result = agent.run(question, max_new_tokens=max_new_tokens)

    if verbose or not result.success:
        table = Table(title="Steps", show_header=True)
        table.add_column("#", style="dim")
        table.add_column("tool")
        table.add_column("input")
        table.add_column("observation")
        for i, s in enumerate(result.steps, 1):
            # Truncate first, then escape, so a cut never splits an escape sequence.
            table.add_row(
                str(i),
                escape(s.tool or "-"),
                escape((s.action_input or "")[:60]),
                escape((s.observation or "")[:80]),
            )
        console.print(table)

    console.print(
        f"\n[bold]Final answer[/] [dim]({result.latency_ms:.0f} ms, {result.n_steps} steps)[/]"
    )
    console.print(result.final_answer, markup=False)
87
+
88
+
89
@app.command()
def eval(
    dataset: Annotated[
        Path, typer.Argument(help="JSONL with {question, ground_truth, expected_tools?}")
    ],
    model_id: Annotated[str, typer.Option()] = "Qwen/Qwen2.5-3B-Instruct",
    tools: Annotated[str, typer.Option()] = "all",
    max_steps: Annotated[int, typer.Option()] = 6,
    out: Annotated[Path | None, typer.Option()] = None,
    limit: Annotated[int | None, typer.Option()] = None,
) -> None:
    """Run the eval harness over a JSONL dataset."""
    # Flow: load samples, run the agent on each question while timing the call,
    # score all results, print the report and optionally save it to ``out``.
    from agentforge import Agent
    from agentforge.eval import evaluate
    from agentforge.eval.report import EvalReport

    samples = _read_jsonl(dataset, limit=limit)
    console.print(Panel.fit(f"[bold teal]agentforge eval[/] n={len(samples)}"))

    agent = Agent.from_defaults(
        model_id=model_id,
        tools=_build_tools(tools),
        max_steps=max_steps,
    )

    outcomes = []
    wall_ms: list[float] = []
    for sample in samples:
        started = time.perf_counter()
        outcome = agent.run(sample["question"])
        # Wall-clock time of the whole run() call, in milliseconds.
        wall_ms.append((time.perf_counter() - started) * 1000)
        outcomes.append(outcome)

    scores = evaluate(samples, outcomes)
    report = EvalReport(
        n=scores["n"],
        means=scores["means"],
        per_sample=scores["per_sample"],
        latencies_ms=wall_ms,
    )
    console.print(report.as_table())
    if out:
        report.save(out)
        console.print(f"[green]ok[/] saved {out}")
127
+
128
+
129
@app.command()
def serve(
    model_id: Annotated[str, typer.Option()] = "Qwen/Qwen2.5-3B-Instruct",
    tools: Annotated[str, typer.Option()] = "calculator,python_repl",
    max_steps: Annotated[int, typer.Option()] = 6,
    host: Annotated[str, typer.Option()] = "127.0.0.1",
    port: Annotated[int, typer.Option()] = 8000,
) -> None:
    """Start the FastAPI agent server."""
    # Heavy imports stay local so other commands do not pay for them.
    import uvicorn

    from agentforge import Agent
    from agentforge.serve import build_app

    agent = Agent.from_defaults(
        model_id=model_id,
        tools=_build_tools(tools),
        max_steps=max_steps,
    )
    # Wrap the agent in an ASGI app and serve it until interrupted.
    uvicorn.run(build_app(agent), host=host, port=port, log_level="info")
147
+
148
+
149
@app.command(name="tools")
def list_tools_cmd() -> None:
    """List the built-in tools."""
    listing = Table(title="Built-in tools")
    for heading in ("name", "description"):
        listing.add_column(heading)
    # Each tool is instantiated just to read its ``description`` attribute.
    for tool_name, make_tool in _TOOL_FACTORY.items():
        listing.add_row(tool_name, make_tool().description)
    console.print(listing)
159
+
160
+
161
def _read_jsonl(path: Path, *, limit: int | None) -> list[dict]:
    """Read a JSON Lines file into a list of decoded rows.

    Blank lines are skipped. ``limit`` caps the number of rows returned:
    ``None`` means no cap, and ``0`` (or a negative value) returns an empty
    list. The file is opened in every case, so a missing file is still
    reported.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    rows: list[dict] = []
    with path.open(encoding="utf-8") as f:
        for line in f:
            # Test the cap before parsing. ``limit`` is compared against None
            # explicitly so that 0 is an actual limit and not "no limit".
            if limit is not None and len(rows) >= limit:
                break
            s = line.strip()
            if not s:
                continue
            rows.append(json.loads(s))
    return rows
172
+
173
+
174
# Run the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
@@ -0,0 +1,15 @@
1
+ """ReAct loop + parser + prompts."""
2
+
3
+ from agentforge.core.agent import Agent, AgentResult, Step
4
+ from agentforge.core.parser import ParsedStep, parse_step
5
+ from agentforge.core.prompts import REACT_SYSTEM_PROMPT, build_user_prompt
6
+
7
# Public names of this package; every entry is imported above.
__all__ = [
    "REACT_SYSTEM_PROMPT",
    "Agent",
    "AgentResult",
    "ParsedStep",
    "Step",
    "build_user_prompt",
    "parse_step",
]
@@ -0,0 +1,186 @@
1
+ """ReAct agent — the loop.
2
+
3
+ while step < max_steps:
4
+ thought, action, action_input = LLM(question + scratchpad)
5
+ if final_answer: return
6
+ observation = tool(action_input)
7
+ scratchpad += step
8
+
9
+ That is the whole idea. Everything else is just plumbing.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import time
16
+ from dataclasses import dataclass, field
17
+ from typing import Any
18
+
19
+ from agentforge.core.parser import parse_step
20
+ from agentforge.core.prompts import build_system_prompt, build_user_prompt, format_scratchpad
21
+ from agentforge.llm import LLM
22
+ from agentforge.tools import Tool, ToolRegistry
23
+
24
+ logger = logging.getLogger("agentforge")
25
+
26
+
27
@dataclass
class Step:
    """One recorded iteration of the ReAct loop (see ``Agent.run``)."""

    # Thought text parsed from the model output for this step.
    thought: str
    # Tool name the model asked for; ``Agent.run`` stores None on the final-answer step.
    tool: str | None
    # Input parsed for the tool; None when absent and on the final-answer step.
    action_input: str | None
    # Tool output or an "Error ..." message; None on the final-answer step.
    observation: str | None
    # Time spent in the LLM ``generate`` call for this step, in milliseconds.
    # ``Agent.run`` measures it before the tool runs, so tool time is not included.
    elapsed_ms: float
34
+
35
+
36
@dataclass
class AgentResult:
    """Outcome of a single ``Agent.run`` call."""

    # The question as passed to ``run``.
    question: str
    # The parsed final answer. On unsuccessful runs ``Agent.run`` stores the raw
    # model text (no tool emitted) or the last observation (out of steps), which
    # may be empty.
    final_answer: str
    # Recorded steps, in execution order.
    steps: list[Step]
    # ``len(steps)`` as computed by ``_success``.
    n_steps: int
    # True only when the model produced a final answer within the step budget.
    success: bool
    # Wall-clock duration of the run in milliseconds, rounded to 2 decimals.
    latency_ms: float
    # Free-form extra data; empty by default.
    extras: dict[str, Any] = field(default_factory=dict)
45
+
46
+
47
class Agent:
    """ReAct agent with a tool registry and a configurable LLM.

    The agent repeatedly prompts the LLM with the question plus the scratchpad
    of earlier steps, parses the reply, and either returns the final answer or
    runs the requested tool and records its observation.
    """

    def __init__(
        self,
        llm: LLM,
        tools: list[Tool] | ToolRegistry,
        *,
        max_steps: int = 6,
        stop: tuple[str, ...] = ("\nObservation:",),
        verbose: bool = False,
    ) -> None:
        """Store the LLM and settings; wrap a plain tool list in a ``ToolRegistry``."""
        if not isinstance(tools, ToolRegistry):
            tools = ToolRegistry(tools)
        self.llm = llm
        self.tools = tools
        self.max_steps = max_steps
        self.stop = stop
        self.verbose = verbose

    @classmethod
    def from_defaults(
        cls,
        model_id: str = "Qwen/Qwen2.5-3B-Instruct",
        *,
        tools: list[Tool] | None = None,
        max_steps: int = 6,
        verbose: bool = False,
        quantize: str | None = None,
        device_map: str | dict | None = "auto",
    ) -> Agent:
        """Build an agent around ``HFLLM``, or ``QuantizedHFLLM`` when ``quantize`` is set."""
        # The backend classes are imported lazily, only for the branch in use.
        if quantize:
            from agentforge.llm import QuantizedHFLLM

            backend = QuantizedHFLLM(model_id, method=quantize, device_map=device_map)
        else:
            from agentforge.llm import HFLLM

            backend = HFLLM(model_id, device_map=device_map)

        return cls(llm=backend, tools=tools or [], max_steps=max_steps, verbose=verbose)

    # ------------------------------------------------------------------ public

    def run(
        self,
        question: str,
        *,
        max_steps: int | None = None,
        max_new_tokens: int = 256,
    ) -> AgentResult:
        """Run the ReAct loop on ``question`` and return the outcome.

        ``max_steps`` overrides the instance default when truthy. The result is
        marked unsuccessful when the model emits neither a tool nor a final
        answer, or when the step budget runs out.
        """
        budget = max_steps or self.max_steps
        system_prompt = build_system_prompt(list(self.tools))
        history: list[Step] = []
        run_started = time.perf_counter()

        for _ in range(budget):
            user_prompt = build_user_prompt(question, format_scratchpad(history))
            prompt = f"{system_prompt}\n\n{user_prompt}"

            gen_started = time.perf_counter()
            raw = self.llm.generate(
                prompt,
                max_new_tokens=max_new_tokens,
                stop=list(self.stop),
            )
            gen_ms = (time.perf_counter() - gen_started) * 1000

            parsed = parse_step(raw)
            if self.verbose:
                logger.info("step parse: %s", parsed)

            if parsed.is_final:
                final_step = Step(
                    thought=parsed.thought,
                    tool=None,
                    action_input=None,
                    observation=None,
                    elapsed_ms=gen_ms,
                )
                history.append(final_step)
                return _success(question, parsed.final_answer or "", history, run_started)

            # Need a tool to continue. If the LLM didn't emit one, treat the
            # raw response as the final answer (graceful degradation).
            if not parsed.tool:
                return _success(question, raw.strip(), history, run_started, success=False)

            tool = self.tools.get(parsed.tool)
            if tool is not None:
                try:
                    observation = tool.run(parsed.action_input or "")
                except Exception as exc:
                    # Tool failures are fed back to the model as an observation
                    # instead of aborting the run.
                    observation = f"Error running {parsed.tool}: {type(exc).__name__}: {exc}"
            else:
                known = ", ".join(t.name for t in self.tools)
                observation = f"Error: unknown tool '{parsed.tool}'. Available: {known}"

            history.append(
                Step(
                    thought=parsed.thought,
                    tool=parsed.tool,
                    action_input=parsed.action_input,
                    observation=observation,
                    elapsed_ms=gen_ms,
                )
            )

        # Out of steps: fall back to the last observation, if there is one.
        last_observation = history[-1].observation if history else None
        return _success(
            question,
            last_observation or "",
            history,
            run_started,
            success=False,
        )
165
+
166
+
167
def _success(
    question: str,
    answer: str,
    steps: list[Step],
    t0: float,
    *,
    success: bool = True,
) -> AgentResult:
    """Package a finished run into an ``AgentResult``.

    Despite the name, this also builds unsuccessful results (``success=False``).
    ``t0`` is the ``time.perf_counter()`` reading taken when the run started;
    the latency is reported in milliseconds, rounded to two decimals.
    """
    elapsed_ms = (time.perf_counter() - t0) * 1000
    payload = {
        "question": question,
        "final_answer": answer,
        "steps": steps,
        "n_steps": len(steps),
        "success": success,
        "latency_ms": round(elapsed_ms, 2),
    }
    return AgentResult(**payload)
184
+
185
+
186
# Public names of this module.
__all__ = ["Agent", "AgentResult", "Step"]
@@ -0,0 +1,85 @@
1
+ """ReAct output parser.
2
+
3
+ The LLM emits free-form text shaped roughly like:
4
+
5
+ Thought: ...
6
+ Action: tool_name
7
+ Action Input: ...
8
+
9
+ or:
10
+
11
+ Thought: ...
12
+ Final Answer: ...
13
+
14
+ This parser is intentionally **forgiving** — small open models often forget
15
+ whitespace, mix cases, or fail to close the action input. We grab the most
16
+ recent plausible step from the buffer and stop at the first unambiguous
17
+ boundary.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import re
23
+ from dataclasses import dataclass
24
+
25
# All patterns are case-insensitive and tolerate optional whitespace around the
# label and its colon.

# "Final Answer:" anywhere in the text; captures everything after it, newlines included.
_FINAL = re.compile(r"final\s*answer\s*:\s*(.*)", re.IGNORECASE | re.DOTALL)
# "Thought:"; captures lazily up to the next line that starts with an
# "Action:" / "Final Answer:" label, or to the end of the text.
_THOUGHT = re.compile(
    r"thought\s*:\s*(.*?)(?=\n\s*(?:action|final\s*answer)\s*:|$)", re.IGNORECASE | re.DOTALL
)
# "Action:"; captures the rest of that line. "Action Input:" does not match,
# because "Input" sits between "action" and the colon.
_ACTION = re.compile(r"action\s*:\s*([^\n]+)", re.IGNORECASE)
# "Action Input:"; captures lazily (possibly across lines) up to the next line
# that starts with an Observation/Thought/Action/Final Answer label, or to the end.
_INPUT = re.compile(
    r"action\s*input\s*:\s*(.*?)(?=\n\s*(?:observation|thought|action|final\s*answer)\s*:|$)",
    re.IGNORECASE | re.DOTALL,
)
34
+
35
+
36
@dataclass
class ParsedStep:
    """Fields extracted from one LLM reply by ``parse_step``."""

    thought: str = ""
    tool: str | None = None
    action_input: str | None = None
    final_answer: str | None = None

    @property
    def is_final(self) -> bool:
        """Whether a final answer was parsed (an empty answer still counts)."""
        if self.final_answer is None:
            return False
        return True
46
+
47
+
48
def parse_step(text: str) -> ParsedStep:
    """Pull the next Thought/Action/Action Input *or* Final Answer out of ``text``.

    A "Final Answer:" anywhere in the text wins: whatever precedes it is scanned
    for the thought and no tool is reported. Otherwise the thought, the tool
    name and the tool input are extracted independently; missing parts stay at
    their defaults.
    """
    # Labels that mark the start of a following block. Captured values are cut
    # there in case the model kept generating past the current step.
    boundaries = ("\nThought:", "\nObservation:", "\nAction:")
    cleaned = text.strip()

    final_match = _FINAL.search(cleaned)
    if final_match is not None:
        return ParsedStep(
            thought=_extract_thought(cleaned[: final_match.start()]),
            final_answer=_trim_after(final_match.group(1).strip(), boundaries),
        )

    reasoning = _extract_thought(cleaned)
    action_match = _ACTION.search(cleaned)
    input_match = _INPUT.search(cleaned)

    tool_name = None
    if action_match:
        tool_name = action_match.group(1).strip()

    tool_input = None
    if input_match:
        tool_input = _trim_after(input_match.group(1).strip(), boundaries)

    return ParsedStep(thought=reasoning, tool=tool_name, action_input=tool_input)
69
+
70
+
71
# First line that opens with a ReAct section label other than "Thought:".
_SECTION_LABEL = re.compile(
    r"^[ \t]*(?:action(?:\s*input)?|final\s*answer|observation)\s*:",
    re.IGNORECASE | re.MULTILINE,
)


def _extract_thought(text: str) -> str:
    """Return the thought contained in ``text``, or ``""`` if there is none.

    An explicit "Thought:" label is preferred. Without one, the unlabeled text
    that precedes the first Action / Action Input / Final Answer / Observation
    line is used. The user prompt built in this package already ends with
    "Thought:", so a completion normally starts with the thought itself and
    carries no label of its own (assuming the LLM backend returns only the
    completion — TODO confirm against ``agentforge.llm``).
    """
    m = _THOUGHT.search(text)
    if m:
        return m.group(1).strip()
    # Fallback: everything before the first section label, or the whole text
    # when no label is present.
    label = _SECTION_LABEL.search(text)
    leading = text[: label.start()] if label else text
    return leading.strip()
76
+
77
+
78
def _trim_after(s: str, stops: tuple[str, ...]) -> str:
    """Cut ``s`` at the earliest occurrence of any stop marker and strip it.

    Markers are matched case-insensitively. When none occurs, ``s`` is only
    stripped.
    """
    haystack = s.lower()
    positions = (haystack.find(marker.lower()) for marker in stops)
    hits = [pos for pos in positions if pos != -1]
    cut = min(hits, default=len(s))
    return s[:cut].strip()
@@ -0,0 +1,59 @@
1
+ """ReAct prompt templates.
2
+
3
+ The system prompt explains the loop format and lists the available tools with
4
+ their docstrings. We keep it short and concrete — long prompts confuse small
5
+ open models more than they help.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
# System prompt template, rendered with ``str.format`` by ``build_system_prompt``.
# Placeholders: {tool_block} (one "- name: description" line per tool) and
# {tool_names} (comma-separated tool names). Because of ``str.format``, any
# literal brace added to this text must be doubled.
REACT_SYSTEM_PROMPT = """You are a helpful assistant that solves problems step by step.

You have access to the following tools:

{tool_block}

Use this exact format, one Thought/Action/Action Input per step:

Thought: <your reasoning about what to do next>
Action: <one of: {tool_names}>
Action Input: <the input to the tool>

After each step, you will receive an Observation: with the tool's output.
Continue until you can answer, then write:

Thought: <reasoning summarizing the result>
Final Answer: <your final, concise answer>

Important rules:
- Use only one Action per step. Do not write multiple actions at once.
- If the answer is straightforward and no tool is needed, you may go directly to "Final Answer:".
- Do not invent observations — wait for the system to provide them.
"""
33
+
34
+
35
def build_system_prompt(tools: list) -> str:
    """Fill the system prompt template with the given tools.

    Each tool contributes a ``- name: description`` line to the tool block and
    its name to the list of allowed actions; with no tools the allowed-actions
    list reads ``(none)``.
    """
    bullet_lines = [f"- {t.name}: {t.description}" for t in tools]
    if tools:
        allowed = ", ".join(t.name for t in tools)
    else:
        allowed = "(none)"
    return REACT_SYSTEM_PROMPT.format(
        tool_block="\n".join(bullet_lines),
        tool_names=allowed,
    )
40
+
41
+
42
def build_user_prompt(question: str, scratchpad: str = "") -> str:
    """Compose the user turn from the question and the scratchpad of prior steps.

    The prompt always ends with ``Thought:`` so the model continues from there.
    """
    header = f"Question: {question}\n\n"
    if scratchpad:
        return f"{header}{scratchpad}\nThought:"
    return f"{header}Thought:"
47
+
48
+
49
def format_scratchpad(steps: list) -> str:
    """Concatenate completed steps back into the prompt for the next iteration.

    Each step is rendered as a Thought / Action / Action Input / Observation
    block, and blocks are separated by a blank line. A field that is ``None``
    (for example the action input when the model omitted it) is rendered as an
    empty value, so the literal text "None" is never fed back to the model.
    Returns ``""`` when there are no steps.
    """

    def _shown(value) -> str:
        # None means "absent"; everything else is rendered with str().
        return "" if value is None else str(value)

    return "\n\n".join(
        f"Thought: {_shown(s.thought)}\n"
        f"Action: {_shown(s.tool)}\n"
        f"Action Input: {_shown(s.action_input)}\n"
        f"Observation: {_shown(s.observation)}"
        for s in steps
    )
@@ -0,0 +1,19 @@
1
+ """Evaluation harness for ReAct agents."""
2
+
3
+ from agentforge.eval.metrics import (
4
+ evaluate,
5
+ final_answer_match,
6
+ step_efficiency,
7
+ task_completion,
8
+ tool_accuracy,
9
+ )
10
+ from agentforge.eval.report import EvalReport
11
+
12
# Public names of this package; every entry is imported above.
__all__ = [
    "EvalReport",
    "evaluate",
    "final_answer_match",
    "step_efficiency",
    "task_completion",
    "tool_accuracy",
]
@@ -0,0 +1,105 @@
1
+ """Agent-quality metrics.
2
+
3
+ Pure Python, no judge model. The metrics are intentionally simple so the
4
+ relationship between "the agent did X" and "the metric says Y" is auditable
5
+ from the code alone.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterable
11
+ from typing import Any
12
+
13
+ from agentforge.core.agent import AgentResult
14
+
15
+
16
def task_completion(result: AgentResult, sample: dict) -> float:
    """Score 1.0 when the run succeeded with a non-blank final answer, else 0.0.

    ``sample`` is accepted for signature parity with the other metrics and is
    not used.
    """
    if not result.success:
        return 0.0
    return 1.0 if result.final_answer.strip() else 0.0
19
+
20
+
21
def final_answer_match(result: AgentResult, sample: dict) -> float:
    """Does the final answer contain the ground-truth string (case-folded)?

    Substring rather than exact match because LLMs tend to wrap answers in
    natural-language framing ("The answer is 42."). For numeric questions,
    pass the bare number as ``ground_truth`` and it will be matched; this
    includes ``0``.

    Returns 0.0 when ``ground_truth`` is missing, ``None`` or blank, because a
    blank needle would otherwise be "contained" in every answer.
    """
    gt = sample.get("ground_truth")
    # Compare against None explicitly: a numeric ground truth of 0 is falsy but valid.
    if gt is None:
        return 0.0
    needle = str(gt).strip().casefold()
    if not needle:
        return 0.0
    return 1.0 if needle in result.final_answer.casefold() else 0.0
32
+
33
+
34
def tool_accuracy(result: AgentResult, sample: dict) -> float:
    """Score how well the tools the agent called match the expected tool(s).

    ``expected_tools`` in the sample can be:
      - missing or ``None``: returns 1.0 (trivia-style samples)
      - a string: 1.0 if that tool was called at any step, else 0.0
      - a list of strings: each step's expected tool, in order, with "any" or
        "*" as wildcards; the score is the number of position-wise matches
        divided by ``len(expected_tools)``
      - an empty list: no tool is expected, so 1.0 if none was called, else 0.0
      - any other type: returns 0.0

    Steps without a tool (such as the final-answer step) are ignored. Tool
    calls beyond the length of the expected list are not penalized.
    """
    expected = sample.get("expected_tools")
    if expected is None:
        return 1.0

    actual = [s.tool for s in result.steps if s.tool]
    if isinstance(expected, str):
        return 1.0 if expected in actual else 0.0

    if not isinstance(expected, list):
        return 0.0
    if not expected:
        # Explicitly "no tools expected": correct only if none was used.
        return 0.0 if actual else 1.0

    # Position-wise comparison; zip stops at the shorter sequence, so expected
    # steps that were never reached count as misses.
    matched = sum(1 for want, got in zip(expected, actual) if want in ("*", "any", got))
    return matched / len(expected)
61
+
62
+
63
def step_efficiency(result: AgentResult, sample: dict) -> float:
    """Ratio of reference steps to steps actually taken, capped at 1.0.

    The reference count comes from ``ground_truth_steps`` in the sample
    (default 1, never less than 1). Unsuccessful runs and runs with zero
    recorded steps score 0.0.
    """
    if not result.success:
        return 0.0
    if result.n_steps == 0:
        return 0.0

    reference_steps = int(sample.get("ground_truth_steps", 1))
    if reference_steps < 1:
        reference_steps = 1

    ratio = reference_steps / result.n_steps
    return ratio if ratio < 1.0 else 1.0
73
+
74
+
75
# Metric name -> scoring function. Every function takes ``(result, sample)``
# and returns a float; ``evaluate`` looks metrics up here by name.
_REGISTRY = {
    "task_completion": task_completion,
    "final_answer_match": final_answer_match,
    "tool_accuracy": tool_accuracy,
    "step_efficiency": step_efficiency,
}
81
+
82
+
83
def evaluate(
    samples: list[dict[str, Any]],
    results: list[AgentResult],
    *,
    metrics: Iterable[str] = (
        "task_completion",
        "final_answer_match",
        "tool_accuracy",
        "step_efficiency",
    ),
) -> dict:
    """Score a parallel list of samples + results across one or more metrics.

    ``metrics`` may be any iterable of metric names, including a one-shot
    generator; repeated names are scored once.

    Returns a dict with:
      - ``"n"``: the number of samples
      - ``"means"``: metric name -> mean score (0.0 when there are no samples)
      - ``"per_sample"``: metric name -> list of scores in sample order

    Raises:
        ValueError: if ``samples`` and ``results`` differ in length.
        KeyError: if a metric name is not registered (raised when it is first
            scored).
    """
    if len(samples) != len(results):
        raise ValueError(f"len(samples)={len(samples)} != len(results)={len(results)}")

    # Materialize once: ``metrics`` is consumed both to build the dict and for
    # every sample, which would silently yield nothing on the second pass of a
    # generator. dict.fromkeys drops duplicates while keeping order.
    metric_names = list(dict.fromkeys(metrics))

    by_metric: dict[str, list[float]] = {m: [] for m in metric_names}
    for sample, result in zip(samples, results, strict=True):
        for m in metric_names:
            fn = _REGISTRY[m]
            by_metric[m].append(float(fn(result, sample)))

    means = {m: (sum(v) / len(v) if v else 0.0) for m, v in by_metric.items()}
    return {"n": len(samples), "means": means, "per_sample": by_metric}