entropy-agent-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ from entropy_agent_eval.evaluator import EntropicAgentScore, EntropyEvaluator, EvaluationReport
2
+ from entropy_agent_eval.models import AgentEvent, AgentRun, InformationState
3
+
4
+ __all__ = [
5
+ "AgentEvent",
6
+ "AgentRun",
7
+ "EntropicAgentScore",
8
+ "EntropyEvaluator",
9
+ "EvaluationReport",
10
+ "InformationState",
11
+ ]
12
+
13
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from entropy_agent_eval.adapters.generic import EventRecorder, normalize_events
2
+
3
+ __all__ = ["EventRecorder", "normalize_events"]
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Iterable, List, Mapping, Optional
4
+
5
+ from entropy_agent_eval.models import AgentEvent, AgentRun, InformationState
6
+
7
+
8
def normalize_events(events: Iterable[Mapping[str, Any]]) -> List[AgentEvent]:
    """Convert raw event mappings into ``AgentEvent`` objects.

    Each item of *events* is passed, in order, to ``AgentEvent.from_mapping``
    and the results are collected into a new list.
    """
    converted: List[AgentEvent] = []
    for raw_event in events:
        converted.append(AgentEvent.from_mapping(raw_event))
    return converted
12
+
13
+
14
class EventRecorder:
    """Accumulate agent events in memory and package them as an ``AgentRun``.

    The recorder has no framework dependencies: call :meth:`record` (or the
    ``action`` / ``tool`` / ``llm`` shortcuts) from any agent loop, then call
    :meth:`to_run` to obtain the finished run.
    """

    def __init__(self, task_id: str) -> None:
        self.events: list[AgentEvent] = []
        self.task_id = task_id

    def record(self, kind: str, name: str, **metadata: Any) -> None:
        """Append one event of the given *kind* and *name*.

        A ``timestamp`` keyword, when supplied, is removed from *metadata* and
        stored as the event timestamp; every other keyword becomes metadata.
        """
        when = metadata.pop("timestamp", None)
        event = AgentEvent(kind=kind, name=name, timestamp=when, metadata=metadata)
        self.events.append(event)

    def action(self, name: str, **metadata: Any) -> None:
        """Record an event whose kind is ``"action"``."""
        self.record("action", name, **metadata)

    def tool(self, name: str, **metadata: Any) -> None:
        """Record an event whose kind is ``"tool"``."""
        self.record("tool", name, **metadata)

    def llm(self, name: str = "llm", **metadata: Any) -> None:
        """Record an event whose kind is ``"llm"`` (name defaults to ``"llm"``)."""
        self.record("llm", name, **metadata)

    def to_run(
        self,
        *,
        success: Optional[bool] = None,
        cost: float = 0.0,
        before_state: Optional[InformationState] = None,
        after_state: Optional[InformationState] = None,
        outcome: Optional[str] = None,
        **metadata: Any,
    ) -> AgentRun:
        """Build an ``AgentRun`` from the events recorded so far.

        All arguments are keyword-only and forwarded to ``AgentRun``; any extra
        keywords are passed as the run's ``metadata`` dictionary.
        """
        # Hand over a copy so later ``record`` calls do not alter the returned run.
        snapshot = list(self.events)
        return AgentRun(
            task_id=self.task_id,
            events=snapshot,
            success=success,
            cost=cost,
            before_state=before_state,
            after_state=after_state,
            outcome=outcome,
            metadata=metadata,
        )
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Iterable, Mapping
4
+
5
+ from entropy_agent_eval.models import AgentEvent, AgentRun
6
+
7
+
8
def runs_from_adk_events(
    task_id: str,
    events: Iterable[Mapping[str, Any]],
    *,
    success: bool | None = None,
    cost: float = 0.0,
    outcome: str | None = None,
) -> AgentRun:
    """Build a single ``AgentRun`` from ADK-style event mappings.

    For every event the function derives:

    * ``kind`` from the first truthy value of ``kind`` / ``event_type`` /
      ``type`` (default ``"event"``);
    * ``name`` from the first truthy value of ``name`` / ``tool_name`` /
      ``agent_name`` / ``model`` (default: the derived kind);
    * ``timestamp`` from ``timestamp``, falling back to ``time`` only when
      ``timestamp`` is absent, ``None`` or an empty string;
    * ``metadata`` from every remaining key.

    Args:
        task_id: Identifier stored on the returned run.
        events: Event mappings in execution order.
        success: Optional success flag forwarded to ``AgentRun``.
        cost: Cost forwarded to ``AgentRun``.
        outcome: Optional outcome label forwarded to ``AgentRun``.

    Returns:
        An ``AgentRun`` holding one ``AgentEvent`` per input mapping.
    """
    # Keys consumed for kind/name/timestamp; built once instead of per event.
    reserved_keys = frozenset(
        {"kind", "event_type", "type", "name", "tool_name", "agent_name", "model", "timestamp", "time"}
    )

    normalized = []
    for event in events:
        kind = str(event.get("kind") or event.get("event_type") or event.get("type") or "event")
        name = str(
            event.get("name")
            or event.get("tool_name")
            or event.get("agent_name")
            or event.get("model")
            or kind
        )
        # Do not use ``or`` here: a timestamp of 0 / 0.0 (e.g. a relative clock
        # starting at zero) is a valid value and must not be discarded.
        timestamp = event.get("timestamp")
        if timestamp is None or timestamp == "":
            timestamp = event.get("time")
        metadata = {key: value for key, value in event.items() if key not in reserved_keys}
        normalized.append(AgentEvent(kind=kind, name=name, timestamp=timestamp, metadata=metadata))
    return AgentRun(task_id=task_id, events=normalized, success=success, cost=cost, outcome=outcome)
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Optional
4
+
5
+ from entropy_agent_eval.adapters.generic import EventRecorder
6
+ from entropy_agent_eval.models import AgentRun
7
+
8
+
9
try:
    from langchain_core.callbacks import BaseCallbackHandler
except Exception:
    # LangChain is an optional dependency: fall back to ``object`` so this
    # module can still be imported (and the handler instantiated) without it.
    BaseCallbackHandler = object  # type: ignore[misc,assignment]


class EntropyCallbackHandler(BaseCallbackHandler):  # type: ignore[misc]
    """LangChain callback handler that records tool, LLM and chain starts.

    Every ``on_*_start`` callback appends one event to an internal
    ``EventRecorder``; call :meth:`to_run` afterwards to obtain the run.
    """

    def __init__(self, task_id: str) -> None:
        super().__init__()
        self.recorder = EventRecorder(task_id)

    @staticmethod
    def _component_name(serialized: Optional[dict[str, Any]], run_name: Any, default: str) -> str:
        """Pick a display name for a callback event.

        Precedence: ``serialized["name"]``, then ``serialized["id"]`` (a list
        or tuple path is joined with dots), then *run_name* (the ``name``
        keyword LangChain may pass to the callback), then *default*.
        ``serialized`` may be ``None`` or empty, in which case it is skipped.
        """
        if serialized:
            name = serialized.get("name")
            if name:
                return str(name)
            identifier = serialized.get("id")
            if isinstance(identifier, (list, tuple)):
                # Render the import path as ``a.b.C`` rather than a list repr.
                identifier = ".".join(str(part) for part in identifier)
            if identifier:
                return str(identifier)
        return str(run_name) if run_name else default

    @staticmethod
    def _input_keys(inputs: Any) -> list[Any]:
        """Return the sorted keys of *inputs*, or ``[]`` when it is not a dict."""
        if isinstance(inputs, dict):
            return sorted(inputs.keys())
        return []

    def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: Any) -> None:
        """Record a ``tool`` event carrying the raw tool input string."""
        name = self._component_name(serialized, kwargs.get("name"), "tool")
        self.recorder.tool(name, input=input_str, **_compact(kwargs))

    def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any) -> None:
        """Record an ``llm`` event carrying the number of prompts (not their text)."""
        name = self._component_name(serialized, kwargs.get("name"), "llm")
        self.recorder.llm(name, prompt_count=len(prompts), **_compact(kwargs))

    def on_chain_start(self, serialized: dict[str, Any], inputs: dict[str, Any], **kwargs: Any) -> None:
        """Record an ``action`` event carrying the sorted input keys of the chain."""
        name = self._component_name(serialized, kwargs.get("name"), "chain")
        self.recorder.action(name, input_keys=self._input_keys(inputs), **_compact(kwargs))

    def to_run(
        self,
        *,
        success: Optional[bool] = None,
        cost: float = 0.0,
        outcome: Optional[str] = None,
        **metadata: Any,
    ) -> AgentRun:
        """Return the recorded events as an ``AgentRun`` (delegates to the recorder)."""
        return self.recorder.to_run(success=success, cost=cost, outcome=outcome, **metadata)
40
+
41
+
42
def _compact(kwargs: dict[str, Any]) -> dict[str, Any]:
    """Return only the ``run_id``, ``parent_run_id`` and ``tags`` entries of *kwargs*."""
    kept: dict[str, Any] = {}
    for key, value in kwargs.items():
        if key in ("run_id", "parent_run_id", "tags"):
            kept[key] = value
    return kept
@@ -0,0 +1,5 @@
1
+ from entropy_agent_eval.benchmarks.base import AgentCallable, BenchmarkTask, run_benchmark
2
+ from entropy_agent_eval.benchmarks.coding_tasks import CODING_TASKS
3
+ from entropy_agent_eval.benchmarks.qa_tasks import QA_TASKS
4
+
5
+ __all__ = ["AgentCallable", "BenchmarkTask", "CODING_TASKS", "QA_TASKS", "run_benchmark"]
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from dataclasses import dataclass, field
5
+ from typing import Any, Iterable, List, Mapping, Protocol
6
+
7
+ from entropy_agent_eval.models import AgentRun
8
+
9
+
10
@dataclass(frozen=True)
class BenchmarkTask:
    """Immutable description of one benchmark item.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Task identifier.
    id: str
    # Prompt text for the task.
    prompt: str
    # Optional expected answer or rubric text; ``None`` when not provided.
    expected: str | None = None
    # Free-form extra data; each instance gets its own empty dict by default.
    metadata: Mapping[str, Any] = field(default_factory=dict)
16
+
17
+
18
class AgentCallable(Protocol):
    """Structural type for a callable that executes one ``BenchmarkTask``."""

    def __call__(self, task: BenchmarkTask) -> AgentRun | Mapping[str, Any]:
        """Run *task* and return an ``AgentRun`` or an ``AgentRun``-compatible mapping."""
        ...
21
+
22
+
23
def run_benchmark(tasks: Iterable[BenchmarkTask], agent: AgentCallable) -> List[AgentRun]:
    """Execute every task with *agent* and return the resulting runs.

    A result that is not already an ``AgentRun`` is converted with
    ``AgentRun.from_mapping``. Each run is then completed in place:

    * a ``task_id`` of ``"unknown"`` is replaced by the task's id;
    * a missing ``latency_ms`` is filled with the measured wall time of the
      agent call, in milliseconds;
    * ``benchmark_prompt`` (and ``expected``, when the task has one) are added
      to the run metadata unless already present.
    """
    collected: list[AgentRun] = []
    for task in tasks:
        t0 = time.perf_counter()
        outcome = agent(task)
        wall_ms = (time.perf_counter() - t0) * 1000

        if isinstance(outcome, AgentRun):
            run = outcome
        else:
            run = AgentRun.from_mapping(outcome)

        if run.task_id == "unknown":
            run.task_id = task.id
        if run.latency_ms is None:
            run.latency_ms = wall_ms

        run.metadata.setdefault("benchmark_prompt", task.prompt)
        if task.expected is not None:
            run.metadata.setdefault("expected", task.expected)
        collected.append(run)
    return collected
@@ -0,0 +1,17 @@
1
+ from entropy_agent_eval.benchmarks.base import BenchmarkTask
2
+
3
+
4
# Built-in coding tasks; arguments are (id, prompt, expected, metadata).
CODING_TASKS = [
    BenchmarkTask(
        "code-sort",
        "Write a Python function that returns a sorted copy of a list.",
        "Function should not mutate the input and should return ascending order.",
        {"difficulty": "easy"},
    ),
    BenchmarkTask(
        "code-dedupe-stable",
        "Write a Python function that removes duplicates while preserving order.",
        "Function should preserve the first occurrence of each item.",
        {"difficulty": "medium"},
    ),
]
@@ -0,0 +1,16 @@
1
+ from entropy_agent_eval.benchmarks.base import BenchmarkTask
2
+
3
+
4
# Built-in question-answering tasks; the second one has no expected answer.
QA_TASKS = [
    BenchmarkTask(
        "qa-capital-france",
        "What is the capital of France?",
        "Paris",
        {"difficulty": "easy"},
    ),
    BenchmarkTask(
        "qa-entropy-definition",
        "In one sentence, define Shannon entropy.",
        metadata={"difficulty": "medium"},
    ),
]
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from typing import Sequence
6
+
7
+ from entropy_agent_eval.evaluator import EntropyEvaluator
8
+ from entropy_agent_eval.io import load_runs
9
+
10
+
11
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``eea`` command."""
    parser = argparse.ArgumentParser(prog="eea", description="Evaluate agent logs with entropy metrics.")
    parser.add_argument("path", help="JSON or JSONL run log path")
    parser.add_argument("--per-run", action="store_true", help="Emit per-run metrics instead of corpus metrics")
    parser.add_argument("--indent", type=int, default=2, help="JSON indentation")
    return parser


def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: load runs, evaluate them, print JSON.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.

    Returns:
        ``0`` after the JSON payload has been printed. Argument errors and
        ``--help`` exit through ``argparse`` (``SystemExit``).
    """
    options = _build_parser().parse_args(argv)

    runs = load_runs(options.path)
    evaluator = EntropyEvaluator()
    if options.per_run:
        # One metrics mapping per run.
        payload = [dict(evaluator.evaluate_run(run)) for run in runs]
    else:
        # A single corpus-level report.
        payload = evaluator.evaluate(runs).as_dict()

    print(json.dumps(payload, indent=options.indent, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,174 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from statistics import mean
5
+ from typing import Dict, Iterable, Mapping, Optional
6
+
7
+ from entropy_agent_eval.metrics.core import (
8
+ entropy_reduction,
9
+ exploration_efficiency,
10
+ normalized_entropy,
11
+ shannon_entropy,
12
+ )
13
+ from entropy_agent_eval.metrics.robustness import robustness_summary
14
+ from entropy_agent_eval.metrics.temporal import entropy_curve, rolling_entropy_curve
15
+ from entropy_agent_eval.models import AgentRun
16
+
17
+
18
@dataclass(frozen=True)
class EntropicAgentScore:
    """Weighted composite of success, information gain, efficiency and cost.

    The three reward terms are added and the cost term is subtracted; every
    weight defaults to ``1.0``.
    """

    success_weight: float = 1.0
    information_gain_weight: float = 1.0
    exploration_efficiency_weight: float = 1.0
    cost_weight: float = 1.0

    def compute(
        self,
        success_rate: float,
        information_gain: float,
        exploration_efficiency_value: float,
        cost: float,
    ) -> float:
        """Return the weighted rewards minus the weighted cost."""
        rewards = (
            self.success_weight * success_rate
            + self.information_gain_weight * information_gain
            + self.exploration_efficiency_weight * exploration_efficiency_value
        )
        penalty = self.cost_weight * cost
        return rewards - penalty
44
+
45
+
46
@dataclass
class EvaluationReport:
    """Corpus-level metrics returned by ``EntropyEvaluator.evaluate``."""

    runs: int
    action_entropy: float
    action_entropy_normalized: float
    tool_entropy: float
    tool_entropy_normalized: float
    trajectory_entropy: float
    trajectory_entropy_normalized: float
    success_rate: Optional[float]
    information_gain: float
    exploration_efficiency: Optional[float]
    mean_cost: float
    entropic_agent_score: Optional[float]
    robustness: Dict[str, object] = field(default_factory=dict)

    def as_dict(self) -> Dict[str, object]:
        """Return the report as a plain dictionary keyed by field name.

        Values are not copied: ``robustness`` is the same object held by the
        report.
        """
        exported = (
            "runs",
            "action_entropy",
            "action_entropy_normalized",
            "tool_entropy",
            "tool_entropy_normalized",
            "trajectory_entropy",
            "trajectory_entropy_normalized",
            "success_rate",
            "information_gain",
            "exploration_efficiency",
            "mean_cost",
            "entropic_agent_score",
            "robustness",
        )
        return {key: getattr(self, key) for key in exported}
78
+
79
+
80
class EntropyEvaluator:
    """Entropy metrics for a single ``AgentRun`` or for a collection of runs.

    The static helpers are thin wrappers around the metric functions imported
    by this module; ``evaluate_run`` and ``evaluate`` assemble them into
    per-run and corpus-level results.
    """

    def __init__(self, score: Optional[EntropicAgentScore] = None) -> None:
        # A falsy ``score`` (normally ``None``) selects the default weights.
        self.score = score or EntropicAgentScore()

    @staticmethod
    def action_entropy(actions: Iterable[str]) -> float:
        """Return ``shannon_entropy`` of the materialized *actions*."""
        symbols = list(actions)
        return shannon_entropy(symbols)

    @staticmethod
    def tool_entropy(tools: Iterable[str]) -> float:
        """Return ``shannon_entropy`` of the materialized *tools*."""
        symbols = list(tools)
        return shannon_entropy(symbols)

    @staticmethod
    def trajectory_entropy(trajectories: Iterable[Iterable[str] | str]) -> float:
        """Return ``shannon_entropy`` over whole trajectories.

        String trajectories are used as-is; any other iterable is converted to
        a tuple first.
        """
        keys = []
        for trajectory in trajectories:
            if isinstance(trajectory, str):
                keys.append(trajectory)
            else:
                keys.append(tuple(trajectory))
        return shannon_entropy(keys)

    @staticmethod
    def information_gain(before: Iterable[float], after: Iterable[float]) -> float:
        """Return ``entropy_reduction`` between the *before* and *after* values."""
        prior = list(before)
        posterior = list(after)
        return entropy_reduction(prior, posterior)

    @staticmethod
    def entropy_curve(symbols: Iterable[str]) -> list[float]:
        """Delegate to the module-level ``entropy_curve`` function."""
        # Inside a method body the bare name resolves to the module global,
        # not to this same-named static method.
        return entropy_curve(symbols)

    @staticmethod
    def rolling_entropy_curve(symbols: Iterable[str], window_size: int) -> list[float]:
        """Delegate to the module-level ``rolling_entropy_curve`` function."""
        return rolling_entropy_curve(symbols, window_size)

    def evaluate_run(self, run: AgentRun) -> Mapping[str, object]:
        """Return the metrics mapping for a single run.

        ``information_gain`` is ``0.0`` unless both ``before_state`` and
        ``after_state`` are truthy.
        """
        actions = run.actions
        tools = run.tools
        if run.before_state and run.after_state:
            gain = entropy_reduction(run.before_state.values(), run.after_state.values())
        else:
            gain = 0.0
        return {
            "task_id": run.task_id,
            "action_entropy": shannon_entropy(actions),
            "action_entropy_normalized": normalized_entropy(actions),
            "tool_entropy": shannon_entropy(tools),
            "tool_entropy_normalized": normalized_entropy(tools),
            "trajectory": list(run.trajectory_tuple()),
            "trajectory_length": len(run.events),
            "entropy_curve": entropy_curve(actions),
            "information_gain": gain,
            "success": run.success,
            "cost": run.cost,
        }

    def evaluate(self, runs: Iterable[AgentRun]) -> EvaluationReport:
        """Aggregate metrics over all *runs* into an ``EvaluationReport``.

        Runs whose ``success`` is ``None`` are ignored for the success rate;
        when no run reports success, ``success_rate``,
        ``exploration_efficiency`` and ``entropic_agent_score`` are ``None``.
        """
        corpus = list(runs)

        # Pool actions and tools across every run.
        actions = []
        for run in corpus:
            actions.extend(run.actions)
        tools = []
        for run in corpus:
            tools.extend(run.tools)
        trajectories = [run.trajectory_tuple() for run in corpus]

        outcomes = [1.0 if run.success else 0.0 for run in corpus if run.success is not None]
        success_rate = mean(outcomes) if outcomes else None

        # Only runs that carry both states contribute to the information gain.
        gains = []
        for run in corpus:
            if run.before_state and run.after_state:
                gains.append(entropy_reduction(run.before_state.values(), run.after_state.values()))
        information_gain = mean(gains) if gains else 0.0

        mean_cost = mean([run.cost for run in corpus]) if corpus else 0.0
        action_h = shannon_entropy(actions)

        if success_rate is not None:
            efficiency = exploration_efficiency(success_rate, action_h)
        else:
            efficiency = None

        if success_rate is not None and efficiency is not None:
            score = self.score.compute(success_rate, information_gain, efficiency, mean_cost)
        else:
            score = None

        return EvaluationReport(
            runs=len(corpus),
            action_entropy=action_h,
            action_entropy_normalized=normalized_entropy(actions),
            tool_entropy=shannon_entropy(tools),
            tool_entropy_normalized=normalized_entropy(tools),
            trajectory_entropy=shannon_entropy(trajectories),
            trajectory_entropy_normalized=normalized_entropy(trajectories),
            success_rate=success_rate,
            information_gain=information_gain,
            exploration_efficiency=efficiency,
            mean_cost=mean_cost,
            entropic_agent_score=score,
            robustness=robustness_summary(corpus),
        )
@@ -0,0 +1,21 @@
1
+ from entropy_agent_eval.experiments.runner import ExperimentAgent, ExperimentConfig, ExperimentResult
2
+ from entropy_agent_eval.experiments.learning_roadmap import load_learning_roadmap_tasks
3
+ from entropy_agent_eval.experiments.reference_agents import (
4
+ DirectAgent,
5
+ PlannerAgent,
6
+ ReferenceAgent,
7
+ SearchCodeAgent,
8
+ SearchOnlyAgent,
9
+ )
10
+
11
+ __all__ = [
12
+ "DirectAgent",
13
+ "ExperimentAgent",
14
+ "ExperimentConfig",
15
+ "ExperimentResult",
16
+ "PlannerAgent",
17
+ "ReferenceAgent",
18
+ "SearchCodeAgent",
19
+ "SearchOnlyAgent",
20
+ "load_learning_roadmap_tasks",
21
+ ]
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+
5
+ from entropy_agent_eval.adapters import EventRecorder
6
+ from entropy_agent_eval.benchmarks import BenchmarkTask
7
+ from entropy_agent_eval.experiments.learning_roadmap import (
8
+ ROADMAP_SYSTEM_PROMPT,
9
+ build_roadmap_context,
10
+ build_roadmap_prompt,
11
+ grade_roadmap_response,
12
+ roadmap_information_states,
13
+ )
14
+ from entropy_agent_eval.models import AgentRun
15
+
16
+
17
class GoogleADKLearningRoadmapAgent:
    """Experiment agent that generates learning roadmaps through Google ADK.

    The ADK classes are imported lazily in ``__init__`` so the package can be
    used without the optional ``google-adk`` extra installed.
    """

    name = "google-adk-learning-roadmap"

    def __init__(self, model: str = "gemini-2.5-flash") -> None:
        try:
            from google.adk.agents import LlmAgent
            from google.adk.runners import Runner
            from google.adk.sessions import InMemorySessionService
            from google.genai import types
        except ImportError as exc:
            raise RuntimeError(
                "Install Google ADK support with: pip install 'entropy-agent-eval[google-adk]'"
            ) from exc
        # Keep references to the lazily imported classes for later use.
        self.agent_class = LlmAgent
        self.runner_class = Runner
        self.session_service_class = InMemorySessionService
        self.types = types
        self.model_name = model

    def run(self, task: BenchmarkTask, repetition: int) -> AgentRun:
        """Synchronous wrapper around :meth:`_run_async`.

        Uses ``asyncio.run``, so it must not be called from a running event
        loop.
        """
        return asyncio.run(self._run_async(task, repetition))

    async def _run_async(self, task: BenchmarkTask, repetition: int) -> AgentRun:
        """Record the events, query the model, grade the answer, build the run."""
        recorder = EventRecorder(task_id=task.id)
        context = self._prepare_context(task, recorder)
        prompt = build_roadmap_prompt(task, context)
        recorder.llm(self.model_name, provider="google-adk")
        answer = await self._call_adk(task, prompt, repetition)

        expected_terms = list(task.metadata.get("expected_terms", []))
        success = grade_roadmap_response(answer, expected_terms)
        before, after = roadmap_information_states(success, expected_terms, answer)
        outcome = "usable_roadmap" if success else "incomplete_roadmap"
        return recorder.to_run(
            success=success,
            cost=0.0,
            before_state=before,
            after_state=after,
            outcome=outcome,
            agent_name=self.name,
            provider="google-adk",
            model=self.model_name,
            repetition=repetition,
            response=answer,
        )

    def _prepare_context(self, task: BenchmarkTask, recorder: EventRecorder) -> dict[str, object]:
        """Record the four fixed preparation tool events and build the context.

        The tool names are only recorded on *recorder*; the context itself
        comes from ``build_roadmap_context``.
        """
        for tool_name in (
            "assess_learner_profile",
            "select_learning_modules",
            "build_weekly_schedule",
            "design_assessment_checkpoints",
        ):
            recorder.tool(tool_name)
        return build_roadmap_context(task)

    async def _call_adk(self, task: BenchmarkTask, prompt: str, repetition: int) -> str:
        """Send *prompt* through a fresh ADK agent and session.

        Returns the text of the first part of the first final response that
        has content, or ``""`` when no such event is produced.
        """
        app_name = "learning_roadmap_experiment"
        user_id = "eea_user"
        session_id = f"{task.id}_{repetition}"

        agent = self.agent_class(
            name="learning_roadmap_agent",
            model=self.model_name,
            instruction=ROADMAP_SYSTEM_PROMPT,
            description="Creates structured learning roadmaps.",
        )
        session_service = self.session_service_class()
        await session_service.create_session(
            app_name=app_name,
            user_id=user_id,
            session_id=session_id,
        )
        runner = self.runner_class(
            agent=agent,
            app_name=app_name,
            session_service=session_service,
        )
        message = self.types.Content(role="user", parts=[self.types.Part(text=prompt)])

        async for event in runner.run_async(
            user_id=user_id,
            session_id=session_id,
            new_message=message,
        ):
            if not event.is_final_response():
                continue
            if event.content and event.content.parts:
                return event.content.parts[0].text or ""
        return ""
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ from entropy_agent_eval.adapters import EventRecorder
4
+ from entropy_agent_eval.benchmarks import BenchmarkTask
5
+ from entropy_agent_eval.experiments.learning_roadmap import (
6
+ ROADMAP_SYSTEM_PROMPT,
7
+ build_roadmap_context,
8
+ build_roadmap_prompt,
9
+ grade_roadmap_response,
10
+ roadmap_information_states,
11
+ )
12
+ from entropy_agent_eval.models import AgentRun
13
+
14
+
15
class LangChainLearningRoadmapAgent:
    """Experiment agent that generates learning roadmaps through LangChain.

    The LangChain classes are imported lazily in ``__init__`` so the package
    can be used without the optional ``langchain`` extra installed.
    """

    name = "langchain-learning-roadmap"

    def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0.0) -> None:
        try:
            from langchain_core.messages import HumanMessage, SystemMessage
            from langchain_openai import ChatOpenAI
        except ImportError as exc:
            raise RuntimeError(
                "Install LangChain support with: pip install 'entropy-agent-eval[langchain]'"
            ) from exc
        self.model_name = model
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        # Keep the message classes so ``run`` does not have to import them again.
        self.human_message = HumanMessage
        self.system_message = SystemMessage

    def run(self, task: BenchmarkTask, repetition: int) -> AgentRun:
        """Record the events, query the model, grade the answer, build the run.

        The run cost is derived from the response's ``usage_metadata`` (an
        empty mapping when the attribute is missing or falsy).
        """
        recorder = EventRecorder(task_id=task.id)
        context = self._prepare_context(task, recorder)
        prompt = build_roadmap_prompt(task, context)
        recorder.llm(self.model_name, provider="langchain")

        messages = [
            self.system_message(content=ROADMAP_SYSTEM_PROMPT),
            self.human_message(content=prompt),
        ]
        response = self.llm.invoke(messages)
        # Fall back to the response object itself when it has no ``content``.
        text = str(getattr(response, "content", response))

        expected_terms = list(task.metadata.get("expected_terms", []))
        success = grade_roadmap_response(text, expected_terms)
        before, after = roadmap_information_states(success, expected_terms, text)
        usage = getattr(response, "usage_metadata", None) or {}
        cost = _cost_from_usage(usage)
        outcome = "usable_roadmap" if success else "incomplete_roadmap"
        return recorder.to_run(
            success=success,
            cost=cost,
            before_state=before,
            after_state=after,
            outcome=outcome,
            agent_name=self.name,
            provider="langchain",
            model=self.model_name,
            repetition=repetition,
            response=text,
            usage=usage,
        )

    def _prepare_context(self, task: BenchmarkTask, recorder: EventRecorder) -> dict[str, object]:
        """Record the four fixed preparation tool events and build the context.

        The tool names are only recorded on *recorder*; the context itself
        comes from ``build_roadmap_context``.
        """
        for tool_name in (
            "assess_learner_profile",
            "select_learning_modules",
            "build_weekly_schedule",
            "design_assessment_checkpoints",
        ):
            recorder.tool(tool_name)
        return build_roadmap_context(task)
68
+
69
+
70
def _cost_from_usage(usage: dict[str, object], *, cost_per_token: float = 0.000001) -> float:
    """Estimate a cost from a token-usage mapping.

    The token count is read from ``total_tokens`` or, when that is missing or
    falsy, from ``total_token_count``; it defaults to ``0``.

    Args:
        usage: Usage mapping reported for a model response.
        cost_per_token: Price applied to each token. The default keeps the
            previous fixed rate of ``0.000001`` per token.

    Returns:
        ``tokens * cost_per_token`` rounded to six decimals, or ``0.0`` when
        the token count cannot be converted to a float.
    """
    total_tokens = usage.get("total_tokens") or usage.get("total_token_count") or 0
    try:
        return round(float(total_tokens) * cost_per_token, 6)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: ``float()`` rejects integers too large for a double.
        return 0.0