entropy-agent-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. entropy_agent_eval-0.1.0/CHANGELOG.md +13 -0
  2. entropy_agent_eval-0.1.0/LICENSE +21 -0
  3. entropy_agent_eval-0.1.0/PKG-INFO +332 -0
  4. entropy_agent_eval-0.1.0/README.md +306 -0
  5. entropy_agent_eval-0.1.0/entropy_agent_eval/__init__.py +13 -0
  6. entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/__init__.py +3 -0
  7. entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/generic.py +53 -0
  8. entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/google_adk.py +33 -0
  9. entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/langchain.py +43 -0
  10. entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/__init__.py +5 -0
  11. entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/base.py +38 -0
  12. entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/coding_tasks.py +17 -0
  13. entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/qa_tasks.py +16 -0
  14. entropy_agent_eval-0.1.0/entropy_agent_eval/cli.py +30 -0
  15. entropy_agent_eval-0.1.0/entropy_agent_eval/evaluator.py +174 -0
  16. entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/__init__.py +21 -0
  17. entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/google_adk_roadmap.py +99 -0
  18. entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/langchain_roadmap.py +75 -0
  19. entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/learning_roadmap.py +148 -0
  20. entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/reference_agents.py +113 -0
  21. entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/runner.py +124 -0
  22. entropy_agent_eval-0.1.0/entropy_agent_eval/io.py +51 -0
  23. entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/__init__.py +20 -0
  24. entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/core.py +73 -0
  25. entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/robustness.py +42 -0
  26. entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/temporal.py +28 -0
  27. entropy_agent_eval-0.1.0/entropy_agent_eval/models.py +138 -0
  28. entropy_agent_eval-0.1.0/entropy_agent_eval/visualizations/__init__.py +3 -0
  29. entropy_agent_eval-0.1.0/entropy_agent_eval/visualizations/entropy_plots.py +27 -0
  30. entropy_agent_eval-0.1.0/pyproject.toml +49 -0
@@ -0,0 +1,13 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented here.
4
+
5
+ ## 0.1.0
6
+
7
+ - Add core entropy metrics for actions, tools, trajectories, uncertainty reduction, and temporal curves.
8
+ - Add `AgentRun`, `AgentEvent`, and `InformationState` as the framework-neutral data contract.
9
+ - Add `EntropyEvaluator`, `EvaluationReport`, and configurable `EntropicAgentScore`.
10
+ - Add generic, LangChain, and Google ADK-style adapters.
11
+ - Add JSON/JSONL loading and the `eea` CLI.
12
+ - Add a minimal benchmark harness with sample QA and coding tasks.
13
+ - Add optional matplotlib entropy curve plotting.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 HypelBase
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,332 @@
1
+ Metadata-Version: 2.3
2
+ Name: entropy-agent-eval
3
+ Version: 0.1.0
4
+ Summary: Entropy-based evaluation metrics for AI agent behavior, tools, trajectories, uncertainty reduction, and robustness.
5
+ License: MIT
6
+ Keywords: agents,evaluation,entropy,llm,langchain,adk,benchmark
7
+ Author: HypelBase
8
+ Requires-Python: >=3.12
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Provides-Extra: google-adk
17
+ Provides-Extra: langchain
18
+ Provides-Extra: plots
19
+ Requires-Dist: google-adk (>=0.1.0) ; extra == "google-adk"
20
+ Requires-Dist: google-genai (>=1.0.0) ; extra == "google-adk"
21
+ Requires-Dist: langchain-core (>=0.3) ; extra == "langchain"
22
+ Requires-Dist: langchain-openai (>=0.2) ; extra == "langchain"
23
+ Requires-Dist: matplotlib (>=3.7) ; extra == "plots"
24
+ Description-Content-Type: text/markdown
25
+
26
+ # Entropy-Based Evaluation of AI Agents
27
+
28
+ `entropy-agent-eval` implements **EEA**, a toolkit for measuring agent behavior with entropy metrics:
29
+
30
+ - action entropy for action-selection uncertainty
31
+ - trajectory entropy for strategy diversity
32
+ - tool entropy for tool-use specialization
33
+ - information gain for uncertainty reduction
34
+ - entropy curves for temporal behavior
35
+ - robustness summaries across repeated runs
36
+ - a configurable Entropic Agent Score
37
+
38
+ Any agent library can integrate by converting its trace events into `AgentRun`
39
+ records.
40
+
41
+ ## Who This Is For
42
+
43
+ Use EEA when you want to compare agent behavior beyond success rate:
44
+
45
+ - framework authors who want behavioral diagnostics
46
+ - application teams evaluating agent changes before deployment
47
+ - researchers comparing ReAct, planner, tool-using, or multi-agent systems
48
+ - observability teams turning traces into evaluation metrics
49
+
50
+ ## Install
51
+
52
+ Requires Python 3.12 or newer.
53
+
54
+ From GitHub:
55
+
56
+ ```bash
57
+ pip install git+https://github.com/olahsymbo/entropy-agent-eval.git
58
+ ```
59
+
60
+ For local development:
61
+
62
+ ```bash
63
+ poetry install --with dev
64
+ ```
65
+
66
+ Optional plotting support:
67
+
68
+ ```bash
69
+ pip install "entropy-agent-eval[plots]"
70
+ ```
71
+
72
+ Build source and wheel distributions:
73
+
74
+ ```bash
75
+ poetry build
76
+ ```
77
+
78
+ Install a local wheel:
79
+
80
+ ```bash
81
+ pip install dist/entropy_agent_eval-0.1.0-py3-none-any.whl
82
+ ```
83
+
84
+ ## Release
85
+
86
+ Package builds are handled by Poetry. To cut a release:
87
+
88
+ ```bash
89
+ poetry version patch
90
+ git tag v0.1.1
91
+ git push origin main --tags
92
+ ```
93
+
94
+
95
+ ## Quick Start
96
+
97
+ ```python
98
+ from entropy_agent_eval import AgentRun, EntropyEvaluator
99
+
100
+ runs = [
101
+ AgentRun.from_mapping(
102
+ {
103
+ "task": "Write sorting algorithm",
104
+ "success": True,
105
+ "cost": 0.12,
106
+ "trajectory": ["search", "python", "test", "answer"],
107
+ "before": {"A": 0.4, "B": 0.3, "C": 0.2, "D": 0.1},
108
+ "after": {"A": 0.9, "B": 0.05, "C": 0.03, "D": 0.02},
109
+ }
110
+ )
111
+ ]
112
+
113
+ report = EntropyEvaluator().evaluate(runs)
114
+ print(report.as_dict())
115
+ ```
116
+
117
+ ## CLI
118
+
119
+ ```bash
120
+ eea examples/runs.json
121
+ eea examples/runs.json --per-run
122
+ ```
123
+
124
+ The CLI accepts JSON objects with a top-level `runs` list, raw JSON lists, or
125
+ JSONL files.
126
+
127
+ ## Integration Model
128
+
129
+ You do not have to export JSON logs. JSON is only one supported path.
130
+
131
+ EEA needs one thing: normalized traces as `AgentRun` objects. Those traces can
132
+ come from live callbacks, custom wrappers, databases, observability systems,
133
+ JSON/JSONL files, or benchmark harnesses.
134
+
135
+ ```text
136
+ LangChain / Google ADK / custom agent / stored trace
137
+ ↓
138
+ AgentRun
139
+ ↓
140
+ EntropyEvaluator
141
+ ↓
142
+ entropy metrics + Entropic Agent Score
143
+ ```
144
+
145
+ ## Data Contract
146
+
147
+ The central integration type is `AgentRun`:
148
+
149
+ ```json
150
+ {
151
+ "task": "qa-001",
152
+ "success": true,
153
+ "cost": 0.08,
154
+ "trajectory": ["search", "read", "answer"],
155
+ "before": {"correct": 0.45, "distractor": 0.55},
156
+ "after": {"correct": 0.92, "distractor": 0.08}
157
+ }
158
+ ```
159
+
160
+ For richer logs, use explicit events:
161
+
162
+ ```json
163
+ {
164
+ "task_id": "coding-42",
165
+ "events": [
166
+ {"kind": "tool", "name": "search"},
167
+ {"kind": "tool", "name": "python"},
168
+ {"kind": "action", "name": "answer"}
169
+ ],
170
+ "success": true
171
+ }
172
+ ```
173
+
174
+ ### Cost
175
+
176
+ `cost` is supplied by the user or the framework. It can mean USD, total tokens,
177
+ token-normalized cost, tool-call cost, compute cost, or any other numeric
178
+ penalty you want to apply consistently across compared runs.
179
+
180
+ The evaluator reports it as `mean_cost` and subtracts it inside
181
+ `EntropicAgentScore`. If cost is unknown or irrelevant, omit it or leave it as
182
+ `0.0`.
183
+
184
+ Full guide: [docs/concepts/cost.md](docs/concepts/cost.md)
185
+
186
+ ## Custom Agent Integration
187
+
188
+ ```python
189
+ from entropy_agent_eval import EntropyEvaluator
190
+ from entropy_agent_eval.adapters import EventRecorder
191
+
192
+ recorder = EventRecorder(task_id="task-123")
193
+ recorder.tool("search")
194
+ recorder.tool("python")
195
+ recorder.action("answer")
196
+
197
+ run = recorder.to_run(success=True, cost=0.04)
198
+ print(EntropyEvaluator().evaluate([run]).as_dict())
199
+ ```
200
+
201
+ Full guide: [docs/integrations/custom-agents.md](docs/integrations/custom-agents.md)
202
+
203
+ ## LangChain Integration
204
+
205
+ ```python
206
+ from entropy_agent_eval.adapters.langchain import EntropyCallbackHandler
207
+
208
+ handler = EntropyCallbackHandler(task_id="lc-001")
209
+
210
+ # Pass `handler` in your LangChain config/callbacks.
211
+ # result = chain.invoke(inputs, config={"callbacks": [handler]})
212
+
213
+ run = handler.to_run(success=True, cost=0.10)
214
+ ```
215
+
216
+ Full guide: [docs/integrations/langchain.md](docs/integrations/langchain.md)
217
+
218
+ ## Google ADK-Style Event Integration
219
+
220
+ ```python
221
+ from entropy_agent_eval.adapters.google_adk import runs_from_adk_events
222
+
223
+ run = runs_from_adk_events(
224
+ "adk-001",
225
+ [
226
+ {"event_type": "tool", "tool_name": "Search"},
227
+ {"event_type": "model", "model": "gemini"},
228
+ ],
229
+ success=True,
230
+ )
231
+ ```
232
+
233
+ Full guide: [docs/integrations/google-adk.md](docs/integrations/google-adk.md)
234
+
235
+ ## Stored Trace Integration
236
+
237
+ If your traces are already in a database, warehouse, or observability platform,
238
+ export or query them into `AgentRun`-compatible dictionaries and evaluate them
239
+ offline.
240
+
241
+ Full guide: [docs/integrations/observability.md](docs/integrations/observability.md)
242
+
243
+ ## Metric Notes
244
+
245
+ High entropy is not automatically good. EEA treats entropy as a behavioral
246
+ signature:
247
+
248
+ - low action entropy can mean focus or brittle determinism
249
+ - medium entropy can indicate adaptive branching
250
+ - high entropy can indicate exploration or chaos
251
+ - successful agents should often reduce state entropy over time
252
+ - robust agents can have moderate trajectory entropy with low outcome entropy
253
+
254
+ `EntropicAgentScore` is configurable:
255
+
256
+ ```python
257
+ from entropy_agent_eval import EntropicAgentScore, EntropyEvaluator
258
+
259
+ evaluator = EntropyEvaluator(
260
+ EntropicAgentScore(
261
+ success_weight=2.0,
262
+ information_gain_weight=1.0,
263
+ exploration_efficiency_weight=0.5,
264
+ cost_weight=1.5,
265
+ )
266
+ )
267
+ ```
268
+
269
+ ## Benchmark
270
+
271
+ Any callable that accepts a `BenchmarkTask` and returns an `AgentRun` or
272
+ compatible dictionary can be benchmarked:
273
+
274
+ ```python
275
+ from entropy_agent_eval.benchmarks import QA_TASKS, run_benchmark
276
+
277
+ def agent(task):
278
+ return {
279
+ "task_id": task.id,
280
+ "trajectory": ["think", "answer"],
281
+ "success": True,
282
+ }
283
+
284
+ runs = run_benchmark(QA_TASKS, agent)
285
+ ```
286
+
287
+ ## Controlled Benchmark
288
+
289
+ The [experiments](experiments/) directory contains a controlled benchmark that
290
+ compares reference agent patterns across factual QA, multi-hop, and coding
291
+ tasks.
292
+
293
+ ```bash
294
+ poetry run python scripts/run_experiment.py
295
+ ```
296
+
297
+ The script writes normalized runs and per-agent summaries to
298
+ `experiments/results/`.
299
+
300
+ ## Learning Roadmap Agent Experiment
301
+
302
+ The project also includes a framework-backed experiment for a Learning Roadmap
303
+ Agent. It can run with LangChain, Google ADK, or both, provided the optional
304
+ dependencies are installed and the matching API keys are set.
305
+
306
+ ```bash
307
+ pip install "entropy-agent-eval[langchain]"
308
+ export OPENAI_API_KEY="..."
309
+ poetry run python scripts/run_learning_roadmap_experiment.py --provider langchain
310
+ ```
311
+
312
+ ```bash
313
+ pip install "entropy-agent-eval[google-adk]"
314
+ export GOOGLE_API_KEY="..."
315
+ poetry run python scripts/run_learning_roadmap_experiment.py --provider google-adk
316
+ ```
317
+
318
+ The roadmap experiment runner also reads `.env` automatically. For Google ADK,
319
+ set `GOOGLE_API_KEY` or `GEMINI_API_KEY`.
320
+
321
+ Full guide: [docs/experiments/learning-roadmap-agent.md](docs/experiments/learning-roadmap-agent.md)
322
+
323
+ ## Contributing
324
+
325
+ See [CONTRIBUTING.md](CONTRIBUTING.md). New adapters are welcome, especially for
326
+ frameworks that can expose tool calls, model calls, actions, costs, outcomes,
327
+ and uncertainty states.
328
+
329
+ ## License
330
+
331
+ MIT. See [LICENSE](LICENSE).
332
+
@@ -0,0 +1,306 @@
1
+ # Entropy-Based Evaluation of AI Agents
2
+
3
+ `entropy-agent-eval` implements **EEA**, a toolkit for measuring agent behavior with entropy metrics:
4
+
5
+ - action entropy for action-selection uncertainty
6
+ - trajectory entropy for strategy diversity
7
+ - tool entropy for tool-use specialization
8
+ - information gain for uncertainty reduction
9
+ - entropy curves for temporal behavior
10
+ - robustness summaries across repeated runs
11
+ - a configurable Entropic Agent Score
12
+
13
+ Any agent library can integrate by converting its trace events into `AgentRun`
14
+ records.
15
+
16
+ ## Who This Is For
17
+
18
+ Use EEA when you want to compare agent behavior beyond success rate:
19
+
20
+ - framework authors who want behavioral diagnostics
21
+ - application teams evaluating agent changes before deployment
22
+ - researchers comparing ReAct, planner, tool-using, or multi-agent systems
23
+ - observability teams turning traces into evaluation metrics
24
+
25
+ ## Install
26
+
27
+ Requires Python 3.12 or newer.
28
+
29
+ From GitHub:
30
+
31
+ ```bash
32
+ pip install git+https://github.com/olahsymbo/entropy-agent-eval.git
33
+ ```
34
+
35
+ For local development:
36
+
37
+ ```bash
38
+ poetry install --with dev
39
+ ```
40
+
41
+ Optional plotting support:
42
+
43
+ ```bash
44
+ pip install "entropy-agent-eval[plots]"
45
+ ```
46
+
47
+ Build source and wheel distributions:
48
+
49
+ ```bash
50
+ poetry build
51
+ ```
52
+
53
+ Install a local wheel:
54
+
55
+ ```bash
56
+ pip install dist/entropy_agent_eval-0.1.0-py3-none-any.whl
57
+ ```
58
+
59
+ ## Release
60
+
61
+ Package builds are handled by Poetry. To cut a release:
62
+
63
+ ```bash
64
+ poetry version patch
65
+ git tag v0.1.1
66
+ git push origin main --tags
67
+ ```
68
+
69
+
70
+ ## Quick Start
71
+
72
+ ```python
73
+ from entropy_agent_eval import AgentRun, EntropyEvaluator
74
+
75
+ runs = [
76
+ AgentRun.from_mapping(
77
+ {
78
+ "task": "Write sorting algorithm",
79
+ "success": True,
80
+ "cost": 0.12,
81
+ "trajectory": ["search", "python", "test", "answer"],
82
+ "before": {"A": 0.4, "B": 0.3, "C": 0.2, "D": 0.1},
83
+ "after": {"A": 0.9, "B": 0.05, "C": 0.03, "D": 0.02},
84
+ }
85
+ )
86
+ ]
87
+
88
+ report = EntropyEvaluator().evaluate(runs)
89
+ print(report.as_dict())
90
+ ```
91
+
92
+ ## CLI
93
+
94
+ ```bash
95
+ eea examples/runs.json
96
+ eea examples/runs.json --per-run
97
+ ```
98
+
99
+ The CLI accepts JSON objects with a top-level `runs` list, raw JSON lists, or
100
+ JSONL files.
101
+
102
+ ## Integration Model
103
+
104
+ You do not have to export JSON logs. JSON is only one supported path.
105
+
106
+ EEA needs one thing: normalized traces as `AgentRun` objects. Those traces can
107
+ come from live callbacks, custom wrappers, databases, observability systems,
108
+ JSON/JSONL files, or benchmark harnesses.
109
+
110
+ ```text
111
+ LangChain / Google ADK / custom agent / stored trace
112
+ ↓
113
+ AgentRun
114
+ ↓
115
+ EntropyEvaluator
116
+ ↓
117
+ entropy metrics + Entropic Agent Score
118
+ ```
119
+
120
+ ## Data Contract
121
+
122
+ The central integration type is `AgentRun`:
123
+
124
+ ```json
125
+ {
126
+ "task": "qa-001",
127
+ "success": true,
128
+ "cost": 0.08,
129
+ "trajectory": ["search", "read", "answer"],
130
+ "before": {"correct": 0.45, "distractor": 0.55},
131
+ "after": {"correct": 0.92, "distractor": 0.08}
132
+ }
133
+ ```
134
+
135
+ For richer logs, use explicit events:
136
+
137
+ ```json
138
+ {
139
+ "task_id": "coding-42",
140
+ "events": [
141
+ {"kind": "tool", "name": "search"},
142
+ {"kind": "tool", "name": "python"},
143
+ {"kind": "action", "name": "answer"}
144
+ ],
145
+ "success": true
146
+ }
147
+ ```
148
+
149
+ ### Cost
150
+
151
+ `cost` is supplied by the user or the framework. It can mean USD, total tokens,
152
+ token-normalized cost, tool-call cost, compute cost, or any other numeric
153
+ penalty you want to apply consistently across compared runs.
154
+
155
+ The evaluator reports it as `mean_cost` and subtracts it inside
156
+ `EntropicAgentScore`. If cost is unknown or irrelevant, omit it or leave it as
157
+ `0.0`.
158
+
159
+ Full guide: [docs/concepts/cost.md](docs/concepts/cost.md)
160
+
161
+ ## Custom Agent Integration
162
+
163
+ ```python
164
+ from entropy_agent_eval import EntropyEvaluator
165
+ from entropy_agent_eval.adapters import EventRecorder
166
+
167
+ recorder = EventRecorder(task_id="task-123")
168
+ recorder.tool("search")
169
+ recorder.tool("python")
170
+ recorder.action("answer")
171
+
172
+ run = recorder.to_run(success=True, cost=0.04)
173
+ print(EntropyEvaluator().evaluate([run]).as_dict())
174
+ ```
175
+
176
+ Full guide: [docs/integrations/custom-agents.md](docs/integrations/custom-agents.md)
177
+
178
+ ## LangChain Integration
179
+
180
+ ```python
181
+ from entropy_agent_eval.adapters.langchain import EntropyCallbackHandler
182
+
183
+ handler = EntropyCallbackHandler(task_id="lc-001")
184
+
185
+ # Pass `handler` in your LangChain config/callbacks.
186
+ # result = chain.invoke(inputs, config={"callbacks": [handler]})
187
+
188
+ run = handler.to_run(success=True, cost=0.10)
189
+ ```
190
+
191
+ Full guide: [docs/integrations/langchain.md](docs/integrations/langchain.md)
192
+
193
+ ## Google ADK-Style Event Integration
194
+
195
+ ```python
196
+ from entropy_agent_eval.adapters.google_adk import runs_from_adk_events
197
+
198
+ run = runs_from_adk_events(
199
+ "adk-001",
200
+ [
201
+ {"event_type": "tool", "tool_name": "Search"},
202
+ {"event_type": "model", "model": "gemini"},
203
+ ],
204
+ success=True,
205
+ )
206
+ ```
207
+
208
+ Full guide: [docs/integrations/google-adk.md](docs/integrations/google-adk.md)
209
+
210
+ ## Stored Trace Integration
211
+
212
+ If your traces are already in a database, warehouse, or observability platform,
213
+ export or query them into `AgentRun`-compatible dictionaries and evaluate them
214
+ offline.
215
+
216
+ Full guide: [docs/integrations/observability.md](docs/integrations/observability.md)
217
+
218
+ ## Metric Notes
219
+
220
+ High entropy is not automatically good. EEA treats entropy as a behavioral
221
+ signature:
222
+
223
+ - low action entropy can mean focus or brittle determinism
224
+ - medium entropy can indicate adaptive branching
225
+ - high entropy can indicate exploration or chaos
226
+ - successful agents should often reduce state entropy over time
227
+ - robust agents can have moderate trajectory entropy with low outcome entropy
228
+
229
+ `EntropicAgentScore` is configurable:
230
+
231
+ ```python
232
+ from entropy_agent_eval import EntropicAgentScore, EntropyEvaluator
233
+
234
+ evaluator = EntropyEvaluator(
235
+ EntropicAgentScore(
236
+ success_weight=2.0,
237
+ information_gain_weight=1.0,
238
+ exploration_efficiency_weight=0.5,
239
+ cost_weight=1.5,
240
+ )
241
+ )
242
+ ```
243
+
244
+ ## Benchmark
245
+
246
+ Any callable that accepts a `BenchmarkTask` and returns an `AgentRun` or
247
+ compatible dictionary can be benchmarked:
248
+
249
+ ```python
250
+ from entropy_agent_eval.benchmarks import QA_TASKS, run_benchmark
251
+
252
+ def agent(task):
253
+ return {
254
+ "task_id": task.id,
255
+ "trajectory": ["think", "answer"],
256
+ "success": True,
257
+ }
258
+
259
+ runs = run_benchmark(QA_TASKS, agent)
260
+ ```
261
+
262
+ ## Controlled Benchmark
263
+
264
+ The [experiments](experiments/) directory contains a controlled benchmark that
265
+ compares reference agent patterns across factual QA, multi-hop, and coding
266
+ tasks.
267
+
268
+ ```bash
269
+ poetry run python scripts/run_experiment.py
270
+ ```
271
+
272
+ The script writes normalized runs and per-agent summaries to
273
+ `experiments/results/`.
274
+
275
+ ## Learning Roadmap Agent Experiment
276
+
277
+ The project also includes a framework-backed experiment for a Learning Roadmap
278
+ Agent. It can run with LangChain, Google ADK, or both, provided the optional
279
+ dependencies are installed and the matching API keys are set.
280
+
281
+ ```bash
282
+ pip install "entropy-agent-eval[langchain]"
283
+ export OPENAI_API_KEY="..."
284
+ poetry run python scripts/run_learning_roadmap_experiment.py --provider langchain
285
+ ```
286
+
287
+ ```bash
288
+ pip install "entropy-agent-eval[google-adk]"
289
+ export GOOGLE_API_KEY="..."
290
+ poetry run python scripts/run_learning_roadmap_experiment.py --provider google-adk
291
+ ```
292
+
293
+ The roadmap experiment runner also reads `.env` automatically. For Google ADK,
294
+ set `GOOGLE_API_KEY` or `GEMINI_API_KEY`.
295
+
296
+ Full guide: [docs/experiments/learning-roadmap-agent.md](docs/experiments/learning-roadmap-agent.md)
297
+
298
+ ## Contributing
299
+
300
+ See [CONTRIBUTING.md](CONTRIBUTING.md). New adapters are welcome, especially for
301
+ frameworks that can expose tool calls, model calls, actions, costs, outcomes,
302
+ and uncertainty states.
303
+
304
+ ## License
305
+
306
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,13 @@
1
+ from entropy_agent_eval.evaluator import EntropicAgentScore, EntropyEvaluator, EvaluationReport
2
+ from entropy_agent_eval.models import AgentEvent, AgentRun, InformationState
3
+
4
+ __all__ = [
5
+ "AgentEvent",
6
+ "AgentRun",
7
+ "EntropicAgentScore",
8
+ "EntropyEvaluator",
9
+ "EvaluationReport",
10
+ "InformationState",
11
+ ]
12
+
13
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from entropy_agent_eval.adapters.generic import EventRecorder, normalize_events
2
+
3
+ __all__ = ["EventRecorder", "normalize_events"]