entropy-agent-eval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- entropy_agent_eval-0.1.0/CHANGELOG.md +13 -0
- entropy_agent_eval-0.1.0/LICENSE +21 -0
- entropy_agent_eval-0.1.0/PKG-INFO +332 -0
- entropy_agent_eval-0.1.0/README.md +306 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/__init__.py +13 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/__init__.py +3 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/generic.py +53 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/google_adk.py +33 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/adapters/langchain.py +43 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/__init__.py +5 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/base.py +38 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/coding_tasks.py +17 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/benchmarks/qa_tasks.py +16 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/cli.py +30 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/evaluator.py +174 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/__init__.py +21 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/google_adk_roadmap.py +99 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/langchain_roadmap.py +75 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/learning_roadmap.py +148 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/reference_agents.py +113 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/experiments/runner.py +124 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/io.py +51 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/__init__.py +20 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/core.py +73 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/robustness.py +42 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/metrics/temporal.py +28 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/models.py +138 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/visualizations/__init__.py +3 -0
- entropy_agent_eval-0.1.0/entropy_agent_eval/visualizations/entropy_plots.py +27 -0
- entropy_agent_eval-0.1.0/pyproject.toml +49 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented here.
|
|
4
|
+
|
|
5
|
+
## 0.1.0 - Initial release
|
|
6
|
+
|
|
7
|
+
- Add core entropy metrics for actions, tools, trajectories, uncertainty reduction, and temporal curves.
|
|
8
|
+
- Add `AgentRun`, `AgentEvent`, and `InformationState` as the framework-neutral data contract.
|
|
9
|
+
- Add `EntropyEvaluator`, `EvaluationReport`, and configurable `EntropicAgentScore`.
|
|
10
|
+
- Add generic, LangChain, and Google ADK-style adapters.
|
|
11
|
+
- Add JSON/JSONL loading and the `eea` CLI.
|
|
12
|
+
- Add a minimal benchmark harness with sample QA and coding tasks.
|
|
13
|
+
- Add optional matplotlib entropy curve plotting.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 HypelBase
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: entropy-agent-eval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Entropy-based evaluation metrics for AI agent behavior, tools, trajectories, uncertainty reduction, and robustness.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: agents,evaluation,entropy,llm,langchain,adk,benchmark
|
|
7
|
+
Author: HypelBase
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Provides-Extra: google-adk
|
|
17
|
+
Provides-Extra: langchain
|
|
18
|
+
Provides-Extra: plots
|
|
19
|
+
Requires-Dist: google-adk (>=0.1.0) ; extra == "google-adk"
|
|
20
|
+
Requires-Dist: google-genai (>=1.0.0) ; extra == "google-adk"
|
|
21
|
+
Requires-Dist: langchain-core (>=0.3) ; extra == "langchain"
|
|
22
|
+
Requires-Dist: langchain-openai (>=0.2) ; extra == "langchain"
|
|
23
|
+
Requires-Dist: matplotlib (>=3.7) ; extra == "plots"
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# Entropy-Based Evaluation of AI Agents
|
|
27
|
+
|
|
28
|
+
`entropy-agent-eval` implements **EEA**, a toolkit for measuring agent behavior with entropy metrics:
|
|
29
|
+
|
|
30
|
+
- action entropy for action-selection uncertainty
|
|
31
|
+
- trajectory entropy for strategy diversity
|
|
32
|
+
- tool entropy for tool-use specialization
|
|
33
|
+
- information gain for uncertainty reduction
|
|
34
|
+
- entropy curves for temporal behavior
|
|
35
|
+
- robustness summaries across repeated runs
|
|
36
|
+
- a configurable Entropic Agent Score
|
|
37
|
+
|
|
38
|
+
Any agent library can integrate by converting its trace events into `AgentRun`
|
|
39
|
+
records.
|
|
40
|
+
|
|
41
|
+
## Who This Is For
|
|
42
|
+
|
|
43
|
+
Use EEA when you want to compare agent behavior beyond success rate:
|
|
44
|
+
|
|
45
|
+
- framework authors who want behavioral diagnostics
|
|
46
|
+
- application teams evaluating agent changes before deployment
|
|
47
|
+
- researchers comparing ReAct, planner, tool-using, or multi-agent systems
|
|
48
|
+
- observability teams turning traces into evaluation metrics
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
Requires Python 3.12 or newer.
|
|
53
|
+
|
|
54
|
+
From GitHub:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install git+https://github.com/olahsymbo/entropy-agent-eval.git
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
For local development:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
poetry install --with dev
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Optional plotting support:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install "entropy-agent-eval[plots]"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Build source and wheel distributions:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
poetry build
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Install a local wheel:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install dist/entropy_agent_eval-0.1.0-py3-none-any.whl
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Release
|
|
85
|
+
|
|
86
|
+
Package builds are handled by Poetry. To cut a release:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
poetry version patch
|
|
90
|
+
git tag "v$(poetry version --short)"
|
|
91
|
+
git push origin main --tags
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
## Quick Start
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from entropy_agent_eval import AgentRun, EntropyEvaluator
|
|
99
|
+
|
|
100
|
+
runs = [
|
|
101
|
+
AgentRun.from_mapping(
|
|
102
|
+
{
|
|
103
|
+
"task": "Write sorting algorithm",
|
|
104
|
+
"success": True,
|
|
105
|
+
"cost": 0.12,
|
|
106
|
+
"trajectory": ["search", "python", "test", "answer"],
|
|
107
|
+
"before": {"A": 0.4, "B": 0.3, "C": 0.2, "D": 0.1},
|
|
108
|
+
"after": {"A": 0.9, "B": 0.05, "C": 0.03, "D": 0.02},
|
|
109
|
+
}
|
|
110
|
+
)
|
|
111
|
+
]
|
|
112
|
+
|
|
113
|
+
report = EntropyEvaluator().evaluate(runs)
|
|
114
|
+
print(report.as_dict())
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## CLI
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
eea examples/runs.json
|
|
121
|
+
eea examples/runs.json --per-run
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
The CLI accepts JSON objects with a top-level `runs` list, raw JSON lists, or
|
|
125
|
+
JSONL files.
|
|
126
|
+
|
|
127
|
+
## Integration Model
|
|
128
|
+
|
|
129
|
+
You do not have to export JSON logs. JSON is only one supported path.
|
|
130
|
+
|
|
131
|
+
EEA needs one thing: normalized traces as `AgentRun` objects. Those traces can
|
|
132
|
+
come from live callbacks, custom wrappers, databases, observability systems,
|
|
133
|
+
JSON/JSONL files, or benchmark harnesses.
|
|
134
|
+
|
|
135
|
+
```text
|
|
136
|
+
LangChain / Google ADK / custom agent / stored trace
|
|
137
|
+
↓
|
|
138
|
+
AgentRun
|
|
139
|
+
↓
|
|
140
|
+
EntropyEvaluator
|
|
141
|
+
↓
|
|
142
|
+
entropy metrics + Entropic Agent Score
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Data Contract
|
|
146
|
+
|
|
147
|
+
The central integration type is `AgentRun`:
|
|
148
|
+
|
|
149
|
+
```json
|
|
150
|
+
{
|
|
151
|
+
"task": "qa-001",
|
|
152
|
+
"success": true,
|
|
153
|
+
"cost": 0.08,
|
|
154
|
+
"trajectory": ["search", "read", "answer"],
|
|
155
|
+
"before": {"correct": 0.45, "distractor": 0.55},
|
|
156
|
+
"after": {"correct": 0.92, "distractor": 0.08}
|
|
157
|
+
}
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
For richer logs, use explicit events:
|
|
161
|
+
|
|
162
|
+
```json
|
|
163
|
+
{
|
|
164
|
+
"task_id": "coding-42",
|
|
165
|
+
"events": [
|
|
166
|
+
{"kind": "tool", "name": "search"},
|
|
167
|
+
{"kind": "tool", "name": "python"},
|
|
168
|
+
{"kind": "action", "name": "answer"}
|
|
169
|
+
],
|
|
170
|
+
"success": true
|
|
171
|
+
}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Cost
|
|
175
|
+
|
|
176
|
+
`cost` is supplied by the user or the framework. It can mean USD, total tokens,
|
|
177
|
+
token-normalized cost, tool-call cost, compute cost, or any other numeric
|
|
178
|
+
penalty you want to apply consistently across compared runs.
|
|
179
|
+
|
|
180
|
+
The evaluator reports it as `mean_cost` and subtracts it inside
|
|
181
|
+
`EntropicAgentScore`. If cost is unknown or irrelevant, omit it or leave it as
|
|
182
|
+
`0.0`.
|
|
183
|
+
|
|
184
|
+
Full guide: [docs/concepts/cost.md](docs/concepts/cost.md)
|
|
185
|
+
|
|
186
|
+
## Custom Agent Integration
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from entropy_agent_eval import EntropyEvaluator
|
|
190
|
+
from entropy_agent_eval.adapters import EventRecorder
|
|
191
|
+
|
|
192
|
+
recorder = EventRecorder(task_id="task-123")
|
|
193
|
+
recorder.tool("search")
|
|
194
|
+
recorder.tool("python")
|
|
195
|
+
recorder.action("answer")
|
|
196
|
+
|
|
197
|
+
run = recorder.to_run(success=True, cost=0.04)
|
|
198
|
+
print(EntropyEvaluator().evaluate([run]).as_dict())
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Full guide: [docs/integrations/custom-agents.md](docs/integrations/custom-agents.md)
|
|
202
|
+
|
|
203
|
+
## LangChain Integration
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from entropy_agent_eval.adapters.langchain import EntropyCallbackHandler
|
|
207
|
+
|
|
208
|
+
handler = EntropyCallbackHandler(task_id="lc-001")
|
|
209
|
+
|
|
210
|
+
# Pass `handler` in your LangChain config/callbacks.
|
|
211
|
+
# result = chain.invoke(inputs, config={"callbacks": [handler]})
|
|
212
|
+
|
|
213
|
+
run = handler.to_run(success=True, cost=0.10)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Full guide: [docs/integrations/langchain.md](docs/integrations/langchain.md)
|
|
217
|
+
|
|
218
|
+
## Google ADK-Style Event Integration
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from entropy_agent_eval.adapters.google_adk import runs_from_adk_events
|
|
222
|
+
|
|
223
|
+
run = runs_from_adk_events(
|
|
224
|
+
"adk-001",
|
|
225
|
+
[
|
|
226
|
+
{"event_type": "tool", "tool_name": "Search"},
|
|
227
|
+
{"event_type": "model", "model": "gemini"},
|
|
228
|
+
],
|
|
229
|
+
success=True,
|
|
230
|
+
)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Full guide: [docs/integrations/google-adk.md](docs/integrations/google-adk.md)
|
|
234
|
+
|
|
235
|
+
## Stored Trace Integration
|
|
236
|
+
|
|
237
|
+
If your traces are already in a database, warehouse, or observability platform,
|
|
238
|
+
export or query them into `AgentRun`-compatible dictionaries and evaluate them
|
|
239
|
+
offline.
|
|
240
|
+
|
|
241
|
+
Full guide: [docs/integrations/observability.md](docs/integrations/observability.md)
|
|
242
|
+
|
|
243
|
+
## Metric Notes
|
|
244
|
+
|
|
245
|
+
High entropy is not automatically good. EEA treats entropy as a behavioral
|
|
246
|
+
signature:
|
|
247
|
+
|
|
248
|
+
- low action entropy can mean focus or brittle determinism
|
|
249
|
+
- medium entropy can indicate adaptive branching
|
|
250
|
+
- high entropy can indicate exploration or chaos
|
|
251
|
+
- successful agents should often reduce state entropy over time
|
|
252
|
+
- robust agents can have moderate trajectory entropy with low outcome entropy
|
|
253
|
+
|
|
254
|
+
`EntropicAgentScore` is configurable:
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
from entropy_agent_eval import EntropicAgentScore, EntropyEvaluator
|
|
258
|
+
|
|
259
|
+
evaluator = EntropyEvaluator(
|
|
260
|
+
EntropicAgentScore(
|
|
261
|
+
success_weight=2.0,
|
|
262
|
+
information_gain_weight=1.0,
|
|
263
|
+
exploration_efficiency_weight=0.5,
|
|
264
|
+
cost_weight=1.5,
|
|
265
|
+
)
|
|
266
|
+
)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## Benchmark
|
|
270
|
+
|
|
271
|
+
Any callable that accepts a `BenchmarkTask` and returns an `AgentRun` or
|
|
272
|
+
compatible dictionary can be benchmarked:
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
from entropy_agent_eval.benchmarks import QA_TASKS, run_benchmark
|
|
276
|
+
|
|
277
|
+
def agent(task):
|
|
278
|
+
return {
|
|
279
|
+
"task_id": task.id,
|
|
280
|
+
"trajectory": ["think", "answer"],
|
|
281
|
+
"success": True,
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
runs = run_benchmark(QA_TASKS, agent)
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
## Controlled Benchmark
|
|
288
|
+
|
|
289
|
+
The [experiments](experiments/) directory contains a controlled benchmark that
|
|
290
|
+
compares reference agent patterns across factual QA, multi-hop, and coding
|
|
291
|
+
tasks.
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
poetry run python scripts/run_experiment.py
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
The script writes normalized runs and per-agent summaries to
|
|
298
|
+
`experiments/results/`.
|
|
299
|
+
|
|
300
|
+
## Learning Roadmap Agent Experiment
|
|
301
|
+
|
|
302
|
+
The project also includes a framework-backed experiment for a Learning Roadmap
|
|
303
|
+
Agent. It can run with LangChain, Google ADK, or both when the optional
|
|
304
|
+
dependencies are installed and the required API keys are set.
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
pip install "entropy-agent-eval[langchain]"
|
|
308
|
+
export OPENAI_API_KEY="..."
|
|
309
|
+
poetry run python scripts/run_learning_roadmap_experiment.py --provider langchain
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
pip install "entropy-agent-eval[google-adk]"
|
|
314
|
+
export GOOGLE_API_KEY="..."
|
|
315
|
+
poetry run python scripts/run_learning_roadmap_experiment.py --provider google-adk
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
The roadmap experiment runner also reads `.env` automatically. For Google ADK,
|
|
319
|
+
set `GOOGLE_API_KEY` or `GEMINI_API_KEY`.
|
|
320
|
+
|
|
321
|
+
Full guide: [docs/experiments/learning-roadmap-agent.md](docs/experiments/learning-roadmap-agent.md)
|
|
322
|
+
|
|
323
|
+
## Contributing
|
|
324
|
+
|
|
325
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). New adapters are welcome, especially for
|
|
326
|
+
frameworks that can expose tool calls, model calls, actions, costs, outcomes,
|
|
327
|
+
and uncertainty states.
|
|
328
|
+
|
|
329
|
+
## License
|
|
330
|
+
|
|
331
|
+
MIT. See [LICENSE](LICENSE).
|
|
332
|
+
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# Entropy-Based Evaluation of AI Agents
|
|
2
|
+
|
|
3
|
+
`entropy-agent-eval` implements **EEA**, a toolkit for measuring agent behavior with entropy metrics:
|
|
4
|
+
|
|
5
|
+
- action entropy for action-selection uncertainty
|
|
6
|
+
- trajectory entropy for strategy diversity
|
|
7
|
+
- tool entropy for tool-use specialization
|
|
8
|
+
- information gain for uncertainty reduction
|
|
9
|
+
- entropy curves for temporal behavior
|
|
10
|
+
- robustness summaries across repeated runs
|
|
11
|
+
- a configurable Entropic Agent Score
|
|
12
|
+
|
|
13
|
+
Any agent library can integrate by converting its trace events into `AgentRun`
|
|
14
|
+
records.
|
|
15
|
+
|
|
16
|
+
## Who This Is For
|
|
17
|
+
|
|
18
|
+
Use EEA when you want to compare agent behavior beyond success rate:
|
|
19
|
+
|
|
20
|
+
- framework authors who want behavioral diagnostics
|
|
21
|
+
- application teams evaluating agent changes before deployment
|
|
22
|
+
- researchers comparing ReAct, planner, tool-using, or multi-agent systems
|
|
23
|
+
- observability teams turning traces into evaluation metrics
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
Requires Python 3.12 or newer.
|
|
28
|
+
|
|
29
|
+
From GitHub:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install git+https://github.com/olahsymbo/entropy-agent-eval.git
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
For local development:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
poetry install --with dev
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Optional plotting support:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install "entropy-agent-eval[plots]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Build source and wheel distributions:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
poetry build
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Install a local wheel:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install dist/entropy_agent_eval-0.1.0-py3-none-any.whl
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Release
|
|
60
|
+
|
|
61
|
+
Package builds are handled by Poetry. To cut a release:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
poetry version patch
|
|
65
|
+
git tag "v$(poetry version --short)"
|
|
66
|
+
git push origin main --tags
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
## Quick Start
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from entropy_agent_eval import AgentRun, EntropyEvaluator
|
|
74
|
+
|
|
75
|
+
runs = [
|
|
76
|
+
AgentRun.from_mapping(
|
|
77
|
+
{
|
|
78
|
+
"task": "Write sorting algorithm",
|
|
79
|
+
"success": True,
|
|
80
|
+
"cost": 0.12,
|
|
81
|
+
"trajectory": ["search", "python", "test", "answer"],
|
|
82
|
+
"before": {"A": 0.4, "B": 0.3, "C": 0.2, "D": 0.1},
|
|
83
|
+
"after": {"A": 0.9, "B": 0.05, "C": 0.03, "D": 0.02},
|
|
84
|
+
}
|
|
85
|
+
)
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
report = EntropyEvaluator().evaluate(runs)
|
|
89
|
+
print(report.as_dict())
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## CLI
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
eea examples/runs.json
|
|
96
|
+
eea examples/runs.json --per-run
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The CLI accepts JSON objects with a top-level `runs` list, raw JSON lists, or
|
|
100
|
+
JSONL files.
|
|
101
|
+
|
|
102
|
+
## Integration Model
|
|
103
|
+
|
|
104
|
+
You do not have to export JSON logs. JSON is only one supported path.
|
|
105
|
+
|
|
106
|
+
EEA needs one thing: normalized traces as `AgentRun` objects. Those traces can
|
|
107
|
+
come from live callbacks, custom wrappers, databases, observability systems,
|
|
108
|
+
JSON/JSONL files, or benchmark harnesses.
|
|
109
|
+
|
|
110
|
+
```text
|
|
111
|
+
LangChain / Google ADK / custom agent / stored trace
|
|
112
|
+
↓
|
|
113
|
+
AgentRun
|
|
114
|
+
↓
|
|
115
|
+
EntropyEvaluator
|
|
116
|
+
↓
|
|
117
|
+
entropy metrics + Entropic Agent Score
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Data Contract
|
|
121
|
+
|
|
122
|
+
The central integration type is `AgentRun`:
|
|
123
|
+
|
|
124
|
+
```json
|
|
125
|
+
{
|
|
126
|
+
"task": "qa-001",
|
|
127
|
+
"success": true,
|
|
128
|
+
"cost": 0.08,
|
|
129
|
+
"trajectory": ["search", "read", "answer"],
|
|
130
|
+
"before": {"correct": 0.45, "distractor": 0.55},
|
|
131
|
+
"after": {"correct": 0.92, "distractor": 0.08}
|
|
132
|
+
}
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
For richer logs, use explicit events:
|
|
136
|
+
|
|
137
|
+
```json
|
|
138
|
+
{
|
|
139
|
+
"task_id": "coding-42",
|
|
140
|
+
"events": [
|
|
141
|
+
{"kind": "tool", "name": "search"},
|
|
142
|
+
{"kind": "tool", "name": "python"},
|
|
143
|
+
{"kind": "action", "name": "answer"}
|
|
144
|
+
],
|
|
145
|
+
"success": true
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Cost
|
|
150
|
+
|
|
151
|
+
`cost` is supplied by the user or the framework. It can mean USD, total tokens,
|
|
152
|
+
token-normalized cost, tool-call cost, compute cost, or any other numeric
|
|
153
|
+
penalty you want to apply consistently across compared runs.
|
|
154
|
+
|
|
155
|
+
The evaluator reports it as `mean_cost` and subtracts it inside
|
|
156
|
+
`EntropicAgentScore`. If cost is unknown or irrelevant, omit it or leave it as
|
|
157
|
+
`0.0`.
|
|
158
|
+
|
|
159
|
+
Full guide: [docs/concepts/cost.md](docs/concepts/cost.md)
|
|
160
|
+
|
|
161
|
+
## Custom Agent Integration
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from entropy_agent_eval import EntropyEvaluator
|
|
165
|
+
from entropy_agent_eval.adapters import EventRecorder
|
|
166
|
+
|
|
167
|
+
recorder = EventRecorder(task_id="task-123")
|
|
168
|
+
recorder.tool("search")
|
|
169
|
+
recorder.tool("python")
|
|
170
|
+
recorder.action("answer")
|
|
171
|
+
|
|
172
|
+
run = recorder.to_run(success=True, cost=0.04)
|
|
173
|
+
print(EntropyEvaluator().evaluate([run]).as_dict())
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Full guide: [docs/integrations/custom-agents.md](docs/integrations/custom-agents.md)
|
|
177
|
+
|
|
178
|
+
## LangChain Integration
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
from entropy_agent_eval.adapters.langchain import EntropyCallbackHandler
|
|
182
|
+
|
|
183
|
+
handler = EntropyCallbackHandler(task_id="lc-001")
|
|
184
|
+
|
|
185
|
+
# Pass `handler` in your LangChain config/callbacks.
|
|
186
|
+
# result = chain.invoke(inputs, config={"callbacks": [handler]})
|
|
187
|
+
|
|
188
|
+
run = handler.to_run(success=True, cost=0.10)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Full guide: [docs/integrations/langchain.md](docs/integrations/langchain.md)
|
|
192
|
+
|
|
193
|
+
## Google ADK-Style Event Integration
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
from entropy_agent_eval.adapters.google_adk import runs_from_adk_events
|
|
197
|
+
|
|
198
|
+
run = runs_from_adk_events(
|
|
199
|
+
"adk-001",
|
|
200
|
+
[
|
|
201
|
+
{"event_type": "tool", "tool_name": "Search"},
|
|
202
|
+
{"event_type": "model", "model": "gemini"},
|
|
203
|
+
],
|
|
204
|
+
success=True,
|
|
205
|
+
)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Full guide: [docs/integrations/google-adk.md](docs/integrations/google-adk.md)
|
|
209
|
+
|
|
210
|
+
## Stored Trace Integration
|
|
211
|
+
|
|
212
|
+
If your traces are already in a database, warehouse, or observability platform,
|
|
213
|
+
export or query them into `AgentRun`-compatible dictionaries and evaluate them
|
|
214
|
+
offline.
|
|
215
|
+
|
|
216
|
+
Full guide: [docs/integrations/observability.md](docs/integrations/observability.md)
|
|
217
|
+
|
|
218
|
+
## Metric Notes
|
|
219
|
+
|
|
220
|
+
High entropy is not automatically good. EEA treats entropy as a behavioral
|
|
221
|
+
signature:
|
|
222
|
+
|
|
223
|
+
- low action entropy can mean focus or brittle determinism
|
|
224
|
+
- medium entropy can indicate adaptive branching
|
|
225
|
+
- high entropy can indicate exploration or chaos
|
|
226
|
+
- successful agents should often reduce state entropy over time
|
|
227
|
+
- robust agents can have moderate trajectory entropy with low outcome entropy
|
|
228
|
+
|
|
229
|
+
`EntropicAgentScore` is configurable:
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
from entropy_agent_eval import EntropicAgentScore, EntropyEvaluator
|
|
233
|
+
|
|
234
|
+
evaluator = EntropyEvaluator(
|
|
235
|
+
EntropicAgentScore(
|
|
236
|
+
success_weight=2.0,
|
|
237
|
+
information_gain_weight=1.0,
|
|
238
|
+
exploration_efficiency_weight=0.5,
|
|
239
|
+
cost_weight=1.5,
|
|
240
|
+
)
|
|
241
|
+
)
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Benchmark
|
|
245
|
+
|
|
246
|
+
Any callable that accepts a `BenchmarkTask` and returns an `AgentRun` or
|
|
247
|
+
compatible dictionary can be benchmarked:
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
from entropy_agent_eval.benchmarks import QA_TASKS, run_benchmark
|
|
251
|
+
|
|
252
|
+
def agent(task):
|
|
253
|
+
return {
|
|
254
|
+
"task_id": task.id,
|
|
255
|
+
"trajectory": ["think", "answer"],
|
|
256
|
+
"success": True,
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
runs = run_benchmark(QA_TASKS, agent)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## Controlled Benchmark
|
|
263
|
+
|
|
264
|
+
The [experiments](experiments/) directory contains a controlled benchmark that
|
|
265
|
+
compares reference agent patterns across factual QA, multi-hop, and coding
|
|
266
|
+
tasks.
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
poetry run python scripts/run_experiment.py
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
The script writes normalized runs and per-agent summaries to
|
|
273
|
+
`experiments/results/`.
|
|
274
|
+
|
|
275
|
+
## Learning Roadmap Agent Experiment
|
|
276
|
+
|
|
277
|
+
The project also includes a framework-backed experiment for a Learning Roadmap
|
|
278
|
+
Agent. It can run with LangChain, Google ADK, or both when the optional
|
|
279
|
+
dependencies are installed and the required API keys are set.
|
|
280
|
+
|
|
281
|
+
```bash
|
|
282
|
+
pip install "entropy-agent-eval[langchain]"
|
|
283
|
+
export OPENAI_API_KEY="..."
|
|
284
|
+
poetry run python scripts/run_learning_roadmap_experiment.py --provider langchain
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
pip install "entropy-agent-eval[google-adk]"
|
|
289
|
+
export GOOGLE_API_KEY="..."
|
|
290
|
+
poetry run python scripts/run_learning_roadmap_experiment.py --provider google-adk
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
The roadmap experiment runner also reads `.env` automatically. For Google ADK,
|
|
294
|
+
set `GOOGLE_API_KEY` or `GEMINI_API_KEY`.
|
|
295
|
+
|
|
296
|
+
Full guide: [docs/experiments/learning-roadmap-agent.md](docs/experiments/learning-roadmap-agent.md)
|
|
297
|
+
|
|
298
|
+
## Contributing
|
|
299
|
+
|
|
300
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). New adapters are welcome, especially for
|
|
301
|
+
frameworks that can expose tool calls, model calls, actions, costs, outcomes,
|
|
302
|
+
and uncertainty states.
|
|
303
|
+
|
|
304
|
+
## License
|
|
305
|
+
|
|
306
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from entropy_agent_eval.evaluator import EntropicAgentScore, EntropyEvaluator, EvaluationReport
|
|
2
|
+
from entropy_agent_eval.models import AgentEvent, AgentRun, InformationState
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"AgentEvent",
|
|
6
|
+
"AgentRun",
|
|
7
|
+
"EntropicAgentScore",
|
|
8
|
+
"EntropyEvaluator",
|
|
9
|
+
"EvaluationReport",
|
|
10
|
+
"InformationState",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|