agentburn 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentburn/__init__.py +8 -0
- agentburn/adapters/__init__.py +14 -0
- agentburn/adapters/hermes.py +239 -0
- agentburn/analyze.py +180 -0
- agentburn/baseline.py +92 -0
- agentburn/benchmarks.py +41 -0
- agentburn/cli.py +136 -0
- agentburn/doctor.py +97 -0
- agentburn/model.py +71 -0
- agentburn/recommend.py +100 -0
- agentburn/report.py +191 -0
- agentburn/share.py +107 -0
- agentburn-0.2.0.dist-info/METADATA +130 -0
- agentburn-0.2.0.dist-info/RECORD +18 -0
- agentburn-0.2.0.dist-info/WHEEL +5 -0
- agentburn-0.2.0.dist-info/entry_points.txt +2 -0
- agentburn-0.2.0.dist-info/licenses/LICENSE +21 -0
- agentburn-0.2.0.dist-info/top_level.txt +1 -0
agentburn/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""agentburn — where does your AI agent burn money?
|
|
2
|
+
|
|
3
|
+
Local, zero-dependency token/cost profiler for always-on AI agents.
|
|
4
|
+
Adapter #1: Hermes Agent (~/.hermes/state.db). Honest methodology:
|
|
5
|
+
numbers come from the agent's own accounting; gaps are surfaced, not hidden.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Version string of the agentburn package.
__version__: str = "0.2.0"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Adapter registry. v0.1 ships Hermes; OpenClaw and Claude Code are next."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from . import hermes
|
|
6
|
+
|
|
7
|
+
# Registry of adapter modules keyed by adapter name (only Hermes so far).
ADAPTERS = {"hermes": hermes}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def detect() -> list[str]:
    """List the registered adapter names whose ``available()`` check passes.

    Names are returned in ``ADAPTERS`` iteration order.
    """
    present = []
    for name, mod in ADAPTERS.items():
        if mod.available():
            present.append(name)
    return present
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Hermes Agent adapter: reads ~/.hermes/state.db (SQLite) read-only.
|
|
2
|
+
|
|
3
|
+
Schema observed in NousResearch/hermes-agent `hermes_state.py` (June 2026):
|
|
4
|
+
sessions(id, source, model, parent_session_id, started_at, ended_at,
|
|
5
|
+
message_count, tool_call_count, api_call_count,
|
|
6
|
+
input_tokens, output_tokens, cache_read_tokens, cache_write_tokens,
|
|
7
|
+
reasoning_tokens, estimated_cost_usd, actual_cost_usd, cost_status,
|
|
8
|
+
title, archived, ...)
|
|
9
|
+
messages(session_id, role, tool_name, tool_calls, timestamp, token_count, ...)
|
|
10
|
+
|
|
11
|
+
Known upstream accounting gaps (hermes-agent #12023, #6775, #8337): some
|
|
12
|
+
providers/streams record zero tokens. We DETECT and REPORT those gaps instead
|
|
13
|
+
of silently presenting totals as truth.
|
|
14
|
+
|
|
15
|
+
Optional precision layer: request_dump_*.json files (written when request
|
|
16
|
+
dumping is enabled) contain the full API body; we sample them to estimate the
|
|
17
|
+
input composition (system prompt vs tool definitions vs history).
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import glob
|
|
23
|
+
import json
|
|
24
|
+
import os
|
|
25
|
+
import sqlite3
|
|
26
|
+
import time
|
|
27
|
+
from typing import Optional
|
|
28
|
+
|
|
29
|
+
from ..model import DumpComposition, SessionRec, Snapshot, ToolStat
|
|
30
|
+
|
|
31
|
+
# Raw session `source` values that normalize_source() labels "gateway:<name>".
GATEWAY_SOURCES = set(
    (
        "telegram",
        "whatsapp",
        "discord",
        "slack",
        "signal",
        "imessage",
        "email",
        "api",
        "api_server",
        "web",
    )
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def default_db_path() -> str:
    """Return the default Hermes state database path, ``~/.hermes/state.db``."""
    home = os.path.expanduser("~")
    return os.path.join(home, ".hermes", "state.db")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def available() -> bool:
    """Report whether a file exists at the default Hermes state database path."""
    db_file = default_db_path()
    return os.path.exists(db_file)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def normalize_source(raw: Optional[str]) -> str:
    """Map a raw Hermes ``source`` value onto agentburn's source labels.

    The value is stripped and lower-cased, then classified:

    * ``cli`` / ``cron`` / ``subagent`` are returned as-is;
    * values already prefixed with ``gateway:`` or ``other:`` are kept;
    * members of ``GATEWAY_SOURCES`` become ``gateway:<name>``;
    * anything else becomes ``other:<name>``.

    ``None``, ``""`` and whitespace-only strings all yield ``other:unknown``
    (a whitespace-only value used to produce the dangling label ``other:``).
    """
    s = (raw or "").strip().lower()
    if not s:
        # Missing or blank source: same label the "unknown" placeholder maps to.
        return "other:unknown"
    if s in ("cli", "cron", "subagent"):
        return s
    # The prefix test and the gateway test are mutually exclusive (no gateway
    # name starts with either prefix), so their order does not affect results.
    if s.startswith(("gateway:", "other:")):
        return s
    if s in GATEWAY_SOURCES:
        return f"gateway:{s}"
    return f"other:{s}"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _columns(con: sqlite3.Connection, table: str) -> set:
    """Return the column names of ``table`` as a set.

    Any ``sqlite3.Error`` raised while running ``PRAGMA table_info`` results
    in an empty set instead of an exception.
    """
    names = set()
    try:
        for info_row in con.execute(f"PRAGMA table_info({table})"):
            names.add(info_row[1])  # second field of a table_info row is the name
    except sqlite3.Error:
        return set()
    return names
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _col(cols: set, name: str, default_sql: str = "NULL") -> str:
    """Build the SELECT-list term for column ``name``.

    Returns the bare column name when it is present in ``cols``; otherwise
    returns ``"<default_sql> AS <name>"`` so the result row still has the key.
    """
    if name not in cols:
        return f"{default_sql} AS {name}"
    return name
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def load(
    db_path: Optional[str] = None,
    days: Optional[int] = 30,
    dumps_dir: Optional[str] = None,
    now: Optional[float] = None,
) -> Snapshot:
    """Read a Hermes ``state.db`` read-only and build a normalized ``Snapshot``.

    Args:
        db_path: SQLite file to read; defaults to ``default_db_path()``.
        days: Look-back window in days. ``None`` or ``0`` disables the time
            filter, so every session and tool message is loaded.
        dumps_dir: Directory searched for ``request_dump_*.json`` files;
            defaults to the ``sessions`` directory next to the database.
        now: Reference epoch time used for the window and stored as
            ``generated_at``; defaults to ``time.time()``.

    Returns:
        A ``Snapshot`` with sessions, per-tool stats (when the ``messages``
        table has ``tool_name`` and ``timestamp`` columns) and, when request
        dumps could be sampled, an input composition estimate.

    Raises:
        FileNotFoundError: ``db_path`` does not exist.
        RuntimeError: the database has no ``sessions`` table with an ``id``
            column.
    """
    # Function-scope import: only needed to escape the path for the SQLite URI.
    from urllib.parse import quote

    path = db_path or default_db_path()
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Hermes state not found at {path}. Pass --db /path/to/state.db "
            "or run on the machine where Hermes Agent lives."
        )
    if now is None:
        # `now or time.time()` would silently discard an explicit 0.0.
        now = time.time()
    since = now - days * 86400 if days else 0

    snap = Snapshot(
        agent="hermes",
        source_path=path,
        generated_at=now,
        days=days,
    )

    # Percent-encode the path: a raw '?', '#' or '%' in it would be parsed as
    # URI syntax and make SQLite open a different file. SQLite decodes the
    # escapes back to the original path.
    con = sqlite3.connect(f"file:{quote(path)}?mode=ro", uri=True)
    try:
        con.row_factory = sqlite3.Row
        scols = _columns(con, "sessions")
        if "id" not in scols:
            raise RuntimeError(
                "sessions table not found — is this really a Hermes state.db? "
                "(schema may have changed; please open an issue with `PRAGMA table_info(sessions)` output)"
            )

        # Columns missing from this schema are replaced by constant defaults so
        # every row exposes the same keys.
        fields = ", ".join(
            [
                "id",
                _col(scols, "source", "'unknown'"),
                _col(scols, "model"),
                _col(scols, "parent_session_id"),
                _col(scols, "started_at"),
                _col(scols, "ended_at"),
                _col(scols, "title"),
                _col(scols, "message_count", "0"),
                _col(scols, "api_call_count", "0"),
                _col(scols, "input_tokens", "0"),
                _col(scols, "output_tokens", "0"),
                _col(scols, "cache_read_tokens", "0"),
                _col(scols, "cache_write_tokens", "0"),
                _col(scols, "reasoning_tokens", "0"),
                _col(scols, "estimated_cost_usd"),
                _col(scols, "actual_cost_usd"),
                _col(scols, "billing_provider"),
            ]
        )
        # The SELECT list tolerates a missing started_at column, so the WHERE
        # clause must too: without the column there is nothing to filter on.
        windowed = bool(days) and "started_at" in scols
        where = "WHERE COALESCE(started_at, 0) >= ?" if windowed else ""
        rows = con.execute(
            f"SELECT {fields} FROM sessions {where}", (since,) if windowed else ()
        ).fetchall()

        for r in rows:
            actual = r["actual_cost_usd"]
            est = r["estimated_cost_usd"]
            # Prefer the recorded actual cost, then the estimate, else unknown.
            if actual is not None:
                cost, basis = actual, "actual"
            elif est is not None:
                cost, basis = est, "estimated"
            else:
                cost, basis = None, "unknown"
            snap.sessions.append(
                SessionRec(
                    id=str(r["id"]),
                    source=normalize_source(r["source"]),
                    model=r["model"],
                    started_at=r["started_at"],
                    ended_at=r["ended_at"],
                    parent_id=r["parent_session_id"],
                    title=r["title"],
                    api_calls=int(r["api_call_count"] or 0),
                    input_tokens=int(r["input_tokens"] or 0),
                    output_tokens=int(r["output_tokens"] or 0),
                    cache_read_tokens=int(r["cache_read_tokens"] or 0),
                    cache_write_tokens=int(r["cache_write_tokens"] or 0),
                    reasoning_tokens=int(r["reasoning_tokens"] or 0),
                    cost_usd=float(cost) if cost is not None else None,
                    cost_basis=basis,
                    message_count=int(r["message_count"] or 0),
                    provider=r["billing_provider"],
                )
            )

        mcols = _columns(con, "messages")
        if {"tool_name", "timestamp"} <= mcols:
            tok = "COALESCE(token_count, 0)" if "token_count" in mcols else "0"
            mwhere = "AND timestamp >= ?" if days else ""
            tool_sql = (
                f"SELECT tool_name, COUNT(*) AS calls, SUM({tok}) AS toks "
                "FROM messages "
                f"WHERE tool_name IS NOT NULL AND tool_name != '' {mwhere} "
                "GROUP BY tool_name ORDER BY toks DESC"
            )
            for r in con.execute(tool_sql, (since,) if days else ()):
                snap.tools.append(
                    ToolStat(
                        name=r["tool_name"],
                        calls=int(r["calls"]),
                        result_tokens=int(r["toks"] or 0),
                    )
                )
    finally:
        con.close()

    comp = _sample_dumps(dumps_dir or os.path.join(os.path.dirname(path), "sessions"))
    if comp:
        snap.composition = comp
    return snap
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _sample_dumps(dumps_dir: str, limit: int = 20) -> Optional[DumpComposition]:
    """Estimate input composition from request dumps in ``dumps_dir``.

    Samples the last ``limit`` ``request_dump_*.json`` files in name order
    (the newest ones only if the file names sort chronologically). Each
    request body is split by character count into system prompt, tool
    definitions and message history; the result is proportions, not exact
    token counts.

    Files that cannot be read, are not valid JSON, or whose top-level value
    (or body) is not a JSON object are skipped.

    Returns:
        A ``DumpComposition`` with the share of each part, or ``None`` when
        no usable dump was found (or ``limit`` is not positive).
    """
    try:
        files = sorted(glob.glob(os.path.join(dumps_dir, "request_dump_*.json")))
    except OSError:
        return None
    # `files[-0:]` would select every file, so treat limit <= 0 as "none".
    files = files[-limit:] if limit > 0 else []
    if not files:
        return None

    sys_c = tools_c = hist_c = 0
    samples = 0
    for f in files:
        try:
            with open(f, "r", encoding="utf-8", errors="replace") as fh:
                payload = json.load(fh)
        except (OSError, json.JSONDecodeError):
            continue
        if not isinstance(payload, dict):
            # Valid JSON that is a list/string/number has no `.get`; skip it.
            continue
        body = payload.get("body") or payload.get("request") or payload
        if not isinstance(body, dict):
            continue
        msgs = body.get("messages") or []
        tools = body.get("tools") or []

        s = h = 0
        system = body.get("system")
        if isinstance(system, str):
            s += len(system)
        elif isinstance(system, list) and system:
            # Top-level system prompt given as a list of content blocks.
            s += len(json.dumps(system, ensure_ascii=False, default=str))
        for m in msgs if isinstance(msgs, list) else []:
            chunk = len(json.dumps(m, ensure_ascii=False, default=str))
            if isinstance(m, dict) and m.get("role") == "system":
                s += chunk
            else:
                h += chunk
        t = len(json.dumps(tools, ensure_ascii=False, default=str)) if tools else 0

        if s + t + h <= 0:
            continue
        sys_c += s
        tools_c += t
        hist_c += h
        samples += 1

    total = sys_c + tools_c + hist_c
    if samples == 0 or total == 0:
        return None
    return DumpComposition(
        samples=samples,
        system_share=sys_c / total,
        tools_share=tools_c / total,
        history_share=hist_c / total,
    )
|
agentburn/analyze.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Aggregations over the normalized snapshot. Pure functions, no I/O."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from .model import SessionRec, Snapshot
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class Bucket:
    """Running totals accumulated over a group of sessions."""

    sessions: int = 0  # number of sessions added
    api_calls: int = 0  # sum of each session's api_calls
    tokens: int = 0  # sum of each session's total_tokens
    input_tokens: int = 0  # sum of each session's input_tokens
    cost: float = 0.0  # sum of cost_usd over sessions that have one
    cost_known: bool = False  # True once any added session carried a cost

    def add(self, s: SessionRec) -> None:
        """Fold one session into the totals."""
        self.sessions = self.sessions + 1
        self.api_calls = self.api_calls + s.api_calls
        self.tokens = self.tokens + s.total_tokens
        self.input_tokens = self.input_tokens + s.input_tokens
        if s.cost_usd is None:
            # Sessions without a cost leave cost and cost_known untouched.
            return
        self.cost = self.cost + s.cost_usd
        self.cost_known = True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class ParentRollup:
    """Cost of one root session together with the subagent sessions under it."""

    id: str  # id of the root (parent) session
    title: str  # display title for the root session
    model: Optional[str]  # model recorded on the root session, if any
    own_cost: float  # cost of the root session itself
    sub_cost: float  # summed cost of the subagent sessions rolled up to it
    sub_sessions: int  # number of subagent sessions rolled up to it
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class Analysis:
    """Aggregated view of a snapshot, as produced by ``analyze()``."""

    agent: str
    source_path: str
    days: Optional[int]
    period_start: Optional[float]  # earliest session start, None if none recorded
    period_end: float
    total: Bucket
    by_source: dict  # source label -> Bucket
    by_model: dict  # model name -> Bucket
    tools: list
    night: Bucket
    night_by_source: dict  # source label -> Bucket, night-window sessions only
    night_window: tuple  # (start_hour, end_hour)
    rollups: list
    overhead_per_call: dict  # source -> avg input tokens per api call
    composition: object
    cost_basis: str  # actual | estimated | mixed | unknown
    zero_token_sessions: int
    daily_cost: Optional[float]
    monthly_projection: Optional[float]
    warnings: list = field(default_factory=list)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _is_night(ts: float, window: tuple) -> bool:
    """Tell whether epoch time ``ts`` falls inside ``window`` in local time.

    ``window`` is ``(start_hour, end_hour)``: the start hour is inclusive and
    the end hour exclusive. When start is greater than end the window wraps
    past midnight (e.g. 23-7).
    """
    hour = time.localtime(ts).tm_hour
    start, end = window
    if start > end:
        # Wrapping window: late evening or early morning both count.
        return hour >= start or hour < end
    return start <= hour < end
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def analyze(snap: Snapshot, night_window: tuple = (0, 8)) -> Analysis:
    """Aggregate a snapshot into totals, breakdowns, rollups and warnings.

    Args:
        snap: Normalized snapshot to aggregate.
        night_window: ``(start_hour, end_hour)`` in local time used to pick
            "night" sessions by their start time.

    Returns:
        An ``Analysis`` whose per-source and per-model breakdowns are ordered
        by descending cost, with at most five parent rollups and ten tools.
    """

    def _by_cost_desc(buckets: dict) -> dict:
        # Re-key a {label: Bucket} mapping in descending cost order.
        return dict(sorted(buckets.items(), key=lambda kv: kv[1].cost, reverse=True))

    total = Bucket()
    night = Bucket()
    by_source: dict = {}
    by_model: dict = {}
    night_by_source: dict = {}
    zero_token = 0
    bases = set()

    for sess in snap.sessions:
        total.add(sess)
        by_source.setdefault(sess.source, Bucket()).add(sess)
        by_model.setdefault(sess.model or "unknown", Bucket()).add(sess)
        if sess.started_at and _is_night(sess.started_at, night_window):
            night.add(sess)
            night_by_source.setdefault(sess.source, Bucket()).add(sess)
        if sess.total_tokens == 0 and sess.message_count > 0:
            zero_token += 1
        bases.add(sess.cost_basis)

    # Sessions with no cost do not influence the overall basis label.
    bases.discard("unknown")
    if not bases:
        cost_basis = "unknown"
    elif len(bases) == 1:
        cost_basis = next(iter(bases))
    else:
        cost_basis = "mixed"

    # subagent costs rolled up to their root parents
    by_id = {sess.id: sess for sess in snap.sessions}
    sub_cost: dict = {}
    sub_count: dict = {}
    for sess in snap.sessions:
        if sess.source != "subagent":
            continue
        root = sess
        visited = set()
        # Walk up the parent chain; `visited` stops the walk on a cycle.
        while root.parent_id and root.parent_id in by_id and root.id not in visited:
            visited.add(root.id)
            root = by_id[root.parent_id]
        if root.id == sess.id:
            continue
        sub_cost[root.id] = sub_cost.get(root.id, 0.0) + (sess.cost_usd or 0.0)
        sub_count[root.id] = sub_count.get(root.id, 0) + 1

    rollups = []
    for pid, c in sub_cost.items():
        if pid not in by_id:
            continue
        parent = by_id[pid]
        rollups.append(
            ParentRollup(
                id=pid,
                title=(parent.title or pid)[:60],
                model=parent.model,
                own_cost=parent.cost_usd or 0.0,
                sub_cost=c,
                sub_sessions=sub_count[pid],
            )
        )
    rollups.sort(key=lambda r: r.sub_cost, reverse=True)
    rollups = rollups[:5]

    overhead = {}
    for src, b in by_source.items():
        if b.api_calls > 0:
            overhead[src] = round(b.input_tokens / b.api_calls)

    starts = [sess.started_at for sess in snap.sessions if sess.started_at]
    period_start = min(starts) if starts else None
    if period_start:
        # Never divide by less than one day.
        span_days = max(1.0, (snap.generated_at - period_start) / 86400)
    else:
        span_days = None
    if span_days and total.cost_known:
        daily = total.cost / span_days
    else:
        daily = None

    warnings = list(snap.warnings)
    if zero_token > 0:
        warnings.append(
            f"{zero_token} session(s) have messages but zero recorded tokens — known Hermes "
            "accounting gaps (e.g. streaming without usage, hermes-agent #12023). "
            "All totals are a LOWER BOUND."
        )
    if cost_basis == "estimated":
        warnings.append("Costs are Hermes' own estimates, not provider-billed actuals.")
    if cost_basis == "mixed":
        warnings.append("Costs mix provider-billed actuals and Hermes estimates.")
    if cost_basis == "unknown" and total.tokens > 0:
        warnings.append("No cost data recorded by Hermes — token counts only.")

    monthly = daily * 30 if daily is not None else None
    return Analysis(
        agent=snap.agent,
        source_path=snap.source_path,
        days=snap.days,
        period_start=period_start,
        period_end=snap.generated_at,
        total=total,
        by_source=_by_cost_desc(by_source),
        by_model=_by_cost_desc(by_model),
        tools=snap.tools[:10],
        night=night,
        night_by_source=_by_cost_desc(night_by_source),
        night_window=night_window,
        rollups=rollups,
        overhead_per_call=overhead,
        composition=snap.composition,
        cost_basis=cost_basis,
        zero_token_sessions=zero_token,
        daily_cost=daily,
        monthly_projection=monthly,
        warnings=warnings,
    )
|
agentburn/baseline.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Optimize → prove it: save a baseline, change your config, compare.
|
|
2
|
+
|
|
3
|
+
Comparisons are pace-normalized (per-month figures), so a 7-day baseline can
|
|
4
|
+
be compared against a 30-day current window honestly.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
from .analyze import Analysis
|
|
14
|
+
from .report import fmt_money
|
|
15
|
+
|
|
16
|
+
# Default baseline file: ~/.agentburn/baseline.json
DEFAULT_PATH = os.path.join(
    os.path.expanduser("~"),
    ".agentburn",
    "baseline.json",
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _monthly_by_source(a: Analysis) -> dict:
    """Split the monthly projection across sources by their share of total cost.

    Returns an empty dict when either the total cost or the monthly
    projection is missing or not positive. Sources whose cost is not
    positive are left out.
    """
    total = a.total.cost or 0.0
    proj = a.monthly_projection or 0.0
    if total <= 0 or proj <= 0:
        return {}
    shares = {}
    for src, bucket in a.by_source.items():
        if bucket.cost > 0:
            shares[src] = proj * (bucket.cost / total)
    return shares
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def snapshot_for_baseline(a: Analysis) -> dict:
    """Reduce an analysis to the JSON-serializable summary stored as a baseline.

    ``night_monthly`` is the monthly projection scaled by the night share of
    total cost; it is 0.0 when the total cost is missing or not positive.
    """
    total_cost = a.total.cost
    if (total_cost or 0) > 0:
        night_share = a.night.cost / total_cost
    else:
        night_share = 0.0
    night_monthly = (a.monthly_projection or 0.0) * night_share

    summary = {"saved_at": time.time()}
    summary["agent"] = a.agent
    summary["days"] = a.days
    summary["cost_basis"] = a.cost_basis
    summary["monthly_projection"] = a.monthly_projection
    summary["monthly_by_source"] = _monthly_by_source(a)
    summary["overhead_per_call"] = a.overhead_per_call
    summary["night_monthly"] = night_monthly
    return summary
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def save(a: Analysis, path: str = DEFAULT_PATH) -> str:
    """Write the baseline summary of ``a`` to ``path`` as JSON.

    Missing parent directories are created. A bare file name (no directory
    part) is written relative to the current working directory.

    Returns:
        The ``path`` that was written.
    """
    parent = os.path.dirname(path)
    if parent:
        # os.makedirs("") raises FileNotFoundError, so skip bare file names.
        os.makedirs(parent, exist_ok=True)
    # Build the payload before opening the file so a failure here cannot
    # truncate an existing baseline.
    payload = snapshot_for_baseline(a)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=1)
    return path
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load(path: str = DEFAULT_PATH) -> dict:
    """Read the baseline JSON file at ``path`` and return its parsed content."""
    with open(path, "r", encoding="utf-8") as f:
        stored = json.load(f)
    return stored
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _delta(old: float, new: float, basis: str) -> str:
    """Format ``old → new`` with the signed absolute change and, when ``old``
    is non-zero, the relative change in percent.
    """
    d = new - old
    pct = f" ({d / old:+.0%})" if old else ""
    if d >= 0:
        sign = "+"
    else:
        sign = "−"
    before = fmt_money(old, basis)
    after = fmt_money(new, basis)
    # Format the magnitude with an empty basis and drop any leading '~'.
    magnitude = fmt_money(abs(d), "").lstrip("~")
    return f"{before} → {after} ({sign}{magnitude}{pct})"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def render_compare(a: Analysis, base: dict) -> str:
    """Render a text comparison of the current analysis against a saved baseline.

    Sections, each shown only when its data is available: overall monthly
    pace with a verdict, up to six per-source monthly figures, up to four
    per-source input-token overheads, and the night-time monthly figure.
    """
    basis = a.cost_basis
    cur = snapshot_for_baseline(a)
    age_days = max(0, (time.time() - base.get("saved_at", time.time())) / 86400)
    lines = [""]
    lines.append(f"📐 Δ vs baseline saved {age_days:.0f} day(s) ago")
    lines.append("")

    bo = base.get("monthly_projection")
    co = cur.get("monthly_projection")
    if bo is not None and co is not None:
        eps = max(0.01, abs(bo) * 0.002)  # ignore float drift / sub-cent noise
        if co < bo - eps:
            verdict = "✅ cheaper"
        elif co > bo + eps:
            verdict = "⚠ more expensive"
        else:
            verdict = "≈ flat"
        lines.append(f" monthly pace : {_delta(bo, co, basis)} {verdict}")

    bsrc = base.get("monthly_by_source", {})
    csrc = cur.get("monthly_by_source", {})
    # Largest baseline sources first; sources new since the baseline sort last.
    ranked = sorted(set(bsrc) | set(csrc), key=lambda s: bsrc.get(s, 0), reverse=True)
    for src in ranked[:6]:
        lines.append(f" {src:<13}: {_delta(bsrc.get(src, 0.0), csrc.get(src, 0.0), basis)}")

    bov = base.get("overhead_per_call", {})
    cov = cur.get("overhead_per_call", {})
    common = [s for s in cov if s in bov and bov[s] > 0]
    if common:
        lines.append("")
        lines.append(" overhead, input tokens per call:")
        for s in sorted(common, key=lambda s: bov[s], reverse=True)[:4]:
            d = cov[s] - bov[s]
            lines.append(f" {s:<13}: {bov[s]:,} → {cov[s]:,} ({d:+,})")

    bn = base.get("night_monthly")
    cn = cur.get("night_monthly")
    if bn is not None and cn is not None and (bn or cn):
        lines.append("")
        lines.append(f" 🌙 night/mo : {_delta(bn, cn, basis)}")

    lines.append("")
    lines.append(" (pace-normalized: monthly figures, so different windows compare honestly)")
    lines.append("")
    return "\n".join(lines)
|
agentburn/benchmarks.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Community-measured reference points, embedded as dated constants.
|
|
2
|
+
|
|
3
|
+
These are NOT our measurements. Each constant carries its public source and
|
|
4
|
+
date; the report cites them verbatim so users can calibrate "is my number
|
|
5
|
+
normal?" without agentburn ever touching the network.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
# Phala / Clawdi token benchmark of an always-on agent (OpenClaw-class),
# published 2026-03-10: https://phala.com/posts/understanding-openclaws-token-usage
PHALA_2026_03 = dict(
    source="Phala token benchmark, 2026-03",
    url="https://phala.com/posts/understanding-openclaws-token-usage",
    # instruction/bootstrap baseline resent per request
    baseline_tokens_per_call=8_000,
    # 5-turn dialog vs single turn
    multi_turn_5x_cost_factor=13.3,
    # output is 1–6% of tokens
    output_share_typical=0.06,
)

# Hermes Agent community measurement (user-reported, issue #4379, 2026):
# https://github.com/NousResearch/hermes-agent/issues/4379
HERMES_4379 = dict(
    source="hermes-agent #4379 (user-measured)",
    url="https://github.com/NousResearch/hermes-agent/issues/4379",
    fixed_overhead_tokens=13_935,
    overhead_share=0.73,
)

# Per-call input-token baseline used by overhead_vs_reference().
REFERENCE_BASELINE = PHALA_2026_03["baseline_tokens_per_call"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def overhead_vs_reference(avg_input_per_call: int) -> str:
    """Describe, in one line, how an average input size compares to the reference.

    Returns an empty string for a non-positive average; otherwise the
    reference baseline (in thousands of tokens per call), its source, and the
    signed percentage difference.
    """
    if avg_input_per_call <= 0:
        return ""
    delta = avg_input_per_call / REFERENCE_BASELINE - 1
    if delta >= 0:
        sign = "+"
    else:
        sign = "−"
    baseline_k = REFERENCE_BASELINE // 1000
    source = PHALA_2026_03["source"]
    return f"community baseline ≈{baseline_k}k/call ({source}): {sign}{abs(delta):.0%}"
|