react-agent-harness 0.5.2__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {react_agent_harness-0.5.2/react_agent_harness.egg-info → react_agent_harness-0.6.1}/PKG-INFO +2 -2
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/README.md +238 -5
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/agents/base.py +13 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/cli.py +34 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/console.py +39 -2
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/anthropic.py +26 -5
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/claude_code.py +40 -2
- react_agent_harness-0.6.1/harness/llm/fallback.py +171 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/openai.py +21 -5
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/openai_codex.py +34 -2
- react_agent_harness-0.6.1/harness/llm/routing.py +141 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/runtime.py +200 -30
- react_agent_harness-0.6.1/harness/trace.py +171 -0
- react_agent_harness-0.6.1/harness/trace_viewer.py +326 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/orchestrator/planner.py +19 -3
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/pyproject.toml +2 -2
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1/react_agent_harness.egg-info}/PKG-INFO +2 -2
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/SOURCES.txt +9 -0
- react_agent_harness-0.6.1/tests/test_budget_guard.py +134 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_cli.py +4 -0
- react_agent_harness-0.6.1/tests/test_console_renderer.py +145 -0
- react_agent_harness-0.6.1/tests/test_fallback_llm.py +221 -0
- react_agent_harness-0.6.1/tests/test_per_call_site_llm.py +420 -0
- react_agent_harness-0.6.1/tests/test_routing_llm.py +164 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_streaming.py +1 -1
- react_agent_harness-0.6.1/tests/test_trace.py +240 -0
- react_agent_harness-0.5.2/tests/test_console_renderer.py +0 -52
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/LICENSE +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/agents/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/annotation.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/checkpoint.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/events.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/executor_bridge.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/hitl.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/_streaming.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/auth.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/oauth_browser.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/otel.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/steering.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/tool_policy.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/utils.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/episodic_lance.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/manager.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/redis_store.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/stores.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/working.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/orchestrator/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/dependency_links.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/entry_points.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/requires.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/top_level.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/setup.cfg +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_agents_base.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_annotation.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_anthropic_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_checkpoint_resume.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_claude_code_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_executor_bridge.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_http_fetch.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_llm_auth.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_mcp_adapter.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_mcp_auth.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_memory.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_oauth_browser.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_openai_codex_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_openai_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_orchestrator.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_otel.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_parse_action_json.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_redis_store.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_steering.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_tool_policy.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_utils.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_vision.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_working_memory.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/builtin/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/builtin/fetch_image.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/builtin/http_fetch.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/mcp/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/mcp/adapter.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/mcp/auth.py +0 -0
{react_agent_harness-0.5.2/react_agent_harness.egg-info → react_agent_harness-0.6.1}/PKG-INFO
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: react-agent-harness
|
|
3
|
-
Version: 0.5.2
|
|
4
|
-
Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming
|
|
3
|
+
Version: 0.6.1
|
|
4
|
+
Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming, cost/token budgets with per-call-site breakdown
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Dist: prompt_toolkit>=3.0
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# react-agent-harness
|
|
2
2
|
|
|
3
3
|
Bring-your-own-LLM multi-agent harness: hybrid DAG planning with replan-on-failure,
|
|
4
|
-
two-tier memory (semantic KV + episodic vector),
|
|
4
|
+
two-tier memory (semantic KV + episodic vector), a streaming-primary event model,
|
|
5
|
+
and cost/token budgets with per-call-site attribution (classifier, router,
|
|
6
|
+
planner, synthesizer, agent).
|
|
5
7
|
|
|
6
8
|
Config-driven — register tools and agents, run any goal. No subclassing.
|
|
7
9
|
|
|
@@ -33,13 +35,21 @@ events to stdout and prints elapsed time + cost at the end.
|
|
|
33
35
|
## Architecture
|
|
34
36
|
|
|
35
37
|
```
|
|
36
|
-
harness/runtime.py AgentRuntime — single entry point
|
|
38
|
+
harness/runtime.py AgentRuntime — single entry point; BudgetGuard with cost/token caps + per-call-site breakdown
|
|
37
39
|
harness/events.py BusEvent + EventType — canonical event vocabulary
|
|
38
|
-
harness/llm/openai.py OpenAILLM — OpenAI adapter with usage + cost tracking
|
|
40
|
+
harness/llm/openai.py OpenAILLM — OpenAI API-key adapter with usage + cost tracking
|
|
41
|
+
harness/llm/anthropic.py AnthropicLLM — direct Anthropic API-key adapter with prompt-caching support
|
|
42
|
+
harness/llm/claude_code.py ClaudeCodeLLM — Claude subscription OAuth adapter (experimental, ToS caveats)
|
|
43
|
+
harness/llm/openai_codex.py OpenAICodexLLM — ChatGPT subscription OAuth adapter (experimental, ToS caveats)
|
|
44
|
+
harness/llm/auth.py Shared OAuth + auth-file primitives for the subscription adapters
|
|
45
|
+
harness/llm/fallback.py FallbackLLM — transparent retry on transient upstream errors
|
|
46
|
+
harness/llm/routing.py RoutingLLM — dispatch calls to different adapters by a selector
|
|
47
|
+
harness/trace.py JSONL trace recorder + replay — durable, per-event flush
|
|
48
|
+
harness/trace_viewer.py Local web timeline viewer for recorded JSONL traces
|
|
39
49
|
harness/annotation.py Annotation store + AnnotationHook — RLHF trajectory capture
|
|
40
50
|
harness/hitl.py HITL approval gate — interactive CLI, session-allow list
|
|
41
51
|
harness/tool_policy.py Persistent tool policy — user-scoped allow rules, CLI management
|
|
42
|
-
harness/console.py ConsoleRenderer — centralised BusEvent formatting
|
|
52
|
+
harness/console.py ConsoleRenderer — centralised BusEvent formatting + render_budget helper
|
|
43
53
|
harness/steering.py Async steering — agent.steer(text), StdinRouter pub/sub, FileSteer, factory helpers
|
|
44
54
|
harness/checkpoint.py CheckpointStore + _ResumeHint + maybe_resume_key — pluggable run-state persistence (file + Redis); auto-resume built into dispatch_stream / run_stream
|
|
45
55
|
harness/otel.py OTELHook — OpenTelemetry span exporter (opt-in)
|
|
@@ -54,7 +64,8 @@ memory/redis_store.py Redis semantic store — durable KV with TTL
|
|
|
54
64
|
memory/stores.py InMemory stores — local dev default, no deps
|
|
55
65
|
tools/builtin/http_fetch.py HTTPFetch — minimal read-only GET tool
|
|
56
66
|
tools/builtin/fetch_image.py FetchImage — fetch URL and return OpenAI image_url block
|
|
57
|
-
tools/mcp/adapter.py MCP tool adapter —
|
|
67
|
+
tools/mcp/adapter.py MCP tool adapter — stdio, SSE, streamable-HTTP transports
|
|
68
|
+
tools/mcp/auth.py ApiKeyMCPAuth + BrowserOAuthMCPAuth — auth primitives for remote MCP servers
|
|
58
69
|
```
|
|
59
70
|
|
|
60
71
|
Execution is **streaming-primary**: every path yields `BusEvent`s for
|
|
@@ -188,6 +199,99 @@ llm = ClaudeCodeLLM(
|
|
|
188
199
|
)
|
|
189
200
|
```
|
|
190
201
|
|
|
202
|
+
### Cost shaping + reliability
|
|
203
|
+
|
|
204
|
+
Three patterns, ordered by how production teams actually solve this:
|
|
205
|
+
|
|
206
|
+
**1. Per-call-site LLM injection (the recommended pattern)**
|
|
207
|
+
|
|
208
|
+
`AgentRuntime` exposes one slot per orchestrator call site. Each defaults to
|
|
209
|
+
`llm` when unset, so existing code keeps working. The classifier and router
|
|
210
|
+
both see only the goal + agent descriptions (~300 tokens) and emit a
|
|
211
|
+
one-token decision — natural candidates for a cheaper model. The planner
|
|
212
|
+
and synthesiser produce structured DAGs and final answers and usually want
|
|
213
|
+
to stay on the main model.
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
runtime = AgentRuntime(
|
|
217
|
+
agent_registry=agents,
|
|
218
|
+
tool_registry=tools,
|
|
219
|
+
memory=memory,
|
|
220
|
+
llm=premium, # default — agent ReAct loops use this
|
|
221
|
+
classifier_llm=cheap, # simple vs complex dispatch decision
|
|
222
|
+
router_llm=cheap, # single-agent picker
|
|
223
|
+
# planner_llm=... # defaults to llm; override only if you want
|
|
224
|
+
# synthesizer_llm=... # defaults to llm
|
|
225
|
+
)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
No guessing, no keyword matching, no fragility — you read the runtime
|
|
229
|
+
construction and you know exactly which model serves which purpose. The
|
|
230
|
+
budget guard is wired into every distinct LLM instance automatically
|
|
231
|
+
(deduped by object identity, so injecting the same wrapper into multiple
|
|
232
|
+
slots costs no extra calls).
|
|
233
|
+
|
|
234
|
+
**2. `FallbackLLM` for resilience**
|
|
235
|
+
|
|
236
|
+
Try each adapter in order; transparently switch to the next on rate
|
|
237
|
+
limits, timeouts, or 5xx errors:
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
from harness.llm.fallback import FallbackLLM
|
|
241
|
+
|
|
242
|
+
llm = FallbackLLM([
|
|
243
|
+
AnthropicLLM(model="claude-sonnet-4-6"), # primary
|
|
244
|
+
OpenAILLM(model="gpt-4o-mini"), # backup
|
|
245
|
+
])
|
|
246
|
+
runtime = AgentRuntime(..., llm=llm)
|
|
247
|
+
print(llm.last_route) # 0 if primary worked, 1 if backup did
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
Permanent errors (auth, bad request) propagate immediately — only transient
|
|
251
|
+
upstream errors trigger fallback. Customise with `transient_errors=...`.
|
|
252
|
+
Streaming retries only fire before the first token; mid-stream failures
|
|
253
|
+
propagate to preserve response integrity.
|
|
254
|
+
|
|
255
|
+
**3. `RoutingLLM` for bring-your-own-selector cases**
|
|
256
|
+
|
|
257
|
+
When you need runtime routing — capability gating (`vision` vs
|
|
258
|
+
`long_context`), learned classifiers (RouteLLM-style), cascade
|
|
259
|
+
routing (cheap-then-escalate-on-low-confidence) — wrap a routes dict
|
|
260
|
+
with your own selector callable:
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
from harness.llm.routing import RoutingLLM
|
|
264
|
+
|
|
265
|
+
def by_capability(system, messages):
|
|
266
|
+
if _needs_vision(messages):
|
|
267
|
+
return "vision"
|
|
268
|
+
if _estimated_tokens(system, messages) > 100_000:
|
|
269
|
+
return "long_context"
|
|
270
|
+
return "default"
|
|
271
|
+
|
|
272
|
+
llm = RoutingLLM(
|
|
273
|
+
routes={
|
|
274
|
+
"default": OpenAILLM(model="gpt-4o-mini"),
|
|
275
|
+
"vision": OpenAILLM(model="gpt-4o"),
|
|
276
|
+
"long_context": AnthropicLLM(model="claude-sonnet-4-6"),
|
|
277
|
+
},
|
|
278
|
+
selector=by_capability,
|
|
279
|
+
default_route="default",
|
|
280
|
+
)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
The harness intentionally does not ship default selectors. Naive selectors
|
|
284
|
+
(keyword matching, fixed token thresholds) misroute in subtle ways and
|
|
285
|
+
encourage the wrong mental model — if you're reaching for one, you almost
|
|
286
|
+
certainly want per-call-site injection instead.
|
|
287
|
+
|
|
288
|
+
Compose freely: `FallbackLLM([premium, backup])` injected into the
|
|
289
|
+
`llm=` slot gives the agent loops resilience, with `classifier_llm=cheap`
|
|
290
|
+
and `router_llm=cheap` shaping the cheap-call cost — all without a custom
|
|
291
|
+
selector.
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
191
295
|
`ClaudeCodeLLM` reads a `claude-code` OAuth entry, refreshes it automatically
|
|
192
296
|
when expired, and retries once after `401`/`403`. This mirrors Pi's Claude
|
|
193
297
|
Pro/Max extension approach rather than shelling out to the Claude CLI. The
|
|
@@ -495,6 +599,92 @@ Cost ceiling fires on the *next* `check()` (start of next ReAct step or
|
|
|
495
599
|
orchestrator batch), not synchronously mid-call — accept this for 0.0.1, the
|
|
496
600
|
guard's job is preventing runaway loops, not bounding individual calls.
|
|
497
601
|
|
|
602
|
+
### Token limits + per-call-site breakdown
|
|
603
|
+
|
|
604
|
+
`GuardrailConfig.max_input_tokens` / `max_output_tokens` cap raw token
|
|
605
|
+
usage independently of dollar cost. This is the only enforcement available
|
|
606
|
+
to subscription-auth runs (`ClaudeCodeLLM`, `OpenAICodexLLM`) — those tiers
|
|
607
|
+
don't expose pricing, so cost stays 0 and only token caps can fire.
|
|
608
|
+
|
|
609
|
+
```python
|
|
610
|
+
runtime = AgentRuntime(
|
|
611
|
+
...,
|
|
612
|
+
guardrail_config=GuardrailConfig(
|
|
613
|
+
max_total_cost_usd=2.0,
|
|
614
|
+
max_input_tokens=100_000,
|
|
615
|
+
max_output_tokens=20_000,
|
|
616
|
+
),
|
|
617
|
+
)
|
|
618
|
+
```
|
|
619
|
+
|
|
620
|
+
Per-call-site attribution lives on the terminal event's `budget` payload
|
|
621
|
+
— a snapshot of spending bucketed by the LLM slot that ran each call.
|
|
622
|
+
The runtime tags classifier / router / planner / synthesizer calls
|
|
623
|
+
automatically; ReAct agent calls are tagged `agent:<agent_id>` and get a
|
|
624
|
+
bucket per agent. So `cheap` (used for both `classifier_llm` and `router_llm`) and
|
|
625
|
+
`premium` (used for `planner_llm`) report separately even though one is
|
|
626
|
+
the same physical LLM instance shared across slots:
|
|
627
|
+
|
|
628
|
+
```python
|
|
629
|
+
async for event in runtime.dispatch_stream(goal):
|
|
630
|
+
# Routed (simple) goals terminate with TASK_DONE; orchestrated goals
|
|
631
|
+
# with DONE. Both carry the same ``budget`` shape.
|
|
632
|
+
if event.type in (EventType.TASK_DONE, EventType.DONE):
|
|
633
|
+
budget = event.payload["budget"]
|
|
634
|
+
print(f"total: in={budget['tokens_in']} out={budget['tokens_out']} "
|
|
635
|
+
f"${budget['cost_usd']:.4f}")
|
|
636
|
+
for slot, stats in budget["breakdown"].items():
|
|
637
|
+
print(f" {slot}: in={stats['tokens_in']} out={stats['tokens_out']}")
|
|
638
|
+
```
|
|
639
|
+
|
|
640
|
+
The same `budget` dict is attached to `runtime.run(...)` and
|
|
641
|
+
`runtime.dispatch(...)` return values under the `budget` key, so blocking
|
|
642
|
+
callers don't need to read events.
|
|
643
|
+
|
|
644
|
+
Anthropic / Claude Code adapters count input tokens as the *total* that
|
|
645
|
+
hit the wire (non-cached + cache-creation + cache-read), so token caps
|
|
646
|
+
reflect actual consumption regardless of cache hit rate. Cost calculation
|
|
647
|
+
via `cost_fn` still respects cache pricing.
|
|
648
|
+
|
|
649
|
+
### Evals via the trace recorder
|
|
650
|
+
|
|
651
|
+
There's no shipped evals framework — opinions on scorers, judge models,
|
|
652
|
+
and golden-set management belong outside the orchestration core. The
|
|
653
|
+
[trace recorder](#trace-recorder--replay--local-viewer) already writes
|
|
654
|
+
per-event token/cost/latency to JSONL, so a few lines of glue cover most
|
|
655
|
+
in-house eval setups:
|
|
656
|
+
|
|
657
|
+
```python
|
|
658
|
+
import json
|
|
659
|
+
from harness.trace import record_trace
|
|
660
|
+
|
|
661
|
+
# 1. Record traces while running a fixture set.
|
|
662
|
+
for fixture in fixtures:
|
|
663
|
+
async for _event in record_trace(
|
|
664
|
+
runtime.dispatch_stream(fixture["input"]),
|
|
665
|
+
path=f"runs/{fixture['id']}.jsonl",
|
|
666
|
+
):
|
|
667
|
+
pass
|
|
668
|
+
|
|
669
|
+
# 2. Score offline by replaying.
|
|
670
|
+
def score_run(path: str, expected: str) -> dict:
|
|
671
|
+
answer = ""
|
|
672
|
+
budget = {"tokens_in": 0, "tokens_out": 0, "cost_usd": 0.0, "breakdown": {}}
|
|
673
|
+
for line in open(path):
|
|
674
|
+
event = json.loads(line)
|
|
675
|
+
if event["type"] in ("done", "task_done"):
|
|
676
|
+
answer = event["payload"].get("answer", "")
|
|
677
|
+
budget = event["payload"].get("budget", budget)
|
|
678
|
+
return {
|
|
679
|
+
"success": expected.lower() in answer.lower(),
|
|
680
|
+
**budget, # tokens_in, tokens_out, cost_usd, breakdown
|
|
681
|
+
}
|
|
682
|
+
```
|
|
683
|
+
|
|
684
|
+
Plug in your own scorer (exact-match, LLM-judge, semantic similarity) on
|
|
685
|
+
top. External tools like Braintrust, LangSmith, and Weave are
|
|
686
|
+
purpose-built for this and ingest the same JSONL shape directly.
|
|
687
|
+
|
|
498
688
|
## Tool execution
|
|
499
689
|
|
|
500
690
|
Tools that shell out (`kubectl`, `curl`, `sh -c …`) should not run inside the
|
|
@@ -722,6 +912,49 @@ The OTEL hook is a side-channel on the existing `Tracer` — the in-memory trace
|
|
|
722
912
|
is always available via `result["trace"]` regardless of whether OTEL is enabled.
|
|
723
913
|
Zero overhead and zero imports when `enable_otel=False`.
|
|
724
914
|
|
|
915
|
+
## Trace recorder + replay + local viewer
|
|
916
|
+
|
|
917
|
+
For local debug and post-mortem inspection without an OTEL backend, the
|
|
918
|
+
harness ships a JSONL trace recorder and a stdlib-only HTML viewer. Wrap
|
|
919
|
+
any streaming call:
|
|
920
|
+
|
|
921
|
+
```python
|
|
922
|
+
from harness.trace import record_trace, replay
|
|
923
|
+
|
|
924
|
+
async for event in record_trace(runtime.dispatch_stream(goal), "run.jsonl"):
|
|
925
|
+
... # your normal handling
|
|
926
|
+
```
|
|
927
|
+
|
|
928
|
+
Each `BusEvent` is flushed per-line, so a partial trace survives a crash.
|
|
929
|
+
View the trace in your browser:
|
|
930
|
+
|
|
931
|
+
```bash
|
|
932
|
+
agent-harness trace view run.jsonl # opens http://127.0.0.1:8765/
|
|
933
|
+
```
|
|
934
|
+
|
|
935
|
+
The viewer is a single embedded HTML page — vertical timeline, filter by
|
|
936
|
+
agent / event type / text, expandable per-event JSON. No build step, no
|
|
937
|
+
external services.
|
|
938
|
+
|
|
939
|
+
Replay a trace through `ConsoleRenderer` (great for grepping or piping
|
|
940
|
+
into another script):
|
|
941
|
+
|
|
942
|
+
```bash
|
|
943
|
+
agent-harness trace replay run.jsonl
|
|
944
|
+
agent-harness trace replay run.jsonl --realtime --speed 2.0
|
|
945
|
+
```
|
|
946
|
+
|
|
947
|
+
Programmatic replay yields reconstructed `BusEvent` objects:
|
|
948
|
+
|
|
949
|
+
```python
|
|
950
|
+
async for event in replay("run.jsonl", realtime=False):
|
|
951
|
+
... # reuse the same loops you write for live streams
|
|
952
|
+
```
|
|
953
|
+
|
|
954
|
+
This is complementary to OTEL — OTEL is for production observability and
|
|
955
|
+
long-term storage in Jaeger/Datadog; the JSONL recorder is for local
|
|
956
|
+
debugging, sharing reproductions, and replaying past runs.
|
|
957
|
+
|
|
725
958
|
## Vision / multimodal agents
|
|
726
959
|
|
|
727
960
|
`WorkingMemory` accepts `str | list` content so image blocks pass through to
|
|
@@ -422,6 +422,12 @@ class BaseAgent:
|
|
|
422
422
|
"summarizations": self._working_memory.summarization_count,
|
|
423
423
|
},
|
|
424
424
|
}
|
|
425
|
+
# Attach the current budget snapshot so dispatch_stream
|
|
426
|
+
# consumers can read totals + per-call-site breakdown off
|
|
427
|
+
# the routed path's terminal event, same shape as the
|
|
428
|
+
# orchestrator's DONE event.
|
|
429
|
+
if self._guard is not None and hasattr(self._guard, "snapshot"):
|
|
430
|
+
result["budget"] = self._guard.snapshot()
|
|
425
431
|
logger.info(
|
|
426
432
|
"Agent %s completed: steps=%d confidence=%.2f summarizations=%d",
|
|
427
433
|
self.config.agent_id,
|
|
@@ -653,11 +659,17 @@ class BaseAgent:
|
|
|
653
659
|
payload=before_usage,
|
|
654
660
|
)
|
|
655
661
|
|
|
662
|
+
# Tag ReAct spending so it shows up in BudgetGuard.breakdown alongside
|
|
663
|
+
# classifier/router/planner/synthesizer. Per-agent attribution makes
|
|
664
|
+
# multi-agent demos surface which specialist agent actually drove the
|
|
665
|
+
# bulk of token usage.
|
|
666
|
+
react_source = f"agent:{self.config.agent_id}"
|
|
656
667
|
try:
|
|
657
668
|
if hasattr(self._llm, "stream_complete"):
|
|
658
669
|
async for token in self._llm.stream_complete(
|
|
659
670
|
system=None,
|
|
660
671
|
messages=messages,
|
|
672
|
+
source=react_source,
|
|
661
673
|
):
|
|
662
674
|
accumulated += token
|
|
663
675
|
if self.config.stream_tokens:
|
|
@@ -679,6 +691,7 @@ class BaseAgent:
|
|
|
679
691
|
system=None,
|
|
680
692
|
messages=messages,
|
|
681
693
|
response_format={"type": "json_object"},
|
|
694
|
+
source=react_source,
|
|
682
695
|
)
|
|
683
696
|
response = _normalize_response(raw)
|
|
684
697
|
if response is None:
|
|
@@ -46,6 +46,19 @@ def main() -> int:
|
|
|
46
46
|
policy_clear = policy_sub.add_parser("clear", help="remove all policy rules")
|
|
47
47
|
policy_clear.add_argument("--policy-file", default=str(default_policy_file()))
|
|
48
48
|
|
|
49
|
+
trace = sub.add_parser("trace", help="view or replay a recorded run trace")
|
|
50
|
+
trace_sub = trace.add_subparsers(dest="trace_command", required=True)
|
|
51
|
+
trace_view = trace_sub.add_parser("view", help="open a local web viewer for a trace")
|
|
52
|
+
trace_view.add_argument("path", help="path to a JSONL trace produced by record_trace")
|
|
53
|
+
trace_view.add_argument("--port", type=int, default=8765)
|
|
54
|
+
trace_view.add_argument("--no-open", action="store_true", help="don't auto-open the browser")
|
|
55
|
+
trace_replay = trace_sub.add_parser("replay", help="dump a trace to stdout via ConsoleRenderer")
|
|
56
|
+
trace_replay.add_argument("path", help="path to a JSONL trace produced by record_trace")
|
|
57
|
+
trace_replay.add_argument(
|
|
58
|
+
"--realtime", action="store_true", help="preserve recorded inter-event timing"
|
|
59
|
+
)
|
|
60
|
+
trace_replay.add_argument("--speed", type=float, default=1.0, help="realtime speed multiplier")
|
|
61
|
+
|
|
49
62
|
args = parser.parse_args()
|
|
50
63
|
try:
|
|
51
64
|
if args.command == "login":
|
|
@@ -71,6 +84,16 @@ def main() -> int:
|
|
|
71
84
|
return _policy_revoke(path, args.rule_id)
|
|
72
85
|
if args.policy_command == "clear":
|
|
73
86
|
return _policy_clear(path)
|
|
87
|
+
if args.command == "trace":
|
|
88
|
+
if args.trace_command == "view":
|
|
89
|
+
from harness.trace_viewer import serve
|
|
90
|
+
|
|
91
|
+
serve(args.path, port=args.port, open_browser=not args.no_open)
|
|
92
|
+
return 0
|
|
93
|
+
if args.trace_command == "replay":
|
|
94
|
+
return asyncio.run(
|
|
95
|
+
_trace_replay(args.path, realtime=args.realtime, speed=args.speed)
|
|
96
|
+
)
|
|
74
97
|
except Exception as e:
|
|
75
98
|
print(f"agent-harness: {e}", file=sys.stderr)
|
|
76
99
|
return 1
|
|
@@ -180,5 +203,16 @@ def _policy_clear(path: Path) -> int:
|
|
|
180
203
|
return 0
|
|
181
204
|
|
|
182
205
|
|
|
206
|
+
async def _trace_replay(path: str, *, realtime: bool, speed: float) -> int:
|
|
207
|
+
"""Read a JSONL trace and render it via ConsoleRenderer."""
|
|
208
|
+
from harness.console import ConsoleRenderer
|
|
209
|
+
from harness.trace import replay
|
|
210
|
+
|
|
211
|
+
renderer = ConsoleRenderer()
|
|
212
|
+
async for event in replay(path, realtime=realtime, speed=speed):
|
|
213
|
+
renderer.render(event)
|
|
214
|
+
return 0
|
|
215
|
+
|
|
216
|
+
|
|
183
217
|
if __name__ == "__main__":
|
|
184
218
|
raise SystemExit(main())
|
|
@@ -178,17 +178,54 @@ class ConsoleRenderer:
|
|
|
178
178
|
self.sep("═")
|
|
179
179
|
print(p.get("answer", "(no answer)"), file=self._out)
|
|
180
180
|
self.sep()
|
|
181
|
+
# ``budget`` snapshot supersedes the flat cost/elapsed fields when
|
|
182
|
+
# present (added with token caps + per-call-site breakdown).
|
|
183
|
+
budget = p.get("budget") or {}
|
|
184
|
+
cost = budget.get("cost_usd", p.get("cost_usd", 0))
|
|
185
|
+
elapsed = budget.get("elapsed_seconds", p.get("elapsed_seconds", 0))
|
|
181
186
|
print(
|
|
182
187
|
f"Confidence: {p.get('confidence', 0):.2f} | "
|
|
183
188
|
f"Replans: {p.get('replan_count', 0)} | "
|
|
184
|
-
f"Cost: ${
|
|
185
|
-
f"Time: {
|
|
189
|
+
f"Cost: ${cost:.4f} | "
|
|
190
|
+
f"Time: {elapsed:.1f}s",
|
|
186
191
|
file=self._out,
|
|
187
192
|
)
|
|
193
|
+
self.render_budget(budget)
|
|
188
194
|
|
|
189
195
|
elif t == EventType.ERROR:
|
|
190
196
|
print(f"\n[error] {event.error}", file=sys.stderr)
|
|
191
197
|
|
|
198
|
+
def render_budget(self, budget: dict | None) -> None:
|
|
199
|
+
"""Print tokens + per-call-site breakdown from a ``BudgetGuard.snapshot()``
|
|
200
|
+
dict. Safe to call with ``{}`` or ``None`` — prints nothing when
|
|
201
|
+
there's no usage to show.
|
|
202
|
+
|
|
203
|
+
Exposed publicly so demos and other consumers that own their own
|
|
204
|
+
DONE / TASK_DONE rendering can still surface the breakdown without
|
|
205
|
+
duplicating the formatting.
|
|
206
|
+
"""
|
|
207
|
+
if not budget:
|
|
208
|
+
return
|
|
209
|
+
tokens_in = budget.get("tokens_in")
|
|
210
|
+
tokens_out = budget.get("tokens_out")
|
|
211
|
+
if tokens_in is not None or tokens_out is not None:
|
|
212
|
+
print(
|
|
213
|
+
f"Tokens: in={int(tokens_in or 0):,} out={int(tokens_out or 0):,}",
|
|
214
|
+
file=self._out,
|
|
215
|
+
)
|
|
216
|
+
breakdown = budget.get("breakdown") or {}
|
|
217
|
+
if breakdown:
|
|
218
|
+
# Right-pad the slot label so columns line up — matters when
|
|
219
|
+
# the demo prints multiple slots in sequence.
|
|
220
|
+
width = max(len(name) for name in breakdown)
|
|
221
|
+
for slot, stats in breakdown.items():
|
|
222
|
+
print(
|
|
223
|
+
f" {slot:<{width}} "
|
|
224
|
+
f"in={int(stats.get('tokens_in', 0)):>7,} "
|
|
225
|
+
f"out={int(stats.get('tokens_out', 0)):>6,}",
|
|
226
|
+
file=self._out,
|
|
227
|
+
)
|
|
228
|
+
|
|
192
229
|
# ── private helpers ───────────────────────────────────────────────────────
|
|
193
230
|
|
|
194
231
|
def _label(self, event: BusEvent) -> str:
|
|
@@ -91,6 +91,8 @@ class AnthropicLLM:
|
|
|
91
91
|
self,
|
|
92
92
|
system: str | None,
|
|
93
93
|
messages: list[dict],
|
|
94
|
+
*,
|
|
95
|
+
source: str | None = None,
|
|
94
96
|
**kwargs: Any,
|
|
95
97
|
) -> dict:
|
|
96
98
|
max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
|
|
@@ -110,7 +112,7 @@ class AnthropicLLM:
|
|
|
110
112
|
cost = _compute_cost(usage, self._cost_fn)
|
|
111
113
|
if cost is not None:
|
|
112
114
|
usage["cost_usd"] = cost
|
|
113
|
-
self.
|
|
115
|
+
self._record_usage(usage, source=source)
|
|
114
116
|
self.last_usage = usage
|
|
115
117
|
|
|
116
118
|
text = _collect_text(resp.content)
|
|
@@ -122,6 +124,8 @@ class AnthropicLLM:
|
|
|
122
124
|
self,
|
|
123
125
|
system: str | None,
|
|
124
126
|
messages: list[dict],
|
|
127
|
+
*,
|
|
128
|
+
source: str | None = None,
|
|
125
129
|
) -> AsyncGenerator[str, None]:
|
|
126
130
|
sys_blocks = _system_blocks(system, prompt_caching=self._prompt_caching)
|
|
127
131
|
built_messages = _build_messages(messages, prompt_caching=self._prompt_caching)
|
|
@@ -143,17 +147,34 @@ class AnthropicLLM:
|
|
|
143
147
|
cost = _compute_cost(usage, self._cost_fn)
|
|
144
148
|
if cost is not None:
|
|
145
149
|
usage["cost_usd"] = cost
|
|
146
|
-
self.
|
|
150
|
+
self._record_usage(usage, source=source)
|
|
147
151
|
self.last_usage = usage
|
|
148
152
|
|
|
149
153
|
# ── Internals ─────────────────────────────────────────────────────────────
|
|
150
154
|
|
|
151
|
-
def
|
|
152
|
-
|
|
155
|
+
def _record_usage(self, usage: dict, *, source: str | None) -> None:
|
|
156
|
+
"""Forward usage to the budget guard.
|
|
157
|
+
|
|
158
|
+
Token count for budget purposes is the total input that hit the wire
|
|
159
|
+
— non-cached + cache-creation + cache-read — so token caps reflect
|
|
160
|
+
real wall-clock consumption regardless of cache hit rate. Cost
|
|
161
|
+
(which respects cache pricing via ``cost_fn``) is reported when
|
|
162
|
+
known.
|
|
163
|
+
"""
|
|
164
|
+
guard = self._budget
|
|
165
|
+
if not guard:
|
|
153
166
|
return
|
|
167
|
+
tokens_in = (
|
|
168
|
+
int(usage.get("tokens_in") or 0)
|
|
169
|
+
+ int(usage.get("cache_read_tokens") or 0)
|
|
170
|
+
+ int(usage.get("cache_creation_tokens") or 0)
|
|
171
|
+
)
|
|
172
|
+
tokens_out = int(usage.get("tokens_out") or 0)
|
|
173
|
+
if (tokens_in or tokens_out) and hasattr(guard, "add_tokens"):
|
|
174
|
+
guard.add_tokens(tokens_in, tokens_out, source=source)
|
|
154
175
|
cost = usage.get("cost_usd")
|
|
155
176
|
if cost and cost > 0:
|
|
156
|
-
|
|
177
|
+
guard.add_cost(cost, source=source)
|
|
157
178
|
|
|
158
179
|
|
|
159
180
|
# ── Module-level helpers ──────────────────────────────────────────────────────
|
|
@@ -68,12 +68,24 @@ class ClaudeCodeLLM:
|
|
|
68
68
|
self._user_agent = user_agent or _default_user_agent()
|
|
69
69
|
self._betas = betas
|
|
70
70
|
self._prompt_caching = prompt_caching
|
|
71
|
+
self._budget: Any = None
|
|
71
72
|
self.last_usage: dict | None = None
|
|
72
73
|
|
|
74
|
+
def set_budget(self, guard: Any) -> None:
    """Attach the budget guard that subsequent calls report usage to.

    Subscription-auth runs have no pricing schedule, so cost stays 0;
    token totals are still forwarded through ``add_tokens`` so that
    ``max_input_tokens`` / ``max_output_tokens`` remain enforced.
    """
    self._budget = guard
|
|
82
|
+
|
|
73
83
|
async def complete(
|
|
74
84
|
self,
|
|
75
85
|
system: str | None,
|
|
76
86
|
messages: list[dict],
|
|
87
|
+
*,
|
|
88
|
+
source: str | None = None,
|
|
77
89
|
**kwargs: Any,
|
|
78
90
|
) -> dict:
|
|
79
91
|
"""Collect the streaming response into a single text + usage dict.
|
|
@@ -84,7 +96,9 @@ class ClaudeCodeLLM:
|
|
|
84
96
|
"""
|
|
85
97
|
max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
|
|
86
98
|
parts: list[str] = []
|
|
87
|
-
async for delta in self._iter_stream(
|
|
99
|
+
async for delta in self._iter_stream(
|
|
100
|
+
system, messages, max_tokens=max_tokens, extra=kwargs, source=source
|
|
101
|
+
):
|
|
88
102
|
parts.append(delta)
|
|
89
103
|
text = "".join(parts)
|
|
90
104
|
if not text:
|
|
@@ -95,9 +109,11 @@ class ClaudeCodeLLM:
|
|
|
95
109
|
self,
|
|
96
110
|
system: str | None,
|
|
97
111
|
messages: list[dict],
|
|
112
|
+
*,
|
|
113
|
+
source: str | None = None,
|
|
98
114
|
) -> AsyncGenerator[str, None]:
|
|
99
115
|
async for delta in self._iter_stream(
|
|
100
|
-
system, messages, max_tokens=self._max_tokens, extra={}
|
|
116
|
+
system, messages, max_tokens=self._max_tokens, extra={}, source=source
|
|
101
117
|
):
|
|
102
118
|
yield delta
|
|
103
119
|
|
|
@@ -114,6 +130,7 @@ class ClaudeCodeLLM:
|
|
|
114
130
|
*,
|
|
115
131
|
max_tokens: int,
|
|
116
132
|
extra: dict[str, Any],
|
|
133
|
+
source: str | None = None,
|
|
117
134
|
) -> AsyncGenerator[str, None]:
|
|
118
135
|
"""Single source of truth: open Anthropic SSE stream, yield text
|
|
119
136
|
deltas, populate `self.last_usage`. Auth refresh on 401/403
|
|
@@ -182,10 +199,31 @@ class ClaudeCodeLLM:
|
|
|
182
199
|
"total_tokens": tokens_in + tokens_out,
|
|
183
200
|
"provider": "claude-code",
|
|
184
201
|
}
|
|
202
|
+
self._record_usage(self.last_usage, source=source)
|
|
185
203
|
return
|
|
186
204
|
|
|
187
205
|
raise RuntimeError("Claude Code authentication failed after refresh")
|
|
188
206
|
|
|
207
|
+
def _record_usage(self, usage: dict, *, source: str | None) -> None:
|
|
208
|
+
"""Report token totals to the budget guard.
|
|
209
|
+
|
|
210
|
+
Tokens budgeted = total input that hit the wire (non-cached +
|
|
211
|
+
cache-creation + cache-read) plus output tokens — so ``max_input_tokens``
|
|
212
|
+
/ ``max_output_tokens`` reflect real consumption regardless of cache
|
|
213
|
+
hit rate. No cost is reported (subscription auth, no pricing).
|
|
214
|
+
"""
|
|
215
|
+
guard = self._budget
|
|
216
|
+
if not guard or not hasattr(guard, "add_tokens"):
|
|
217
|
+
return
|
|
218
|
+
tokens_in = (
|
|
219
|
+
int(usage.get("tokens_in") or 0)
|
|
220
|
+
+ int(usage.get("cache_read_tokens") or 0)
|
|
221
|
+
+ int(usage.get("cache_creation_tokens") or 0)
|
|
222
|
+
)
|
|
223
|
+
tokens_out = int(usage.get("tokens_out") or 0)
|
|
224
|
+
if tokens_in or tokens_out:
|
|
225
|
+
guard.add_tokens(tokens_in, tokens_out, source=source)
|
|
226
|
+
|
|
189
227
|
async def _get_client(self) -> Any:
|
|
190
228
|
if self._client is None:
|
|
191
229
|
try:
|