react-agent-harness 0.5.2__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. {react_agent_harness-0.5.2/react_agent_harness.egg-info → react_agent_harness-0.6.1}/PKG-INFO +2 -2
  2. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/README.md +238 -5
  3. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/agents/base.py +13 -0
  4. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/cli.py +34 -0
  5. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/console.py +39 -2
  6. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/anthropic.py +26 -5
  7. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/claude_code.py +40 -2
  8. react_agent_harness-0.6.1/harness/llm/fallback.py +171 -0
  9. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/openai.py +21 -5
  10. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/openai_codex.py +34 -2
  11. react_agent_harness-0.6.1/harness/llm/routing.py +141 -0
  12. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/runtime.py +200 -30
  13. react_agent_harness-0.6.1/harness/trace.py +171 -0
  14. react_agent_harness-0.6.1/harness/trace_viewer.py +326 -0
  15. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/orchestrator/planner.py +19 -3
  16. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/pyproject.toml +2 -2
  17. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1/react_agent_harness.egg-info}/PKG-INFO +2 -2
  18. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/SOURCES.txt +9 -0
  19. react_agent_harness-0.6.1/tests/test_budget_guard.py +134 -0
  20. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_cli.py +4 -0
  21. react_agent_harness-0.6.1/tests/test_console_renderer.py +145 -0
  22. react_agent_harness-0.6.1/tests/test_fallback_llm.py +221 -0
  23. react_agent_harness-0.6.1/tests/test_per_call_site_llm.py +420 -0
  24. react_agent_harness-0.6.1/tests/test_routing_llm.py +164 -0
  25. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_streaming.py +1 -1
  26. react_agent_harness-0.6.1/tests/test_trace.py +240 -0
  27. react_agent_harness-0.5.2/tests/test_console_renderer.py +0 -52
  28. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/LICENSE +0 -0
  29. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/agents/__init__.py +0 -0
  30. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/__init__.py +0 -0
  31. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/annotation.py +0 -0
  32. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/checkpoint.py +0 -0
  33. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/events.py +0 -0
  34. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/executor_bridge.py +0 -0
  35. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/hitl.py +0 -0
  36. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/__init__.py +0 -0
  37. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/_streaming.py +0 -0
  38. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/llm/auth.py +0 -0
  39. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/oauth_browser.py +0 -0
  40. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/otel.py +0 -0
  41. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/steering.py +0 -0
  42. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/tool_policy.py +0 -0
  43. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/harness/utils.py +0 -0
  44. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/__init__.py +0 -0
  45. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/episodic_lance.py +0 -0
  46. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/manager.py +0 -0
  47. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/redis_store.py +0 -0
  48. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/stores.py +0 -0
  49. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/memory/working.py +0 -0
  50. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/orchestrator/__init__.py +0 -0
  51. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/dependency_links.txt +0 -0
  52. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/entry_points.txt +0 -0
  53. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/requires.txt +0 -0
  54. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/react_agent_harness.egg-info/top_level.txt +0 -0
  55. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/setup.cfg +0 -0
  56. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_agents_base.py +0 -0
  57. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_annotation.py +0 -0
  58. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_anthropic_llm.py +0 -0
  59. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_checkpoint_resume.py +0 -0
  60. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_claude_code_llm.py +0 -0
  61. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_executor_bridge.py +0 -0
  62. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_http_fetch.py +0 -0
  63. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_llm_auth.py +0 -0
  64. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_mcp_adapter.py +0 -0
  65. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_mcp_auth.py +0 -0
  66. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_memory.py +0 -0
  67. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_oauth_browser.py +0 -0
  68. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_openai_codex_llm.py +0 -0
  69. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_openai_llm.py +0 -0
  70. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_orchestrator.py +0 -0
  71. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_otel.py +0 -0
  72. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_parse_action_json.py +0 -0
  73. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_redis_store.py +0 -0
  74. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_steering.py +0 -0
  75. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_tool_policy.py +0 -0
  76. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_utils.py +0 -0
  77. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_vision.py +0 -0
  78. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tests/test_working_memory.py +0 -0
  79. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/__init__.py +0 -0
  80. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/builtin/__init__.py +0 -0
  81. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/builtin/fetch_image.py +0 -0
  82. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/builtin/http_fetch.py +0 -0
  83. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/mcp/__init__.py +0 -0
  84. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/mcp/adapter.py +0 -0
  85. {react_agent_harness-0.5.2 → react_agent_harness-0.6.1}/tools/mcp/auth.py +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: react-agent-harness
3
- Version: 0.5.2
4
- Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming
3
+ Version: 0.6.1
4
+ Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming, cost/token budgets with per-call-site breakdown
5
5
  Requires-Python: >=3.10
6
6
  License-File: LICENSE
7
7
  Requires-Dist: prompt_toolkit>=3.0
@@ -1,7 +1,9 @@
1
1
  # react-agent-harness
2
2
 
3
3
  Bring-your-own-LLM multi-agent harness: hybrid DAG planning with replan-on-failure,
4
- two-tier memory (semantic KV + episodic vector), and a streaming-primary event model.
4
+ two-tier memory (semantic KV + episodic vector), a streaming-primary event model,
5
+ and cost/token budgets with per-call-site attribution (classifier, router,
6
+ planner, synthesizer, agent).
5
7
 
6
8
  Config-driven — register tools and agents, run any goal. No subclassing.
7
9
 
@@ -33,13 +35,21 @@ events to stdout and prints elapsed time + cost at the end.
33
35
  ## Architecture
34
36
 
35
37
  ```
36
- harness/runtime.py AgentRuntime — single entry point, wire once run anything
38
+ harness/runtime.py AgentRuntime — single entry point; BudgetGuard with cost/token caps + per-call-site breakdown
37
39
  harness/events.py BusEvent + EventType — canonical event vocabulary
38
- harness/llm/openai.py OpenAILLM — OpenAI adapter with usage + cost tracking
40
+ harness/llm/openai.py OpenAILLM — OpenAI API-key adapter with usage + cost tracking
41
+ harness/llm/anthropic.py AnthropicLLM — direct Anthropic API-key adapter with prompt-caching support
42
+ harness/llm/claude_code.py ClaudeCodeLLM — Claude subscription OAuth adapter (experimental, ToS caveats)
43
+ harness/llm/openai_codex.py OpenAICodexLLM — ChatGPT subscription OAuth adapter (experimental, ToS caveats)
44
+ harness/llm/auth.py Shared OAuth + auth-file primitives for the subscription adapters
45
+ harness/llm/fallback.py FallbackLLM — transparent retry on transient upstream errors
46
+ harness/llm/routing.py RoutingLLM — dispatch calls to different adapters by a selector
47
+ harness/trace.py JSONL trace recorder + replay — durable, per-event flush
48
+ harness/trace_viewer.py Local web timeline viewer for recorded JSONL traces
39
49
  harness/annotation.py Annotation store + AnnotationHook — RLHF trajectory capture
40
50
  harness/hitl.py HITL approval gate — interactive CLI, session-allow list
41
51
  harness/tool_policy.py Persistent tool policy — user-scoped allow rules, CLI management
42
- harness/console.py ConsoleRenderer — centralised BusEvent formatting for CLI apps
52
+ harness/console.py ConsoleRenderer — centralised BusEvent formatting + render_budget helper
43
53
  harness/steering.py Async steering — agent.steer(text), StdinRouter pub/sub, FileSteer, factory helpers
44
54
  harness/checkpoint.py CheckpointStore + _ResumeHint + maybe_resume_key — pluggable run-state persistence (file + Redis); auto-resume built into dispatch_stream / run_stream
45
55
  harness/otel.py OTELHook — OpenTelemetry span exporter (opt-in)
@@ -54,7 +64,8 @@ memory/redis_store.py Redis semantic store — durable KV with TTL
54
64
  memory/stores.py InMemory stores — local dev default, no deps
55
65
  tools/builtin/http_fetch.py HTTPFetch — minimal read-only GET tool
56
66
  tools/builtin/fetch_image.py FetchImage — fetch URL and return OpenAI image_url block
57
- tools/mcp/adapter.py MCP tool adapter — connect any MCP server
67
+ tools/mcp/adapter.py MCP tool adapter — stdio, SSE, streamable-HTTP transports
68
+ tools/mcp/auth.py ApiKeyMCPAuth + BrowserOAuthMCPAuth — auth primitives for remote MCP servers
58
69
  ```
59
70
 
60
71
  Execution is **streaming-primary**: every path yields `BusEvent`s for
@@ -188,6 +199,99 @@ llm = ClaudeCodeLLM(
188
199
  )
189
200
  ```
190
201
 
202
+ ### Cost shaping + reliability
203
+
204
+ Three patterns, ordered by how production teams actually solve this:
205
+
206
+ **1. Per-call-site LLM injection (the recommended pattern)**
207
+
208
+ `AgentRuntime` exposes one slot per orchestrator call site. Each defaults to
209
+ `llm` when unset, so existing code keeps working. The classifier and router
210
+ both see only the goal + agent descriptions (~300 tokens) and emit a
211
+ one-token decision — natural candidates for a cheaper model. The planner
212
+ and synthesizer produce structured DAGs and final answers and usually want
213
+ to stay on the main model.
214
+
215
+ ```python
216
+ runtime = AgentRuntime(
217
+ agent_registry=agents,
218
+ tool_registry=tools,
219
+ memory=memory,
220
+ llm=premium, # default — agent ReAct loops use this
221
+ classifier_llm=cheap, # simple vs complex dispatch decision
222
+ router_llm=cheap, # single-agent picker
223
+ # planner_llm=... # defaults to llm; override only if you want
224
+ # synthesizer_llm=... # defaults to llm
225
+ )
226
+ ```
227
+
228
+ No guessing, no keyword matching, no fragility — you read the runtime
229
+ construction and you know exactly which model serves which purpose. The
230
+ budget guard is wired into every distinct LLM instance automatically
231
+ (deduped by object identity, so injecting the same wrapper into multiple
232
+ slots costs no extra calls).
233
+
234
+ **2. `FallbackLLM` for resilience**
235
+
236
+ Try each adapter in order; transparently switch to the next on rate
237
+ limits, timeouts, or 5xx errors:
238
+
239
+ ```python
240
+ from harness.llm.fallback import FallbackLLM
241
+
242
+ llm = FallbackLLM([
243
+ AnthropicLLM(model="claude-sonnet-4-6"), # primary
244
+ OpenAILLM(model="gpt-4o-mini"), # backup
245
+ ])
246
+ runtime = AgentRuntime(..., llm=llm)
247
+ print(llm.last_route) # 0 if primary worked, 1 if backup did
248
+ ```
249
+
250
+ Permanent errors (auth, bad request) propagate immediately — only transient
251
+ upstream errors trigger fallback. Customise with `transient_errors=...`.
252
+ Streaming retries only fire before the first token; mid-stream failures
253
+ propagate to preserve response integrity.
254
+
255
+ **3. `RoutingLLM` for bring-your-own-selector cases**
256
+
257
+ When you need runtime routing — capability gating (`vision` vs
258
+ `long_context`), learned classifiers (RouteLLM-style), cascade
259
+ routing (cheap-then-escalate-on-low-confidence) — wrap a routes dict
260
+ with your own selector callable:
261
+
262
+ ```python
263
+ from harness.llm.routing import RoutingLLM
264
+
265
+ def by_capability(system, messages):
266
+ if _needs_vision(messages):
267
+ return "vision"
268
+ if _estimated_tokens(system, messages) > 100_000:
269
+ return "long_context"
270
+ return "default"
271
+
272
+ llm = RoutingLLM(
273
+ routes={
274
+ "default": OpenAILLM(model="gpt-4o-mini"),
275
+ "vision": OpenAILLM(model="gpt-4o"),
276
+ "long_context": AnthropicLLM(model="claude-sonnet-4-6"),
277
+ },
278
+ selector=by_capability,
279
+ default_route="default",
280
+ )
281
+ ```
282
+
283
+ The harness intentionally does not ship default selectors. Naive selectors
284
+ (keyword matching, fixed token thresholds) misroute in subtle ways and
285
+ encourage the wrong mental model — if you're reaching for one, you almost
286
+ certainly want per-call-site injection instead.
287
+
288
+ Compose freely: `FallbackLLM([premium, backup])` injected into the
289
+ `llm=` slot gives the agent loops resilience, with `classifier_llm=cheap`
290
+ and `router_llm=cheap` shaping the cheap-call cost — all without a custom
291
+ selector.
292
+
293
+ ---
294
+
191
295
  `ClaudeCodeLLM` reads a `claude-code` OAuth entry, refreshes it automatically
192
296
  when expired, and retries once after `401`/`403`. This mirrors Pi's Claude
193
297
  Pro/Max extension approach rather than shelling out to the Claude CLI. The
@@ -495,6 +599,92 @@ Cost ceiling fires on the *next* `check()` (start of next ReAct step or
495
599
  orchestrator batch), not synchronously mid-call — accept this for 0.0.1, the
496
600
  guard's job is preventing runaway loops, not bounding individual calls.
497
601
 
602
+ ### Token limits + per-call-site breakdown
603
+
604
+ `GuardrailConfig.max_input_tokens` / `max_output_tokens` cap raw token
605
+ usage independently of dollar cost. This is the only enforcement available
606
+ to subscription-auth runs (`ClaudeCodeLLM`, `OpenAICodexLLM`) — those tiers
607
+ don't expose pricing, so cost stays 0 and only token caps can fire.
608
+
609
+ ```python
610
+ runtime = AgentRuntime(
611
+ ...,
612
+ guardrail_config=GuardrailConfig(
613
+ max_total_cost_usd=2.0,
614
+ max_input_tokens=100_000,
615
+ max_output_tokens=20_000,
616
+ ),
617
+ )
618
+ ```
619
+
620
+ Per-call-site attribution lives on the terminal event's `budget` payload
621
+ — a snapshot of spending bucketed by the LLM slot that ran each call.
622
+ The runtime tags classifier / router / planner / synthesizer calls
623
+ automatically; ReAct agent calls are tagged `agent:<agent_id>`, one bucket
624
+ per agent. So the classifier and router slots (both served by `cheap`) and
625
+ the planner slot (served by `premium`) report separately, even though `cheap`
626
+ is one physical LLM instance shared across two slots:
627
+
628
+ ```python
629
+ async for event in runtime.dispatch_stream(goal):
630
+ # Routed (simple) goals terminate with TASK_DONE; orchestrated goals
631
+ # with DONE. Both carry the same ``budget`` shape.
632
+ if event.type in (EventType.TASK_DONE, EventType.DONE):
633
+ budget = event.payload["budget"]
634
+ print(f"total: in={budget['tokens_in']} out={budget['tokens_out']} "
635
+ f"${budget['cost_usd']:.4f}")
636
+ for slot, stats in budget["breakdown"].items():
637
+ print(f" {slot}: in={stats['tokens_in']} out={stats['tokens_out']}")
638
+ ```
639
+
640
+ The same `budget` dict is attached to `runtime.run(...)` and
641
+ `runtime.dispatch(...)` return values under the `budget` key, so blocking
642
+ callers don't need to read events.
643
+
644
+ Anthropic / Claude Code adapters count input tokens as the *total* that
645
+ hit the wire (non-cached + cache-creation + cache-read), so token caps
646
+ reflect actual consumption regardless of cache hit rate. Cost calculation
647
+ via `cost_fn` still respects cache pricing.
648
+
649
+ ### Evals via the trace recorder
650
+
651
+ There's no shipped evals framework — opinions on scorers, judge models,
652
+ and golden-set management belong outside the orchestration core. The
653
+ [trace recorder](#trace-recorder--replay--local-viewer) already writes
654
+ per-event token/cost/latency to JSONL, so a few lines of glue cover most
655
+ in-house eval setups:
656
+
657
+ ```python
658
+ import json
659
+ from harness.trace import record_trace
660
+
661
+ # 1. Record traces while running a fixture set.
662
+ for fixture in fixtures:
663
+ async for _event in record_trace(
664
+ runtime.dispatch_stream(fixture["input"]),
665
+ path=f"runs/{fixture['id']}.jsonl",
666
+ ):
667
+ pass
668
+
669
+ # 2. Score offline by replaying.
670
+ def score_run(path: str, expected: str) -> dict:
671
+ answer = ""
672
+ budget = {"tokens_in": 0, "tokens_out": 0, "cost_usd": 0.0, "breakdown": {}}
673
+ for line in open(path):
674
+ event = json.loads(line)
675
+ if event["type"] in ("done", "task_done"):
676
+ answer = event["payload"].get("answer", "")
677
+ budget = event["payload"].get("budget", budget)
678
+ return {
679
+ "success": expected.lower() in answer.lower(),
680
+ **budget, # tokens_in, tokens_out, cost_usd, breakdown
681
+ }
682
+ ```
683
+
684
+ Plug in your own scorer (exact-match, LLM-judge, semantic similarity) on
685
+ top. External tools like Braintrust, LangSmith, and Weave are
686
+ purpose-built for this and ingest the same JSONL shape directly.
687
+
498
688
  ## Tool execution
499
689
 
500
690
  Tools that shell out (`kubectl`, `curl`, `sh -c …`) should not run inside the
@@ -722,6 +912,49 @@ The OTEL hook is a side-channel on the existing `Tracer` — the in-memory trace
722
912
  is always available via `result["trace"]` regardless of whether OTEL is enabled.
723
913
  Zero overhead and zero imports when `enable_otel=False`.
724
914
 
915
+ ## Trace recorder + replay + local viewer
916
+
917
+ For local debug and post-mortem inspection without an OTEL backend, the
918
+ harness ships a JSONL trace recorder and a stdlib-only HTML viewer. Wrap
919
+ any streaming call:
920
+
921
+ ```python
922
+ from harness.trace import record_trace, replay
923
+
924
+ async for event in record_trace(runtime.dispatch_stream(goal), "run.jsonl"):
925
+ ... # your normal handling
926
+ ```
927
+
928
+ Each `BusEvent` is flushed per-line, so a partial trace survives a crash.
929
+ View the trace in your browser:
930
+
931
+ ```bash
932
+ agent-harness trace view run.jsonl # opens http://127.0.0.1:8765/
933
+ ```
934
+
935
+ The viewer is a single embedded HTML page — vertical timeline, filter by
936
+ agent / event type / text, expandable per-event JSON. No build step, no
937
+ external services.
938
+
939
+ Replay a trace through `ConsoleRenderer` (great for grepping or piping
940
+ into another script):
941
+
942
+ ```bash
943
+ agent-harness trace replay run.jsonl
944
+ agent-harness trace replay run.jsonl --realtime --speed 2.0
945
+ ```
946
+
947
+ Programmatic replay yields reconstructed `BusEvent` objects:
948
+
949
+ ```python
950
+ async for event in replay("run.jsonl", realtime=False):
951
+ ... # reuse the same loops you write for live streams
952
+ ```
953
+
954
+ This is complementary to OTEL — OTEL is for production observability and
955
+ long-term storage in Jaeger/Datadog; the JSONL recorder is for local
956
+ debugging, sharing reproductions, and replaying past runs.
957
+
725
958
  ## Vision / multimodal agents
726
959
 
727
960
  `WorkingMemory` accepts `str | list` content so image blocks pass through to
@@ -422,6 +422,12 @@ class BaseAgent:
422
422
  "summarizations": self._working_memory.summarization_count,
423
423
  },
424
424
  }
425
+ # Attach the current budget snapshot so dispatch_stream
426
+ # consumers can read totals + per-call-site breakdown off
427
+ # the routed path's terminal event, same shape as the
428
+ # orchestrator's DONE event.
429
+ if self._guard is not None and hasattr(self._guard, "snapshot"):
430
+ result["budget"] = self._guard.snapshot()
425
431
  logger.info(
426
432
  "Agent %s completed: steps=%d confidence=%.2f summarizations=%d",
427
433
  self.config.agent_id,
@@ -653,11 +659,17 @@ class BaseAgent:
653
659
  payload=before_usage,
654
660
  )
655
661
 
662
+ # Tag ReAct spending so it shows up in BudgetGuard.breakdown alongside
663
+ # classifier/router/planner/synthesizer. Per-agent attribution makes
664
+ # multi-agent demos surface which specialist agent actually drove the
665
+ # bulk of token usage.
666
+ react_source = f"agent:{self.config.agent_id}"
656
667
  try:
657
668
  if hasattr(self._llm, "stream_complete"):
658
669
  async for token in self._llm.stream_complete(
659
670
  system=None,
660
671
  messages=messages,
672
+ source=react_source,
661
673
  ):
662
674
  accumulated += token
663
675
  if self.config.stream_tokens:
@@ -679,6 +691,7 @@ class BaseAgent:
679
691
  system=None,
680
692
  messages=messages,
681
693
  response_format={"type": "json_object"},
694
+ source=react_source,
682
695
  )
683
696
  response = _normalize_response(raw)
684
697
  if response is None:
@@ -46,6 +46,19 @@ def main() -> int:
46
46
  policy_clear = policy_sub.add_parser("clear", help="remove all policy rules")
47
47
  policy_clear.add_argument("--policy-file", default=str(default_policy_file()))
48
48
 
49
+ trace = sub.add_parser("trace", help="view or replay a recorded run trace")
50
+ trace_sub = trace.add_subparsers(dest="trace_command", required=True)
51
+ trace_view = trace_sub.add_parser("view", help="open a local web viewer for a trace")
52
+ trace_view.add_argument("path", help="path to a JSONL trace produced by record_trace")
53
+ trace_view.add_argument("--port", type=int, default=8765)
54
+ trace_view.add_argument("--no-open", action="store_true", help="don't auto-open the browser")
55
+ trace_replay = trace_sub.add_parser("replay", help="dump a trace to stdout via ConsoleRenderer")
56
+ trace_replay.add_argument("path", help="path to a JSONL trace produced by record_trace")
57
+ trace_replay.add_argument(
58
+ "--realtime", action="store_true", help="preserve recorded inter-event timing"
59
+ )
60
+ trace_replay.add_argument("--speed", type=float, default=1.0, help="realtime speed multiplier")
61
+
49
62
  args = parser.parse_args()
50
63
  try:
51
64
  if args.command == "login":
@@ -71,6 +84,16 @@ def main() -> int:
71
84
  return _policy_revoke(path, args.rule_id)
72
85
  if args.policy_command == "clear":
73
86
  return _policy_clear(path)
87
+ if args.command == "trace":
88
+ if args.trace_command == "view":
89
+ from harness.trace_viewer import serve
90
+
91
+ serve(args.path, port=args.port, open_browser=not args.no_open)
92
+ return 0
93
+ if args.trace_command == "replay":
94
+ return asyncio.run(
95
+ _trace_replay(args.path, realtime=args.realtime, speed=args.speed)
96
+ )
74
97
  except Exception as e:
75
98
  print(f"agent-harness: {e}", file=sys.stderr)
76
99
  return 1
@@ -180,5 +203,16 @@ def _policy_clear(path: Path) -> int:
180
203
  return 0
181
204
 
182
205
 
206
+ async def _trace_replay(path: str, *, realtime: bool, speed: float) -> int:
207
+ """Read a JSONL trace and render it via ConsoleRenderer."""
208
+ from harness.console import ConsoleRenderer
209
+ from harness.trace import replay
210
+
211
+ renderer = ConsoleRenderer()
212
+ async for event in replay(path, realtime=realtime, speed=speed):
213
+ renderer.render(event)
214
+ return 0
215
+
216
+
183
217
  if __name__ == "__main__":
184
218
  raise SystemExit(main())
@@ -178,17 +178,54 @@ class ConsoleRenderer:
178
178
  self.sep("═")
179
179
  print(p.get("answer", "(no answer)"), file=self._out)
180
180
  self.sep()
181
+ # ``budget`` snapshot supersedes the flat cost/elapsed fields when
182
+ # present (added with token caps + per-call-site breakdown).
183
+ budget = p.get("budget") or {}
184
+ cost = budget.get("cost_usd", p.get("cost_usd", 0))
185
+ elapsed = budget.get("elapsed_seconds", p.get("elapsed_seconds", 0))
181
186
  print(
182
187
  f"Confidence: {p.get('confidence', 0):.2f} | "
183
188
  f"Replans: {p.get('replan_count', 0)} | "
184
- f"Cost: ${p.get('cost_usd', 0):.4f} | "
185
- f"Time: {p.get('elapsed_seconds', 0):.1f}s",
189
+ f"Cost: ${cost:.4f} | "
190
+ f"Time: {elapsed:.1f}s",
186
191
  file=self._out,
187
192
  )
193
+ self.render_budget(budget)
188
194
 
189
195
  elif t == EventType.ERROR:
190
196
  print(f"\n[error] {event.error}", file=sys.stderr)
191
197
 
198
+ def render_budget(self, budget: dict | None) -> None:
199
+ """Print tokens + per-call-site breakdown from a ``BudgetGuard.snapshot()``
200
+ dict. Safe to call with ``{}`` or ``None`` — prints nothing when
201
+ there's no usage to show.
202
+
203
+ Exposed publicly so demos and other consumers that own their own
204
+ DONE / TASK_DONE rendering can still surface the breakdown without
205
+ duplicating the formatting.
206
+ """
207
+ if not budget:
208
+ return
209
+ tokens_in = budget.get("tokens_in")
210
+ tokens_out = budget.get("tokens_out")
211
+ if tokens_in is not None or tokens_out is not None:
212
+ print(
213
+ f"Tokens: in={int(tokens_in or 0):,} out={int(tokens_out or 0):,}",
214
+ file=self._out,
215
+ )
216
+ breakdown = budget.get("breakdown") or {}
217
+ if breakdown:
218
+ # Right-pad the slot label so columns line up — matters when
219
+ # the demo prints multiple slots in sequence.
220
+ width = max(len(name) for name in breakdown)
221
+ for slot, stats in breakdown.items():
222
+ print(
223
+ f" {slot:<{width}} "
224
+ f"in={int(stats.get('tokens_in', 0)):>7,} "
225
+ f"out={int(stats.get('tokens_out', 0)):>6,}",
226
+ file=self._out,
227
+ )
228
+
192
229
  # ── private helpers ───────────────────────────────────────────────────────
193
230
 
194
231
  def _label(self, event: BusEvent) -> str:
@@ -91,6 +91,8 @@ class AnthropicLLM:
91
91
  self,
92
92
  system: str | None,
93
93
  messages: list[dict],
94
+ *,
95
+ source: str | None = None,
94
96
  **kwargs: Any,
95
97
  ) -> dict:
96
98
  max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
@@ -110,7 +112,7 @@ class AnthropicLLM:
110
112
  cost = _compute_cost(usage, self._cost_fn)
111
113
  if cost is not None:
112
114
  usage["cost_usd"] = cost
113
- self._record_cost(usage)
115
+ self._record_usage(usage, source=source)
114
116
  self.last_usage = usage
115
117
 
116
118
  text = _collect_text(resp.content)
@@ -122,6 +124,8 @@ class AnthropicLLM:
122
124
  self,
123
125
  system: str | None,
124
126
  messages: list[dict],
127
+ *,
128
+ source: str | None = None,
125
129
  ) -> AsyncGenerator[str, None]:
126
130
  sys_blocks = _system_blocks(system, prompt_caching=self._prompt_caching)
127
131
  built_messages = _build_messages(messages, prompt_caching=self._prompt_caching)
@@ -143,17 +147,34 @@ class AnthropicLLM:
143
147
  cost = _compute_cost(usage, self._cost_fn)
144
148
  if cost is not None:
145
149
  usage["cost_usd"] = cost
146
- self._record_cost(usage)
150
+ self._record_usage(usage, source=source)
147
151
  self.last_usage = usage
148
152
 
149
153
  # ── Internals ─────────────────────────────────────────────────────────────
150
154
 
151
- def _record_cost(self, usage: dict) -> None:
152
- if not self._budget:
155
+ def _record_usage(self, usage: dict, *, source: str | None) -> None:
156
+ """Forward usage to the budget guard.
157
+
158
+ Token count for budget purposes is the total input that hit the wire
159
+ — non-cached + cache-creation + cache-read — so token caps reflect
160
+ real consumption regardless of cache hit rate. Cost
161
+ (which respects cache pricing via ``cost_fn``) is reported when
162
+ known.
163
+ """
164
+ guard = self._budget
165
+ if not guard:
153
166
  return
167
+ tokens_in = (
168
+ int(usage.get("tokens_in") or 0)
169
+ + int(usage.get("cache_read_tokens") or 0)
170
+ + int(usage.get("cache_creation_tokens") or 0)
171
+ )
172
+ tokens_out = int(usage.get("tokens_out") or 0)
173
+ if (tokens_in or tokens_out) and hasattr(guard, "add_tokens"):
174
+ guard.add_tokens(tokens_in, tokens_out, source=source)
154
175
  cost = usage.get("cost_usd")
155
176
  if cost and cost > 0:
156
- self._budget.add_cost(cost)
177
+ guard.add_cost(cost, source=source)
157
178
 
158
179
 
159
180
  # ── Module-level helpers ──────────────────────────────────────────────────────
@@ -68,12 +68,24 @@ class ClaudeCodeLLM:
68
68
  self._user_agent = user_agent or _default_user_agent()
69
69
  self._betas = betas
70
70
  self._prompt_caching = prompt_caching
71
+ self._budget: Any = None
71
72
  self.last_usage: dict | None = None
72
73
 
74
+ def set_budget(self, guard: Any) -> None:
75
+ """Inject a BudgetGuard so token caps fire on subscription-auth runs.
76
+
77
+ Cost stays 0 (no pricing schedule available for the subscription
78
+ tier), but ``add_tokens`` still lands so ``max_input_tokens`` /
79
+ ``max_output_tokens`` are enforced.
80
+ """
81
+ self._budget = guard
82
+
73
83
  async def complete(
74
84
  self,
75
85
  system: str | None,
76
86
  messages: list[dict],
87
+ *,
88
+ source: str | None = None,
77
89
  **kwargs: Any,
78
90
  ) -> dict:
79
91
  """Collect the streaming response into a single text + usage dict.
@@ -84,7 +96,9 @@ class ClaudeCodeLLM:
84
96
  """
85
97
  max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
86
98
  parts: list[str] = []
87
- async for delta in self._iter_stream(system, messages, max_tokens=max_tokens, extra=kwargs):
99
+ async for delta in self._iter_stream(
100
+ system, messages, max_tokens=max_tokens, extra=kwargs, source=source
101
+ ):
88
102
  parts.append(delta)
89
103
  text = "".join(parts)
90
104
  if not text:
@@ -95,9 +109,11 @@ class ClaudeCodeLLM:
95
109
  self,
96
110
  system: str | None,
97
111
  messages: list[dict],
112
+ *,
113
+ source: str | None = None,
98
114
  ) -> AsyncGenerator[str, None]:
99
115
  async for delta in self._iter_stream(
100
- system, messages, max_tokens=self._max_tokens, extra={}
116
+ system, messages, max_tokens=self._max_tokens, extra={}, source=source
101
117
  ):
102
118
  yield delta
103
119
 
@@ -114,6 +130,7 @@ class ClaudeCodeLLM:
114
130
  *,
115
131
  max_tokens: int,
116
132
  extra: dict[str, Any],
133
+ source: str | None = None,
117
134
  ) -> AsyncGenerator[str, None]:
118
135
  """Single source of truth: open Anthropic SSE stream, yield text
119
136
  deltas, populate `self.last_usage`. Auth refresh on 401/403
@@ -182,10 +199,31 @@ class ClaudeCodeLLM:
182
199
  "total_tokens": tokens_in + tokens_out,
183
200
  "provider": "claude-code",
184
201
  }
202
+ self._record_usage(self.last_usage, source=source)
185
203
  return
186
204
 
187
205
  raise RuntimeError("Claude Code authentication failed after refresh")
188
206
 
207
+ def _record_usage(self, usage: dict, *, source: str | None) -> None:
208
+ """Report token totals to the budget guard.
209
+
210
+ Tokens budgeted = total input that hit the wire (non-cached +
211
+ cache-creation + cache-read) plus output tokens — so ``max_input_tokens``
212
+ / ``max_output_tokens`` reflect real consumption regardless of cache
213
+ hit rate. No cost is reported (subscription auth, no pricing).
214
+ """
215
+ guard = self._budget
216
+ if not guard or not hasattr(guard, "add_tokens"):
217
+ return
218
+ tokens_in = (
219
+ int(usage.get("tokens_in") or 0)
220
+ + int(usage.get("cache_read_tokens") or 0)
221
+ + int(usage.get("cache_creation_tokens") or 0)
222
+ )
223
+ tokens_out = int(usage.get("tokens_out") or 0)
224
+ if tokens_in or tokens_out:
225
+ guard.add_tokens(tokens_in, tokens_out, source=source)
226
+
189
227
  async def _get_client(self) -> Any:
190
228
  if self._client is None:
191
229
  try: