react-agent-harness 0.6.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. {react_agent_harness-0.6.0/react_agent_harness.egg-info → react_agent_harness-0.7.0}/PKG-INFO +2 -2
  2. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/README.md +163 -5
  3. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/agents/base.py +57 -2
  4. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/console.py +39 -2
  5. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/anthropic.py +26 -5
  6. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/claude_code.py +40 -2
  7. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/fallback.py +3 -1
  8. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/openai.py +21 -5
  9. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/openai_codex.py +34 -2
  10. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/routing.py +4 -2
  11. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/runtime.py +177 -22
  12. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/episodic_lance.py +205 -66
  13. react_agent_harness-0.7.0/memory/manager.py +925 -0
  14. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/redis_store.py +1 -0
  15. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/stores.py +90 -9
  16. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/orchestrator/planner.py +25 -1
  17. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/pyproject.toml +2 -2
  18. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0/react_agent_harness.egg-info}/PKG-INFO +2 -2
  19. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/SOURCES.txt +3 -0
  20. react_agent_harness-0.7.0/tests/test_budget_guard.py +134 -0
  21. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_cli.py +4 -0
  22. react_agent_harness-0.7.0/tests/test_console_renderer.py +145 -0
  23. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_fallback_llm.py +1 -1
  24. react_agent_harness-0.7.0/tests/test_memory.py +381 -0
  25. react_agent_harness-0.7.0/tests/test_memory_reconciler.py +374 -0
  26. react_agent_harness-0.7.0/tests/test_memory_touchpoints.py +108 -0
  27. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_orchestrator.py +4 -2
  28. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_per_call_site_llm.py +133 -2
  29. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_routing_llm.py +1 -1
  30. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_streaming.py +1 -1
  31. react_agent_harness-0.6.0/memory/manager.py +0 -372
  32. react_agent_harness-0.6.0/tests/test_console_renderer.py +0 -52
  33. react_agent_harness-0.6.0/tests/test_memory.py +0 -158
  34. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/LICENSE +0 -0
  35. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/agents/__init__.py +0 -0
  36. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/__init__.py +0 -0
  37. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/annotation.py +0 -0
  38. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/checkpoint.py +0 -0
  39. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/cli.py +0 -0
  40. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/events.py +0 -0
  41. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/executor_bridge.py +0 -0
  42. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/hitl.py +0 -0
  43. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/__init__.py +0 -0
  44. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/_streaming.py +0 -0
  45. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/auth.py +0 -0
  46. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/oauth_browser.py +0 -0
  47. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/otel.py +0 -0
  48. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/steering.py +0 -0
  49. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/tool_policy.py +0 -0
  50. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/trace.py +0 -0
  51. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/trace_viewer.py +0 -0
  52. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/utils.py +0 -0
  53. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/__init__.py +0 -0
  54. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/working.py +0 -0
  55. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/orchestrator/__init__.py +0 -0
  56. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/dependency_links.txt +0 -0
  57. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/entry_points.txt +0 -0
  58. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/requires.txt +0 -0
  59. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/top_level.txt +0 -0
  60. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/setup.cfg +0 -0
  61. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_agents_base.py +0 -0
  62. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_annotation.py +0 -0
  63. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_anthropic_llm.py +0 -0
  64. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_checkpoint_resume.py +0 -0
  65. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_claude_code_llm.py +0 -0
  66. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_executor_bridge.py +0 -0
  67. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_http_fetch.py +0 -0
  68. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_llm_auth.py +0 -0
  69. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_mcp_adapter.py +0 -0
  70. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_mcp_auth.py +0 -0
  71. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_oauth_browser.py +0 -0
  72. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_openai_codex_llm.py +0 -0
  73. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_openai_llm.py +0 -0
  74. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_otel.py +0 -0
  75. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_parse_action_json.py +0 -0
  76. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_redis_store.py +0 -0
  77. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_steering.py +0 -0
  78. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_tool_policy.py +0 -0
  79. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_trace.py +0 -0
  80. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_utils.py +0 -0
  81. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_vision.py +0 -0
  82. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_working_memory.py +0 -0
  83. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/__init__.py +0 -0
  84. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/builtin/__init__.py +0 -0
  85. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/builtin/fetch_image.py +0 -0
  86. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/builtin/http_fetch.py +0 -0
  87. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/mcp/__init__.py +0 -0
  88. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/mcp/adapter.py +0 -0
  89. {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/mcp/auth.py +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: react-agent-harness
3
- Version: 0.6.0
4
- Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming
3
+ Version: 0.7.0
4
+ Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming, cost/token budgets with per-call-site breakdown
5
5
  Requires-Python: >=3.10
6
6
  License-File: LICENSE
7
7
  Requires-Dist: prompt_toolkit>=3.0
@@ -1,7 +1,9 @@
1
1
  # react-agent-harness
2
2
 
3
3
  Bring-your-own-LLM multi-agent harness: hybrid DAG planning with replan-on-failure,
4
- two-tier memory (semantic KV + episodic vector), and a streaming-primary event model.
4
+ two-tier memory (semantic KV + episodic vector), a streaming-primary event model,
5
+ and cost/token budgets with per-call-site attribution (classifier, router,
6
+ planner, synthesizer, agent).
5
7
 
6
8
  Config-driven — register tools and agents, run any goal. No subclassing.
7
9
 
@@ -33,15 +35,21 @@ events to stdout and prints elapsed time + cost at the end.
33
35
  ## Architecture
34
36
 
35
37
  ```
36
- harness/runtime.py AgentRuntime — single entry point, wire once run anything
38
+ harness/runtime.py AgentRuntime — single entry point; BudgetGuard with cost/token caps + per-call-site breakdown
37
39
  harness/events.py BusEvent + EventType — canonical event vocabulary
38
- harness/llm/openai.py OpenAILLM — OpenAI adapter with usage + cost tracking
40
+ harness/llm/openai.py OpenAILLM — OpenAI API-key adapter with usage + cost tracking
41
+ harness/llm/anthropic.py AnthropicLLM — direct Anthropic API-key adapter with prompt-caching support
42
+ harness/llm/claude_code.py ClaudeCodeLLM — Claude subscription OAuth adapter (experimental, ToS caveats)
43
+ harness/llm/openai_codex.py OpenAICodexLLM — ChatGPT subscription OAuth adapter (experimental, ToS caveats)
44
+ harness/llm/auth.py Shared OAuth + auth-file primitives for the subscription adapters
39
45
  harness/llm/fallback.py FallbackLLM — transparent retry on transient upstream errors
40
46
  harness/llm/routing.py RoutingLLM — dispatch calls to different adapters by a selector
47
+ harness/trace.py JSONL trace recorder + replay — durable, per-event flush
48
+ harness/trace_viewer.py Local web timeline viewer for recorded JSONL traces
41
49
  harness/annotation.py Annotation store + AnnotationHook — RLHF trajectory capture
42
50
  harness/hitl.py HITL approval gate — interactive CLI, session-allow list
43
51
  harness/tool_policy.py Persistent tool policy — user-scoped allow rules, CLI management
44
- harness/console.py ConsoleRenderer — centralised BusEvent formatting for CLI apps
52
+ harness/console.py ConsoleRenderer — centralised BusEvent formatting + render_budget helper
45
53
  harness/steering.py Async steering — agent.steer(text), StdinRouter pub/sub, FileSteer, factory helpers
46
54
  harness/checkpoint.py CheckpointStore + _ResumeHint + maybe_resume_key — pluggable run-state persistence (file + Redis); auto-resume built into dispatch_stream / run_stream
47
55
  harness/otel.py OTELHook — OpenTelemetry span exporter (opt-in)
@@ -56,7 +64,8 @@ memory/redis_store.py Redis semantic store — durable KV with TTL
56
64
  memory/stores.py InMemory stores — local dev default, no deps
57
65
  tools/builtin/http_fetch.py HTTPFetch — minimal read-only GET tool
58
66
  tools/builtin/fetch_image.py FetchImage — fetch URL and return OpenAI image_url block
59
- tools/mcp/adapter.py MCP tool adapter — connect any MCP server
67
+ tools/mcp/adapter.py MCP tool adapter — stdio, SSE, streamable-HTTP transports
68
+ tools/mcp/auth.py ApiKeyMCPAuth + BrowserOAuthMCPAuth — auth primitives for remote MCP servers
60
69
  ```
61
70
 
62
71
  Execution is **streaming-primary**: every path yields `BusEvent`s for
@@ -331,6 +340,69 @@ crashing the loop.
331
340
  - **During run**: `write_working_fact()` — lightweight KV, namespaced, short TTL
332
341
  - **End of run**: `write_run_end()` — LLM extraction → global semantic + episodic vector
333
342
 
343
+ ### Memory reconciliation (default-on)
344
+
345
+ `write_run_end` runs the LLM-arbitrated reconciler by default: instead of
346
+ extract-and-overwrite, the LLM sees existing relevant memory + new evidence
347
+ and emits a plan of per-fact actions (`ADD` / `UPDATE` / `MERGE` / `DELETE`
348
+ / `NOOP`). Same call count as the legacy extraction step; the prompt is
349
+ larger only when there's existing context to reconcile against.
350
+
351
+ ```python
352
+ manager = MemoryManager(
353
+ semantic_store=…,
354
+ episodic_store=…,
355
+ llm=…,
356
+ reconcile_on_write=True, # default — set False for legacy extract path
357
+ allow_destructive_reconcile=False, # default — DELETE actions demoted to NOOP
358
+ auto_compact_threshold={"agent_task": 20}, # optional — fire compact()
359
+ # when an agent accumulates this
360
+ # many task episodes
361
+ )
362
+ ```
363
+
364
+ `allow_destructive_reconcile=False` keeps the LLM from removing data unless
365
+ you've vetted that DELETE actions are sensible for your workload — demoted
366
+ decisions land in `manager.get_conflict_log()` so you can audit.
367
+
368
+ `manager.compact(goal="…", agent_id="…")` is the same primitive with no
369
+ new evidence — a pure cleanup pass that consolidates accumulated episodes
370
+ and prunes redundant facts. Triggered automatically by
371
+ `auto_compact_threshold`, or call it explicitly.
372
+
373
+ Episodic supersede is now **hard-delete** (no `active=False` tombstones
374
+ accumulating per run): `memory_policy="latest"` writes and reconciler
375
+ `DELETE` actions both remove rows.
376
+
377
+ If the LLM returns a response that doesn't parse as a reconcile plan
378
+ (older / smaller models that don't follow the multi-action schema),
379
+ `write_run_end` silently falls back to the legacy extract-and-overwrite
380
+ path — no crash, no missed run-end write.
381
+
382
+ ### Tool-result caching (opt-in, per run)
383
+
384
+ `AgentConfig.cache_tool_results = True` memoizes tool calls within a single
385
+ run, keyed by `(tool_name, args)`. Useful for multi-agent runs where agents
386
+ redo each other's idempotent reads (`HTTPFetch` on stable URLs,
387
+ `kubectl get ...` discovery, MCP filesystem reads).
388
+
389
+ A tool can veto caching for itself with `cacheable = False` on the
390
+ instance — required for anything with side effects or time-dependent
391
+ output. Errors are never cached (a transient failure shouldn't poison the
392
+ rest of the run).
393
+
394
+ ```python
395
+ class HTTPFetch:
396
+ name = "http_fetch"
397
+ cacheable = True # default; explicit for clarity
398
+
399
+ agents.register(AgentConfig(
400
+ agent_id="web",
401
+ ...,
402
+ cache_tool_results=True,
403
+ ))
404
+ ```
405
+
334
406
  Defaults are in-memory (`InMemorySemanticStore`, `InMemoryEpisodicStore`).
335
407
  For durable storage:
336
408
 
@@ -590,6 +662,92 @@ Cost ceiling fires on the *next* `check()` (start of next ReAct step or
590
662
  orchestrator batch), not synchronously mid-call — accept this for 0.0.1, the
591
663
  guard's job is preventing runaway loops, not bounding individual calls.
592
664
 
665
+ ### Token limits + per-call-site breakdown
666
+
667
+ `GuardrailConfig.max_input_tokens` / `max_output_tokens` cap raw token
668
+ usage independently of dollar cost. This is the only enforcement available
669
+ to subscription-auth runs (`ClaudeCodeLLM`, `OpenAICodexLLM`) — those tiers
670
+ don't expose pricing, so cost stays 0 and only token caps can fire.
671
+
672
+ ```python
673
+ runtime = AgentRuntime(
674
+ ...,
675
+ guardrail_config=GuardrailConfig(
676
+ max_total_cost_usd=2.0,
677
+ max_input_tokens=100_000,
678
+ max_output_tokens=20_000,
679
+ ),
680
+ )
681
+ ```
682
+
683
+ Per-call-site attribution lives on the terminal event's `budget` payload
684
+ — a snapshot of spending bucketed by the LLM slot that ran each call.
685
+ The runtime tags classifier / router / planner / synthesizer calls
686
+ automatically, and ReAct agent calls are tagged `agent:<agent_id>` so each
687
+ agent gets its own bucket. Buckets are per slot, not per LLM instance: `cheap`
688
+ (used for both `classifier_llm` and `router_llm`) and `premium` (used for
689
+ `planner_llm`) report separately even though `cheap` is one instance shared across slots:
690
+
691
+ ```python
692
+ async for event in runtime.dispatch_stream(goal):
693
+ # Routed (simple) goals terminate with TASK_DONE; orchestrated goals
694
+ # with DONE. Both carry the same ``budget`` shape.
695
+ if event.type in (EventType.TASK_DONE, EventType.DONE):
696
+ budget = event.payload["budget"]
697
+ print(f"total: in={budget['tokens_in']} out={budget['tokens_out']} "
698
+ f"${budget['cost_usd']:.4f}")
699
+ for slot, stats in budget["breakdown"].items():
700
+ print(f" {slot}: in={stats['tokens_in']} out={stats['tokens_out']}")
701
+ ```
702
+
703
+ The same `budget` dict is attached to `runtime.run(...)` and
704
+ `runtime.dispatch(...)` return values under the `budget` key, so blocking
705
+ callers don't need to read events.
706
+
707
+ Anthropic / Claude Code adapters count input tokens as the *total* that
708
+ hit the wire (non-cached + cache-creation + cache-read), so token caps
709
+ reflect actual consumption regardless of cache hit rate. Cost calculation
710
+ via `cost_fn` still respects cache pricing.
711
+
712
+ ### Evals via the trace recorder
713
+
714
+ There's no shipped evals framework — opinions on scorers, judge models,
715
+ and golden-set management belong outside the orchestration core. The
716
+ [trace recorder](#trace-recorder--replay--local-viewer) already writes
717
+ per-event token/cost/latency to JSONL, so a few lines of glue cover most
718
+ in-house eval setups:
719
+
720
+ ```python
721
+ import json
722
+ from harness.trace import record_trace
723
+
724
+ # 1. Record traces while running a fixture set.
725
+ for fixture in fixtures:
726
+ async for _event in record_trace(
727
+ runtime.dispatch_stream(fixture["input"]),
728
+ path=f"runs/{fixture['id']}.jsonl",
729
+ ):
730
+ pass
731
+
732
+ # 2. Score offline by replaying.
733
+ def score_run(path: str, expected: str) -> dict:
734
+ answer = ""
735
+ budget = {"tokens_in": 0, "tokens_out": 0, "cost_usd": 0.0, "breakdown": {}}
736
+ for line in open(path):
737
+ event = json.loads(line)
738
+ if event["type"] in ("done", "task_done"):
739
+ answer = event["payload"].get("answer", "")
740
+ budget = event["payload"].get("budget", budget)
741
+ return {
742
+ "success": expected.lower() in answer.lower(),
743
+ **budget, # tokens_in, tokens_out, cost_usd, breakdown
744
+ }
745
+ ```
746
+
747
+ Plug in your own scorer (exact-match, LLM-judge, semantic similarity) on
748
+ top. External tools like Braintrust, LangSmith, and Weave are
749
+ purpose-built for this and ingest the same JSONL shape directly.
750
+
593
751
  ## Tool execution
594
752
 
595
753
  Tools that shell out (`kubectl`, `curl`, `sh -c …`) should not run inside the
@@ -30,6 +30,7 @@ import asyncio
30
30
  import contextlib
31
31
  import json
32
32
  import logging
33
+ import os
33
34
  import uuid
34
35
  from collections.abc import AsyncGenerator
35
36
  from dataclasses import dataclass
@@ -65,6 +66,13 @@ class AgentConfig:
65
66
  working_memory_max_tokens: int = 8000 # WorkingMemory eviction threshold; tune per agent
66
67
  hitl_tools: list[str] = None # tools requiring human approval; None = no HITL
67
68
  checkpoint_every: int = 0 # write a resumable checkpoint every N steps; 0 = disabled
69
+ # Cache tool results within a single run, keyed by (tool_name, args).
70
+ # Opt-in because not every tool is idempotent — a tool may also veto
71
+ # caching for itself by exposing ``cacheable = False`` on its instance.
72
+ # Designed for read-mostly multi-agent runs where agents redo each
73
+ # other's lookups (HTTPFetch on stable URLs, ``kubectl get …`` style
74
+ # discovery, MCP filesystem reads).
75
+ cache_tool_results: bool = False
68
76
 
69
77
  def __post_init__(self):
70
78
  if self.hitl_tools is None:
@@ -159,6 +167,12 @@ class BaseAgent:
159
167
  self._resume_key: str = (
160
168
  "" # key printed in --resume banner; set by orchestrator to outer run_id
161
169
  )
170
+ # Per-run tool-result cache. ``None`` when caching is off so the
171
+ # hot path on ``_execute_tool`` skips the lookup entirely; a fresh
172
+ # dict per BaseAgent instance bounds the lifetime to one run.
173
+ self._tool_cache: dict[tuple[str, str], Any] | None = (
174
+ {} if config.cache_tool_results else None
175
+ )
162
176
 
163
177
  # ── Async steering ────────────────────────────────────────────────────────
164
178
 
@@ -335,7 +349,15 @@ class BaseAgent:
335
349
  agent_id=self.config.agent_id,
336
350
  )
337
351
  if not mem_context.is_empty():
338
- parts.append(mem_context.render())
352
+ rendered = mem_context.render()
353
+ if os.environ.get("DEBUG_MEMORY_CONTEXT") == "1":
354
+ print(f"\n[debug:memory] context injected for {self.config.agent_id}")
355
+ print("─" * 64)
356
+ print(rendered)
357
+ print("─" * 64)
358
+ parts.append(rendered)
359
+ elif os.environ.get("DEBUG_MEMORY_CONTEXT") == "1":
360
+ print(f"\n[debug:memory] context injected for {self.config.agent_id}: (empty)")
339
361
 
340
362
  tool_list = ", ".join(self._tools.keys()) or "none"
341
363
  parts.append(REACT_FORMAT.replace("__TOOL_LIST__", tool_list))
@@ -422,6 +444,12 @@ class BaseAgent:
422
444
  "summarizations": self._working_memory.summarization_count,
423
445
  },
424
446
  }
447
+ # Attach the current budget snapshot so dispatch_stream
448
+ # consumers can read totals + per-call-site breakdown off
449
+ # the routed path's terminal event, same shape as the
450
+ # orchestrator's DONE event.
451
+ if self._guard is not None and hasattr(self._guard, "snapshot"):
452
+ result["budget"] = self._guard.snapshot()
425
453
  logger.info(
426
454
  "Agent %s completed: steps=%d confidence=%.2f summarizations=%d",
427
455
  self.config.agent_id,
@@ -653,11 +681,17 @@ class BaseAgent:
653
681
  payload=before_usage,
654
682
  )
655
683
 
684
+ # Tag ReAct spending so it shows up in BudgetGuard.breakdown alongside
685
+ # classifier/router/planner/synthesizer. Per-agent attribution makes
686
+ # multi-agent demos surface which specialist agent actually drove the
687
+ # bulk of token usage.
688
+ react_source = f"agent:{self.config.agent_id}"
656
689
  try:
657
690
  if hasattr(self._llm, "stream_complete"):
658
691
  async for token in self._llm.stream_complete(
659
692
  system=None,
660
693
  messages=messages,
694
+ source=react_source,
661
695
  ):
662
696
  accumulated += token
663
697
  if self.config.stream_tokens:
@@ -679,6 +713,7 @@ class BaseAgent:
679
713
  system=None,
680
714
  messages=messages,
681
715
  response_format={"type": "json_object"},
716
+ source=react_source,
682
717
  )
683
718
  response = _normalize_response(raw)
684
719
  if response is None:
@@ -739,12 +774,32 @@ class BaseAgent:
739
774
  return (
740
775
  f"Error: tool '{name}' not available. Available tools: {list(self._tools.keys())}"
741
776
  )
777
+ tool = self._tools[name]
778
+
779
+ # Per-run memoization, gated by both agent opt-in AND tool consent.
780
+ # Tools that have side effects or time-dependent output can veto
781
+ # caching by setting ``cacheable = False`` on the instance. Errors
782
+ # are NOT cached — a transient failure should not poison the rest
783
+ # of the run.
784
+ cache_key: tuple[str, str] | None = None
785
+ if self._tool_cache is not None and getattr(tool, "cacheable", True) is True:
786
+ try:
787
+ cache_key = (name, json.dumps(args, sort_keys=True, default=str))
788
+ except (TypeError, ValueError):
789
+ cache_key = None # un-serialisable args — silently skip
790
+ if cache_key is not None and cache_key in self._tool_cache:
791
+ return self._tool_cache[cache_key]
792
+
742
793
  try:
743
- return await self._tools[name].execute(**args)
794
+ result = await tool.execute(**args)
744
795
  except Exception as e:
745
796
  logger.error("Tool %s failed: %s", name, e)
746
797
  return f"Tool error ({name}): {e}"
747
798
 
799
+ if cache_key is not None and self._tool_cache is not None:
800
+ self._tool_cache[cache_key] = result
801
+ return result
802
+
748
803
  # ── Helpers ───────────────────────────────────────────────────────────────
749
804
 
750
805
  def _error_result(self, reason: str, steps: int) -> dict:
@@ -178,17 +178,54 @@ class ConsoleRenderer:
178
178
  self.sep("═")
179
179
  print(p.get("answer", "(no answer)"), file=self._out)
180
180
  self.sep()
181
+ # ``budget`` snapshot supersedes the flat cost/elapsed fields when
182
+ # present (added with token caps + per-call-site breakdown).
183
+ budget = p.get("budget") or {}
184
+ cost = budget.get("cost_usd", p.get("cost_usd", 0))
185
+ elapsed = budget.get("elapsed_seconds", p.get("elapsed_seconds", 0))
181
186
  print(
182
187
  f"Confidence: {p.get('confidence', 0):.2f} | "
183
188
  f"Replans: {p.get('replan_count', 0)} | "
184
- f"Cost: ${p.get('cost_usd', 0):.4f} | "
185
- f"Time: {p.get('elapsed_seconds', 0):.1f}s",
189
+ f"Cost: ${cost:.4f} | "
190
+ f"Time: {elapsed:.1f}s",
186
191
  file=self._out,
187
192
  )
193
+ self.render_budget(budget)
188
194
 
189
195
  elif t == EventType.ERROR:
190
196
  print(f"\n[error] {event.error}", file=sys.stderr)
191
197
 
198
+ def render_budget(self, budget: dict | None) -> None:
199
+ """Print tokens + per-call-site breakdown from a ``BudgetGuard.snapshot()``
200
+ dict. Safe to call with ``{}`` or ``None`` — prints nothing when
201
+ there's no usage to show.
202
+
203
+ Exposed publicly so demos and other consumers that own their own
204
+ DONE / TASK_DONE rendering can still surface the breakdown without
205
+ duplicating the formatting.
206
+ """
207
+ if not budget:
208
+ return
209
+ tokens_in = budget.get("tokens_in")
210
+ tokens_out = budget.get("tokens_out")
211
+ if tokens_in is not None or tokens_out is not None:
212
+ print(
213
+ f"Tokens: in={int(tokens_in or 0):,} out={int(tokens_out or 0):,}",
214
+ file=self._out,
215
+ )
216
+ breakdown = budget.get("breakdown") or {}
217
+ if breakdown:
218
+ # Right-pad the slot label so columns line up — matters when
219
+ # the demo prints multiple slots in sequence.
220
+ width = max(len(name) for name in breakdown)
221
+ for slot, stats in breakdown.items():
222
+ print(
223
+ f" {slot:<{width}} "
224
+ f"in={int(stats.get('tokens_in', 0)):>7,} "
225
+ f"out={int(stats.get('tokens_out', 0)):>6,}",
226
+ file=self._out,
227
+ )
228
+
192
229
  # ── private helpers ───────────────────────────────────────────────────────
193
230
 
194
231
  def _label(self, event: BusEvent) -> str:
@@ -91,6 +91,8 @@ class AnthropicLLM:
91
91
  self,
92
92
  system: str | None,
93
93
  messages: list[dict],
94
+ *,
95
+ source: str | None = None,
94
96
  **kwargs: Any,
95
97
  ) -> dict:
96
98
  max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
@@ -110,7 +112,7 @@ class AnthropicLLM:
110
112
  cost = _compute_cost(usage, self._cost_fn)
111
113
  if cost is not None:
112
114
  usage["cost_usd"] = cost
113
- self._record_cost(usage)
115
+ self._record_usage(usage, source=source)
114
116
  self.last_usage = usage
115
117
 
116
118
  text = _collect_text(resp.content)
@@ -122,6 +124,8 @@ class AnthropicLLM:
122
124
  self,
123
125
  system: str | None,
124
126
  messages: list[dict],
127
+ *,
128
+ source: str | None = None,
125
129
  ) -> AsyncGenerator[str, None]:
126
130
  sys_blocks = _system_blocks(system, prompt_caching=self._prompt_caching)
127
131
  built_messages = _build_messages(messages, prompt_caching=self._prompt_caching)
@@ -143,17 +147,34 @@ class AnthropicLLM:
143
147
  cost = _compute_cost(usage, self._cost_fn)
144
148
  if cost is not None:
145
149
  usage["cost_usd"] = cost
146
- self._record_cost(usage)
150
+ self._record_usage(usage, source=source)
147
151
  self.last_usage = usage
148
152
 
149
153
  # ── Internals ─────────────────────────────────────────────────────────────
150
154
 
151
- def _record_cost(self, usage: dict) -> None:
152
- if not self._budget:
155
+ def _record_usage(self, usage: dict, *, source: str | None) -> None:
156
+ """Forward usage to the budget guard.
157
+
158
+ Token count for budget purposes is the total input that hit the wire
159
+ — non-cached + cache-creation + cache-read — so token caps reflect
160
+ real token consumption regardless of cache hit rate. Cost
161
+ (which respects cache pricing via ``cost_fn``) is reported when
162
+ known.
163
+ """
164
+ guard = self._budget
165
+ if not guard:
153
166
  return
167
+ tokens_in = (
168
+ int(usage.get("tokens_in") or 0)
169
+ + int(usage.get("cache_read_tokens") or 0)
170
+ + int(usage.get("cache_creation_tokens") or 0)
171
+ )
172
+ tokens_out = int(usage.get("tokens_out") or 0)
173
+ if (tokens_in or tokens_out) and hasattr(guard, "add_tokens"):
174
+ guard.add_tokens(tokens_in, tokens_out, source=source)
154
175
  cost = usage.get("cost_usd")
155
176
  if cost and cost > 0:
156
- self._budget.add_cost(cost)
177
+ guard.add_cost(cost, source=source)
157
178
 
158
179
 
159
180
  # ── Module-level helpers ──────────────────────────────────────────────────────
@@ -68,12 +68,24 @@ class ClaudeCodeLLM:
68
68
  self._user_agent = user_agent or _default_user_agent()
69
69
  self._betas = betas
70
70
  self._prompt_caching = prompt_caching
71
+ self._budget: Any = None
71
72
  self.last_usage: dict | None = None
72
73
 
74
+ def set_budget(self, guard: Any) -> None:
75
+ """Inject a BudgetGuard so token caps fire on subscription-auth runs.
76
+
77
+ Cost stays 0 (no pricing schedule available for the subscription
78
+ tier), but ``add_tokens`` still lands so ``max_input_tokens`` /
79
+ ``max_output_tokens`` are enforced.
80
+ """
81
+ self._budget = guard
82
+
73
83
  async def complete(
74
84
  self,
75
85
  system: str | None,
76
86
  messages: list[dict],
87
+ *,
88
+ source: str | None = None,
77
89
  **kwargs: Any,
78
90
  ) -> dict:
79
91
  """Collect the streaming response into a single text + usage dict.
@@ -84,7 +96,9 @@ class ClaudeCodeLLM:
84
96
  """
85
97
  max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
86
98
  parts: list[str] = []
87
- async for delta in self._iter_stream(system, messages, max_tokens=max_tokens, extra=kwargs):
99
+ async for delta in self._iter_stream(
100
+ system, messages, max_tokens=max_tokens, extra=kwargs, source=source
101
+ ):
88
102
  parts.append(delta)
89
103
  text = "".join(parts)
90
104
  if not text:
@@ -95,9 +109,11 @@ class ClaudeCodeLLM:
95
109
  self,
96
110
  system: str | None,
97
111
  messages: list[dict],
112
+ *,
113
+ source: str | None = None,
98
114
  ) -> AsyncGenerator[str, None]:
99
115
  async for delta in self._iter_stream(
100
- system, messages, max_tokens=self._max_tokens, extra={}
116
+ system, messages, max_tokens=self._max_tokens, extra={}, source=source
101
117
  ):
102
118
  yield delta
103
119
 
@@ -114,6 +130,7 @@ class ClaudeCodeLLM:
114
130
  *,
115
131
  max_tokens: int,
116
132
  extra: dict[str, Any],
133
+ source: str | None = None,
117
134
  ) -> AsyncGenerator[str, None]:
118
135
  """Single source of truth: open Anthropic SSE stream, yield text
119
136
  deltas, populate `self.last_usage`. Auth refresh on 401/403
@@ -182,10 +199,31 @@ class ClaudeCodeLLM:
182
199
  "total_tokens": tokens_in + tokens_out,
183
200
  "provider": "claude-code",
184
201
  }
202
+ self._record_usage(self.last_usage, source=source)
185
203
  return
186
204
 
187
205
  raise RuntimeError("Claude Code authentication failed after refresh")
188
206
 
207
def _record_usage(self, usage: dict, *, source: str | None) -> None:
    """Report the token totals of one call to the budget guard.

    Input tokens budgeted = non-cached + cache-read + cache-creation, i.e.
    everything that hit the wire, so ``max_input_tokens`` /
    ``max_output_tokens`` reflect real consumption regardless of cache hit
    rate. No cost is reported (subscription auth, no pricing).

    Args:
        usage: Usage dict; the keys ``tokens_in``, ``cache_read_tokens``,
            ``cache_creation_tokens`` and ``tokens_out`` are read, and a
            missing or ``None`` value counts as 0.
        source: Optional per-call-site tag forwarded to the guard.
    """
    guard = self._budget
    # Compare against None explicitly: a guard object that happens to be
    # falsy (e.g. it defines __len__ / __bool__) must still be charged,
    # otherwise token caps would silently never fire.
    if guard is None or not hasattr(guard, "add_tokens"):
        return
    tokens_in = sum(
        int(usage.get(key) or 0)
        for key in ("tokens_in", "cache_read_tokens", "cache_creation_tokens")
    )
    tokens_out = int(usage.get("tokens_out") or 0)
    # Skip the call entirely when nothing was consumed.
    if tokens_in or tokens_out:
        guard.add_tokens(tokens_in, tokens_out, source=source)
226
+
189
227
  async def _get_client(self) -> Any:
190
228
  if self._client is None:
191
229
  try:
@@ -123,6 +123,8 @@ class FallbackLLM:
123
123
  self,
124
124
  system: str | None,
125
125
  messages: list[dict],
126
+ *,
127
+ source: str | None = None,
126
128
  ) -> AsyncGenerator[str, None]:
127
129
  """Stream from the first adapter that doesn't fail before yielding.
128
130
 
@@ -136,7 +138,7 @@ class FallbackLLM:
136
138
  if not hasattr(llm, "stream_complete"):
137
139
  continue
138
140
  try:
139
- gen = llm.stream_complete(system, messages)
141
+ gen = llm.stream_complete(system, messages, source=source)
140
142
  first = await _peek_first(gen)
141
143
  except BaseException as exc:
142
144
  if i == len(self._llms) - 1 or not self._is_transient(exc):
@@ -101,6 +101,8 @@ class OpenAILLM:
101
101
  self,
102
102
  system: str | None,
103
103
  messages: list[dict],
104
+ *,
105
+ source: str | None = None,
104
106
  **kwargs: Any,
105
107
  ) -> dict:
106
108
  full_messages = _prepend_system(system, messages)
@@ -120,7 +122,7 @@ class OpenAILLM:
120
122
  resp = raw.parse()
121
123
  headers = _headers_dict(raw)
122
124
  usage = self._build_usage(resp, headers)
123
- self._record_cost(usage)
125
+ self._record_usage(usage, source=source)
124
126
  self.last_usage = usage
125
127
 
126
128
  content = resp.choices[0].message.content or ""
@@ -132,6 +134,8 @@ class OpenAILLM:
132
134
  self,
133
135
  system: str | None,
134
136
  messages: list[dict],
137
+ *,
138
+ source: str | None = None,
135
139
  ) -> AsyncGenerator[str, None]:
136
140
  full_messages = _prepend_system(system, messages)
137
141
  # include_usage adds a final SSE chunk with the same usage block as
@@ -156,7 +160,7 @@ class OpenAILLM:
156
160
 
157
161
  if final_chunk is not None:
158
162
  usage = self._build_usage(final_chunk, headers)
159
- self._record_cost(usage)
163
+ self._record_usage(usage, source=source)
160
164
  self.last_usage = usage
161
165
 
162
166
  # ── Internals ─────────────────────────────────────────────────────────────
@@ -185,12 +189,24 @@ class OpenAILLM:
185
189
  usage["cost_usd"] = cost
186
190
  return usage
187
191
 
188
- def _record_cost(self, usage: dict) -> None:
189
- if not self._budget:
192
def _record_usage(self, usage: dict, *, source: str | None) -> None:
    """Forward one call's usage to the budget guard.

    Tokens are reported whenever any were consumed and the guard exposes
    ``add_tokens``, so token-based caps fire even when no cost is known.
    Cost is forwarded only when ``usage["cost_usd"]`` is a positive value.
    Both calls carry the per-call-site ``source`` tag so the guard can
    attribute spending to the right slot.

    Args:
        usage: Usage dict; the keys ``tokens_in``, ``tokens_out`` and
            ``cost_usd`` are read, and a missing or ``None`` token count
            is treated as 0.
        source: Optional per-call-site tag forwarded to the guard.
    """
    guard = self._budget
    # Compare against None explicitly: a guard object that happens to be
    # falsy (e.g. it defines __len__ / __bool__) must still be charged.
    if guard is None:
        return
    tokens_in = int(usage.get("tokens_in") or 0)
    tokens_out = int(usage.get("tokens_out") or 0)
    # Guards without add_tokens are tolerated; they only receive cost.
    if (tokens_in or tokens_out) and hasattr(guard, "add_tokens"):
        guard.add_tokens(tokens_in, tokens_out, source=source)
    cost = usage.get("cost_usd")
    if cost and cost > 0:
        guard.add_cost(cost, source=source)
194
210
 
195
211
 
196
212
  # ── Module-level helpers ─────────────────────────────────────────────────────