react-agent-harness 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {react_agent_harness-0.6.0/react_agent_harness.egg-info → react_agent_harness-0.7.0}/PKG-INFO +2 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/README.md +163 -5
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/agents/base.py +57 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/console.py +39 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/anthropic.py +26 -5
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/claude_code.py +40 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/fallback.py +3 -1
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/openai.py +21 -5
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/openai_codex.py +34 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/routing.py +4 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/runtime.py +177 -22
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/episodic_lance.py +205 -66
- react_agent_harness-0.7.0/memory/manager.py +925 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/redis_store.py +1 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/stores.py +90 -9
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/orchestrator/planner.py +25 -1
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/pyproject.toml +2 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0/react_agent_harness.egg-info}/PKG-INFO +2 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/SOURCES.txt +3 -0
- react_agent_harness-0.7.0/tests/test_budget_guard.py +134 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_cli.py +4 -0
- react_agent_harness-0.7.0/tests/test_console_renderer.py +145 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_fallback_llm.py +1 -1
- react_agent_harness-0.7.0/tests/test_memory.py +381 -0
- react_agent_harness-0.7.0/tests/test_memory_reconciler.py +374 -0
- react_agent_harness-0.7.0/tests/test_memory_touchpoints.py +108 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_orchestrator.py +4 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_per_call_site_llm.py +133 -2
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_routing_llm.py +1 -1
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_streaming.py +1 -1
- react_agent_harness-0.6.0/memory/manager.py +0 -372
- react_agent_harness-0.6.0/tests/test_console_renderer.py +0 -52
- react_agent_harness-0.6.0/tests/test_memory.py +0 -158
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/LICENSE +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/agents/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/annotation.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/checkpoint.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/cli.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/events.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/executor_bridge.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/hitl.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/_streaming.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/llm/auth.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/oauth_browser.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/otel.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/steering.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/tool_policy.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/trace.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/trace_viewer.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/harness/utils.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/memory/working.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/orchestrator/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/dependency_links.txt +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/entry_points.txt +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/requires.txt +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/react_agent_harness.egg-info/top_level.txt +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/setup.cfg +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_agents_base.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_annotation.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_anthropic_llm.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_checkpoint_resume.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_claude_code_llm.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_executor_bridge.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_http_fetch.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_llm_auth.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_mcp_adapter.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_mcp_auth.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_oauth_browser.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_openai_codex_llm.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_openai_llm.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_otel.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_parse_action_json.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_redis_store.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_steering.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_tool_policy.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_trace.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_utils.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_vision.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tests/test_working_memory.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/builtin/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/builtin/fetch_image.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/builtin/http_fetch.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/mcp/__init__.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/mcp/adapter.py +0 -0
- {react_agent_harness-0.6.0 → react_agent_harness-0.7.0}/tools/mcp/auth.py +0 -0
{react_agent_harness-0.6.0/react_agent_harness.egg-info → react_agent_harness-0.7.0}/PKG-INFO
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: react-agent-harness
|
|
3
|
-
Version: 0.6.0
|
|
4
|
-
Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming
|
|
3
|
+
Version: 0.7.0
|
|
4
|
+
Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming, cost/token budgets with per-call-site breakdown
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Dist: prompt_toolkit>=3.0
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# react-agent-harness
|
|
2
2
|
|
|
3
3
|
Bring-your-own-LLM multi-agent harness: hybrid DAG planning with replan-on-failure,
|
|
4
|
-
two-tier memory (semantic KV + episodic vector),
|
|
4
|
+
two-tier memory (semantic KV + episodic vector), a streaming-primary event model,
|
|
5
|
+
and cost/token budgets with per-call-site attribution (classifier, router,
|
|
6
|
+
planner, synthesizer, agent).
|
|
5
7
|
|
|
6
8
|
Config-driven — register tools and agents, run any goal. No subclassing.
|
|
7
9
|
|
|
@@ -33,15 +35,21 @@ events to stdout and prints elapsed time + cost at the end.
|
|
|
33
35
|
## Architecture
|
|
34
36
|
|
|
35
37
|
```
|
|
36
|
-
harness/runtime.py AgentRuntime — single entry point
|
|
38
|
+
harness/runtime.py AgentRuntime — single entry point; BudgetGuard with cost/token caps + per-call-site breakdown
|
|
37
39
|
harness/events.py BusEvent + EventType — canonical event vocabulary
|
|
38
|
-
harness/llm/openai.py OpenAILLM — OpenAI adapter with usage + cost tracking
|
|
40
|
+
harness/llm/openai.py OpenAILLM — OpenAI API-key adapter with usage + cost tracking
|
|
41
|
+
harness/llm/anthropic.py AnthropicLLM — direct Anthropic API-key adapter with prompt-caching support
|
|
42
|
+
harness/llm/claude_code.py ClaudeCodeLLM — Claude subscription OAuth adapter (experimental, ToS caveats)
|
|
43
|
+
harness/llm/openai_codex.py OpenAICodexLLM — ChatGPT subscription OAuth adapter (experimental, ToS caveats)
|
|
44
|
+
harness/llm/auth.py Shared OAuth + auth-file primitives for the subscription adapters
|
|
39
45
|
harness/llm/fallback.py FallbackLLM — transparent retry on transient upstream errors
|
|
40
46
|
harness/llm/routing.py RoutingLLM — dispatch calls to different adapters by a selector
|
|
47
|
+
harness/trace.py JSONL trace recorder + replay — durable, per-event flush
|
|
48
|
+
harness/trace_viewer.py Local web timeline viewer for recorded JSONL traces
|
|
41
49
|
harness/annotation.py Annotation store + AnnotationHook — RLHF trajectory capture
|
|
42
50
|
harness/hitl.py HITL approval gate — interactive CLI, session-allow list
|
|
43
51
|
harness/tool_policy.py Persistent tool policy — user-scoped allow rules, CLI management
|
|
44
|
-
harness/console.py ConsoleRenderer — centralised BusEvent formatting
|
|
52
|
+
harness/console.py ConsoleRenderer — centralised BusEvent formatting + render_budget helper
|
|
45
53
|
harness/steering.py Async steering — agent.steer(text), StdinRouter pub/sub, FileSteer, factory helpers
|
|
46
54
|
harness/checkpoint.py CheckpointStore + _ResumeHint + maybe_resume_key — pluggable run-state persistence (file + Redis); auto-resume built into dispatch_stream / run_stream
|
|
47
55
|
harness/otel.py OTELHook — OpenTelemetry span exporter (opt-in)
|
|
@@ -56,7 +64,8 @@ memory/redis_store.py Redis semantic store — durable KV with TTL
|
|
|
56
64
|
memory/stores.py InMemory stores — local dev default, no deps
|
|
57
65
|
tools/builtin/http_fetch.py HTTPFetch — minimal read-only GET tool
|
|
58
66
|
tools/builtin/fetch_image.py FetchImage — fetch URL and return OpenAI image_url block
|
|
59
|
-
tools/mcp/adapter.py MCP tool adapter —
|
|
67
|
+
tools/mcp/adapter.py MCP tool adapter — stdio, SSE, streamable-HTTP transports
|
|
68
|
+
tools/mcp/auth.py ApiKeyMCPAuth + BrowserOAuthMCPAuth — auth primitives for remote MCP servers
|
|
60
69
|
```
|
|
61
70
|
|
|
62
71
|
Execution is **streaming-primary**: every path yields `BusEvent`s for
|
|
@@ -331,6 +340,69 @@ crashing the loop.
|
|
|
331
340
|
- **During run**: `write_working_fact()` — lightweight KV, namespaced, short TTL
|
|
332
341
|
- **End of run**: `write_run_end()` — LLM extraction → global semantic + episodic vector
|
|
333
342
|
|
|
343
|
+
### Memory reconciliation (default-on)
|
|
344
|
+
|
|
345
|
+
`write_run_end` runs the LLM-arbitrated reconciler by default: instead of
|
|
346
|
+
extract-and-overwrite, the LLM sees existing relevant memory + new evidence
|
|
347
|
+
and emits a plan of per-fact actions (`ADD` / `UPDATE` / `MERGE` / `DELETE`
|
|
348
|
+
/ `NOOP`). Same call count as the legacy extraction step; the prompt is
|
|
349
|
+
larger only when there's existing context to reconcile against.
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
manager = MemoryManager(
|
|
353
|
+
semantic_store=…,
|
|
354
|
+
episodic_store=…,
|
|
355
|
+
llm=…,
|
|
356
|
+
reconcile_on_write=True, # default — set False for legacy extract path
|
|
357
|
+
allow_destructive_reconcile=False, # default — DELETE actions demoted to NOOP
|
|
358
|
+
auto_compact_threshold={"agent_task": 20}, # optional — fire compact()
|
|
359
|
+
# when an agent accumulates this
|
|
360
|
+
# many task episodes
|
|
361
|
+
)
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
`allow_destructive_reconcile=False` keeps the LLM from removing data unless
|
|
365
|
+
you've vetted that DELETE actions are sensible for your workload — demoted
|
|
366
|
+
decisions land in `manager.get_conflict_log()` so you can audit.
|
|
367
|
+
|
|
368
|
+
`manager.compact(goal="…", agent_id="…")` is the same primitive with no
|
|
369
|
+
new evidence — a pure cleanup pass that consolidates accumulated episodes
|
|
370
|
+
and prunes redundant facts. Triggered automatically by
|
|
371
|
+
`auto_compact_threshold`, or call it explicitly.
|
|
372
|
+
|
|
373
|
+
Episodic supersede is now **hard-delete** (no `active=False` tombstones
|
|
374
|
+
accumulating per run): `memory_policy="latest"` writes and reconciler
|
|
375
|
+
`DELETE` actions both remove rows.
|
|
376
|
+
|
|
377
|
+
If the LLM returns a response that doesn't parse as a reconcile plan
|
|
378
|
+
(older / smaller models that don't follow the multi-action schema),
|
|
379
|
+
`write_run_end` silently falls back to the legacy extract-and-overwrite
|
|
380
|
+
path — no crash, no missed run-end write.
|
|
381
|
+
|
|
382
|
+
### Tool-result caching (opt-in, per run)
|
|
383
|
+
|
|
384
|
+
`AgentConfig.cache_tool_results = True` memoizes tool calls within a single
|
|
385
|
+
run, keyed by `(tool_name, args)`. Useful for multi-agent runs where agents
|
|
386
|
+
redo each other's idempotent reads (`HTTPFetch` on stable URLs,
|
|
387
|
+
`kubectl get ...` discovery, MCP filesystem reads).
|
|
388
|
+
|
|
389
|
+
A tool can veto caching for itself with `cacheable = False` on the
|
|
390
|
+
instance — required for anything with side effects or time-dependent
|
|
391
|
+
output. Errors are never cached (a transient failure shouldn't poison the
|
|
392
|
+
rest of the run).
|
|
393
|
+
|
|
394
|
+
```python
|
|
395
|
+
class HTTPFetch:
|
|
396
|
+
name = "http_fetch"
|
|
397
|
+
cacheable = True # default; explicit for clarity
|
|
398
|
+
|
|
399
|
+
agents.register(AgentConfig(
|
|
400
|
+
agent_id="web",
|
|
401
|
+
...,
|
|
402
|
+
cache_tool_results=True,
|
|
403
|
+
))
|
|
404
|
+
```
|
|
405
|
+
|
|
334
406
|
Defaults are in-memory (`InMemorySemanticStore`, `InMemoryEpisodicStore`).
|
|
335
407
|
For durable storage:
|
|
336
408
|
|
|
@@ -590,6 +662,92 @@ Cost ceiling fires on the *next* `check()` (start of next ReAct step or
|
|
|
590
662
|
orchestrator batch), not synchronously mid-call — accept this for 0.0.1, the
|
|
591
663
|
guard's job is preventing runaway loops, not bounding individual calls.
|
|
592
664
|
|
|
665
|
+
### Token limits + per-call-site breakdown
|
|
666
|
+
|
|
667
|
+
`GuardrailConfig.max_input_tokens` / `max_output_tokens` cap raw token
|
|
668
|
+
usage independently of dollar cost. This is the only enforcement available
|
|
669
|
+
to subscription-auth runs (`ClaudeCodeLLM`, `OpenAICodexLLM`) — those tiers
|
|
670
|
+
don't expose pricing, so cost stays 0 and only token caps can fire.
|
|
671
|
+
|
|
672
|
+
```python
|
|
673
|
+
runtime = AgentRuntime(
|
|
674
|
+
...,
|
|
675
|
+
guardrail_config=GuardrailConfig(
|
|
676
|
+
max_total_cost_usd=2.0,
|
|
677
|
+
max_input_tokens=100_000,
|
|
678
|
+
max_output_tokens=20_000,
|
|
679
|
+
),
|
|
680
|
+
)
|
|
681
|
+
```
|
|
682
|
+
|
|
683
|
+
Per-call-site attribution lives on the terminal event's `budget` payload
|
|
684
|
+
— a snapshot of spending bucketed by the LLM slot that ran each call.
|
|
685
|
+
The runtime tags classifier / router / planner / synthesizer calls
|
|
686
|
+
automatically, and ReAct agent calls are tagged `agent:<agent_id>`, so each
|
|
687
|
+
agent gets its own bucket. Buckets are keyed by slot, not by LLM instance:
|
|
688
|
+
a `cheap` LLM used for both `classifier_llm` and `router_llm` reports those
|
|
689
|
+
two slots separately (and apart from `premium`, used for `planner_llm`):
|
|
690
|
+
|
|
691
|
+
```python
|
|
692
|
+
async for event in runtime.dispatch_stream(goal):
|
|
693
|
+
# Routed (simple) goals terminate with TASK_DONE; orchestrated goals
|
|
694
|
+
# with DONE. Both carry the same ``budget`` shape.
|
|
695
|
+
if event.type in (EventType.TASK_DONE, EventType.DONE):
|
|
696
|
+
budget = event.payload["budget"]
|
|
697
|
+
print(f"total: in={budget['tokens_in']} out={budget['tokens_out']} "
|
|
698
|
+
f"${budget['cost_usd']:.4f}")
|
|
699
|
+
for slot, stats in budget["breakdown"].items():
|
|
700
|
+
print(f" {slot}: in={stats['tokens_in']} out={stats['tokens_out']}")
|
|
701
|
+
```
|
|
702
|
+
|
|
703
|
+
The same `budget` dict is attached to `runtime.run(...)` and
|
|
704
|
+
`runtime.dispatch(...)` return values under the `budget` key, so blocking
|
|
705
|
+
callers don't need to read events.
|
|
706
|
+
|
|
707
|
+
Anthropic / Claude Code adapters count input tokens as the *total* that
|
|
708
|
+
hit the wire (non-cached + cache-creation + cache-read), so token caps
|
|
709
|
+
reflect actual consumption regardless of cache hit rate. Cost calculation
|
|
710
|
+
via `cost_fn` still respects cache pricing.
|
|
711
|
+
|
|
712
|
+
### Evals via the trace recorder
|
|
713
|
+
|
|
714
|
+
There's no shipped evals framework — opinions on scorers, judge models,
|
|
715
|
+
and golden-set management belong outside the orchestration core. The
|
|
716
|
+
[trace recorder](#trace-recorder--replay--local-viewer) already writes
|
|
717
|
+
per-event token/cost/latency to JSONL, so a few lines of glue cover most
|
|
718
|
+
in-house eval setups:
|
|
719
|
+
|
|
720
|
+
```python
|
|
721
|
+
import json
|
|
722
|
+
from harness.trace import record_trace
|
|
723
|
+
|
|
724
|
+
# 1. Record traces while running a fixture set.
|
|
725
|
+
for fixture in fixtures:
|
|
726
|
+
async for _event in record_trace(
|
|
727
|
+
runtime.dispatch_stream(fixture["input"]),
|
|
728
|
+
path=f"runs/{fixture['id']}.jsonl",
|
|
729
|
+
):
|
|
730
|
+
pass
|
|
731
|
+
|
|
732
|
+
# 2. Score offline by replaying.
|
|
733
|
+
def score_run(path: str, expected: str) -> dict:
|
|
734
|
+
answer = ""
|
|
735
|
+
budget = {"tokens_in": 0, "tokens_out": 0, "cost_usd": 0.0, "breakdown": {}}
|
|
736
|
+
for line in open(path):
|
|
737
|
+
event = json.loads(line)
|
|
738
|
+
if event["type"] in ("done", "task_done"):
|
|
739
|
+
answer = event["payload"].get("answer", "")
|
|
740
|
+
budget = event["payload"].get("budget", budget)
|
|
741
|
+
return {
|
|
742
|
+
"success": expected.lower() in answer.lower(),
|
|
743
|
+
**budget, # tokens_in, tokens_out, cost_usd, breakdown
|
|
744
|
+
}
|
|
745
|
+
```
|
|
746
|
+
|
|
747
|
+
Plug in your own scorer (exact-match, LLM-judge, semantic similarity) on
|
|
748
|
+
top. External tools like Braintrust, LangSmith, and Weave are
|
|
749
|
+
purpose-built for this and ingest the same JSONL shape directly.
|
|
750
|
+
|
|
593
751
|
## Tool execution
|
|
594
752
|
|
|
595
753
|
Tools that shell out (`kubectl`, `curl`, `sh -c …`) should not run inside the
|
|
@@ -30,6 +30,7 @@ import asyncio
|
|
|
30
30
|
import contextlib
|
|
31
31
|
import json
|
|
32
32
|
import logging
|
|
33
|
+
import os
|
|
33
34
|
import uuid
|
|
34
35
|
from collections.abc import AsyncGenerator
|
|
35
36
|
from dataclasses import dataclass
|
|
@@ -65,6 +66,13 @@ class AgentConfig:
|
|
|
65
66
|
working_memory_max_tokens: int = 8000 # WorkingMemory eviction threshold; tune per agent
|
|
66
67
|
hitl_tools: list[str] = None # tools requiring human approval; None = no HITL
|
|
67
68
|
checkpoint_every: int = 0 # write a resumable checkpoint every N steps; 0 = disabled
|
|
69
|
+
# Cache tool results within a single run, keyed by (tool_name, args).
|
|
70
|
+
# Opt-in because not every tool is idempotent — a tool may also veto
|
|
71
|
+
# caching for itself by exposing ``cacheable = False`` on its instance.
|
|
72
|
+
# Designed for read-mostly multi-agent runs where agents redo each
|
|
73
|
+
# other's lookups (HTTPFetch on stable URLs, ``kubectl get …`` style
|
|
74
|
+
# discovery, MCP filesystem reads).
|
|
75
|
+
cache_tool_results: bool = False
|
|
68
76
|
|
|
69
77
|
def __post_init__(self):
|
|
70
78
|
if self.hitl_tools is None:
|
|
@@ -159,6 +167,12 @@ class BaseAgent:
|
|
|
159
167
|
self._resume_key: str = (
|
|
160
168
|
"" # key printed in --resume banner; set by orchestrator to outer run_id
|
|
161
169
|
)
|
|
170
|
+
# Per-run tool-result cache. ``None`` when caching is off so the
|
|
171
|
+
# hot path on ``_execute_tool`` skips the lookup entirely; a fresh
|
|
172
|
+
# dict per BaseAgent instance bounds the lifetime to one run.
|
|
173
|
+
self._tool_cache: dict[tuple[str, str], Any] | None = (
|
|
174
|
+
{} if config.cache_tool_results else None
|
|
175
|
+
)
|
|
162
176
|
|
|
163
177
|
# ── Async steering ────────────────────────────────────────────────────────
|
|
164
178
|
|
|
@@ -335,7 +349,15 @@ class BaseAgent:
|
|
|
335
349
|
agent_id=self.config.agent_id,
|
|
336
350
|
)
|
|
337
351
|
if not mem_context.is_empty():
|
|
338
|
-
|
|
352
|
+
rendered = mem_context.render()
|
|
353
|
+
if os.environ.get("DEBUG_MEMORY_CONTEXT") == "1":
|
|
354
|
+
print(f"\n[debug:memory] context injected for {self.config.agent_id}")
|
|
355
|
+
print("─" * 64)
|
|
356
|
+
print(rendered)
|
|
357
|
+
print("─" * 64)
|
|
358
|
+
parts.append(rendered)
|
|
359
|
+
elif os.environ.get("DEBUG_MEMORY_CONTEXT") == "1":
|
|
360
|
+
print(f"\n[debug:memory] context injected for {self.config.agent_id}: (empty)")
|
|
339
361
|
|
|
340
362
|
tool_list = ", ".join(self._tools.keys()) or "none"
|
|
341
363
|
parts.append(REACT_FORMAT.replace("__TOOL_LIST__", tool_list))
|
|
@@ -422,6 +444,12 @@ class BaseAgent:
|
|
|
422
444
|
"summarizations": self._working_memory.summarization_count,
|
|
423
445
|
},
|
|
424
446
|
}
|
|
447
|
+
# Attach the current budget snapshot so dispatch_stream
|
|
448
|
+
# consumers can read totals + per-call-site breakdown off
|
|
449
|
+
# the routed path's terminal event, same shape as the
|
|
450
|
+
# orchestrator's DONE event.
|
|
451
|
+
if self._guard is not None and hasattr(self._guard, "snapshot"):
|
|
452
|
+
result["budget"] = self._guard.snapshot()
|
|
425
453
|
logger.info(
|
|
426
454
|
"Agent %s completed: steps=%d confidence=%.2f summarizations=%d",
|
|
427
455
|
self.config.agent_id,
|
|
@@ -653,11 +681,17 @@ class BaseAgent:
|
|
|
653
681
|
payload=before_usage,
|
|
654
682
|
)
|
|
655
683
|
|
|
684
|
+
# Tag ReAct spending so it shows up in BudgetGuard.breakdown alongside
|
|
685
|
+
# classifier/router/planner/synthesizer. Per-agent attribution makes
|
|
686
|
+
# multi-agent demos surface which specialist agent actually drove the
|
|
687
|
+
# bulk of token usage.
|
|
688
|
+
react_source = f"agent:{self.config.agent_id}"
|
|
656
689
|
try:
|
|
657
690
|
if hasattr(self._llm, "stream_complete"):
|
|
658
691
|
async for token in self._llm.stream_complete(
|
|
659
692
|
system=None,
|
|
660
693
|
messages=messages,
|
|
694
|
+
source=react_source,
|
|
661
695
|
):
|
|
662
696
|
accumulated += token
|
|
663
697
|
if self.config.stream_tokens:
|
|
@@ -679,6 +713,7 @@ class BaseAgent:
|
|
|
679
713
|
system=None,
|
|
680
714
|
messages=messages,
|
|
681
715
|
response_format={"type": "json_object"},
|
|
716
|
+
source=react_source,
|
|
682
717
|
)
|
|
683
718
|
response = _normalize_response(raw)
|
|
684
719
|
if response is None:
|
|
@@ -739,12 +774,32 @@ class BaseAgent:
|
|
|
739
774
|
return (
|
|
740
775
|
f"Error: tool '{name}' not available. Available tools: {list(self._tools.keys())}"
|
|
741
776
|
)
|
|
777
|
+
tool = self._tools[name]
|
|
778
|
+
|
|
779
|
+
# Per-run memoization, gated by both agent opt-in AND tool consent.
|
|
780
|
+
# Tools that have side effects or time-dependent output can veto
|
|
781
|
+
# caching by setting ``cacheable = False`` on the instance. Errors
|
|
782
|
+
# are NOT cached — a transient failure should not poison the rest
|
|
783
|
+
# of the run.
|
|
784
|
+
cache_key: tuple[str, str] | None = None
|
|
785
|
+
if self._tool_cache is not None and getattr(tool, "cacheable", True) is True:
|
|
786
|
+
try:
|
|
787
|
+
cache_key = (name, json.dumps(args, sort_keys=True, default=str))
|
|
788
|
+
except (TypeError, ValueError):
|
|
789
|
+
cache_key = None # un-serialisable args — silently skip
|
|
790
|
+
if cache_key is not None and cache_key in self._tool_cache:
|
|
791
|
+
return self._tool_cache[cache_key]
|
|
792
|
+
|
|
742
793
|
try:
|
|
743
|
-
|
|
794
|
+
result = await tool.execute(**args)
|
|
744
795
|
except Exception as e:
|
|
745
796
|
logger.error("Tool %s failed: %s", name, e)
|
|
746
797
|
return f"Tool error ({name}): {e}"
|
|
747
798
|
|
|
799
|
+
if cache_key is not None and self._tool_cache is not None:
|
|
800
|
+
self._tool_cache[cache_key] = result
|
|
801
|
+
return result
|
|
802
|
+
|
|
748
803
|
# ── Helpers ───────────────────────────────────────────────────────────────
|
|
749
804
|
|
|
750
805
|
def _error_result(self, reason: str, steps: int) -> dict:
|
|
@@ -178,17 +178,54 @@ class ConsoleRenderer:
|
|
|
178
178
|
self.sep("═")
|
|
179
179
|
print(p.get("answer", "(no answer)"), file=self._out)
|
|
180
180
|
self.sep()
|
|
181
|
+
# ``budget`` snapshot supersedes the flat cost/elapsed fields when
|
|
182
|
+
# present (added with token caps + per-call-site breakdown).
|
|
183
|
+
budget = p.get("budget") or {}
|
|
184
|
+
cost = budget.get("cost_usd", p.get("cost_usd", 0))
|
|
185
|
+
elapsed = budget.get("elapsed_seconds", p.get("elapsed_seconds", 0))
|
|
181
186
|
print(
|
|
182
187
|
f"Confidence: {p.get('confidence', 0):.2f} | "
|
|
183
188
|
f"Replans: {p.get('replan_count', 0)} | "
|
|
184
|
-
f"Cost: ${
|
|
185
|
-
f"Time: {
|
|
189
|
+
f"Cost: ${cost:.4f} | "
|
|
190
|
+
f"Time: {elapsed:.1f}s",
|
|
186
191
|
file=self._out,
|
|
187
192
|
)
|
|
193
|
+
self.render_budget(budget)
|
|
188
194
|
|
|
189
195
|
elif t == EventType.ERROR:
|
|
190
196
|
print(f"\n[error] {event.error}", file=sys.stderr)
|
|
191
197
|
|
|
198
|
+
def render_budget(self, budget: dict | None) -> None:
|
|
199
|
+
"""Print tokens + per-call-site breakdown from a ``BudgetGuard.snapshot()``
|
|
200
|
+
dict. Safe to call with ``{}`` or ``None`` — prints nothing when
|
|
201
|
+
there's no usage to show.
|
|
202
|
+
|
|
203
|
+
Exposed publicly so demos and other consumers that own their own
|
|
204
|
+
DONE / TASK_DONE rendering can still surface the breakdown without
|
|
205
|
+
duplicating the formatting.
|
|
206
|
+
"""
|
|
207
|
+
if not budget:
|
|
208
|
+
return
|
|
209
|
+
tokens_in = budget.get("tokens_in")
|
|
210
|
+
tokens_out = budget.get("tokens_out")
|
|
211
|
+
if tokens_in is not None or tokens_out is not None:
|
|
212
|
+
print(
|
|
213
|
+
f"Tokens: in={int(tokens_in or 0):,} out={int(tokens_out or 0):,}",
|
|
214
|
+
file=self._out,
|
|
215
|
+
)
|
|
216
|
+
breakdown = budget.get("breakdown") or {}
|
|
217
|
+
if breakdown:
|
|
218
|
+
# Right-pad the slot label so columns line up — matters when
|
|
219
|
+
# the demo prints multiple slots in sequence.
|
|
220
|
+
width = max(len(name) for name in breakdown)
|
|
221
|
+
for slot, stats in breakdown.items():
|
|
222
|
+
print(
|
|
223
|
+
f" {slot:<{width}} "
|
|
224
|
+
f"in={int(stats.get('tokens_in', 0)):>7,} "
|
|
225
|
+
f"out={int(stats.get('tokens_out', 0)):>6,}",
|
|
226
|
+
file=self._out,
|
|
227
|
+
)
|
|
228
|
+
|
|
192
229
|
# ── private helpers ───────────────────────────────────────────────────────
|
|
193
230
|
|
|
194
231
|
def _label(self, event: BusEvent) -> str:
|
|
@@ -91,6 +91,8 @@ class AnthropicLLM:
|
|
|
91
91
|
self,
|
|
92
92
|
system: str | None,
|
|
93
93
|
messages: list[dict],
|
|
94
|
+
*,
|
|
95
|
+
source: str | None = None,
|
|
94
96
|
**kwargs: Any,
|
|
95
97
|
) -> dict:
|
|
96
98
|
max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
|
|
@@ -110,7 +112,7 @@ class AnthropicLLM:
|
|
|
110
112
|
cost = _compute_cost(usage, self._cost_fn)
|
|
111
113
|
if cost is not None:
|
|
112
114
|
usage["cost_usd"] = cost
|
|
113
|
-
self.
|
|
115
|
+
self._record_usage(usage, source=source)
|
|
114
116
|
self.last_usage = usage
|
|
115
117
|
|
|
116
118
|
text = _collect_text(resp.content)
|
|
@@ -122,6 +124,8 @@ class AnthropicLLM:
|
|
|
122
124
|
self,
|
|
123
125
|
system: str | None,
|
|
124
126
|
messages: list[dict],
|
|
127
|
+
*,
|
|
128
|
+
source: str | None = None,
|
|
125
129
|
) -> AsyncGenerator[str, None]:
|
|
126
130
|
sys_blocks = _system_blocks(system, prompt_caching=self._prompt_caching)
|
|
127
131
|
built_messages = _build_messages(messages, prompt_caching=self._prompt_caching)
|
|
@@ -143,17 +147,34 @@ class AnthropicLLM:
|
|
|
143
147
|
cost = _compute_cost(usage, self._cost_fn)
|
|
144
148
|
if cost is not None:
|
|
145
149
|
usage["cost_usd"] = cost
|
|
146
|
-
self.
|
|
150
|
+
self._record_usage(usage, source=source)
|
|
147
151
|
self.last_usage = usage
|
|
148
152
|
|
|
149
153
|
# ── Internals ─────────────────────────────────────────────────────────────
|
|
150
154
|
|
|
151
|
-
def
|
|
152
|
-
|
|
155
|
+
def _record_usage(self, usage: dict, *, source: str | None) -> None:
|
|
156
|
+
"""Forward usage to the budget guard.
|
|
157
|
+
|
|
158
|
+
Token count for budget purposes is the total input that hit the wire
|
|
159
|
+
— non-cached + cache-creation + cache-read — so token caps reflect
|
|
160
|
+
actual on-the-wire consumption regardless of cache hit rate. Cost
|
|
161
|
+
(which respects cache pricing via ``cost_fn``) is reported when
|
|
162
|
+
known.
|
|
163
|
+
"""
|
|
164
|
+
guard = self._budget
|
|
165
|
+
if not guard:
|
|
153
166
|
return
|
|
167
|
+
tokens_in = (
|
|
168
|
+
int(usage.get("tokens_in") or 0)
|
|
169
|
+
+ int(usage.get("cache_read_tokens") or 0)
|
|
170
|
+
+ int(usage.get("cache_creation_tokens") or 0)
|
|
171
|
+
)
|
|
172
|
+
tokens_out = int(usage.get("tokens_out") or 0)
|
|
173
|
+
if (tokens_in or tokens_out) and hasattr(guard, "add_tokens"):
|
|
174
|
+
guard.add_tokens(tokens_in, tokens_out, source=source)
|
|
154
175
|
cost = usage.get("cost_usd")
|
|
155
176
|
if cost and cost > 0:
|
|
156
|
-
|
|
177
|
+
guard.add_cost(cost, source=source)
|
|
157
178
|
|
|
158
179
|
|
|
159
180
|
# ── Module-level helpers ──────────────────────────────────────────────────────
|
|
@@ -68,12 +68,24 @@ class ClaudeCodeLLM:
|
|
|
68
68
|
self._user_agent = user_agent or _default_user_agent()
|
|
69
69
|
self._betas = betas
|
|
70
70
|
self._prompt_caching = prompt_caching
|
|
71
|
+
self._budget: Any = None
|
|
71
72
|
self.last_usage: dict | None = None
|
|
72
73
|
|
|
74
|
+
def set_budget(self, guard: Any) -> None:
    """Attach a budget guard to this adapter.

    The guard is stored as-is on ``self._budget``, replacing any guard set
    earlier. This adapter's ``_record_usage`` then reports token counts to
    it through ``add_tokens``; it never reports a cost.

    Args:
        guard: The budget guard object, or ``None`` to detach.
    """
    self._budget = guard
|
|
82
|
+
|
|
73
83
|
async def complete(
|
|
74
84
|
self,
|
|
75
85
|
system: str | None,
|
|
76
86
|
messages: list[dict],
|
|
87
|
+
*,
|
|
88
|
+
source: str | None = None,
|
|
77
89
|
**kwargs: Any,
|
|
78
90
|
) -> dict:
|
|
79
91
|
"""Collect the streaming response into a single text + usage dict.
|
|
@@ -84,7 +96,9 @@ class ClaudeCodeLLM:
|
|
|
84
96
|
"""
|
|
85
97
|
max_tokens = int(kwargs.pop("max_tokens", self._max_tokens))
|
|
86
98
|
parts: list[str] = []
|
|
87
|
-
async for delta in self._iter_stream(
|
|
99
|
+
async for delta in self._iter_stream(
|
|
100
|
+
system, messages, max_tokens=max_tokens, extra=kwargs, source=source
|
|
101
|
+
):
|
|
88
102
|
parts.append(delta)
|
|
89
103
|
text = "".join(parts)
|
|
90
104
|
if not text:
|
|
@@ -95,9 +109,11 @@ class ClaudeCodeLLM:
|
|
|
95
109
|
self,
|
|
96
110
|
system: str | None,
|
|
97
111
|
messages: list[dict],
|
|
112
|
+
*,
|
|
113
|
+
source: str | None = None,
|
|
98
114
|
) -> AsyncGenerator[str, None]:
|
|
99
115
|
async for delta in self._iter_stream(
|
|
100
|
-
system, messages, max_tokens=self._max_tokens, extra={}
|
|
116
|
+
system, messages, max_tokens=self._max_tokens, extra={}, source=source
|
|
101
117
|
):
|
|
102
118
|
yield delta
|
|
103
119
|
|
|
@@ -114,6 +130,7 @@ class ClaudeCodeLLM:
|
|
|
114
130
|
*,
|
|
115
131
|
max_tokens: int,
|
|
116
132
|
extra: dict[str, Any],
|
|
133
|
+
source: str | None = None,
|
|
117
134
|
) -> AsyncGenerator[str, None]:
|
|
118
135
|
"""Single source of truth: open Anthropic SSE stream, yield text
|
|
119
136
|
deltas, populate `self.last_usage`. Auth refresh on 401/403
|
|
@@ -182,10 +199,31 @@ class ClaudeCodeLLM:
|
|
|
182
199
|
"total_tokens": tokens_in + tokens_out,
|
|
183
200
|
"provider": "claude-code",
|
|
184
201
|
}
|
|
202
|
+
self._record_usage(self.last_usage, source=source)
|
|
185
203
|
return
|
|
186
204
|
|
|
187
205
|
raise RuntimeError("Claude Code authentication failed after refresh")
|
|
188
206
|
|
|
207
|
+
def _record_usage(self, usage: dict, *, source: str | None) -> None:
|
|
208
|
+
"""Report token totals to the budget guard.
|
|
209
|
+
|
|
210
|
+
Tokens budgeted = total input that hit the wire (non-cached +
|
|
211
|
+
cache-creation + cache-read) plus output tokens — so ``max_input_tokens``
|
|
212
|
+
/ ``max_output_tokens`` reflect real consumption regardless of cache
|
|
213
|
+
hit rate. No cost is reported (subscription auth, no pricing).
|
|
214
|
+
"""
|
|
215
|
+
guard = self._budget
|
|
216
|
+
if not guard or not hasattr(guard, "add_tokens"):
|
|
217
|
+
return
|
|
218
|
+
tokens_in = (
|
|
219
|
+
int(usage.get("tokens_in") or 0)
|
|
220
|
+
+ int(usage.get("cache_read_tokens") or 0)
|
|
221
|
+
+ int(usage.get("cache_creation_tokens") or 0)
|
|
222
|
+
)
|
|
223
|
+
tokens_out = int(usage.get("tokens_out") or 0)
|
|
224
|
+
if tokens_in or tokens_out:
|
|
225
|
+
guard.add_tokens(tokens_in, tokens_out, source=source)
|
|
226
|
+
|
|
189
227
|
async def _get_client(self) -> Any:
|
|
190
228
|
if self._client is None:
|
|
191
229
|
try:
|
|
@@ -123,6 +123,8 @@ class FallbackLLM:
|
|
|
123
123
|
self,
|
|
124
124
|
system: str | None,
|
|
125
125
|
messages: list[dict],
|
|
126
|
+
*,
|
|
127
|
+
source: str | None = None,
|
|
126
128
|
) -> AsyncGenerator[str, None]:
|
|
127
129
|
"""Stream from the first adapter that doesn't fail before yielding.
|
|
128
130
|
|
|
@@ -136,7 +138,7 @@ class FallbackLLM:
|
|
|
136
138
|
if not hasattr(llm, "stream_complete"):
|
|
137
139
|
continue
|
|
138
140
|
try:
|
|
139
|
-
gen = llm.stream_complete(system, messages)
|
|
141
|
+
gen = llm.stream_complete(system, messages, source=source)
|
|
140
142
|
first = await _peek_first(gen)
|
|
141
143
|
except BaseException as exc:
|
|
142
144
|
if i == len(self._llms) - 1 or not self._is_transient(exc):
|
|
@@ -101,6 +101,8 @@ class OpenAILLM:
|
|
|
101
101
|
self,
|
|
102
102
|
system: str | None,
|
|
103
103
|
messages: list[dict],
|
|
104
|
+
*,
|
|
105
|
+
source: str | None = None,
|
|
104
106
|
**kwargs: Any,
|
|
105
107
|
) -> dict:
|
|
106
108
|
full_messages = _prepend_system(system, messages)
|
|
@@ -120,7 +122,7 @@ class OpenAILLM:
|
|
|
120
122
|
resp = raw.parse()
|
|
121
123
|
headers = _headers_dict(raw)
|
|
122
124
|
usage = self._build_usage(resp, headers)
|
|
123
|
-
self.
|
|
125
|
+
self._record_usage(usage, source=source)
|
|
124
126
|
self.last_usage = usage
|
|
125
127
|
|
|
126
128
|
content = resp.choices[0].message.content or ""
|
|
@@ -132,6 +134,8 @@ class OpenAILLM:
|
|
|
132
134
|
self,
|
|
133
135
|
system: str | None,
|
|
134
136
|
messages: list[dict],
|
|
137
|
+
*,
|
|
138
|
+
source: str | None = None,
|
|
135
139
|
) -> AsyncGenerator[str, None]:
|
|
136
140
|
full_messages = _prepend_system(system, messages)
|
|
137
141
|
# include_usage adds a final SSE chunk with the same usage block as
|
|
@@ -156,7 +160,7 @@ class OpenAILLM:
|
|
|
156
160
|
|
|
157
161
|
if final_chunk is not None:
|
|
158
162
|
usage = self._build_usage(final_chunk, headers)
|
|
159
|
-
self.
|
|
163
|
+
self._record_usage(usage, source=source)
|
|
160
164
|
self.last_usage = usage
|
|
161
165
|
|
|
162
166
|
# ── Internals ─────────────────────────────────────────────────────────────
|
|
@@ -185,12 +189,24 @@ class OpenAILLM:
|
|
|
185
189
|
usage["cost_usd"] = cost
|
|
186
190
|
return usage
|
|
187
191
|
|
|
188
|
-
def
|
|
189
|
-
|
|
192
|
+
def _record_usage(self, usage: dict, *, source: str | None) -> None:
|
|
193
|
+
"""Forward usage to the budget guard.
|
|
194
|
+
|
|
195
|
+
Tokens are reported on every call (even when no ``cost_fn`` is wired)
|
|
196
|
+
so token-based caps still fire. Cost is forwarded only when known.
|
|
197
|
+
Both calls accept the per-call-site ``source`` tag so the guard's
|
|
198
|
+
breakdown attributes spending to the right slot.
|
|
199
|
+
"""
|
|
200
|
+
guard = self._budget
|
|
201
|
+
if not guard:
|
|
190
202
|
return
|
|
203
|
+
tokens_in = int(usage.get("tokens_in") or 0)
|
|
204
|
+
tokens_out = int(usage.get("tokens_out") or 0)
|
|
205
|
+
if (tokens_in or tokens_out) and hasattr(guard, "add_tokens"):
|
|
206
|
+
guard.add_tokens(tokens_in, tokens_out, source=source)
|
|
191
207
|
cost = usage.get("cost_usd")
|
|
192
208
|
if cost and cost > 0:
|
|
193
|
-
|
|
209
|
+
guard.add_cost(cost, source=source)
|
|
194
210
|
|
|
195
211
|
|
|
196
212
|
# ── Module-level helpers ─────────────────────────────────────────────────────
|