react-agent-harness 0.5.2__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {react_agent_harness-0.5.2/react_agent_harness.egg-info → react_agent_harness-0.6.0}/PKG-INFO +1 -1
  2. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/README.md +138 -0
  3. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/cli.py +34 -0
  4. react_agent_harness-0.6.0/harness/llm/fallback.py +169 -0
  5. react_agent_harness-0.6.0/harness/llm/routing.py +139 -0
  6. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/runtime.py +43 -10
  7. react_agent_harness-0.6.0/harness/trace.py +171 -0
  8. react_agent_harness-0.6.0/harness/trace_viewer.py +326 -0
  9. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/orchestrator/planner.py +12 -3
  10. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/pyproject.toml +1 -1
  11. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0/react_agent_harness.egg-info}/PKG-INFO +1 -1
  12. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/SOURCES.txt +8 -0
  13. react_agent_harness-0.6.0/tests/test_fallback_llm.py +221 -0
  14. react_agent_harness-0.6.0/tests/test_per_call_site_llm.py +289 -0
  15. react_agent_harness-0.6.0/tests/test_routing_llm.py +164 -0
  16. react_agent_harness-0.6.0/tests/test_trace.py +240 -0
  17. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/LICENSE +0 -0
  18. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/agents/__init__.py +0 -0
  19. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/agents/base.py +0 -0
  20. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/__init__.py +0 -0
  21. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/annotation.py +0 -0
  22. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/checkpoint.py +0 -0
  23. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/console.py +0 -0
  24. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/events.py +0 -0
  25. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/executor_bridge.py +0 -0
  26. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/hitl.py +0 -0
  27. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/__init__.py +0 -0
  28. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/_streaming.py +0 -0
  29. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/anthropic.py +0 -0
  30. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/auth.py +0 -0
  31. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/claude_code.py +0 -0
  32. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/openai.py +0 -0
  33. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/openai_codex.py +0 -0
  34. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/oauth_browser.py +0 -0
  35. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/otel.py +0 -0
  36. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/steering.py +0 -0
  37. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/tool_policy.py +0 -0
  38. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/utils.py +0 -0
  39. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/__init__.py +0 -0
  40. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/episodic_lance.py +0 -0
  41. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/manager.py +0 -0
  42. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/redis_store.py +0 -0
  43. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/stores.py +0 -0
  44. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/working.py +0 -0
  45. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/orchestrator/__init__.py +0 -0
  46. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/dependency_links.txt +0 -0
  47. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/entry_points.txt +0 -0
  48. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/requires.txt +0 -0
  49. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/top_level.txt +0 -0
  50. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/setup.cfg +0 -0
  51. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_agents_base.py +0 -0
  52. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_annotation.py +0 -0
  53. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_anthropic_llm.py +0 -0
  54. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_checkpoint_resume.py +0 -0
  55. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_claude_code_llm.py +0 -0
  56. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_cli.py +0 -0
  57. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_console_renderer.py +0 -0
  58. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_executor_bridge.py +0 -0
  59. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_http_fetch.py +0 -0
  60. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_llm_auth.py +0 -0
  61. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_mcp_adapter.py +0 -0
  62. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_mcp_auth.py +0 -0
  63. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_memory.py +0 -0
  64. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_oauth_browser.py +0 -0
  65. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_openai_codex_llm.py +0 -0
  66. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_openai_llm.py +0 -0
  67. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_orchestrator.py +0 -0
  68. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_otel.py +0 -0
  69. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_parse_action_json.py +0 -0
  70. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_redis_store.py +0 -0
  71. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_steering.py +0 -0
  72. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_streaming.py +0 -0
  73. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_tool_policy.py +0 -0
  74. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_utils.py +0 -0
  75. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_vision.py +0 -0
  76. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_working_memory.py +0 -0
  77. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/__init__.py +0 -0
  78. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/builtin/__init__.py +0 -0
  79. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/builtin/fetch_image.py +0 -0
  80. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/builtin/http_fetch.py +0 -0
  81. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/mcp/__init__.py +0 -0
  82. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/mcp/adapter.py +0 -0
  83. {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/mcp/auth.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: react-agent-harness
3
- Version: 0.5.2
3
+ Version: 0.6.0
4
4
  Summary: Multi-agent LLM orchestration: hybrid DAG planning, two-tier memory, streaming
5
5
  Requires-Python: >=3.10
6
6
  License-File: LICENSE
@@ -36,6 +36,8 @@ events to stdout and prints elapsed time + cost at the end.
36
36
  harness/runtime.py AgentRuntime — single entry point, wire once run anything
37
37
  harness/events.py BusEvent + EventType — canonical event vocabulary
38
38
  harness/llm/openai.py OpenAILLM — OpenAI adapter with usage + cost tracking
39
+ harness/llm/fallback.py FallbackLLM — transparent retry on transient upstream errors
40
+ harness/llm/routing.py RoutingLLM — dispatch calls to different adapters by a selector
39
41
  harness/annotation.py Annotation store + AnnotationHook — RLHF trajectory capture
40
42
  harness/hitl.py HITL approval gate — interactive CLI, session-allow list
41
43
  harness/tool_policy.py Persistent tool policy — user-scoped allow rules, CLI management
@@ -188,6 +190,99 @@ llm = ClaudeCodeLLM(
188
190
  )
189
191
  ```
190
192
 
193
+ ### Cost shaping + reliability
194
+
195
+ Three patterns, ordered by how production teams actually solve this:
196
+
197
+ **1. Per-call-site LLM injection (the recommended pattern)**
198
+
199
+ `AgentRuntime` exposes one slot per orchestrator call site. Each defaults to
200
+ `llm` when unset, so existing code keeps working. The classifier and router
201
+ both see only the goal + agent descriptions (~300 tokens) and emit a
202
+ one-token decision — natural candidates for a cheaper model. The planner
203
+ and synthesiser produce structured DAGs and final answers and usually want
204
+ to stay on the main model.
205
+
206
+ ```python
207
+ runtime = AgentRuntime(
208
+ agent_registry=agents,
209
+ tool_registry=tools,
210
+ memory=memory,
211
+ llm=premium, # default — agent ReAct loops use this
212
+ classifier_llm=cheap, # simple vs complex dispatch decision
213
+ router_llm=cheap, # single-agent picker
214
+ # planner_llm=... # defaults to llm; override only if you want
215
+ # synthesizer_llm=... # defaults to llm
216
+ )
217
+ ```
218
+
219
+ No guessing, no keyword matching, no fragility — you read the runtime
220
+ construction and you know exactly which model serves which purpose. The
221
+ budget guard is wired into every distinct LLM instance automatically
222
+ (deduped by object identity, so injecting the same wrapper into multiple
223
+ slots costs no extra calls).
224
+
225
+ **2. `FallbackLLM` for resilience**
226
+
227
+ Try each adapter in order; transparently switch to the next on rate
228
+ limits, timeouts, or 5xx errors:
229
+
230
+ ```python
231
+ from harness.llm.fallback import FallbackLLM
232
+
233
+ llm = FallbackLLM([
234
+ AnthropicLLM(model="claude-sonnet-4-6"), # primary
235
+ OpenAILLM(model="gpt-4o-mini"), # backup
236
+ ])
237
+ runtime = AgentRuntime(..., llm=llm)
238
+ print(llm.last_route) # 0 if primary worked, 1 if backup did
239
+ ```
240
+
241
+ Permanent errors (auth, bad request) propagate immediately — only transient
242
+ upstream errors trigger fallback. Customise with `transient_errors=...`.
243
+ Streaming retries only fire before the first token; mid-stream failures
244
+ propagate to preserve response integrity.
245
+
246
+ **3. `RoutingLLM` for bring-your-own-selector cases**
247
+
248
+ When you need runtime routing — capability gating (`vision` vs
249
+ `long_context`), learned classifiers (RouteLLM-style), cascade
250
+ routing (cheap-then-escalate-on-low-confidence) — wrap a routes dict
251
+ with your own selector callable:
252
+
253
+ ```python
254
+ from harness.llm.routing import RoutingLLM
255
+
256
+ def by_capability(system, messages):
257
+ if _needs_vision(messages):
258
+ return "vision"
259
+ if _estimated_tokens(system, messages) > 100_000:
260
+ return "long_context"
261
+ return "default"
262
+
263
+ llm = RoutingLLM(
264
+ routes={
265
+ "default": OpenAILLM(model="gpt-4o-mini"),
266
+ "vision": OpenAILLM(model="gpt-4o"),
267
+ "long_context": AnthropicLLM(model="claude-sonnet-4-6"),
268
+ },
269
+ selector=by_capability,
270
+ default_route="default",
271
+ )
272
+ ```
273
+
274
+ The harness intentionally does not ship default selectors. Naive selectors
275
+ (keyword matching, fixed token thresholds) misroute in subtle ways and
276
+ encourage the wrong mental model — if you're reaching for one, you almost
277
+ certainly want per-call-site injection instead.
278
+
279
+ Compose freely: `FallbackLLM([premium, backup])` injected into the
280
+ `llm=` slot gives the agent loops resilience, with `classifier_llm=cheap`
281
+ and `router_llm=cheap` shaping the cheap-call cost — all without a custom
282
+ selector.
283
+
284
+ ---
285
+
191
286
  `ClaudeCodeLLM` reads a `claude-code` OAuth entry, refreshes it automatically
192
287
  when expired, and retries once after `401`/`403`. This mirrors Pi's Claude
193
288
  Pro/Max extension approach rather than shelling out to the Claude CLI. The
@@ -722,6 +817,49 @@ The OTEL hook is a side-channel on the existing `Tracer` — the in-memory trace
722
817
  is always available via `result["trace"]` regardless of whether OTEL is enabled.
723
818
  Zero overhead and zero imports when `enable_otel=False`.
724
819
 
820
+ ## Trace recorder + replay + local viewer
821
+
822
+ For local debug and post-mortem inspection without an OTEL backend, the
823
+ harness ships a JSONL trace recorder and a stdlib-only HTML viewer. Wrap
824
+ any streaming call:
825
+
826
+ ```python
827
+ from harness.trace import record_trace, replay
828
+
829
+ async for event in record_trace(runtime.dispatch_stream(goal), "run.jsonl"):
830
+ ... # your normal handling
831
+ ```
832
+
833
+ Each `BusEvent` is flushed per-line, so a partial trace survives a crash.
834
+ View the trace in your browser:
835
+
836
+ ```bash
837
+ agent-harness trace view run.jsonl # opens http://127.0.0.1:8765/
838
+ ```
839
+
840
+ The viewer is a single embedded HTML page — vertical timeline, filter by
841
+ agent / event type / text, expandable per-event JSON. No build step, no
842
+ external services.
843
+
844
+ Replay a trace through `ConsoleRenderer` (great for grepping or piping
845
+ into another script):
846
+
847
+ ```bash
848
+ agent-harness trace replay run.jsonl
849
+ agent-harness trace replay run.jsonl --realtime --speed 2.0
850
+ ```
851
+
852
+ Programmatic replay yields reconstructed `BusEvent` objects:
853
+
854
+ ```python
855
+ async for event in replay("run.jsonl", realtime=False):
856
+ ... # reuse the same loops you write for live streams
857
+ ```
858
+
859
+ This is complementary to OTEL — OTEL is for production observability and
860
+ long-term storage in Jaeger/Datadog; the JSONL recorder is for local
861
+ debugging, sharing reproductions, and replaying past runs.
862
+
725
863
  ## Vision / multimodal agents
726
864
 
727
865
  `WorkingMemory` accepts `str | list` content so image blocks pass through to
@@ -46,6 +46,19 @@ def main() -> int:
46
46
  policy_clear = policy_sub.add_parser("clear", help="remove all policy rules")
47
47
  policy_clear.add_argument("--policy-file", default=str(default_policy_file()))
48
48
 
49
+ trace = sub.add_parser("trace", help="view or replay a recorded run trace")
50
+ trace_sub = trace.add_subparsers(dest="trace_command", required=True)
51
+ trace_view = trace_sub.add_parser("view", help="open a local web viewer for a trace")
52
+ trace_view.add_argument("path", help="path to a JSONL trace produced by record_trace")
53
+ trace_view.add_argument("--port", type=int, default=8765)
54
+ trace_view.add_argument("--no-open", action="store_true", help="don't auto-open the browser")
55
+ trace_replay = trace_sub.add_parser("replay", help="dump a trace to stdout via ConsoleRenderer")
56
+ trace_replay.add_argument("path", help="path to a JSONL trace produced by record_trace")
57
+ trace_replay.add_argument(
58
+ "--realtime", action="store_true", help="preserve recorded inter-event timing"
59
+ )
60
+ trace_replay.add_argument("--speed", type=float, default=1.0, help="realtime speed multiplier")
61
+
49
62
  args = parser.parse_args()
50
63
  try:
51
64
  if args.command == "login":
@@ -71,6 +84,16 @@ def main() -> int:
71
84
  return _policy_revoke(path, args.rule_id)
72
85
  if args.policy_command == "clear":
73
86
  return _policy_clear(path)
87
+ if args.command == "trace":
88
+ if args.trace_command == "view":
89
+ from harness.trace_viewer import serve
90
+
91
+ serve(args.path, port=args.port, open_browser=not args.no_open)
92
+ return 0
93
+ if args.trace_command == "replay":
94
+ return asyncio.run(
95
+ _trace_replay(args.path, realtime=args.realtime, speed=args.speed)
96
+ )
74
97
  except Exception as e:
75
98
  print(f"agent-harness: {e}", file=sys.stderr)
76
99
  return 1
@@ -180,5 +203,16 @@ def _policy_clear(path: Path) -> int:
180
203
  return 0
181
204
 
182
205
 
206
async def _trace_replay(path: str, *, realtime: bool, speed: float) -> int:
    """Replay the trace stored at *path* through a ``ConsoleRenderer``.

    ``realtime`` and ``speed`` are handed unchanged to
    ``harness.trace.replay``; every event it yields is passed to the
    renderer's ``render`` method.

    Returns:
        ``0`` once the replay has been fully consumed.
    """
    # Imported at call time rather than at module import.
    from harness.console import ConsoleRenderer
    from harness.trace import replay

    console = ConsoleRenderer()
    recorded = replay(path, realtime=realtime, speed=speed)
    async for item in recorded:
        console.render(item)
    return 0
215
+
216
+
183
217
  if __name__ == "__main__":
184
218
  raise SystemExit(main())
@@ -0,0 +1,169 @@
1
+ """``FallbackLLM`` — try multiple LLM clients in order on transient failures.
2
+
3
+ Wraps any number of LLM adapters that share the standard harness contract
4
+ (``complete``, optionally ``stream_complete``, ``set_budget``, ``last_usage``).
5
+ On a transient error (rate limit, timeout, 5xx) the next adapter in the list
6
+ is tried. The first non-transient error — or exhausting the list — re-raises.
7
+
8
+ Example::
9
+
10
+ from harness.llm.openai import OpenAILLM
11
+ from harness.llm.anthropic import AnthropicLLM
12
+ from harness.llm.fallback import FallbackLLM
13
+
14
+ primary = AnthropicLLM(model="claude-sonnet-4-6")
15
+ backup = OpenAILLM(model="gpt-4o-mini")
16
+ llm = FallbackLLM([primary, backup])
17
+
18
+ runtime = AgentRuntime(..., llm=llm)
19
+
20
+ Set ``transient_errors`` to a callable that returns True when the exception
21
+ should trigger the next fallback. The default heuristic catches rate-limit
22
+ and 5xx-class errors from the OpenAI and Anthropic SDKs and any
23
+ ``asyncio.TimeoutError`` / ``ConnectionError`` / ``OSError`` raised by the
24
+ transport.
25
+
26
+ ``last_route`` exposes the index of the adapter that actually answered the
27
+ most recent call, so callers can see which one was hit::
28
+
29
+ await llm.complete(system, messages)
30
+ print(llm.last_route) # 0 if primary worked, 1 if backup did, ...
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import asyncio
36
+ import logging
37
+ from collections.abc import AsyncGenerator, Callable
38
+ from typing import Any
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
+ def _default_is_transient(exc: BaseException) -> bool:
44
+ """Best-effort classifier for retryable upstream errors.
45
+
46
+ Detects without importing the SDKs (so the fallback adapter has no
47
+ optional-dep coupling):
48
+ - ``status_code`` attr in {408, 425, 429, 500, 502, 503, 504}
49
+ - class name suffixed with ``RateLimitError`` / ``ServiceUnavailableError``
50
+ / ``APITimeoutError`` / ``InternalServerError`` / ``OverloadedError``
51
+ - ``asyncio.TimeoutError``, ``ConnectionError``, ``OSError``
52
+ """
53
+ if isinstance(exc, asyncio.TimeoutError | ConnectionError | OSError):
54
+ return True
55
+ status = getattr(exc, "status_code", None)
56
+ if isinstance(status, int) and status in {408, 425, 429, 500, 502, 503, 504}:
57
+ return True
58
+ name = type(exc).__name__
59
+ transient_suffixes = (
60
+ "RateLimitError",
61
+ "ServiceUnavailableError",
62
+ "APITimeoutError",
63
+ "InternalServerError",
64
+ "OverloadedError",
65
+ "TimeoutError",
66
+ )
67
+ return any(name.endswith(s) for s in transient_suffixes)
68
+
69
+
70
class FallbackLLM:
    """Try a list of LLM adapters in order, moving on after transient errors.

    Each call is offered to the adapters in the order they were given. When
    an adapter raises an exception that the ``transient_errors`` classifier
    accepts, the next adapter is tried; any other exception, or a failure of
    the last adapter in the list, is re-raised to the caller.

    Only ``Exception`` subclasses are ever considered for fallback.
    ``asyncio.CancelledError``, ``KeyboardInterrupt`` and other bare
    ``BaseException`` types always propagate, so a permissive custom
    classifier cannot swallow task cancellation.
    """

    def __init__(
        self,
        llms: list[Any],
        *,
        transient_errors: Callable[[BaseException], bool] | None = None,
    ) -> None:
        """Store the adapters and the transient-error classifier.

        Args:
            llms: Inner adapters, tried from first to last. Copied, so later
                mutation of the caller's list has no effect.
            transient_errors: Callable returning ``True`` when an exception
                should trigger the next adapter. Defaults to
                ``_default_is_transient``.

        Raises:
            ValueError: If ``llms`` is empty.
        """
        if not llms:
            raise ValueError("FallbackLLM requires at least one inner LLM")
        self._llms = list(llms)
        self._is_transient = transient_errors or _default_is_transient
        # Index of the adapter that served the most recent call; -1 until a
        # call has succeeded.
        self.last_route: int = -1
        # ``last_usage`` of the adapter that served the most recent call, or
        # None when that adapter does not expose one.
        self.last_usage: dict | None = None

    def set_budget(self, guard: Any) -> None:
        """Forward the budget guard to every inner LLM that accepts one."""
        for llm in self._llms:
            if hasattr(llm, "set_budget"):
                llm.set_budget(guard)

    # ── Non-streaming ────────────────────────────────────────────────────────

    async def complete(
        self,
        system: str | None,
        messages: list[dict],
        **kwargs: Any,
    ) -> dict:
        """Return the first successful ``complete`` result from the adapters.

        Extra keyword arguments are passed through to each adapter. On
        success ``last_route`` and ``last_usage`` are updated to describe the
        adapter that answered.

        Raises:
            Exception: Whatever the failing adapter raised, when the error is
                not transient or the adapter was the last one in the list.
        """
        final = len(self._llms) - 1
        for i, llm in enumerate(self._llms):
            try:
                result = await llm.complete(system, messages, **kwargs)
            except Exception as exc:
                # ``Exception`` rather than ``BaseException``: cancellation
                # and interpreter-exit signals must never reach the
                # classifier or be turned into a fallback attempt.
                if i == final or not self._is_transient(exc):
                    raise
                logger.warning(
                    "FallbackLLM: adapter %d (%s) raised transient %s — trying next",
                    i,
                    type(llm).__name__,
                    type(exc).__name__,
                )
                continue
            self.last_route = i
            self.last_usage = getattr(llm, "last_usage", None)
            return result
        # Unreachable: ``__init__`` rejects an empty list and every iteration
        # above either returns or re-raises.
        raise AssertionError("FallbackLLM.complete: no adapters were tried")

    # ── Streaming ────────────────────────────────────────────────────────────

    async def stream_complete(
        self,
        system: str | None,
        messages: list[dict],
    ) -> AsyncGenerator[str, None]:
        """Stream from the first adapter that doesn't fail before yielding.

        We can only retry until the first token has been emitted — once the
        caller has seen partial output, a switch mid-stream would corrupt the
        response. The transient check therefore runs against errors raised
        before the generator yields anything. Adapters without a
        ``stream_complete`` attribute are skipped.

        ``last_route`` is set as soon as an adapter produces its first chunk
        (or finishes empty); ``last_usage`` is set after the stream has been
        fully consumed.

        Raises:
            NotImplementedError: If none of the adapters implements
                ``stream_complete``.
            Exception: Whatever the failing adapter raised, when the error is
                not transient, happened after the first chunk, or no further
                streaming-capable adapter remains.
        """
        final = len(self._llms) - 1
        last_exc: Exception | None = None
        for i, llm in enumerate(self._llms):
            if not hasattr(llm, "stream_complete"):
                continue
            try:
                gen = llm.stream_complete(system, messages)
                first = await _peek_first(gen)
            except Exception as exc:
                # See ``complete``: never classify cancellation/exit signals.
                if i == final or not self._is_transient(exc):
                    raise
                logger.warning(
                    "FallbackLLM(stream): adapter %d (%s) raised transient %s "
                    "before first token — trying next",
                    i,
                    type(llm).__name__,
                    type(exc).__name__,
                )
                last_exc = exc
                continue
            self.last_route = i
            if first is not None:
                yield first
            async for chunk in gen:
                yield chunk
            self.last_usage = getattr(llm, "last_usage", None)
            return
        if last_exc is not None:
            # A streaming adapter failed transiently and every adapter after
            # it lacked ``stream_complete``; surface that failure.
            raise last_exc
        # No adapter was even attempted: fail with an explicit, descriptive
        # error instead of tripping an assertion.
        raise NotImplementedError(
            "FallbackLLM: none of the inner LLMs implements stream_complete"
        )


async def _peek_first(gen: AsyncGenerator[str, None]) -> str | None:
    """Pull the first item from an async generator, or None if exhausted."""
    try:
        return await gen.__anext__()
    except StopAsyncIteration:
        return None
@@ -0,0 +1,139 @@
1
+ """``RoutingLLM`` — dispatch each LLM call to a different adapter by a selector.
2
+
3
+ **For agent-harness's own call sites, prefer per-call-site injection** —
4
+ ``AgentRuntime`` exposes ``classifier_llm`` / ``router_llm`` and
5
+ ``Orchestrator`` exposes ``planner_llm`` / ``synthesizer_llm``. That's the
6
+ production-style pattern: each call site is hard-wired to a model chosen
7
+ for that workload's cost / quality / latency budget, no runtime guessing.
8
+
9
+ ``RoutingLLM`` is the **bring-your-own-selector primitive** for cases
10
+ where per-call-site injection isn't enough:
11
+
12
+ - You're wrapping an existing harness instance you can't restructure.
13
+ - You're routing based on **capability** (does this query need
14
+ vision / function calling / >200K context?) — that's a real
15
+ production pattern, but the metadata is provider-specific so the
16
+ selector has to be yours.
17
+ - You're doing **learned routing** (RouteLLM-style classifier) where
18
+ the selector is a small ML model.
19
+ - You're doing **cascade routing** (cheap-then-escalate-on-low-confidence)
20
+ via a custom selector that inspects prior responses.
21
+
22
+ Wire it up with your own selector callable that returns a key from the
23
+ ``routes`` dict::
24
+
25
+ from harness.llm.routing import RoutingLLM
26
+
27
+ def my_capability_selector(system, messages):
28
+ # Inspect the call's requirements and pick the cheapest viable model.
29
+ if _needs_vision(messages):
30
+ return "vision"
31
+ if _estimated_tokens(system, messages) > 100_000:
32
+ return "long_context"
33
+ return "default"
34
+
35
+ llm = RoutingLLM(
36
+ routes={
37
+ "default": OpenAILLM(model="gpt-4o-mini"),
38
+ "vision": OpenAILLM(model="gpt-4o"),
39
+ "long_context": AnthropicLLM(model="claude-sonnet-4-6"),
40
+ },
41
+ selector=my_capability_selector,
42
+ default_route="default",
43
+ )
44
+
45
+ The harness does **not** ship default selectors. Naive selectors
46
+ (keyword matching, fixed token thresholds) misroute in subtle ways and
47
+ encourage the wrong mental model. If you find yourself reaching for one,
48
+ the per-call-site injection path on ``AgentRuntime`` / ``Orchestrator``
49
+ is almost certainly what you actually want.
50
+
51
+ ``last_route`` exposes the key of the route that handled the most recent
52
+ call — handy for logging and tests.
53
+ """
54
+
55
+ from __future__ import annotations
56
+
57
+ import logging
58
+ from collections.abc import AsyncGenerator, Callable, Mapping
59
+ from typing import Any
60
+
61
+ logger = logging.getLogger(__name__)
62
+
63
+
64
+ Selector = Callable[[str | None, list[dict]], str]
65
+
66
+
67
class RoutingLLM:
    """Dispatch each LLM call to one of several adapters chosen by a selector.

    The selector is called with ``(system, messages)`` for every call and
    returns a key into ``routes``. If the selector raises, or returns a value
    that is not a usable key, the call is served by ``default_route`` and a
    warning is logged.
    """

    def __init__(
        self,
        routes: Mapping[str, Any],
        *,
        selector: Selector,
        default_route: str,
    ) -> None:
        """Validate and store the routing table.

        Args:
            routes: Mapping of route key to LLM adapter. Copied into a plain
                dict, so later mutation of the caller's mapping has no effect.
            selector: Callable ``(system, messages) -> route key``.
            default_route: Key used when the selector fails or returns an
                unknown key.

        Raises:
            ValueError: If ``routes`` is empty or ``default_route`` is not one
                of its keys.
        """
        if not routes:
            raise ValueError("RoutingLLM requires at least one route")
        if default_route not in routes:
            raise ValueError(f"default_route {default_route!r} is not in routes")
        self._routes = dict(routes)
        self._selector = selector
        self._default_route = default_route
        # Key of the route chosen for the most recent call; starts out as the
        # default route before any call has been made.
        self.last_route: str = default_route
        # ``last_usage`` of the adapter that served the most recent call, or
        # None when that adapter does not expose one.
        self.last_usage: dict | None = None

    def set_budget(self, guard: Any) -> None:
        """Forward the budget guard to every routed LLM that accepts one."""
        for llm in self._routes.values():
            if hasattr(llm, "set_budget"):
                llm.set_budget(guard)

    def _pick(self, system: str | None, messages: list[dict]) -> tuple[str, Any]:
        """Return ``(route key, adapter)`` for this call.

        Never raises because of the selector: an exception from it, an
        unknown key, or an unhashable return value all resolve to the
        default route.
        """
        default_key = self._default_route
        try:
            key = self._selector(system, messages)
        except Exception as e:  # noqa: BLE001 — fall back gracefully
            logger.warning("RoutingLLM selector raised %s — using default route", e)
            return default_key, self._routes[default_key]
        try:
            return key, self._routes[key]
        except (KeyError, TypeError):
            # KeyError: the key is not a configured route.
            # TypeError: the selector returned something unhashable (e.g. a
            # list), which cannot be looked up at all.
            logger.warning(
                "RoutingLLM selector returned unknown key %r — using default route %r",
                key,
                default_key,
            )
            return default_key, self._routes[default_key]

    # ── Non-streaming ────────────────────────────────────────────────────────

    async def complete(
        self,
        system: str | None,
        messages: list[dict],
        **kwargs: Any,
    ) -> dict:
        """Route the call and return the chosen adapter's ``complete`` result.

        ``last_route`` is updated before the adapter is invoked, so it names
        the attempted route even when the adapter raises; ``last_usage`` is
        updated only after a successful call.
        """
        key, llm = self._pick(system, messages)
        self.last_route = key
        result = await llm.complete(system, messages, **kwargs)
        self.last_usage = getattr(llm, "last_usage", None)
        return result

    # ── Streaming ────────────────────────────────────────────────────────────

    async def stream_complete(
        self,
        system: str | None,
        messages: list[dict],
    ) -> AsyncGenerator[str, None]:
        """Route the call and stream the chosen adapter's output.

        For a route without ``stream_complete`` the adapter's ``complete`` is
        awaited instead and its ``"text"`` entry (or ``str(result)`` when the
        result is not a dict) is yielded as a single chunk if non-empty.
        ``last_usage`` is updated once the stream has been fully consumed.
        """
        key, llm = self._pick(system, messages)
        self.last_route = key
        if not hasattr(llm, "stream_complete"):
            # Fall back to non-streaming for routes that don't implement it.
            result = await llm.complete(system, messages)
            text = result.get("text", "") if isinstance(result, dict) else str(result)
            if text:
                yield text
            self.last_usage = getattr(llm, "last_usage", None)
            return
        async for chunk in llm.stream_complete(system, messages):
            yield chunk
        self.last_usage = getattr(llm, "last_usage", None)
@@ -286,11 +286,36 @@ class AgentRuntime:
286
286
  annotation_store: Any | None = None, # InMemoryAnnotationStore or compatible
287
287
  checkpoint_store: Any | None = None, # FileCheckpointStore / RedisCheckpointStore
288
288
  steering_source_factory: Any | None = None, # passed to each spawned BaseAgent
289
+ # ── Optional per-call-site LLM overrides ──────────────────────────────
290
+ # Each defaults to ``llm`` when unset. The dispatch classifier and the
291
+ # single-agent router both see only the goal + agent descriptions
292
+ # (~300 tokens) and emit a one-token decision — they're the natural
293
+ # candidates for a cheaper model. The planner and synthesiser produce
294
+ # structured DAGs and final answers and should usually stay on the
295
+ # main model. See README "Cost shaping + reliability" for the pattern.
296
+ classifier_llm: Any | None = None,
297
+ router_llm: Any | None = None,
298
+ planner_llm: Any | None = None,
299
+ synthesizer_llm: Any | None = None,
289
300
  ) -> None:
290
301
  self._agent_registry = agent_registry
291
302
  self._tool_registry = tool_registry
292
303
  self._memory = memory
293
304
  self._llm = llm
305
+ self._classifier_llm = classifier_llm or llm
306
+ self._router_llm = router_llm or llm
307
+ self._planner_llm = planner_llm or llm
308
+ self._synthesizer_llm = synthesizer_llm or llm
309
+ # ``set_budget`` should reach every distinct LLM instance — if the
310
+ # user injected the same wrapper into multiple slots, dedupe by
311
+ # object identity so we don't call it N times.
312
+ self._budget_targets: list[Any] = []
313
+ for candidate in (llm, classifier_llm, router_llm, planner_llm, synthesizer_llm):
314
+ if candidate is None:
315
+ continue
316
+ if any(candidate is existing for existing in self._budget_targets):
317
+ continue
318
+ self._budget_targets.append(candidate)
294
319
  self._guardrail_config = guardrail_config or GuardrailConfig()
295
320
  self._enable_otel = enable_otel
296
321
  self._annotation_store = annotation_store
@@ -307,6 +332,16 @@ class AgentRuntime:
307
332
  checkpoint_store = FileCheckpointStore()
308
333
  self._checkpoint_store = checkpoint_store
309
334
 
335
+ def _attach_budget(self, guard: Any) -> None:
336
+ """Wire the per-run budget guard into every distinct LLM instance.
337
+
338
+ Duck-typed: adapters that don't implement ``set_budget`` (e.g. a
339
+ bare custom client) are skipped silently.
340
+ """
341
+ for target in self._budget_targets:
342
+ if hasattr(target, "set_budget"):
343
+ target.set_budget(guard)
344
+
310
345
  def _steering_lifecycle(self):
311
346
  """Wrap the dispatch in the steering factory's lifecycle if it has one.
312
347
 
@@ -343,8 +378,7 @@ class AgentRuntime:
343
378
  from agents.base import BaseAgent
344
379
 
345
380
  guard = BudgetGuard(self._guardrail_config)
346
- if hasattr(self._llm, "set_budget"):
347
- self._llm.set_budget(guard)
381
+ self._attach_budget(guard)
348
382
 
349
383
  config = self._agent_registry.get(agent_id)
350
384
  agent = BaseAgent(
@@ -393,8 +427,7 @@ class AgentRuntime:
393
427
 
394
428
  config = self._agent_registry.get(checkpoint["agent_id"])
395
429
  guard = BudgetGuard(self._guardrail_config)
396
- if hasattr(self._llm, "set_budget"):
397
- self._llm.set_budget(guard)
430
+ self._attach_budget(guard)
398
431
  tracer = self._make_tracer()
399
432
 
400
433
  agent = BaseAgent(
@@ -501,8 +534,7 @@ class AgentRuntime:
501
534
  outer_run_id = checkpoint["run_id"]
502
535
  config = self._agent_registry.get(checkpoint["agent_id"])
503
536
  guard = BudgetGuard(self._guardrail_config)
504
- if hasattr(self._llm, "set_budget"):
505
- self._llm.set_budget(guard)
537
+ self._attach_budget(guard)
506
538
  tracer = self._make_tracer()
507
539
  agent = BaseAgent(
508
540
  config=config,
@@ -557,8 +589,7 @@ class AgentRuntime:
557
589
  # Adapters that implement set_budget(guard) (e.g. OpenAILLM) get the
558
590
  # fresh per-run guard so they can call add_cost() on every completion.
559
591
  # Duck-typed so users can plug in any LLM client that doesn't.
560
- if hasattr(self._llm, "set_budget"):
561
- self._llm.set_budget(guard)
592
+ self._attach_budget(guard)
562
593
 
563
594
  # state lives in memory, not agents — instantiate fresh per run
564
595
  agents = {
@@ -583,6 +614,8 @@ class AgentRuntime:
583
614
  tracer=tracer,
584
615
  guard=guard,
585
616
  llm=self._llm,
617
+ planner_llm=self._planner_llm,
618
+ synthesizer_llm=self._synthesizer_llm,
586
619
  eval_config=EvalConfig(
587
620
  confidence_threshold=self._guardrail_config.confidence_threshold,
588
621
  max_replan_count=self._guardrail_config.max_replan_count,
@@ -605,7 +638,7 @@ class AgentRuntime:
605
638
  f" {aid}: {self._agent_registry.get(aid).role}"
606
639
  for aid in self._agent_registry.all_ids()
607
640
  )
608
- response = await self._llm.complete(
641
+ response = await self._classifier_llm.complete(
609
642
  system=_CLASSIFIER_SYSTEM.format(agent_descriptions=agent_descriptions),
610
643
  messages=[{"role": "user", "content": f"Goal: {goal}"}],
611
644
  response_format={"type": "json_object"},
@@ -708,7 +741,7 @@ class AgentRuntime:
708
741
  agent_descriptions = "\n".join(
709
742
  f" {aid}: {self._agent_registry.get(aid).role}" for aid in all_ids
710
743
  )
711
- response = await self._llm.complete(
744
+ response = await self._router_llm.complete(
712
745
  system=_ROUTER_SYSTEM.format(agent_descriptions=agent_descriptions),
713
746
  messages=[{"role": "user", "content": f"Goal: {goal}"}],
714
747
  response_format={"type": "json_object"},