react-agent-harness 0.5.2__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {react_agent_harness-0.5.2/react_agent_harness.egg-info → react_agent_harness-0.6.0}/PKG-INFO +1 -1
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/README.md +138 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/cli.py +34 -0
- react_agent_harness-0.6.0/harness/llm/fallback.py +169 -0
- react_agent_harness-0.6.0/harness/llm/routing.py +139 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/runtime.py +43 -10
- react_agent_harness-0.6.0/harness/trace.py +171 -0
- react_agent_harness-0.6.0/harness/trace_viewer.py +326 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/orchestrator/planner.py +12 -3
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/pyproject.toml +1 -1
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0/react_agent_harness.egg-info}/PKG-INFO +1 -1
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/SOURCES.txt +8 -0
- react_agent_harness-0.6.0/tests/test_fallback_llm.py +221 -0
- react_agent_harness-0.6.0/tests/test_per_call_site_llm.py +289 -0
- react_agent_harness-0.6.0/tests/test_routing_llm.py +164 -0
- react_agent_harness-0.6.0/tests/test_trace.py +240 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/LICENSE +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/agents/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/agents/base.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/annotation.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/checkpoint.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/console.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/events.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/executor_bridge.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/hitl.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/_streaming.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/anthropic.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/auth.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/claude_code.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/openai.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/llm/openai_codex.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/oauth_browser.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/otel.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/steering.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/tool_policy.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/harness/utils.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/episodic_lance.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/manager.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/redis_store.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/stores.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/memory/working.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/orchestrator/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/dependency_links.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/entry_points.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/requires.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/react_agent_harness.egg-info/top_level.txt +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/setup.cfg +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_agents_base.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_annotation.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_anthropic_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_checkpoint_resume.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_claude_code_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_cli.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_console_renderer.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_executor_bridge.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_http_fetch.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_llm_auth.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_mcp_adapter.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_mcp_auth.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_memory.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_oauth_browser.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_openai_codex_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_openai_llm.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_orchestrator.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_otel.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_parse_action_json.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_redis_store.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_steering.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_streaming.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_tool_policy.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_utils.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_vision.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tests/test_working_memory.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/builtin/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/builtin/fetch_image.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/builtin/http_fetch.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/mcp/__init__.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/mcp/adapter.py +0 -0
- {react_agent_harness-0.5.2 → react_agent_harness-0.6.0}/tools/mcp/auth.py +0 -0
|
@@ -36,6 +36,8 @@ events to stdout and prints elapsed time + cost at the end.
|
|
|
36
36
|
harness/runtime.py AgentRuntime — single entry point, wire once run anything
|
|
37
37
|
harness/events.py BusEvent + EventType — canonical event vocabulary
|
|
38
38
|
harness/llm/openai.py OpenAILLM — OpenAI adapter with usage + cost tracking
|
|
39
|
+
harness/llm/fallback.py FallbackLLM — transparent retry on transient upstream errors
|
|
40
|
+
harness/llm/routing.py RoutingLLM — dispatch calls to different adapters by a selector
|
|
39
41
|
harness/annotation.py Annotation store + AnnotationHook — RLHF trajectory capture
|
|
40
42
|
harness/hitl.py HITL approval gate — interactive CLI, session-allow list
|
|
41
43
|
harness/tool_policy.py Persistent tool policy — user-scoped allow rules, CLI management
|
|
@@ -188,6 +190,99 @@ llm = ClaudeCodeLLM(
|
|
|
188
190
|
)
|
|
189
191
|
```
|
|
190
192
|
|
|
193
|
+
### Cost shaping + reliability
|
|
194
|
+
|
|
195
|
+
Three patterns, ordered by how production teams actually solve this:
|
|
196
|
+
|
|
197
|
+
**1. Per-call-site LLM injection (the recommended pattern)**
|
|
198
|
+
|
|
199
|
+
`AgentRuntime` exposes one slot per orchestrator call site. Each defaults to
|
|
200
|
+
`llm` when unset, so existing code keeps working. The classifier and router
|
|
201
|
+
both see only the goal + agent descriptions (~300 tokens) and emit a
|
|
202
|
+
one-token decision — natural candidates for a cheaper model. The planner
|
|
203
|
+
and synthesiser produce structured DAGs and final answers and usually want
|
|
204
|
+
to stay on the main model.
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
runtime = AgentRuntime(
|
|
208
|
+
agent_registry=agents,
|
|
209
|
+
tool_registry=tools,
|
|
210
|
+
memory=memory,
|
|
211
|
+
llm=premium, # default — agent ReAct loops use this
|
|
212
|
+
classifier_llm=cheap, # simple vs complex dispatch decision
|
|
213
|
+
router_llm=cheap, # single-agent picker
|
|
214
|
+
# planner_llm=... # defaults to llm; override only if you want
|
|
215
|
+
# synthesizer_llm=... # defaults to llm
|
|
216
|
+
)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
No guessing, no keyword matching, no fragility — you read the runtime
|
|
220
|
+
construction and you know exactly which model serves which purpose. The
|
|
221
|
+
budget guard is wired into every distinct LLM instance automatically
|
|
222
|
+
(deduped by object identity, so injecting the same wrapper into multiple
|
|
223
|
+
slots costs no extra calls).
|
|
224
|
+
|
|
225
|
+
**2. `FallbackLLM` for resilience**
|
|
226
|
+
|
|
227
|
+
Try each adapter in order; transparently switch to the next on rate
|
|
228
|
+
limits, timeouts, or 5xx errors:
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
from harness.llm.fallback import FallbackLLM
|
|
232
|
+
|
|
233
|
+
llm = FallbackLLM([
|
|
234
|
+
AnthropicLLM(model="claude-sonnet-4-6"), # primary
|
|
235
|
+
OpenAILLM(model="gpt-4o-mini"), # backup
|
|
236
|
+
])
|
|
237
|
+
runtime = AgentRuntime(..., llm=llm)
|
|
238
|
+
print(llm.last_route) # 0 if primary worked, 1 if backup did
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Permanent errors (auth, bad request) propagate immediately — only transient
|
|
242
|
+
upstream errors trigger fallback. Customise with `transient_errors=...`.
|
|
243
|
+
Streaming retries only fire before the first token; mid-stream failures
|
|
244
|
+
propagate to preserve response integrity.
|
|
245
|
+
|
|
246
|
+
**3. `RoutingLLM` for bring-your-own-selector cases**
|
|
247
|
+
|
|
248
|
+
When you need runtime routing — capability gating (`vision` vs
|
|
249
|
+
`long_context`), learned classifiers (RouteLLM-style), cascade
|
|
250
|
+
routing (cheap-then-escalate-on-low-confidence) — wrap a routes dict
|
|
251
|
+
with your own selector callable:
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
from harness.llm.routing import RoutingLLM
|
|
255
|
+
|
|
256
|
+
def by_capability(system, messages):
|
|
257
|
+
if _needs_vision(messages):
|
|
258
|
+
return "vision"
|
|
259
|
+
if _estimated_tokens(system, messages) > 100_000:
|
|
260
|
+
return "long_context"
|
|
261
|
+
return "default"
|
|
262
|
+
|
|
263
|
+
llm = RoutingLLM(
|
|
264
|
+
routes={
|
|
265
|
+
"default": OpenAILLM(model="gpt-4o-mini"),
|
|
266
|
+
"vision": OpenAILLM(model="gpt-4o"),
|
|
267
|
+
"long_context": AnthropicLLM(model="claude-sonnet-4-6"),
|
|
268
|
+
},
|
|
269
|
+
selector=by_capability,
|
|
270
|
+
default_route="default",
|
|
271
|
+
)
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
The harness intentionally does not ship default selectors. Naive selectors
|
|
275
|
+
(keyword matching, fixed token thresholds) misroute in subtle ways and
|
|
276
|
+
encourage the wrong mental model — if you're reaching for one, you almost
|
|
277
|
+
certainly want per-call-site injection instead.
|
|
278
|
+
|
|
279
|
+
Compose freely: `FallbackLLM([premium, backup])` injected into the
|
|
280
|
+
`llm=` slot gives the agent loops resilience, with `classifier_llm=cheap`
|
|
281
|
+
and `router_llm=cheap` shaping the cheap-call cost — all without a custom
|
|
282
|
+
selector.
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
191
286
|
`ClaudeCodeLLM` reads a `claude-code` OAuth entry, refreshes it automatically
|
|
192
287
|
when expired, and retries once after `401`/`403`. This mirrors Pi's Claude
|
|
193
288
|
Pro/Max extension approach rather than shelling out to the Claude CLI. The
|
|
@@ -722,6 +817,49 @@ The OTEL hook is a side-channel on the existing `Tracer` — the in-memory trace
|
|
|
722
817
|
is always available via `result["trace"]` regardless of whether OTEL is enabled.
|
|
723
818
|
Zero overhead and zero imports when `enable_otel=False`.
|
|
724
819
|
|
|
820
|
+
## Trace recorder + replay + local viewer
|
|
821
|
+
|
|
822
|
+
For local debug and post-mortem inspection without an OTEL backend, the
|
|
823
|
+
harness ships a JSONL trace recorder and a stdlib-only HTML viewer. Wrap
|
|
824
|
+
any streaming call:
|
|
825
|
+
|
|
826
|
+
```python
|
|
827
|
+
from harness.trace import record_trace, replay
|
|
828
|
+
|
|
829
|
+
async for event in record_trace(runtime.dispatch_stream(goal), "run.jsonl"):
|
|
830
|
+
... # your normal handling
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
Each `BusEvent` is flushed per-line, so a partial trace survives a crash.
|
|
834
|
+
View the trace in your browser:
|
|
835
|
+
|
|
836
|
+
```bash
|
|
837
|
+
agent-harness trace view run.jsonl # opens http://127.0.0.1:8765/
|
|
838
|
+
```
|
|
839
|
+
|
|
840
|
+
The viewer is a single embedded HTML page — vertical timeline, filter by
|
|
841
|
+
agent / event type / text, expandable per-event JSON. No build step, no
|
|
842
|
+
external services.
|
|
843
|
+
|
|
844
|
+
Replay a trace through `ConsoleRenderer` (great for grepping or piping
|
|
845
|
+
into another script):
|
|
846
|
+
|
|
847
|
+
```bash
|
|
848
|
+
agent-harness trace replay run.jsonl
|
|
849
|
+
agent-harness trace replay run.jsonl --realtime --speed 2.0
|
|
850
|
+
```
|
|
851
|
+
|
|
852
|
+
Programmatic replay yields reconstructed `BusEvent` objects:
|
|
853
|
+
|
|
854
|
+
```python
|
|
855
|
+
async for event in replay("run.jsonl", realtime=False):
|
|
856
|
+
... # reuse the same loops you write for live streams
|
|
857
|
+
```
|
|
858
|
+
|
|
859
|
+
This is complementary to OTEL — OTEL is for production observability and
|
|
860
|
+
long-term storage in Jaeger/Datadog; the JSONL recorder is for local
|
|
861
|
+
debugging, sharing reproductions, and replaying past runs.
|
|
862
|
+
|
|
725
863
|
## Vision / multimodal agents
|
|
726
864
|
|
|
727
865
|
`WorkingMemory` accepts `str | list` content so image blocks pass through to
|
|
@@ -46,6 +46,19 @@ def main() -> int:
|
|
|
46
46
|
policy_clear = policy_sub.add_parser("clear", help="remove all policy rules")
|
|
47
47
|
policy_clear.add_argument("--policy-file", default=str(default_policy_file()))
|
|
48
48
|
|
|
49
|
+
trace = sub.add_parser("trace", help="view or replay a recorded run trace")
|
|
50
|
+
trace_sub = trace.add_subparsers(dest="trace_command", required=True)
|
|
51
|
+
trace_view = trace_sub.add_parser("view", help="open a local web viewer for a trace")
|
|
52
|
+
trace_view.add_argument("path", help="path to a JSONL trace produced by record_trace")
|
|
53
|
+
trace_view.add_argument("--port", type=int, default=8765)
|
|
54
|
+
trace_view.add_argument("--no-open", action="store_true", help="don't auto-open the browser")
|
|
55
|
+
trace_replay = trace_sub.add_parser("replay", help="dump a trace to stdout via ConsoleRenderer")
|
|
56
|
+
trace_replay.add_argument("path", help="path to a JSONL trace produced by record_trace")
|
|
57
|
+
trace_replay.add_argument(
|
|
58
|
+
"--realtime", action="store_true", help="preserve recorded inter-event timing"
|
|
59
|
+
)
|
|
60
|
+
trace_replay.add_argument("--speed", type=float, default=1.0, help="realtime speed multiplier")
|
|
61
|
+
|
|
49
62
|
args = parser.parse_args()
|
|
50
63
|
try:
|
|
51
64
|
if args.command == "login":
|
|
@@ -71,6 +84,16 @@ def main() -> int:
|
|
|
71
84
|
return _policy_revoke(path, args.rule_id)
|
|
72
85
|
if args.policy_command == "clear":
|
|
73
86
|
return _policy_clear(path)
|
|
87
|
+
if args.command == "trace":
|
|
88
|
+
if args.trace_command == "view":
|
|
89
|
+
from harness.trace_viewer import serve
|
|
90
|
+
|
|
91
|
+
serve(args.path, port=args.port, open_browser=not args.no_open)
|
|
92
|
+
return 0
|
|
93
|
+
if args.trace_command == "replay":
|
|
94
|
+
return asyncio.run(
|
|
95
|
+
_trace_replay(args.path, realtime=args.realtime, speed=args.speed)
|
|
96
|
+
)
|
|
74
97
|
except Exception as e:
|
|
75
98
|
print(f"agent-harness: {e}", file=sys.stderr)
|
|
76
99
|
return 1
|
|
@@ -180,5 +203,16 @@ def _policy_clear(path: Path) -> int:
|
|
|
180
203
|
return 0
|
|
181
204
|
|
|
182
205
|
|
|
206
|
+
async def _trace_replay(path: str, *, realtime: bool, speed: float) -> int:
    """Replay the JSONL trace at ``path`` and print every event to the console.

    ``realtime`` and ``speed`` are passed through unchanged to
    ``harness.trace.replay``. Each replayed event is handed to a fresh
    ``ConsoleRenderer``. Always returns ``0`` once the trace is exhausted.
    """
    # Imported lazily so the CLI only loads these modules for this subcommand.
    from harness.console import ConsoleRenderer
    from harness.trace import replay

    console = ConsoleRenderer()
    recorded_events = replay(path, realtime=realtime, speed=speed)
    async for recorded in recorded_events:
        console.render(recorded)
    return 0
|
|
215
|
+
|
|
216
|
+
|
|
183
217
|
if __name__ == "__main__":
|
|
184
218
|
raise SystemExit(main())
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""``FallbackLLM`` — try multiple LLM clients in order on transient failures.
|
|
2
|
+
|
|
3
|
+
Wraps any number of LLM adapters that share the standard harness contract
|
|
4
|
+
(``complete``, optionally ``stream_complete``, ``set_budget``, ``last_usage``).
|
|
5
|
+
On a transient error (rate limit, timeout, 5xx) the next adapter in the list
|
|
6
|
+
is tried. The first non-transient error — or exhausting the list — re-raises.
|
|
7
|
+
|
|
8
|
+
Example::
|
|
9
|
+
|
|
10
|
+
from harness.llm.openai import OpenAILLM
|
|
11
|
+
from harness.llm.anthropic import AnthropicLLM
|
|
12
|
+
from harness.llm.fallback import FallbackLLM
|
|
13
|
+
|
|
14
|
+
primary = AnthropicLLM(model="claude-sonnet-4-6")
|
|
15
|
+
backup = OpenAILLM(model="gpt-4o-mini")
|
|
16
|
+
llm = FallbackLLM([primary, backup])
|
|
17
|
+
|
|
18
|
+
runtime = AgentRuntime(..., llm=llm)
|
|
19
|
+
|
|
20
|
+
Set ``transient_errors`` to a callable that returns True when the exception
|
|
21
|
+
should trigger the next fallback. The default heuristic catches rate-limit
|
|
22
|
+
and 5xx-class errors from the OpenAI and Anthropic SDKs and any
|
|
23
|
+
``asyncio.TimeoutError`` / ``ConnectionError`` / ``OSError`` raised by the
|
|
24
|
+
transport.
|
|
25
|
+
|
|
26
|
+
``last_route`` exposes the index of the adapter that actually answered the
|
|
27
|
+
most recent call, so callers can see which one was hit::
|
|
28
|
+
|
|
29
|
+
await llm.complete(system, messages)
|
|
30
|
+
print(llm.last_route) # 0 if primary worked, 1 if backup did, ...
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import asyncio
|
|
36
|
+
import logging
|
|
37
|
+
from collections.abc import AsyncGenerator, Callable
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _default_is_transient(exc: BaseException) -> bool:
|
|
44
|
+
"""Best-effort classifier for retryable upstream errors.
|
|
45
|
+
|
|
46
|
+
Detects without importing the SDKs (so the fallback adapter has no
|
|
47
|
+
optional-dep coupling):
|
|
48
|
+
- ``status_code`` attr in {408, 425, 429, 500, 502, 503, 504}
|
|
49
|
+
- class name suffixed with ``RateLimitError`` / ``ServiceUnavailableError``
|
|
50
|
+
/ ``APITimeoutError`` / ``InternalServerError`` / ``OverloadedError``
|
|
51
|
+
- ``asyncio.TimeoutError``, ``ConnectionError``, ``OSError``
|
|
52
|
+
"""
|
|
53
|
+
if isinstance(exc, asyncio.TimeoutError | ConnectionError | OSError):
|
|
54
|
+
return True
|
|
55
|
+
status = getattr(exc, "status_code", None)
|
|
56
|
+
if isinstance(status, int) and status in {408, 425, 429, 500, 502, 503, 504}:
|
|
57
|
+
return True
|
|
58
|
+
name = type(exc).__name__
|
|
59
|
+
transient_suffixes = (
|
|
60
|
+
"RateLimitError",
|
|
61
|
+
"ServiceUnavailableError",
|
|
62
|
+
"APITimeoutError",
|
|
63
|
+
"InternalServerError",
|
|
64
|
+
"OverloadedError",
|
|
65
|
+
"TimeoutError",
|
|
66
|
+
)
|
|
67
|
+
return any(name.endswith(s) for s in transient_suffixes)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class FallbackLLM:
|
|
71
|
+
def __init__(
|
|
72
|
+
self,
|
|
73
|
+
llms: list[Any],
|
|
74
|
+
*,
|
|
75
|
+
transient_errors: Callable[[BaseException], bool] | None = None,
|
|
76
|
+
) -> None:
|
|
77
|
+
if not llms:
|
|
78
|
+
raise ValueError("FallbackLLM requires at least one inner LLM")
|
|
79
|
+
self._llms = list(llms)
|
|
80
|
+
self._is_transient = transient_errors or _default_is_transient
|
|
81
|
+
self.last_route: int = -1
|
|
82
|
+
self.last_usage: dict | None = None
|
|
83
|
+
|
|
84
|
+
def set_budget(self, guard: Any) -> None:
|
|
85
|
+
"""Forward the budget guard to every inner LLM."""
|
|
86
|
+
for llm in self._llms:
|
|
87
|
+
if hasattr(llm, "set_budget"):
|
|
88
|
+
llm.set_budget(guard)
|
|
89
|
+
|
|
90
|
+
# ── Non-streaming ────────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
async def complete(
|
|
93
|
+
self,
|
|
94
|
+
system: str | None,
|
|
95
|
+
messages: list[dict],
|
|
96
|
+
**kwargs: Any,
|
|
97
|
+
) -> dict:
|
|
98
|
+
last_exc: BaseException | None = None
|
|
99
|
+
for i, llm in enumerate(self._llms):
|
|
100
|
+
try:
|
|
101
|
+
result = await llm.complete(system, messages, **kwargs)
|
|
102
|
+
except BaseException as exc:
|
|
103
|
+
if i == len(self._llms) - 1 or not self._is_transient(exc):
|
|
104
|
+
raise
|
|
105
|
+
logger.warning(
|
|
106
|
+
"FallbackLLM: adapter %d (%s) raised transient %s — trying next",
|
|
107
|
+
i,
|
|
108
|
+
type(llm).__name__,
|
|
109
|
+
type(exc).__name__,
|
|
110
|
+
)
|
|
111
|
+
last_exc = exc
|
|
112
|
+
continue
|
|
113
|
+
self.last_route = i
|
|
114
|
+
self.last_usage = getattr(llm, "last_usage", None)
|
|
115
|
+
return result
|
|
116
|
+
# Unreachable in practice — the loop always returns or re-raises.
|
|
117
|
+
assert last_exc is not None
|
|
118
|
+
raise last_exc
|
|
119
|
+
|
|
120
|
+
# ── Streaming ────────────────────────────────────────────────────────────
|
|
121
|
+
|
|
122
|
+
async def stream_complete(
|
|
123
|
+
self,
|
|
124
|
+
system: str | None,
|
|
125
|
+
messages: list[dict],
|
|
126
|
+
) -> AsyncGenerator[str, None]:
|
|
127
|
+
"""Stream from the first adapter that doesn't fail before yielding.
|
|
128
|
+
|
|
129
|
+
We can only retry until the first token has been emitted — once the
|
|
130
|
+
caller has seen partial output, a switch mid-stream would corrupt the
|
|
131
|
+
response. The transient check therefore runs against errors raised
|
|
132
|
+
before the generator yields anything.
|
|
133
|
+
"""
|
|
134
|
+
last_exc: BaseException | None = None
|
|
135
|
+
for i, llm in enumerate(self._llms):
|
|
136
|
+
if not hasattr(llm, "stream_complete"):
|
|
137
|
+
continue
|
|
138
|
+
try:
|
|
139
|
+
gen = llm.stream_complete(system, messages)
|
|
140
|
+
first = await _peek_first(gen)
|
|
141
|
+
except BaseException as exc:
|
|
142
|
+
if i == len(self._llms) - 1 or not self._is_transient(exc):
|
|
143
|
+
raise
|
|
144
|
+
logger.warning(
|
|
145
|
+
"FallbackLLM(stream): adapter %d (%s) raised transient %s "
|
|
146
|
+
"before first token — trying next",
|
|
147
|
+
i,
|
|
148
|
+
type(llm).__name__,
|
|
149
|
+
type(exc).__name__,
|
|
150
|
+
)
|
|
151
|
+
last_exc = exc
|
|
152
|
+
continue
|
|
153
|
+
self.last_route = i
|
|
154
|
+
if first is not None:
|
|
155
|
+
yield first
|
|
156
|
+
async for chunk in gen:
|
|
157
|
+
yield chunk
|
|
158
|
+
self.last_usage = getattr(llm, "last_usage", None)
|
|
159
|
+
return
|
|
160
|
+
assert last_exc is not None
|
|
161
|
+
raise last_exc
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
async def _peek_first(gen: AsyncGenerator[str, None]) -> str | None:
|
|
165
|
+
"""Pull the first item from an async generator, or None if exhausted."""
|
|
166
|
+
try:
|
|
167
|
+
return await gen.__anext__()
|
|
168
|
+
except StopAsyncIteration:
|
|
169
|
+
return None
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""``RoutingLLM`` — dispatch each LLM call to a different adapter by a selector.
|
|
2
|
+
|
|
3
|
+
**For agent-harness's own call sites, prefer per-call-site injection** —
|
|
4
|
+
``AgentRuntime`` exposes ``classifier_llm`` / ``router_llm`` and
|
|
5
|
+
``Orchestrator`` exposes ``planner_llm`` / ``synthesizer_llm``. That's the
|
|
6
|
+
production-style pattern: each call site is hard-wired to a model chosen
|
|
7
|
+
for that workload's cost / quality / latency budget, no runtime guessing.
|
|
8
|
+
|
|
9
|
+
``RoutingLLM`` is the **bring-your-own-selector primitive** for cases
|
|
10
|
+
where per-call-site injection isn't enough:
|
|
11
|
+
|
|
12
|
+
- You're wrapping an existing harness instance you can't restructure.
|
|
13
|
+
- You're routing based on **capability** (does this query need
|
|
14
|
+
vision / function calling / >200K context?) — that's a real
|
|
15
|
+
production pattern, but the metadata is provider-specific so the
|
|
16
|
+
selector has to be yours.
|
|
17
|
+
- You're doing **learned routing** (RouteLLM-style classifier) where
|
|
18
|
+
the selector is a small ML model.
|
|
19
|
+
- You're doing **cascade routing** (cheap-then-escalate-on-low-confidence)
|
|
20
|
+
via a custom selector that inspects prior responses.
|
|
21
|
+
|
|
22
|
+
Wire it up with your own selector callable that returns a key from the
|
|
23
|
+
``routes`` dict::
|
|
24
|
+
|
|
25
|
+
from harness.llm.routing import RoutingLLM
|
|
26
|
+
|
|
27
|
+
def my_capability_selector(system, messages):
|
|
28
|
+
# Inspect the call's requirements and pick the cheapest viable model.
|
|
29
|
+
if _needs_vision(messages):
|
|
30
|
+
return "vision"
|
|
31
|
+
if _estimated_tokens(system, messages) > 100_000:
|
|
32
|
+
return "long_context"
|
|
33
|
+
return "default"
|
|
34
|
+
|
|
35
|
+
llm = RoutingLLM(
|
|
36
|
+
routes={
|
|
37
|
+
"default": OpenAILLM(model="gpt-4o-mini"),
|
|
38
|
+
"vision": OpenAILLM(model="gpt-4o"),
|
|
39
|
+
"long_context": AnthropicLLM(model="claude-sonnet-4-6"),
|
|
40
|
+
},
|
|
41
|
+
selector=my_capability_selector,
|
|
42
|
+
default_route="default",
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
The harness does **not** ship default selectors. Naive selectors
|
|
46
|
+
(keyword matching, fixed token thresholds) misroute in subtle ways and
|
|
47
|
+
encourage the wrong mental model. If you find yourself reaching for one,
|
|
48
|
+
the per-call-site injection path on ``AgentRuntime`` / ``Orchestrator``
|
|
49
|
+
is almost certainly what you actually want.
|
|
50
|
+
|
|
51
|
+
``last_route`` exposes the key of the route that handled the most recent
|
|
52
|
+
call — handy for logging and tests.
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
from __future__ import annotations
|
|
56
|
+
|
|
57
|
+
import logging
|
|
58
|
+
from collections.abc import AsyncGenerator, Callable, Mapping
|
|
59
|
+
from typing import Any
|
|
60
|
+
|
|
61
|
+
logger = logging.getLogger(__name__)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
Selector = Callable[[str | None, list[dict]], str]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class RoutingLLM:
    """Serve each call from one adapter in ``routes``, chosen by ``selector``.

    ``selector(system, messages)`` returns a key of ``routes``. When it
    raises, or returns a key that is not registered, a warning is logged and
    ``default_route`` serves the call. ``last_route`` holds the key used by
    the most recent call and ``last_usage`` mirrors the serving adapter's
    ``last_usage`` attribute (``None`` when it has none).
    """

    def __init__(
        self,
        routes: Mapping[str, Any],
        *,
        selector: Selector,
        default_route: str,
    ) -> None:
        """Validate the route table and remember the selector.

        Raises:
            ValueError: If ``routes`` is empty or lacks ``default_route``.
        """
        if not routes:
            raise ValueError("RoutingLLM requires at least one route")
        if default_route not in routes:
            raise ValueError(f"default_route {default_route!r} is not in routes")
        self._selector = selector
        self._default_route = default_route
        self._routes = dict(routes)  # private copy of the caller's mapping
        self.last_usage: dict | None = None
        self.last_route: str = default_route

    def set_budget(self, guard: Any) -> None:
        """Hand the budget guard to every routed LLM that accepts one."""
        for target in self._routes.values():
            if not hasattr(target, "set_budget"):
                continue
            target.set_budget(guard)

    def _pick(self, system: str | None, messages: list[dict]) -> tuple[str, Any]:
        """Return ``(key, adapter)`` for this call, defaulting on any problem."""
        try:
            chosen = self._selector(system, messages)
        except Exception as e:  # noqa: BLE001 — fall back gracefully
            logger.warning("RoutingLLM selector raised %s — using default route", e)
            chosen = self._default_route
        if chosen in self._routes:
            return chosen, self._routes[chosen]
        logger.warning(
            "RoutingLLM selector returned unknown key %r — using default route %r",
            chosen,
            self._default_route,
        )
        fallback = self._default_route
        return fallback, self._routes[fallback]

    # ── Non-streaming ────────────────────────────────────────────────────────

    async def complete(
        self,
        system: str | None,
        messages: list[dict],
        **kwargs: Any,
    ) -> dict:
        """Run ``complete`` on the selected route and return its result."""
        route_key, target = self._pick(system, messages)
        # Recorded before the call so it is visible even if the call fails.
        self.last_route = route_key
        reply = await target.complete(system, messages, **kwargs)
        self.last_usage = getattr(target, "last_usage", None)
        return reply

    # ── Streaming ────────────────────────────────────────────────────────────

    async def stream_complete(
        self,
        system: str | None,
        messages: list[dict],
    ) -> AsyncGenerator[str, None]:
        """Stream from the selected route.

        A route without ``stream_complete`` is served through ``complete``
        and its text, if non-empty, is yielded as a single chunk.
        """
        route_key, target = self._pick(system, messages)
        self.last_route = route_key
        if hasattr(target, "stream_complete"):
            async for piece in target.stream_complete(system, messages):
                yield piece
        else:
            # Fall back to non-streaming for routes that don't implement it.
            reply = await target.complete(system, messages)
            if isinstance(reply, dict):
                text = reply.get("text", "")
            else:
                text = str(reply)
            if text:
                yield text
        self.last_usage = getattr(target, "last_usage", None)
|
|
@@ -286,11 +286,36 @@ class AgentRuntime:
|
|
|
286
286
|
annotation_store: Any | None = None, # InMemoryAnnotationStore or compatible
|
|
287
287
|
checkpoint_store: Any | None = None, # FileCheckpointStore / RedisCheckpointStore
|
|
288
288
|
steering_source_factory: Any | None = None, # passed to each spawned BaseAgent
|
|
289
|
+
# ── Optional per-call-site LLM overrides ──────────────────────────────
|
|
290
|
+
# Each defaults to ``llm`` when unset. The dispatch classifier and the
|
|
291
|
+
# single-agent router both see only the goal + agent descriptions
|
|
292
|
+
# (~300 tokens) and emit a one-token decision — they're the natural
|
|
293
|
+
# candidates for a cheaper model. The planner and synthesiser produce
|
|
294
|
+
# structured DAGs and final answers and should usually stay on the
|
|
295
|
+
# main model. See README "Smart routing + fallback" for the pattern.
|
|
296
|
+
classifier_llm: Any | None = None,
|
|
297
|
+
router_llm: Any | None = None,
|
|
298
|
+
planner_llm: Any | None = None,
|
|
299
|
+
synthesizer_llm: Any | None = None,
|
|
289
300
|
) -> None:
|
|
290
301
|
self._agent_registry = agent_registry
|
|
291
302
|
self._tool_registry = tool_registry
|
|
292
303
|
self._memory = memory
|
|
293
304
|
self._llm = llm
|
|
305
|
+
self._classifier_llm = classifier_llm or llm
|
|
306
|
+
self._router_llm = router_llm or llm
|
|
307
|
+
self._planner_llm = planner_llm or llm
|
|
308
|
+
self._synthesizer_llm = synthesizer_llm or llm
|
|
309
|
+
# ``set_budget`` should reach every distinct LLM instance — if the
|
|
310
|
+
# user injected the same wrapper into multiple slots, dedupe by
|
|
311
|
+
# object identity so we don't call it N times.
|
|
312
|
+
self._budget_targets: list[Any] = []
|
|
313
|
+
for candidate in (llm, classifier_llm, router_llm, planner_llm, synthesizer_llm):
|
|
314
|
+
if candidate is None:
|
|
315
|
+
continue
|
|
316
|
+
if any(candidate is existing for existing in self._budget_targets):
|
|
317
|
+
continue
|
|
318
|
+
self._budget_targets.append(candidate)
|
|
294
319
|
self._guardrail_config = guardrail_config or GuardrailConfig()
|
|
295
320
|
self._enable_otel = enable_otel
|
|
296
321
|
self._annotation_store = annotation_store
|
|
@@ -307,6 +332,16 @@ class AgentRuntime:
|
|
|
307
332
|
checkpoint_store = FileCheckpointStore()
|
|
308
333
|
self._checkpoint_store = checkpoint_store
|
|
309
334
|
|
|
335
|
+
def _attach_budget(self, guard: Any) -> None:
    """Hand the per-run budget guard to each LLM in ``self._budget_targets``.

    Support is detected by duck typing: a target receives
    ``set_budget(guard)`` only if it exposes a ``set_budget`` attribute;
    any other target (e.g. a bare custom client) is passed over without
    error or logging.
    """
    # Lazy filter: each target is checked immediately before it is called,
    # in list order.
    budget_aware = (
        llm for llm in self._budget_targets if hasattr(llm, "set_budget")
    )
    for llm in budget_aware:
        llm.set_budget(guard)
|
|
344
|
+
|
|
310
345
|
def _steering_lifecycle(self):
|
|
311
346
|
"""Wrap the dispatch in the steering factory's lifecycle if it has one.
|
|
312
347
|
|
|
@@ -343,8 +378,7 @@ class AgentRuntime:
|
|
|
343
378
|
from agents.base import BaseAgent
|
|
344
379
|
|
|
345
380
|
guard = BudgetGuard(self._guardrail_config)
|
|
346
|
-
|
|
347
|
-
self._llm.set_budget(guard)
|
|
381
|
+
self._attach_budget(guard)
|
|
348
382
|
|
|
349
383
|
config = self._agent_registry.get(agent_id)
|
|
350
384
|
agent = BaseAgent(
|
|
@@ -393,8 +427,7 @@ class AgentRuntime:
|
|
|
393
427
|
|
|
394
428
|
config = self._agent_registry.get(checkpoint["agent_id"])
|
|
395
429
|
guard = BudgetGuard(self._guardrail_config)
|
|
396
|
-
|
|
397
|
-
self._llm.set_budget(guard)
|
|
430
|
+
self._attach_budget(guard)
|
|
398
431
|
tracer = self._make_tracer()
|
|
399
432
|
|
|
400
433
|
agent = BaseAgent(
|
|
@@ -501,8 +534,7 @@ class AgentRuntime:
|
|
|
501
534
|
outer_run_id = checkpoint["run_id"]
|
|
502
535
|
config = self._agent_registry.get(checkpoint["agent_id"])
|
|
503
536
|
guard = BudgetGuard(self._guardrail_config)
|
|
504
|
-
|
|
505
|
-
self._llm.set_budget(guard)
|
|
537
|
+
self._attach_budget(guard)
|
|
506
538
|
tracer = self._make_tracer()
|
|
507
539
|
agent = BaseAgent(
|
|
508
540
|
config=config,
|
|
@@ -557,8 +589,7 @@ class AgentRuntime:
|
|
|
557
589
|
# Adapters that implement set_budget(guard) (e.g. OpenAILLM) get the
|
|
558
590
|
# fresh per-run guard so they can call add_cost() on every completion.
|
|
559
591
|
# Duck-typed so users can plug in any LLM client that doesn't.
|
|
560
|
-
|
|
561
|
-
self._llm.set_budget(guard)
|
|
592
|
+
self._attach_budget(guard)
|
|
562
593
|
|
|
563
594
|
# state lives in memory, not agents — instantiate fresh per run
|
|
564
595
|
agents = {
|
|
@@ -583,6 +614,8 @@ class AgentRuntime:
|
|
|
583
614
|
tracer=tracer,
|
|
584
615
|
guard=guard,
|
|
585
616
|
llm=self._llm,
|
|
617
|
+
planner_llm=self._planner_llm,
|
|
618
|
+
synthesizer_llm=self._synthesizer_llm,
|
|
586
619
|
eval_config=EvalConfig(
|
|
587
620
|
confidence_threshold=self._guardrail_config.confidence_threshold,
|
|
588
621
|
max_replan_count=self._guardrail_config.max_replan_count,
|
|
@@ -605,7 +638,7 @@ class AgentRuntime:
|
|
|
605
638
|
f" {aid}: {self._agent_registry.get(aid).role}"
|
|
606
639
|
for aid in self._agent_registry.all_ids()
|
|
607
640
|
)
|
|
608
|
-
response = await self.
|
|
641
|
+
response = await self._classifier_llm.complete(
|
|
609
642
|
system=_CLASSIFIER_SYSTEM.format(agent_descriptions=agent_descriptions),
|
|
610
643
|
messages=[{"role": "user", "content": f"Goal: {goal}"}],
|
|
611
644
|
response_format={"type": "json_object"},
|
|
@@ -708,7 +741,7 @@ class AgentRuntime:
|
|
|
708
741
|
agent_descriptions = "\n".join(
|
|
709
742
|
f" {aid}: {self._agent_registry.get(aid).role}" for aid in all_ids
|
|
710
743
|
)
|
|
711
|
-
response = await self.
|
|
744
|
+
response = await self._router_llm.complete(
|
|
712
745
|
system=_ROUTER_SYSTEM.format(agent_descriptions=agent_descriptions),
|
|
713
746
|
messages=[{"role": "user", "content": f"Goal: {goal}"}],
|
|
714
747
|
response_format={"type": "json_object"},
|