zu-providers 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zu_providers-0.1.0/.gitignore +60 -0
- zu_providers-0.1.0/PKG-INFO +21 -0
- zu_providers-0.1.0/README.md +31 -0
- zu_providers-0.1.0/pyproject.toml +37 -0
- zu_providers-0.1.0/src/zu_providers/__init__.py +12 -0
- zu_providers-0.1.0/src/zu_providers/_messages.py +179 -0
- zu_providers-0.1.0/src/zu_providers/anthropic.py +149 -0
- zu_providers-0.1.0/src/zu_providers/openai_compatible.py +173 -0
- zu_providers-0.1.0/src/zu_providers/scripted.py +106 -0
- zu_providers-0.1.0/tests/test_providers.py +377 -0
- zu_providers-0.1.0/tests/test_scripted.py +48 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
|
|
9
|
+
# uv / venv
|
|
10
|
+
.venv/
|
|
11
|
+
uv.lock.bak
|
|
12
|
+
|
|
13
|
+
# Test / type caches
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
.coverage
|
|
18
|
+
htmlcov/
|
|
19
|
+
|
|
20
|
+
# Zu runtime artifacts
|
|
21
|
+
*.db
|
|
22
|
+
zu.db
|
|
23
|
+
zu.yaml.local
|
|
24
|
+
zu_review.jsonl
|
|
25
|
+
*.review.jsonl
|
|
26
|
+
# Per-agent cost telemetry ledger — machine-local run history, not source.
|
|
27
|
+
cost.jsonl
|
|
28
|
+
# A recorded replay path is learned per-run and machine-local — regenerated on
|
|
29
|
+
# every successful run, not source. The agent ships; its track does not.
|
|
30
|
+
track.json
|
|
31
|
+
# …except the flagship example ships its track on purpose, as a demo of the
|
|
32
|
+
# record/replay convergence (committed; re-runs show as ordinary modifications).
|
|
33
|
+
!examples/agents/vet-appointment/track.json
|
|
34
|
+
|
|
35
|
+
# Editor / OS
|
|
36
|
+
.idea/
|
|
37
|
+
.vscode/
|
|
38
|
+
.DS_Store
|
|
39
|
+
|
|
40
|
+
# Claude Code local session state
|
|
41
|
+
.claude/
|
|
42
|
+
|
|
43
|
+
# Secrets
|
|
44
|
+
.env
|
|
45
|
+
.env.*
|
|
46
|
+
!.env.example
|
|
47
|
+
|
|
48
|
+
# Microsoft Office temp/lock files
|
|
49
|
+
~$*
|
|
50
|
+
|
|
51
|
+
# Internal design / strategy docs — kept local, never in the public repo
|
|
52
|
+
*.docx
|
|
53
|
+
*.pdf
|
|
54
|
+
# BUILD.md is the internal build-sequence / deferred-gaps ledger — kept local.
|
|
55
|
+
# (ARCHITECTURE.md is public: an onboarding agent needs the structural map.)
|
|
56
|
+
docs/BUILD.md
|
|
57
|
+
|
|
58
|
+
# Local secret — API key for live validation, never commit
|
|
59
|
+
zu_demo_key.md
|
|
60
|
+
*_key.md
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zu-providers
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Zu model-provider adapters: scripted, anthropic, openai-compatible
|
|
5
|
+
Project-URL: Homepage, https://github.com/k3-mt/zu
|
|
6
|
+
Project-URL: Repository, https://github.com/k3-mt/zu
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: zu-core==0.1.0
|
|
18
|
+
Provides-Extra: anthropic
|
|
19
|
+
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
20
|
+
Provides-Extra: openai
|
|
21
|
+
Requires-Dist: openai>=1.40; extra == 'openai'
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# zu-providers
|
|
2
|
+
|
|
3
|
+
Model adapters — the **`ModelProvider`** port (the any-model seam). An adapter
|
|
4
|
+
turns the harness's one normalized `ModelRequest` into a `ModelResponse` (text +
|
|
5
|
+
tool calls + usage + finish reason) and declares its `Capabilities`. The core
|
|
6
|
+
never special-cases a provider; it reads capabilities and proceeds.
|
|
7
|
+
|
|
8
|
+
**Credentials are resolved from the environment inside the adapter** — never
|
|
9
|
+
placed in the model's context or in a config file.
|
|
10
|
+
|
|
11
|
+
## Registered plugins (`zu.providers`)
|
|
12
|
+
|
|
13
|
+
| Name | Class | Notes |
|
|
14
|
+
|------|-------|-------|
|
|
15
|
+
| `scripted` | `ScriptedProvider` | The fake model: replays fixed moves in order. Deterministic; the basis of every offline test. No key, no network. |
|
|
16
|
+
| `anthropic` | `AnthropicProvider` | The Anthropic Messages API. Needs `[anthropic]` SDK extra + an API key. |
|
|
17
|
+
| `openai-compatible` | `OpenAICompatibleProvider` | Any OpenAI-compatible endpoint (OpenAI, OpenRouter, Ollama, vLLM) via a base URL. Needs `[openai]` SDK extra. |
|
|
18
|
+
|
|
19
|
+
`_messages.py` holds the shared request/response translation both real adapters
|
|
20
|
+
build on, so they behave identically against the neutral contract.
|
|
21
|
+
|
|
22
|
+
## Extend
|
|
23
|
+
|
|
24
|
+
Implement the `ModelProvider` shape, register it under `zu.providers` in
|
|
25
|
+
`pyproject.toml`, and add a deterministic test (the contract test asserts every
|
|
26
|
+
adapter behaves identically on the neutral surface).
|
|
27
|
+
|
|
28
|
+
## Tests
|
|
29
|
+
|
|
30
|
+
`uv run pytest packages/zu-providers` — offline. Live-API smoke tests are opt-in
|
|
31
|
+
behind `ZU_LIVE_*` env flags.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "zu-providers"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Zu model-provider adapters: scripted, anthropic, openai-compatible"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
license = "Apache-2.0"
|
|
7
|
+
classifiers = [
|
|
8
|
+
"Development Status :: 4 - Beta",
|
|
9
|
+
"Intended Audience :: Developers",
|
|
10
|
+
"License :: OSI Approved :: Apache Software License",
|
|
11
|
+
"Programming Language :: Python :: 3",
|
|
12
|
+
"Programming Language :: Python :: 3.11",
|
|
13
|
+
"Programming Language :: Python :: 3.12",
|
|
14
|
+
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
|
15
|
+
"Typing :: Typed",
|
|
16
|
+
]
|
|
17
|
+
dependencies = ["zu-core==0.1.0"]
|
|
18
|
+
|
|
19
|
+
[project.optional-dependencies]
|
|
20
|
+
anthropic = ["anthropic>=0.40"]
|
|
21
|
+
openai = ["openai>=1.40"]
|
|
22
|
+
|
|
23
|
+
[project.entry-points."zu.providers"]
|
|
24
|
+
scripted = "zu_providers.scripted:ScriptedProvider"
|
|
25
|
+
anthropic = "zu_providers.anthropic:AnthropicProvider"
|
|
26
|
+
openai-compatible = "zu_providers.openai_compatible:OpenAICompatibleProvider"
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://github.com/k3-mt/zu"
|
|
30
|
+
Repository = "https://github.com/k3-mt/zu"
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["hatchling"]
|
|
34
|
+
build-backend = "hatchling.build"
|
|
35
|
+
|
|
36
|
+
[tool.hatch.build.targets.wheel]
|
|
37
|
+
packages = ["src/zu_providers"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Zu model-provider adapters.
|
|
2
|
+
|
|
3
|
+
Every adapter implements the ModelProvider port: it turns a normalized
|
|
4
|
+
ModelRequest into a ModelResponse and declares its Capabilities. The harness
|
|
5
|
+
never imports a model SDK — it speaks only the port.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .scripted import ScriptedProvider
|
|
11
|
+
|
|
12
|
+
__all__ = ["ScriptedProvider"]
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Translate Zu's neutral message format into each provider's wire format.
|
|
2
|
+
|
|
3
|
+
The loop speaks one neutral shape (pinned by ``test_message_format_is_stable``):
|
|
4
|
+
|
|
5
|
+
{"role": "system" | "user", "content": "<text>"}
|
|
6
|
+
{"role": "assistant", "tool_calls": [{"name": ..., "args": {...}}, ...],
|
|
7
|
+
"content": "<optional reasoning text>"}
|
|
8
|
+
{"role": "tool", "name": ..., "content": "<json string>"}
|
|
9
|
+
|
|
10
|
+
An assistant tool-call turn MAY carry ``content`` (the model's reasoning emitted
|
|
11
|
+
alongside the calls); it is preserved into each wire format (a leading Anthropic
|
|
12
|
+
text block / an OpenAI ``content`` field) and omitted when empty.
|
|
13
|
+
|
|
14
|
+
Crucially the neutral form carries **no tool-call ids** — an assistant turn's
|
|
15
|
+
tool calls and the ``tool`` results that follow are matched by *order*. Both
|
|
16
|
+
provider wire formats require ids (Anthropic ``tool_use.id`` ↔
|
|
17
|
+
``tool_result.tool_use_id``; OpenAI ``tool_calls[].id`` ↔ ``tool.tool_call_id``),
|
|
18
|
+
so we synthesize ids on the assistant turn and assign them to the following
|
|
19
|
+
results FIFO. This is safe because the loop emits one assistant-tool turn
|
|
20
|
+
immediately followed by its results, in order.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _ToolIds:
|
|
29
|
+
"""The shared id bookkeeping both translators need: synthesize a fresh id per
|
|
30
|
+
tool call on an assistant turn, then match the following ``tool`` results to
|
|
31
|
+
those ids FIFO. Both wire formats require ids on a matched pair; the neutral
|
|
32
|
+
form carries none, so order is the contract (see the module docstring).
|
|
33
|
+
|
|
34
|
+
Failing loudly here — rather than fabricating or dropping an id — turns a
|
|
35
|
+
malformed history into a clear local ValueError instead of an opaque provider
|
|
36
|
+
400 (``tool_result references unknown tool_use_id`` / a tool message with no
|
|
37
|
+
matching call). The mismatch is *symmetric*: too many results (a result with
|
|
38
|
+
no pending call) and too few (calls left unmatched at the end) both raise."""
|
|
39
|
+
|
|
40
|
+
def __init__(self, prefix: str) -> None:
|
|
41
|
+
self._prefix = prefix
|
|
42
|
+
self._counter = 0
|
|
43
|
+
self._pending: list[str] = []
|
|
44
|
+
|
|
45
|
+
def open_calls(self, n: int) -> list[str]:
|
|
46
|
+
"""Begin an assistant tool-call turn: mint ``n`` fresh ids, replacing any
|
|
47
|
+
still-pending ones (the loop emits a turn's results before the next turn,
|
|
48
|
+
so anything left here is the too-few case, caught by ``finish``)."""
|
|
49
|
+
self._pending = []
|
|
50
|
+
for _ in range(n):
|
|
51
|
+
self._counter += 1
|
|
52
|
+
self._pending.append(f"{self._prefix}{self._counter}")
|
|
53
|
+
return list(self._pending)
|
|
54
|
+
|
|
55
|
+
def match_result(self) -> str:
|
|
56
|
+
"""Claim the next pending id for a ``tool`` result. Raises if there is no
|
|
57
|
+
preceding tool call to match — more results than calls (out of order, or
|
|
58
|
+
a stray result)."""
|
|
59
|
+
if not self._pending:
|
|
60
|
+
raise ValueError(
|
|
61
|
+
"tool result has no matching tool call in the message history "
|
|
62
|
+
"(an assistant tool-call turn must immediately precede its results)"
|
|
63
|
+
)
|
|
64
|
+
return self._pending.pop(0)
|
|
65
|
+
|
|
66
|
+
def finish(self) -> None:
|
|
67
|
+
"""End of history: every opened tool call must have been matched. Leftover
|
|
68
|
+
pending ids mean fewer results than calls — the mirror of ``match_result``,
|
|
69
|
+
and just as much a malformed history, so it fails just as loudly."""
|
|
70
|
+
if self._pending:
|
|
71
|
+
raise ValueError(
|
|
72
|
+
f"{len(self._pending)} tool call(s) have no matching tool result in "
|
|
73
|
+
"the message history (each tool call must be followed by its result)"
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def to_anthropic_messages(messages: list[dict]) -> tuple[str | None, list[dict]]:
    """Convert a neutral history into ``(system, messages)`` for Anthropic.

    ``system`` turns are pulled out of the message list and joined (blank-line
    separated, empty parts dropped) into the separate ``system`` value, or
    ``None`` when there is no system text. Consecutive ``tool`` results are
    collected into one ``user`` turn of ``tool_result`` blocks. Tool-call ids are
    synthesized with the ``toolu_`` prefix and paired with results by order; a
    ``ValueError`` from that bookkeeping propagates when calls and results do
    not pair up. Messages with any other role are ignored.
    """
    id_book = _ToolIds("toolu_")
    system_texts: list[str] = []
    wire: list[dict] = []
    result_blocks: list[dict] = []  # tool_result blocks awaiting their user turn

    def emit_results() -> None:
        # Buffered tool results travel together as a single user turn.
        if not result_blocks:
            return
        wire.append({"role": "user", "content": list(result_blocks)})
        result_blocks.clear()

    for msg in messages:
        role = msg.get("role")

        if role == "tool":
            result_blocks.append(
                {
                    "type": "tool_result",
                    "tool_use_id": id_book.match_result(),
                    "content": msg.get("content", ""),
                }
            )
            continue

        if role == "system":
            system_texts.append(str(msg.get("content", "")))
            continue

        if role not in ("user", "assistant"):
            continue

        # A user or assistant turn closes any run of buffered tool results.
        emit_results()

        if role == "user":
            wire.append({"role": "user", "content": msg.get("content", "")})
            continue

        calls = msg.get("tool_calls")
        if not calls:
            wire.append({"role": "assistant", "content": msg.get("content", "")})
            continue

        call_ids = id_book.open_calls(len(calls))
        content: list[dict] = []
        # Reasoning text emitted with the calls goes first, as a text block.
        reasoning = msg.get("content")
        if reasoning:
            content.append({"type": "text", "text": str(reasoning)})
        for call_id, call in zip(call_ids, calls, strict=True):
            content.append(
                {
                    "type": "tool_use",
                    "id": call_id,
                    "name": call["name"],
                    "input": call.get("args", {}),
                }
            )
        wire.append({"role": "assistant", "content": content})

    # Validate pairing before emitting the trailing results.
    id_book.finish()
    emit_results()
    system = "\n\n".join(filter(None, system_texts)) or None
    return system, wire
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def to_openai_messages(messages: list[dict]) -> list[dict]:
    """Convert a neutral history into Chat Completions messages.

    ``system`` and ``user`` turns pass through with their role inline. An
    assistant turn with tool calls becomes a message carrying ``tool_calls``
    (arguments JSON-encoded) plus the reasoning text, or ``None`` when there is
    none. Each ``tool`` result receives the id of the call it answers, matched
    by order using ids synthesized with the ``call_`` prefix. A ``ValueError``
    from that bookkeeping propagates when calls and results do not pair up.
    Messages with any other role are ignored.
    """
    id_book = _ToolIds("call_")
    wire: list[dict] = []

    for msg in messages:
        role = msg.get("role")

        if role == "tool":
            wire.append(
                {
                    "role": "tool",
                    "tool_call_id": id_book.match_result(),
                    "content": msg.get("content", ""),
                }
            )
        elif role == "assistant":
            calls = msg.get("tool_calls")
            if not calls:
                wire.append({"role": "assistant", "content": msg.get("content", "")})
                continue
            call_ids = id_book.open_calls(len(calls))
            tool_calls: list[dict] = []
            for call_id, call in zip(call_ids, calls, strict=True):
                function = {
                    "name": call["name"],
                    "arguments": json.dumps(call.get("args", {})),
                }
                tool_calls.append({"id": call_id, "type": "function", "function": function})
            # Content and tool_calls share one message; empty text becomes None.
            wire.append(
                {
                    "role": "assistant",
                    "content": msg.get("content") or None,
                    "tool_calls": tool_calls,
                }
            )
        elif role in ("system", "user"):
            wire.append({"role": role, "content": msg.get("content", "")})

    id_book.finish()
    return wire
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def anthropic_tool(schema: dict) -> dict:
    """Map a neutral tool schema onto Anthropic's tool shape.

    ``name`` is required (``KeyError`` if absent). ``description`` defaults to
    an empty string and ``parameters`` — emitted as ``input_schema`` — defaults
    to an empty object schema.
    """
    tool: dict = {"name": schema["name"]}
    tool["description"] = schema.get("description", "")
    empty_object = {"type": "object", "properties": {}}
    tool["input_schema"] = schema.get("parameters", empty_object)
    return tool
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def openai_tool(schema: dict) -> dict:
    """Wrap a neutral tool schema as an OpenAI ``function`` tool.

    The neutral fields map almost one-to-one: ``name`` is required
    (``KeyError`` if absent), ``description`` defaults to an empty string, and
    ``parameters`` defaults to an empty object schema.
    """
    function: dict = {"name": schema["name"]}
    function["description"] = schema.get("description", "")
    empty_object = {"type": "object", "properties": {}}
    function["parameters"] = schema.get("parameters", empty_object)
    return {"type": "function", "function": function}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Anthropic Messages API adapter (build step 7).
|
|
2
|
+
|
|
3
|
+
Translates Zu's neutral ``ModelRequest`` into a Messages API call via the
|
|
4
|
+
official ``anthropic`` SDK, and the response back into a neutral
|
|
5
|
+
``ModelResponse`` — so the rest of the runtime never imports a model SDK. The
|
|
6
|
+
API key is resolved from the environment *inside* the adapter and never placed
|
|
7
|
+
in the model's context or in config, consistent with the security model.
|
|
8
|
+
|
|
9
|
+
The client is injectable (an ``AsyncAnthropic`` with a mock transport) so the
|
|
10
|
+
translation and parsing are proven offline against the real SDK; a live call is
|
|
11
|
+
opt-in. The same neutral contract is implemented by ``openai_compatible`` —
|
|
12
|
+
both pass one shared checklist, which is what makes "run on any model" real.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from zu_core.ports import Capabilities, Finish, ModelRequest, ModelResponse, ToolCall
|
|
22
|
+
|
|
23
|
+
from ._messages import anthropic_tool, to_anthropic_messages
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger("zu.providers.anthropic")
|
|
26
|
+
|
|
27
|
+
# Anthropic stop_reason -> neutral Finish. A tool-call finish is decided by the
|
|
28
|
+
# presence of tool calls, not this map, so the ``tool_use`` reason is absent here
|
|
29
|
+
# (a response carrying both text and tool calls still resolves to TOOL_CALLS).
|
|
30
|
+
_FINISH = {
|
|
31
|
+
"end_turn": Finish.STOP,
|
|
32
|
+
"stop_sequence": Finish.STOP,
|
|
33
|
+
"max_tokens": Finish.LENGTH,
|
|
34
|
+
"refusal": Finish.STOP,
|
|
35
|
+
"pause_turn": Finish.STOP,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
# Default per-response output cap. Agent turns are short (a tool call or a small
|
|
39
|
+
# JSON answer); override per request via ``ModelRequest.params["max_tokens"]``.
|
|
40
|
+
_DEFAULT_MAX_TOKENS = 4096
|
|
41
|
+
|
|
42
|
+
# Default per-call wall-time and retry bounds. A "production runtime" must not
|
|
43
|
+
# inherit the SDK's unbounded defaults: a hung connection or the SDK's own
|
|
44
|
+
# exponential-backoff retries can otherwise stack arbitrarily inside a short
|
|
45
|
+
# run budget. The loop wraps ``complete`` in its own wall-time too, but the
|
|
46
|
+
# adapter sets a floor so direct/embed use (no loop deadline) is bounded as well.
|
|
47
|
+
_DEFAULT_TIMEOUT_S = 60.0
|
|
48
|
+
_DEFAULT_MAX_RETRIES = 2
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class AnthropicProvider:
    """Model provider backed by the Anthropic Messages API.

    Translates a neutral ``ModelRequest`` into a ``messages.create`` call and
    the reply back into a neutral ``ModelResponse``. The SDK client is built
    lazily on first use unless one is injected through ``client``.
    """

    def __init__(
        self,
        model: str = "claude-opus-4-8",
        api_key_env: str = "ANTHROPIC_API_KEY",
        api_key: str | None = None,
        max_tokens: int = _DEFAULT_MAX_TOKENS,
        timeout: float = _DEFAULT_TIMEOUT_S,
        max_retries: int = _DEFAULT_MAX_RETRIES,
        client: Any = None,
    ) -> None:
        """Store configuration; no network or SDK work happens here."""
        self.model = model
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.max_retries = max_retries
        # Credentials: ``api_key`` is for programmatic use with a key the app
        # already holds; ``api_key_env`` names the environment variable to read
        # otherwise, which keeps the key out of committed config. Either way the
        # key never enters the model's context.
        self.api_key_env = api_key_env
        self.api_key = api_key
        # Injection seam for tests/config (e.g. an AsyncAnthropic with a mock
        # transport). When None, the client is created on first use.
        self._client = client
        # vision stays False: the neutral request has no image channel yet, so
        # this adapter never sends or handles image blocks.
        self.capabilities = Capabilities(native_tools=True, vision=False, max_context=1_000_000)

    def _ensure_client(self) -> Any:
        """Return the SDK client, constructing it on first use.

        Raises ``RuntimeError`` when the ``anthropic`` SDK is not installed or
        when no API key is available from ``api_key`` or the environment.
        """
        if self._client is not None:
            return self._client

        try:
            import anthropic
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "the anthropic provider needs the SDK: "
                "pip install 'zu-runtime[anthropic]'"
            ) from exc

        # An explicit key wins; otherwise read the configured env variable.
        resolved_key = self.api_key or os.environ.get(self.api_key_env)
        if not resolved_key:
            raise RuntimeError(
                f"no Anthropic API key: pass api_key=... or set ${self.api_key_env} "
                "(the key is read here, never placed in the model's context or a config file)."
            )

        self._client = anthropic.AsyncAnthropic(
            api_key=resolved_key,
            timeout=self.timeout,
            max_retries=self.max_retries,
        )
        return self._client

    async def complete(self, req: ModelRequest) -> ModelResponse:
        """Send one request to the Messages API and return the neutral response.

        ``req.params["max_tokens"]`` overrides the provider-level cap. The
        ``system`` and ``tools`` arguments are sent only when non-empty.
        """
        client = self._ensure_client()
        system, messages = to_anthropic_messages(req.messages)
        call_args: dict[str, Any] = {
            "model": self.model,
            "max_tokens": int(req.params.get("max_tokens", self.max_tokens)),
            "messages": messages,
        }
        if system:
            call_args["system"] = system
        if req.tools:
            call_args["tools"] = [anthropic_tool(schema) for schema in req.tools]
        reply = await client.messages.create(**call_args)
        return _to_model_response(reply)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _to_model_response(resp: Any) -> ModelResponse:
    """Convert an Anthropic Messages reply into a neutral ``ModelResponse``.

    Text blocks are concatenated (``None`` when there is no text) and
    ``tool_use`` blocks become ``ToolCall`` entries. Any tool call makes the
    finish ``TOOL_CALLS``; otherwise ``stop_reason`` is looked up in ``_FINISH``
    with ``STOP`` as the fallback. Usage is reported as input/output/total, or
    ``{}`` when the reply carries no usage object.
    """
    texts: list[str] = []
    tool_calls: list[ToolCall] = []
    for block in resp.content:
        kind = block.type
        if kind == "text":
            texts.append(block.text)
        elif kind == "tool_use":
            tool_calls.append(ToolCall(name=block.name, args=dict(block.input or {})))

    if tool_calls:
        finish = Finish.TOOL_CALLS
    else:
        stop_reason = resp.stop_reason
        if stop_reason == "refusal":
            # There is no distinct neutral finish for a refusal, so it maps to
            # STOP; warn so it is not mistaken for a clean completion.
            logger.warning("model response was a refusal (mapped to STOP)")
        finish = _FINISH.get(stop_reason, Finish.STOP)

    # The API reports input and output counts only; the total is their sum.
    # Missing or None counts are treated as 0.
    usage: dict = {}
    reported = getattr(resp, "usage", None)
    if reported is not None:
        in_tok = getattr(reported, "input_tokens", 0) or 0
        out_tok = getattr(reported, "output_tokens", 0) or 0
        usage = {
            "input_tokens": in_tok,
            "output_tokens": out_tok,
            "total_tokens": in_tok + out_tok,
        }

    return ModelResponse(
        text="".join(texts) or None,
        tool_calls=tool_calls,
        finish=finish,
        usage=usage,
    )
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""OpenAI-compatible adapter (build step 7).
|
|
2
|
+
|
|
3
|
+
One adapter, pointed at a different base URL, reaches OpenRouter, OpenAI, and
|
|
4
|
+
local servers (Ollama, vLLM) — covering a vast range of models, including open
|
|
5
|
+
ones. It translates Zu's neutral ``ModelRequest`` into a Chat Completions call
|
|
6
|
+
via the official ``openai`` SDK and parses the response back, so the rest of
|
|
7
|
+
the runtime never imports a model SDK. Base URL and key are resolved from the
|
|
8
|
+
environment *inside* the adapter, never placed in the model's context.
|
|
9
|
+
|
|
10
|
+
The client is injectable (an ``AsyncOpenAI`` with a mock transport) so the
|
|
11
|
+
translation and parsing are proven offline against the real SDK; a live call is
|
|
12
|
+
opt-in. This adapter and ``anthropic`` pass one shared checklist — identical
|
|
13
|
+
neutral behaviour from two different wire formats.
|
|
14
|
+
|
|
15
|
+
A model without native tool-calling would need the prompt-based tool fallback
|
|
16
|
+
(inject schemas into the prompt, parse a structured action out of the text);
|
|
17
|
+
that path is deferred. The native path is what ships here.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
import logging
|
|
24
|
+
import os
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from zu_core.ports import Capabilities, Finish, ModelRequest, ModelResponse, ToolCall
|
|
28
|
+
|
|
29
|
+
from ._messages import openai_tool, to_openai_messages
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger("zu.providers.openai")
|
|
32
|
+
|
|
33
|
+
# OpenAI finish_reason -> neutral Finish. A tool-call finish is decided by the
|
|
34
|
+
# presence of tool calls, not this map, so the tool_calls/function_call reasons
|
|
35
|
+
# are intentionally absent here.
|
|
36
|
+
_FINISH = {
|
|
37
|
+
"stop": Finish.STOP,
|
|
38
|
+
"length": Finish.LENGTH,
|
|
39
|
+
"content_filter": Finish.STOP,
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
# Default per-call wall-time and retry bounds — see the anthropic adapter: a
|
|
43
|
+
# production runtime must not inherit the SDK's unbounded timeout / stacked
|
|
44
|
+
# backoff. Override per provider via the constructor (or config ``options``).
|
|
45
|
+
_DEFAULT_TIMEOUT_S = 60.0
|
|
46
|
+
_DEFAULT_MAX_RETRIES = 2
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class OpenAICompatibleProvider:
    """Model provider for any endpoint speaking the Chat Completions API.

    Translates a neutral ``ModelRequest`` into a ``chat.completions.create``
    call and the reply back into a neutral ``ModelResponse``. The SDK client is
    built lazily on first use unless one is injected through ``client``.
    """

    def __init__(
        self,
        model: str,
        base_url_env: str = "OPENAI_BASE_URL",
        api_key_env: str = "OPENAI_API_KEY",
        api_key: str | None = None,
        base_url: str | None = None,
        native_tools: bool = True,
        max_tokens: int | None = None,
        timeout: float = _DEFAULT_TIMEOUT_S,
        max_retries: int = _DEFAULT_MAX_RETRIES,
        client: Any = None,
    ) -> None:
        """Store configuration; no network or SDK work happens here."""
        self.model = model
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.max_retries = max_retries
        # Explicit ``api_key`` / ``base_url`` are for programmatic use; the
        # ``*_env`` names are preferred so a key never lands in committed
        # config. Either way the key stays out of the model's context.
        self.base_url_env = base_url_env
        self.api_key_env = api_key_env
        self.api_key = api_key
        self.base_url = base_url
        # Optional pre-built client (e.g. an AsyncOpenAI with a mock transport).
        self._client = client
        self.capabilities = Capabilities(native_tools=native_tools)

    def _ensure_client(self) -> Any:
        """Return the SDK client, constructing it on first use.

        Raises ``RuntimeError`` when the ``openai`` SDK is not installed.
        """
        if self._client is not None:
            return self._client

        try:
            import openai
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "the openai-compatible provider needs the SDK: "
                "pip install 'zu-runtime[openai]'"
            ) from exc

        # Keyless local servers still need a non-empty string for the SDK, so a
        # placeholder is used when no key is configured. The base URL is
        # optional: explicit value first, then the environment, else None.
        resolved_key = self.api_key or os.environ.get(self.api_key_env) or "not-needed"
        resolved_url = self.base_url or os.environ.get(self.base_url_env) or None
        self._client = openai.AsyncOpenAI(
            api_key=resolved_key,
            base_url=resolved_url,
            timeout=self.timeout,
            max_retries=self.max_retries,
        )
        return self._client

    async def complete(self, req: ModelRequest) -> ModelResponse:
        """Send one Chat Completions request and return the neutral response.

        Raises ``NotImplementedError`` when the provider was configured with
        ``native_tools=False``. ``req.params["max_tokens"]`` overrides the
        provider-level value; ``max_tokens`` is sent only when one of them is
        set, and ``tools`` only when the request has tools.
        """
        if not self.capabilities.native_tools:
            raise NotImplementedError(
                "prompt-based tool fallback for non-native-tool models is deferred; "
                "set native_tools=True to use the Chat Completions path."
            )

        client = self._ensure_client()
        call_args: dict[str, Any] = {
            "model": self.model,
            "messages": to_openai_messages(req.messages),
        }
        if req.tools:
            call_args["tools"] = [openai_tool(schema) for schema in req.tools]
        token_cap = req.params.get("max_tokens", self.max_tokens)
        if token_cap is not None:
            call_args["max_tokens"] = int(token_cap)
        reply = await client.chat.completions.create(**call_args)
        return _to_model_response(reply)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _to_model_response(resp: Any) -> ModelResponse:
    """Convert a Chat Completions reply into a neutral ``ModelResponse``.

    Only the first choice is read. An empty ``choices`` array is reported as a
    text-less ``STOP`` with a warning, keeping any usage the provider sent. Tool
    call arguments are JSON-decoded; arguments that fail to parse, or parse to
    something other than an object, are replaced by ``{}`` with a warning so the
    call is still dispatched but the breakage is visible. Any tool call makes
    the finish ``TOOL_CALLS``; otherwise ``finish_reason`` is looked up in
    ``_FINISH`` with ``STOP`` as the fallback.
    """
    choices = resp.choices or []
    if not choices:
        # Indexing [0] would raise IndexError; surface a no-answer STOP instead.
        logger.warning("provider returned no choices (mapped to an empty STOP response)")
        return ModelResponse(text=None, tool_calls=[], finish=Finish.STOP, usage=_usage_of(resp))

    choice = choices[0]
    msg = choice.message
    calls: list[ToolCall] = []
    for tc in msg.tool_calls or []:
        raw = tc.function.arguments or "{}"
        try:
            parsed = json.loads(raw)
        except (ValueError, TypeError):
            parsed = None
        if isinstance(parsed, dict):
            args = parsed
        else:
            # Malformed or non-object arguments. Decide this from the parse
            # outcome, not from the raw text, so a valid empty object written
            # with whitespace (e.g. "{ }") is not reported as unparsable.
            args = {}
            logger.warning(
                "tool call %r produced unparsable arguments, dispatching with {}: %r",
                tc.function.name,
                raw,
            )
        calls.append(ToolCall(name=tc.function.name, args=args))

    if not calls and choice.finish_reason == "content_filter":
        # There is no distinct neutral finish for a moderation stop, so it maps
        # to STOP; warn so it is not mistaken for a clean completion.
        logger.warning("model response stopped by content_filter (mapped to STOP)")

    finish = Finish.TOOL_CALLS if calls else _FINISH.get(choice.finish_reason, Finish.STOP)
    return ModelResponse(
        text=msg.content or None, tool_calls=calls, finish=finish, usage=_usage_of(resp)
    )
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _usage_of(resp: Any) -> dict:
|
|
164
|
+
"""The normalised usage shape (input/output/total) shared with the anthropic
|
|
165
|
+
adapter, degrading to ``{}`` when the provider reports no usage."""
|
|
166
|
+
raw = getattr(resp, "usage", None)
|
|
167
|
+
if raw is None:
|
|
168
|
+
return {}
|
|
169
|
+
return {
|
|
170
|
+
"input_tokens": raw.prompt_tokens,
|
|
171
|
+
"output_tokens": raw.completion_tokens,
|
|
172
|
+
"total_tokens": raw.total_tokens,
|
|
173
|
+
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""The fake model — a deterministic stand-in for an LLM.
|
|
2
|
+
|
|
3
|
+
You hand it a fixed list of moves — call this tool, then that one, then
|
|
4
|
+
finish — and it plays them back in order, ignoring the request. With no API
|
|
5
|
+
key, no token cost, and no randomness, the whole loop does the exact same
|
|
6
|
+
thing every run. Almost every offline test leans on this provider; it is what
|
|
7
|
+
makes build steps 3–6 testable before any real model is wired in (step 7).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from zu_core.ports import (
|
|
13
|
+
Capabilities,
|
|
14
|
+
Finish,
|
|
15
|
+
ModelProvider,
|
|
16
|
+
ModelRequest,
|
|
17
|
+
ModelResponse,
|
|
18
|
+
ToolCall,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ScriptedProvider:
    """A model stand-in that hands back pre-built responses in script order.

    Every ``complete`` call consumes the next ModelResponse of the script and
    ignores the request it was given. Build a provider either from ready-made
    ModelResponse objects (the constructor) or, more tersely, from plain dicts
    via ``from_moves``:

        ScriptedProvider.from_moves([
            {"tool": "http_fetch", "args": {"url": "https://example.com"}},
            {"text": "done", "finish": "stop"},
        ])
    """

    # The fake has no model behind it, so ``model`` stays None; the attribute
    # exists to satisfy the ModelProvider ``model`` contract (real adapters set
    # an id here).
    model: str | None = None

    def __init__(
        self,
        moves: list[ModelResponse],
        capabilities: Capabilities | None = None,
    ) -> None:
        """Store a private copy of ``moves`` and start at the first one.

        ``capabilities`` falls back to a default ``Capabilities()`` when omitted.
        """
        self._moves = list(moves)
        self._i = 0
        self.capabilities = capabilities or Capabilities()

    @classmethod
    def from_moves(cls, moves: list[dict], **kw) -> ScriptedProvider:
        """Build a provider from terse dict moves.

        A tool move is ``{"tool": name, "args": {...}}``; a text move is
        ``{"text": ..., "finish": ...}`` (both keys optional, ``finish``
        defaulting to ``"stop"``). Either kind may carry an optional ``usage``
        dict so a script can report per-turn token counts.

        Raises:
            ValueError: for an empty move or one with an unrecognised key, so a
                typo in a script fails loudly instead of silently becoming a
                do-nothing STOP response.
        """
        script: list[ModelResponse] = []
        for index, move in enumerate(moves):
            if "tool" not in move:
                # Text move: validate first, then build.
                stray = set(move) - {"text", "finish", "usage"}
                if stray or not move:
                    raise ValueError(
                        f"move {index} is not a valid move: {move!r}; expected a tool move "
                        "{'tool': ..., 'args': ...} or a text move {'text': ..., 'finish': ...} "
                        "(either may carry an optional 'usage')"
                    )
                script.append(
                    ModelResponse(
                        text=move.get("text"),
                        finish=Finish(move.get("finish", "stop")),
                        usage=move.get("usage") or {},
                    )
                )
                continue

            # Tool move: only 'tool', 'args' and 'usage' are accepted.
            stray = set(move) - {"tool", "args", "usage"}
            if stray:
                raise ValueError(
                    f"move {index} is a tool call with unexpected key(s) {sorted(stray)}; "
                    "a tool move takes only 'tool', 'args', and optional 'usage'"
                )
            call = ToolCall(name=move["tool"], args=move.get("args", {}))
            script.append(
                ModelResponse(
                    tool_calls=[call],
                    finish=Finish.TOOL_CALLS,
                    usage=move.get("usage") or {},
                )
            )
        return cls(script, **kw)

    async def complete(self, req: ModelRequest) -> ModelResponse:
        """Return the next scripted response; ``req`` is not inspected.

        Once the script is used up, every further call returns an empty STOP
        response (a model with nothing left to say).
        """
        cursor = self._i
        if cursor < len(self._moves):
            self._i = cursor + 1
            return self._moves[cursor]
        return ModelResponse(text=None, finish=Finish.STOP)

    @property
    def exhausted(self) -> bool:
        """True once every scripted move has been handed out."""
        remaining = len(self._moves) - self._i
        return remaining <= 0
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# Structural conformance check (no runtime cost; documents intent): the
# annotation asks a static type checker to confirm that ScriptedProvider is
# acceptable wherever a ModelProvider class is expected. At runtime this is only
# a throwaway name binding.
_: type[ModelProvider] = ScriptedProvider
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
"""Build step 7 — the real ModelProvider adapters: anthropic + openai-compatible.
|
|
2
|
+
|
|
3
|
+
The headline contract: **both adapters pass one shared checklist, so they behave
|
|
4
|
+
identically.** Each is exercised offline against its *real* SDK — an injected
|
|
5
|
+
client wired to an ``httpx.MockTransport`` returns canned provider JSON, so the
|
|
6
|
+
adapter's translation and the SDK's own parsing both run, with no network. A
|
|
7
|
+
live call against each API is opt-in (env-gated) so it never blocks CI.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
import anthropic
|
|
16
|
+
import httpx
|
|
17
|
+
import openai
|
|
18
|
+
import pytest
|
|
19
|
+
|
|
20
|
+
from zu_core.ports import Finish, ModelRequest
|
|
21
|
+
from zu_providers._messages import to_anthropic_messages, to_openai_messages
|
|
22
|
+
from zu_providers.anthropic import AnthropicProvider
|
|
23
|
+
from zu_providers.openai_compatible import OpenAICompatibleProvider
|
|
24
|
+
|
|
25
|
+
# Neutral tool spec passed as ``ModelRequest(tools=[_TOOL])`` in the tests below:
# one ``http_fetch`` tool whose only (required) parameter is a string ``url``.
_TOOL = {
    "name": "http_fetch",
    "description": "Fetch a URL.",
    "parameters": {
        "type": "object",
        "properties": {"url": {"type": "string"}},
        "required": ["url"],
    },
}
|
|
34
|
+
|
|
35
|
+
# Canned provider responses for each checklist scenario, in each wire format.
# Anthropic wire format, keyed by scenario name: "text" (plain answer, stop
# reason end_turn), "tool" (a single tool_use block) and "length" (cut off by
# max_tokens). Usage carries only input/output counts -- no total.
_ANTHROPIC: dict[str, dict] = {
    "text": {
        "id": "msg_1", "type": "message", "role": "assistant", "model": "claude-opus-4-8",
        "content": [{"type": "text", "text": "hello world"}],
        "stop_reason": "end_turn", "stop_sequence": None,
        "usage": {"input_tokens": 10, "output_tokens": 5},
    },
    "tool": {
        "id": "msg_2", "type": "message", "role": "assistant", "model": "claude-opus-4-8",
        "content": [{"type": "tool_use", "id": "toolu_x", "name": "http_fetch", "input": {"url": "http://e.test/"}}],
        "stop_reason": "tool_use", "stop_sequence": None,
        "usage": {"input_tokens": 12, "output_tokens": 7},
    },
    "length": {
        "id": "msg_3", "type": "message", "role": "assistant", "model": "claude-opus-4-8",
        "content": [{"type": "text", "text": "partial"}],
        "stop_reason": "max_tokens", "stop_sequence": None,
        "usage": {"input_tokens": 10, "output_tokens": 4},
    },
}
|
|
56
|
+
# OpenAI chat-completion wire format, keyed by scenario name: "text" (plain
# answer, finish_reason stop), "tool" (one function tool call whose arguments
# arrive as a JSON *string*) and "length" (finish_reason length). Usage includes
# a total_tokens count on the wire.
_OPENAI: dict[str, dict] = {
    "text": {
        "id": "c1", "object": "chat.completion", "created": 0, "model": "gpt-x",
        "choices": [{"index": 0, "message": {"role": "assistant", "content": "hello world"}, "finish_reason": "stop"}],
        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
    },
    "tool": {
        "id": "c2", "object": "chat.completion", "created": 0, "model": "gpt-x",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant", "content": None,
                "tool_calls": [{
                    "id": "call_x", "type": "function",
                    "function": {"name": "http_fetch", "arguments": "{\"url\": \"http://e.test/\"}"},
                }],
            },
            "finish_reason": "tool_calls",
        }],
        "usage": {"prompt_tokens": 12, "completion_tokens": 7, "total_tokens": 19},
    },
    "length": {
        "id": "c3", "object": "chat.completion", "created": 0, "model": "gpt-x",
        "choices": [{"index": 0, "message": {"role": "assistant", "content": "partial"}, "finish_reason": "length"}],
        "usage": {"prompt_tokens": 10, "completion_tokens": 4, "total_tokens": 14},
    },
}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _mock_transport(payload: dict, captured: list) -> httpx.MockTransport:
    """Return a transport that answers every request with ``payload`` as JSON.

    The JSON-decoded body of each incoming request is appended to ``captured``
    so a test can inspect what the adapter actually sent.
    """

    def _reply(request: httpx.Request) -> httpx.Response:
        sent_body = json.loads(request.content)
        captured.append(sent_body)
        return httpx.Response(200, json=payload)

    transport = httpx.MockTransport(_reply)
    return transport
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def make_anthropic(scenario: str, captured: list) -> AnthropicProvider:
    """Build an AnthropicProvider whose SDK client replies with the canned
    ``_ANTHROPIC[scenario]`` payload; request bodies are recorded in ``captured``."""
    transport = _mock_transport(_ANTHROPIC[scenario], captured)
    http_client = httpx.AsyncClient(transport=transport)
    sdk_client = anthropic.AsyncAnthropic(api_key="test", http_client=http_client)
    return AnthropicProvider(client=sdk_client)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def make_openai(scenario: str, captured: list) -> OpenAICompatibleProvider:
    """Build an OpenAICompatibleProvider (model ``gpt-x``) whose SDK client
    replies with the canned ``_OPENAI[scenario]`` payload; request bodies are
    recorded in ``captured``."""
    transport = _mock_transport(_OPENAI[scenario], captured)
    http_client = httpx.AsyncClient(transport=transport)
    sdk_client = openai.AsyncOpenAI(
        api_key="test",
        base_url="http://test.local/v1",
        http_client=http_client,
    )
    return OpenAICompatibleProvider(model="gpt-x", client=sdk_client)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# The adapter factories the shared checklist is parametrized over. Each test
# receives one of them as ``make`` and calls it as ``make(scenario, captured)``;
# the ids label the two variants in pytest's output.
_PROVIDERS = [
    pytest.param(make_anthropic, id="anthropic"),
    pytest.param(make_openai, id="openai-compatible"),
]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# --- the shared checklist: identical neutral behaviour from both adapters -----
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@pytest.mark.parametrize("make", _PROVIDERS)
async def test_text_finalize(make) -> None:
    # A plain text answer: STOP finish, the text itself, no tool calls, and the
    # canned token counts surfaced through the neutral usage dict.
    provider = make("text", [])
    request = ModelRequest(messages=[{"role": "user", "content": "hi"}])
    response = await provider.complete(request)
    assert response.finish is Finish.STOP
    assert response.text == "hello world"
    assert response.tool_calls == []
    usage = response.usage
    assert usage["input_tokens"] == 10
    assert usage["output_tokens"] == 5
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@pytest.mark.parametrize("make", _PROVIDERS)
async def test_tool_call(make) -> None:
    # A tool-call answer: exactly one call, name and args translated to the
    # neutral shape, and no text alongside it.
    provider = make("tool", [])
    request = ModelRequest(messages=[{"role": "user", "content": "fetch"}], tools=[_TOOL])
    response = await provider.complete(request)
    assert response.finish is Finish.TOOL_CALLS
    assert len(response.tool_calls) == 1
    call = response.tool_calls[0]
    assert call.name == "http_fetch"
    assert call.args == {"url": "http://e.test/"}  # parsed to a dict, both ways
    assert response.text is None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@pytest.mark.parametrize("make", _PROVIDERS)
async def test_length_is_finish_length(make) -> None:
    # A token-limit cut-off must surface as Finish.LENGTH from both adapters.
    provider = make("length", [])
    request = ModelRequest(messages=[{"role": "user", "content": "x"}])
    response = await provider.complete(request)
    assert response.finish is Finish.LENGTH
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@pytest.mark.parametrize("make", _PROVIDERS)
async def test_capabilities_present(make) -> None:
    # Both adapters declare native tool support on their capabilities.
    provider = make("text", [])
    assert provider.capabilities.native_tools is True
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@pytest.mark.parametrize("make", _PROVIDERS)
async def test_usage_shape_is_normalised(make) -> None:
    # Both adapters must expose the SAME usage keys to the cost projection:
    # input/output/total. OpenAI sends a total on the wire; Anthropic does not,
    # so that adapter computes it (input + output). Either way one shape comes
    # out.
    provider = make("text", [])
    request = ModelRequest(messages=[{"role": "user", "content": "hi"}])
    usage = (await provider.complete(request)).usage
    assert usage["input_tokens"] == 10
    assert usage["output_tokens"] == 5
    assert usage["total_tokens"] == 15  # 10 + 5, whether the API gave it or not
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def test_api_key_resolution_prefers_explicit_then_env(monkeypatch) -> None:
    # Three steps, in order: an explicit key works with no env var set; with
    # neither present the adapter raises a clear error instead of calling the
    # API unauthenticated; and with only the env var set the key resolves.
    env_var = "ANTHROPIC_API_KEY"
    monkeypatch.delenv(env_var, raising=False)

    with_explicit_key = AnthropicProvider(model="claude-x", api_key="sk-explicit")
    with_explicit_key._ensure_client()  # no raise

    with pytest.raises(RuntimeError, match="no Anthropic API key"):
        AnthropicProvider(model="claude-x")._ensure_client()

    monkeypatch.setenv(env_var, "sk-from-env")
    from_env = AnthropicProvider(model="claude-x")
    from_env._ensure_client()  # resolves from env
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
async def test_native_tools_false_raises_not_implemented() -> None:
    # The prompt-based tool fallback for models without native tools is
    # deferred, so the adapter has to raise clearly rather than guess.
    provider = OpenAICompatibleProvider(model="local", native_tools=False, client=object())
    request = ModelRequest(messages=[{"role": "user", "content": "hi"}])
    with pytest.raises(NotImplementedError):
        await provider.complete(request)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
async def test_openai_empty_choices_is_no_answer_not_crash() -> None:
    # Some OpenAI-compatible servers (vLLM/Ollama/proxies) answer with
    # choices: [] on certain errors/policy stops. That has to come back as a
    # no-answer STOP -- never as an IndexError on choices[0].
    seen_requests: list = []
    empty_choices = {
        "id": "c0", "object": "chat.completion", "created": 0, "model": "gpt-x",
        "choices": [],
        "usage": {"prompt_tokens": 3, "completion_tokens": 0, "total_tokens": 3},
    }
    http_client = httpx.AsyncClient(transport=_mock_transport(empty_choices, seen_requests))
    sdk_client = openai.AsyncOpenAI(
        api_key="test",
        base_url="http://test.local/v1",
        http_client=http_client,
    )
    provider = OpenAICompatibleProvider(model="gpt-x", client=sdk_client)

    request = ModelRequest(messages=[{"role": "user", "content": "hi"}])
    response = await provider.complete(request)

    assert response.text is None
    assert response.tool_calls == []
    assert response.finish is Finish.STOP
    assert response.usage["total_tokens"] == 3  # usage still captured
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def test_orphan_tool_result_raises_not_silent() -> None:
    # A tool result with no preceding tool call is a malformed history. Both
    # translators must fail loudly instead of fabricating an id the provider
    # would later reject as an opaque 400.
    orphan_history = [{"role": "tool", "name": "http_fetch", "content": "{}"}]
    for translate in (to_anthropic_messages, to_openai_messages):
        with pytest.raises(ValueError):
            translate(orphan_history)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# --- request translation (provider-specific wire shapes) ----------------------
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
async def test_anthropic_request_translation() -> None:
    # Inspect the request body the SDK actually put on the wire.
    sent: list = []
    provider = make_anthropic("text", sent)
    request = ModelRequest(
        messages=[
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "hi"},
        ],
        tools=[_TOOL],
    )
    await provider.complete(request)

    wire = sent[0]
    assert wire["system"] == "sys"  # system lifted out of messages
    assert wire["messages"] == [{"role": "user", "content": "hi"}]
    assert wire["max_tokens"] == 4096  # adapter default
    first_tool = wire["tools"][0]
    assert first_tool["name"] == "http_fetch"
    assert "input_schema" in first_tool  # parameters -> input_schema
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
async def test_openai_request_translation() -> None:
    # Inspect the request body the SDK actually put on the wire.
    sent: list = []
    provider = make_openai("text", sent)
    request = ModelRequest(
        messages=[
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "hi"},
        ],
        tools=[_TOOL],
    )
    await provider.complete(request)

    wire = sent[0]
    assert wire["messages"][0] == {"role": "system", "content": "sys"}  # system stays inline
    first_tool = wire["tools"][0]
    assert first_tool["type"] == "function"
    assert first_tool["function"]["name"] == "http_fetch"
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# --- the id-matching translation (pure, no SDK) -------------------------------
|
|
253
|
+
|
|
254
|
+
# A neutral three-turn history: user question, assistant tool call, tool result.
# None of the entries carries a call id, so the translators have to synthesise
# one and reference it from the result.
_TOOL_HISTORY: list[dict] = [
    {"role": "user", "content": "q"},
    {"role": "assistant", "tool_calls": [{"name": "http_fetch", "args": {"url": "u"}}]},
    {"role": "tool", "name": "http_fetch", "content": "{\"html\": \"x\"}"},
]
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def test_to_anthropic_matches_synthesised_tool_ids() -> None:
    # The tool_result block must point at the id given to the tool_use block.
    _, translated = to_anthropic_messages(_TOOL_HISTORY)
    tool_use = translated[1]["content"][0]
    tool_result = translated[2]["content"][0]
    assert tool_result["type"] == "tool_result"
    assert tool_result["tool_use_id"] == tool_use["id"]  # result references the same id
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def test_to_openai_matches_synthesised_tool_ids() -> None:
    # The tool message must reference the id given to the assistant's tool call.
    translated = to_openai_messages(_TOOL_HISTORY)
    assistant_turn, tool_turn = translated[1], translated[2]
    expected_id = assistant_turn["tool_calls"][0]["id"]
    assert tool_turn["role"] == "tool"
    assert tool_turn["tool_call_id"] == expected_id
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# An assistant turn that reasons *and* calls a tool: the text must survive into
# both wire formats (regression: it used to be silently dropped on replay).
# Differs from a bare tool-call history only by the assistant turn's ``content``.
_TOOL_HISTORY_WITH_TEXT: list[dict] = [
    {"role": "user", "content": "q"},
    {"role": "assistant", "content": "I'll fetch the page first.",
     "tool_calls": [{"name": "http_fetch", "args": {"url": "u"}}]},
    {"role": "tool", "name": "http_fetch", "content": "{\"html\": \"x\"}"},
]
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def test_anthropic_preserves_assistant_text_with_tool_calls() -> None:
    # The assistant turn becomes a text block followed by the tool_use block.
    _, translated = to_anthropic_messages(_TOOL_HISTORY_WITH_TEXT)
    assistant_blocks = translated[1]["content"]
    leading, following = assistant_blocks[0], assistant_blocks[1]
    assert leading == {"type": "text", "text": "I'll fetch the page first."}
    assert following["type"] == "tool_use"  # text leads, tool_use follows
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def test_openai_preserves_assistant_text_with_tool_calls() -> None:
    # The assistant message keeps its text next to the translated tool call.
    translated = to_openai_messages(_TOOL_HISTORY_WITH_TEXT)
    assistant_turn = translated[1]
    assert assistant_turn["content"] == "I'll fetch the page first."
    assert assistant_turn["tool_calls"][0]["function"]["name"] == "http_fetch"
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# --- the adapter drives the real loop (full assistant/tool history) -----------
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
async def test_anthropic_adapter_drives_the_loop() -> None:
    # The checklist only sends simple messages. This test runs the real loop, so
    # its full neutral history (system + user -> tool_use -> assistant
    # tool_calls + tool result -> finalise) goes through the adapter's
    # translation end to end.
    from zu_core.bus import EventBus
    from zu_core.contracts import Status, TaskSpec
    from zu_core.loop import run_task
    from zu_core.registry import Registry
    from zu_tools.fetch import HttpFetch

    html = "<html><body><span class='price'>$9.00</span></body></html>"

    def serve_page(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, text=html)

    registry = Registry()
    fetch_tool = HttpFetch(allow_private=True, transport=httpx.MockTransport(serve_page))
    registry.register("tools", "http_fetch", fetch_tool)

    # Canned model turns: the first asks for a fetch, every later one finalises.
    ask_to_fetch = {
        "id": "m1", "type": "message", "role": "assistant", "model": "claude-opus-4-8",
        "content": [{"type": "tool_use", "id": "toolu_a", "name": "http_fetch", "input": {"url": "http://x.test/"}}],
        "stop_reason": "tool_use", "stop_sequence": None,
        "usage": {"input_tokens": 20, "output_tokens": 8},
    }
    finalise = {
        "id": "m2", "type": "message", "role": "assistant", "model": "claude-opus-4-8",
        "content": [{"type": "text", "text": "{\"price\": \"$9.00\"}"}],
        "stop_reason": "end_turn", "stop_sequence": None,
        "usage": {"input_tokens": 40, "output_tokens": 12},
    }
    model_turns = 0

    def serve_model(request: httpx.Request) -> httpx.Response:
        nonlocal model_turns
        model_turns += 1
        body = ask_to_fetch if model_turns == 1 else finalise
        return httpx.Response(200, json=body)

    http_client = httpx.AsyncClient(transport=httpx.MockTransport(serve_model))
    sdk_client = anthropic.AsyncAnthropic(api_key="test", http_client=http_client)
    provider = AnthropicProvider(client=sdk_client)

    outcome = await run_task(TaskSpec(query="get the price"), provider, registry, EventBus())
    assert outcome.status == Status.SUCCESS
    assert outcome.value == {"price": "$9.00"}
    assert model_turns == 2  # the loop drove two model turns through the adapter
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# --- opt-in live calls (@pytest.mark.live + a real key; run with --run-live) ---
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
@pytest.mark.live
@pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="needs ANTHROPIC_API_KEY")
async def test_live_anthropic() -> None:
    # Live round trip with a default-configured provider; skipped when no
    # Anthropic key is set.
    provider = AnthropicProvider()
    request = ModelRequest(
        messages=[{"role": "user", "content": "Reply with the single word: pong"}],
        params={"max_tokens": 16},
    )
    response = await provider.complete(request)
    assert response.text is not None and "pong" in response.text.lower()
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
@pytest.mark.live
@pytest.mark.skipif(
    not (os.environ.get("OPENAI_API_KEY") or os.environ.get("OPENAI_BASE_URL")),
    reason="needs OPENAI_API_KEY or OPENAI_BASE_URL",
)
async def test_live_openai() -> None:
    # Live round trip; the model comes from ZU_LIVE_OPENAI_MODEL and falls back
    # to gpt-4o-mini.
    model_name = os.environ.get("ZU_LIVE_OPENAI_MODEL", "gpt-4o-mini")
    provider = OpenAICompatibleProvider(model=model_name)
    request = ModelRequest(
        messages=[{"role": "user", "content": "Reply with the single word: pong"}],
        params={"max_tokens": 16},
    )
    response = await provider.complete(request)
    assert response.text is not None and "pong" in response.text.lower()
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Build step 2 — the fake model plays its script back in order."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from zu_core.ports import Finish
|
|
6
|
+
from zu_providers.scripted import ScriptedProvider
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
async def _req():
    """Return a minimal single-user-message ModelRequest for the tests here."""
    from zu_core.ports import ModelRequest

    user_turn = {"role": "user", "content": "go"}
    return ModelRequest(messages=[user_turn])
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def test_plays_moves_in_order() -> None:
    # Two tool moves followed by a final text move, replayed one per call.
    script = [
        {"tool": "http_fetch", "args": {"url": "https://example.com"}},
        {"tool": "html_parse", "args": {"selector": ".price"}},
        {"text": "the price is $9", "finish": "stop"},
    ]
    provider = ScriptedProvider.from_moves(script)

    first = await provider.complete(await _req())
    assert first.finish is Finish.TOOL_CALLS
    fetch_call = first.tool_calls[0]
    assert fetch_call.name == "http_fetch"
    assert fetch_call.args == {"url": "https://example.com"}

    second = await provider.complete(await _req())
    assert second.tool_calls[0].name == "html_parse"

    third = await provider.complete(await _req())
    assert third.finish is Finish.STOP
    assert third.text == "the price is $9"
    assert provider.exhausted
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def test_past_end_is_a_stop() -> None:
    # Calling past the end of a one-move script yields an empty STOP response.
    provider = ScriptedProvider.from_moves([{"text": "done"}])
    await provider.complete(await _req())  # consumes the only scripted move
    beyond_script = await provider.complete(await _req())
    assert beyond_script.finish is Finish.STOP
    assert beyond_script.text is None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_declares_capabilities() -> None:
    # Even an empty script yields a provider that declares native tool support.
    provider = ScriptedProvider.from_moves([])
    capabilities = provider.capabilities
    assert capabilities.native_tools is True
|