modelmeld 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modelmeld/__init__.py +6 -0
- modelmeld/__main__.py +24 -0
- modelmeld/adapters/__init__.py +11 -0
- modelmeld/adapters/anthropic_adapter.py +165 -0
- modelmeld/adapters/base.py +116 -0
- modelmeld/adapters/openai_adapter.py +125 -0
- modelmeld/adapters/retry.py +181 -0
- modelmeld/adapters/stub.py +87 -0
- modelmeld/adapters/tensorrt_llm_adapter.py +60 -0
- modelmeld/adapters/vllm_adapter.py +57 -0
- modelmeld/api/__init__.py +3 -0
- modelmeld/api/body_size_limit.py +124 -0
- modelmeld/api/byok.py +197 -0
- modelmeld/api/routes/__init__.py +3 -0
- modelmeld/api/routes/chat.py +950 -0
- modelmeld/api/routes/healthz.py +13 -0
- modelmeld/api/routes/messages.py +728 -0
- modelmeld/api/routes/models.py +84 -0
- modelmeld/api/routing_hints.py +180 -0
- modelmeld/api/schemas.py +341 -0
- modelmeld/api/schemas_anthropic.py +418 -0
- modelmeld/api/server.py +125 -0
- modelmeld/cache/__init__.py +59 -0
- modelmeld/cache/base.py +125 -0
- modelmeld/cache/embedding.py +94 -0
- modelmeld/cache/in_memory.py +79 -0
- modelmeld/cache/semantic.py +134 -0
- modelmeld/cli/__init__.py +95 -0
- modelmeld/cli/__main__.py +7 -0
- modelmeld/cli/doctor.py +488 -0
- modelmeld/cli/setup.py +504 -0
- modelmeld/config.py +114 -0
- modelmeld/hooks.py +117 -0
- modelmeld/licensing.py +247 -0
- modelmeld/memory/__init__.py +110 -0
- modelmeld/memory/base.py +352 -0
- modelmeld/memory/context.py +313 -0
- modelmeld/memory/identity.py +130 -0
- modelmeld/memory/in_memory.py +249 -0
- modelmeld/memory/summarizer.py +304 -0
- modelmeld/memory/tiers.py +82 -0
- modelmeld/privacy/__init__.py +31 -0
- modelmeld/privacy/scrubber.py +220 -0
- modelmeld/py.typed +0 -0
- modelmeld/router/__init__.py +173 -0
- modelmeld/router/base.py +338 -0
- modelmeld/router/capability.py +263 -0
- modelmeld/scout/__init__.py +81 -0
- modelmeld/scout/base.py +54 -0
- modelmeld/scout/benchmarks/__init__.py +55 -0
- modelmeld/scout/benchmarks/aider_polyglot.py +147 -0
- modelmeld/scout/benchmarks/artificial_analysis.py +201 -0
- modelmeld/scout/benchmarks/base.py +52 -0
- modelmeld/scout/benchmarks/livebench.py +140 -0
- modelmeld/scout/benchmarks/lmarena.py +174 -0
- modelmeld/scout/benchmarks/refresher.py +339 -0
- modelmeld/scout/capability.py +472 -0
- modelmeld/scout/data/LICENSE.md +91 -0
- modelmeld/scout/data/__init__.py +3 -0
- modelmeld/scout/data/default_registry.json +502 -0
- modelmeld/scout/devtool.py +179 -0
- modelmeld/scout/feed.py +320 -0
- modelmeld/scout/heuristics.py +192 -0
- modelmeld/scout/policy.py +226 -0
- modelmeld/scout/registry.py +283 -0
- modelmeld/scout/task_category.py +193 -0
- modelmeld/tokens/__init__.py +27 -0
- modelmeld/tokens/counter.py +170 -0
- modelmeld/translation/__init__.py +32 -0
- modelmeld/translation/openai_anthropic.py +1104 -0
- modelmeld-0.1.0.dist-info/METADATA +262 -0
- modelmeld-0.1.0.dist-info/RECORD +76 -0
- modelmeld-0.1.0.dist-info/WHEEL +4 -0
- modelmeld-0.1.0.dist-info/entry_points.txt +2 -0
- modelmeld-0.1.0.dist-info/licenses/LICENSE +661 -0
- modelmeld-0.1.0.dist-info/licenses/NOTICE +88 -0
modelmeld/__init__.py
ADDED
modelmeld/__main__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 ModelMeld.
|
|
3
|
+
|
|
4
|
+
"""Run the gateway server: `python -m modelmeld`."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main() -> None:
    """Start the gateway's ASGI app under uvicorn, configured from `GatewaySettings`."""
    # Both imports are function-local, exactly as in the original layout.
    import uvicorn

    from modelmeld.config import GatewaySettings

    cfg = GatewaySettings()
    # uvicorn is handed the lowercased form of the configured log level.
    server_options = {
        "host": cfg.host,
        "port": cfg.port,
        "log_level": cfg.log_level.lower(),
    }
    uvicorn.run("modelmeld.api.server:app", **server_options)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 ModelMeld.
|
|
3
|
+
|
|
4
|
+
"""Provider adapters. See base.py for the contract."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from modelmeld.adapters.base import AdapterError, ProviderAdapter
|
|
9
|
+
from modelmeld.adapters.stub import StubAdapter
|
|
10
|
+
|
|
11
|
+
__all__ = ["AdapterError", "ProviderAdapter", "StubAdapter"]
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 ModelMeld.
|
|
3
|
+
|
|
4
|
+
"""AnthropicAdapter — pass-through to Anthropic Messages API with schema translation."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from collections.abc import AsyncIterator
|
|
10
|
+
|
|
11
|
+
from modelmeld.adapters.base import AdapterError, ProviderAdapter
|
|
12
|
+
from modelmeld.adapters.retry import (
|
|
13
|
+
RetryConfig,
|
|
14
|
+
retry_async,
|
|
15
|
+
wrap_as_adapter_error,
|
|
16
|
+
)
|
|
17
|
+
from modelmeld.api.schemas import (
|
|
18
|
+
ChatCompletion,
|
|
19
|
+
ChatCompletionChunk,
|
|
20
|
+
ChatCompletionRequest,
|
|
21
|
+
)
|
|
22
|
+
from modelmeld.api.schemas_anthropic import AnthropicMessagesRequest
|
|
23
|
+
from modelmeld.translation import (
|
|
24
|
+
AnthropicStreamTranslator,
|
|
25
|
+
from_anthropic_response,
|
|
26
|
+
to_anthropic_params,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AnthropicAdapter(ProviderAdapter):
    """Adapter that sends chat requests to the Anthropic Messages API.

    Requests are either translated from the gateway's OpenAI-shaped schema
    or, when the caller supplies the original `AnthropicMessagesRequest`,
    forwarded in their native shape (see `_build_params`). Responses and
    stream events are translated back through `from_anthropic_response` and
    `AnthropicStreamTranslator`.
    """

    name = "anthropic"
    is_egress = True

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str | None = None,
        retry_config: RetryConfig | None = None,
        served_model: str | None = None,
    ) -> None:
        """Create the underlying `AsyncAnthropic` client.

        Args:
            api_key: Anthropic API key. When empty, falls back to the
                `ANTHROPIC_API_KEY` environment variable and then to
                `MODELMELD_ANTHROPIC_API_KEY`.
            base_url: optional upstream base URL; only passed to the SDK
                when truthy.
            retry_config: retry policy for upstream calls. Defaults to
                `RetryConfig()`.
            served_model: operator-pinned upstream model (F-8). When set it
                overrides the model named in each request.

        Raises:
            AdapterError: the `anthropic` package is not installed, or no
                API key could be found.
        """
        try:
            from anthropic import AsyncAnthropic
        except ImportError as e:
            raise AdapterError(
                "AnthropicAdapter requires the `anthropic` package. "
                "Install with: pip install 'modelmeld[anthropic]'"
            ) from e

        # The error message below advertises both variable names, so honour
        # both. The un-prefixed name keeps precedence, which means any
        # configuration that worked before resolves to the same key.
        key = (
            api_key
            or os.environ.get("ANTHROPIC_API_KEY")
            or os.environ.get("MODELMELD_ANTHROPIC_API_KEY")
        )
        if not key:
            raise AdapterError(
                "AnthropicAdapter requires an API key "
                "(pass api_key= or set ANTHROPIC_API_KEY / MODELMELD_ANTHROPIC_API_KEY)."
            )

        # Disable the SDK's built-in retry; we own retry policy via retry_async.
        # Stacking SDK retries on top of ours wastes time and rate limit.
        kwargs: dict = {"api_key": key, "max_retries": 0}
        if base_url:
            kwargs["base_url"] = base_url
        self._client = AsyncAnthropic(**kwargs)
        self._retry_config = retry_config or RetryConfig()
        # F-8: operator-pinned upstream model (overrides request.model).
        self.served_model = served_model

    async def chat(
        self,
        request: ChatCompletionRequest,
        *,
        native_request: object | None = None,
        extra_headers: dict[str, str] | None = None,
    ) -> ChatCompletion:
        """Non-streaming chat.

        `extra_headers` is the optional /v1/messages escape hatch for
        forwarding caller-supplied Anthropic protocol headers
        (`anthropic-beta`, `anthropic-version`, etc.) verbatim to the
        upstream. Without this, beta features the customer activates
        silently fall back at our gateway.

        Args:
            request: the OpenAI-shaped request.
            native_request: the original `AnthropicMessagesRequest`, if the
                caller has one; enables native passthrough in
                `_build_params`.
            extra_headers: headers to forward to the upstream; copied
                before use so the caller's dict is not shared.

        Raises:
            AdapterError: the upstream call failed. The subclass returned by
                `wrap_as_adapter_error` says whether it was transient.
        """
        params = self._build_params(request, native_request)
        if extra_headers:
            params["extra_headers"] = dict(extra_headers)

        async def _call():
            return await self._client.messages.create(**params)

        try:
            sdk_message = await retry_async(
                _call, self._retry_config, label="anthropic.chat",
            )
        except Exception as e:
            raise wrap_as_adapter_error(e, "Anthropic chat call failed") from e
        return from_anthropic_response(sdk_message.model_dump())

    async def stream_chat(
        self,
        request: ChatCompletionRequest,
        *,
        native_request: object | None = None,
        extra_headers: dict[str, str] | None = None,
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streaming chat; yields one translated chunk per relevant event.

        Only the call that opens the stream goes through `retry_async` and
        `wrap_as_adapter_error`. Exceptions raised while iterating the open
        stream propagate unchanged.

        Args:
            request: the OpenAI-shaped request.
            native_request: the original `AnthropicMessagesRequest`, if any.
            extra_headers: headers to forward to the upstream.

        Raises:
            AdapterError: opening the stream failed.
        """
        params = self._build_params(request, native_request)
        params["stream"] = True
        if extra_headers:
            params["extra_headers"] = dict(extra_headers)

        async def _open_stream():
            return await self._client.messages.create(**params)

        try:
            stream = await retry_async(
                _open_stream, self._retry_config, label="anthropic.stream_chat",
            )
        except Exception as e:
            raise wrap_as_adapter_error(
                e, "Anthropic stream_chat call failed",
            ) from e

        translator = AnthropicStreamTranslator()
        async for event in stream:
            chunk = translator.translate_event(event.model_dump())
            # The translator returns None for events with nothing to emit.
            if chunk is not None:
                yield chunk

    def _build_params(
        self,
        request: ChatCompletionRequest,
        native_request: object | None,
    ) -> dict:
        """Construct the Anthropic SDK params from either the native
        Anthropic request (preserving cache_control + tool schemas +
        image content blocks intact) or, when not available, by
        round-tripping through the OpenAI internal shape.

        Native-passthrough is the path /v1/messages takes when routing
        to an Anthropic upstream — without it, cache_control breakpoints
        get silently dropped and customers pay ~5x more on what would
        otherwise be cache hits (the failure mode musistudio/claude-code-router
        ships today). /v1/chat/completions callers don't supply
        native_request and use the translation path.

        Args:
            request: the OpenAI-shaped request (always present).
            native_request: used only when it is an
                `AnthropicMessagesRequest`; anything else selects the
                translation path.

        Returns:
            Keyword arguments for `messages.create`.
        """
        if isinstance(native_request, AnthropicMessagesRequest):
            # Native passthrough — preserve the customer's exact request
            # shape. Apply F-8 served_model substitution at this layer so
            # operators can still pin the upstream model regardless of
            # what the customer asked for.
            params = native_request.model_dump(exclude_none=True)
            if self.served_model is not None:
                params["model"] = self.served_model
            elif params.get("model") is None:
                # Defense in depth — Anthropic SDK requires `model`.
                params["model"] = request.model
            return params
        # Translation path (the existing /v1/chat/completions behavior).
        request = self._apply_served_model(request)
        return to_anthropic_params(request)

    async def health(self) -> bool:
        """Always report healthy; no upstream request is made."""
        # Anthropic has no cheap public health endpoint; consider the client
        # configured-and-imported as healthy. Real check happens on first call.
        return True

    async def close(self) -> None:
        """Close the underlying SDK client."""
        await self._client.close()
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 ModelMeld.
|
|
3
|
+
|
|
4
|
+
"""Provider adapter abstract base class.
|
|
5
|
+
|
|
6
|
+
`ProviderAdapter` is the extension point through which the gateway forwards
|
|
7
|
+
OpenAI-shaped requests to a concrete upstream (OpenAI cloud, Anthropic cloud,
|
|
8
|
+
local vLLM, etc.). Implementations live in sibling modules.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
from collections.abc import AsyncIterator
|
|
15
|
+
|
|
16
|
+
from modelmeld.api.schemas import (
|
|
17
|
+
ChatCompletion,
|
|
18
|
+
ChatCompletionChunk,
|
|
19
|
+
ChatCompletionRequest,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AdapterError(Exception):
    """Base exception for an adapter that could not fulfill a request.

    Covers network failures, upstream 5xx responses, schema-translation
    errors and misconfiguration such as a missing API key.

    The `TransientAdapterError` and `PermanentAdapterError` subclasses say
    whether the failure is retry-able. The TieredRouter uses that signal to
    choose between failing over to the other tier and surfacing the error
    to the caller.
    """
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class TransientAdapterError(AdapterError):
    """An adapter failure that a retry or a failover may resolve.

    Typical causes: HTTP 5xx, 429 rate limit, 529 overloaded, a network
    blip, or a timeout. Routers should try the other tier. Callers that see
    this repeatedly should treat it as a real outage.
    """
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class PermanentAdapterError(AdapterError):
    """An adapter failure that retrying will not fix.

    Typical causes: HTTP 401/403 auth failure, HTTP 404 model-not-found,
    schema-translation errors, or misconfiguration. Routers should NOT fail
    over on this error. Surfacing it lets the caller see the real cause
    rather than a misleading fallback response.
    """
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ProviderAdapter(ABC):
    """Abstract interface for sending an OpenAI-shaped request to one upstream provider."""

    name: str
    # Set to True by adapters whose traffic leaves the customer's network;
    # the chat route reads this flag to gate PII scrubbing.
    is_egress: bool = False
    # F-8: the upstream model the operator configured for this adapter.
    # - Set: outbound calls replace `request.model` with this value, so the
    #   client may send any model name (or none). The gateway still routes
    #   on the scout's tier decision.
    # - None: the client's model name is passed through unchanged, which is
    #   the default for adapters that proxy to multi-model providers.
    served_model: str | None = None

    @abstractmethod
    async def chat(self, request: ChatCompletionRequest) -> ChatCompletion:
        """Perform a non-streaming chat completion."""

    @abstractmethod
    def stream_chat(
        self, request: ChatCompletionRequest
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Perform a streaming chat completion; implementations are async generators."""

    @abstractmethod
    async def health(self) -> bool:
        """Cheaply check that the upstream is reachable; return False on failure."""

    def serves_model(self, model_id: str) -> bool:  # noqa: ARG002 — base default
        """Report whether this adapter can serve `model_id` (F-8).

        The base implementation answers True in both configurations:

        - `served_model=None` (pass-through): what the upstream supports is
          unknown, so failover is assumed to be safe.
        - `served_model="X"` (substitution): `_apply_served_model()`
          rewrites `request.model` to X on the outbound call, so any client
          model id is served. Setting `served_model` opts into that
          substitution.

        TieredRouter consults this before failover. Subclasses may override
        it for stricter behavior, e.g. a compliance-mode adapter that
        rejects non-matching model ids outright.
        """
        return True

    def _apply_served_model(
        self, request: ChatCompletionRequest,
    ) -> ChatCompletionRequest:
        """Return `request` with its model replaced by `served_model`, if one is set.

        When `served_model` is None, or already equals the request's model,
        the original object is returned with no copy made. Adapters call
        this at the start of `chat()` / `stream_chat()` before delegating
        upstream.
        """
        pinned = self.served_model
        if pinned is not None and request.model != pinned:
            # model_copy is a cheap shallow copy that keeps every other field.
            return request.model_copy(update={"model": pinned})
        return request

    async def close(self) -> None:
        """Release held resources; a no-op here, overridden where needed."""
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 ModelMeld.
|
|
3
|
+
|
|
4
|
+
"""OpenAIAdapter — pass-through to OpenAI's cloud API via the official SDK.
|
|
5
|
+
|
|
6
|
+
Named `openai_adapter` (not `openai`) to avoid shadowing the upstream package
|
|
7
|
+
when read by humans.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
from collections.abc import AsyncIterator
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
|
|
18
|
+
from modelmeld.adapters.base import AdapterError, ProviderAdapter
|
|
19
|
+
from modelmeld.adapters.retry import (
|
|
20
|
+
RetryConfig,
|
|
21
|
+
retry_async,
|
|
22
|
+
wrap_as_adapter_error,
|
|
23
|
+
)
|
|
24
|
+
from modelmeld.api.schemas import (
|
|
25
|
+
ChatCompletion,
|
|
26
|
+
ChatCompletionChunk,
|
|
27
|
+
ChatCompletionRequest,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class OpenAIAdapter(ProviderAdapter):
    """Adapter that passes chat requests through to OpenAI via the official SDK.

    Requests are already OpenAI-shaped, so they are dumped to keyword
    arguments for `chat.completions.create`. SDK responses are re-validated
    into the gateway's own `ChatCompletion` / `ChatCompletionChunk` models.
    """

    name = "openai"
    is_egress = True

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str | None = None,
        http_client: httpx.AsyncClient | None = None,
        retry_config: RetryConfig | None = None,
        served_model: str | None = None,
    ) -> None:
        """Create the underlying `AsyncOpenAI` client.

        Args:
            api_key: OpenAI API key. When empty, falls back to the
                `OPENAI_API_KEY` environment variable and then to
                `MODELMELD_OPENAI_API_KEY`.
            base_url: optional upstream base URL, passed to the SDK as-is.
            http_client: optional pre-built `httpx.AsyncClient`, passed to
                the SDK as-is.
            retry_config: retry policy for upstream calls. Defaults to
                `RetryConfig()`.
            served_model: operator-pinned upstream model (F-8). When set it
                overrides the model named in each request.

        Raises:
            AdapterError: the `openai` package is not installed, or no API
                key could be found.
        """
        try:
            from openai import AsyncOpenAI
        except ImportError as e:
            raise AdapterError(
                "OpenAIAdapter requires the `openai` package. "
                "Install with: pip install 'modelmeld[openai]'"
            ) from e

        # The error message below advertises both variable names, so honour
        # both. The un-prefixed name keeps precedence, which means any
        # configuration that worked before resolves to the same key.
        key = (
            api_key
            or os.environ.get("OPENAI_API_KEY")
            or os.environ.get("MODELMELD_OPENAI_API_KEY")
        )
        if not key:
            raise AdapterError(
                "OpenAIAdapter requires an API key "
                "(pass api_key= or set OPENAI_API_KEY / MODELMELD_OPENAI_API_KEY)."
            )
        # Disable the SDK's built-in retry; retry policy lives in retry_async.
        self._client = AsyncOpenAI(
            api_key=key,
            base_url=base_url,
            http_client=http_client,
            max_retries=0,
        )
        self._retry_config = retry_config or RetryConfig()
        # F-8: operator-pinned upstream model (overrides request.model).
        self.served_model = served_model

    def _to_params(
        self, request: ChatCompletionRequest, *, stream: bool
    ) -> dict[str, Any]:
        """Dump `request` into keyword arguments for `chat.completions.create`.

        Args:
            request: the request to serialize.
            stream: value written to the `stream` key. `stream_options` is
                dropped when this is False.

        Returns:
            The request's non-None fields plus an explicit `stream` entry.
        """
        # exclude_none keeps optional fields off the wire so we don't override the
        # upstream's defaults; we set `stream` explicitly per call.
        excluded: set[str] = {"stream"}
        if not stream:
            excluded.add("stream_options")
        params = request.model_dump(exclude_none=True, exclude=excluded)
        params["stream"] = stream
        return params

    async def chat(self, request: ChatCompletionRequest) -> ChatCompletion:
        """Non-streaming chat completion.

        Raises:
            AdapterError: the upstream call failed. The subclass returned by
                `wrap_as_adapter_error` says whether it was transient.
        """
        request = self._apply_served_model(request)

        async def _call():
            return await self._client.chat.completions.create(
                **self._to_params(request, stream=False)
            )

        try:
            sdk_response = await retry_async(
                _call, self._retry_config, label="openai.chat",
            )
        except Exception as e:
            raise wrap_as_adapter_error(e, "OpenAI chat call failed") from e
        return ChatCompletion.model_validate(sdk_response.model_dump())

    async def stream_chat(
        self, request: ChatCompletionRequest
    ) -> AsyncIterator[ChatCompletionChunk]:
        """Streaming chat completion; yields one validated chunk per SDK chunk.

        Only the call that opens the stream goes through `retry_async` and
        `wrap_as_adapter_error`. Exceptions raised while iterating the open
        stream propagate unchanged.

        Raises:
            AdapterError: opening the stream failed.
        """
        request = self._apply_served_model(request)

        async def _open_stream():
            return await self._client.chat.completions.create(
                **self._to_params(request, stream=True)
            )

        try:
            stream = await retry_async(
                _open_stream, self._retry_config, label="openai.stream_chat",
            )
        except Exception as e:
            raise wrap_as_adapter_error(
                e, "OpenAI stream_chat call failed",
            ) from e
        async for chunk in stream:
            yield ChatCompletionChunk.model_validate(chunk.model_dump())

    async def health(self) -> bool:
        """Return True if listing models succeeds, False on any exception."""
        try:
            await self._client.models.list()
            return True
        except Exception:
            # Deliberate best-effort: a failed probe is reported as
            # "unhealthy" instead of being raised.
            return False

    async def close(self) -> None:
        """Close the underlying SDK client."""
        await self._client.close()
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 ModelMeld.
|
|
3
|
+
|
|
4
|
+
"""Retry-with-backoff utility for adapter calls (F-5).
|
|
5
|
+
|
|
6
|
+
Wraps an async adapter call with exponential backoff retry on transient
|
|
7
|
+
errors. Permanent errors (auth failure, config mismatch, schema errors)
|
|
8
|
+
raise immediately - retrying them just wastes time and exhausts the
|
|
9
|
+
provider's rate limit.
|
|
10
|
+
|
|
11
|
+
Used by `AnthropicAdapter` and `OpenAIAdapter` to absorb provider
|
|
12
|
+
throttling and 5xx blips before they reach the `TieredRouter`. With this
|
|
13
|
+
in place, the router's failover logic only triggers on outages that
|
|
14
|
+
genuinely persist across retries.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import logging
|
|
21
|
+
import random
|
|
22
|
+
from collections.abc import Awaitable, Callable
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from typing import TypeVar
|
|
25
|
+
|
|
26
|
+
from modelmeld.adapters.base import (
|
|
27
|
+
AdapterError,
|
|
28
|
+
PermanentAdapterError,
|
|
29
|
+
TransientAdapterError,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
T = TypeVar("T")
|
|
35
|
+
|
|
36
|
+
# HTTP status codes that justify a retry. Anything else is treated as
|
|
37
|
+
# permanent - retrying a 401 won't make the credentials valid.
|
|
38
|
+
TRANSIENT_STATUS_CODES: frozenset[int] = frozenset(
    (
        408,  # Request Timeout
        409,  # Conflict (some APIs use it for "operation in progress")
        425,  # Too Early
        429,  # Too Many Requests
        # Server-side failures
        500,
        502,
        503,
        504,
        529,  # Anthropic-specific Overloaded
    )
)
|
|
46
|
+
|
|
47
|
+
# Class-name fragments that indicate transience when status_code isn't
|
|
48
|
+
# available on the exception (some SDKs raise network errors that wrap
|
|
49
|
+
# the underlying httpx/aiohttp exception).
|
|
50
|
+
TRANSIENT_CLASS_HINTS: tuple[str, ...] = (
    # Lowercase fragments, matched as substrings of the lowercased
    # exception class name.
    "ratelimit",
    "overloaded",
    "timeout",
    "connection",
    "apiconnection",
    "internalservererror",
    "serviceunavailable",
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(frozen=True)
class RetryConfig:
    """Retry policy consumed by `retry_async`.

    With the defaults (max_attempts=3, base_delay_sec=1.0) there are two
    waits between the three attempts, of about 1s and then 2s. That is
    roughly 3 seconds of total backoff. Each wait is randomized by up to
    ±20% to avoid a thundering herd.
    """

    # Total number of calls, counting the first one (not "extra retries").
    max_attempts: int = 3
    # Wait before the second attempt; doubles for each attempt after that.
    base_delay_sec: float = 1.0
    # Ceiling applied to the exponential delay before jitter is added.
    max_delay_sec: float = 30.0
    jitter: float = 0.2  # ±20% randomization of the computed delay
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def is_transient_error(exc: BaseException) -> bool:
    """Decide whether `exc` is worth retrying.

    Three checks run in order, and the first that applies decides:
      1. The exception is an `asyncio.TimeoutError` or `ConnectionError`.
      2. It has an integer `.status_code`; the result is whether that code
         is in `TRANSIENT_STATUS_CODES`.
      3. Otherwise, its lowercased class name contains one of the
         `TRANSIENT_CLASS_HINTS` fragments.
    """
    network_level = (asyncio.TimeoutError, ConnectionError)
    if isinstance(exc, network_level):
        return True

    # An integer status code is authoritative: the class name is not
    # consulted once one is present.
    code = getattr(exc, "status_code", None)
    if isinstance(code, int):
        return code in TRANSIENT_STATUS_CODES

    lowered_name = type(exc).__name__.lower()
    for fragment in TRANSIENT_CLASS_HINTS:
        if fragment in lowered_name:
            return True
    return False
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _compute_backoff(
|
|
95
|
+
attempt: int, config: RetryConfig, rng: random.Random | None = None,
|
|
96
|
+
) -> float:
|
|
97
|
+
"""Exponential backoff with optional ±jitter."""
|
|
98
|
+
raw = config.base_delay_sec * (2 ** (attempt - 1))
|
|
99
|
+
capped = min(raw, config.max_delay_sec)
|
|
100
|
+
if config.jitter > 0:
|
|
101
|
+
r = (rng or random).uniform(-config.jitter, config.jitter)
|
|
102
|
+
capped = capped * (1.0 + r)
|
|
103
|
+
return max(capped, 0.0)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def retry_async(
    func: Callable[[], Awaitable[T]],
    config: RetryConfig | None = None,
    *,
    label: str = "adapter call",
    sleep: Callable[[float], Awaitable[None]] | None = None,
    rng: random.Random | None = None,
) -> T:
    """Run an async callable with exponential-backoff retry.

    Args:
        func: zero-arg async callable to invoke. Wrap your call in a lambda
            or nested coroutine def.
        config: retry policy. Defaults to `RetryConfig()`. A `max_attempts`
            below 1 is treated as 1, so `func` is always called at least
            once.
        label: human-readable label for log lines (e.g. "anthropic.chat").
        sleep: injectable async sleep. Defaults to `asyncio.sleep`. Tests
            override this to avoid real wall-clock waits.
        rng: injectable RNG for jitter. Tests pass a seeded `random.Random`
            for deterministic backoff timing.

    Returns:
        Whatever `func()` returns on success.

    Raises:
        The last exception, unmodified, after all attempts are exhausted.
        Non-transient errors raise immediately (no retry).
    """
    cfg = config or RetryConfig()
    _sleep = sleep or asyncio.sleep
    # A non-positive max_attempts used to skip the loop entirely, so func was
    # never called and the function failed on an internal assertion. Clamp
    # so that "no retries" still means "one real attempt".
    attempts = max(1, cfg.max_attempts)

    for attempt in range(1, attempts + 1):
        try:
            return await func()
        # `Exception`, not `BaseException`: cancellation and interpreter-exit
        # signals never classify as transient, so they were always re-raised
        # untouched. They now simply bypass this handler.
        except Exception as e:
            # Give up when out of attempts, or when the error is permanent
            # and retrying would only waste time and rate limit.
            if attempt >= attempts or not is_transient_error(e):
                raise
            delay = _compute_backoff(attempt, cfg, rng)
            logger.info(
                "[%s] attempt %d/%d failed (%s: %s); retrying in %.2fs",
                label, attempt, attempts,
                type(e).__name__, str(e)[:120], delay,
            )
            await _sleep(delay)

    # Every iteration either returns, raises, or continues, and the final
    # iteration cannot continue. This only satisfies type checkers.
    raise AssertionError("retry loop exited without returning or raising")
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def wrap_as_adapter_error(exc: BaseException, prefix: str) -> AdapterError:
    """Build the `AdapterError` subclass that matches an upstream exception.

    `TieredRouter` (Sprint 2.6 / F-2) branches on the subclass:
      - `TransientAdapterError` → safe to fail over to the other tier
      - `PermanentAdapterError` → bubble up so the caller sees the real error

    Classification is delegated to `is_transient_error`. The message is
    `"<prefix>: <str(exc)>"`, which carries enough detail to debug from logs
    without exposing the underlying exception type. The error is returned,
    not raised; callers raise it themselves.
    """
    message = f"{prefix}: {exc}"
    error_cls = (
        TransientAdapterError if is_transient_error(exc) else PermanentAdapterError
    )
    return error_cls(message)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# Backward-compat alias for the underscore-prefixed call site in
|
|
179
|
+
# anthropic_adapter / openai_adapter. Both spellings are part of the
|
|
180
|
+
# adapter-internal contract; tests should prefer `wrap_as_adapter_error`.
|
|
181
|
+
_wrap_as_adapter_error = wrap_as_adapter_error
|