npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.6 → 0.10.0 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127)

package/packages/memory-engine/tests/test_embed_provider.py (DELETED)

@@ -1,693 +0,0 @@
-"""Unit tests for engine/services/_shared/embed_provider.py.
-Run with:
-    cd packages/memory-engine
-    python -m pytest tests/test_embed_provider.py -v
-"""
-from __future__ import annotations
-import sys
-from pathlib import Path
-# Make the engine/services tree importable for tests without packaging it.
-ROOT = Path(__file__).parent.parent / "engine" / "services"
-sys.path.insert(0, str(ROOT))
-import json  # noqa: E402
-import httpx  # noqa: E402
-import pytest  # noqa: E402
-from _shared.embed_provider import (  # noqa: E402
-    PROVIDERS,
-    EmbedAuthError,
-    EmbedClient,
-    EmbedHTTPError,
-    EmbedProvider,
-    resolve_provider,
-)
-# ----------------------------------------------------------------------
-# Helpers — stub httpx so we can assert the request shape.
-# ----------------------------------------------------------------------
-class _FakeResponse:
-    def __init__(self, status_code: int, payload: dict | str = ""):
-        self.status_code = status_code
-        if isinstance(payload, dict):
-            self._json = payload
-            self.text = json.dumps(payload)
-        else:
-            self._json = None
-            self.text = payload
-    @property
-    def is_success(self) -> bool:
-        return 200 <= self.status_code < 300
-    def json(self) -> dict:
-        if self._json is None:
-            raise ValueError("not json")
-        return self._json
-class _Recorder:
-    """Records every httpx.post call and returns canned responses keyed by URL."""
-    def __init__(self):
-        self.calls: list[dict] = []
-        self.responses: dict[str, _FakeResponse] = {}
-    def respond(self, url: str, response: _FakeResponse) -> None:
-        self.responses[url] = response
-    def __call__(self, url, *, json, headers, timeout):
-        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
-        if url in self.responses:
-            return self.responses[url]
-        # default: 401 to flush out unmatched URLs
-        return _FakeResponse(401, "no stub for this url")
-@pytest.fixture
-def recorder(monkeypatch):
-    rec = _Recorder()
-    monkeypatch.setattr(httpx, "post", rec)
-    return rec
-# ----------------------------------------------------------------------
-# Provider resolution
-# ----------------------------------------------------------------------
-def test_resolve_built_in_providers():
-    for name in ("openai", "pentatonic-gateway", "cohere"):
-        p = resolve_provider(name)
-        assert p.name == name
-def test_resolve_unknown_provider_raises():
-    with pytest.raises(ValueError):
-        resolve_provider("not-a-provider")
-def test_resolve_custom_provider_from_env(monkeypatch):
-    monkeypatch.setenv("L4_EMBED_AUTH_HEADER", "X-Custom-Auth")
-    monkeypatch.setenv("L4_EMBED_AUTH_FORMAT", "Token {key}")
-    monkeypatch.setenv("L4_EMBED_PATH_DEFAULT", "/embed")
-    monkeypatch.setenv("L4_EMBED_BODY_SHAPE", "cohere")
-    monkeypatch.setenv("L4_EMBED_RESPONSE_SHAPE", "cohere")
-    p = resolve_provider("custom", env_prefix="L4_")
-    assert p.auth_header == "X-Custom-Auth"
-    assert p.auth_format == "Token {key}"
-    assert p.path_default == "/embed"
-    # body shape produces Cohere-style "texts" field
-    body = p.body_builder(["hi"], "model-x")
-    assert body == {"texts": ["hi"], "model": "model-x", "input_type": "search_document"}
-# ----------------------------------------------------------------------
-# Request shape
-# ----------------------------------------------------------------------
-def test_openai_provider_request_shape(recorder):
-    recorder.respond(
-        "https://gw/v1/embeddings",
-        _FakeResponse(200, {"data": [{"embedding": [0.1, 0.2]}]}),
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-    )
-    out = client.embed_batch(["hello"])
-    assert out == [[0.1, 0.2]]
-    call = recorder.calls[0]
-    assert call["url"] == "https://gw/v1/embeddings"
-    assert call["json"] == {"input": ["hello"], "model": "m"}
-    assert call["headers"] == {"Authorization": "Bearer k"}
-def test_pentatonic_provider_request_shape(recorder):
-    recorder.respond(
-        "https://lambda-gateway.pentatonic.com/v1/embed",
-        _FakeResponse(200, {"data": [{"embedding": [1.0, 2.0]}]}),
-    )
-    client = EmbedClient(
-        url="https://lambda-gateway.pentatonic.com/v1/embed",
-        api_key="secret",
-        model="nv-embed-v2",
-        provider=PROVIDERS["pentatonic-gateway"],
-    )
-    out = client.embed_batch(["t1"])
-    assert out == [[1.0, 2.0]]
-    call = recorder.calls[0]
-    assert call["url"] == "https://lambda-gateway.pentatonic.com/v1/embed"
-    assert call["json"] == {"input": ["t1"], "model": "nv-embed-v2"}
-    assert call["headers"] == {"X-API-Key": "secret"}
-def test_pentatonic_response_parser_handles_both_shapes(recorder):
-    """Pentatonic Gateway has historically returned both {"data":[...]} and
-    {"embeddings":[...]} on different endpoints. Parser accepts either."""
-    p = PROVIDERS["pentatonic-gateway"]
-    assert p.response_parser({"data": [{"embedding": [1.0]}]}) == [[1.0]]
-    assert p.response_parser({"embeddings": [[1.0]]}) == [[1.0]]
-def test_cohere_provider_request_shape(recorder):
-    recorder.respond(
-        "https://api.cohere.ai/v1/embed",
-        _FakeResponse(200, {"embeddings": [[3.0, 4.0]]}),
-    )
-    client = EmbedClient(
-        url="https://api.cohere.ai/v1/embed",
-        api_key="cohere-key",
-        model="embed-english-v3.0",
-        provider=PROVIDERS["cohere"],
-    )
-    out = client.embed_batch(["hi"])
-    assert out == [[3.0, 4.0]]
-    call = recorder.calls[0]
-    assert call["json"] == {
-        "texts": ["hi"],
-        "model": "embed-english-v3.0",
-        "input_type": "search_document",
-    }
-    assert call["headers"] == {"Authorization": "Bearer cohere-key"}
-# ----------------------------------------------------------------------
-# Auto-detect
-# ----------------------------------------------------------------------
-def test_autodetect_on_401_falls_back_to_pentatonic(recorder):
-    """Operator configured openai but the URL+key actually belong to
-    Pentatonic Gateway. First call 401s, auto-detect probes pentatonic
-    and succeeds."""
-    recorder.respond(
-        "https://lambda-gateway.pentatonic.com/v1/embeddings",
-        _FakeResponse(401, '{"error":"Invalid or missing API key"}'),
-    )
-    recorder.respond(
-        "https://lambda-gateway.pentatonic.com/v1/embed",
-        _FakeResponse(200, {"data": [{"embedding": [9.0]}]}),
-    )
-    client = EmbedClient(
-        url="https://lambda-gateway.pentatonic.com/v1/embeddings",
-        api_key="k",
-        model="nv-embed-v2",
-        provider=PROVIDERS["openai"],
-    )
-    out = client.embed_batch(["x"])
-    assert out == [[9.0]]
-    assert client.active_provider == "pentatonic-gateway"
-    # First call uses configured (openai) shape, second uses pentatonic
-    assert recorder.calls[0]["headers"] == {"Authorization": "Bearer k"}
-    assert recorder.calls[1]["headers"] == {"X-API-Key": "k"}
-def test_autodetect_caches_after_first_success(recorder):
-    """Once auto-detect picks a winner, subsequent calls go straight to it
-    without retrying the original 401."""
-    recorder.respond(
-        "https://gw/v1/embeddings",
-        _FakeResponse(401, "wrong scheme"),
-    )
-    recorder.respond(
-        "https://gw/v1/embed",
-        _FakeResponse(200, {"data": [{"embedding": [1.0]}]}),
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-    )
-    client.embed_batch(["a"])  # triggers detect
-    n_after_first = len(recorder.calls)
-    client.embed_batch(["b"])  # should go straight to /v1/embed
-    assert len(recorder.calls) == n_after_first + 1
-    assert recorder.calls[-1]["url"] == "https://gw/v1/embed"
-def test_autodetect_disabled_raises(recorder):
-    recorder.respond("https://gw/v1/embeddings", _FakeResponse(401, "no auth"))
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-        autodetect=False,
-    )
-    with pytest.raises(EmbedAuthError):
-        client.embed_batch(["x"])
-    # Only one call: no probing happened.
-    assert len(recorder.calls) == 1
-def test_autodetect_all_fail_raises(recorder):
-    """Every candidate also 401s — raise EmbedAuthError."""
-    recorder.respond("https://gw/v1/embeddings", _FakeResponse(401, "x"))
-    recorder.respond("https://gw/v1/embed", _FakeResponse(401, "x"))
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-    )
-    with pytest.raises(EmbedAuthError):
-        client.embed_batch(["x"])
-# ----------------------------------------------------------------------
-# Error handling
-# ----------------------------------------------------------------------
-def test_non_401_http_error_does_not_trigger_autodetect(recorder):
-    # max_retries=0 isolates this test to autodetect behaviour. With
-    # retries enabled (default), 503 triggers the retry path which is
-    # exercised separately in the retry tests below.
-    recorder.respond(
-        "https://gw/v1/embeddings",
-        _FakeResponse(503, "upstream down"),
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-        max_retries=0,
-    )
-    with pytest.raises(EmbedHTTPError) as exc:
-        client.embed_batch(["x"])
-    assert exc.value.status == 503
-    assert len(recorder.calls) == 1
-def test_empty_input_returns_empty(recorder):
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-    )
-    assert client.embed_batch([]) == []
-    assert recorder.calls == []
-# ----------------------------------------------------------------------
-# from_env construction
-# ----------------------------------------------------------------------
-def test_from_env_reads_layer_prefix(monkeypatch, recorder):
-    monkeypatch.setenv("L4_NV_EMBED_URL", "https://lambda-gateway.pentatonic.com/v1/embed")
-    monkeypatch.setenv("L4_EMBED_API_KEY", "real-key")
-    monkeypatch.setenv("L4_EMBED_MODEL", "nv-embed-v2")
-    monkeypatch.setenv("L4_EMBED_PROVIDER", "pentatonic-gateway")
-    recorder.respond(
-        "https://lambda-gateway.pentatonic.com/v1/embed",
-        _FakeResponse(200, {"data": [{"embedding": [42.0]}]}),
-    )
-    client = EmbedClient.from_env(prefix="L4_")
-    out = client.embed_batch(["t"])
-    assert out == [[42.0]]
-    assert client.active_provider == "pentatonic-gateway"
-    assert recorder.calls[0]["headers"] == {"X-API-Key": "real-key"}
-def test_from_env_default_provider_is_openai(monkeypatch):
-    monkeypatch.setenv("L5_NV_EMBED_URL", "https://gw/v1/embeddings")
-    monkeypatch.setenv("L5_EMBED_API_KEY", "k")
-    client = EmbedClient.from_env(prefix="L5_")
-    assert client.active_provider == "openai"
-def test_from_env_autodetect_opt_out(monkeypatch, recorder):
-    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
-    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
-    monkeypatch.setenv("L4_EMBED_AUTODETECT", "false")
-    recorder.respond("https://gw/v1/embeddings", _FakeResponse(401, "x"))
-    client = EmbedClient.from_env(prefix="L4_")
-    with pytest.raises(EmbedAuthError):
-        client.embed_batch(["x"])
-    assert len(recorder.calls) == 1
-# ----------------------------------------------------------------------
-# URL handling
-# ----------------------------------------------------------------------
-def test_url_without_path_gets_provider_default(recorder):
-    """If operator provides only a base URL, the provider's path_default
-    is appended."""
-    recorder.respond(
-        "https://lambda-gateway.pentatonic.com/v1/embed",
-        _FakeResponse(200, {"data": [{"embedding": [0.0]}]}),
-    )
-    client = EmbedClient(
-        url="https://lambda-gateway.pentatonic.com",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["pentatonic-gateway"],
-    )
-    client.embed_batch(["x"])
-    assert recorder.calls[0]["url"] == "https://lambda-gateway.pentatonic.com/v1/embed"
-# ----------------------------------------------------------------------
-# Chunking — work around the Pentatonic AI Gateway's per-call cap of 5
-# texts. Above the cap the gateway 502s; without chunking the layer's
-# /index-batch handler raises, the compat shim swallows it, and vector
-# writes silently drop. Chunking splits the request into chunks of
-# `max_batch` so each call stays within the gateway's limit.
-# ----------------------------------------------------------------------
-class _PentatonicEchoStub:
-    """httpx.post replacement that returns one embedding per input text,
-    matching real gateway behaviour. Each response embedding encodes the
-    input index so tests can assert order preservation across chunks."""
-    def __init__(self):
-        self.calls: list[dict] = []
-        self._offset = 0  # running input-index counter across calls
-    def __call__(self, url, *, json, headers, timeout):
-        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
-        n = len(json.get("input") or [])
-        embs = [[float(self._offset + i)] for i in range(n)]
-        self._offset += n
-        return _FakeResponse(200, {"embeddings": embs})
-def test_chunking_below_max_batch_makes_one_call(monkeypatch):
-    """N <= max_batch sends one request, no chunking overhead."""
-    stub = _PentatonicEchoStub()
-    monkeypatch.setattr(httpx, "post", stub)
-    client = EmbedClient(
-        url="https://lambda-gateway.pentatonic.com/v1/embed",
-        api_key="k", model="m",
-        provider=PROVIDERS["pentatonic-gateway"],
-        max_batch=5,
-    )
-    out = client.embed_batch([f"t{i}" for i in range(5)])
-    assert len(out) == 5
-    assert len(stub.calls) == 1
-    assert len(stub.calls[0]["json"]["input"]) == 5
-def test_chunking_above_max_batch_splits_into_calls(monkeypatch):
-    """N > max_batch is split into len(N)/max_batch posts; results are
-    concatenated in input order so the caller can't tell."""
-    stub = _PentatonicEchoStub()
-    monkeypatch.setattr(httpx, "post", stub)
-    client = EmbedClient(
-        url="https://lambda-gateway.pentatonic.com/v1/embed",
-        api_key="k", model="m",
-        provider=PROVIDERS["pentatonic-gateway"],
-        max_batch=5,
-    )
-    out = client.embed_batch([f"t{i}" for i in range(12)])
-    # 12 texts → chunks of [5, 5, 2] → 3 calls
-    assert len(stub.calls) == 3
-    assert [len(c["json"]["input"]) for c in stub.calls] == [5, 5, 2]
-    # Stub returns one vector per input. Each vector encodes its
-    # cross-chunk input index → assert order preserved.
-    assert len(out) == 12
-    assert out == [[float(i)] for i in range(12)]
-def test_chunking_disabled_with_max_batch_zero(monkeypatch):
-    """max_batch=0 means no chunking — old behaviour (one big call)."""
-    stub = _PentatonicEchoStub()
-    monkeypatch.setattr(httpx, "post", stub)
-    client = EmbedClient(
-        url="https://lambda-gateway.pentatonic.com/v1/embed",
-        api_key="k", model="m",
-        provider=PROVIDERS["pentatonic-gateway"],
-        max_batch=0,
-    )
-    client.embed_batch([f"t{i}" for i in range(20)])
-    assert len(stub.calls) == 1
-    assert len(stub.calls[0]["json"]["input"]) == 20
-def test_chunking_propagates_first_error(recorder):
-    """If a chunk fails (e.g., gateway 502), the whole call raises with
-    the first error — matching the un-chunked semantics. We don't return
-    a partial vector list because the caller's downstream `for r, emb, txt
-    in zip(...)` loop would silently drop the failed records."""
-    # Pentatonic gateway 502 on every call (simulates the real bug)
-    recorder.respond(
-        "https://lambda-gateway.pentatonic.com/v1/embed",
-        _FakeResponse(502, "<html>...bad gateway...</html>"),
-    )
-    client = EmbedClient(
-        url="https://lambda-gateway.pentatonic.com/v1/embed",
-        api_key="k", model="m",
-        provider=PROVIDERS["pentatonic-gateway"],
-        max_batch=5,
-    )
-    with pytest.raises(EmbedHTTPError) as exc:
-        client.embed_batch([f"t{i}" for i in range(8)])
-    assert exc.value.status == 502
-class _OpenAIEchoStub:
-    """OpenAI-shaped stub: returns one embedding per input as
-    {data: [{embedding: [...]}]}."""
-    def __init__(self):
-        self.calls: list[dict] = []
-    def __call__(self, url, *, json, headers, timeout):
-        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
-        n = len(json.get("input") or [])
-        return _FakeResponse(200, {"data": [{"embedding": [0.0]} for _ in range(n)]})
-def test_from_env_reads_max_batch(monkeypatch):
-    """{prefix}EMBED_MAX_BATCH overrides the default of 5."""
-    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
-    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
-    monkeypatch.setenv("L4_EMBED_MAX_BATCH", "3")
-    stub = _OpenAIEchoStub()
-    monkeypatch.setattr(httpx, "post", stub)
-    client = EmbedClient.from_env(prefix="L4_")
-    client.embed_batch([f"t{i}" for i in range(7)])
-    # 7 with chunk=3 → [3, 3, 1] → 3 calls
-    assert len(stub.calls) == 3
-    assert [len(c["json"]["input"]) for c in stub.calls] == [3, 3, 1]
-def test_from_env_default_max_batch_is_five(monkeypatch):
-    """Default max_batch=5 matches the observed Pentatonic Gateway cap."""
-    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
-    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
-    stub = _OpenAIEchoStub()
-    monkeypatch.setattr(httpx, "post", stub)
-    client = EmbedClient.from_env(prefix="L4_")
-    client.embed_batch([f"t{i}" for i in range(10)])
-    # 10 with default chunk=5 → [5, 5] → 2 calls
-    assert len(stub.calls) == 2
-# ----------------------------------------------------------------------
-# Retry-with-jitter on transient gateway saturation (502/503/504/429)
-# ----------------------------------------------------------------------
-#
-# These tests exercise the retry path added 2026-05-15. Motivation:
-# the Pentatonic AI Gateway has a K≈10 concurrency cap and 502s under
-# saturation; without retry, a single 502 cascades through the engine's
-# per-layer fallback path and amplifies load instead of damping it.
-# See the prod incident note on EmbedClient.__init__ for context.
-class _SequencedRecorder:
-    """Returns a different response on each successive call.
-    The default `_Recorder` returns the same response every time, which
-    is wrong for retry tests — we need to verify "first call 502, then
-    succeed on retry". This recorder pops responses off a queue per
-    URL and falls back to the last response if the queue is empty
-    (matching the "persistent failure" test case naturally).
-    """
-    def __init__(self):
-        self.calls: list[dict] = []
-        self.queues: dict[str, list[_FakeResponse]] = {}
-    def queue(self, url: str, responses: list[_FakeResponse]) -> None:
-        self.queues[url] = list(responses)
-    def __call__(self, url, *, json, headers, timeout):
-        self.calls.append({"url": url, "json": json})
-        q = self.queues.get(url, [])
-        if not q:
-            return _FakeResponse(401, "no responses queued")
-        # Pop unless this is the last one — keep returning the tail so
-        # "all attempts fail" tests don't need to queue N copies.
-        return q.pop(0) if len(q) > 1 else q[0]
-@pytest.fixture
-def sequenced(monkeypatch):
-    rec = _SequencedRecorder()
-    monkeypatch.setattr(httpx, "post", rec)
-    # Avoid the test taking real wall time on backoff sleeps — patch
-    # time.sleep to no-op. The jitter calculation still runs, just
-    # without the actual delay.
-    import time as _time
-    monkeypatch.setattr(_time, "sleep", lambda _s: None)
-    return rec
-def test_retries_on_502_and_succeeds(sequenced):
-    sequenced.queue(
-        "https://gw/v1/embeddings",
-        [
-            _FakeResponse(502, "bad gateway"),
-            _FakeResponse(200, {"data": [{"embedding": [0.1, 0.2]}]}),
-        ],
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-        max_retries=3,
-    )
-    out = client.embed_batch(["hello"])
-    assert out == [[0.1, 0.2]]
-    # First call 502, second call 200 — exactly two attempts.
-    assert len(sequenced.calls) == 2
-def test_retries_on_503_504_429(sequenced):
-    """Each transient code triggers the retry path the same way."""
-    for code in (503, 504, 429):
-        sequenced.calls.clear()
-        sequenced.queue(
-            "https://gw/v1/embeddings",
-            [
-                _FakeResponse(code, "transient"),
-                _FakeResponse(200, {"data": [{"embedding": [0.0]}]}),
-            ],
-        )
-        client = EmbedClient(
-            url="https://gw/v1/embeddings",
-            api_key="k",
-            model="m",
-            provider=PROVIDERS["openai"],
-            max_retries=3,
-        )
-        out = client.embed_batch(["x"])
-        assert out == [[0.0]], f"retry failed for status {code}"
-        assert len(sequenced.calls) == 2, f"wrong call count for status {code}"
-def test_does_not_retry_on_500(sequenced):
-    """500 is server-side bug, not transient saturation — fail fast."""
-    sequenced.queue(
-        "https://gw/v1/embeddings",
-        [_FakeResponse(500, "internal server error")],
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-        max_retries=3,
-    )
-    with pytest.raises(EmbedHTTPError) as exc:
-        client.embed_batch(["x"])
-    assert exc.value.status == 500
-    # Exactly one attempt — no retry on 500.
-    assert len(sequenced.calls) == 1
-def test_does_not_retry_on_400(sequenced):
-    """4xx (other than 401-autodetect / 429) indicates caller error."""
-    sequenced.queue(
-        "https://gw/v1/embeddings",
-        [_FakeResponse(400, "bad request")],
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-        max_retries=3,
-    )
-    with pytest.raises(EmbedHTTPError) as exc:
-        client.embed_batch(["x"])
-    assert exc.value.status == 400
-    assert len(sequenced.calls) == 1
-def test_max_retries_exhausted_raises(sequenced):
-    """Persistent 502 raises after max_retries+1 attempts."""
-    sequenced.queue(
-        "https://gw/v1/embeddings",
-        [_FakeResponse(502, "still down")],
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-        max_retries=3,
-    )
-    with pytest.raises(EmbedHTTPError) as exc:
-        client.embed_batch(["x"])
-    assert exc.value.status == 502
-    # max_retries=3 → 1 original + 3 retries = 4 calls total.
-    assert len(sequenced.calls) == 4
-def test_max_retries_zero_disables_retry(sequenced):
-    """Explicit opt-out preserves pre-fix behaviour for callers that
-    handle their own retry."""
-    sequenced.queue(
-        "https://gw/v1/embeddings",
-        [_FakeResponse(502, "down")],
-    )
-    client = EmbedClient(
-        url="https://gw/v1/embeddings",
-        api_key="k",
-        model="m",
-        provider=PROVIDERS["openai"],
-        max_retries=0,
-    )
-    with pytest.raises(EmbedHTTPError):
-        client.embed_batch(["x"])
-    assert len(sequenced.calls) == 1
-def test_from_env_reads_retry_config(monkeypatch):
-    """{prefix}EMBED_MAX_RETRIES + EMBED_RETRY_BASE_DELAY +
-    EMBED_RETRY_MAX_DELAY override the defaults."""
-    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
-    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
-    monkeypatch.setenv("L4_EMBED_MAX_RETRIES", "5")
-    monkeypatch.setenv("L4_EMBED_RETRY_BASE_DELAY", "0.25")
-    monkeypatch.setenv("L4_EMBED_RETRY_MAX_DELAY", "2.5")
-    client = EmbedClient.from_env(prefix="L4_")
-    assert client._max_retries == 5
-    assert client._retry_base_delay == 0.25
-    assert client._retry_max_delay == 2.5
-def test_from_env_default_retry_config(monkeypatch):
-    """Defaults: 3 retries, 100ms base, 1s cap — tuned for K≈10
-    gateway under burst load."""
-    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
-    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
-    client = EmbedClient.from_env(prefix="L4_")
-    assert client._max_retries == 3
-    assert client._retry_base_delay == 0.1
-    assert client._retry_max_delay == 1.0