npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.4 → 0.9.6 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.4 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/index.cjs +39 -72
package/dist/index.js +36 -69
package/package.json +9 -2
package/packages/memory/package-lock.json +49 -33
package/packages/memory/package.json +4 -1
package/packages/memory/src/__tests__/engine.test.js +40 -5
package/packages/memory/src/engine.js +38 -3
package/packages/memory-engine/docker-compose.yml +24 -2
package/packages/memory-engine/engine/services/_shared/embed_provider.py +125 -31
package/packages/memory-engine/engine/services/l2/Dockerfile +7 -0
package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +233 -60
package/packages/memory-engine/tests/test_embed_provider.py +201 -0
package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +280 -0

package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py CHANGED Viewed

@@ -18,6 +18,7 @@ import json
 import logging
 import os
 import sqlite3
+import struct
 import sys
 import time
 from contextlib import asynccontextmanager
@@ -34,6 +35,11 @@ from neo4j.time import DateTime as Neo4jDateTime, Date as Neo4jDate
 from pydantic import BaseModel
 import uvicorn
+# sqlite-vec is treated as optional at import time: when the package is not
+# installed, the module-level name is bound to None and code that needs the
+# extension checks for that sentinel before using it.
+try:
+    import sqlite_vec  # 0.1.9 — native KNN MATCH over packed-f32 vec0 tables
+except ImportError:
+    sqlite_vec = None  # Caller logs loudly if helpers can't load the extension
 # Shared embed client lives at engine/services/_shared/.
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 from _shared.embed_provider import EmbedClient  # noqa: E402
@@ -101,6 +107,59 @@ QMD_DB_PATH = _resolve_qmd_db()
 OLLAMA_URL = os.environ.get("PME_OLLAMA_URL", "http://localhost:11434/api/embeddings")
 EMBEDDING_MODEL = os.environ.get("PME_EMBED_MODEL", "nomic-embed-text")
+# Embedding dimension for the vec0 virtual table. Production gateway
+# (lambda-gateway.pentatonic.com/v1/embed via pentatonic-gateway provider)
+# returns NV-Embed-v2 4096-dim vectors. The vec0 schema requires the dim
+# at DDL time and writers must match — keep this in lockstep with the
+# gateway / EmbedClient config.
+# Overridable through the PME_EMBED_DIM environment variable; a value that
+# is not a valid integer raises ValueError when this module is imported.
+EMBED_DIM = int(os.environ.get("PME_EMBED_DIM", "4096"))
+def _open_qmd_conn() -> sqlite3.Connection:
+    """Open qmd.sqlite and try to load the sqlite-vec extension into it.
+    Returns:
+        A sqlite3 connection to ``QMD_DB_PATH``. The connection is returned
+        even when the extension is unavailable or fails to load; in that
+        case an error is logged and MATCH-form queries against vec0 tables
+        are expected to fail at execute time (loud error rather than silent
+        degradation). Callers that only read scalar columns such as
+        chunks.path / chunks.text do not need the extension.
+    ``check_same_thread=False`` is intentional: the async backfill runs its
+    SQLite calls through ``asyncio.to_thread``, so the same connection is
+    used from different worker threads over its lifetime. sqlite3's default
+    same-thread check would reject that reuse.
+    """
+    conn = sqlite3.connect(QMD_DB_PATH, timeout=10, check_same_thread=False)
+    if sqlite_vec is None:
+        log.error("sqlite_vec module not importable — qmd vec_index unavailable")
+        return conn
+    try:
+        conn.enable_load_extension(True)
+        try:
+            sqlite_vec.load(conn)
+        finally:
+            # Switch extension loading back off even when load() raises, so
+            # a failed load never hands callers a connection that can still
+            # load arbitrary extensions.
+            conn.enable_load_extension(False)
+    except Exception as e:
+        # Deliberately best-effort: the caller still gets a usable plain
+        # connection, and the failure is surfaced through the log.
+        log.error(f"sqlite-vec load failed: {e} — qmd search will be degraded")
+    return conn
+def _ensure_vec_index(conn: sqlite3.Connection) -> None:
+    """Make sure the ``vec_index`` vec0 virtual table exists on *conn*.
+    Uses ``CREATE VIRTUAL TABLE IF NOT EXISTS``, so calling it repeatedly is
+    harmless. The embedding column is declared with ``EMBED_DIM`` floats and
+    ``distance_metric=cosine`` (the original author's note states that
+    sqlite-vec would otherwise default to L2 and that cosine distance comes
+    back as ``1 - cos_sim``). ``id`` is an INTEGER PRIMARY KEY so rows can
+    be joined back to ``chunks`` by id.
+    """
+    ddl = f"""
+        CREATE VIRTUAL TABLE IF NOT EXISTS vec_index USING vec0(
+            id INTEGER PRIMARY KEY,
+            embedding float[{EMBED_DIM}] distance_metric=cosine
+        )
+        """
+    conn.execute(ddl)
 # NV-Embed-v2 service (primary, 4096-dim). URL/auth/path/body/response are
 # managed by the shared EmbedClient; PME_EMBED_PROVIDER (default openai)
 # selects auth scheme (Bearer vs X-API-Key) and request shape.
@@ -177,13 +236,25 @@ def get_http_client() -> httpx.AsyncClient:
 async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
     """Open the neo4j driver + HTTP client at process startup, close on
     shutdown. Without this, the first request pays driver-open latency
-    and the driver is never properly closed on SIGTERM (leaking conns)."""
+    and the driver is never properly closed on SIGTERM (leaking conns).
+    Also schedules the vec_index backfill as a background task so the
+    proxy can start serving immediately while older chunks copy across
+    into the KNN index — first-time migration of ~450k rows takes
+    minutes and would otherwise block /health.
+    """
     global _neo4j_driver, _http_client
     _neo4j_driver = AsyncGraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
     _http_client = httpx.AsyncClient(timeout=30.0)
+    backfill_task = asyncio.create_task(_backfill_vec_index())
     try:
         yield
     finally:
+        backfill_task.cancel()
+        try:
+            await backfill_task
+        except (asyncio.CancelledError, Exception):
+            pass
         if _neo4j_driver is not None:
             await _neo4j_driver.close()
             _neo4j_driver = None
@@ -192,6 +263,82 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
             _http_client = None
+async def _backfill_vec_index() -> None:
+    """One-time migration: copy existing chunks.embedding (JSON) into
+    vec_index (f32 bytes).
+    Does nothing when sqlite_vec is not importable, when qmd.sqlite does not
+    exist yet, or when vec_index already holds at least as many rows as
+    chunks has non-NULL embeddings. Otherwise it walks the chunks that have
+    no vec_index row and inserts them in batches of 500, one transaction
+    per batch.
+    Every SQLite call (open, DDL, counts, SELECT, fetch, insert) is pushed
+    to a worker thread with ``asyncio.to_thread`` so the event loop stays
+    free to serve requests while the copy runs. Until the copy finishes,
+    only chunks already mirrored into vec_index are visible to vec0 search.
+    Rows whose embedding cannot be parsed as JSON, is not a sequence of
+    EMBED_DIM numbers, or cannot be packed as f32 are skipped instead of
+    aborting the migration.
+    Raises:
+        asyncio.CancelledError: re-raised after logging when the task is
+            cancelled. Any other exception is logged and swallowed.
+    """
+    if sqlite_vec is None:
+        log.error("sqlite_vec module missing — backfill skipped, search will be degraded")
+        return
+    if not os.path.exists(QMD_DB_PATH):
+        log.info("vec_index backfill skipped — qmd.sqlite does not exist yet")
+        return
+    BATCH = 500
+    pending_sql = """
+            SELECT c.id, c.embedding
+            FROM chunks c
+            LEFT JOIN vec_index v ON v.id = c.id
+            WHERE v.id IS NULL AND c.embedding IS NOT NULL
+            """
+    def _count_rows(db: sqlite3.Connection) -> tuple:
+        # Both counts run in one worker-thread hop so neither full-table
+        # count blocks the event loop.
+        chunks_n = db.execute(
+            "SELECT count(*) FROM chunks WHERE embedding IS NOT NULL"
+        ).fetchone()[0]
+        vec_n = db.execute("SELECT count(*) FROM vec_index").fetchone()[0]
+        return chunks_n, vec_n
+    def _insert_batch(db: sqlite3.Connection, rows: list) -> int:
+        # One transaction per batch: `with db` commits on success and rolls
+        # back if an INSERT raises.
+        inserted = 0
+        with db:
+            for cid, emb_json in rows:
+                try:
+                    vec = json.loads(emb_json)
+                    if len(vec) != EMBED_DIM:
+                        continue
+                    packed = struct.pack(f"{len(vec)}f", *vec)
+                except Exception:
+                    # Best-effort by design: malformed JSON, a JSON value
+                    # that is not a list (e.g. null), or non-numeric items
+                    # must not abort the whole migration.
+                    continue
+                db.execute(
+                    "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+                    (cid, packed),
+                )
+                inserted += 1
+        return inserted
+    conn = None
+    try:
+        conn = await asyncio.to_thread(_open_qmd_conn)
+        await asyncio.to_thread(_ensure_vec_index, conn)
+        chunks_n, vec_n = await asyncio.to_thread(_count_rows, conn)
+        if vec_n >= chunks_n:
+            log.info(f"vec_index backfill skipped — already in sync ({vec_n}/{chunks_n})")
+            return
+        missing = chunks_n - vec_n
+        log.info(f"vec_index backfill starting — {missing} rows to copy")
+        cursor = await asyncio.to_thread(conn.execute, pending_sql)
+        copied = 0
+        while True:
+            batch = await asyncio.to_thread(cursor.fetchmany, BATCH)
+            if not batch:
+                break
+            copied += await asyncio.to_thread(_insert_batch, conn, batch)
+            log.info(f"vec_index backfill progress: {copied}/{missing}")
+            # Cooperative yield between batches. The inserts themselves
+            # already run off-loop; sleep(0) only lets other ready tasks
+            # run before the next fetch is scheduled.
+            await asyncio.sleep(0)
+        log.info(f"vec_index backfill done — {copied} rows copied")
+    except asyncio.CancelledError:
+        # A worker thread may still be inside a SQLite call on this
+        # connection, so it is not closed here; drop the reference and let
+        # it go away with the process.
+        conn = None
+        log.info("vec_index backfill cancelled during shutdown")
+        raise
+    except Exception as e:
+        log.error(f"vec_index backfill failed: {e}")
+    finally:
+        # Close on every non-cancelled exit, including the failure path.
+        if conn is not None:
+            try:
+                conn.close()
+            except sqlite3.Error as e:
+                log.debug(f"vec_index backfill: closing qmd connection failed: {e}")
 app = FastAPI(title="Sequential HybridRAG Proxy", version="1.0.0", lifespan=lifespan)
 # ---------------------------------------------------------------------------
@@ -613,7 +760,15 @@ def cross_encoder_rerank(query: str, results: List[Dict], top_k: int = 16) -> Li
     return scored[:top_k] + remaining
 def search_qmd_informed(query: str, graph_context: Dict, limit: int = 12) -> List[Dict]:
-    """Phase 2: QMD vector search informed by graph results."""
+    """Phase 2: QMD vector search via sqlite-vec MATCH.
+    Replaces the legacy Python cosine loop over JSON-serialised embeddings
+    (which also had an `ORDER BY id LIMIT 2000` bug — only the OLDEST
+    2000 rows were ever considered, so 99%+ of the corpus was invisible to
+    search at production scale). Now: native KNN over the vec0 index,
+    full-corpus top-k. Wall time at 450k rows: ~50ms native MATCH vs
+    ~15s timeout previously.
+    """
     if not os.path.exists(QMD_DB_PATH):
         return []
@@ -621,69 +776,64 @@ def search_qmd_informed(query: str, graph_context: Dict, limit: int = 12) -> Lis
     if not query_embedding:
         return []
-    # Enhance query with graph entities for better vector search
     enhanced_query = query
     if graph_context["graph_entities"]:
         enhanced_query += " " + " ".join(graph_context["graph_entities"][:3])
-    enhanced_embedding = get_embedding(enhanced_query)
-    if not enhanced_embedding:
-        enhanced_embedding = query_embedding
+    enhanced_embedding = get_embedding(enhanced_query) or query_embedding
+    if len(enhanced_embedding) != EMBED_DIM:
+        # Dim mismatch vs vec0 DDL — the MATCH would error inside sqlite-vec.
+        # Bail with a loud log; an embedding-model mismatch in prod is the
+        # likely root cause and silent degradation would hide it.
+        log.error(
+            f"QMD search: query dim {len(enhanced_embedding)} != vec_index dim "
+            f"{EMBED_DIM} — embedding model mismatch?"
+        )
+        return []
+    qbytes = struct.pack(f"{len(enhanced_embedding)}f", *enhanced_embedding)
     try:
-        conn = sqlite3.connect(QMD_DB_PATH, timeout=5)
-        conn.row_factory = sqlite3.Row
-        # Get vectors and compute similarity
-        rows = conn.execute("""
-            SELECT id, path, text, embedding
-            FROM chunks
-            WHERE embedding IS NOT NULL
-            ORDER BY id
-            LIMIT 2000
-        """).fetchall()
+        conn = _open_qmd_conn()
+        # Pull a candidate pool larger than `limit` so entity-boost
+        # re-ranking has material to work with — 4× limit, floor 50.
+        k_pool = max(limit * 4, 50)
+        rows = conn.execute(
+            """
+            SELECT c.id, c.path, c.text, v.distance
+            FROM vec_index v
+            JOIN chunks c ON c.id = v.id
+            WHERE v.embedding MATCH ? AND k = ?
+            ORDER BY v.distance
+            """,
+            (qbytes, k_pool),
+        ).fetchall()
+        conn.close()
         results = []
-        for row in rows:
-            try:
-                # Deserialize embedding
-                embedding_data = row["embedding"]
-                if isinstance(embedding_data, str):
-                    embedding = json.loads(embedding_data)
-                else:
-                    embedding = list(embedding_data)
-                # Cosine similarity with enhanced query
-                dot = sum(a * b for a, b in zip(enhanced_embedding, embedding))
-                norm_q = sum(x * x for x in enhanced_embedding) ** 0.5
-                norm_e = sum(x * x for x in embedding) ** 0.5
-                if norm_q > 0 and norm_e > 0:
-                    similarity = dot / (norm_q * norm_e)
-                    # Boost score if path contains graph entities
-                    entity_boost = 0
-                    path_lower = row["path"].lower()
-                    for entity in graph_context["graph_entities"]:
-                        if entity.lower() in path_lower or entity.lower() in row["text"].lower():
-                            entity_boost = GRAPH_PRIORITY_BOOST
-                            break
-                    final_score = (similarity * VECTOR_BASE_WEIGHT) + entity_boost
-                    if similarity > 0.2:  # Threshold for inclusion
-                        results.append({
-                            "path": row["path"],
-                            "text": row["text"][:600],
-                            "score": final_score,
-                            "source": "vector",
-                            "base_similarity": similarity,
-                            "entity_boost": entity_boost
-                        })
-            except Exception as e:
-                logging.debug(f"Suppressed: {e}")
-        conn.close()
+        for row_id, path, text, distance in rows:
+            # vec0 distance_metric=cosine returns `1 - cos_sim` —
+            # invert to align with the rest of the codebase's `similarity`
+            # convention (1.0 = identical, 0.0 = orthogonal).
+            similarity = 1.0 - distance
+            if similarity <= 0.2:
+                continue
+            entity_boost = 0
+            path_lower = (path or "").lower()
+            text_lower = (text or "").lower()
+            for entity in graph_context["graph_entities"]:
+                el = entity.lower()
+                if el in path_lower or el in text_lower:
+                    entity_boost = GRAPH_PRIORITY_BOOST
+                    break
+            final_score = (similarity * VECTOR_BASE_WEIGHT) + entity_boost
+            results.append({
+                "path": path,
+                "text": (text or "")[:600],
+                "score": final_score,
+                "source": "vector",
+                "base_similarity": similarity,
+                "entity_boost": entity_boost,
+            })
         results.sort(key=lambda x: x["score"], reverse=True)
         return results[:limit]
@@ -1598,7 +1748,11 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
             log.warning(f"L4 embed count mismatch: {len(embeddings)} != {len(norm)}")
         qmd_db = Path(QMD_DB_PATH)
         qmd_db.parent.mkdir(parents=True, exist_ok=True)
-        conn = sqlite3.connect(str(qmd_db), timeout=10)
+        # Open with sqlite-vec loaded so we can dual-write to vec_index
+        # below. If extension load fails, vec_index inserts silently no-op
+        # via the try/except — chunks (JSON) still gets the write so the
+        # corpus stays whole; search just degrades to the old path.
+        conn = _open_qmd_conn()
         conn.execute("PRAGMA journal_mode=WAL")
         conn.execute("""
             CREATE TABLE IF NOT EXISTS chunks (
@@ -1612,14 +1766,33 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
                 created_at TEXT
             )
         """)
+        try:
+            _ensure_vec_index(conn)
+        except Exception as e:
+            log.error(f"vec_index DDL failed: {e} — falling back to chunks-only write")
         for n, vec in zip(norm, embeddings):
             if not vec:
                 continue
-            conn.execute(
+            cur = conn.execute(
                 "INSERT INTO chunks (path, text, embedding, embedding_model, embedding_dim, chunk_index, created_at) VALUES (?, ?, ?, ?, ?, ?, ?)",
                 (f"bench/{arena}/{n['path']}.md", n["content"],
                  json.dumps(vec), "nv-embed-v2", len(vec), 0, now_iso),
             )
+            # Mirror into the vec0 KNN index so search_qmd_informed can
+            # MATCH on the f32-packed vector. Dim must match the vec0 DDL
+            # (EMBED_DIM); skip rows where the embedding shape disagrees
+            # so a single bad row doesn't poison the batch insert.
+            if cur.lastrowid is not None and len(vec) == EMBED_DIM:
+                try:
+                    conn.execute(
+                        "INSERT INTO vec_index(id, embedding) VALUES (?, ?)",
+                        (cur.lastrowid, struct.pack(f"{len(vec)}f", *vec)),
+                    )
+                except Exception as e:
+                    # vec_index dual-write is defensive — the JSON column
+                    # in chunks is still the source of truth until the
+                    # backfill task confirms vec_index is in sync.
+                    log.debug(f"vec_index insert skipped for row {cur.lastrowid}: {e}")
             l4_inserted += 1
         conn.commit()
         conn.close()

package/packages/memory-engine/tests/test_embed_provider.py CHANGED Viewed

@@ -268,6 +268,9 @@ def test_autodetect_all_fail_raises(recorder):
 # ----------------------------------------------------------------------
 def test_non_401_http_error_does_not_trigger_autodetect(recorder):
+    # max_retries=0 isolates this test to autodetect behaviour. With
+    # retries enabled (default), 503 triggers the retry path which is
+    # exercised separately in the retry tests below.
     recorder.respond(
         "https://gw/v1/embeddings",
         _FakeResponse(503, "upstream down"),
@@ -277,6 +280,7 @@ def test_non_401_http_error_does_not_trigger_autodetect(recorder):
         api_key="k",
         model="m",
         provider=PROVIDERS["openai"],
+        max_retries=0,
     )
     with pytest.raises(EmbedHTTPError) as exc:
         client.embed_batch(["x"])
@@ -490,3 +494,200 @@ def test_from_env_default_max_batch_is_five(monkeypatch):
     client.embed_batch([f"t{i}" for i in range(10)])
     # 10 with default chunk=5 → [5, 5] → 2 calls
     assert len(stub.calls) == 2
+# ----------------------------------------------------------------------
+# Retry-with-jitter on transient gateway saturation (502/503/504/429)
+# ----------------------------------------------------------------------
+#
+# These tests exercise the retry path added 2026-05-15. Motivation:
+# the Pentatonic AI Gateway has a K≈10 concurrency cap and 502s under
+# saturation; without retry, a single 502 cascades through the engine's
+# per-layer fallback path and amplifies load instead of damping it.
+# See the prod incident note on EmbedClient.__init__ for context.
+class _SequencedRecorder:
+    """Stand-in for ``httpx.post`` that replays scripted responses per URL.
+    Each call is recorded in ``calls``. Responses queued for a URL are
+    handed out in order; once only one response is left it is returned on
+    every further call, so "every attempt fails" scenarios need a single
+    queued entry. A URL with nothing queued gets a 401 response.
+    """
+    def __init__(self):
+        self.calls: list[dict] = []
+        self.queues: dict[str, list[_FakeResponse]] = {}
+    def queue(self, url: str, responses: list[_FakeResponse]) -> None:
+        # Copy so the caller's list is not consumed by later pops.
+        self.queues[url] = list(responses)
+    def __call__(self, url, *, json, headers, timeout):
+        self.calls.append({"url": url, "json": json})
+        pending = self.queues.get(url)
+        if not pending:
+            return _FakeResponse(401, "no responses queued")
+        if len(pending) == 1:
+            # Last scripted response: keep serving it without consuming.
+            return pending[0]
+        return pending.pop(0)
+@pytest.fixture
+def sequenced(monkeypatch):
+    """Install a ``_SequencedRecorder`` as ``httpx.post`` and return it.
+    ``time.sleep`` is replaced with a no-op so backoff delays do not cost
+    real wall time during the test.
+    """
+    import time as _time
+    recorder = _SequencedRecorder()
+    monkeypatch.setattr(httpx, "post", recorder)
+    monkeypatch.setattr(_time, "sleep", lambda _s: None)
+    return recorder
+def test_retries_on_502_and_succeeds(sequenced):
+    """A 502 followed by a 200 returns the embedding after one retry."""
+    endpoint = "https://gw/v1/embeddings"
+    scripted = [
+        _FakeResponse(502, "bad gateway"),
+        _FakeResponse(200, {"data": [{"embedding": [0.1, 0.2]}]}),
+    ]
+    sequenced.queue(endpoint, scripted)
+    embedder = EmbedClient(
+        url=endpoint,
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    vectors = embedder.embed_batch(["hello"])
+    assert vectors == [[0.1, 0.2]]
+    # One failed attempt plus the successful retry.
+    assert len(sequenced.calls) == 2
+def test_retries_on_503_504_429(sequenced):
+    """503, 504 and 429 each take the same retry-then-succeed path."""
+    endpoint = "https://gw/v1/embeddings"
+    for code in (503, 504, 429):
+        # Start each status code from a clean call log and a fresh script.
+        sequenced.calls.clear()
+        scripted = [
+            _FakeResponse(code, "transient"),
+            _FakeResponse(200, {"data": [{"embedding": [0.0]}]}),
+        ]
+        sequenced.queue(endpoint, scripted)
+        embedder = EmbedClient(
+            url=endpoint,
+            api_key="k",
+            model="m",
+            provider=PROVIDERS["openai"],
+            max_retries=3,
+        )
+        vectors = embedder.embed_batch(["x"])
+        assert vectors == [[0.0]], f"retry failed for status {code}"
+        assert len(sequenced.calls) == 2, f"wrong call count for status {code}"
+def test_does_not_retry_on_500(sequenced):
+    """A 500 response is raised immediately, with no retry attempt."""
+    endpoint = "https://gw/v1/embeddings"
+    sequenced.queue(endpoint, [_FakeResponse(500, "internal server error")])
+    embedder = EmbedClient(
+        url=endpoint,
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    with pytest.raises(EmbedHTTPError) as raised:
+        embedder.embed_batch(["x"])
+    assert raised.value.status == 500
+    # A single recorded call means the retry path was not taken.
+    assert len(sequenced.calls) == 1
+def test_does_not_retry_on_400(sequenced):
+    """A 400 response is raised after a single attempt."""
+    endpoint = "https://gw/v1/embeddings"
+    sequenced.queue(endpoint, [_FakeResponse(400, "bad request")])
+    embedder = EmbedClient(
+        url=endpoint,
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    with pytest.raises(EmbedHTTPError) as raised:
+        embedder.embed_batch(["x"])
+    assert raised.value.status == 400
+    assert len(sequenced.calls) == 1
+def test_max_retries_exhausted_raises(sequenced):
+    """A 502 on every attempt raises once max_retries + 1 calls are made."""
+    endpoint = "https://gw/v1/embeddings"
+    # A single queued response is replayed for every attempt.
+    sequenced.queue(endpoint, [_FakeResponse(502, "still down")])
+    embedder = EmbedClient(
+        url=endpoint,
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=3,
+    )
+    with pytest.raises(EmbedHTTPError) as raised:
+        embedder.embed_batch(["x"])
+    assert raised.value.status == 502
+    # 1 original attempt + 3 retries.
+    assert len(sequenced.calls) == 4
+def test_max_retries_zero_disables_retry(sequenced):
+    """With max_retries=0 a 502 raises after exactly one call, for callers
+    that handle their own retry."""
+    endpoint = "https://gw/v1/embeddings"
+    sequenced.queue(endpoint, [_FakeResponse(502, "down")])
+    embedder = EmbedClient(
+        url=endpoint,
+        api_key="k",
+        model="m",
+        provider=PROVIDERS["openai"],
+        max_retries=0,
+    )
+    with pytest.raises(EmbedHTTPError):
+        embedder.embed_batch(["x"])
+    assert len(sequenced.calls) == 1
+def test_from_env_reads_retry_config(monkeypatch):
+    """The prefixed EMBED_MAX_RETRIES, EMBED_RETRY_BASE_DELAY and
+    EMBED_RETRY_MAX_DELAY variables are picked up by from_env."""
+    env = {
+        "L4_NV_EMBED_URL": "https://gw/v1/embeddings",
+        "L4_EMBED_API_KEY": "k",
+        "L4_EMBED_MAX_RETRIES": "5",
+        "L4_EMBED_RETRY_BASE_DELAY": "0.25",
+        "L4_EMBED_RETRY_MAX_DELAY": "2.5",
+    }
+    for name, value in env.items():
+        monkeypatch.setenv(name, value)
+    embedder = EmbedClient.from_env(prefix="L4_")
+    assert embedder._max_retries == 5
+    assert embedder._retry_base_delay == 0.25
+    assert embedder._retry_max_delay == 2.5
+def test_from_env_default_retry_config(monkeypatch):
+    """With no retry variables set, from_env yields 3 retries, a 0.1 s base
+    delay and a 1.0 s delay cap."""
+    for name, value in (
+        ("L4_NV_EMBED_URL", "https://gw/v1/embeddings"),
+        ("L4_EMBED_API_KEY", "k"),
+    ):
+        monkeypatch.setenv(name, value)
+    embedder = EmbedClient.from_env(prefix="L4_")
+    assert embedder._max_retries == 3
+    assert embedder._retry_base_delay == 0.1
+    assert embedder._retry_max_delay == 1.0