@pentatonic-ai/ai-agent-sdk 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 
 // src/telemetry.js
-var VERSION = "0.9.0";
+var VERSION = "0.9.1";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 
 // src/telemetry.js
-var VERSION = "0.9.0";
+var VERSION = "0.9.1";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.9.0",
+  "version": "0.9.1",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -211,6 +211,7 @@ class EmbedClient:
         autodetect: bool = True,
         timeout: float = 120.0,
         env_prefix: str = "",
+        max_batch: int = 5,
     ) -> None:
         self._configured_provider = provider
         self._provider = provider
@@ -222,6 +223,12 @@ class EmbedClient:
         self._autodetect = autodetect
         self._env_prefix = env_prefix
         self._detected = False
+        # 0 = unlimited (no chunking). Positive = max texts per upstream call;
+        # larger inputs are split into multiple calls (concurrent in the async
+        # path) and the results concatenated. Defaults to 5 because that's the
+        # per-call cap observed on the Pentatonic AI Gateway — above which it
+        # 502s and the caller silently loses vector writes (see the
+        # test_chunking_* tests).
+        self._max_batch = max(0, max_batch)
 
     # ------------------------------------------------------------------
     # Construction
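
For reference, the splitting this comment describes is plain stride arithmetic over the input list. A minimal standalone sketch of the same boundary math (the helper name `split_into_chunks` is illustrative, not part of the SDK):

    def split_into_chunks(texts: list[str], max_batch: int) -> list[list[str]]:
        # max_batch=0 disables chunking: the whole list goes out as one call.
        if max_batch == 0 or len(texts) <= max_batch:
            return [texts] if texts else []
        return [texts[i:i + max_batch] for i in range(0, len(texts), max_batch)]

    # 12 texts with max_batch=5 -> chunk sizes [5, 5, 2]; empty input -> no calls.
    assert [len(c) for c in split_into_chunks([f"t{i}" for i in range(12)], 5)] == [5, 5, 2]
    assert split_into_chunks([], 5) == []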
@@ -248,6 +255,7 @@ class EmbedClient:
         {prefix}EMBED_PROVIDER default 'openai'
         {prefix}EMBED_AUTODETECT default 'true'
         {prefix}EMBED_TIMEOUT default '120'
+        {prefix}EMBED_MAX_BATCH default '5' (gateway-safe; '0' disables chunking)
         """
         url_var = url_var or f"{prefix}NV_EMBED_URL"
         key_var = key_var or f"{prefix}EMBED_API_KEY"
@@ -259,6 +267,7 @@ class EmbedClient:
         provider_name = os.environ.get(f"{prefix}EMBED_PROVIDER", "openai")
         autodetect = os.environ.get(f"{prefix}EMBED_AUTODETECT", "true").lower() == "true"
         timeout = float(os.environ.get(f"{prefix}EMBED_TIMEOUT", "120"))
+        max_batch = int(os.environ.get(f"{prefix}EMBED_MAX_BATCH", "5"))
 
         provider = resolve_provider(provider_name, env_prefix=prefix)
         return cls(
@@ -269,6 +278,7 @@ class EmbedClient:
             autodetect=autodetect,
             timeout=timeout,
             env_prefix=prefix,
+            max_batch=max_batch,
         )
 
     # ------------------------------------------------------------------
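
Putting the from_env pieces together, configuring the cap via the environment looks roughly like this sketch (the module path in the import is hypothetical; the `L4_` prefix mirrors the tests below; running it would post to the real gateway):

    import os
    from embed_client import EmbedClient  # hypothetical module path, for illustration

    # The variables the new code reads for connection and chunking:
    os.environ["L4_NV_EMBED_URL"] = "https://lambda-gateway.pentatonic.com/v1/embed"
    os.environ["L4_EMBED_API_KEY"] = "sk-example"
    os.environ["L4_EMBED_MAX_BATCH"] = "3"  # default is "5"; "0" disables chunking

    client = EmbedClient.from_env(prefix="L4_")
    # 7 texts with max_batch=3 go out as three posts of sizes [3, 3, 1].
    vectors = client.embed_batch([f"doc {i}" for i in range(7)])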
@@ -307,10 +317,21 @@ class EmbedClient:
     # ------------------------------------------------------------------
 
     def embed_batch(self, texts: list[str]) -> list[list[float]]:
-        """Embed a list of texts. Empty list returns empty list."""
+        """Embed a list of texts. Empty list returns empty list.
+
+        Splits into chunks of `max_batch` (default 5) and posts each
+        sequentially when the input exceeds the limit. Results are
+        concatenated in input order. `max_batch=0` disables chunking.
+        """
         if not texts:
             return []
-        return self._post_with_autodetect(texts, async_mode=False)
+        if self._max_batch == 0 or len(texts) <= self._max_batch:
+            return self._post_with_autodetect(texts, async_mode=False)
+        out: list[list[float]] = []
+        for start in range(0, len(texts), self._max_batch):
+            chunk = texts[start:start + self._max_batch]
+            out.extend(self._post_with_autodetect(chunk, async_mode=False))
+        return out
 
     def embed_one(self, text: str) -> list[float]:
         return self.embed_batch([text])[0]
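
As a usage sketch of the sequential path (the constructor call mirrors the tests below, assuming the same `EmbedClient` and `PROVIDERS` surface; running it posts to the real gateway):

    client = EmbedClient(
        url="https://lambda-gateway.pentatonic.com/v1/embed",
        api_key="k", model="m",
        provider=PROVIDERS["pentatonic-gateway"],
        max_batch=5,
    )
    # 12 texts -> three sequential posts of sizes [5, 5, 2]; the return value
    # is the concatenation, so callers see one flat list of 12 vectors.
    vectors = client.embed_batch([f"t{i}" for i in range(12)])
    assert len(vectors) == 12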
@@ -320,9 +341,25 @@ class EmbedClient:
     # ------------------------------------------------------------------
 
     async def embed_batch_async(self, texts: list[str]) -> list[list[float]]:
+        """Async embed. Chunks are fired concurrently via asyncio.gather
+        when the input exceeds `max_batch`; raises the first error if any
+        chunk fails (matching the un-chunked semantics)."""
         if not texts:
             return []
-        return await self._post_with_autodetect_async(texts)
+        if self._max_batch == 0 or len(texts) <= self._max_batch:
+            return await self._post_with_autodetect_async(texts)
+        import asyncio
+        chunks = [
+            texts[start:start + self._max_batch]
+            for start in range(0, len(texts), self._max_batch)
+        ]
+        results = await asyncio.gather(
+            *(self._post_with_autodetect_async(chunk) for chunk in chunks)
+        )
+        out: list[list[float]] = []
+        for r in results:
+            out.extend(r)
+        return out
 
     async def embed_one_async(self, text: str) -> list[float]:
         out = await self.embed_batch_async([text])
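
And a sketch of the async fan-out (construction via from_env assumes the env vars from the earlier hunk are set; the chunk sizes assume the default max_batch=5):

    import asyncio

    async def main() -> None:
        client = EmbedClient.from_env(prefix="L4_")
        texts = [f"t{i}" for i in range(12)]
        # With max_batch=5, the 12 texts fan out as three concurrent posts
        # ([5, 5, 2]) via asyncio.gather; the awaited result is one flat,
        # order-preserving list of 12 vectors. If any chunk fails, gather
        # propagates the first exception and no partial list is returned.
        vectors = await client.embed_batch_async(texts)
        assert len(vectors) == 12

    asyncio.run(main())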
@@ -352,3 +352,141 @@ def test_url_without_path_gets_provider_default(recorder):
     )
     client.embed_batch(["x"])
     assert recorder.calls[0]["url"] == "https://lambda-gateway.pentatonic.com/v1/embed"
+
+
+# ----------------------------------------------------------------------
+# Chunking — work around the Pentatonic AI Gateway's per-call cap of 5
+# texts. Above the cap the gateway 502s; without chunking the layer's
+# /index-batch handler raises, the compat shim swallows it, and vector
+# writes silently drop. Chunking splits the request into chunks of
+# `max_batch` so each call stays within the gateway's limit.
+# ----------------------------------------------------------------------
+
+
+class _PentatonicEchoStub:
+    """httpx.post replacement that returns one embedding per input text,
+    matching real gateway behaviour. Each response embedding encodes the
+    input index so tests can assert order preservation across chunks."""
+
+    def __init__(self):
+        self.calls: list[dict] = []
+        self._offset = 0  # running input-index counter across calls
+
+    def __call__(self, url, *, json, headers, timeout):
+        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
+        n = len(json.get("input") or [])
+        embs = [[float(self._offset + i)] for i in range(n)]
+        self._offset += n
+        return _FakeResponse(200, {"embeddings": embs})
+
+
+def test_chunking_below_max_batch_makes_one_call(monkeypatch):
+    """N <= max_batch sends one request, no chunking overhead."""
+    stub = _PentatonicEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=5,
+    )
+    out = client.embed_batch([f"t{i}" for i in range(5)])
+    assert len(out) == 5
+    assert len(stub.calls) == 1
+    assert len(stub.calls[0]["json"]["input"]) == 5
+
+
+def test_chunking_above_max_batch_splits_into_calls(monkeypatch):
+    """N > max_batch is split into ceil(N / max_batch) posts; results are
+    concatenated in input order so the caller can't tell."""
+    stub = _PentatonicEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=5,
+    )
+    out = client.embed_batch([f"t{i}" for i in range(12)])
+    # 12 texts → chunks of [5, 5, 2] → 3 calls
+    assert len(stub.calls) == 3
+    assert [len(c["json"]["input"]) for c in stub.calls] == [5, 5, 2]
+    # Stub returns one vector per input. Each vector encodes its
+    # cross-chunk input index → assert order preserved.
+    assert len(out) == 12
+    assert out == [[float(i)] for i in range(12)]
+
+
+def test_chunking_disabled_with_max_batch_zero(monkeypatch):
+    """max_batch=0 means no chunking — old behaviour (one big call)."""
+    stub = _PentatonicEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=0,
+    )
+    client.embed_batch([f"t{i}" for i in range(20)])
+    assert len(stub.calls) == 1
+    assert len(stub.calls[0]["json"]["input"]) == 20
+
+
+def test_chunking_propagates_first_error(recorder):
+    """If a chunk fails (e.g., gateway 502), the whole call raises with
+    the first error — matching the un-chunked semantics. We don't return
+    a partial vector list because the caller's downstream `for r, emb, txt
+    in zip(...)` loop would silently drop the failed records."""
+    # Pentatonic gateway 502s on every call (simulates the real bug)
+    recorder.respond(
+        "https://lambda-gateway.pentatonic.com/v1/embed",
+        _FakeResponse(502, "<html>...bad gateway...</html>"),
+    )
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=5,
+    )
+    with pytest.raises(EmbedHTTPError) as exc:
+        client.embed_batch([f"t{i}" for i in range(8)])
+    assert exc.value.status == 502
+
+
+class _OpenAIEchoStub:
+    """OpenAI-shaped stub: returns one embedding per input as
+    {data: [{embedding: [...]}]}."""
+
+    def __init__(self):
+        self.calls: list[dict] = []
+
+    def __call__(self, url, *, json, headers, timeout):
+        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
+        n = len(json.get("input") or [])
+        return _FakeResponse(200, {"data": [{"embedding": [0.0]} for _ in range(n)]})
+
+
+def test_from_env_reads_max_batch(monkeypatch):
+    """{prefix}EMBED_MAX_BATCH overrides the default of 5."""
+    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
+    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
+    monkeypatch.setenv("L4_EMBED_MAX_BATCH", "3")
+    stub = _OpenAIEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient.from_env(prefix="L4_")
+    client.embed_batch([f"t{i}" for i in range(7)])
+    # 7 with chunk=3 → [3, 3, 1] → 3 calls
+    assert len(stub.calls) == 3
+    assert [len(c["json"]["input"]) for c in stub.calls] == [3, 3, 1]
+
+
+def test_from_env_default_max_batch_is_five(monkeypatch):
+    """Default max_batch=5 matches the observed Pentatonic Gateway cap."""
+    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
+    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
+    stub = _OpenAIEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient.from_env(prefix="L4_")
+    client.embed_batch([f"t{i}" for i in range(10)])
+    # 10 with default chunk=5 → [5, 5] → 2 calls
+    assert len(stub.calls) == 2