@pentatonic-ai/ai-agent-sdk 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -906,7 +906,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 
 // src/telemetry.js
-var VERSION = "0.9.0";
+var VERSION = "0.9.1";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -875,7 +875,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 
 // src/telemetry.js
-var VERSION = "0.9.0";
+var VERSION = "0.9.1";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.9.0",
+  "version": "0.9.1",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -211,6 +211,7 @@ class EmbedClient:
         autodetect: bool = True,
         timeout: float = 120.0,
         env_prefix: str = "",
+        max_batch: int = 5,
     ) -> None:
         self._configured_provider = provider
         self._provider = provider
@@ -222,6 +223,12 @@ class EmbedClient:
         self._autodetect = autodetect
         self._env_prefix = env_prefix
         self._detected = False
+        # 0 = unlimited (no chunking). Positive = max texts per upstream call;
+        # larger inputs are split into multiple calls (concurrent in the async
+        # path) and the results concatenated. Defaults to 5 because that's the
+        # per-call cap observed on the Pentatonic AI Gateway — above which it
+        # 502s and the caller silently loses vector writes (see the
+        # test_chunking_* tests).
+        self._max_batch = max(0, max_batch)
 
     # ------------------------------------------------------------------
     # Construction
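
For reference, the splitting this comment describes is plain stride arithmetic over the input list. A minimal standalone sketch of the same boundary math (the helper name `split_into_chunks` is illustrative, not part of the SDK):

    def split_into_chunks(texts: list[str], max_batch: int) -> list[list[str]]:
        # max_batch=0 disables chunking: the whole list goes out as one call.
        if max_batch == 0 or len(texts) <= max_batch:
            return [texts] if texts else []
        return [texts[i:i + max_batch] for i in range(0, len(texts), max_batch)]

    # 12 texts with max_batch=5 -> chunk sizes [5, 5, 2]; empty input -> no calls.
    assert [len(c) for c in split_into_chunks([f"t{i}" for i in range(12)], 5)] == [5, 5, 2]
    assert split_into_chunks([], 5) == []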
@@ -248,6 +255,7 @@ class EmbedClient:
         {prefix}EMBED_PROVIDER default 'openai'
         {prefix}EMBED_AUTODETECT default 'true'
         {prefix}EMBED_TIMEOUT default '120'
+        {prefix}EMBED_MAX_BATCH default '5' (gateway-safe; '0' disables chunking)
         """
         url_var = url_var or f"{prefix}NV_EMBED_URL"
         key_var = key_var or f"{prefix}EMBED_API_KEY"
@@ -259,6 +267,7 @@ class EmbedClient:
         provider_name = os.environ.get(f"{prefix}EMBED_PROVIDER", "openai")
         autodetect = os.environ.get(f"{prefix}EMBED_AUTODETECT", "true").lower() == "true"
         timeout = float(os.environ.get(f"{prefix}EMBED_TIMEOUT", "120"))
+        max_batch = int(os.environ.get(f"{prefix}EMBED_MAX_BATCH", "5"))
 
         provider = resolve_provider(provider_name, env_prefix=prefix)
         return cls(
@@ -269,6 +278,7 @@ class EmbedClient:
             autodetect=autodetect,
             timeout=timeout,
             env_prefix=prefix,
+            max_batch=max_batch,
         )
 
     # ------------------------------------------------------------------
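
Putting the from_env pieces together, configuring the cap via the environment looks roughly like this sketch (the module path in the import is hypothetical; the `L4_` prefix mirrors the tests below; running it would post to the real gateway):

    import os
    from embed_client import EmbedClient  # hypothetical module path, for illustration

    # The variables the new code reads for connection and chunking:
    os.environ["L4_NV_EMBED_URL"] = "https://lambda-gateway.pentatonic.com/v1/embed"
    os.environ["L4_EMBED_API_KEY"] = "sk-example"
    os.environ["L4_EMBED_MAX_BATCH"] = "3"  # default is "5"; "0" disables chunking

    client = EmbedClient.from_env(prefix="L4_")
    # 7 texts with max_batch=3 go out as three posts of sizes [3, 3, 1].
    vectors = client.embed_batch([f"doc {i}" for i in range(7)])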
@@ -307,10 +317,21 @@ class EmbedClient:
     # ------------------------------------------------------------------
 
     def embed_batch(self, texts: list[str]) -> list[list[float]]:
-        """Embed a list of texts. Empty list returns empty list."""
+        """Embed a list of texts. Empty list returns empty list.
+
+        Splits into chunks of `max_batch` (default 5) and posts each
+        sequentially when the input exceeds the limit. Results are
+        concatenated in input order. `max_batch=0` disables chunking.
+        """
         if not texts:
             return []
-        return self._post_with_autodetect(texts, async_mode=False)
+        if self._max_batch == 0 or len(texts) <= self._max_batch:
+            return self._post_with_autodetect(texts, async_mode=False)
+        out: list[list[float]] = []
+        for start in range(0, len(texts), self._max_batch):
+            chunk = texts[start:start + self._max_batch]
+            out.extend(self._post_with_autodetect(chunk, async_mode=False))
+        return out
 
     def embed_one(self, text: str) -> list[float]:
         return self.embed_batch([text])[0]
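
As a usage sketch of the sequential path (the constructor call mirrors the tests below, assuming the same `EmbedClient` and `PROVIDERS` surface; running it posts to the real gateway):

    client = EmbedClient(
        url="https://lambda-gateway.pentatonic.com/v1/embed",
        api_key="k", model="m",
        provider=PROVIDERS["pentatonic-gateway"],
        max_batch=5,
    )
    # 12 texts -> three sequential posts of sizes [5, 5, 2]; the return value
    # is the concatenation, so callers see one flat list of 12 vectors.
    vectors = client.embed_batch([f"t{i}" for i in range(12)])
    assert len(vectors) == 12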
@@ -320,9 +341,25 @@ class EmbedClient:
     # ------------------------------------------------------------------
 
     async def embed_batch_async(self, texts: list[str]) -> list[list[float]]:
+        """Async embed. Chunks are fired concurrently via asyncio.gather
+        when the input exceeds `max_batch`; raises the first error if any
+        chunk fails (matching the un-chunked semantics)."""
         if not texts:
             return []
-        return await self._post_with_autodetect_async(texts)
+        if self._max_batch == 0 or len(texts) <= self._max_batch:
+            return await self._post_with_autodetect_async(texts)
+        import asyncio
+        chunks = [
+            texts[start:start + self._max_batch]
+            for start in range(0, len(texts), self._max_batch)
+        ]
+        results = await asyncio.gather(
+            *(self._post_with_autodetect_async(chunk) for chunk in chunks)
+        )
+        out: list[list[float]] = []
+        for r in results:
+            out.extend(r)
+        return out
 
     async def embed_one_async(self, text: str) -> list[float]:
         out = await self.embed_batch_async([text])
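
And a sketch of the async fan-out (construction via from_env assumes the env vars from the earlier hunk are set; the chunk sizes assume the default max_batch=5):

    import asyncio

    async def main() -> None:
        client = EmbedClient.from_env(prefix="L4_")
        texts = [f"t{i}" for i in range(12)]
        # With max_batch=5, the 12 texts fan out as three concurrent posts
        # ([5, 5, 2]) via asyncio.gather; the awaited result is one flat,
        # order-preserving list of 12 vectors. If any chunk fails, gather
        # propagates the first exception and no partial list is returned.
        vectors = await client.embed_batch_async(texts)
        assert len(vectors) == 12

    asyncio.run(main())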
@@ -352,3 +352,141 @@ def test_url_without_path_gets_provider_default(recorder):
     )
     client.embed_batch(["x"])
     assert recorder.calls[0]["url"] == "https://lambda-gateway.pentatonic.com/v1/embed"
+
+
+# ----------------------------------------------------------------------
+# Chunking — work around the Pentatonic AI Gateway's per-call cap of 5
+# texts. Above the cap the gateway 502s; without chunking the layer's
+# /index-batch handler raises, the compat shim swallows it, and vector
+# writes silently drop. Chunking splits the request into chunks of
+# `max_batch` so each call stays within the gateway's limit.
+# ----------------------------------------------------------------------
+
+
+class _PentatonicEchoStub:
+    """httpx.post replacement that returns one embedding per input text,
+    matching real gateway behaviour. Each response embedding encodes the
+    input index so tests can assert order preservation across chunks."""
+
+    def __init__(self):
+        self.calls: list[dict] = []
+        self._offset = 0  # running input-index counter across calls
+
+    def __call__(self, url, *, json, headers, timeout):
+        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
+        n = len(json.get("input") or [])
+        embs = [[float(self._offset + i)] for i in range(n)]
+        self._offset += n
+        return _FakeResponse(200, {"embeddings": embs})
+
+
+def test_chunking_below_max_batch_makes_one_call(monkeypatch):
+    """N <= max_batch sends one request, no chunking overhead."""
+    stub = _PentatonicEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=5,
+    )
+    out = client.embed_batch([f"t{i}" for i in range(5)])
+    assert len(out) == 5
+    assert len(stub.calls) == 1
+    assert len(stub.calls[0]["json"]["input"]) == 5
+
+
+def test_chunking_above_max_batch_splits_into_calls(monkeypatch):
+    """N > max_batch is split into ceil(N / max_batch) posts; results are
+    concatenated in input order so the caller can't tell."""
+    stub = _PentatonicEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=5,
+    )
+    out = client.embed_batch([f"t{i}" for i in range(12)])
+    # 12 texts → chunks of [5, 5, 2] → 3 calls
+    assert len(stub.calls) == 3
+    assert [len(c["json"]["input"]) for c in stub.calls] == [5, 5, 2]
+    # Stub returns one vector per input. Each vector encodes its
+    # cross-chunk input index → assert order preserved.
+    assert len(out) == 12
+    assert out == [[float(i)] for i in range(12)]
+
+
+def test_chunking_disabled_with_max_batch_zero(monkeypatch):
+    """max_batch=0 means no chunking — old behaviour (one big call)."""
+    stub = _PentatonicEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=0,
+    )
+    client.embed_batch([f"t{i}" for i in range(20)])
+    assert len(stub.calls) == 1
+    assert len(stub.calls[0]["json"]["input"]) == 20
+
+
+def test_chunking_propagates_first_error(recorder):
+    """If a chunk fails (e.g., gateway 502), the whole call raises with
+    the first error — matching the un-chunked semantics. We don't return
+    a partial vector list because the caller's downstream `for r, emb, txt
+    in zip(...)` loop would silently drop the failed records."""
+    # Pentatonic gateway 502s on every call (simulates the real bug)
+    recorder.respond(
+        "https://lambda-gateway.pentatonic.com/v1/embed",
+        _FakeResponse(502, "<html>...bad gateway...</html>"),
+    )
+    client = EmbedClient(
+        url="https://lambda-gateway.pentatonic.com/v1/embed",
+        api_key="k", model="m",
+        provider=PROVIDERS["pentatonic-gateway"],
+        max_batch=5,
+    )
+    with pytest.raises(EmbedHTTPError) as exc:
+        client.embed_batch([f"t{i}" for i in range(8)])
+    assert exc.value.status == 502
+
+
+class _OpenAIEchoStub:
+    """OpenAI-shaped stub: returns one embedding per input as
+    {data: [{embedding: [...]}]}."""
+
+    def __init__(self):
+        self.calls: list[dict] = []
+
+    def __call__(self, url, *, json, headers, timeout):
+        self.calls.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
+        n = len(json.get("input") or [])
+        return _FakeResponse(200, {"data": [{"embedding": [0.0]} for _ in range(n)]})
+
+
+def test_from_env_reads_max_batch(monkeypatch):
+    """{prefix}EMBED_MAX_BATCH overrides the default of 5."""
+    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
+    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
+    monkeypatch.setenv("L4_EMBED_MAX_BATCH", "3")
+    stub = _OpenAIEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient.from_env(prefix="L4_")
+    client.embed_batch([f"t{i}" for i in range(7)])
+    # 7 with chunk=3 → [3, 3, 1] → 3 calls
+    assert len(stub.calls) == 3
+    assert [len(c["json"]["input"]) for c in stub.calls] == [3, 3, 1]
+
+
+def test_from_env_default_max_batch_is_five(monkeypatch):
+    """Default max_batch=5 matches the observed Pentatonic Gateway cap."""
+    monkeypatch.setenv("L4_NV_EMBED_URL", "https://gw/v1/embeddings")
+    monkeypatch.setenv("L4_EMBED_API_KEY", "k")
+    stub = _OpenAIEchoStub()
+    monkeypatch.setattr(httpx, "post", stub)
+    client = EmbedClient.from_env(prefix="L4_")
+    client.embed_batch([f"t{i}" for i in range(10)])
+    # 10 with default chunk=5 → [5, 5] → 2 calls
+    assert len(stub.calls) == 2