npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.10.6 → 0.10.7 - Mend

@pentatonic-ai/ai-agent-sdk 0.10.6 → 0.10.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/dist/index.cjs CHANGED Viewed

@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.6";
+var VERSION = "0.10.7";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/dist/index.js CHANGED Viewed

@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.6";
+var VERSION = "0.10.7";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.10.6",
+  "version": "0.10.7",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",

package/packages/memory-engine-v2/docker-compose.aws.yml CHANGED Viewed

@@ -19,6 +19,14 @@
 services:
   org-model:
+    # max_connections + shared_buffers must be passed via `-c` flags;
+    # the postgres:16-alpine image does NOT honor POSTGRES_MAX_CONNECTIONS
+    # or POSTGRES_SHARED_BUFFERS env vars (only POSTGRES_USER/PASSWORD/DB).
+    # 2026-05-19: bumped from compiled default 100 -> 200 after Pip's
+    # aborted-forget incident saturated the slots (4 stuck DELETEs +
+    # baseline pools). shared_buffers was raised to 1GB to match the
+    # operator intent previously expressed in the unread env vars.
+    command: ["postgres", "-c", "max_connections=200", "-c", "shared_buffers=1GB"]
     environment:
       # Production tuning: bigger shared_buffers for the materialised
       # views, more connection slots for the extractor + compat pools.
@@ -45,8 +53,53 @@ services:
       PG_DSN: ${PME_V2_PG_DSN}
       LLM_ENDPOINT: ${PME_V2_LLM_ENDPOINT:-}
       LLM_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY:-}
+      # Default model id for the AWS self-hosted distiller (Qwen2.5-7B-Instruct
+      # via vLLM on i-0d658d1aa70b497a6, served as `qwen2.5-7b-instruct`).
+      # When PME_V2_LLM_ENDPOINT points back at the Lambda 30B gateway,
+      # override LLM_MODEL via env to that gateway's model id.
+      LLM_MODEL: ${LLM_MODEL:-qwen2.5-7b-instruct}
+      # Self-hosted distiller (Qwen3.6-27B-FP8 on L40S, served via the
+      # autoscaled fleet). Tuning vs the Lambda 30B fleet: smaller
+      # per-call chunks, higher concurrency, longer timeout.
+      #
+      # EVENTS_PER_LLM_CALL=3 (was 5) + LLM_MAX_TOKENS_PER_EVENT_JSON=900
+      # (was the 400 default): the guided-JSON max_tokens budget is
+      # SHARED across the chunk's events, so dense events (full email/doc
+      # bodies maxing 8 ent/6 fct/6 rel ≈ ~1.1k output tokens each)
+      # clustering in a 5-event chunk overran the old 2000-tok ceiling
+      # and truncated the JSON array tail — 15% of calls finished on
+      # `length` not `stop` (measured 2026-06-12). 3×900=2700 output +
+      # ~2100 prompt = ~4.8k, well inside the L40S's 8192 max-model-len
+      # (16384 OOMs the L40S), giving every event real headroom.
+      # Quality over throughput — the autoscaler adds boxes to recover
+      # the per-box throughput lost to smaller chunks.
+      EVENTS_PER_LLM_CALL: "3"
+      CONCURRENT_LLM_CALLS: "20"
+      LLM_MAX_TOKENS_PER_EVENT_JSON: "900"
+      LLM_TIMEOUT_SEC: "300"
       POLL_INTERVAL_SEC: "10"
-      CLAIM_TTL_SEC: "600"
+      CLAIM_TTL_SEC: "900"
+      POLL_INTERVAL_SEC_AFTER_EMPTY: "5"
+      # Skip-source list — never distil agent's-own-output, code ingest,
+      # orchestrator briefings, manual triage events into the graph.
+      # Source labels enumerated as they were observed leaking into prod
+      # over the weekend. Add new agent producers here as well, even though
+      # the source_kind='agent' filter in worker.py should already drop them.
+      DISTILL_SKIP_SOURCES: "pip-code-ingest,claude-code-plugin,openclaw-seesa,openclaw-plugin,openclaw-philip-mossop,openclaw-jamie,seesa,seesa-direct-curl-test,seesa-dedup-probe,orchestrator-web,briefing-morning,briefing-eod,triage-email,triage-manual"
+      # Trace logging — captures raw teacher I/O per distilled event into
+      # the distillation_traces table for student-model training data.
+      # Opt-in: defaults false here; set DISTILL_TRACE_ENABLED=true in
+      # SSM Parameter Store to flip on. See ai-events-sdk PR #74 for the
+      # worker-side logic + the migration that creates the table.
+      DISTILL_TRACE_ENABLED: ${DISTILL_TRACE_ENABLED:-false}
+      DISTILL_OUTPUT_MODE: ${DISTILL_OUTPUT_MODE:-kv}
+      DISTILL_GUIDED_PARAM_STYLE: ${DISTILL_GUIDED_PARAM_STYLE:-response_format}
+      # Chat-template switches forwarded verbatim on every completion
+      # (vLLM `chat_template_kwargs`). Required for thinking-capable
+      # teachers — Qwen3.x defaults enable_thinking=true, which burns
+      # the token budget on reasoning the distiller never reads. Set in
+      # SSM to '{"enable_thinking": false}' for the Qwen3.6 teacher.
+      DISTILL_CHAT_TEMPLATE_KWARGS: ${DISTILL_CHAT_TEMPLATE_KWARGS:-}
   compat:
     environment:
@@ -54,8 +107,15 @@ services:
       VECTOR_INDEX_URL: http://vector-index:6333
       EXTRACTOR_SYNC_URL: http://extractor-sync:8101
       NV_EMBED_URL: ${NV_EMBED_URL}
+      # Bulk embed lane (PR #76 ai-events-sdk) — separate box from the
+      # interactive lane so heavy backfills don't queue behind chat
+      # query embeds. Set in SSM to a different IP from NV_EMBED_URL.
+      NV_EMBED_URL_BULK: ${NV_EMBED_URL_BULK}
       NV_EMBED_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY}
       NV_EMBED_PROVIDER: pentatonic-gateway
+      SEARCH_HYBRID_ENABLED: ${SEARCH_HYBRID_ENABLED:-}
+      SEARCH_MMR_ENABLED: ${SEARCH_MMR_ENABLED:-1}
+      SEARCH_INTENT_BOOST: ${SEARCH_INTENT_BOOST:-1}
       EMBED_DIM: "4096"
   # Cloudflared tunnel — same pattern as v1. Optional; only start if
@@ -76,3 +136,4 @@ services:
     depends_on:
       compat:
         condition: service_healthy

package/packages/memory-engine-v2/docker-compose.yml CHANGED Viewed

@@ -74,7 +74,14 @@ services:
   # --------------------------------------------------------------------
   vector-index:
     <<: *engine-base
-    image: qdrant/qdrant:v1.12.4
+    # v1.18.2: minimum version whose API can ADD a named (sparse) vector
+    # to an existing collection (PUT /collections/{c}/vectors/{v}) —
+    # required by hybrid retrieval's 'lex' migration. Upgraded in prod
+    # 2026-06-11 by stepping minors 1.13.6→…→1.18.2 (the 1.12→1.18
+    # direct jump fails: segment.json "unknown variant `on_disk`").
+    # Do NOT lower this pin: 1.18-migrated storage cannot be read by
+    # older servers.
+    image: qdrant/qdrant:v1.18.2
     container_name: pme2-vector-index
     ports:
       - "127.0.0.1:${PME_V2_QDRANT_HTTP_PORT:-16333}:6333"

package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py CHANGED Viewed

@@ -409,3 +409,47 @@ def test_guided_prompt_keeps_content_rules() -> None:
     # Pipe scaffolding gone
     assert "COUNT THE PIPES" not in p
     assert "PIPE-DELIMITED" not in p
+# ----------------------------------------------------------------------
+# DISTILL_CHAT_TEMPLATE_KWARGS — thinking-teacher template switch
+# ----------------------------------------------------------------------
+def test_default_body_has_no_chat_template_kwargs(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Unset env → the request body is byte-identical to before the
+    knob existed (Qwen2.5-class teachers need no template switches)."""
+    monkeypatch.delenv("DISTILL_CHAT_TEMPLATE_KWARGS", raising=False)
+    w = _load_worker("worker_no_ctk")
+    assert w.DISTILL_CHAT_TEMPLATE_KWARGS is None
+    assert "chat_template_kwargs" not in w._build_request_body("PROMPT", 5)
+def test_chat_template_kwargs_forwarded(monkeypatch: pytest.MonkeyPatch) -> None:
+    """The Qwen3.x swap case: {"enable_thinking": false} must land
+    verbatim in every request body, in both output modes."""
+    monkeypatch.setenv("DISTILL_CHAT_TEMPLATE_KWARGS", '{"enable_thinking": false}')
+    w = _load_worker("worker_ctk")
+    assert w.DISTILL_CHAT_TEMPLATE_KWARGS == {"enable_thinking": False}
+    body = w._build_request_body("PROMPT", 5)
+    assert body["chat_template_kwargs"] == {"enable_thinking": False}
+    monkeypatch.setenv("DISTILL_OUTPUT_MODE", "guided_json")
+    w2 = _load_worker("worker_ctk_guided")
+    body2 = w2._build_request_body("PROMPT", 5)
+    assert body2["chat_template_kwargs"] == {"enable_thinking": False}
+    assert "response_format" in body2
+def test_chat_template_kwargs_invalid_ignored(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Malformed JSON or a non-object must not take the worker down —
+    log + ignore, requests stay clean."""
+    for bad in ("{not json", '["a", "list"]', '"a string"'):
+        monkeypatch.setenv("DISTILL_CHAT_TEMPLATE_KWARGS", bad)
+        w = _load_worker(f"worker_ctk_bad_{abs(hash(bad))}")
+        assert w.DISTILL_CHAT_TEMPLATE_KWARGS is None
+        assert "chat_template_kwargs" not in w._build_request_body("PROMPT", 5)

package/packages/memory-engine-v2/extractor-async/worker.py CHANGED Viewed

@@ -149,13 +149,41 @@ if DISTILL_GUIDED_PARAM_STYLE not in ("response_format", "guided_json"):
     )
     DISTILL_GUIDED_PARAM_STYLE = "response_format"
+# Optional chat-template kwargs forwarded verbatim on every chat
+# completion (vLLM extension: top-level `chat_template_kwargs`).
+# Needed for thinking-capable teachers: Qwen3.x chat templates default
+# enable_thinking=true, which burns the max_tokens budget on reasoning
+# the distiller never reads. The 2026-06-11 teacher bake-off ran the
+# Qwen3.6 lanes with {"enable_thinking": false}, so the prod swap must
+# send the same switch for its traces to match the benchmarked
+# distribution. Unset (default) sends nothing — the request body stays
+# byte-identical for teachers without template switches (Qwen2.5).
+DISTILL_CHAT_TEMPLATE_KWARGS: dict[str, Any] | None = None
+_raw_ctk = os.environ.get("DISTILL_CHAT_TEMPLATE_KWARGS", "").strip()
+if _raw_ctk:
+    try:
+        _parsed_ctk = json.loads(_raw_ctk)
+        if not isinstance(_parsed_ctk, dict):
+            raise ValueError("must be a JSON object")
+        DISTILL_CHAT_TEMPLATE_KWARGS = _parsed_ctk
+    except ValueError as e:
+        log.warning(f"DISTILL_CHAT_TEMPLATE_KWARGS invalid ({e}) — ignoring")
 # JSON output carries structural overhead (braces, quotes, key names)
 # the KV format doesn't, so guided mode gets its own per-event token
 # budget. Truncation is guided mode's ONLY parse-failure mode (the
 # schema enforcer guarantees validity up to the cut), so this errs
 # higher than the KV 300.
+#
+# NOTE the budget is SHARED across the chunk (max_tokens = this × N
+# events per request). A fully-maxed event (8 ent / 6 fct with 140-char
+# statements / 6 rel + JSON overhead) is ~1.1k output tokens, so chunk
+# size and this value must be chosen together against the server's
+# max_model_len. Raised 400→900 after prod showed 15% of 5-event chunks
+# truncating on `length` (2026-06-12); prod now runs EVENTS_PER_LLM_CALL=3
+# so 3×900 output + ~2.1k prompt stays well inside the L40S 8192 ctx.
 LLM_MAX_TOKENS_PER_EVENT_JSON = int(
-    os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "400")
+    os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "900")
 )
@@ -667,6 +695,8 @@ def _build_request_body(user_prompt: str, n: int) -> dict[str, Any]:
             else LLM_MAX_TOKENS_PER_EVENT
         ) * n,
     }
+    if DISTILL_CHAT_TEMPLATE_KWARGS:
+        body["chat_template_kwargs"] = DISTILL_CHAT_TEMPLATE_KWARGS
     if DISTILL_OUTPUT_MODE == "guided_json":
         if DISTILL_GUIDED_PARAM_STYLE == "guided_json":
             body["guided_json"] = EXTRACTION_SCHEMA

package/packages/memory-engine-v2/extractor-sync/server.py CHANGED Viewed

@@ -56,11 +56,15 @@ _pool: AsyncConnectionPool | None = None
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global _pool
+    # Default (tuple) row factory — _upsert_entities and friends index
+    # fetchone() rows positionally, matching extractor-async's worker.
+    # A dict_row factory here turns row[0] into KeyError: 0 on the
+    # entity-merge path (2026-06-11 prod incident: every extract that
+    # re-saw a known entity 500'd; only never-seen-entity events stored).
     _pool = AsyncConnectionPool(
         conninfo=PG_DSN,
         min_size=8,
         max_size=50,
-        kwargs={"row_factory": psycopg.rows.dict_row},
         open=False,
     )
     await _pool.open()
@@ -89,7 +93,7 @@ class ExtractRequest(BaseModel):
     clientId: str
     userId: str | None = None
     event_type: str = "STORE_MEMORY"
-    source_kind: str  # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent'
+    source_kind: str  # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent' | 'code_reference'
     source_id: str | None = None
     content: str
     attributes: dict[str, Any] = {}

package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py CHANGED Viewed

@@ -22,8 +22,14 @@ import pytest
 # Load extractor-sync's server.py as a module so we can call its
-# private helpers directly.
+# private helpers directly. server.py flat-imports its siblings
+# (entity_id) the way the container's WORKDIR layout resolves them, so
+# this directory must be on sys.path — otherwise exec_module raises
+# ImportError and the module-level skip below silently swallows the
+# whole suite whenever pytest runs from the repo root.
 _THIS = Path(__file__).resolve().parent
+if str(_THIS) not in sys.path:
+    sys.path.insert(0, str(_THIS))
 _SPEC = importlib.util.spec_from_file_location("extractor_sync_server",
                                                 _THIS / "server.py")
 assert _SPEC and _SPEC.loader
@@ -206,3 +212,78 @@ def test_extract_event_organizer_object_form() -> None:
     assert len(entities) == 1
     assert entities[0]["canonical_name"] == "X Person"
     assert "x@example.com" in entities[0]["aliases"]
+# ----------------------------------------------------------------------
+# _upsert_entities — merge path indexes rows positionally
+# ----------------------------------------------------------------------
+#
+# Regression for the 2026-06-11 prod incident: the pool was configured
+# with row_factory=dict_row while _upsert_entities did `row[0]`, so the
+# merge branch (entity already known) raised KeyError: 0 and every
+# extract that re-saw a known entity 500'd. Only never-seen-entity
+# events could store. Two guards:
+#   1. the pool must keep psycopg's default tuple row factory
+#      (matching extractor-async's worker, which also indexes
+#      positionally), and
+#   2. the merge branch must work against tuple rows end-to-end.
+import asyncio
+import inspect
+class _FakeCursor:
+    """Quacks like psycopg.AsyncCursor, returning TUPLE rows — the
+    shape the pool's default row factory produces. If the pool ever
+    grows a custom row_factory again, update this fake to match it or
+    test_pool_keeps_default_tuple_row_factory will flag the drift."""
+    def __init__(self, existing_id: str | None) -> None:
+        self.executed: list[tuple[str, object]] = []
+        self._existing_id = existing_id
+    async def execute(self, sql: str, params: object = None) -> None:
+        self.executed.append((" ".join(sql.split()), params))
+    async def fetchone(self):
+        return (self._existing_id,) if self._existing_id else None
+def _entity_stub() -> dict:
+    return {
+        "id": "e_new",
+        "arena": "arena1",
+        "entity_type": "person",
+        "canonical_name": "Alice One",
+        "aliases": ["Alice One", "alice@example.com"],
+        "provenance_event_ids": ["evt1"],
+        "participant_set": ["arena1"],
+        "disclosure_class": "private",
+    }
+def test_pool_keeps_default_tuple_row_factory() -> None:
+    src = inspect.getsource(sync_server.lifespan)
+    assert "row_factory" not in src, (
+        "extractor-sync's pool must use psycopg's default tuple rows: "
+        "_upsert_entities indexes fetchone() results positionally."
+    )
+def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
+    """Entity already exists → UPDATE branch runs, id taken from row[0]."""
+    cur = _FakeCursor(existing_id="e_existing")
+    asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
+    updates = [(s, p) for s, p in cur.executed if s.startswith("UPDATE entities")]
+    assert len(updates) == 1
+    _, params = updates[0]
+    assert params[-1] == "e_existing"  # WHERE id = %s ← row[0]
+    assert not any(s.startswith("INSERT INTO entities") for s, _ in cur.executed)
+def test_upsert_entities_insert_branch_when_no_match() -> None:
+    cur = _FakeCursor(existing_id=None)
+    asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
+    inserts = [s for s, _ in cur.executed if s.startswith("INSERT INTO entities")]
+    assert len(inserts) == 1
+    assert not any(s.startswith("UPDATE entities") for s, _ in cur.executed)

package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql ADDED Viewed

@@ -0,0 +1,12 @@
+-- 004: accept 'code_reference' source events (SDK corpus ingest).
+--
+-- The SDK corpus module (packages/memory/src/corpus/) emits events with
+-- source_kind='code_reference' (code-signature ingest, adapters.js).
+-- The enum predates that feature, so those events bounced with
+-- InvalidTextRepresentation and could never be stored — observed in
+-- prod 2026-06-11 as persistent /extract 500s + producer retry loops.
+--
+-- ALTER TYPE ... ADD VALUE: the new value cannot be used until its transaction commits
+-- (PostgreSQL < 12 rejects it in a transaction block outright), so apply with autocommit (psql's default per-statement behaviour).
+-- Applied manually to prod (pme2-org-model) on 2026-06-11.
+ALTER TYPE source_kind ADD VALUE IF NOT EXISTS 'code_reference';

package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql ADDED Viewed

@@ -0,0 +1,20 @@
+-- 005: index every column that references events(id).
+--
+-- events has four referencing constraints:
+--   distillation_queue.event_id   ON DELETE CASCADE
+--   vector_provenance.event_id    ON DELETE CASCADE
+--   distillation_traces.event_id  ON DELETE CASCADE
+--   events.forgets (self)         ON DELETE SET NULL
+--
+-- Postgres does NOT auto-index FK referencing columns. Without these,
+-- every DELETE on events seq-scans each referencing table per deleted
+-- row to enforce the constraint — the 2026-06-11 arena-scoped nuke of
+-- ~70k events ran for HOURS until the missing indexes were created
+-- on-box. (distillation_queue.event_id already had
+-- idx_distillation_event_id from 003; listed here for completeness via IF NOT EXISTS.)
+--
+-- All idempotent; applied manually to prod (pme2-org-model) 2026-06-12.
+CREATE INDEX IF NOT EXISTS idx_distillation_event_id ON distillation_queue(event_id);
+CREATE INDEX IF NOT EXISTS idx_traces_event_id ON distillation_traces(event_id);
+CREATE INDEX IF NOT EXISTS idx_vector_provenance_event_id ON vector_provenance(event_id);
+CREATE INDEX IF NOT EXISTS idx_events_forgets ON events(forgets);