@pentatonic-ai/ai-agent-sdk 0.10.6 → 0.10.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
878
878
  }
879
879
 
880
880
  // src/telemetry.js
881
- var VERSION = "0.10.6";
881
+ var VERSION = "0.10.7";
882
882
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
883
883
  function machineId() {
884
884
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
847
847
  }
848
848
 
849
849
  // src/telemetry.js
850
- var VERSION = "0.10.6";
850
+ var VERSION = "0.10.7";
851
851
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
852
852
  function machineId() {
853
853
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.10.6",
3
+ "version": "0.10.7",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -19,6 +19,14 @@
19
19
 
20
20
  services:
21
21
  org-model:
22
+ # max_connections + shared_buffers must be passed via `-c` flags;
23
+ # the postgres:16-alpine image does NOT honor POSTGRES_MAX_CONNECTIONS
24
+ # or POSTGRES_SHARED_BUFFERS env vars (only POSTGRES_USER/PASSWORD/DB).
25
+ # 2026-05-19: bumped from compiled default 100 -> 200 after Pip's
26
+ # aborted-forget incident saturated the slots (4 stuck DELETEs +
27
+ # baseline pools). Shared_buffers raised to match the operator intent
28
+ # that was previously expressed in the unread env vars.
29
+ command: ["postgres", "-c", "max_connections=200", "-c", "shared_buffers=1GB"]
22
30
  environment:
23
31
  # Production tuning: bigger shared_buffers for the materialised
24
32
  # views, more connection slots for the extractor + compat pools.
@@ -45,8 +53,53 @@ services:
45
53
  PG_DSN: ${PME_V2_PG_DSN}
46
54
  LLM_ENDPOINT: ${PME_V2_LLM_ENDPOINT:-}
47
55
  LLM_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY:-}
56
+ # Default model id for the AWS self-hosted distiller (Qwen2.5-7B-Instruct
57
+ # via vLLM on i-0d658d1aa70b497a6, served as `qwen2.5-7b-instruct`).
58
+ # When PME_V2_LLM_ENDPOINT points back at the Lambda 30B gateway,
59
+ # override LLM_MODEL via env to that gateway's model id.
60
+ LLM_MODEL: ${LLM_MODEL:-qwen2.5-7b-instruct}
61
+ # Self-hosted distiller (Qwen3.6-27B-FP8 on L40S, served via the
62
+ # autoscaled fleet). Tuning vs the Lambda 30B fleet: smaller
63
+ # per-call chunks, higher concurrency, longer timeout.
64
+ #
65
+ # EVENTS_PER_LLM_CALL=3 (was 5) + LLM_MAX_TOKENS_PER_EVENT_JSON=900
66
+ # (was the 400 default): the guided-JSON max_tokens budget is
67
+ # SHARED across the chunk's events, so dense events (full email/doc
68
+ # bodies maxing 8 ent/6 fct/6 rel ≈ 1.1k output tokens each)
69
+ # clustering in a 5-event chunk overran the old 2000-tok ceiling
70
+ # and truncated the JSON array tail — 15% of calls finished on
71
+ # `length` not `stop` (measured 2026-06-12). 3×900=2700 output +
72
+ # ~2100 prompt = ~4.8k, well inside the L40S's 8192 max-model-len
73
+ # (16384 OOMs the L40S), giving every event real headroom.
74
+ # Quality over throughput — the autoscaler adds boxes to recover
75
+ # the per-box throughput lost to smaller chunks.
76
+ EVENTS_PER_LLM_CALL: "3"
77
+ CONCURRENT_LLM_CALLS: "20"
78
+ LLM_MAX_TOKENS_PER_EVENT_JSON: "900"
79
+ LLM_TIMEOUT_SEC: "300"
48
80
  POLL_INTERVAL_SEC: "10"
49
- CLAIM_TTL_SEC: "600"
81
+ CLAIM_TTL_SEC: "900"
82
+ POLL_INTERVAL_SEC_AFTER_EMPTY: "5"
83
+ # Skip-source list — never distil agent's-own-output, code ingest,
84
+ # orchestrator briefings, manual triage events into the graph.
85
+ # Source labels enumerated as they were observed leaking into prod
86
+ # over the weekend. New agent producers should be added here even
87
+ # though source_kind='agent' filtering in worker.py should already drop them.
88
+ DISTILL_SKIP_SOURCES: "pip-code-ingest,claude-code-plugin,openclaw-seesa,openclaw-plugin,openclaw-philip-mossop,openclaw-jamie,seesa,seesa-direct-curl-test,seesa-dedup-probe,orchestrator-web,briefing-morning,briefing-eod,triage-email,triage-manual"
89
+ # Trace logging — captures raw teacher I/O per distilled event into
90
+ # the distillation_traces table for student-model training data.
91
+ # Opt-in: defaults false here; set DISTILL_TRACE_ENABLED=true in
92
+ # SSM Parameter Store to flip on. See ai-events-sdk PR #74 for the
93
+ # worker-side logic + the migration that creates the table.
94
+ DISTILL_TRACE_ENABLED: ${DISTILL_TRACE_ENABLED:-false}
95
+ DISTILL_OUTPUT_MODE: ${DISTILL_OUTPUT_MODE:-kv}
96
+ DISTILL_GUIDED_PARAM_STYLE: ${DISTILL_GUIDED_PARAM_STYLE:-response_format}
97
+ # Chat-template switches forwarded verbatim on every completion
98
+ # (vLLM `chat_template_kwargs`). Required for thinking-capable
99
+ # teachers — Qwen3.x defaults enable_thinking=true, which burns
100
+ # the token budget on reasoning the distiller never reads. Set in
101
+ # SSM to '{"enable_thinking": false}' for the Qwen3.6 teacher.
102
+ DISTILL_CHAT_TEMPLATE_KWARGS: ${DISTILL_CHAT_TEMPLATE_KWARGS:-}
50
103
 
51
104
  compat:
52
105
  environment:
@@ -54,8 +107,15 @@ services:
54
107
  VECTOR_INDEX_URL: http://vector-index:6333
55
108
  EXTRACTOR_SYNC_URL: http://extractor-sync:8101
56
109
  NV_EMBED_URL: ${NV_EMBED_URL}
110
+ # Bulk embed lane (PR #76 ai-events-sdk) — separate box from the
111
+ # interactive lane so heavy backfills don't queue behind chat
112
+ # query embeds. Set in SSM to a different IP from NV_EMBED_URL.
113
+ NV_EMBED_URL_BULK: ${NV_EMBED_URL_BULK}
57
114
  NV_EMBED_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY}
58
115
  NV_EMBED_PROVIDER: pentatonic-gateway
116
+ SEARCH_HYBRID_ENABLED: ${SEARCH_HYBRID_ENABLED:-}
117
+ SEARCH_MMR_ENABLED: ${SEARCH_MMR_ENABLED:-1}
118
+ SEARCH_INTENT_BOOST: ${SEARCH_INTENT_BOOST:-1}
59
119
  EMBED_DIM: "4096"
60
120
 
61
121
  # Cloudflared tunnel — same pattern as v1. Optional; only start if
@@ -76,3 +136,4 @@ services:
76
136
  depends_on:
77
137
  compat:
78
138
  condition: service_healthy
139
+
@@ -74,7 +74,14 @@ services:
74
74
  # --------------------------------------------------------------------
75
75
  vector-index:
76
76
  <<: *engine-base
77
- image: qdrant/qdrant:v1.12.4
77
+ # v1.18.2: minimum version whose API can ADD a named (sparse) vector
78
+ # to an existing collection (PUT /collections/{c}/vectors/{v}) —
79
+ # required by hybrid retrieval's 'lex' migration. Upgraded in prod
80
+ # 2026-06-11 by stepping minors 1.13.6→…→1.18.2 (the 1.12→1.18
81
+ # direct jump fails: segment.json "unknown variant `on_disk`").
82
+ # Do NOT lower this pin: 1.18-migrated storage cannot be read by
83
+ # older servers.
84
+ image: qdrant/qdrant:v1.18.2
78
85
  container_name: pme2-vector-index
79
86
  ports:
80
87
  - "127.0.0.1:${PME_V2_QDRANT_HTTP_PORT:-16333}:6333"
@@ -409,3 +409,47 @@ def test_guided_prompt_keeps_content_rules() -> None:
409
409
  # Pipe scaffolding gone
410
410
  assert "COUNT THE PIPES" not in p
411
411
  assert "PIPE-DELIMITED" not in p
412
+
413
+
414
+ # ----------------------------------------------------------------------
415
+ # DISTILL_CHAT_TEMPLATE_KWARGS — thinking-teacher template switch
416
+ # ----------------------------------------------------------------------
417
+
418
+
419
+ def test_default_body_has_no_chat_template_kwargs(
420
+ monkeypatch: pytest.MonkeyPatch,
421
+ ) -> None:
422
+ """Unset env → the request body is byte-identical to before the
423
+ knob existed (Qwen2.5-class teachers need no template switches)."""
424
+ monkeypatch.delenv("DISTILL_CHAT_TEMPLATE_KWARGS", raising=False)
425
+ w = _load_worker("worker_no_ctk")
426
+ assert w.DISTILL_CHAT_TEMPLATE_KWARGS is None
427
+ assert "chat_template_kwargs" not in w._build_request_body("PROMPT", 5)
428
+
429
+
430
+ def test_chat_template_kwargs_forwarded(monkeypatch: pytest.MonkeyPatch) -> None:
431
+ """The Qwen3.x swap case: {"enable_thinking": false} must land
432
+ verbatim in every request body, in both output modes."""
433
+ monkeypatch.setenv("DISTILL_CHAT_TEMPLATE_KWARGS", '{"enable_thinking": false}')
434
+ w = _load_worker("worker_ctk")
435
+ assert w.DISTILL_CHAT_TEMPLATE_KWARGS == {"enable_thinking": False}
436
+ body = w._build_request_body("PROMPT", 5)
437
+ assert body["chat_template_kwargs"] == {"enable_thinking": False}
438
+
439
+ monkeypatch.setenv("DISTILL_OUTPUT_MODE", "guided_json")
440
+ w2 = _load_worker("worker_ctk_guided")
441
+ body2 = w2._build_request_body("PROMPT", 5)
442
+ assert body2["chat_template_kwargs"] == {"enable_thinking": False}
443
+ assert "response_format" in body2
444
+
445
+
446
+ def test_chat_template_kwargs_invalid_ignored(
447
+ monkeypatch: pytest.MonkeyPatch,
448
+ ) -> None:
449
+ """Malformed JSON or a non-object must not take the worker down —
450
+ log + ignore, requests stay clean."""
451
+ for bad in ("{not json", '["a", "list"]', '"a string"'):
452
+ monkeypatch.setenv("DISTILL_CHAT_TEMPLATE_KWARGS", bad)
453
+ w = _load_worker(f"worker_ctk_bad_{abs(hash(bad))}")
454
+ assert w.DISTILL_CHAT_TEMPLATE_KWARGS is None
455
+ assert "chat_template_kwargs" not in w._build_request_body("PROMPT", 5)
@@ -149,13 +149,41 @@ if DISTILL_GUIDED_PARAM_STYLE not in ("response_format", "guided_json"):
149
149
  )
150
150
  DISTILL_GUIDED_PARAM_STYLE = "response_format"
151
151
 
152
+ # Optional chat-template kwargs forwarded verbatim on every chat
153
+ # completion (vLLM extension: top-level `chat_template_kwargs`).
154
+ # Needed for thinking-capable teachers: Qwen3.x chat templates default
155
+ # enable_thinking=true, which burns the max_tokens budget on reasoning
156
+ # the distiller never reads. The 2026-06-11 teacher bake-off ran the
157
+ # Qwen3.6 lanes with {"enable_thinking": false}, so the prod swap must
158
+ # send the same switch for its traces to match the benchmarked
159
+ # distribution. Unset (default) sends nothing — the request body stays
160
+ # byte-identical for teachers without template switches (Qwen2.5).
161
+ DISTILL_CHAT_TEMPLATE_KWARGS: dict[str, Any] | None = None
162
+ _raw_ctk = os.environ.get("DISTILL_CHAT_TEMPLATE_KWARGS", "").strip()
163
+ if _raw_ctk:
164
+ try:
165
+ _parsed_ctk = json.loads(_raw_ctk)
166
+ if not isinstance(_parsed_ctk, dict):
167
+ raise ValueError("must be a JSON object")
168
+ DISTILL_CHAT_TEMPLATE_KWARGS = _parsed_ctk
169
+ except ValueError as e:
170
+ log.warning(f"DISTILL_CHAT_TEMPLATE_KWARGS invalid ({e}) — ignoring")
171
+
152
172
  # JSON output carries structural overhead (braces, quotes, key names)
153
173
  # the KV format doesn't, so guided mode gets its own per-event token
154
174
  # budget. Truncation is guided mode's ONLY parse-failure mode (the
155
175
  # schema enforcer guarantees validity up to the cut), so this errs
156
176
  # higher than the KV 300.
177
+ #
178
+ # NOTE the budget is SHARED across the chunk (max_tokens = this × N
179
+ # events per request). A fully-maxed event (8 ent / 6 fct with 140-char
180
+ # statements / 6 rel + JSON overhead) is ~1.1k output tokens, so chunk
181
+ # size and this value must be chosen together against the server's
182
+ # max_model_len. Raised 400→900 after prod showed 15% of 5-event chunks
183
+ # truncating on `length` (2026-06-12); prod now runs EVENTS_PER_LLM_CALL=3
184
+ # so 3×900 output + ~2.1k prompt stays well inside the L40S 8192 ctx.
157
185
  LLM_MAX_TOKENS_PER_EVENT_JSON = int(
158
- os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "400")
186
+ os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "900")
159
187
  )
160
188
 
161
189
 
@@ -667,6 +695,8 @@ def _build_request_body(user_prompt: str, n: int) -> dict[str, Any]:
667
695
  else LLM_MAX_TOKENS_PER_EVENT
668
696
  ) * n,
669
697
  }
698
+ if DISTILL_CHAT_TEMPLATE_KWARGS:
699
+ body["chat_template_kwargs"] = DISTILL_CHAT_TEMPLATE_KWARGS
670
700
  if DISTILL_OUTPUT_MODE == "guided_json":
671
701
  if DISTILL_GUIDED_PARAM_STYLE == "guided_json":
672
702
  body["guided_json"] = EXTRACTION_SCHEMA
@@ -56,11 +56,15 @@ _pool: AsyncConnectionPool | None = None
56
56
  @asynccontextmanager
57
57
  async def lifespan(app: FastAPI):
58
58
  global _pool
59
+ # Default (tuple) row factory — _upsert_entities and friends index
60
+ # fetchone() rows positionally, matching extractor-async's worker.
61
+ # A dict_row factory here turns row[0] into KeyError: 0 on the
62
+ # entity-merge path (2026-06-11 prod incident: every extract that
63
+ # re-saw a known entity 500'd; only never-seen-entity events stored).
59
64
  _pool = AsyncConnectionPool(
60
65
  conninfo=PG_DSN,
61
66
  min_size=8,
62
67
  max_size=50,
63
- kwargs={"row_factory": psycopg.rows.dict_row},
64
68
  open=False,
65
69
  )
66
70
  await _pool.open()
@@ -89,7 +93,7 @@ class ExtractRequest(BaseModel):
89
93
  clientId: str
90
94
  userId: str | None = None
91
95
  event_type: str = "STORE_MEMORY"
92
- source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent'
96
+ source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent' | 'code_reference'
93
97
  source_id: str | None = None
94
98
  content: str
95
99
  attributes: dict[str, Any] = {}
@@ -22,8 +22,14 @@ import pytest
22
22
 
23
23
 
24
24
  # Load extractor-sync's server.py as a module so we can call its
25
- # private helpers directly.
25
+ # private helpers directly. server.py flat-imports its siblings
26
+ # (entity_id) the way the container's WORKDIR layout resolves them, so
27
+ # this directory must be on sys.path — otherwise exec_module raises
28
+ # ImportError and the module-level skip below silently swallows the
29
+ # whole suite whenever pytest runs from the repo root.
26
30
  _THIS = Path(__file__).resolve().parent
31
+ if str(_THIS) not in sys.path:
32
+ sys.path.insert(0, str(_THIS))
27
33
  _SPEC = importlib.util.spec_from_file_location("extractor_sync_server",
28
34
  _THIS / "server.py")
29
35
  assert _SPEC and _SPEC.loader
@@ -206,3 +212,78 @@ def test_extract_event_organizer_object_form() -> None:
206
212
  assert len(entities) == 1
207
213
  assert entities[0]["canonical_name"] == "X Person"
208
214
  assert "x@example.com" in entities[0]["aliases"]
215
+
216
+
217
+ # ----------------------------------------------------------------------
218
+ # _upsert_entities — merge path indexes rows positionally
219
+ # ----------------------------------------------------------------------
220
+ #
221
+ # Regression for the 2026-06-11 prod incident: the pool was configured
222
+ # with row_factory=dict_row while _upsert_entities did `row[0]`, so the
223
+ # merge branch (entity already known) raised KeyError: 0 and every
224
+ # extract that re-saw a known entity 500'd. Only never-seen-entity
225
+ # events could store. Two guards:
226
+ # 1. the pool must keep psycopg's default tuple row factory
227
+ # (matching extractor-async's worker, which also indexes
228
+ # positionally), and
229
+ # 2. the merge branch must work against tuple rows end-to-end.
230
+
231
+ import asyncio
232
+ import inspect
233
+
234
+
235
+ class _FakeCursor:
236
+ """Quacks like psycopg.AsyncCursor, returning TUPLE rows — the
237
+ shape the pool's default row factory produces. If the pool ever
238
+ grows a custom row_factory again, update this fake to match it or
239
+ test_pool_keeps_default_tuple_row_factory will flag the drift."""
240
+
241
+ def __init__(self, existing_id: str | None) -> None:
242
+ self.executed: list[tuple[str, object]] = []
243
+ self._existing_id = existing_id
244
+
245
+ async def execute(self, sql: str, params: object = None) -> None:
246
+ self.executed.append((" ".join(sql.split()), params))
247
+
248
+ async def fetchone(self):
249
+ return (self._existing_id,) if self._existing_id else None
250
+
251
+
252
+ def _entity_stub() -> dict:
253
+ return {
254
+ "id": "e_new",
255
+ "arena": "arena1",
256
+ "entity_type": "person",
257
+ "canonical_name": "Alice One",
258
+ "aliases": ["Alice One", "alice@example.com"],
259
+ "provenance_event_ids": ["evt1"],
260
+ "participant_set": ["arena1"],
261
+ "disclosure_class": "private",
262
+ }
263
+
264
+
265
+ def test_pool_keeps_default_tuple_row_factory() -> None:
266
+ src = inspect.getsource(sync_server.lifespan)
267
+ assert "row_factory" not in src, (
268
+ "extractor-sync's pool must use psycopg's default tuple rows: "
269
+ "_upsert_entities indexes fetchone() results positionally."
270
+ )
271
+
272
+
273
+ def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
274
+ """Entity already exists → UPDATE branch runs, id taken from row[0]."""
275
+ cur = _FakeCursor(existing_id="e_existing")
276
+ asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
277
+ updates = [(s, p) for s, p in cur.executed if s.startswith("UPDATE entities")]
278
+ assert len(updates) == 1
279
+ _, params = updates[0]
280
+ assert params[-1] == "e_existing" # WHERE id = %s ← row[0]
281
+ assert not any(s.startswith("INSERT INTO entities") for s, _ in cur.executed)
282
+
283
+
284
+ def test_upsert_entities_insert_branch_when_no_match() -> None:
285
+ cur = _FakeCursor(existing_id=None)
286
+ asyncio.run(sync_server._upsert_entities(cur, [_entity_stub()]))
287
+ inserts = [s for s, _ in cur.executed if s.startswith("INSERT INTO entities")]
288
+ assert len(inserts) == 1
289
+ assert not any(s.startswith("UPDATE entities") for s, _ in cur.executed)
@@ -0,0 +1,12 @@
1
+ -- 004: accept 'code_reference' source events (SDK corpus ingest).
2
+ --
3
+ -- The SDK corpus module (packages/memory/src/corpus/) emits events with
4
+ -- source_kind='code_reference' (code-signature ingest, adapters.js).
5
+ -- The enum predates that feature, so those events bounced with
6
+ -- InvalidTextRepresentation and could never be stored — observed in
7
+ -- prod 2026-06-11 as persistent /extract 500s + producer retry loops.
8
+ --
9
+ -- ALTER TYPE ... ADD VALUE cannot run inside a transaction block;
10
+ -- apply with autocommit (psql's default per-statement behaviour).
11
+ -- Applied manually to prod (pme2-org-model) on 2026-06-11.
12
+ ALTER TYPE source_kind ADD VALUE IF NOT EXISTS 'code_reference';
@@ -0,0 +1,20 @@
1
+ -- 005: index every column that references events(id).
2
+ --
3
+ -- events has four referencing constraints:
4
+ -- distillation_queue.event_id ON DELETE CASCADE
5
+ -- vector_provenance.event_id ON DELETE CASCADE
6
+ -- distillation_traces.event_id ON DELETE CASCADE
7
+ -- events.forgets (self) ON DELETE SET NULL
8
+ --
9
+ -- Postgres does NOT auto-index FK referencing columns. Without these,
10
+ -- every DELETE on events seq-scans each referencing table per deleted
11
+ -- row to enforce the constraint — the 2026-06-11 arena-scoped nuke of
12
+ -- ~70k events ran for HOURS until the missing indexes were created
13
+ -- on-box. (distillation_queue.event_id already had
14
+ -- idx_distillation_event_id from 003; listed here for completeness via IF NOT EXISTS.)
15
+ --
16
+ -- All idempotent; applied manually to prod (pme2-org-model) 2026-06-12.
17
+ CREATE INDEX IF NOT EXISTS idx_distillation_event_id ON distillation_queue(event_id);
18
+ CREATE INDEX IF NOT EXISTS idx_traces_event_id ON distillation_traces(event_id);
19
+ CREATE INDEX IF NOT EXISTS idx_vector_provenance_event_id ON vector_provenance(event_id);
20
+ CREATE INDEX IF NOT EXISTS idx_events_forgets ON events(forgets);