@pentatonic-ai/ai-agent-sdk 0.10.5 → 0.10.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. package/dist/index.cjs +1 -1
  2. package/dist/index.js +1 -1
  3. package/package.json +1 -1
  4. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  5. package/packages/memory-engine-v2/compat/server.py +258 -18
  6. package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
  7. package/packages/memory-engine-v2/docker-compose.yml +8 -1
  8. package/packages/memory-engine-v2/eval/recall_at_k.py +242 -0
  9. package/packages/memory-engine-v2/eval/retrieval_golden.seed.json +69 -0
  10. package/packages/memory-engine-v2/extractor-async/Dockerfile +1 -1
  11. package/packages/memory-engine-v2/extractor-async/extraction_schema.py +246 -0
  12. package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +455 -0
  13. package/packages/memory-engine-v2/extractor-async/worker.py +391 -31
  14. package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
  15. package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
  16. package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
  17. package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
  18. package/packages/memory-engine-v2/resolution-queue-design.md +165 -0
  19. package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +11 -2
  20. package/packages/memory-engine-v2/scripts/backfill_sparse_vectors.py +369 -0
  21. package/packages/memory-engine-v2/scripts/bakeoff_guided_vs_kv.py +607 -0
  22. package/packages/memory-engine-v2/scripts/entity_resolution_v2.py +1041 -0
  23. package/packages/memory-engine-v2/tests/test_entity_resolution_v2.py +507 -0
  24. package/packages/memory-engine-v2/tests/test_hybrid_retrieval.py +810 -0
@@ -41,6 +41,14 @@ import psycopg.rows
41
41
 
42
42
  from confidence import corroborated_confidence
43
43
  from entity_id import entity_id, normalize_surface_form
44
+ from extraction_schema import (
45
+ ALLOWED_ENT_TYPES,
46
+ ALLOWED_FCT_CATEGORIES,
47
+ EXTRACTION_SCHEMA,
48
+ MAX_ENTITIES_PER_EVENT,
49
+ MAX_FACTS_PER_EVENT,
50
+ MAX_RELATIONSHIPS_PER_EVENT,
51
+ )
44
52
  from noise_filter import is_noise_entity_name
45
53
  from sensitive_filter import SKIP_SENSITIVE_CONTENT, is_sensitive_event
46
54
 
@@ -90,15 +98,93 @@ DISTILL_TRACE_ENABLED = os.environ.get(
90
98
  # chunk via a JSONDecodeError. Pipe-delimited records, one per line,
91
99
  # recover at line granularity — a malformed line skips itself, the rest
92
100
  # of the chunk lands. See 2026-05-18 ops notes.
101
+ #
102
+ # 2026-06-11 update: guided JSON is back as an OPT-IN second mode
103
+ # (DISTILL_OUTPUT_MODE=guided_json, default "kv" — a no-op until an
104
+ # operator flips it). Both halves of the 2026-05-18 removal rationale
105
+ # are answered this time:
106
+ # (a) the self-hosted Qwen2.5-7B vLLM box enforces structured output
107
+ # via logit masking (xgrammar/outlines) — the model CANNOT emit
108
+ # schema-invalid bytes, unlike the old VL gateway which
109
+ # half-ignored response_format;
110
+ # (b) blast radius is solved structurally — the schema is an array
111
+ # of per-event objects (see extraction_schema.py), so one
112
+ # event's content can't corrupt another's parse; the only
113
+ # residual failure is max_tokens truncation, and
114
+ # _parse_guided_json salvages every complete event object.
115
+ # ALLOWED_ENT_TYPES / ALLOWED_FCT_CATEGORIES now live in
116
+ # extraction_schema.py (imported above) so the schema enums and the
117
+ # KV prompt pin to the same single source.
93
118
  EVENT_HEADER_RE = re.compile(r"^===?\s*event\s+(\d+)\s*===?\s*$", re.IGNORECASE)
94
- ALLOWED_ENT_TYPES = {
95
- "person", "org", "product", "place", "project",
96
- "concept", "topic", "date", "other",
97
- }
98
- ALLOWED_FCT_CATEGORIES = {
99
- "decision", "commitment", "state", "mention",
100
- "observation", "preference",
101
- }
119
+
120
+ # Output mode flag. "kv" (default) keeps today's pipe-delimited path
121
+ # byte-for-byte; "guided_json" switches the prompt, request params and
122
+ # parser. Anything unrecognised falls back to "kv" — fail-safe.
123
+ DISTILL_OUTPUT_MODE = os.environ.get("DISTILL_OUTPUT_MODE", "kv").strip().lower()
124
+ if DISTILL_OUTPUT_MODE not in ("kv", "guided_json"):
125
+ log.warning(
126
+ f"DISTILL_OUTPUT_MODE={DISTILL_OUTPUT_MODE!r} unrecognised — using 'kv'"
127
+ )
128
+ DISTILL_OUTPUT_MODE = "kv"
129
+
130
+ # How the structured-output schema is attached to the request in
131
+ # guided_json mode. The repo carries no pin for the engine box's vLLM
132
+ # version, so this is operator-selectable:
133
+ # - "response_format" (default): OpenAI-style
134
+ # response_format={"type":"json_schema","json_schema":{...}} —
135
+ # supported by vLLM >= 0.6.x OpenAI-compat server.
136
+ # - "guided_json": vLLM's legacy extension param (top-level
137
+ # `guided_json` in the request body; what openai-client users pass
138
+ # via extra_body). FALLBACK for older vLLM builds that predate
139
+ # json_schema response_format.
140
+ # Exactly one is sent — some vLLM versions reject requests that carry
141
+ # both guided-decoding params at once.
142
+ DISTILL_GUIDED_PARAM_STYLE = os.environ.get(
143
+ "DISTILL_GUIDED_PARAM_STYLE", "response_format"
144
+ ).strip().lower()
145
+ if DISTILL_GUIDED_PARAM_STYLE not in ("response_format", "guided_json"):
146
+ log.warning(
147
+ f"DISTILL_GUIDED_PARAM_STYLE={DISTILL_GUIDED_PARAM_STYLE!r} unrecognised "
148
+ f"— using 'response_format'"
149
+ )
150
+ DISTILL_GUIDED_PARAM_STYLE = "response_format"
151
+
152
+ # Optional chat-template kwargs forwarded verbatim on every chat
153
+ # completion (vLLM extension: top-level `chat_template_kwargs`).
154
+ # Needed for thinking-capable teachers: Qwen3.x chat templates default
155
+ # enable_thinking=true, which burns the max_tokens budget on reasoning
156
+ # the distiller never reads. The 2026-06-11 teacher bake-off ran the
157
+ # Qwen3.6 lanes with {"enable_thinking": false}, so the prod swap must
158
+ # send the same switch for its traces to match the benchmarked
159
+ # distribution. Unset (default) sends nothing — the request body stays
160
+ # byte-identical for teachers without template switches (Qwen2.5).
161
+ DISTILL_CHAT_TEMPLATE_KWARGS: dict[str, Any] | None = None
162
+ _raw_ctk = os.environ.get("DISTILL_CHAT_TEMPLATE_KWARGS", "").strip()
163
+ if _raw_ctk:
164
+ try:
165
+ _parsed_ctk = json.loads(_raw_ctk)
166
+ if not isinstance(_parsed_ctk, dict):
167
+ raise ValueError("must be a JSON object")
168
+ DISTILL_CHAT_TEMPLATE_KWARGS = _parsed_ctk
169
+ except ValueError as e:
170
+ log.warning(f"DISTILL_CHAT_TEMPLATE_KWARGS invalid ({e}) — ignoring")
171
+
172
+ # JSON output carries structural overhead (braces, quotes, key names)
173
+ # the KV format doesn't, so guided mode gets its own per-event token
174
+ # budget. Truncation is guided mode's ONLY parse-failure mode (the
175
+ # schema enforcer guarantees validity up to the cut), so this errs
176
+ # higher than the KV 300.
177
+ #
178
+ # NOTE the budget is SHARED across the chunk (max_tokens = this × N
179
+ # events per request). A fully-maxed event (8 ent / 6 fct with 140-char
180
+ # statements / 6 rel + JSON overhead) is ~1.1k output tokens, so chunk
181
+ # size and this value must be chosen together against the server's
182
+ # max_model_len. Raised 400→900 after prod showed 15% of 5-event chunks
183
+ # truncating on `length` (2026-06-12); prod now runs EVENTS_PER_LLM_CALL=3
184
+ # so 3×900 output + ~2.1k prompt stays well inside the L40S 8192 ctx.
185
# Per-event output-token budget for guided_json mode (multiplied by the
# number of events in the chunk when the request body is built).
# Parsed fail-safe like the other DISTILL_* flags: a non-integer or
# non-positive value logs a warning and falls back to the default
# instead of crashing the worker at import time or sending a
# max_tokens <= 0 request.
try:
    LLM_MAX_TOKENS_PER_EVENT_JSON = int(
        os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "900")
    )
    if LLM_MAX_TOKENS_PER_EVENT_JSON <= 0:
        raise ValueError("must be a positive integer")
except ValueError as e:
    log.warning(f"LLM_MAX_TOKENS_PER_EVENT_JSON invalid ({e}) — using 900")
    LLM_MAX_TOKENS_PER_EVENT_JSON = 900
102
188
 
103
189
 
104
190
  # --------------------------------------------------------------------
@@ -165,10 +251,71 @@ A whole file is one entity, not twenty.
165
251
  - Output ONLY the formatted records. No header, no footer, no prose."""
166
252
 
167
253
 
254
+ # Guided-JSON variant of BATCH_SYSTEM_PROMPT. Same CONTENT rules
255
+ # (conservatism, per-event caps, code-content rule, subject-must-be-a-
256
+ # declared-entity, email-alias pairing, statement <= 140 chars, never
257
+ # skip an event) — only the output-format scaffolding changes. The
258
+ # pipe-format anchoring ("COUNT THE PIPES", pipe/newline substitution)
259
+ # is dropped: vLLM's guided decoding enforces the schema mechanically,
260
+ # so the prompt no longer needs to beg for format compliance, and JSON
261
+ # string escaping makes the pipe/newline substitution rules moot.
262
+ GUIDED_JSON_SYSTEM_PROMPT = """You extract structured knowledge from N \
263
+ events for a personal-memory graph.
264
+
265
+ You will receive N events, each prefixed with `[event K]`. Respond \
266
+ with a single JSON object: {"events": [...]} containing one object \
267
+ per input event. Be conservative — only emit things explicitly stated.
268
+
269
+ Each per-event object has:
270
+ "index": the zero-indexed event number, matching the input `[event K]`.
271
+ "entities": array of {"name", "type", "email"?}.
272
+ "facts": array of {"category", "subject", "predicate", "object", "statement"}.
273
+ "relationships": array of {"from", "to", "type"}.
274
+
275
+ RULES:
276
+ - NEVER skip an event — if an event has nothing to extract, emit its \
277
+ object with "index" set and empty arrays.
278
+ - entities: type ∈ {person, org, product, place, project, concept, \
279
+ topic, date, other}.
280
+ email (OPTIONAL, person only): when the event body or attributes
281
+ show an email address that unambiguously identifies the person,
282
+ include it. This pairs the name+email forms so a later event seeing
283
+ only the email resolves to the same entity. Omit the key otherwise.
284
+ - facts: category ∈ {decision, commitment, state, mention, \
285
+ observation, preference}.
286
+ subject MUST be an entity name declared in THIS event's "entities".
287
+ predicate is a short verb phrase (e.g. "agreed to", "owns", "works at").
288
+ object MAY be an entity name OR a literal string OR null if absent.
289
+ statement ≤ 140 characters, a self-contained sentence.
290
+ WORKED EXAMPLE: {"category": "commitment", "subject": "Timothy \
291
+ Bradley", "predicate": "agreed to", "object": "SAFE amendments", \
292
+ "statement": "Timothy confirmed the SAFE amendments are set (14 May 2026)"}
293
+ - relationships: "from" and "to" MUST be entity names declared in THIS \
294
+ event's "entities". "type" is a short verb / preposition phrase.
295
+ - HARD CAPS per event: 8 entities, 6 facts, 6 relationships. Pick the \
296
+ most salient.
297
+ - For code / technical content: extract only top-level services, \
298
+ modules, or domain concepts. NOT variables, types, or method names. \
299
+ A whole file is one entity, not twenty.
300
+ - Output ONLY the JSON object. No markdown fences, no prose."""
301
+
302
+
303
+ # The system prompt actually sent to the LLM under the current output
304
+ # mode. Everything downstream (request body, trace fingerprint) hangs
305
+ # off this so the two can never disagree.
306
+ ACTIVE_SYSTEM_PROMPT = (
307
+ GUIDED_JSON_SYSTEM_PROMPT
308
+ if DISTILL_OUTPUT_MODE == "guided_json"
309
+ else BATCH_SYSTEM_PROMPT
310
+ )
311
+
168
312
  # Teacher-prompt fingerprint for trace logging. If the prompt changes,
169
313
  # the hash changes — lets training-data exports filter by teacher
170
- # version so we never mix outputs from a retired prompt.
171
- SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
314
+ # version so we never mix outputs from a retired prompt. Computed from
315
+ # the ACTIVE prompt, so flipping DISTILL_OUTPUT_MODE auto-segments
316
+ # distillation_traces into a new teacher version (KV-format traces and
317
+ # guided-JSON traces never mix in a training export).
318
+ SYSTEM_PROMPT_HASH = hashlib.sha256(ACTIVE_SYSTEM_PROMPT.encode()).hexdigest()[:16]
172
319
 
173
320
 
174
321
  # --------------------------------------------------------------------
@@ -353,14 +500,234 @@ def _split_event_blocks(text: str, expected_n: int) -> list[str]:
353
500
  return slices
354
501
 
355
502
 
503
+ # --------------------------------------------------------------------
504
+ # Guided-JSON parsing (DISTILL_OUTPUT_MODE=guided_json)
505
+ # --------------------------------------------------------------------
506
+
507
+
508
+ def _load_guided_payload(text: str) -> dict[str, Any] | None:
509
+ """Parse the guided-JSON chunk output into the {"events": [...]}
510
+ payload, salvaging what's complete if the output was truncated.
511
+
512
+ Under guided decoding the server's logit masking guarantees every
513
+ emitted byte is schema-consistent, so the ONLY way the payload can
514
+ fail to parse is max_tokens truncation mid-stream. Salvage is
515
+ therefore simple and structural: walk back to the last complete
516
+ `}` (the close of the last fully-emitted event object), close the
517
+ events array + root object, and re-parse. Each step back drops at
518
+ most one (incomplete) event — per-event degradation, never
519
+ chunk-level loss. Returns None if nothing parseable remains."""
520
+ raw = (text or "").strip()
521
+ if not raw:
522
+ return None
523
+ # Defensive fence strip — can't occur under guided decoding, but
524
+ # the bake-off script replays this parser over unguided output too.
525
+ if raw.startswith("```"):
526
+ raw = raw.strip("`").strip()
527
+ if raw.lower().startswith("json"):
528
+ raw = raw[4:].lstrip()
529
+ try:
530
+ payload = json.loads(raw)
531
+ return payload if isinstance(payload, dict) else None
532
+ except json.JSONDecodeError:
533
+ pass
534
+ # Truncated: trim to the last complete `}` of the events array and
535
+ # close the structure. Walk back through `}` occurrences until a
536
+ # candidate parses (bounded — each iteration discards at least one
537
+ # char, and 200 closing braces covers far more events than a chunk
538
+ # can hold).
539
+ end = len(raw)
540
+ for _ in range(200):
541
+ idx = raw.rfind("}", 0, end)
542
+ if idx < 0:
543
+ return None
544
+ candidate = raw[: idx + 1] + "]}"
545
+ try:
546
+ payload = json.loads(candidate)
547
+ return payload if isinstance(payload, dict) else None
548
+ except json.JSONDecodeError:
549
+ end = idx
550
+ return None
551
+
552
+
553
+ def _resolve_event_index(ev: dict[str, Any], pos: int, expected_n: int) -> int | None:
554
+ """Map a parsed event object to its result slot. Trust the model's
555
+ "index" field when it's a valid in-range int (it mirrors the
556
+ `[event K]` input header); fall back to array position otherwise.
557
+ None = undeliverable (both out of range) — the object is dropped
558
+ without corrupting any other event's slot."""
559
+ idx = ev.get("index")
560
+ if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < expected_n:
561
+ return idx
562
+ if 0 <= pos < expected_n:
563
+ return pos
564
+ return None
565
+
566
+
567
def _guided_text(value: Any) -> str:
    """Coerce a possibly-missing field to a stripped string ("" when falsy)."""
    return str(value or "").strip()


def _guided_entity(item: Any) -> dict[str, Any] | None:
    """Normalise one entity object; None when it is junk or has no name."""
    if not isinstance(item, dict):
        return None
    name = _guided_text(item.get("name"))
    if not name:
        return None
    entity: dict[str, Any] = {
        "type": _guided_text(item.get("type")).lower(),
        "name": name,
    }
    # An "email" value is promoted into aliases only when it looks like
    # an address (contains "@", no spaces); anything else is dropped.
    email = item.get("email")
    if isinstance(email, str):
        email = email.strip()
        if email and "@" in email and " " not in email:
            entity["aliases"] = [email]
    return entity


def _guided_fact(item: Any) -> dict[str, Any] | None:
    """Normalise one fact object; None when it is junk or has no statement."""
    if not isinstance(item, dict):
        return None
    statement = _guided_text(item.get("statement"))
    if not statement:
        return None
    obj = item.get("object")
    obj = obj.strip() if isinstance(obj, str) else None
    if obj in ("", "-", "null", "None"):
        # Placeholder spellings of "no object" collapse to None.
        obj = None
    return {
        "category": _guided_text(item.get("category")).lower(),
        "subject": _guided_text(item.get("subject")),
        "predicate": _guided_text(item.get("predicate")),
        "object": obj,
        "statement": statement,
    }


def _guided_relationship(item: Any) -> dict[str, Any] | None:
    """Normalise one relationship; None unless from/to/type are all non-empty."""
    if not isinstance(item, dict):
        return None
    edge = {
        "from": _guided_text(item.get("from")),
        "to": _guided_text(item.get("to")),
        "type": _guided_text(item.get("type")),
    }
    return edge if all(edge.values()) else None


def _parse_guided_json(text: str, expected_n: int) -> list[dict[str, Any]]:
    """Turn guided-JSON model output into ``expected_n`` per-event dicts.

    Every entry has the keys "entities", "facts" and "relationships".
    Events that are missing from the output, unparseable, or not routable
    to a slot leave their entry with empty lists, so the returned list
    always has ``expected_n`` entries.

    Per event object, each section is first sliced to its
    MAX_*_PER_EVENT cap and then filtered, so junk items inside the
    slice count against the cap. Entities need a non-empty name, facts a
    non-empty statement, and relationships non-empty from/to/type. Type
    and category are lowercased, and a valid-looking "email" becomes a
    one-element "aliases" list.

    NOTE(review): two event objects that resolve to the same slot both
    append to it, so a slot can end up above the per-event caps — confirm
    that merging is the intended handling of duplicate indices.
    """
    results: list[dict[str, Any]] = [
        {"entities": [], "facts": [], "relationships": []} for _ in range(expected_n)
    ]
    payload = _load_guided_payload(text)
    events = None if payload is None else payload.get("events")
    if not isinstance(events, list):
        return results

    # (result key, per-event cap, per-item normaliser), in output order.
    sections = (
        ("entities", MAX_ENTITIES_PER_EVENT, _guided_entity),
        ("facts", MAX_FACTS_PER_EVENT, _guided_fact),
        ("relationships", MAX_RELATIONSHIPS_PER_EVENT, _guided_relationship),
    )
    for pos, ev in enumerate(events):
        if not isinstance(ev, dict):
            continue
        slot = _resolve_event_index(ev, pos, expected_n)
        if slot is None:
            continue
        bucket = results[slot]
        for key, cap, normalise in sections:
            raw_items = ev.get(key)
            if not isinstance(raw_items, list):
                continue
            for item in raw_items[:cap]:
                cleaned = normalise(item)
                if cleaned is not None:
                    bucket[key].append(cleaned)
    return results
641
+
642
+
643
def _guided_event_slices(text: str, expected_n: int) -> list[str]:
    """Return one raw-output string per expected event, for trace logging.

    The chunk output is decoded with _load_guided_payload and each event
    object is re-serialised with ``json.dumps(..., ensure_ascii=False)``
    into the slot chosen by _resolve_event_index. Slots with no matching
    event object stay as empty strings, so the list always has
    ``expected_n`` entries. When several objects resolve to the same
    slot, the last one is the one kept.
    """
    slices = ["" for _ in range(expected_n)]
    payload = _load_guided_payload(text)
    events = None if payload is None else payload.get("events")
    if not isinstance(events, list):
        return slices
    for pos, ev in enumerate(events):
        if isinstance(ev, dict):
            slot = _resolve_event_index(ev, pos, expected_n)
            if slot is not None:
                slices[slot] = json.dumps(ev, ensure_ascii=False)
    return slices
664
+
665
+
666
def _build_request_body(user_prompt: str, n: int) -> dict[str, Any]:
    """Assemble the chat-completions request body for one chunk of ``n`` events.

    Everything mode-dependent is read from module-level flags; nothing is
    sent and no state is changed here.

    Always present: the model name, a system message carrying
    ACTIVE_SYSTEM_PROMPT, a user message carrying ``user_prompt``,
    temperature 0.0, and a ``max_tokens`` equal to the per-event budget
    of the active output mode multiplied by ``n``.

    Optional: ``chat_template_kwargs`` when DISTILL_CHAT_TEMPLATE_KWARGS
    is set and non-empty.

    guided_json mode only: EXTRACTION_SCHEMA is attached through exactly
    one parameter, chosen by DISTILL_GUIDED_PARAM_STYLE — either a
    top-level ``guided_json`` key or an OpenAI-style ``response_format``
    of type ``json_schema``. Only one is sent because some vLLM versions
    reject requests carrying both. In kv mode neither key is added.
    """
    guided = DISTILL_OUTPUT_MODE == "guided_json"
    per_event_budget = (
        LLM_MAX_TOKENS_PER_EVENT_JSON if guided else LLM_MAX_TOKENS_PER_EVENT
    )
    request: dict[str, Any] = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": ACTIVE_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.0,
        # The budget is shared across the whole chunk, not reserved per event.
        "max_tokens": per_event_budget * n,
    }
    if DISTILL_CHAT_TEMPLATE_KWARGS:
        request["chat_template_kwargs"] = DISTILL_CHAT_TEMPLATE_KWARGS
    if not guided:
        return request

    if DISTILL_GUIDED_PARAM_STYLE == "guided_json":
        request["guided_json"] = EXTRACTION_SCHEMA
        return request

    request["response_format"] = {
        "type": "json_schema",
        "json_schema": {
            "name": "memory_extraction",
            "strict": True,
            "schema": EXTRACTION_SCHEMA,
        },
    }
    return request
713
+
714
+
356
715
  async def call_llm_batch(
357
716
  client: httpx.AsyncClient, events: list[dict[str, Any]]
358
717
  ) -> list[dict[str, Any]]:
359
718
  """Send N events in a single chat-completion call, return the list
360
- of per-event extraction dicts in input order. The model emits
361
- pipe-delimited KV records (see BATCH_SYSTEM_PROMPT); the parser is
362
- line-tolerant so a malformed record skips itself rather than
363
- failing the chunk. Raises only on transport failure or completely
719
+ of per-event extraction dicts in input order.
720
+
721
+ kv mode (default): the model emits pipe-delimited KV records (see
722
+ BATCH_SYSTEM_PROMPT); the parser is line-tolerant so a malformed
723
+ record skips itself rather than failing the chunk.
724
+
725
+ guided_json mode: the model emits the EXTRACTION_SCHEMA-constrained
726
+ JSON envelope under server-side guided decoding; the parser
727
+ salvages complete event objects from a truncated stream so failure
728
+ degrades per-event, never per-chunk. Both parsers return the same
729
+ per-event dict shape, so everything downstream of this function is
730
+ mode-agnostic. Raises only on transport failure or completely
364
731
  empty output."""
365
732
  n = len(events)
366
733
  if n == 0:
@@ -378,20 +745,7 @@ async def call_llm_batch(
378
745
  build_event_block(i, ev) for i, ev in enumerate(events)
379
746
  )
380
747
 
381
- body: dict[str, Any] = {
382
- "model": LLM_MODEL,
383
- "messages": [
384
- {"role": "system", "content": BATCH_SYSTEM_PROMPT},
385
- {"role": "user", "content": user_prompt},
386
- ],
387
- "temperature": 0.0,
388
- "max_tokens": LLM_MAX_TOKENS_PER_EVENT * n,
389
- # KV-text output — no guided_json / response_format. The
390
- # benefit of structured-output enforcement was already
391
- # half-ignored by VL upstream, and the parser now recovers
392
- # from per-line drift so the schema enforcement isn't worth
393
- # the JSON brittleness it brought.
394
- }
748
+ body = _build_request_body(user_prompt, n)
395
749
  r = await client.post(LLM_ENDPOINT, json=body, headers=headers)
396
750
  r.raise_for_status()
397
751
  data = r.json()
@@ -400,12 +754,16 @@ async def call_llm_batch(
400
754
  text = data.get("message", {}).get("content", "")
401
755
  if not text:
402
756
  raise RuntimeError(f"llm returned no content: {json.dumps(data)[:300]}")
403
- parsed = _parse_kv_records(text, n)
757
+ if DISTILL_OUTPUT_MODE == "guided_json":
758
+ parsed = _parse_guided_json(text, n)
759
+ slices = _guided_event_slices(text, n)
760
+ else:
761
+ parsed = _parse_kv_records(text, n)
762
+ slices = _split_event_blocks(text, n)
404
763
  # Attach the per-event raw slice so downstream trace logging gets
405
764
  # the model's verbatim output for THIS event without re-splitting
406
765
  # the chunk-level text. Parser semantics are unaffected — the
407
766
  # raw_slice key is ignored by upsert paths.
408
- slices = _split_event_blocks(text, n)
409
767
  for record, slice_text in zip(parsed, slices):
410
768
  record["raw_slice"] = slice_text
411
769
  return parsed
@@ -1141,7 +1499,9 @@ async def amain():
1141
1499
  f"endpoint={LLM_ENDPOINT or '(stub)'}, model={LLM_MODEL}, "
1142
1500
  f"poll={POLL_INTERVAL_SEC}s, claim={BATCH_SIZE}, "
1143
1501
  f"events_per_call={EVENTS_PER_LLM_CALL}, "
1144
- f"concurrent_calls={CONCURRENT_LLM_CALLS})"
1502
+ f"concurrent_calls={CONCURRENT_LLM_CALLS}, "
1503
+ f"output_mode={DISTILL_OUTPUT_MODE}, "
1504
+ f"prompt_hash={SYSTEM_PROMPT_HASH})"
1145
1505
  )
1146
1506
  stub_mode = not LLM_ENDPOINT
1147
1507
  if stub_mode:
@@ -56,11 +56,15 @@ _pool: AsyncConnectionPool | None = None
56
56
  @asynccontextmanager
57
57
  async def lifespan(app: FastAPI):
58
58
  global _pool
59
+ # Default (tuple) row factory — _upsert_entities and friends index
60
+ # fetchone() rows positionally, matching extractor-async's worker.
61
+ # A dict_row factory here turns row[0] into KeyError: 0 on the
62
+ # entity-merge path (2026-06-11 prod incident: every extract that
63
+ # re-saw a known entity 500'd; only never-seen-entity events stored).
59
64
  _pool = AsyncConnectionPool(
60
65
  conninfo=PG_DSN,
61
66
  min_size=8,
62
67
  max_size=50,
63
- kwargs={"row_factory": psycopg.rows.dict_row},
64
68
  open=False,
65
69
  )
66
70
  await _pool.open()
@@ -89,7 +93,7 @@ class ExtractRequest(BaseModel):
89
93
  clientId: str
90
94
  userId: str | None = None
91
95
  event_type: str = "STORE_MEMORY"
92
- source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent'
96
+ source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent' | 'code_reference'
93
97
  source_id: str | None = None
94
98
  content: str
95
99
  attributes: dict[str, Any] = {}
@@ -22,8 +22,14 @@ import pytest
22
22
 
23
23
 
24
24
  # Load extractor-sync's server.py as a module so we can call its
25
- # private helpers directly.
25
+ # private helpers directly. server.py flat-imports its siblings
26
+ # (entity_id) the way the container's WORKDIR layout resolves them, so
27
+ # this directory must be on sys.path — otherwise exec_module raises
28
+ # ImportError and the module-level skip below silently swallows the
29
+ # whole suite whenever pytest runs from the repo root.
26
30
  _THIS = Path(__file__).resolve().parent
31
+ if str(_THIS) not in sys.path:
32
+ sys.path.insert(0, str(_THIS))
27
33
  _SPEC = importlib.util.spec_from_file_location("extractor_sync_server",
28
34
  _THIS / "server.py")
29
35
  assert _SPEC and _SPEC.loader
@@ -206,3 +212,78 @@ def test_extract_event_organizer_object_form() -> None:
206
212
  assert len(entities) == 1
207
213
  assert entities[0]["canonical_name"] == "X Person"
208
214
  assert "x@example.com" in entities[0]["aliases"]
215
+
216
+
217
+ # ----------------------------------------------------------------------
218
+ # _upsert_entities — merge path indexes rows positionally
219
+ # ----------------------------------------------------------------------
220
+ #
221
+ # Regression for the 2026-06-11 prod incident: the pool was configured
222
+ # with row_factory=dict_row while _upsert_entities did `row[0]`, so the
223
+ # merge branch (entity already known) raised KeyError: 0 and every
224
+ # extract that re-saw a known entity 500'd. Only never-seen-entity
225
+ # events could store. Two guards:
226
+ # 1. the pool must keep psycopg's default tuple row factory
227
+ # (matching extractor-async's worker, which also indexes
228
+ # positionally), and
229
+ # 2. the merge branch must work against tuple rows end-to-end.
230
+
231
+ import asyncio
232
+ import inspect
233
+
234
+
235
+ class _FakeCursor:
236
+ """Quacks like psycopg.AsyncCursor, returning TUPLE rows — the
237
+ shape the pool's default row factory produces. If the pool ever
238
+ grows a custom row_factory again, update this fake to match it or
239
+ test_pool_keeps_default_tuple_row_factory will flag the drift."""
240
+
241
+ def __init__(self, existing_id: str | None) -> None:
242
+ self.executed: list[tuple[str, object]] = []
243
+ self._existing_id = existing_id
244
+
245
+ async def execute(self, sql: str, params: object = None) -> None:
246
+ self.executed.append((" ".join(sql.split()), params))
247
+
248
+ async def fetchone(self):
249
+ return (self._existing_id,) if self._existing_id else None
250
+
251
+
252
def _entity_stub() -> dict:
    """Build a fresh entity record (one person in arena1) for the upsert tests."""
    return dict(
        id="e_new",
        arena="arena1",
        entity_type="person",
        canonical_name="Alice One",
        aliases=["Alice One", "alice@example.com"],
        provenance_event_ids=["evt1"],
        participant_set=["arena1"],
        disclosure_class="private",
    )
263
+
264
+
265
def test_pool_keeps_default_tuple_row_factory() -> None:
    """Guard: lifespan() must not configure a row_factory on the pool.

    The check walks the AST of lifespan's source instead of searching the
    raw text, so only real code trips it: a ``row_factory=`` keyword
    argument, a ``"row_factory"`` string such as a kwargs-dict key, or a
    ``.row_factory`` attribute access. Comments explaining why the
    factory was removed can no longer cause a spurious failure.
    """
    import ast
    import textwrap

    tree = ast.parse(textwrap.dedent(inspect.getsource(sync_server.lifespan)))
    offenders = [
        node
        for node in ast.walk(tree)
        if (isinstance(node, ast.keyword) and node.arg == "row_factory")
        or (isinstance(node, ast.Constant) and node.value == "row_factory")
        or (isinstance(node, ast.Attribute) and node.attr == "row_factory")
        or (isinstance(node, ast.Name) and node.id == "row_factory")
    ]
    assert not offenders, (
        "extractor-sync's pool must use psycopg's default tuple rows: "
        "_upsert_entities indexes fetchone() results positionally."
    )
271
+
272
+
273
def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
    """Known entity: exactly one UPDATE, keyed on the fetched id, and no INSERT."""
    cursor = _FakeCursor(existing_id="e_existing")
    asyncio.run(sync_server._upsert_entities(cursor, [_entity_stub()]))

    statements = [sql for sql, _ in cursor.executed]
    update_params = [
        params for sql, params in cursor.executed if sql.startswith("UPDATE entities")
    ]
    assert len(update_params) == 1
    # The last bound parameter is the id read positionally from the tuple row.
    assert update_params[0][-1] == "e_existing"
    assert all(not sql.startswith("INSERT INTO entities") for sql in statements)
282
+
283
+
284
def test_upsert_entities_insert_branch_when_no_match() -> None:
    """Unknown entity: exactly one INSERT is issued and no UPDATE."""
    cursor = _FakeCursor(existing_id=None)
    asyncio.run(sync_server._upsert_entities(cursor, [_entity_stub()]))

    statements = [sql for sql, _ in cursor.executed]
    insert_count = sum(
        1 for sql in statements if sql.startswith("INSERT INTO entities")
    )
    assert insert_count == 1
    assert all(not sql.startswith("UPDATE entities") for sql in statements)
@@ -0,0 +1,12 @@
1
+ -- 004: accept 'code_reference' source events (SDK corpus ingest).
2
+ --
3
+ -- The SDK corpus module (packages/memory/src/corpus/) emits events with
4
+ -- source_kind='code_reference' (code-signature ingest, adapters.js).
5
+ -- The enum predates that feature, so those events bounced with
6
+ -- InvalidTextRepresentation and could never be stored — observed in
7
+ -- prod 2026-06-11 as persistent /extract 500s + producer retry loops.
8
+ --
9
+ -- ALTER TYPE ... ADD VALUE cannot run inside a transaction block;
10
+ -- apply with autocommit (psql's default per-statement behaviour).
11
+ -- Applied manually to prod (pme2-org-model) on 2026-06-11.
12
+ ALTER TYPE source_kind ADD VALUE IF NOT EXISTS 'code_reference';
@@ -0,0 +1,20 @@
1
+ -- 005: index every column that references events(id).
2
+ --
3
+ -- events has four referencing constraints:
4
+ -- distillation_queue.event_id ON DELETE CASCADE
5
+ -- vector_provenance.event_id ON DELETE CASCADE
6
+ -- distillation_traces.event_id ON DELETE CASCADE
7
+ -- events.forgets (self) ON DELETE SET NULL
8
+ --
9
+ -- Postgres does NOT auto-index FK referencing columns. Without these,
10
+ -- every DELETE on events seq-scans each referencing table per deleted
11
+ -- row to enforce the constraint — the 2026-06-11 arena-scoped nuke of
12
+ -- ~70k events ran for HOURS until the missing indexes were created
13
+ -- on-box. (distillation_queue.event_id already had idx_distillation_
14
+ -- event_id from 003; listed here for completeness via IF NOT EXISTS.)
15
+ --
16
+ -- All idempotent; applied manually to prod (pme2-org-model) 2026-06-12.
17
+ CREATE INDEX IF NOT EXISTS idx_distillation_event_id ON distillation_queue(event_id);
18
+ CREATE INDEX IF NOT EXISTS idx_traces_event_id ON distillation_traces(event_id);
19
+ CREATE INDEX IF NOT EXISTS idx_vector_provenance_event_id ON vector_provenance(event_id);
20
+ CREATE INDEX IF NOT EXISTS idx_events_forgets ON events(forgets);