@pentatonic-ai/ai-agent-sdk 0.10.5 → 0.10.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/requirements.txt +6 -0
- package/packages/memory-engine-v2/compat/server.py +258 -18
- package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
- package/packages/memory-engine-v2/docker-compose.yml +8 -1
- package/packages/memory-engine-v2/eval/recall_at_k.py +242 -0
- package/packages/memory-engine-v2/eval/retrieval_golden.seed.json +69 -0
- package/packages/memory-engine-v2/extractor-async/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-async/extraction_schema.py +246 -0
- package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +455 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +391 -31
- package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
- package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
- package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
- package/packages/memory-engine-v2/resolution-queue-design.md +165 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +11 -2
- package/packages/memory-engine-v2/scripts/backfill_sparse_vectors.py +369 -0
- package/packages/memory-engine-v2/scripts/bakeoff_guided_vs_kv.py +607 -0
- package/packages/memory-engine-v2/scripts/entity_resolution_v2.py +1041 -0
- package/packages/memory-engine-v2/tests/test_entity_resolution_v2.py +507 -0
- package/packages/memory-engine-v2/tests/test_hybrid_retrieval.py +810 -0
|
@@ -41,6 +41,14 @@ import psycopg.rows
|
|
|
41
41
|
|
|
42
42
|
from confidence import corroborated_confidence
|
|
43
43
|
from entity_id import entity_id, normalize_surface_form
|
|
44
|
+
from extraction_schema import (
|
|
45
|
+
ALLOWED_ENT_TYPES,
|
|
46
|
+
ALLOWED_FCT_CATEGORIES,
|
|
47
|
+
EXTRACTION_SCHEMA,
|
|
48
|
+
MAX_ENTITIES_PER_EVENT,
|
|
49
|
+
MAX_FACTS_PER_EVENT,
|
|
50
|
+
MAX_RELATIONSHIPS_PER_EVENT,
|
|
51
|
+
)
|
|
44
52
|
from noise_filter import is_noise_entity_name
|
|
45
53
|
from sensitive_filter import SKIP_SENSITIVE_CONTENT, is_sensitive_event
|
|
46
54
|
|
|
@@ -90,15 +98,93 @@ DISTILL_TRACE_ENABLED = os.environ.get(
|
|
|
90
98
|
# chunk via a JSONDecodeError. Pipe-delimited records, one per line,
|
|
91
99
|
# recover at line granularity — a malformed line skips itself, the rest
|
|
92
100
|
# of the chunk lands. See 2026-05-18 ops notes.
|
|
101
|
+
#
|
|
102
|
+
# 2026-06-11 update: guided JSON is back as an OPT-IN second mode
|
|
103
|
+
# (DISTILL_OUTPUT_MODE=guided_json, default "kv" — a no-op until an
|
|
104
|
+
# operator flips it). Both halves of the 2026-05-18 removal rationale
|
|
105
|
+
# are answered this time:
|
|
106
|
+
# (a) the self-hosted Qwen2.5-7B vLLM box enforces structured output
|
|
107
|
+
# via logit masking (xgrammar/outlines) — the model CANNOT emit
|
|
108
|
+
# schema-invalid bytes, unlike the old VL gateway which
|
|
109
|
+
# half-ignored response_format;
|
|
110
|
+
# (b) blast radius is solved structurally — the schema is an array
|
|
111
|
+
# of per-event objects (see extraction_schema.py), so one
|
|
112
|
+
# event's content can't corrupt another's parse; the only
|
|
113
|
+
# residual failure is max_tokens truncation, and
|
|
114
|
+
# _parse_guided_json salvages every complete event object.
|
|
115
|
+
# ALLOWED_ENT_TYPES / ALLOWED_FCT_CATEGORIES now live in
|
|
116
|
+
# extraction_schema.py (imported above) so the schema enums and the
|
|
117
|
+
# KV prompt pin to the same single source.
|
|
93
118
|
EVENT_HEADER_RE = re.compile(r"^===?\s*event\s+(\d+)\s*===?\s*$", re.IGNORECASE)
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
}
|
|
119
|
+
|
|
120
|
+
# Output mode flag. "kv" (default) keeps today's pipe-delimited path
|
|
121
|
+
# byte-for-byte; "guided_json" switches the prompt, request params and
|
|
122
|
+
# parser. Anything unrecognised falls back to "kv" — fail-safe.
|
|
123
|
+
# Read the flag, then normalise whitespace and case before validating it.
DISTILL_OUTPUT_MODE = os.environ.get("DISTILL_OUTPUT_MODE", "kv")
DISTILL_OUTPUT_MODE = DISTILL_OUTPUT_MODE.strip().lower()
if not (DISTILL_OUTPUT_MODE == "kv" or DISTILL_OUTPUT_MODE == "guided_json"):
    # Unknown value: warn with the offending (normalised) value and
    # fall back to the pipe-delimited path.
    log.warning(
        f"DISTILL_OUTPUT_MODE={DISTILL_OUTPUT_MODE!r} unrecognised — using 'kv'"
    )
    DISTILL_OUTPUT_MODE = "kv"
|
|
129
|
+
|
|
130
|
+
# How the structured-output schema is attached to the request in
|
|
131
|
+
# guided_json mode. The repo carries no pin for the engine box's vLLM
|
|
132
|
+
# version, so this is operator-selectable:
|
|
133
|
+
# - "response_format" (default): OpenAI-style
|
|
134
|
+
# response_format={"type":"json_schema","json_schema":{...}} —
|
|
135
|
+
# supported by vLLM >= 0.6.x OpenAI-compat server.
|
|
136
|
+
# - "guided_json": vLLM's legacy extension param (top-level
|
|
137
|
+
# `guided_json` in the request body; what openai-client users pass
|
|
138
|
+
# via extra_body). FALLBACK for older vLLM builds that predate
|
|
139
|
+
# json_schema response_format.
|
|
140
|
+
# Exactly one is sent — some vLLM versions reject requests that carry
|
|
141
|
+
# both guided-decoding params at once.
|
|
142
|
+
# Read the flag, then normalise whitespace and case before validating it.
DISTILL_GUIDED_PARAM_STYLE = os.environ.get(
    "DISTILL_GUIDED_PARAM_STYLE", "response_format"
)
DISTILL_GUIDED_PARAM_STYLE = DISTILL_GUIDED_PARAM_STYLE.strip().lower()
if not (
    DISTILL_GUIDED_PARAM_STYLE == "response_format"
    or DISTILL_GUIDED_PARAM_STYLE == "guided_json"
):
    # Unknown value: warn with the offending (normalised) value and
    # fall back to the OpenAI-style response_format parameter.
    log.warning(
        f"DISTILL_GUIDED_PARAM_STYLE={DISTILL_GUIDED_PARAM_STYLE!r} unrecognised "
        f"— using 'response_format'"
    )
    DISTILL_GUIDED_PARAM_STYLE = "response_format"
|
|
151
|
+
|
|
152
|
+
# Optional chat-template kwargs forwarded verbatim on every chat
|
|
153
|
+
# completion (vLLM extension: top-level `chat_template_kwargs`).
|
|
154
|
+
# Needed for thinking-capable teachers: Qwen3.x chat templates default
|
|
155
|
+
# enable_thinking=true, which burns the max_tokens budget on reasoning
|
|
156
|
+
# the distiller never reads. The 2026-06-11 teacher bake-off ran the
|
|
157
|
+
# Qwen3.6 lanes with {"enable_thinking": false}, so the prod swap must
|
|
158
|
+
# send the same switch for its traces to match the benchmarked
|
|
159
|
+
# distribution. Unset (default) sends nothing — the request body stays
|
|
160
|
+
# byte-identical for teachers without template switches (Qwen2.5).
|
|
161
|
+
DISTILL_CHAT_TEMPLATE_KWARGS: dict[str, Any] | None = None
|
|
162
|
+
_raw_ctk = os.environ.get("DISTILL_CHAT_TEMPLATE_KWARGS", "").strip()
|
|
163
|
+
if _raw_ctk:
|
|
164
|
+
try:
|
|
165
|
+
_parsed_ctk = json.loads(_raw_ctk)
|
|
166
|
+
if not isinstance(_parsed_ctk, dict):
|
|
167
|
+
raise ValueError("must be a JSON object")
|
|
168
|
+
DISTILL_CHAT_TEMPLATE_KWARGS = _parsed_ctk
|
|
169
|
+
except ValueError as e:
|
|
170
|
+
log.warning(f"DISTILL_CHAT_TEMPLATE_KWARGS invalid ({e}) — ignoring")
|
|
171
|
+
|
|
172
|
+
# JSON output carries structural overhead (braces, quotes, key names)
|
|
173
|
+
# the KV format doesn't, so guided mode gets its own per-event token
|
|
174
|
+
# budget. Truncation is guided mode's ONLY parse-failure mode (the
|
|
175
|
+
# schema enforcer guarantees validity up to the cut), so this errs
|
|
176
|
+
# higher than the KV 300.
|
|
177
|
+
#
|
|
178
|
+
# NOTE the budget is SHARED across the chunk (max_tokens = this × N
|
|
179
|
+
# events per request). A fully-maxed event (8 ent / 6 fct with 140-char
|
|
180
|
+
# statements / 6 rel + JSON overhead) is ~1.1k output tokens, so chunk
|
|
181
|
+
# size and this value must be chosen together against the server's
|
|
182
|
+
# max_model_len. Raised 400→900 after prod showed 15% of 5-event chunks
|
|
183
|
+
# truncating on `length` (2026-06-12); prod now runs EVENTS_PER_LLM_CALL=3
|
|
184
|
+
# so 3×900 output + ~2.1k prompt stays well inside the L40S 8192 ctx.
|
|
185
|
+
# Per-event output-token budget for guided_json mode (default 900);
# int() raises ValueError at import time on a non-integer override.
_json_budget_default = "900"
LLM_MAX_TOKENS_PER_EVENT_JSON = int(
    os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", _json_budget_default)
)
del _json_budget_default  # keep the module namespace unchanged
|
|
102
188
|
|
|
103
189
|
|
|
104
190
|
# --------------------------------------------------------------------
|
|
@@ -165,10 +251,71 @@ A whole file is one entity, not twenty.
|
|
|
165
251
|
- Output ONLY the formatted records. No header, no footer, no prose."""
|
|
166
252
|
|
|
167
253
|
|
|
254
|
+
# Guided-JSON variant of BATCH_SYSTEM_PROMPT. Same CONTENT rules
|
|
255
|
+
# (conservatism, per-event caps, code-content rule, subject-must-be-a-
|
|
256
|
+
# declared-entity, email-alias pairing, statement <= 140 chars, never
|
|
257
|
+
# skip an event) — only the output-format scaffolding changes. The
|
|
258
|
+
# pipe-format anchoring ("COUNT THE PIPES", pipe/newline substitution)
|
|
259
|
+
# is dropped: vLLM's guided decoding enforces the schema mechanically,
|
|
260
|
+
# so the prompt no longer needs to beg for format compliance, and JSON
|
|
261
|
+
# string escaping makes the pipe/newline substitution rules moot.
|
|
262
|
+
GUIDED_JSON_SYSTEM_PROMPT = """You extract structured knowledge from N \
|
|
263
|
+
events for a personal-memory graph.
|
|
264
|
+
|
|
265
|
+
You will receive N events, each prefixed with `[event K]`. Respond \
|
|
266
|
+
with a single JSON object: {"events": [...]} containing one object \
|
|
267
|
+
per input event. Be conservative — only emit things explicitly stated.
|
|
268
|
+
|
|
269
|
+
Each per-event object has:
|
|
270
|
+
"index": the zero-indexed event number, matching the input `[event K]`.
|
|
271
|
+
"entities": array of {"name", "type", "email"?}.
|
|
272
|
+
"facts": array of {"category", "subject", "predicate", "object", "statement"}.
|
|
273
|
+
"relationships": array of {"from", "to", "type"}.
|
|
274
|
+
|
|
275
|
+
RULES:
|
|
276
|
+
- NEVER skip an event — if an event has nothing to extract, emit its \
|
|
277
|
+
object with "index" set and empty arrays.
|
|
278
|
+
- entities: type ∈ {person, org, product, place, project, concept, \
|
|
279
|
+
topic, date, other}.
|
|
280
|
+
email (OPTIONAL, person only): when the event body or attributes
|
|
281
|
+
show an email address that unambiguously identifies the person,
|
|
282
|
+
include it. This pairs the name+email forms so a later event seeing
|
|
283
|
+
only the email resolves to the same entity. Omit the key otherwise.
|
|
284
|
+
- facts: category ∈ {decision, commitment, state, mention, \
|
|
285
|
+
observation, preference}.
|
|
286
|
+
subject MUST be an entity name declared in THIS event's "entities".
|
|
287
|
+
predicate is a short verb phrase (e.g. "agreed to", "owns", "works at").
|
|
288
|
+
object MAY be an entity name OR a literal string OR null if absent.
|
|
289
|
+
statement ≤ 140 characters, a self-contained sentence.
|
|
290
|
+
WORKED EXAMPLE: {"category": "commitment", "subject": "Timothy \
|
|
291
|
+
Bradley", "predicate": "agreed to", "object": "SAFE amendments", \
|
|
292
|
+
"statement": "Timothy confirmed the SAFE amendments are set (14 May 2026)"}
|
|
293
|
+
- relationships: "from" and "to" MUST be entity names declared in THIS \
|
|
294
|
+
event's "entities". "type" is a short verb / preposition phrase.
|
|
295
|
+
- HARD CAPS per event: 8 entities, 6 facts, 6 relationships. Pick the \
|
|
296
|
+
most salient.
|
|
297
|
+
- For code / technical content: extract only top-level services, \
|
|
298
|
+
modules, or domain concepts. NOT variables, types, or method names. \
|
|
299
|
+
A whole file is one entity, not twenty.
|
|
300
|
+
- Output ONLY the JSON object. No markdown fences, no prose."""
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# The system prompt actually sent to the LLM under the current output
|
|
304
|
+
# mode. Everything downstream (request body, trace fingerprint) hangs
|
|
305
|
+
# off this so the two can never disagree.
|
|
306
|
+
# Select the system prompt that matches the configured output mode;
# every mode other than "guided_json" uses the KV batch prompt.
if DISTILL_OUTPUT_MODE == "guided_json":
    ACTIVE_SYSTEM_PROMPT = GUIDED_JSON_SYSTEM_PROMPT
else:
    ACTIVE_SYSTEM_PROMPT = BATCH_SYSTEM_PROMPT
|
|
311
|
+
|
|
168
312
|
# Teacher-prompt fingerprint for trace logging. If the prompt changes,
|
|
169
313
|
# the hash changes — lets training-data exports filter by teacher
|
|
170
|
-
# version so we never mix outputs from a retired prompt.
|
|
171
|
-
|
|
314
|
+
# version so we never mix outputs from a retired prompt. Computed from
|
|
315
|
+
# the ACTIVE prompt, so flipping DISTILL_OUTPUT_MODE auto-segments
|
|
316
|
+
# distillation_traces into a new teacher version (KV-format traces and
|
|
317
|
+
# guided-JSON traces never mix in a training export).
|
|
318
|
+
# First 16 hex characters of the SHA-256 of the active prompt's UTF-8 bytes.
SYSTEM_PROMPT_HASH = hashlib.sha256(
    ACTIVE_SYSTEM_PROMPT.encode("utf-8")
).hexdigest()[:16]
|
|
172
319
|
|
|
173
320
|
|
|
174
321
|
# --------------------------------------------------------------------
|
|
@@ -353,14 +500,234 @@ def _split_event_blocks(text: str, expected_n: int) -> list[str]:
|
|
|
353
500
|
return slices
|
|
354
501
|
|
|
355
502
|
|
|
503
|
+
# --------------------------------------------------------------------
|
|
504
|
+
# Guided-JSON parsing (DISTILL_OUTPUT_MODE=guided_json)
|
|
505
|
+
# --------------------------------------------------------------------
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def _load_guided_payload(text: str) -> dict[str, Any] | None:
|
|
509
|
+
"""Parse the guided-JSON chunk output into the {"events": [...]}
|
|
510
|
+
payload, salvaging what's complete if the output was truncated.
|
|
511
|
+
|
|
512
|
+
Under guided decoding the server's logit masking guarantees every
|
|
513
|
+
emitted byte is schema-consistent, so the ONLY way the payload can
|
|
514
|
+
fail to parse is max_tokens truncation mid-stream. Salvage is
|
|
515
|
+
therefore simple and structural: walk back to the last complete
|
|
516
|
+
`}` (the close of the last fully-emitted event object), close the
|
|
517
|
+
events array + root object, and re-parse. Each step back drops at
|
|
518
|
+
most one (incomplete) event — per-event degradation, never
|
|
519
|
+
chunk-level loss. Returns None if nothing parseable remains."""
|
|
520
|
+
raw = (text or "").strip()
|
|
521
|
+
if not raw:
|
|
522
|
+
return None
|
|
523
|
+
# Defensive fence strip — can't occur under guided decoding, but
|
|
524
|
+
# the bake-off script replays this parser over unguided output too.
|
|
525
|
+
if raw.startswith("```"):
|
|
526
|
+
raw = raw.strip("`").strip()
|
|
527
|
+
if raw.lower().startswith("json"):
|
|
528
|
+
raw = raw[4:].lstrip()
|
|
529
|
+
try:
|
|
530
|
+
payload = json.loads(raw)
|
|
531
|
+
return payload if isinstance(payload, dict) else None
|
|
532
|
+
except json.JSONDecodeError:
|
|
533
|
+
pass
|
|
534
|
+
# Truncated: trim to the last complete `}` of the events array and
|
|
535
|
+
# close the structure. Walk back through `}` occurrences until a
|
|
536
|
+
# candidate parses (bounded — each iteration discards at least one
|
|
537
|
+
# char, and 200 closing braces covers far more events than a chunk
|
|
538
|
+
# can hold).
|
|
539
|
+
end = len(raw)
|
|
540
|
+
for _ in range(200):
|
|
541
|
+
idx = raw.rfind("}", 0, end)
|
|
542
|
+
if idx < 0:
|
|
543
|
+
return None
|
|
544
|
+
candidate = raw[: idx + 1] + "]}"
|
|
545
|
+
try:
|
|
546
|
+
payload = json.loads(candidate)
|
|
547
|
+
return payload if isinstance(payload, dict) else None
|
|
548
|
+
except json.JSONDecodeError:
|
|
549
|
+
end = idx
|
|
550
|
+
return None
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _resolve_event_index(ev: dict[str, Any], pos: int, expected_n: int) -> int | None:
|
|
554
|
+
"""Map a parsed event object to its result slot. Trust the model's
|
|
555
|
+
"index" field when it's a valid in-range int (it mirrors the
|
|
556
|
+
`[event K]` input header); fall back to array position otherwise.
|
|
557
|
+
None = undeliverable (both out of range) — the object is dropped
|
|
558
|
+
without corrupting any other event's slot."""
|
|
559
|
+
idx = ev.get("index")
|
|
560
|
+
if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < expected_n:
|
|
561
|
+
return idx
|
|
562
|
+
if 0 <= pos < expected_n:
|
|
563
|
+
return pos
|
|
564
|
+
return None
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _parse_guided_json(text: str, expected_n: int) -> list[dict[str, Any]]:
    """Turn guided-JSON model output into one extraction dict per event.

    Always returns a list of expected_n dicts shaped
    {"entities": [...], "facts": [...], "relationships": [...]}; slots
    the output never fills stay empty. The payload comes from
    _load_guided_payload and each event object is routed to its slot by
    _resolve_event_index.

    Per-item handling:
      - each array is cut to its MAX_*_PER_EVENT cap before filtering;
      - non-dict items are skipped;
      - entities need a non-empty name; "type" is lowercased; a string
        "email" containing "@" and no space becomes "aliases": [email];
      - facts need a non-empty statement; "category" is lowercased; a
        non-string object, or one of "", "-", "null", "None", becomes
        None;
      - relationships need non-empty "from", "to" and "type"."""

    def _clean(value: Any) -> str:
        # Falsy values collapse to ""; anything else is stringified and
        # stripped of surrounding whitespace.
        return str(value or "").strip()

    def _capped(value: Any, cap: int) -> list[Any]:
        # Non-list values are treated as an empty array.
        return value[:cap] if isinstance(value, list) else []

    out: list[dict[str, Any]] = [
        {"entities": [], "facts": [], "relationships": []} for _ in range(expected_n)
    ]
    payload = _load_guided_payload(text)
    events = None if payload is None else payload.get("events")
    if not isinstance(events, list):
        return out

    for position, event in enumerate(events):
        if not isinstance(event, dict):
            continue
        slot = _resolve_event_index(event, position, expected_n)
        if slot is None:
            continue
        bucket = out[slot]

        for item in _capped(event.get("entities"), MAX_ENTITIES_PER_EVENT):
            if not isinstance(item, dict):
                continue
            name = _clean(item.get("name"))
            if not name:
                continue
            entity: dict[str, Any] = {
                "type": _clean(item.get("type")).lower(),
                "name": name,
            }
            # Only an email-looking string is promoted to an alias;
            # anything else is ignored without comment.
            address = item.get("email")
            if isinstance(address, str):
                address = address.strip()
                if address and "@" in address and " " not in address:
                    entity["aliases"] = [address]
            bucket["entities"].append(entity)

        for item in _capped(event.get("facts"), MAX_FACTS_PER_EVENT):
            if not isinstance(item, dict):
                continue
            statement = _clean(item.get("statement"))
            if not statement:
                continue
            raw_object = item.get("object")
            object_text = raw_object.strip() if isinstance(raw_object, str) else None
            if object_text in ("", "-", "null", "None"):
                # Placeholder spellings of "no object" normalise to None.
                object_text = None
            bucket["facts"].append(
                {
                    "category": _clean(item.get("category")).lower(),
                    "subject": _clean(item.get("subject")),
                    "predicate": _clean(item.get("predicate")),
                    "object": object_text,
                    "statement": statement,
                }
            )

        for item in _capped(event.get("relationships"), MAX_RELATIONSHIPS_PER_EVENT):
            if not isinstance(item, dict):
                continue
            edge = {key: _clean(item.get(key)) for key in ("from", "to", "type")}
            if all(edge.values()):
                bucket["relationships"].append(edge)

    return out
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def _guided_event_slices(text: str, expected_n: int) -> list[str]:
    """Build the per-event raw text slices for guided-JSON output.

    Returns expected_n strings. A slot holds the JSON re-serialisation
    (ensure_ascii=False, so non-ASCII characters are kept as-is) of the
    event object that _resolve_event_index routes to it, or "" when no
    event object landed there. If several objects resolve to the same
    slot, the last one is kept."""
    per_event = ["" for _ in range(expected_n)]
    payload = _load_guided_payload(text)
    events = None if payload is None else payload.get("events")
    if isinstance(events, list):
        for position, event in enumerate(events):
            if not isinstance(event, dict):
                continue
            slot = _resolve_event_index(event, position, expected_n)
            if slot is None:
                continue
            per_event[slot] = json.dumps(event, ensure_ascii=False)
    return per_event
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def _build_request_body(user_prompt: str, n: int) -> dict[str, Any]:
    """Assemble the chat-completions request body for one chunk of n events.

    Reads only module-level settings and has no side effects.

    Always present, in this key order: "model", "messages" (system =
    ACTIVE_SYSTEM_PROMPT, user = user_prompt), "temperature" (0.0) and
    "max_tokens" (the per-event budget for the active mode times n).

    Optional additions:
      - "chat_template_kwargs" when DISTILL_CHAT_TEMPLATE_KWARGS is
        truthy (the same dict object is attached to every body);
      - in guided_json mode, exactly one structured-output parameter
        carrying EXTRACTION_SCHEMA: top-level "guided_json" when
        DISTILL_GUIDED_PARAM_STYLE == "guided_json", otherwise a
        "response_format" of type "json_schema".
    In kv mode no structured-output parameter is attached."""
    guided = DISTILL_OUTPUT_MODE == "guided_json"
    per_event_budget = (
        LLM_MAX_TOKENS_PER_EVENT_JSON if guided else LLM_MAX_TOKENS_PER_EVENT
    )
    system_message = {"role": "system", "content": ACTIVE_SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_prompt}

    # Key insertion order matters for the serialised body, so the base
    # keys are laid down first and the optional ones appended after.
    body: dict[str, Any] = {
        "model": LLM_MODEL,
        "messages": [system_message, user_message],
        "temperature": 0.0,
        "max_tokens": per_event_budget * n,
    }
    if DISTILL_CHAT_TEMPLATE_KWARGS:
        body["chat_template_kwargs"] = DISTILL_CHAT_TEMPLATE_KWARGS

    if not guided:
        return body

    if DISTILL_GUIDED_PARAM_STYLE == "guided_json":
        body["guided_json"] = EXTRACTION_SCHEMA
        return body

    json_schema_spec = {
        "name": "memory_extraction",
        "strict": True,
        "schema": EXTRACTION_SCHEMA,
    }
    body["response_format"] = {
        "type": "json_schema",
        "json_schema": json_schema_spec,
    }
    return body
|
|
713
|
+
|
|
714
|
+
|
|
356
715
|
async def call_llm_batch(
|
|
357
716
|
client: httpx.AsyncClient, events: list[dict[str, Any]]
|
|
358
717
|
) -> list[dict[str, Any]]:
|
|
359
718
|
"""Send N events in a single chat-completion call, return the list
|
|
360
|
-
of per-event extraction dicts in input order.
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
719
|
+
of per-event extraction dicts in input order.
|
|
720
|
+
|
|
721
|
+
kv mode (default): the model emits pipe-delimited KV records (see
|
|
722
|
+
BATCH_SYSTEM_PROMPT); the parser is line-tolerant so a malformed
|
|
723
|
+
record skips itself rather than failing the chunk.
|
|
724
|
+
|
|
725
|
+
guided_json mode: the model emits the EXTRACTION_SCHEMA-constrained
|
|
726
|
+
JSON envelope under server-side guided decoding; the parser
|
|
727
|
+
salvages complete event objects from a truncated stream so failure
|
|
728
|
+
degrades per-event, never per-chunk. Both parsers return the same
|
|
729
|
+
per-event dict shape, so everything downstream of this function is
|
|
730
|
+
mode-agnostic. Raises only on transport failure or completely
|
|
364
731
|
empty output."""
|
|
365
732
|
n = len(events)
|
|
366
733
|
if n == 0:
|
|
@@ -378,20 +745,7 @@ async def call_llm_batch(
|
|
|
378
745
|
build_event_block(i, ev) for i, ev in enumerate(events)
|
|
379
746
|
)
|
|
380
747
|
|
|
381
|
-
body
|
|
382
|
-
"model": LLM_MODEL,
|
|
383
|
-
"messages": [
|
|
384
|
-
{"role": "system", "content": BATCH_SYSTEM_PROMPT},
|
|
385
|
-
{"role": "user", "content": user_prompt},
|
|
386
|
-
],
|
|
387
|
-
"temperature": 0.0,
|
|
388
|
-
"max_tokens": LLM_MAX_TOKENS_PER_EVENT * n,
|
|
389
|
-
# KV-text output — no guided_json / response_format. The
|
|
390
|
-
# benefit of structured-output enforcement was already
|
|
391
|
-
# half-ignored by VL upstream, and the parser now recovers
|
|
392
|
-
# from per-line drift so the schema enforcement isn't worth
|
|
393
|
-
# the JSON brittleness it brought.
|
|
394
|
-
}
|
|
748
|
+
body = _build_request_body(user_prompt, n)
|
|
395
749
|
r = await client.post(LLM_ENDPOINT, json=body, headers=headers)
|
|
396
750
|
r.raise_for_status()
|
|
397
751
|
data = r.json()
|
|
@@ -400,12 +754,16 @@ async def call_llm_batch(
|
|
|
400
754
|
text = data.get("message", {}).get("content", "")
|
|
401
755
|
if not text:
|
|
402
756
|
raise RuntimeError(f"llm returned no content: {json.dumps(data)[:300]}")
|
|
403
|
-
|
|
757
|
+
if DISTILL_OUTPUT_MODE == "guided_json":
|
|
758
|
+
parsed = _parse_guided_json(text, n)
|
|
759
|
+
slices = _guided_event_slices(text, n)
|
|
760
|
+
else:
|
|
761
|
+
parsed = _parse_kv_records(text, n)
|
|
762
|
+
slices = _split_event_blocks(text, n)
|
|
404
763
|
# Attach the per-event raw slice so downstream trace logging gets
|
|
405
764
|
# the model's verbatim output for THIS event without re-splitting
|
|
406
765
|
# the chunk-level text. Parser semantics are unaffected — the
|
|
407
766
|
# raw_slice key is ignored by upsert paths.
|
|
408
|
-
slices = _split_event_blocks(text, n)
|
|
409
767
|
for record, slice_text in zip(parsed, slices):
|
|
410
768
|
record["raw_slice"] = slice_text
|
|
411
769
|
return parsed
|
|
@@ -1141,7 +1499,9 @@ async def amain():
|
|
|
1141
1499
|
f"endpoint={LLM_ENDPOINT or '(stub)'}, model={LLM_MODEL}, "
|
|
1142
1500
|
f"poll={POLL_INTERVAL_SEC}s, claim={BATCH_SIZE}, "
|
|
1143
1501
|
f"events_per_call={EVENTS_PER_LLM_CALL}, "
|
|
1144
|
-
f"concurrent_calls={CONCURRENT_LLM_CALLS}
|
|
1502
|
+
f"concurrent_calls={CONCURRENT_LLM_CALLS}, "
|
|
1503
|
+
f"output_mode={DISTILL_OUTPUT_MODE}, "
|
|
1504
|
+
f"prompt_hash={SYSTEM_PROMPT_HASH})"
|
|
1145
1505
|
)
|
|
1146
1506
|
stub_mode = not LLM_ENDPOINT
|
|
1147
1507
|
if stub_mode:
|
|
@@ -56,11 +56,15 @@ _pool: AsyncConnectionPool | None = None
|
|
|
56
56
|
@asynccontextmanager
|
|
57
57
|
async def lifespan(app: FastAPI):
|
|
58
58
|
global _pool
|
|
59
|
+
# Default (tuple) row factory — _upsert_entities and friends index
|
|
60
|
+
# fetchone() rows positionally, matching extractor-async's worker.
|
|
61
|
+
# A dict_row factory here turns row[0] into KeyError: 0 on the
|
|
62
|
+
# entity-merge path (2026-06-11 prod incident: every extract that
|
|
63
|
+
# re-saw a known entity 500'd; only never-seen-entity events stored).
|
|
59
64
|
_pool = AsyncConnectionPool(
|
|
60
65
|
conninfo=PG_DSN,
|
|
61
66
|
min_size=8,
|
|
62
67
|
max_size=50,
|
|
63
|
-
kwargs={"row_factory": psycopg.rows.dict_row},
|
|
64
68
|
open=False,
|
|
65
69
|
)
|
|
66
70
|
await _pool.open()
|
|
@@ -89,7 +93,7 @@ class ExtractRequest(BaseModel):
|
|
|
89
93
|
clientId: str
|
|
90
94
|
userId: str | None = None
|
|
91
95
|
event_type: str = "STORE_MEMORY"
|
|
92
|
-
source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent'
|
|
96
|
+
source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent' | 'code_reference'
|
|
93
97
|
source_id: str | None = None
|
|
94
98
|
content: str
|
|
95
99
|
attributes: dict[str, Any] = {}
|
|
@@ -22,8 +22,14 @@ import pytest
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
# Load extractor-sync's server.py as a module so we can call its
|
|
25
|
-
# private helpers directly.
|
|
25
|
+
# private helpers directly. server.py flat-imports its siblings
|
|
26
|
+
# (entity_id) the way the container's WORKDIR layout resolves them, so
|
|
27
|
+
# this directory must be on sys.path — otherwise exec_module raises
|
|
28
|
+
# ImportError and the module-level skip below silently swallows the
|
|
29
|
+
# whole suite whenever pytest runs from the repo root.
|
|
26
30
|
_THIS = Path(__file__).resolve().parent
|
|
31
|
+
if str(_THIS) not in sys.path:
|
|
32
|
+
sys.path.insert(0, str(_THIS))
|
|
27
33
|
_SPEC = importlib.util.spec_from_file_location("extractor_sync_server",
|
|
28
34
|
_THIS / "server.py")
|
|
29
35
|
assert _SPEC and _SPEC.loader
|
|
@@ -206,3 +212,78 @@ def test_extract_event_organizer_object_form() -> None:
|
|
|
206
212
|
assert len(entities) == 1
|
|
207
213
|
assert entities[0]["canonical_name"] == "X Person"
|
|
208
214
|
assert "x@example.com" in entities[0]["aliases"]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ----------------------------------------------------------------------
|
|
218
|
+
# _upsert_entities — merge path indexes rows positionally
|
|
219
|
+
# ----------------------------------------------------------------------
|
|
220
|
+
#
|
|
221
|
+
# Regression for the 2026-06-11 prod incident: the pool was configured
|
|
222
|
+
# with row_factory=dict_row while _upsert_entities did `row[0]`, so the
|
|
223
|
+
# merge branch (entity already known) raised KeyError: 0 and every
|
|
224
|
+
# extract that re-saw a known entity 500'd. Only never-seen-entity
|
|
225
|
+
# events could store. Two guards:
|
|
226
|
+
# 1. the pool must keep psycopg's default tuple row factory
|
|
227
|
+
# (matching extractor-async's worker, which also indexes
|
|
228
|
+
# positionally), and
|
|
229
|
+
# 2. the merge branch must work against tuple rows end-to-end.
|
|
230
|
+
|
|
231
|
+
import asyncio
|
|
232
|
+
import inspect
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class _FakeCursor:
|
|
236
|
+
"""Quacks like psycopg.AsyncCursor, returning TUPLE rows — the
|
|
237
|
+
shape the pool's default row factory produces. If the pool ever
|
|
238
|
+
grows a custom row_factory again, update this fake to match it or
|
|
239
|
+
test_pool_keeps_default_tuple_row_factory will flag the drift."""
|
|
240
|
+
|
|
241
|
+
def __init__(self, existing_id: str | None) -> None:
|
|
242
|
+
self.executed: list[tuple[str, object]] = []
|
|
243
|
+
self._existing_id = existing_id
|
|
244
|
+
|
|
245
|
+
async def execute(self, sql: str, params: object = None) -> None:
|
|
246
|
+
self.executed.append((" ".join(sql.split()), params))
|
|
247
|
+
|
|
248
|
+
async def fetchone(self):
|
|
249
|
+
return (self._existing_id,) if self._existing_id else None
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _entity_stub() -> dict:
|
|
253
|
+
return {
|
|
254
|
+
"id": "e_new",
|
|
255
|
+
"arena": "arena1",
|
|
256
|
+
"entity_type": "person",
|
|
257
|
+
"canonical_name": "Alice One",
|
|
258
|
+
"aliases": ["Alice One", "alice@example.com"],
|
|
259
|
+
"provenance_event_ids": ["evt1"],
|
|
260
|
+
"participant_set": ["arena1"],
|
|
261
|
+
"disclosure_class": "private",
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def test_pool_keeps_default_tuple_row_factory() -> None:
    """The source text of lifespan must not mention row_factory at all.

    This is a plain substring check on inspect.getsource output, so
    comments inside lifespan count too."""
    lifespan_source = inspect.getsource(sync_server.lifespan)
    failure_message = (
        "extractor-sync's pool must use psycopg's default tuple rows: "
        "_upsert_entities indexes fetchone() results positionally."
    )
    assert "row_factory" not in lifespan_source, failure_message
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
    """Known entity: exactly one UPDATE is issued, keyed by the id read
    from the tuple row, and no INSERT."""
    cursor = _FakeCursor(existing_id="e_existing")
    asyncio.run(sync_server._upsert_entities(cursor, [_entity_stub()]))

    update_params = [
        params
        for sql, params in cursor.executed
        if sql.startswith("UPDATE entities")
    ]
    assert len(update_params) == 1
    # The final bound parameter feeds `WHERE id = %s`, i.e. row[0].
    assert update_params[0][-1] == "e_existing"

    statements = [sql for sql, _ in cursor.executed]
    assert all(not sql.startswith("INSERT INTO entities") for sql in statements)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def test_upsert_entities_insert_branch_when_no_match() -> None:
    """Unknown entity (fetchone() gives None): exactly one INSERT is
    issued and no UPDATE."""
    cursor = _FakeCursor(existing_id=None)
    asyncio.run(sync_server._upsert_entities(cursor, [_entity_stub()]))

    statements = [sql for sql, _ in cursor.executed]
    insert_count = sum(
        1 for sql in statements if sql.startswith("INSERT INTO entities")
    )
    assert insert_count == 1
    assert all(not sql.startswith("UPDATE entities") for sql in statements)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
-- 004: accept 'code_reference' source events (SDK corpus ingest).
|
|
2
|
+
--
|
|
3
|
+
-- The SDK corpus module (packages/memory/src/corpus/) emits events with
|
|
4
|
+
-- source_kind='code_reference' (code-signature ingest, adapters.js).
|
|
5
|
+
-- The enum predates that feature, so those events bounced with
|
|
6
|
+
-- InvalidTextRepresentation and could never be stored — observed in
|
|
7
|
+
-- prod 2026-06-11 as persistent /extract 500s + producer retry loops.
|
|
8
|
+
--
|
|
9
|
+
-- On PostgreSQL < 12, ALTER TYPE ... ADD VALUE cannot run inside a transaction block (12+ permits it, but the new value is unusable until that transaction commits);
|
|
10
|
+
-- apply with autocommit (psql's default per-statement behaviour).
|
|
11
|
+
-- Applied manually to prod (pme2-org-model) on 2026-06-11.
|
|
12
|
+
-- IF NOT EXISTS makes re-running this migration a no-op.
ALTER TYPE source_kind
    ADD VALUE IF NOT EXISTS 'code_reference';
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
-- 005: index every column that references events(id).
|
|
2
|
+
--
|
|
3
|
+
-- events has four referencing constraints:
|
|
4
|
+
-- distillation_queue.event_id ON DELETE CASCADE
|
|
5
|
+
-- vector_provenance.event_id ON DELETE CASCADE
|
|
6
|
+
-- distillation_traces.event_id ON DELETE CASCADE
|
|
7
|
+
-- events.forgets (self) ON DELETE SET NULL
|
|
8
|
+
--
|
|
9
|
+
-- Postgres does NOT auto-index FK referencing columns. Without these,
|
|
10
|
+
-- every DELETE on events seq-scans each referencing table per deleted
|
|
11
|
+
-- row to enforce the constraint — the 2026-06-11 arena-scoped nuke of
|
|
12
|
+
-- ~70k events ran for HOURS until the missing indexes were created
|
|
13
|
+
-- on-box. (distillation_queue.event_id already had idx_distillation_
|
|
14
|
+
-- event_id from 003; listed here for completeness via IF NOT EXISTS.)
|
|
15
|
+
--
|
|
16
|
+
-- All idempotent; applied manually to prod (pme2-org-model) 2026-06-12.
|
|
17
|
+
-- One index per referencing column; IF NOT EXISTS keeps each
-- statement safe to re-run.
CREATE INDEX IF NOT EXISTS idx_distillation_event_id
    ON distillation_queue (event_id);

CREATE INDEX IF NOT EXISTS idx_traces_event_id
    ON distillation_traces (event_id);

CREATE INDEX IF NOT EXISTS idx_vector_provenance_event_id
    ON vector_provenance (event_id);

-- Self-reference: events.forgets points back at events.
CREATE INDEX IF NOT EXISTS idx_events_forgets
    ON events (forgets);
|