npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.10.4 → 0.10.6 - Mend

@pentatonic-ai/ai-agent-sdk 0.10.4 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/packages/memory-engine-v2/extractor-async/worker.py CHANGED Viewed

@@ -41,6 +41,14 @@ import psycopg.rows
 from confidence import corroborated_confidence
 from entity_id import entity_id, normalize_surface_form
+from extraction_schema import (
+    ALLOWED_ENT_TYPES,
+    ALLOWED_FCT_CATEGORIES,
+    EXTRACTION_SCHEMA,
+    MAX_ENTITIES_PER_EVENT,
+    MAX_FACTS_PER_EVENT,
+    MAX_RELATIONSHIPS_PER_EVENT,
+)
 from noise_filter import is_noise_entity_name
 from sensitive_filter import SKIP_SENSITIVE_CONTENT, is_sensitive_event
@@ -90,15 +98,65 @@ DISTILL_TRACE_ENABLED = os.environ.get(
 # chunk via a JSONDecodeError. Pipe-delimited records, one per line,
 # recover at line granularity — a malformed line skips itself, the rest
 # of the chunk lands. See 2026-05-18 ops notes.
+#
+# 2026-06-11 update: guided JSON is back as an OPT-IN second mode
+# (DISTILL_OUTPUT_MODE=guided_json, default "kv" — a no-op until an
+# operator flips it). Both halves of the 2026-05-18 removal rationale
+# are answered this time:
+#   (a) the self-hosted Qwen2.5-7B vLLM box enforces structured output
+#       via logit masking (xgrammar/outlines) — the model CANNOT emit
+#       schema-invalid bytes, unlike the old VL gateway which
+#       half-ignored response_format;
+#   (b) blast radius is solved structurally — the schema is an array
+#       of per-event objects (see extraction_schema.py), so one
+#       event's content can't corrupt another's parse; the only
+#       residual failure is max_tokens truncation, and
+#       _parse_guided_json salvages every complete event object.
+# ALLOWED_ENT_TYPES / ALLOWED_FCT_CATEGORIES now live in
+# extraction_schema.py (imported above) so the schema enums and the
+# KV prompt pin to the same single source.
 EVENT_HEADER_RE = re.compile(r"^===?\s*event\s+(\d+)\s*===?\s*$", re.IGNORECASE)
-ALLOWED_ENT_TYPES = {
-    "person", "org", "product", "place", "project",
-    "concept", "topic", "date", "other",
-}
-ALLOWED_FCT_CATEGORIES = {
-    "decision", "commitment", "state", "mention",
-    "observation", "preference",
-}
+# Output mode flag. "kv" (default) keeps today's pipe-delimited path
+# byte-for-byte; "guided_json" switches the prompt, request params and
+# parser. Anything unrecognised falls back to "kv" — fail-safe.
+DISTILL_OUTPUT_MODE = os.environ.get("DISTILL_OUTPUT_MODE", "kv").strip().lower()
+if DISTILL_OUTPUT_MODE not in ("kv", "guided_json"):
+    log.warning(
+        f"DISTILL_OUTPUT_MODE={DISTILL_OUTPUT_MODE!r} unrecognised — using 'kv'"
+    )
+    DISTILL_OUTPUT_MODE = "kv"
+# How the structured-output schema is attached to the request in
+# guided_json mode. The repo carries no pin for the engine box's vLLM
+# version, so this is operator-selectable:
+#   - "response_format" (default): OpenAI-style
+#     response_format={"type":"json_schema","json_schema":{...}} —
+#     supported by vLLM >= 0.6.x OpenAI-compat server.
+#   - "guided_json": vLLM's legacy extension param (top-level
+#     `guided_json` in the request body; what openai-client users pass
+#     via extra_body). FALLBACK for older vLLM builds that predate
+#     json_schema response_format.
+# Exactly one is sent — some vLLM versions reject requests that carry
+# both guided-decoding params at once.
+DISTILL_GUIDED_PARAM_STYLE = os.environ.get(
+    "DISTILL_GUIDED_PARAM_STYLE", "response_format"
+).strip().lower()
+if DISTILL_GUIDED_PARAM_STYLE not in ("response_format", "guided_json"):
+    log.warning(
+        f"DISTILL_GUIDED_PARAM_STYLE={DISTILL_GUIDED_PARAM_STYLE!r} unrecognised "
+        f"— using 'response_format'"
+    )
+    DISTILL_GUIDED_PARAM_STYLE = "response_format"
+# JSON output carries structural overhead (braces, quotes, key names)
+# the KV format doesn't, so guided mode gets its own per-event token
+# budget. Truncation is guided mode's ONLY parse-failure mode (the
+# schema enforcer guarantees validity up to the cut), so this errs
+# higher than the KV 300.
+LLM_MAX_TOKENS_PER_EVENT_JSON = int(
+    os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "400")
+)
 # --------------------------------------------------------------------
@@ -165,10 +223,71 @@ A whole file is one entity, not twenty.
 - Output ONLY the formatted records. No header, no footer, no prose."""
+# Guided-JSON variant of BATCH_SYSTEM_PROMPT. Same CONTENT rules
+# (conservatism, per-event caps, code-content rule, subject-must-be-a-
+# declared-entity, email-alias pairing, statement <= 140 chars, never
+# skip an event) — only the output-format scaffolding changes. The
+# pipe-format anchoring ("COUNT THE PIPES", pipe/newline substitution)
+# is dropped: vLLM's guided decoding enforces the schema mechanically,
+# so the prompt no longer needs to beg for format compliance, and JSON
+# string escaping makes the pipe/newline substitution rules moot.
+GUIDED_JSON_SYSTEM_PROMPT = """You extract structured knowledge from N \
+events for a personal-memory graph.
+You will receive N events, each prefixed with `[event K]`. Respond \
+with a single JSON object: {"events": [...]} containing one object \
+per input event. Be conservative — only emit things explicitly stated.
+Each per-event object has:
+  "index": the zero-indexed event number, matching the input `[event K]`.
+  "entities": array of {"name", "type", "email"?}.
+  "facts": array of {"category", "subject", "predicate", "object", "statement"}.
+  "relationships": array of {"from", "to", "type"}.
+RULES:
+- NEVER skip an event — if an event has nothing to extract, emit its \
+object with "index" set and empty arrays.
+- entities: type ∈ {person, org, product, place, project, concept, \
+topic, date, other}.
+  email (OPTIONAL, person only): when the event body or attributes
+  show an email address that unambiguously identifies the person,
+  include it. This pairs the name+email forms so a later event seeing
+  only the email resolves to the same entity. Omit the key otherwise.
+- facts: category ∈ {decision, commitment, state, mention, \
+observation, preference}.
+  subject MUST be an entity name declared in THIS event's "entities".
+  predicate is a short verb phrase (e.g. "agreed to", "owns", "works at").
+  object MAY be an entity name OR a literal string OR null if absent.
+  statement ≤ 140 characters, a self-contained sentence.
+  WORKED EXAMPLE: {"category": "commitment", "subject": "Timothy \
+Bradley", "predicate": "agreed to", "object": "SAFE amendments", \
+"statement": "Timothy confirmed the SAFE amendments are set (14 May 2026)"}
+- relationships: "from" and "to" MUST be entity names declared in THIS \
+event's "entities". "type" is a short verb / preposition phrase.
+- HARD CAPS per event: 8 entities, 6 facts, 6 relationships. Pick the \
+most salient.
+- For code / technical content: extract only top-level services, \
+modules, or domain concepts. NOT variables, types, or method names. \
+A whole file is one entity, not twenty.
+- Output ONLY the JSON object. No markdown fences, no prose."""
+# The system prompt actually sent to the LLM under the current output
+# mode. Everything downstream (request body, trace fingerprint) hangs
+# off this so the two can never disagree.
+ACTIVE_SYSTEM_PROMPT = (
+    GUIDED_JSON_SYSTEM_PROMPT
+    if DISTILL_OUTPUT_MODE == "guided_json"
+    else BATCH_SYSTEM_PROMPT
+)
 # Teacher-prompt fingerprint for trace logging. If the prompt changes,
 # the hash changes — lets training-data exports filter by teacher
-# version so we never mix outputs from a retired prompt.
-SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
+# version so we never mix outputs from a retired prompt. Computed from
+# the ACTIVE prompt, so flipping DISTILL_OUTPUT_MODE auto-segments
+# distillation_traces into a new teacher version (KV-format traces and
+# guided-JSON traces never mix in a training export).
+SYSTEM_PROMPT_HASH = hashlib.sha256(ACTIVE_SYSTEM_PROMPT.encode()).hexdigest()[:16]
 # --------------------------------------------------------------------
@@ -353,14 +472,232 @@ def _split_event_blocks(text: str, expected_n: int) -> list[str]:
     return slices
+# --------------------------------------------------------------------
+# Guided-JSON parsing (DISTILL_OUTPUT_MODE=guided_json)
+# --------------------------------------------------------------------
+def _load_guided_payload(text: str) -> dict[str, Any] | None:
+    """Parse the guided-JSON chunk output into the {"events": [...]}
+    payload, salvaging what's complete if the output was truncated.
+    Guided decoding is expected to keep every emitted byte
+    schema-consistent, so the anticipated parse failure is max_tokens
+    truncation mid-stream. Salvage is structural: find the last `}`
+    after which appending "]}" (closing the events array + root
+    object) yields valid JSON, and parse that. An incomplete trailing
+    event is dropped; earlier complete events survive — per-event
+    degradation, not chunk-level loss. Returns None for empty input,
+    when nothing parses, or when the parsed root is not a JSON object."""
+    raw = (text or "").strip()
+    if not raw:
+        return None
+    # Defensive fence strip — can't occur under guided decoding, but
+    # the bake-off script replays this parser over unguided output too.
+    if raw.startswith("```"):
+        raw = raw.strip("`").strip()
+        if raw.lower().startswith("json"):
+            raw = raw[4:].lstrip()
+    try:
+        payload = json.loads(raw)
+        return payload if isinstance(payload, dict) else None
+    except json.JSONDecodeError:
+        pass
+    # Truncated: cut at a `}` and append "]}" to close the events array
+    # and root object. Walk back through `}` occurrences until a
+    # candidate parses; a brace nested inside the incomplete trailing
+    # event is expected to fail and is skipped. Bounded at 200 attempts,
+    # each one searching strictly left of the previously tried `}`.
+    end = len(raw)
+    for _ in range(200):
+        idx = raw.rfind("}", 0, end)
+        if idx < 0:
+            return None
+        candidate = raw[: idx + 1] + "]}"
+        try:
+            payload = json.loads(candidate)
+            return payload if isinstance(payload, dict) else None
+        except json.JSONDecodeError:
+            end = idx
+    return None
+def _resolve_event_index(ev: dict[str, Any], pos: int, expected_n: int) -> int | None:
+    """Map a parsed event object to its result slot. Trust the model's
+    "index" field when it's an in-range int (bools are rejected; it
+    mirrors the `[event K]` input header); else fall back to array
+    position `pos`. Returns None when neither is in [0, expected_n).
+    NOTE(review): duplicate slots are not detected here — confirm callers."""
+    idx = ev.get("index")
+    if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < expected_n:
+        return idx
+    if 0 <= pos < expected_n:
+        return pos
+    return None
+def _parse_guided_json(text: str, expected_n: int) -> list[dict[str, Any]]:
+    """Parse guided-JSON output into per-event extraction dicts —
+    sibling of _parse_kv_records. Each result has the shape
+    {"entities": [...], "facts": [...], "relationships": [...]}, with
+    a plausible entity email promoted into "aliases"; the shape is
+    meant to match the KV parser's so downstream code is mode-agnostic.
+    Defensive beyond what guided decoding guarantees: truncation is
+    salvaged per-event (see _load_guided_payload); non-dict items,
+    nameless entities, statement-less facts and incomplete
+    relationships are skipped; each list is sliced to its per-event cap
+    before filtering; strings are stripped, type/category lowercased,
+    `-`/empty/null object → None. Always returns expected_n entries."""
+    results: list[dict[str, Any]] = [
+        {"entities": [], "facts": [], "relationships": []} for _ in range(expected_n)
+    ]
+    payload = _load_guided_payload(text)
+    if payload is None:
+        return results
+    events = payload.get("events")
+    if not isinstance(events, list):
+        return results
+    for pos, ev in enumerate(events):
+        if not isinstance(ev, dict):
+            continue
+        idx = _resolve_event_index(ev, pos, expected_n)
+        if idx is None:
+            continue
+        target = results[idx]
+        ents = ev.get("entities")
+        for e in (ents if isinstance(ents, list) else [])[:MAX_ENTITIES_PER_EVENT]:
+            if not isinstance(e, dict):
+                continue
+            name = str(e.get("name") or "").strip()
+            if not name:
+                continue
+            etype = str(e.get("type") or "").strip().lower()
+            ent: dict[str, Any] = {"type": etype, "name": name}
+            # Mirror the KV 4th-field rule: promote into aliases only
+            # when it looks like an email ("@" present, no spaces).
+            email = e.get("email")
+            if isinstance(email, str):
+                email = email.strip()
+                if email and "@" in email and " " not in email:
+                    ent["aliases"] = [email]
+            target["entities"].append(ent)
+        facts = ev.get("facts")
+        for f in (facts if isinstance(facts, list) else [])[:MAX_FACTS_PER_EVENT]:
+            if not isinstance(f, dict):
+                continue
+            stmt = str(f.get("statement") or "").strip()
+            if not stmt:
+                continue
+            obj = f.get("object")
+            obj = obj.strip() if isinstance(obj, str) else None
+            target["facts"].append(
+                {
+                    "category": str(f.get("category") or "").strip().lower(),
+                    "subject": str(f.get("subject") or "").strip(),
+                    "predicate": str(f.get("predicate") or "").strip(),
+                    "object": None if obj in (None, "", "-", "null", "None") else obj,
+                    "statement": stmt,
+                }
+            )
+        rels = ev.get("relationships")
+        for r in (rels if isinstance(rels, list) else [])[:MAX_RELATIONSHIPS_PER_EVENT]:
+            if not isinstance(r, dict):
+                continue
+            frm = str(r.get("from") or "").strip()
+            to = str(r.get("to") or "").strip()
+            rtype = str(r.get("type") or "").strip()
+            if frm and to and rtype:
+                target["relationships"].append({"from": frm, "to": to, "type": rtype})
+    return results
+def _guided_event_slices(text: str, expected_n: int) -> list[str]:
+    """Per-event raw slices for trace logging in guided mode — the
+    JSON-mode sibling of _split_event_blocks, same shape contract
+    (expected_n entries, missing events as empty strings). Each slice
+    is the event object re-serialised via json.dumps (key order and
+    non-ASCII kept, whitespace not) so distillation_traces keeps a
+    per-event output; a later object for the same slot overwrites."""
+    slices: list[str] = [""] * expected_n
+    payload = _load_guided_payload(text)
+    if payload is None:
+        return slices
+    events = payload.get("events")
+    if not isinstance(events, list):
+        return slices
+    for pos, ev in enumerate(events):
+        if not isinstance(ev, dict):
+            continue
+        idx = _resolve_event_index(ev, pos, expected_n)
+        if idx is not None:
+            slices[idx] = json.dumps(ev, ensure_ascii=False)
+    return slices
+def _build_request_body(user_prompt: str, n: int) -> dict[str, Any]:
+    """Chat-completions request body for one N-event chunk. No I/O —
+    everything mode-dependent (prompt, token budget, structured-output
+    params) keys off the module-level flags so this is unit-testable.
+    max_tokens is the active mode's per-event budget multiplied by n.
+    kv mode (default): KV-text output, no guided_json /
+    response_format. Structured-output enforcement was half-ignored by
+    the old VL upstream, and the KV parser recovers from per-line drift.
+    guided_json mode: attaches EXTRACTION_SCHEMA via ONE of the two
+    vLLM structured-output param styles (DISTILL_GUIDED_PARAM_STYLE;
+    some vLLM versions reject requests carrying both at once):
+      - response_format {"type": "json_schema", ...} — OpenAI-style,
+        current vLLM (default).
+      - top-level guided_json — vLLM's legacy extension param (what
+        openai-client callers pass via extra_body), fallback for older
+        server builds.
+    """
+    body: dict[str, Any] = {
+        "model": LLM_MODEL,
+        "messages": [
+            {"role": "system", "content": ACTIVE_SYSTEM_PROMPT},
+            {"role": "user", "content": user_prompt},
+        ],
+        "temperature": 0.0,
+        "max_tokens": (
+            LLM_MAX_TOKENS_PER_EVENT_JSON
+            if DISTILL_OUTPUT_MODE == "guided_json"
+            else LLM_MAX_TOKENS_PER_EVENT
+        ) * n,
+    }
+    if DISTILL_OUTPUT_MODE == "guided_json":
+        if DISTILL_GUIDED_PARAM_STYLE == "guided_json":
+            body["guided_json"] = EXTRACTION_SCHEMA
+        else:
+            body["response_format"] = {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "memory_extraction",
+                    "strict": True,
+                    "schema": EXTRACTION_SCHEMA,
+                },
+            }
+    return body
 async def call_llm_batch(
     client: httpx.AsyncClient, events: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
     """Send N events in a single chat-completion call, return the list
-    of per-event extraction dicts in input order. The model emits
-    pipe-delimited KV records (see BATCH_SYSTEM_PROMPT); the parser is
-    line-tolerant so a malformed record skips itself rather than
-    failing the chunk. Raises only on transport failure or completely
+    of per-event extraction dicts in input order.
+    kv mode (default): the model emits pipe-delimited KV records (see
+    BATCH_SYSTEM_PROMPT); the parser is line-tolerant so a malformed
+    record skips itself rather than failing the chunk.
+    guided_json mode: the model emits the EXTRACTION_SCHEMA-constrained
+    JSON envelope under server-side guided decoding; the parser
+    salvages complete event objects from a truncated stream so failure
+    degrades per-event, never per-chunk. Both parsers return the same
+    per-event dict shape, so everything downstream of this function is
+    mode-agnostic. Raises only on transport failure or completely
     empty output."""
     n = len(events)
     if n == 0:
@@ -378,20 +715,7 @@ async def call_llm_batch(
         build_event_block(i, ev) for i, ev in enumerate(events)
     )
-    body: dict[str, Any] = {
-        "model": LLM_MODEL,
-        "messages": [
-            {"role": "system", "content": BATCH_SYSTEM_PROMPT},
-            {"role": "user", "content": user_prompt},
-        ],
-        "temperature": 0.0,
-        "max_tokens": LLM_MAX_TOKENS_PER_EVENT * n,
-        # KV-text output — no guided_json / response_format. The
-        # benefit of structured-output enforcement was already
-        # half-ignored by VL upstream, and the parser now recovers
-        # from per-line drift so the schema enforcement isn't worth
-        # the JSON brittleness it brought.
-    }
+    body = _build_request_body(user_prompt, n)
     r = await client.post(LLM_ENDPOINT, json=body, headers=headers)
     r.raise_for_status()
     data = r.json()
@@ -400,12 +724,16 @@ async def call_llm_batch(
         text = data.get("message", {}).get("content", "")
     if not text:
         raise RuntimeError(f"llm returned no content: {json.dumps(data)[:300]}")
-    parsed = _parse_kv_records(text, n)
+    if DISTILL_OUTPUT_MODE == "guided_json":
+        parsed = _parse_guided_json(text, n)
+        slices = _guided_event_slices(text, n)
+    else:
+        parsed = _parse_kv_records(text, n)
+        slices = _split_event_blocks(text, n)
     # Attach the per-event raw slice so downstream trace logging gets
     # the model's verbatim output for THIS event without re-splitting
     # the chunk-level text. Parser semantics are unaffected — the
     # raw_slice key is ignored by upsert paths.
-    slices = _split_event_blocks(text, n)
     for record, slice_text in zip(parsed, slices):
         record["raw_slice"] = slice_text
     return parsed
@@ -761,6 +1089,25 @@ SKIP_ATTRIBUTE_SOURCES = set(
 )
 DISTILL_MAX_AGE_DAYS = int(os.environ.get("DISTILL_MAX_AGE_DAYS", "90"))
+# Layer-1 content pre-filter (cascade tier 1 — the cheap deterministic gate in
+# front of the student/7B). Skips events with NO extractable signal BEFORE the
+# LLM, so GPU is spent only on text that can yield facts.
+#   - bytes-garbage: a binary doc (raw PDF bytes) stored as text decodes to a
+#     wall of U+FFFD replacement chars. `build_event_block` feeds `content` to
+#     the LLM, so it sees the garbage and extracts nothing (live 2026-06-10:
+#     35,296/39,453 pentatonic-team `doc` events are bytes-garbage). HIGH
+#     PRECISION — real text effectively never crosses a 5–10% replacement-char
+#     ratio, so this is a zero-quality-loss skip. (Durable fix = extract PDF
+#     text at INGEST; this stops the GPU waste meanwhile.)
+#   - too-short: trivially short content (one-line acks / emoji) has no facts.
+#     Conservative and OFF by default (0) to guarantee zero quality loss; tune
+#     up once layer-2 (the student model) owns the borderline cases.
+SKIP_BYTES_GARBAGE = os.environ.get(
+    "DISTILL_SKIP_BYTES_GARBAGE", "true"
+).strip().lower() not in ("false", "0", "no", "off")
+GARBAGE_CHAR_RATIO = float(os.environ.get("DISTILL_GARBAGE_CHAR_RATIO", "0.10"))
+MIN_CONTENT_CHARS = int(os.environ.get("DISTILL_MIN_CONTENT_CHARS", "0"))
 def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
     """Atomically claim up to BATCH_SIZE pending items. SKIP LOCKED so
@@ -843,6 +1190,43 @@ def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
             """,
             (DISTILL_MAX_AGE_DAYS, DISTILL_MAX_AGE_DAYS),
         )
+        # Pre-filter: bytes-garbage content. A binary doc (raw PDF bytes)
+        # stored as text decodes to mostly U+FFFD (chr(65533)); the LLM
+        # extracts nothing from it. Skip when the replacement-char ratio
+        # exceeds GARBAGE_CHAR_RATIO — real text never crosses it, so no
+        # quality loss. Scoped to the pending set; one cheap UPDATE/cycle.
+        if SKIP_BYTES_GARBAGE:
+            cur.execute(
+                """
+                UPDATE distillation_queue dq SET
+                  status = 'done',
+                  completed_at = NOW(),
+                  last_error = 'filtered: bytes_garbage'
+                FROM events e
+                WHERE dq.event_id = e.id
+                  AND dq.status = 'pending'
+                  AND length(e.content) > 0
+                  AND (length(e.content) - length(replace(e.content, chr(65533), '')))::float
+                      / length(e.content) > %s
+                """,
+                (GARBAGE_CHAR_RATIO,),
+            )
+        # Pre-filter: trivially-short content (one-line acks / emoji). OFF by
+        # default (MIN_CONTENT_CHARS=0) so it never costs a fact unless tuned on.
+        if MIN_CONTENT_CHARS > 0:
+            cur.execute(
+                """
+                UPDATE distillation_queue dq SET
+                  status = 'done',
+                  completed_at = NOW(),
+                  last_error = 'filtered: too_short'
+                FROM events e
+                WHERE dq.event_id = e.id
+                  AND dq.status = 'pending'
+                  AND length(trim(e.content)) < %s
+                """,
+                (MIN_CONTENT_CHARS,),
+            )
     with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
         cur.execute(
@@ -1085,7 +1469,9 @@ async def amain():
         f"endpoint={LLM_ENDPOINT or '(stub)'}, model={LLM_MODEL}, "
         f"poll={POLL_INTERVAL_SEC}s, claim={BATCH_SIZE}, "
         f"events_per_call={EVENTS_PER_LLM_CALL}, "
-        f"concurrent_calls={CONCURRENT_LLM_CALLS})"
+        f"concurrent_calls={CONCURRENT_LLM_CALLS}, "
+        f"output_mode={DISTILL_OUTPUT_MODE}, "
+        f"prompt_hash={SYSTEM_PROMPT_HASH})"
     )
     stub_mode = not LLM_ENDPOINT
     if stub_mode: