npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.6 → 0.10.0 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/packages/memory-engine-v2/extractor-async/worker.py ADDED Viewed

@@ -0,0 +1,797 @@
+"""extractor-async — LLM distillation worker.
+Polls org-model's distillation_queue, claims pending items, runs an
+LLM extraction pass against each event's content, writes entities,
+facts and relationships back, marks the queue item done.
+Wire as of 2026-05-17: LLM_ENDPOINT points at the pentatonic-gateway
+chat-completions endpoint (vLLM-served Qwen3-VL-30B-A3B-Instruct on
+the GH200). Prompts ask for pipe-delimited, line-oriented records
+(one ENT / FCT / REL record per line, not JSON); we parse them, then
+upsert entities first, then facts/relationships keyed on those
+entity IDs.
+Architecture notes:
+  - Single worker process per container; horizontal scaling = N
+    containers. Each container claims with its hostname, so
+    claim_expires_at lets a crashed container's items re-surface
+    after the TTL.
+  - Polling instead of LISTEN/NOTIFY: simpler, works with any
+    Postgres deployment, doesn't require keeping a long-lived
+    notification channel open.
+  - Idempotent: entities/facts/relationships are content-hash IDed,
+    so re-extraction of the same event converges to the same rows.
+"""
+from __future__ import annotations
+import asyncio
+import hashlib
+import json
+import logging
+import os
+import re
+import socket
+import time
+from typing import Any
+import httpx
+import psycopg
+import psycopg.rows
+from confidence import corroborated_confidence
+from noise_filter import is_noise_entity_name
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger("extractor-async")
+# --- Endpoints and credentials (every value is overridable via env) ---
+PG_DSN = os.getenv("PG_DSN", "postgresql://pme:local-dev-pw@org-model:5432/org_model")
+LLM_ENDPOINT = os.getenv("LLM_ENDPOINT", "")
+LLM_API_KEY = os.getenv("LLM_API_KEY", "")
+LLM_MODEL = os.getenv("LLM_MODEL", "Qwen/Qwen3-VL-30B-A3B-Instruct")
+LLM_TIMEOUT_SEC = float(os.getenv("LLM_TIMEOUT_SEC", "180"))
+# --- Queue polling and claiming ---
+POLL_INTERVAL_SEC = float(os.getenv("POLL_INTERVAL_SEC", "5"))
+CLAIM_TTL_SEC = int(os.getenv("CLAIM_TTL_SEC", "300"))
+MAX_ATTEMPTS = int(os.getenv("MAX_ATTEMPTS", "3"))
+# Event content is cut to this many characters before it goes in a prompt.
+MAX_CONTENT_CHARS = int(os.getenv("MAX_CONTENT_CHARS", "1200"))
+# --- Multi-event batching ---
+# One chat-completion request distills EVENTS_PER_LLM_CALL events, and
+# CONCURRENT_LLM_CALLS of those requests run in parallel against vLLM's
+# continuous batcher; the product is the number of events a worker can
+# have in flight, and is the default claim size. Benchmarks on
+# Qwen3-VL-30B-A3B on the GH200 (2026-05-18) put batch=15 at the
+# yield/latency sweet spot, with concurrent=6 keeping max_num_seqs=32
+# saturated without queueing inside vLLM.
+EVENTS_PER_LLM_CALL = int(os.getenv("EVENTS_PER_LLM_CALL", "15"))
+CONCURRENT_LLM_CALLS = int(os.getenv("CONCURRENT_LLM_CALLS", "6"))
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", str(EVENTS_PER_LLM_CALL * CONCURRENT_LLM_CALLS)))
+# Output budget per event. KV-text output averages ~200 tokens per
+# event (capped at 8 ENT, 6 FCT, 6 REL); 300 leaves room for verbose
+# entities and the per-event header. With EVENTS_PER_LLM_CALL=15 that
+# is max_tokens=4500, comfortably under vLLM's 16k max-model-len with
+# ~7k of input.
+LLM_MAX_TOKENS_PER_EVENT = int(os.getenv("LLM_MAX_TOKENS_PER_EVENT", "300"))
+# Identity written into claimed_by: "<hostname>:<pid>".
+WORKER_ID = ":".join((socket.gethostname(), str(os.getpid())))
+# --- KV-text output format ---
+# JSON output (and the `guided_json` schema enforcement that came with
+# it) was dropped because one invalid character inside a ~13k-character
+# JSON blob discarded the whole multi-event chunk via JSONDecodeError.
+# Pipe-delimited records, one per line, recover at line granularity:
+# a malformed line skips itself and the rest of the chunk still lands.
+# See 2026-05-18 ops notes.
+EVENT_HEADER_RE = re.compile(r"^===?\s*event\s+(\d+)\s*===?\s*$", re.I)
+ALLOWED_ENT_TYPES = {
+    "person",
+    "org",
+    "product",
+    "place",
+    "project",
+    "concept",
+    "topic",
+    "date",
+    "other",
+}
+ALLOWED_FCT_CATEGORIES = {
+    "decision",
+    "commitment",
+    "state",
+    "mention",
+    "observation",
+    "preference",
+}
+# --------------------------------------------------------------------
+# LLM extraction prompt
+# --------------------------------------------------------------------
+# System prompt for the multi-event extraction call.
+# Pipe-delimited, line-oriented output. No JSON. Each record stands
+# on its own line so a single malformed line skips itself instead of
+# nuking the whole batch (as a broken char in a 13k-char JSON blob
+# previously did). Prompt is deliberately repetitive about format —
+# the model needs anchoring without a schema enforcer.
+# The type / category vocabularies spelled out under RULES list the
+# same values as ALLOWED_ENT_TYPES / ALLOWED_FCT_CATEGORIES; keep the
+# two in sync when editing either. The text below is sent verbatim
+# as the system message, so any edit changes model behaviour.
+BATCH_SYSTEM_PROMPT = """You extract structured knowledge from N \
+events for a personal-memory graph.
+You will receive N events, each prefixed with `[event K]`. For EACH \
+event, emit extractions in PIPE-DELIMITED TEXT (NOT JSON). Be \
+conservative — only emit things explicitly stated.
+OUTPUT FORMAT (exact, line-oriented):
+=== event 0 ===
+ENT|<type>|<name>
+ENT|<type>|<name>
+FCT|<category>|<subject>|<predicate>|<object>|<statement>
+REL|<from>|<to>|<rel_type>
+=== event 1 ===
+ENT|...
+RULES:
+- One record per line. NO JSON. NO markdown. NO prose between records.
+- Each event MUST start with a `=== event K ===` header (zero-indexed, \
+matching the input index). NEVER skip an event — if an event has \
+nothing to extract, emit ONLY the header.
+- ENT lines have exactly 3 fields: literal `ENT`, type, name.
+  type ∈ {person, org, product, place, project, concept, topic, date, other}
+- FCT lines have exactly 6 fields: `FCT`, category, subject, \
+predicate, object, statement.
+  category ∈ {decision, commitment, state, mention, observation, preference}
+  subject MUST be an entity name declared in THIS event's ENT lines.
+  object MAY be an entity name OR a literal string OR `-` if absent.
+  statement ≤ 140 characters.
+- REL lines have exactly 4 fields: `REL`, from, to, rel_type.
+  from and to MUST be entity names declared in THIS event's ENT lines.
+  rel_type is a short verb / preposition phrase.
+- Pipes (`|`) inside values are FORBIDDEN — replace any `|` in source \
+text with `/`. Newlines inside values are FORBIDDEN — replace with `; `.
+- HARD CAPS per event: 8 ENT, 6 FCT, 6 REL. Pick the most salient.
+- For code / technical content: extract only top-level services, \
+modules, or domain concepts. NOT variables, types, or method names. \
+A whole file is one entity, not twenty.
+- Output ONLY the formatted records. No header, no footer, no prose."""
+def build_event_block(idx: int, event: dict[str, Any]) -> str:
+    """Format a single event for the user prompt.
+    Layout: an `[event K]` line, a `source_kind:` line, optional
+    `when:` / `author:` lines (taken from the event's attributes), a
+    `---` separator, then the content truncated to MAX_CONTENT_CHARS
+    characters."""
+    attributes = event.get("attributes") or {}
+    # Each optional field has a primary attribute and a fallback.
+    optional_fields = (
+        ("when", attributes.get("emitted_at") or attributes.get("timestamp")),
+        ("author", attributes.get("author") or attributes.get("user_id")),
+    )
+    lines = [f"[event {idx}]", f"source_kind: {event.get('source_kind', 'unknown')}"]
+    lines.extend(f"{label}: {value}" for label, value in optional_fields if value)
+    body = (event.get("content") or "")[:MAX_CONTENT_CHARS]
+    return "\n".join(lines) + "\n---\n" + body
+def _parse_kv_records(text: str, expected_n: int) -> list[dict[str, Any]]:
+    """Turn the model's pipe-delimited reply into one extraction dict per event.
+    Expected shape (see BATCH_SYSTEM_PROMPT):
+        === event 0 ===
+        ENT|person|Phil Hauser
+        FCT|mention|Phil Hauser|works at|Pentatonic|Phil works at Pentatonic
+        REL|Phil Hauser|Pentatonic|works_at
+    The result always has exactly expected_n entries, each with
+    `entities`, `facts` and `relationships` lists; events the model
+    never mentioned stay empty so their queue items can still drain.
+    Parsing is deliberately forgiving: blank lines, anything before the
+    first header, headers with an out-of-range index (and the records
+    under them), unknown record tags, and records with too few fields
+    or an empty required field are skipped silently. Entity types and
+    fact categories are lowercased but not validated against the
+    allowed sets here."""
+    # Number of fields that must follow each record tag. Splitting is
+    # capped at this count, so a surplus `|` ends up inside the last
+    # field instead of invalidating the record.
+    arity = {"ENT": 2, "FCT": 5, "REL": 3}
+    absent_object = ("", "-", "null", "None")
+    per_event: list[dict[str, Any]] = []
+    for _ in range(expected_n):
+        per_event.append({"entities": [], "facts": [], "relationships": []})
+    bucket: dict[str, Any] | None = None
+    for raw_line in text.splitlines():
+        record = raw_line.strip()
+        if record == "":
+            continue
+        header = EVENT_HEADER_RE.match(record)
+        if header is not None:
+            event_no = int(header.group(1))
+            in_range = 0 <= event_no < expected_n
+            # An out-of-range header parks the parser until the next
+            # valid header, so its records are dropped.
+            bucket = per_event[event_no] if in_range else None
+            continue
+        if bucket is None:
+            continue
+        # Tolerate a markdown bullet in front of a record.
+        if record[:2] in ("- ", "* "):
+            record = record[2:]
+        tag, sep, rest = record.partition("|")
+        if sep == "" or tag not in arity:
+            continue
+        fields = [piece.strip() for piece in rest.split("|", arity[tag] - 1)]
+        if len(fields) != arity[tag]:
+            continue
+        if tag == "ENT":
+            etype, name = fields
+            if name:
+                bucket["entities"].append({"type": etype.lower(), "name": name})
+        elif tag == "FCT":
+            category, subject, predicate, obj, statement = fields
+            if statement:
+                bucket["facts"].append(
+                    {
+                        "category": category.lower(),
+                        "subject": subject,
+                        "predicate": predicate,
+                        "object": None if obj in absent_object else obj,
+                        "statement": statement,
+                    }
+                )
+        else:
+            source, target, rel_type = fields
+            if source and target and rel_type:
+                bucket["relationships"].append(
+                    {"from": source, "to": target, "type": rel_type}
+                )
+    return per_event
+async def call_llm_batch(
+    client: httpx.AsyncClient, events: list[dict[str, Any]]
+) -> list[dict[str, Any]]:
+    """Distill a list of events with one chat-completion request.
+    Returns one extraction dict per input event, in input order (an
+    empty list for empty input, without touching the network). The
+    reply is pipe-delimited KV text (see BATCH_SYSTEM_PROMPT) parsed by
+    _parse_kv_records, which skips malformed records instead of
+    failing the chunk.
+    Raises on a transport / HTTP-status error (via raise_for_status)
+    and RuntimeError when the response carries no content."""
+    if not events:
+        return []
+    count = len(events)
+    # pentatonic-gateway uses X-API-Key; OpenAI-style endpoints use
+    # Authorization Bearer. Both are sent — the gateway ignores the
+    # one it doesn't care about.
+    auth_headers = (
+        {"X-API-Key": LLM_API_KEY, "Authorization": f"Bearer {LLM_API_KEY}"}
+        if LLM_API_KEY
+        else {}
+    )
+    request_headers = {"Content-Type": "application/json", **auth_headers}
+    blocks = []
+    for position, event in enumerate(events):
+        blocks.append(build_event_block(position, event))
+    # KV-text output — deliberately no guided_json / response_format:
+    # the parser recovers from per-line drift, so schema enforcement
+    # isn't worth the JSON brittleness it brought.
+    payload: dict[str, Any] = {
+        "model": LLM_MODEL,
+        "messages": [
+            {"role": "system", "content": BATCH_SYSTEM_PROMPT},
+            {"role": "user", "content": "\n\n---\n\n".join(blocks)},
+        ],
+        "temperature": 0.0,
+        "max_tokens": LLM_MAX_TOKENS_PER_EVENT * count,
+    }
+    response = await client.post(LLM_ENDPOINT, json=payload, headers=request_headers)
+    response.raise_for_status()
+    reply = response.json()
+    # Primary shape: choices[0].message.content. Fallback: a top-level
+    # message.content.
+    first_choice = (reply.get("choices") or [{}])[0]
+    content = first_choice.get("message", {}).get("content", "")
+    if not content:
+        content = reply.get("message", {}).get("content", "")
+    if not content:
+        raise RuntimeError(f"llm returned no content: {json.dumps(reply)[:300]}")
+    return _parse_kv_records(content, count)
+# --------------------------------------------------------------------
+# Upsert helpers (mirror extractor-sync's idempotent shape)
+# --------------------------------------------------------------------
+def _content_id(*parts: str) -> str:
+    """Derive a deterministic 32-hex-character ID from the given parts.
+    The parts are joined with the ASCII unit separator (0x1f) before
+    hashing, so ("a", "b") and ("ab",) produce different IDs."""
+    digest = hashlib.sha256()
+    digest.update("\x1f".join(parts).encode())
+    return digest.hexdigest()[:32]
+def upsert_entities(
+    conn: psycopg.Connection,
+    arena: str,
+    event_id: str,
+    participant_set: list[str],
+    disclosure_class: str,
+    entities: list[dict],
+) -> dict[str, str]:
+    """Insert (or merge) entities; return a name→id map so facts and
+    relationships can link to the inserted rows.
+    ID is _content_id(arena, entity_type, canonical_name), so the same
+    entity in the same arena converges across events. Aliases and
+    provenance_event_ids array-append on conflict; never replace.
+    Entity types are lowercased, and any type outside
+    ALLOWED_ENT_TYPES is stored as "other". The parser accepts
+    off-list types on the understanding that the upsert normalises
+    them; collapsing here keeps the type vocabulary closed and keeps
+    one name from fanning out into several rows under invented types.
+    Entities with an empty name, or a name rejected by
+    is_noise_entity_name, are skipped and do not appear in the
+    returned map."""
+    name_to_id: dict[str, str] = {}
+    if not entities:
+        return name_to_id
+    with conn.cursor() as cur:
+        for e in entities:
+            etype = (e.get("type") or "other").strip().lower()
+            if etype not in ALLOWED_ENT_TYPES:
+                etype = "other"
+            name = (e.get("name") or "").strip()
+            if not name:
+                continue
+            # Drop junk names before they enter the graph. See
+            # noise_filter.py — patterns are anchored to live-arena
+            # noise (pronouns, hostnames, paths, agent-worktree
+            # labels). Skipping here means name_to_id never carries
+            # the bad name, so a relationship that references it is
+            # dropped by upsert_relationships (both endpoints must
+            # resolve). How a fact with an unresolved subject is
+            # treated is decided in upsert_facts.
+            if is_noise_entity_name(etype, name):
+                continue
+            aliases = [a for a in (e.get("aliases") or []) if a]
+            eid = _content_id(arena, etype, name)
+            name_to_id[name] = eid
+            cur.execute(
+                """
+                INSERT INTO entities (
+                  id, arena, entity_type, canonical_name, aliases,
+                  provenance_event_ids, participant_set, disclosure_class
+                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
+                ON CONFLICT (id) DO UPDATE SET
+                  aliases = (
+                    SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
+                  ),
+                  provenance_event_ids = (
+                    SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
+                  ),
+                  last_seen = NOW()
+                """,
+                (
+                    eid, arena, etype, name, aliases,
+                    [event_id], participant_set, disclosure_class,
+                ),
+            )
+    return name_to_id
+def upsert_facts(
+    conn: psycopg.Connection,
+    arena: str,
+    event_id: str,
+    participant_set: list[str],
+    disclosure_class: str,
+    facts: list[dict],
+    name_to_id: dict[str, str],
+) -> int:
+    """Upsert facts for one event; return how many were written.
+    Facts are content-hashed on (arena, statement). Same statement
+    extracted from any event in the arena converges to the same row,
+    with `provenance_event_ids` accumulating the sources.
+    This is a forward-only change from the prior `(arena, event_id,
+    statement)` hash — historical rows extracted under the old key
+    will not back-merge with new extractions. Over a 30-day extraction
+    cycle the new rows dominate; until then the old + new shapes
+    co-exist.
+    Confidence promotes with corroboration on conflict: a statement
+    that appears in N events lands at `corroborated_confidence(N)`
+    (see confidence.py — caps at 0.9 to reserve [0.9, 1.0] for
+    `stage = 'verified'` which only a human can produce). Stage stays
+    `provisional`; corroboration is a signal, not a graduation.
+    Skipped (not written, not counted):
+      - facts with an empty statement;
+      - facts whose subject is not a key of `name_to_id`. The prompt
+        requires the subject to be an entity declared for the same
+        event, and entities rejected by the noise filter are absent
+        from the map, so an unresolved subject means the fact hangs
+        off a missing or filtered entity.
+    Categories are lowercased; anything outside
+    ALLOWED_FCT_CATEGORIES falls back to "observation", the same
+    default used when the category is missing. The object resolves to
+    an entity id when it names one, else NULL (literal or absent)."""
+    if not facts:
+        return 0
+    inserted = 0
+    with conn.cursor() as cur:
+        for f in facts:
+            stmt = (f.get("statement") or "").strip()
+            if not stmt:
+                continue
+            subject_id = name_to_id.get(f.get("subject"))
+            if subject_id is None:
+                # Subject was never upserted (undeclared or filtered as
+                # noise) — don't store an orphan fact.
+                continue
+            category = (f.get("category") or "observation").strip().lower()
+            if category not in ALLOWED_FCT_CATEGORIES:
+                category = "observation"
+            cur.execute(
+                """
+                INSERT INTO facts (
+                  id, arena, category, subject_entity_id, predicate,
+                  object_entity_id, statement, provenance_event_ids,
+                  stage, confidence, participant_set, disclosure_class
+                ) VALUES (
+                  %s, %s, %s, %s, %s, %s, %s, %s,
+                  'provisional'::extraction_stage, %s, %s, %s::disclosure_class
+                )
+                ON CONFLICT (id) DO UPDATE SET
+                  provenance_event_ids = (
+                    SELECT ARRAY(SELECT DISTINCT UNNEST(
+                      facts.provenance_event_ids || EXCLUDED.provenance_event_ids
+                    ))
+                  ),
+                  -- Confidence bumps with each additional independent
+                  -- source. The cardinality of the merged provenance
+                  -- array IS the corroboration count, so the formula
+                  -- lives inline rather than round-tripping through
+                  -- the worker. LEAST() guards the CHECK(<=1.0)
+                  -- constraint defensively even though the helper's
+                  -- cap is 0.9.
+                  confidence = LEAST(
+                    GREATEST(
+                      facts.confidence,
+                      0.5 + 0.15 * (
+                        cardinality(ARRAY(SELECT DISTINCT UNNEST(
+                          facts.provenance_event_ids
+                          || EXCLUDED.provenance_event_ids
+                        ))) - 1
+                      )
+                    ),
+                    0.9
+                  )
+                """,
+                (
+                    _content_id(arena, stmt),
+                    arena,
+                    category,
+                    subject_id,
+                    f.get("predicate"),
+                    name_to_id.get(f.get("object")),  # None if object is a literal
+                    stmt,
+                    [event_id],
+                    float(f.get("confidence") or corroborated_confidence(1)),
+                    participant_set,
+                    disclosure_class,
+                ),
+            )
+            inserted += 1
+    return inserted
+def upsert_relationships(
+    conn: psycopg.Connection,
+    arena: str,
+    event_id: str,
+    participant_set: list[str],
+    disclosure_class: str,
+    relationships: list[dict],
+    name_to_id: dict[str, str],
+) -> int:
+    """Upsert relationship edges for one event; return how many were written.
+    Edge identity is (arena, from, to, type). ON CONFLICT merges
+    provenance and refreshes last_seen rather than duplicating, and
+    bumps weight only when this event is not already in the edge's
+    provenance. That keeps re-extraction of the same event (a retry
+    after a failure, or the model repeating a REL line) idempotent:
+    weight counts each event once.
+    Relationships whose `from` or `to` does not resolve through
+    `name_to_id`, or whose type is empty, are skipped and not counted."""
+    if not relationships:
+        return 0
+    inserted = 0
+    with conn.cursor() as cur:
+        for r in relationships:
+            from_id = name_to_id.get(r.get("from"))
+            to_id = name_to_id.get(r.get("to"))
+            rtype = (r.get("type") or "").strip()
+            if not from_id or not to_id or not rtype:
+                continue
+            rid = _content_id(arena, from_id, to_id, rtype)
+            cur.execute(
+                """
+                INSERT INTO relationships (
+                  id, arena, from_entity_id, to_entity_id, relationship_type,
+                  weight, provenance_event_ids, participant_set, disclosure_class
+                ) VALUES (
+                  %s, %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
+                )
+                ON CONFLICT (id) DO UPDATE SET
+                  -- `<@` is array "is contained by": when the incoming
+                  -- event id is already recorded on the edge, this is a
+                  -- re-extraction and must not add weight again.
+                  weight = relationships.weight + CASE
+                    WHEN EXCLUDED.provenance_event_ids <@ relationships.provenance_event_ids
+                      THEN 0
+                    ELSE EXCLUDED.weight
+                  END,
+                  provenance_event_ids = (
+                    SELECT ARRAY(SELECT DISTINCT UNNEST(relationships.provenance_event_ids || EXCLUDED.provenance_event_ids))
+                  ),
+                  last_seen = NOW()
+                """,
+                (
+                    rid, arena, from_id, to_id, rtype,
+                    float(r.get("confidence") or 0.5),
+                    [event_id], participant_set, disclosure_class,
+                ),
+            )
+            inserted += 1
+    return inserted
+# --------------------------------------------------------------------
+# Queue mechanics
+# --------------------------------------------------------------------
+# Distillation filters — applied at claim time so the worker never
+# wastes an LLM call on noise we don't want in the graph. Tunable via
+# env so we can revisit per-source value over time.
+#
+# Skip rules:
+#   - source attribute matches a known code-only ingest (pip-code-ingest
+#     and friends). Code chunks generate noisy entities — class names,
+#     file paths, variables — that pollute the graph and don't surface
+#     in human-memory queries.
+#   - received_at older than DISTILL_MAX_AGE_DAYS. Stale events have low
+#     facet value and burn LLM budget. Forward-only + 90-day window is
+#     the right default; old events stay vector-searchable.
+# Comma-separated list in DISTILL_SKIP_SOURCES; entries are trimmed and
+# empty entries dropped.
+SKIP_ATTRIBUTE_SOURCES = {
+    token.strip()
+    for token in os.environ.get("DISTILL_SKIP_SOURCES", "pip-code-ingest").split(",")
+    if token.strip()
+}
+DISTILL_MAX_AGE_DAYS = int(os.environ.get("DISTILL_MAX_AGE_DAYS", "90"))
+def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
+    """Atomically claim up to BATCH_SIZE pending items. SKIP LOCKED so
+    concurrent workers never race.
+    Returns the claimed rows as dicts with `id`, `event_id` and
+    `attempts` (already incremented for this claim).
+    Housekeeping passes run BEFORE the claim so the worker never wastes
+    an LLM call on filtered events. They're cheap UPDATE statements:
+      - Events from skip-sources (attributes.source in
+        SKIP_ATTRIBUTE_SOURCES) are marked done with
+        `filtered: source=<source>` rather than claimed.
+      - Events older than DISTILL_MAX_AGE_DAYS are marked done with
+        `filtered: age><N>d`.
+      - Items still `claimed` whose claim has expired and whose
+        attempts have reached MAX_ATTEMPTS are marked failed. The
+        claim query below only re-claims expired items with
+        attempts < MAX_ATTEMPTS, so without this sweep an item whose
+        worker died on its final attempt would stay `claimed` forever.
+    """
+    with conn.cursor() as cur:
+        # Pre-filter: skip-source events.
+        if SKIP_ATTRIBUTE_SOURCES:
+            cur.execute(
+                """
+                UPDATE distillation_queue dq SET
+                  status = 'done',
+                  completed_at = NOW(),
+                  last_error = 'filtered: source=' || (e.attributes->>'source')
+                FROM events e
+                WHERE dq.event_id = e.id
+                  AND dq.status = 'pending'
+                  AND e.attributes->>'source' = ANY(%s)
+                """,
+                (list(SKIP_ATTRIBUTE_SOURCES),),
+            )
+        # Pre-filter: events older than the window.
+        cur.execute(
+            """
+            UPDATE distillation_queue dq SET
+              status = 'done',
+              completed_at = NOW(),
+              last_error = 'filtered: age>' || %s || 'd'
+            FROM events e
+            WHERE dq.event_id = e.id
+              AND dq.status = 'pending'
+              AND e.received_at < NOW() - (%s || ' days')::interval
+            """,
+            (DISTILL_MAX_AGE_DAYS, DISTILL_MAX_AGE_DAYS),
+        )
+        # Sweep: expired claims that already used their last attempt.
+        cur.execute(
+            """
+            UPDATE distillation_queue SET
+              status = 'failed',
+              last_error = 'claim expired after max attempts'
+            WHERE status = 'claimed'
+              AND claim_expires_at < NOW()
+              AND attempts >= %s
+            """,
+            (MAX_ATTEMPTS,),
+        )
+    with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
+        # Claim pending items plus items whose previous claim expired
+        # (crashed worker), oldest queue id first.
+        cur.execute(
+            """
+            UPDATE distillation_queue SET
+              status = 'claimed',
+              claimed_by = %s,
+              claimed_at = NOW(),
+              claim_expires_at = NOW() + (%s || ' seconds')::interval,
+              attempts = attempts + 1
+            WHERE id IN (
+              SELECT id FROM distillation_queue
+              WHERE (
+                status = 'pending'
+                OR (status = 'claimed' AND claim_expires_at < NOW())
+              ) AND attempts < %s
+              ORDER BY id
+              FOR UPDATE SKIP LOCKED
+              LIMIT %s
+            )
+            RETURNING id, event_id, attempts
+            """,
+            (WORKER_ID, CLAIM_TTL_SEC, MAX_ATTEMPTS, BATCH_SIZE),
+        )
+        return cur.fetchall()
+def fetch_event(conn: psycopg.Connection, event_id: str) -> dict[str, Any] | None:
+    """Load the event row used for extraction as a dict, or None when
+    no event has the given id."""
+    query = (
+        "SELECT id, arena, source_kind, content, attributes, participant_set, "
+        "disclosure_class FROM events WHERE id = %s"
+    )
+    with conn.cursor(row_factory=psycopg.rows.dict_row) as cursor:
+        cursor.execute(query, (event_id,))
+        row = cursor.fetchone()
+    return row
+def mark_done(conn: psycopg.Connection, queue_id: int) -> None:
+    """Set the queue item's status to 'done' and stamp completed_at."""
+    statement = "UPDATE distillation_queue SET status = 'done', completed_at = NOW() WHERE id = %s"
+    with conn.cursor() as cursor:
+        cursor.execute(statement, (queue_id,))
+def mark_failed(conn: psycopg.Connection, queue_id: int, error: str) -> None:
+    """Set the queue item's status to 'failed', recording the error
+    text truncated to 1024 characters."""
+    statement = "UPDATE distillation_queue SET status = 'failed', last_error = %s WHERE id = %s"
+    truncated = error[:1024]
+    with conn.cursor() as cursor:
+        cursor.execute(statement, (truncated, queue_id))
+def release_claim(conn: psycopg.Connection, queue_id: int, error: str) -> None:
+    """Return a claimed item to 'pending' so it can be claimed again.
+    Clears the claim fields and records the error text (truncated to
+    1024 characters) in last_error. The attempts counter is left
+    untouched."""
+    statement = """
+            UPDATE distillation_queue SET
+              status = 'pending',
+              claimed_by = NULL,
+              claimed_at = NULL,
+              claim_expires_at = NULL,
+              last_error = %s
+            WHERE id = %s
+            """
+    params = (error[:1024], queue_id)
+    with conn.cursor() as cursor:
+        cursor.execute(statement, params)
+# --------------------------------------------------------------------
+# Main loop
+# --------------------------------------------------------------------
+async def _extract_chunk(
+    http: httpx.AsyncClient,
+    chunk_items: list[dict[str, Any]],
+    chunk_events: list[dict[str, Any]],
+    stub_mode: bool,
+) -> tuple[list[dict[str, Any] | Exception], float]:
+    """Run the single multi-event LLM call for one chunk.
+    Returns `(per_item_results, llm_ms)`, where per_item_results has
+    one entry per chunk item: an extraction dict on success, or an
+    Exception. A failure of the call (or a result-count mismatch)
+    gives every item the same Exception so the caller can release them
+    all. In stub mode no call is made: every item gets an empty
+    extraction and llm_ms is 0.0."""
+    expected = len(chunk_items)
+    if stub_mode:
+        return ([{"entities": [], "facts": [], "relationships": []}] * expected), 0.0
+    started = time.perf_counter()
+    failure: Exception | None = None
+    try:
+        extracted = await call_llm_batch(http, chunk_events)
+    except Exception as exc:
+        failure = exc
+    elapsed_ms = (time.perf_counter() - started) * 1000
+    # call_llm_batch is expected to return one result per event; check
+    # anyway so a server quirk can't misalign results and items.
+    if failure is None and len(extracted) != expected:
+        failure = RuntimeError(
+            f"chunk result count mismatch: got {len(extracted)}, expected {expected}"
+        )
+    if failure is not None:
+        return ([failure] * expected), elapsed_ms
+    return list(extracted), elapsed_ms
+def _requeue_or_fail(conn: psycopg.Connection, item: dict[str, Any], err: str) -> None:
+    """Release the item's claim for another attempt, or mark it failed
+    once its attempts have reached MAX_ATTEMPTS."""
+    if item["attempts"] >= MAX_ATTEMPTS:
+        mark_failed(conn, item["id"], err)
+    else:
+        release_claim(conn, item["id"], err)
+async def process_batch(
+    http: httpx.AsyncClient,
+    conn: psycopg.Connection,
+    items: list[dict[str, Any]],
+    stub_mode: bool,
+) -> None:
+    """Process one claim with multi-event LLM batching.
+    Items are split into chunks of EVENTS_PER_LLM_CALL each, and at
+    most CONCURRENT_LLM_CALLS chunks run in parallel against vLLM's
+    continuous batcher. DB writes happen sequentially after the gather
+    — keeping the psycopg conn single-threaded sidesteps the lock dance
+    a fully-async DB path would need.
+    Per item, the outcome is one of:
+      - event row missing → queue item marked done, no LLM call;
+      - LLM call failed → claim released, or item marked failed once
+        attempts reach MAX_ATTEMPTS;
+      - extraction succeeded → entities, facts, relationships and the
+        `done` mark are written inside ONE transaction, so a failure
+        part-way through leaves nothing behind before the item is
+        released / failed. (The connection is opened in autocommit
+        mode, so without the explicit transaction each statement
+        would commit on its own.)"""
+    # Pre-fetch all events from DB (sync, fast).
+    events_by_qid: dict[int, dict[str, Any] | None] = {
+        item["id"]: fetch_event(conn, item["event_id"]) for item in items
+    }
+    # Drop items whose event is missing (mark done up-front, no LLM call).
+    callable_items: list[dict[str, Any]] = []
+    for item in items:
+        if events_by_qid[item["id"]] is None:
+            log.warning(
+                f"event {item['event_id']} missing — marking queue {item['id']} done"
+            )
+            mark_done(conn, item["id"])
+        else:
+            callable_items.append(item)
+    if not callable_items:
+        return
+    # Build chunks of EVENTS_PER_LLM_CALL items each (last chunk may be
+    # short). Each chunk → one LLM call.
+    chunks: list[tuple[list[dict[str, Any]], list[dict[str, Any]]]] = []
+    for s in range(0, len(callable_items), EVENTS_PER_LLM_CALL):
+        chunk_items = callable_items[s : s + EVENTS_PER_LLM_CALL]
+        chunk_events = [events_by_qid[i["id"]] for i in chunk_items]
+        chunks.append((chunk_items, chunk_events))
+    # Bound parallelism with a semaphore: every chunk coroutine is
+    # started by gather, but only CONCURRENT_LLM_CALLS hold a slot at
+    # once; the rest wait on the semaphore.
+    sem = asyncio.Semaphore(CONCURRENT_LLM_CALLS)
+    async def run_one(chunk_items, chunk_events):
+        async with sem:
+            return await _extract_chunk(http, chunk_items, chunk_events, stub_mode)
+    chunk_outcomes = await asyncio.gather(*(run_one(ci, ce) for ci, ce in chunks))
+    # Flatten chunk_outcomes back to per-item results, paired with items.
+    for (chunk_items, _chunk_events), (per_item, llm_ms) in zip(chunks, chunk_outcomes):
+        for item, result in zip(chunk_items, per_item):
+            queue_id = item["id"]
+            event_id = item["event_id"]
+            attempts = item["attempts"]
+            event = events_by_qid[queue_id]
+            if isinstance(result, Exception):
+                err = f"{type(result).__name__}: {result}"
+                log.warning(
+                    f"extraction failed queue_id={queue_id} attempts={attempts}: {err}"
+                )
+                _requeue_or_fail(conn, item, err)
+                continue
+            ents = result.get("entities") or []
+            facts = result.get("facts") or []
+            rels = result.get("relationships") or []
+            arena = event["arena"]
+            participant_set = event.get("participant_set") or [arena]
+            disclosure = event.get("disclosure_class") or "private"
+            try:
+                # All-or-nothing per event: rolls back on any exception
+                # raised inside the block, commits on normal exit.
+                with conn.transaction():
+                    name_to_id = upsert_entities(
+                        conn, arena, event_id, participant_set, disclosure, ents
+                    )
+                    n_facts = upsert_facts(
+                        conn, arena, event_id, participant_set, disclosure, facts, name_to_id,
+                    )
+                    n_rels = upsert_relationships(
+                        conn, arena, event_id, participant_set, disclosure, rels, name_to_id,
+                    )
+                    mark_done(conn, queue_id)
+            except Exception as exc:
+                err = f"{type(exc).__name__}: {exc}"
+                log.warning(
+                    f"db upsert failed queue_id={queue_id} attempts={attempts}: {err}"
+                )
+                _requeue_or_fail(conn, item, err)
+                continue
+            log.info(
+                f"completed queue_id={queue_id} event_id={event_id} "
+                f"entities={len(name_to_id)} facts={n_facts} "
+                f"relationships={n_rels}"
+                + (f" llm_ms={llm_ms:.0f}/chunk" if not stub_mode else "")
+            )
+async def amain():
+    """Worker entry point: poll the queue forever, processing each
+    claimed batch.
+    Runs in stub mode (items are drained with empty extractions, no
+    LLM call) when LLM_ENDPOINT is unset. One Postgres connection is
+    reused across polls; any exception in the loop is logged with its
+    traceback, the connection is closed, and after a back-off of twice
+    the poll interval the worker reconnects and carries on."""
+    log.info(
+        f"extractor-async starting (worker_id={WORKER_ID}, "
+        f"endpoint={LLM_ENDPOINT or '(stub)'}, model={LLM_MODEL}, "
+        f"poll={POLL_INTERVAL_SEC}s, claim={BATCH_SIZE}, "
+        f"events_per_call={EVENTS_PER_LLM_CALL}, "
+        f"concurrent_calls={CONCURRENT_LLM_CALLS})"
+    )
+    stub_mode = not LLM_ENDPOINT
+    if stub_mode:
+        log.warning("LLM_ENDPOINT not set — running in stub mode (no extraction).")
+    # Single async client across the loop. At most CONCURRENT_LLM_CALLS
+    # requests are in flight at once, well inside the httpx default
+    # limits (max_connections=100, max_keepalive=20).
+    async with httpx.AsyncClient(timeout=LLM_TIMEOUT_SEC) as http:
+        while True:
+            try:
+                # Connect once and keep polling on the same connection;
+                # reconnecting on every poll would open a new session
+                # every POLL_INTERVAL_SEC for no benefit. Any error
+                # leaves the `with`, which closes the connection, and
+                # the outer loop reconnects.
+                with psycopg.connect(PG_DSN, autocommit=True) as conn:
+                    while True:
+                        items = claim_next_batch(conn)
+                        if not items:
+                            await asyncio.sleep(POLL_INTERVAL_SEC)
+                            continue
+                        log.info(f"claimed {len(items)} item(s)")
+                        await process_batch(http, conn, items, stub_mode)
+            except Exception as exc:
+                # log.exception keeps the traceback, which a bare
+                # message would lose.
+                log.exception("worker loop error: %s", exc)
+                await asyncio.sleep(POLL_INTERVAL_SEC * 2)
+if __name__ == "__main__":
+    asyncio.run(amain())