npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.9.6 → 0.10.1 - Mend

@pentatonic-ai/ai-agent-sdk 0.9.6 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/packages/memory-engine-v2/compat/server.py ADDED Viewed

@@ -0,0 +1,1047 @@
+"""pentatonic-memory-engine v2 compat shim.
+Wire-format compatible with v1: same /store, /store-batch, /search,
+/forget, /health, /health/deep wire shape. TES can be flipped from v1
+to v2 by changing a single env var (MEMORY_ENGINE_URL) — no caller-side
+changes.
+Internally the architecture is entirely different from v1:
+  - /store: embed → extractor-sync (org-model writes + distillation
+    queue) → vector-index (Qdrant upsert with provenance).
+  - /search: typed router. Org-model query for facts/entities;
+    vector-index search filtered by arena + kind for evidence.
+    Fused at the response layer.
+  - /forget: events DELETE (cascade trigger removes provenance from
+    facts/entities/relationships; orphaned facts get deleted in same
+    txn). Then Qdrant payload-filtered delete to drop vectors.
+  - /health: cheap liveness. /health/deep: round-trips all three
+    stores + the embed gateway.
+Not yet in this first cut of the shim: typed query routing logic. For
+now, /search just hits vector-index with arena filter and returns. The
+full typed-router implementation (intent classify → route by kind →
+fuse selected layers) lands once the keystone spec defines the intent
+taxonomy.
+"""
+from __future__ import annotations
+import asyncio
+import hashlib
+import logging
+import os
+import re
+import time
+import uuid
+from contextlib import asynccontextmanager
+from datetime import datetime
+from typing import Any
+import httpx
+import numpy as np
+import psycopg
+import psycopg.rows
+from fastapi import FastAPI, HTTPException
+from psycopg_pool import AsyncConnectionPool
+from pydantic import BaseModel, Field
+from qdrant_client import AsyncQdrantClient, models as qmodels
+# Root logging at INFO; this module logs through `log`.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("compat")
+# Backing-service endpoints and embed settings. Each value is read from
+# the environment once at import time; the literal is the fallback used
+# when the variable is unset.
+PG_DSN = os.getenv("PG_DSN", "postgresql://pme:local-dev-pw@org-model:5432/org_model")
+VECTOR_INDEX_URL = os.getenv("VECTOR_INDEX_URL", "http://vector-index:6333")
+EXTRACTOR_SYNC_URL = os.getenv("EXTRACTOR_SYNC_URL", "http://extractor-sync:8101")
+NV_EMBED_URL = os.getenv("NV_EMBED_URL", "http://nv-embed:8041/v1/embeddings")
+NV_EMBED_API_KEY = os.getenv("NV_EMBED_API_KEY", "")
+# Selects the auth header shape: 'openai' | 'pentatonic-gateway'.
+NV_EMBED_PROVIDER = os.getenv("NV_EMBED_PROVIDER", "openai")
+# Vector size used for the collection and recorded in provenance rows;
+# must be an integer string when set.
+EMBED_DIM = int(os.getenv("EMBED_DIM", "4096"))
+# Name of the Qdrant collection used by this module.
+COLLECTION_NAME = "evidence"
+# /search ranking-side knobs (issue #343).
+#
+# OVERFETCH_MULT: pull this many × the caller's limit from Qdrant
+# before dedup + quota trim. 3 covers the common case where Pip's
+# chunker stores ~3 overlapping chunks per source event; 100 caps
+# the absolute fetch so a caller with limit=100 doesn't ask for 300.
+#
+# SOURCE_TYPE_QUOTA: max fraction of the returned slots any single
+# `source_kind` can hold. 0.6 with limit=10 = max 6 of the same kind,
+# so when slack one-liners flood top-k a canonical event/doc record
+# can still land. Disable by setting to 1.0.
+def _env_flag(name: str, default: str = "1") -> bool:
+    """Read a boolean feature switch from the environment.
+    An unset variable takes `default`. "0", "false" (any letter case,
+    surrounding whitespace ignored) and the empty string switch the
+    feature off; every other value switches it on."""
+    return os.environ.get(name, default).strip().lower() not in ("0", "false", "")
+SEARCH_OVERFETCH_MULT = int(os.environ.get("SEARCH_OVERFETCH_MULT", "3"))
+SEARCH_OVERFETCH_MAX = int(os.environ.get("SEARCH_OVERFETCH_MAX", "100"))
+SEARCH_SOURCE_TYPE_QUOTA = float(os.environ.get("SEARCH_SOURCE_TYPE_QUOTA", "0.6"))
+# Phase 3 (#343): MMR diversification over the deduped candidate pool.
+# Greedy: each pick maximises λ·sim(query,c) − (1−λ)·max sim(c,picked).
+# λ=0.7 leans relevant-first with mild diversity — high enough that
+# top results still match the query, low enough that we don't repeat
+# semantically near-identical chunks from different events.
+SEARCH_MMR_ENABLED = _env_flag("SEARCH_MMR_ENABLED")
+SEARCH_MMR_LAMBDA = float(os.environ.get("SEARCH_MMR_LAMBDA", "0.7"))
+# Phase 4 (#343): intent-aware source_kind boosts. The patterns are
+# narrow on purpose — broad matchers would over-boost generic queries
+# and bury good vector hits. Boost magnitudes are added to the cosine
+# similarity score (typical 0.7–0.85 range), so +0.06 flips a near-tie
+# in favour of the structurally-better record without surfacing
+# unrelated content. Set SEARCH_INTENT_BOOST=0 (env) to disable.
+SEARCH_INTENT_BOOST = _env_flag("SEARCH_INTENT_BOOST")
+# Issue #350: for temporal intent (the "last meeting" class of queries),
+# rank the candidate pool by `attributes.timestamp` desc instead of
+# similarity. The intent boost (#343 Phase 4) already lifts source_kind=event
+# into top-k, but among event records pure cosine still picks
+# semantically-best, not chronologically-latest — that's the
+# confidently-wrong "stale 2025-12 meeting beats the actual 2026-05
+# meeting" failure mode #350 documents. Temporal re-rank trumps MMR for
+# this intent class (recency IS the diversification axis); records
+# without a parseable timestamp sink to the bottom but aren't dropped.
+SEARCH_TEMPORAL_RERANK = _env_flag("SEARCH_TEMPORAL_RERANK")
+# Case-insensitive phrase matchers used to classify a query's intent.
+TEMPORAL_INTENT_RE = re.compile(
+    r"\b(when did|when was|last (?:time|met|saw|spoke|called)|"
+    r"how long ago|first time (?:i|we) (?:met|saw|spoke)|recent(?:ly)?|"
+    r"most recent|latest|"
+    r"timeline of|history with)\b",
+    re.IGNORECASE,
+)
+FACTUAL_INTENT_RE = re.compile(
+    r"\b(summary of|summarise|summarize|list of|tell me about|"
+    r"overview of|what (?:do|did) (?:i|we) (?:know|do))\b",
+    re.IGNORECASE,
+)
+INTENT_BOOSTS: dict[str, dict[str, float]] = {
+    # intent -> {source_kind -> additive boost on cosine score}
+    "temporal": {"event": 0.08, "doc": 0.04, "note": 0.02},
+    "factual": {"doc": 0.06, "note": 0.03, "event": 0.03},
+}
+def _classify_intent(query: str) -> str | None:
+    """Label a query as 'temporal', 'factual' or None using regexes only.
+    The patterns are tried in a fixed order and the first hit decides,
+    so a query matching both (e.g. "summary of recent meetings") is
+    labelled temporal."""
+    ordered = (
+        ("temporal", TEMPORAL_INTENT_RE),
+        ("factual", FACTUAL_INTENT_RE),
+    )
+    for label, pattern in ordered:
+        if pattern.search(query):
+            return label
+    return None
+def _apply_intent_boost(results: list[Any], intent: str | None) -> list[Any]:
+    """Raise each result's score by the boost configured for its
+    `source_kind` under `intent`, then return a new list ordered by
+    score, highest first. Scores are mutated on the result objects
+    themselves. When `intent` is falsy or has no entry in INTENT_BOOSTS
+    the input list is returned untouched."""
+    table = INTENT_BOOSTS.get(intent) if intent else None
+    if table is None:
+        return results
+    for hit in results:
+        # A missing/empty source_kind looks up "" and so gets no boost.
+        delta = table.get(hit.payload.get("source_kind") or "", 0.0)
+        if delta:
+            hit.score += delta
+    return sorted(results, key=lambda hit: hit.score, reverse=True)
+def _parse_ts(value: Any) -> float | None:
+    """Best-effort ISO-8601 → unix timestamp in seconds.
+    Returns None for anything that is not a non-empty, parseable string.
+    Accepts a trailing 'Z'/'z' as well as explicit offsets. A string
+    carrying no offset is read as UTC, so the result does not depend on
+    the host's local timezone and stays comparable with offset-carrying
+    values."""
+    # Local import: the file only imports `datetime` from this module.
+    from datetime import timezone
+    if not isinstance(value, str) or not value:
+        return None
+    # `fromisoformat` only accepts the bare `Z` suffix from Python 3.11
+    # on; rewrite it as an explicit offset so older runtimes parse it too.
+    text = value[:-1] + "+00:00" if value[-1] in "Zz" else value
+    try:
+        parsed = datetime.fromisoformat(text)
+        if parsed.tzinfo is None:
+            parsed = parsed.replace(tzinfo=timezone.utc)
+        return parsed.timestamp()
+    except (ValueError, OverflowError, OSError):
+        # ValueError: malformed string. OverflowError/OSError: a date the
+        # platform cannot convert to a timestamp.
+        return None
+def _apply_temporal_sort(
+    results: list[Any], attrs_by_event_id: dict[str, dict[str, Any]]
+) -> list[Any]:
+    """Return `results` ordered newest-first by timestamp.
+    For each result the timestamp is looked up in
+    `attrs_by_event_id[payload["event_id"]]["timestamp"]` first
+    (postgres is authoritative and retroactively populated by #345); the
+    result's own `payload["timestamp"]` is used only when that value is
+    absent or unparseable. Results with no parseable timestamp go to the
+    end. The sort is stable, so results with equal keys — including all
+    timestamp-less ones — keep their incoming relative order.
+    """
+    def sort_key(r: Any) -> float:
+        attrs = attrs_by_event_id.get(r.payload.get("event_id")) or {}
+        ts = _parse_ts(attrs.get("timestamp"))
+        # Compare against None explicitly: a parsed timestamp of 0.0 is
+        # valid and must not trigger the payload fallback.
+        if ts is None:
+            ts = _parse_ts(r.payload.get("timestamp"))
+        # Negate so an ascending sort yields newest-first; +inf pushes
+        # timestamp-less records to the end.
+        return float("inf") if ts is None else -ts
+    return sorted(results, key=sort_key)
+def _mmr_select(
+    candidates: list[Any], target: int, lambda_: float
+) -> list[Any]:
+    """Pick up to `target` candidates by greedy Maximal Marginal Relevance.
+    Each round selects the remaining candidate with the highest
+    lambda_·score − (1−lambda_)·(max dot product with anything already
+    picked). Every candidate needs a `.score` and a `.vector`; the dot
+    product is used as the similarity, which equals cosine only if the
+    vectors are L2-normalised (the caller's responsibility).
+    If any candidate has no vector, MMR is skipped and the top `target`
+    candidates by score are returned instead. Returns [] for an empty
+    pool or a non-positive `target`."""
+    if target <= 0 or not candidates:
+        return []
+    # Without a vector on every candidate MMR cannot run; rank by score.
+    if not all(getattr(c, "vector", None) is not None for c in candidates):
+        return sorted(candidates, key=lambda r: r.score, reverse=True)[:target]
+    embeddings = np.asarray([c.vector for c in candidates], dtype=np.float32)
+    relevance = np.asarray([c.score for c in candidates], dtype=np.float32)
+    # All pairwise dot products up front, so each round is an index lookup.
+    pairwise = embeddings @ embeddings.T
+    total = len(candidates)
+    quota = min(target, total)
+    chosen: list[int] = []
+    pool = set(range(total))
+    while pool and len(chosen) < quota:
+        winner, winner_score = -1, -1e9
+        for idx in pool:
+            # Redundancy = similarity to the closest already-chosen item;
+            # the very first pick has nothing to be redundant with.
+            redundancy = float(np.max(pairwise[idx, chosen])) if chosen else 0.0
+            marginal = lambda_ * float(relevance[idx]) - (1.0 - lambda_) * redundancy
+            if marginal > winner_score:
+                winner, winner_score = idx, marginal
+        if winner < 0:
+            break
+        chosen.append(winner)
+        pool.remove(winner)
+    return [candidates[idx] for idx in chosen]
+# Connection pool for org-model writes/reads. Tuned for the same call
+# rate as v1's compat — bump if the consumer's drain rate justifies.
+# All three handles start as None and are assigned by `lifespan` at
+# application startup; the request handlers in this module use them
+# without a None check.
+_pool: AsyncConnectionPool | None = None
+_qdrant: AsyncQdrantClient | None = None
+_http: httpx.AsyncClient | None = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Open the shared clients at startup and close them at shutdown.
+    Startup: opens the Postgres pool, creates the Qdrant client, creates
+    the COLLECTION_NAME collection plus its keyword payload indexes when
+    the collection does not exist yet, then creates the shared HTTP
+    client. A Qdrant initialisation error is logged and startup
+    continues.
+    Shutdown: closes all three clients. Each close is attempted even if
+    an earlier one fails, so one failing close cannot leak the others;
+    close errors are logged rather than raised."""
+    global _pool, _qdrant, _http
+    _pool = AsyncConnectionPool(
+        conninfo=PG_DSN,
+        min_size=2,
+        max_size=20,
+        kwargs={"row_factory": psycopg.rows.dict_row},
+        open=False,
+    )
+    await _pool.open()
+    log.info("compat: pool opened")
+    _qdrant = AsyncQdrantClient(url=VECTOR_INDEX_URL, prefer_grpc=False)
+    # Idempotent collection creation. Qdrant rejects re-creation, so we
+    # check first. Schema: EMBED_DIM-d vectors (4096 for NV-Embed-v2),
+    # cosine distance, vectors on disk, scalar quantization for RAM
+    # efficiency.
+    try:
+        collections = await _qdrant.get_collections()
+        names = {c.name for c in collections.collections}
+        if COLLECTION_NAME not in names:
+            await _qdrant.create_collection(
+                collection_name=COLLECTION_NAME,
+                vectors_config=qmodels.VectorParams(
+                    size=EMBED_DIM,
+                    distance=qmodels.Distance.COSINE,
+                    on_disk=True,
+                ),
+                # Scalar quantization (int8) — 4× RAM reduction on the
+                # quantile-cached portion. Page-cache governs hot set.
+                quantization_config=qmodels.ScalarQuantization(
+                    scalar=qmodels.ScalarQuantizationConfig(
+                        type=qmodels.ScalarType.INT8,
+                        always_ram=False,
+                    )
+                ),
+            )
+            log.info("created qdrant collection: %s dim=%s", COLLECTION_NAME, EMBED_DIM)
+            # Payload indexes for fast filtered search (this is the
+            # whole point of choosing Qdrant — first-class filter perf).
+            # NOTE(review): indexes are only created together with a new
+            # collection; if a previous start failed between the two
+            # steps they are never retried — confirm that is acceptable.
+            for field in ("arena", "source_kind", "clientId", "userId"):
+                await _qdrant.create_payload_index(
+                    collection_name=COLLECTION_NAME,
+                    field_name=field,
+                    field_schema=qmodels.PayloadSchemaType.KEYWORD,
+                )
+            log.info("created qdrant payload indexes: arena, source_kind, clientId, userId")
+    except Exception as e:
+        log.error(f"qdrant init error: {e}")
+        # Don't crash compat on Qdrant init failure — let liveness
+        # probe report it and operators investigate. The compat shim
+        # should be more available than the underlying store.
+    _http = httpx.AsyncClient(timeout=httpx.Timeout(60.0))
+    try:
+        yield
+    finally:
+        # Close every client independently so a failure in one close
+        # does not skip the remaining ones.
+        closers = (
+            ("pool", _pool.close),
+            ("qdrant", _qdrant.close),
+            ("http", _http.aclose),
+        )
+        for label, close in closers:
+            try:
+                await close()
+            except Exception:
+                log.exception("compat: error closing %s", label)
+app = FastAPI(title="pme2-compat", lifespan=lifespan)
+# ----------------------------------------------------------------------
+# Wire models — match v1's compat shim shape for drop-in compatibility.
+# ----------------------------------------------------------------------
+class StoreRequest(BaseModel):
+    """Request body for POST /store: a single record to ingest."""
+    # Text that /store sends to extraction and embedding.
+    content: str
+    # Optional caller metadata. /store reads `arena`, `clientId`,
+    # `user_id` and the source-kind keys from it, and copies the whole
+    # dict into the stored vector payload.
+    metadata: dict[str, Any] | None = None
+class StoreBatchRequest(BaseModel):
+    """Request body for POST /store-batch."""
+    # One dict per record; /store-batch reads `content` (required) and
+    # `metadata` (optional) from each.
+    records: list[dict[str, Any]] = Field(default_factory=list)
+    # Fallback arena for records whose metadata carries no `arena`.
+    arena: str | None = "general"
+    # v1's optional pre-computed embeddings — passed through but we
+    # re-embed regardless. The shared-embed optimisation lives at the
+    # SDK level now (PR #58 retry-with-jitter); compat trusts the
+    # gateway will return.
+    embeddings: list[list[float]] | None = None
+class SearchRequest(BaseModel):
+    """Request body for POST /search.
+    The values below are what a caller gets when a field is omitted; how
+    each field is applied is decided by the /search handler."""
+    # Free-text query.
+    query: str
+    limit: int | None = 10
+    min_score: float | None = 0.001
+    # A single arena or a list of arenas may be supplied.
+    arena: str | None = None
+    arenas: list[str] | None = None
+    metadata_filter: dict[str, Any] | None = None
+class ForgetRequest(BaseModel):
+    """Request body for the forget endpoint; both selectors are optional.
+    NOTE(review): the handler is not visible here — presumably a caller
+    supplies either `id` or `metadata_contains`; confirm against it."""
+    metadata_contains: dict[str, Any] | None = None
+    id: str | None = None
+# ----------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------
+def _content_hash(arena: str, content: str) -> str:
+    """Return the first 32 hex characters of sha256("<arena>:<content>").
+    Callers that predict record IDs compute the same value."""
+    digest = hashlib.sha256()
+    digest.update(f"{arena}:{content}".encode())
+    return digest.hexdigest()[:32]
+# Embed gateway is a single-GPU NV-Embed-v2 instance — concurrent
+# bursts above its in-flight ceiling return 502s. Semaphore caps how
+# many /v1/embed calls we make at once so the gateway never sees more
+# than it can serve. With TES at shardCount=8 and BATCH_SIZE=50 we get
+# up to 8 concurrent /store-batch calls hitting this path; 4 keeps the
+# gateway healthy and queues the rest in compat instead of pushing
+# the failure back through the DO retry loop (which causes DLQ on
+# repeated 502s — observed 2026-05-17). Pair with retry below.
+_EMBED_SEMAPHORE = asyncio.Semaphore(4)
+_EMBED_RETRY_STATUSES = {502, 503, 504, 429}
+_EMBED_MAX_ATTEMPTS = 5
+async def _embed_batch(texts: list[str]) -> list[list[float]]:
+    """Embed `texts` through the external embed gateway.
+    Returns one vector per input, in the order the gateway returned
+    them; an empty input returns [] without any network call. Both the
+    'openai' and 'pentatonic-gateway' provider shapes are supported.
+    Responses with a status in _EMBED_RETRY_STATUSES and transport
+    errors (timeout / network) are retried with exponential backoff, up
+    to _EMBED_MAX_ATTEMPTS attempts in total. Backoff happens while the
+    semaphore slot is held.
+    Raises:
+        httpx.HTTPStatusError: non-retryable error status, or a
+            retryable status on the final attempt.
+        httpx.TimeoutException / httpx.NetworkError: transport failure
+            on the final attempt.
+        RuntimeError: the response JSON has neither `embeddings` nor
+            `data`.
+    """
+    if not texts:
+        return []
+    headers = {"Content-Type": "application/json"}
+    if NV_EMBED_API_KEY:
+        if NV_EMBED_PROVIDER == "pentatonic-gateway":
+            headers["X-API-Key"] = NV_EMBED_API_KEY
+        else:
+            headers["Authorization"] = f"Bearer {NV_EMBED_API_KEY}"
+    body = {"input": texts, "model": "nv-embed-v2"}
+    async with _EMBED_SEMAPHORE:
+        # Retry transient gateway failures (502/503/504/429) with
+        # exponential backoff before bubbling up to the caller. Without
+        # this a single GPU hiccup propagates a 500 to the TES DO,
+        # which then DLQs after MAX_ATTEMPTS attempts.
+        last_exc: Exception | None = None
+        for attempt in range(_EMBED_MAX_ATTEMPTS):
+            try:
+                r = await _http.post(NV_EMBED_URL, json=body, headers=headers)
+            except (httpx.TimeoutException, httpx.NetworkError) as e:
+                last_exc = e
+                log.warning(
+                    f"embed gateway transport error attempt {attempt + 1}/{_EMBED_MAX_ATTEMPTS}: {e}"
+                )
+            else:
+                if r.status_code not in _EMBED_RETRY_STATUSES:
+                    r.raise_for_status()
+                    data = r.json()
+                    # Two response shapes in the wild:
+                    #   { "data": [{"embedding": [...]}] }  (openai-compat)
+                    #   { "embeddings": [[...]] }            (pentatonic-gateway direct)
+                    if "embeddings" in data:
+                        return data["embeddings"]
+                    if "data" in data:
+                        return [d["embedding"] for d in data["data"]]
+                    raise RuntimeError(f"unexpected embed response shape: keys={list(data.keys())}")
+                last_exc = httpx.HTTPStatusError(
+                    f"embed gateway {r.status_code}", request=r.request, response=r,
+                )
+                log.warning(
+                    f"embed gateway {r.status_code} attempt {attempt + 1}/{_EMBED_MAX_ATTEMPTS}; retrying"
+                )
+            # Back off only when another attempt follows; sleeping after
+            # the final failure would just delay the error while still
+            # occupying a semaphore slot.
+            if attempt + 1 < _EMBED_MAX_ATTEMPTS:
+                await asyncio.sleep(0.25 * (2 ** attempt))
+        # Exhausted retries — propagate the last failure so the caller
+        # sees the real cause (vs a generic 500).
+        if last_exc is None:
+            # Only reachable when _EMBED_MAX_ATTEMPTS <= 0.
+            raise RuntimeError("embed gateway: no attempt was made")
+        raise last_exc
+async def _extract(arena: str, clientId: str, userId: str | None,
+                   source_kind: str, content: str,
+                   attributes: dict[str, Any]) -> str:
+    """POST one record to extractor-sync's /extract and return the
+    `event_id` from its JSON reply (the content-hash). An error response
+    raises via `raise_for_status` — failures are deliberately not
+    swallowed, because they would break the FK invariant for
+    vector_provenance."""
+    response = await _http.post(
+        f"{EXTRACTOR_SYNC_URL}/extract",
+        json=dict(
+            arena=arena,
+            clientId=clientId,
+            userId=userId,
+            source_kind=source_kind,
+            content=content,
+            # A falsy attributes value is sent as an empty object.
+            attributes=attributes or {},
+        ),
+    )
+    response.raise_for_status()
+    reply = response.json()
+    return reply["event_id"]
+def _arena_of(meta: dict[str, Any] | None, fallback: str = "general") -> str:
+    """Return `meta["arena"]` when it is a non-empty string, otherwise
+    `fallback` (also used when `meta` is None or empty)."""
+    candidate = meta.get("arena") if meta else None
+    if isinstance(candidate, str) and candidate:
+        return candidate
+    return fallback
+def _source_kind_of(meta: dict[str, Any] | None) -> str:
+    """Resolve source_kind from metadata. The first key holding a
+    non-empty string wins, checked in this order:
+       1. metadata.source_kind (explicit)
+       2. metadata.kind (Pip's existing producer field — see PR #285)
+       3. metadata.memory_kind
+    Falls back to 'agent' (TES default) when none qualifies or `meta`
+    is None/empty."""
+    source = meta or {}
+    candidates = (source.get(key) for key in ("source_kind", "kind", "memory_kind"))
+    return next((v for v in candidates if isinstance(v, str) and v), "agent")
+# ----------------------------------------------------------------------
+# Health
+# ----------------------------------------------------------------------
+@app.get("/health")
+async def health():
+    """Cheap liveness probe: returns a constant payload and touches no
+    backing store."""
+    payload = {"status": "healthy", "service": "pme2-compat", "version": "0.1.0"}
+    return payload
+@app.get("/health/deep")
+async def health_deep():
+    """Round-trips all three stores + the embed gateway. Slow; do not
+    use as a docker healthcheck.
+    Each dependency is probed independently; a failure is reported as
+    {"status": "error", "error": <message>} under its key in `stores`
+    instead of failing the request."""
+    result = {"compat": "ok", "stores": {}}
+    # org-model
+    try:
+        async with _pool.connection() as conn:
+            async with conn.cursor() as cur:
+                await cur.execute("SELECT * FROM health_counts")
+                row = await cur.fetchone()
+                result["stores"]["org_model"] = {"status": "ok", "counts": dict(row)}
+    except Exception as e:
+        result["stores"]["org_model"] = {"status": "error", "error": str(e)}
+    # vector-index
+    try:
+        info = await _qdrant.get_collection(COLLECTION_NAME)
+        result["stores"]["vector_index"] = {
+            "status": "ok",
+            # `vectors_count` is deprecated in Qdrant's collection info;
+            # read both counters tolerantly so a client version lacking
+            # the field does not turn a healthy store into an "error".
+            "vectors_count": getattr(info, "vectors_count", None),
+            "points_count": getattr(info, "points_count", None),
+        }
+    except Exception as e:
+        result["stores"]["vector_index"] = {"status": "error", "error": str(e)}
+    # embed gateway
+    try:
+        v = await _embed_batch(["health probe"])
+        result["stores"]["embed_gateway"] = {"status": "ok", "dim": len(v[0]) if v else 0}
+    except Exception as e:
+        result["stores"]["embed_gateway"] = {"status": "error", "error": str(e)}
+    return result
+# ----------------------------------------------------------------------
+# /store
+# ----------------------------------------------------------------------
+@app.post("/store")
+async def store(req: StoreRequest):
+    """Ingest one record. v1 wire shape: { content, metadata } →
+    { id, content, layerId, engine }.
+    Steps: resolve routing fields from the metadata, run extraction to
+    get the event_id, embed the content, then insert the provenance row
+    and upsert the Qdrant point."""
+    meta = req.metadata or {}
+    arena = _arena_of(meta)
+    # An arena of the form "<client>:<user>" supplies defaults for
+    # clientId / userId when the metadata does not carry them.
+    arena_head, has_sep, arena_tail = arena.partition(":")
+    clientId = meta.get("clientId") or arena_head
+    userId = meta.get("user_id") or (arena_tail if has_sep else None)
+    source_kind = _source_kind_of(meta)
+    started = time.perf_counter()
+    event_id = await _extract(arena, clientId, userId, source_kind, req.content, meta)
+    vectors = await _embed_batch([req.content])
+    vector_id = str(uuid.uuid4())
+    # Issue #345 (caps #342/#343/#344): Pip emits a rich metadata bag —
+    # timestamp, contact_email, channel, kind, direction, source, etc.
+    # Pre-fix the payload picked off 5 keys and discarded the rest,
+    # which broke metadata_filter (#342), recency sort (#343), and
+    # personEvents.occurred_at (#344). Persist the whole thing.
+    # Structural keys are listed last so they win over any name
+    # collision from the caller.
+    point_payload = {
+        **(meta or {}),
+        "event_id": event_id,
+        "arena": arena,
+        "clientId": clientId,
+        "userId": userId,
+        "source_kind": source_kind,
+        "content_preview": req.content[:300],
+    }
+    provenance_row = (vector_id, event_id, "nv-embed-v2", EMBED_DIM)
+    # The Qdrant upsert runs inside the connection block on purpose: if
+    # it raises, the provenance insert is rolled back instead of
+    # committed, so no provenance row is left pointing at a vector that
+    # was never written.
+    async with _pool.connection() as conn:
+        async with conn.cursor() as cur:
+            await cur.execute(
+                "INSERT INTO vector_provenance (vector_id, event_id, embedding_model, embedding_dim) "
+                "VALUES (%s, %s, %s, %s)",
+                provenance_row,
+            )
+        point = qmodels.PointStruct(id=vector_id, vector=vectors[0], payload=point_payload)
+        await _qdrant.upsert(collection_name=COLLECTION_NAME, points=[point])
+    elapsed_ms = (time.perf_counter() - started) * 1000
+    log.info(f"store event_id={event_id} arena={arena} ms={elapsed_ms:.1f}")
+    response = {
+        "id": event_id,
+        "content": req.content,
+        "layerId": f"ml_{arena}_{source_kind}",
+        "engine": {"vector_index": 1, "org_model": 1},
+    }
+    return response
+# ----------------------------------------------------------------------
+# /store-batch
+# ----------------------------------------------------------------------
+@app.post("/store-batch")
+async def store_batch(req: StoreBatchRequest):
+    """Batch ingest. Same wire shape as v1: records[] → { inserted,
+    ids[] }. Pre-computed embeddings on the request are accepted but
+    ignored (we re-embed for now; sharing arrives once the keystone
+    spec settles the per-source vector configuration).
+    Every record must carry `content`; `metadata` is optional. The
+    returned `ids` are the extractor's event_ids, in request order.
+    Raises HTTPException(500) when the gateway returns a different
+    number of embeddings than texts sent."""
+    if not req.records:
+        return {"status": "ok", "inserted": 0, "ids": [], "engine": {}}
+    arena_default = req.arena or "general"
+    texts = [r["content"] for r in req.records]
+    embeddings = await _embed_batch(texts)
+    if len(embeddings) != len(texts):
+        raise HTTPException(500, f"embed count mismatch: {len(embeddings)} vs {len(texts)}")
+    # Resolve per-record routing fields first so we can fan out the
+    # extractor-sync calls in parallel. Each _extract is a network
+    # round-trip; serialising them was the dominant cost in /store-batch
+    # latency (~70ms × N records). asyncio.gather collapses N calls
+    # into one wall-time, capped by the extractor-sync pool size.
+    resolved: list[tuple[str, str, str | None, str, str, dict]] = []
+    for r in req.records:
+        meta = r.get("metadata") or {}
+        arena = _arena_of(meta, fallback=arena_default)
+        clientId = meta.get("clientId") or arena.split(":")[0]
+        userId = meta.get("user_id") or (arena.split(":", 1)[1] if ":" in arena else None)
+        source_kind = _source_kind_of(meta)
+        content = r["content"]
+        resolved.append((arena, clientId, userId, source_kind, content, meta))
+    event_ids = await asyncio.gather(*[
+        _extract(arena, clientId, userId, source_kind, content, meta)
+        for (arena, clientId, userId, source_kind, content, meta) in resolved
+    ])
+    ids: list[str] = []
+    points: list[qmodels.PointStruct] = []
+    provenance_rows: list[tuple] = []
+    for (arena, clientId, userId, source_kind, content, meta), vec, event_id in zip(
+        resolved, embeddings, event_ids
+    ):
+        vector_id = str(uuid.uuid4())
+        provenance_rows.append((vector_id, event_id, "nv-embed-v2", EMBED_DIM))
+        # Same payload policy as /store — issue #345. Spread the
+        # caller's metadata into the payload so downstream
+        # metadata_filter / sort / and personEvents timestamp resolution
+        # actually have something to work with. Structural keys override
+        # on collision.
+        points.append(qmodels.PointStruct(
+            id=vector_id,
+            vector=vec,
+            payload={
+                **(meta or {}),
+                "event_id": event_id,
+                "arena": arena,
+                "clientId": clientId,
+                "userId": userId,
+                "source_kind": source_kind,
+                "content_preview": content[:300],
+            },
+        ))
+        ids.append(event_id)
+    async with _pool.connection() as conn:
+        async with conn.cursor() as cur:
+            await cur.executemany(
+                "INSERT INTO vector_provenance (vector_id, event_id, embedding_model, embedding_dim) "
+                "VALUES (%s, %s, %s, %s)",
+                provenance_rows,
+            )
+        # Upsert while the connection block is still open, matching
+        # /store: if Qdrant fails, the exception leaves the block and the
+        # provenance rows are rolled back rather than committed without
+        # their vectors.
+        await _qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
+    return {
+        "status": "ok",
+        "inserted": len(ids),
+        "ids": ids,
+        "engine": {"vector_index": len(ids), "org_model": len(ids)},
+    }
+# ----------------------------------------------------------------------
+# /search
+# ----------------------------------------------------------------------
+# ----------------------------------------------------------------------
+# Structured graph queries
+# ----------------------------------------------------------------------
+# Lightweight read endpoints over the org-model graph tables. Bypass the
+# vector index — these are facet/filter queries (find entities by name,
+# facts by subject, relationships by edge type), not similarity ranking.
+# Arena is required on every query (multi-tenancy boundary). userId is
+# optional; when present we span both [clientId, clientId:userId] arenas
+# the same way semantic search does.
+class GraphQueryRequest(BaseModel):
+    """Common envelope for the graph read endpoints. `arena` is a single
+    string OR `arenas` is a list — pick whichever the caller has handy.
+    All filter fields are optional; the endpoint returns most-recent
+    first, capped at `limit`."""
+    # Tenancy scope: `arenas` takes precedence over `arena` when both
+    # are set; a request with neither is rejected with HTTP 400.
+    arena: str | None = None
+    arenas: list[str] | None = None
+    # /entities filters.
+    entity_type: str | None = None
+    name: str | None = None             # canonical_name (ILIKE)
+    # /facts filters.
+    subject: str | None = None          # entity name OR canonical_name (facts.subject_entity)
+    predicate: str | None = None
+    category: str | None = None         # facts.category
+    # /relationships filters.
+    from_name: str | None = None        # relationships.from_entity.canonical_name
+    to_name: str | None = None
+    relationship_type: str | None = None
+    # Passed straight to SQL LIMIT by each endpoint.
+    limit: int = 50
+def _resolve_arenas(req: GraphQueryRequest) -> list[str]:
+    """Return the arenas a graph query is scoped to: `req.arenas` when
+    non-empty, else `[req.arena]`. Raises HTTPException(400) when the
+    request names no arena at all."""
+    if req.arenas:
+        return req.arenas
+    if req.arena:
+        return [req.arena]
+    raise HTTPException(400, "arena or arenas required")
+@app.post("/entities")
+async def list_entities(req: GraphQueryRequest):
+    """Return entities in the requested arenas, most recently seen
+    first, capped at `req.limit`. Optional filters: exact `entity_type`,
+    and `name` as a case-insensitive substring of the canonical_name OR
+    of any alias — so `name='Mastercard'` also finds rows where
+    Mastercard is only an alias."""
+    where = ["arena = ANY(%s)"]
+    args: list[Any] = [_resolve_arenas(req)]
+    if req.entity_type:
+        where.append("entity_type = %s")
+        args.append(req.entity_type)
+    if req.name:
+        like = f"%{req.name}%"
+        where.append("(canonical_name ILIKE %s OR EXISTS (SELECT 1 FROM UNNEST(aliases) AS a WHERE a ILIKE %s))")
+        # The same pattern feeds both placeholders of the clause above.
+        args += [like, like]
+    sql = f"""
+        SELECT id, arena, entity_type, canonical_name, aliases,
+               provenance_event_ids, last_seen
+          FROM entities
+         WHERE {' AND '.join(where)}
+      ORDER BY last_seen DESC
+         LIMIT %s
+    """
+    args.append(req.limit)
+    async with _pool.connection() as conn, conn.cursor() as cur:
+        await cur.execute(sql, args)
+        found = await cur.fetchall()
+    return {"results": [dict(row) for row in found]}
+@app.post("/facts")
+async def list_facts(req: GraphQueryRequest):
+    """Return facts in the requested arenas, most recently asserted
+    first, capped at `req.limit`. Optional filters: exact `category`,
+    `predicate` as a case-insensitive substring, and `subject`, which
+    keeps facts whose subject entity has a canonical_name containing it
+    (case-insensitive) or an alias equal to it."""
+    where = ["f.arena = ANY(%s)"]
+    args: list[Any] = [_resolve_arenas(req)]
+    if req.category:
+        where.append("f.category = %s")
+        args.append(req.category)
+    if req.predicate:
+        where.append("f.predicate ILIKE %s")
+        args.append(f"%{req.predicate}%")
+    if req.subject:
+        where.append("EXISTS (SELECT 1 FROM entities e WHERE e.id = f.subject_entity_id AND (e.canonical_name ILIKE %s OR %s = ANY(e.aliases)))")
+        # First placeholder: substring pattern; second: exact alias.
+        args += [f"%{req.subject}%", req.subject]
+    sql = f"""
+        SELECT f.id, f.arena, f.category, f.predicate, f.statement,
+               f.subject_entity_id, f.object_entity_id,
+               f.confidence, f.stage, f.asserted_at,
+               f.provenance_event_ids
+          FROM facts f
+         WHERE {' AND '.join(where)}
+      ORDER BY f.asserted_at DESC
+         LIMIT %s
+    """
+    args.append(req.limit)
+    async with _pool.connection() as conn, conn.cursor() as cur:
+        await cur.execute(sql, args)
+        found = await cur.fetchall()
+    return {"results": [dict(row) for row in found]}
+@app.post("/relationships")
+async def list_relationships(req: GraphQueryRequest):
+    """Filter edges by arena + optional from/to entity names + optional
+    relationship_type. Returns the resolved from/to canonical names so
+    the caller doesn't need to round-trip back to /entities."""
+    arenas = _resolve_arenas(req)
+    conditions = ["r.arena = ANY(%s)"]
+    params: list[Any] = [arenas]
+    if req.relationship_type:
+        conditions.append("r.relationship_type ILIKE %s")
+        params.append(f"%{req.relationship_type}%")
+    if req.from_name:
+        conditions.append("(ef.canonical_name ILIKE %s OR %s = ANY(ef.aliases))")
+        params.extend([f"%{req.from_name}%", req.from_name])
+    if req.to_name:
+        conditions.append("(et.canonical_name ILIKE %s OR %s = ANY(et.aliases))")
+        params.extend([f"%{req.to_name}%", req.to_name])
+    sql = f"""
+        SELECT r.id, r.arena, r.relationship_type, r.weight,
+               r.from_entity_id, r.to_entity_id,
+               ef.canonical_name AS from_name,
+               et.canonical_name AS to_name,
+               r.first_seen, r.last_seen,
+               r.provenance_event_ids
+          FROM relationships r
+          JOIN entities ef ON ef.id = r.from_entity_id
+          JOIN entities et ON et.id = r.to_entity_id
+         WHERE {' AND '.join(conditions)}
+      ORDER BY r.last_seen DESC
+         LIMIT %s
+    """
+    params.append(req.limit)
+    async with _pool.connection() as conn:
+        async with conn.cursor() as cur:
+            await cur.execute(sql, params)
+            rows = await cur.fetchall()
+    return {"results": [dict(r) for r in rows]}
+@app.post("/search")
+async def search(req: SearchRequest):
+    """Search the evidence index, scoped to one or more arenas.
+    Retrieval is vector-only for now — no typed router, no org-model
+    fact lookup, no L0 BM25. Those come once the keystone spec defines
+    the intent classification scheme. The output shape matches v1 so
+    callers don't break: { results: [{id, content, similarity, metadata}] }.
+    Pipeline, in order:
+      1. Embed the query and run a filtered, over-fetched Qdrant search.
+      2. Collapse hits to one per event_id (hits without an event_id
+         are dropped).
+      3. Load content + attributes for the surviving event_ids from the
+         postgres `events` table.
+      4. Optionally apply the intent boost, then either the temporal
+         sort or MMR.
+      5. Apply the per-source_kind quota, backfill from the quota
+         overflow, and project the final rows.
+    Raises HTTPException(400) when neither `arena` nor `arenas` is set."""
+    arenas = req.arenas or ([req.arena] if req.arena else [])
+    if not arenas:
+        # No arena scope = unsafe. v1 silently scoped to 'general'; v2
+        # rejects to force callers to be explicit.
+        raise HTTPException(400, "arena or arenas required")
+    qvec = (await _embed_batch([req.query]))[0]
+    # Compose Qdrant Filter: arena scope is always required, plus any
+    # caller-supplied metadata_filter keys ANDed in. Mirrors how
+    # /forget's `metadata_contains` already builds containment filters
+    # (see issue #342 — `metadata_filter` was previously a dead param
+    # silently accepted by SearchRequest but never applied, blocking
+    # consumer-side source_kind/kind retrieval filtering).
+    #
+    # Filter shape:
+    #   - list value -> MatchAny (`source_kind IN ('note','event')`)
+    #   - scalar    -> MatchValue (exact)
+    #   - null / "" -> skipped, so {"source_kind": null} doesn't match nothing
+    #
+    # MAX_META_FILTER_KEYS caps the number of extra clauses so a caller
+    # passing 100 keys can't blow up Qdrant's query plan.
+    MAX_META_FILTER_KEYS = 16
+    must: list[Any] = [
+        qmodels.FieldCondition(key="arena", match=qmodels.MatchAny(any=arenas))
+    ]
+    if req.metadata_filter:
+        # The slice is taken before null/empty values are skipped, so
+        # skipped keys still count toward the cap, and keys past the cap
+        # are ignored without an error.
+        for k, v in list(req.metadata_filter.items())[:MAX_META_FILTER_KEYS]:
+            if v is None or v == "":
+                continue
+            if isinstance(v, list):
+                if not v:
+                    continue
+                must.append(qmodels.FieldCondition(key=k, match=qmodels.MatchAny(any=v)))
+            else:
+                must.append(qmodels.FieldCondition(key=k, match=qmodels.MatchValue(value=v)))
+    filter_ = qmodels.Filter(must=must)
+    # Issue #343: over-fetch then dedup + quota.
+    #
+    # Pip's chunker stores ~3 overlapping chunks per source event, so
+    # raw Qdrant top-k can be dominated by 8/10 vectors that all share
+    # one event_id. Fetch (limit × OVERFETCH_MULT) candidates, then:
+    #   (a) collapse to one row per event_id, keeping the highest score
+    #       (Qdrant returns score-desc → first-wins is correct)
+    #   (b) apply a per-source_kind quota so slack one-liners can't
+    #       crowd out the canonical event/doc record that actually
+    #       answers the query.
+    # Anything quota-rejected goes to an overflow list and is appended
+    # last so we never return fewer than the available, deduped pool.
+    # A missing or zero `limit` falls back to 10.
+    target_limit = req.limit or 10
+    overfetch = min(target_limit * SEARCH_OVERFETCH_MULT, SEARCH_OVERFETCH_MAX)
+    # Phase 4 (#343): classify the query intent once; cheap regex.
+    intent = _classify_intent(req.query) if SEARCH_INTENT_BOOST else None
+    # Issue #350: temporal-intent queries don't need vectors (we skip
+    # MMR in favour of timestamp-desc sort), so save the per-hit
+    # vector-payload bandwidth (4096 × float32 × overfetch) when
+    # vectors won't be used.
+    temporal_active = (intent == "temporal") and SEARCH_TEMPORAL_RERANK
+    raw_results = await _qdrant.search(
+        collection_name=COLLECTION_NAME,
+        query_vector=qvec,
+        query_filter=filter_,
+        # max() keeps the fetch size at least target_limit even when the
+        # over-fetch cap is smaller than the requested limit.
+        limit=max(overfetch, target_limit),
+        score_threshold=req.min_score,
+        with_payload=True,
+        # Phase 3 (#343): MMR needs the actual vectors to score pairwise
+        # similarity. Only pull them when MMR is enabled AND we aren't
+        # about to skip MMR for a temporal re-rank.
+        with_vectors=SEARCH_MMR_ENABLED and not temporal_active,
+    )
+    # (a) dedup by event_id — first occurrence wins (highest score).
+    # Hits whose payload has no event_id are dropped here.
+    seen_eids: set[str] = set()
+    deduped: list[Any] = []
+    for r in raw_results:
+        eid = r.payload.get("event_id")
+        if not eid or eid in seen_eids:
+            continue
+        seen_eids.add(eid)
+        deduped.append(r)
+    # (b) Hoisted from below (#350): fetch content + attributes for the
+    # deduped candidate pool BEFORE re-ranking. The temporal sort needs
+    # `attributes.timestamp` to be available at rank time, and pulling
+    # for the deduped set (vs the final selected set) is one extra DB
+    # round-trip on N candidates which is dominated by the network
+    # cost of the Qdrant search itself — cheap.
+    candidate_event_ids = [r.payload["event_id"] for r in deduped if r.payload.get("event_id")]
+    full_content: dict[str, str] = {}
+    full_attrs: dict[str, dict[str, Any]] = {}
+    if candidate_event_ids:
+        async with _pool.connection() as conn:
+            async with conn.cursor() as cur:
+                await cur.execute(
+                    "SELECT id, content, attributes FROM events WHERE id = ANY(%s)",
+                    (candidate_event_ids,),
+                )
+                # NOTE(review): these dicts are keyed by whatever type the
+                # driver returns for events.id, while the lookups further
+                # down use the Qdrant payload's event_id — assumes both
+                # are the same string form; confirm against the schema.
+                for row in await cur.fetchall():
+                    full_content[row["id"]] = row["content"]
+                    full_attrs[row["id"]] = row["attributes"] or {}
+    # (c) Phase 4: intent-aware boost. Re-sorts the pool by adjusted
+    # score so the temporal sort below operates on a similarity-adjusted
+    # baseline (records with no timestamp will sink to the bottom of
+    # the temporal sort but keep this in-bucket order).
+    if intent:
+        deduped = _apply_intent_boost(deduped, intent)
+    # (d) Phase 3 or Issue #350: diversify the pool.
+    #
+    #   - temporal intent (`last meeting`, `most recent`, ...) → sort
+    #     by attributes.timestamp DESC. Recency IS the diversification
+    #     axis for this class of query; MMR's semantic-spread would
+    #     un-sort the chronological order we want.
+    #   - everything else → MMR over the deduped pool for semantic
+    #     diversity.
+    # The pool handed to the quota stage is at most twice the requested
+    # limit, whichever branch produces it.
+    mmr_target = min(target_limit * 2, len(deduped))
+    if temporal_active:
+        deduped = _apply_temporal_sort(deduped, full_attrs)
+        mmr_pool = deduped[:mmr_target]
+    elif SEARCH_MMR_ENABLED:
+        mmr_pool = _mmr_select(deduped, target=mmr_target, lambda_=SEARCH_MMR_LAMBDA)
+    else:
+        mmr_pool = deduped[:mmr_target]
+    # (e) source-type quota. max_per_kind floors at 1 so a single-kind
+    # corpus still returns results; quota >= 1.0 disables.
+    max_per_kind = max(1, int(target_limit * SEARCH_SOURCE_TYPE_QUOTA))
+    selected: list[Any] = []
+    overflow: list[Any] = []
+    counts: dict[str, int] = {}
+    for r in mmr_pool:
+        if len(selected) >= target_limit:
+            break
+        # Hits with no source_kind share a single "unknown" quota bucket.
+        kind = r.payload.get("source_kind") or "unknown"
+        if counts.get(kind, 0) < max_per_kind or SEARCH_SOURCE_TYPE_QUOTA >= 1.0:
+            selected.append(r)
+            counts[kind] = counts.get(kind, 0) + 1
+        else:
+            overflow.append(r)
+    # Backfill from overflow if quota was over-restrictive (e.g. corpus
+    # is 90% one source_kind). Better to return slightly skewed top-k
+    # than fewer results than the caller asked for.
+    if len(selected) < target_limit and overflow:
+        selected.extend(overflow[: target_limit - len(selected)])
+    results = selected
+    # Note: `full_content` and `full_attrs` were populated above (over
+    # the candidate pool). The projection below reads from them by
+    # event_id — every selected result was part of that candidate query
+    # since the candidate set is a superset of `results`; the `.get`
+    # fallbacks cover event_ids postgres returned no row for.
+    # Issue #345: surface the rich metadata bag in the response.
+    # Composition: Qdrant payload first (structural keys: arena,
+    # clientId, userId, source_kind, event_id), then postgres
+    # `events.attributes` (the canonical full bag), so postgres wins
+    # on collision. `content_preview` and the embedded `content` field
+    # (which extractor-sync stamps into attributes for provenance) are
+    # excluded — the top-level `content` already carries the text and
+    # we don't want it duplicated inside metadata.
+    METADATA_INTERNAL_KEYS = {"content_preview", "content"}
+    out = []
+    for r in results:
+        eid = r.payload["event_id"]
+        merged_meta = {
+            **{k: v for k, v in r.payload.items() if k not in METADATA_INTERNAL_KEYS},
+            **{k: v for k, v in full_attrs.get(eid, {}).items() if k not in METADATA_INTERNAL_KEYS},
+        }
+        out.append({
+            "id": eid,
+            # Falls back to the payload's content_preview (or "") when
+            # no postgres content was loaded for this event_id.
+            "content": full_content.get(eid, r.payload.get("content_preview", "")),
+            "similarity": r.score,
+            "metadata": merged_meta,
+        })
+    return {"results": out}
+# ----------------------------------------------------------------------
+# /forget
+# ----------------------------------------------------------------------
+@app.post("/forget")
+async def forget(req: ForgetRequest):
+    """Delete records by ID or metadata filter.
+    org-model events DELETE → cascade trigger drops provenance from
+    facts/entities/relationships; orphaned facts/relationships deleted
+    in same txn. Then Qdrant payload-filter delete drops the vectors.
+    """
+    if not req.id and not req.metadata_contains:
+        raise HTTPException(400, "id or metadata_contains required")
+    deleted_events: list[str] = []
+    async with _pool.connection() as conn:
+        async with conn.cursor() as cur:
+            if req.id:
+                await cur.execute(
+                    "DELETE FROM events WHERE id = %s RETURNING id",
+                    (req.id,),
+                )
+            else:
+                # Build a JSONB containment filter from metadata_contains.
+                # Engine arena is read from metadata_contains.arena for
+                # v1 wire compatibility (see PR #327 history).
+                arena = req.metadata_contains.get("arena")
+                # Other keys become attributes-JSONB containment.
+                other = {k: v for k, v in req.metadata_contains.items() if k != "arena"}
+                params: list = []
+                where = []
+                if arena:
+                    where.append("arena = %s")
+                    params.append(arena)
+                if other:
+                    where.append("attributes @> %s::jsonb")
+                    params.append(psycopg.types.json.Json(other))
+                if not where:
+                    raise HTTPException(400, "metadata_contains must specify arena or other filters")
+                sql = "DELETE FROM events WHERE " + " AND ".join(where) + " RETURNING id"
+                await cur.execute(sql, params)
+            rows = await cur.fetchall()
+            deleted_events = [row["id"] for row in rows]
+    # Drop the vectors. Qdrant supports payload-filter delete natively.
+    if deleted_events:
+        await _qdrant.delete(
+            collection_name=COLLECTION_NAME,
+            points_selector=qmodels.FilterSelector(
+                filter=qmodels.Filter(
+                    must=[qmodels.FieldCondition(
+                        key="event_id",
+                        match=qmodels.MatchAny(any=deleted_events),
+                    )]
+                )
+            ),
+        )
+    return {"deleted": len(deleted_events), "engine": "pme2"}