npm - @pentatonic-ai/ai-agent-sdk - Versions diffs - 0.10.5 → 0.10.7 - Mend

@pentatonic-ai/ai-agent-sdk 0.10.5 → 0.10.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/dist/index.cjs CHANGED

@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.5";
+var VERSION = "0.10.7";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/dist/index.js CHANGED

@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
 }
 // src/telemetry.js
-var VERSION = "0.10.5";
+var VERSION = "0.10.7";
 var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
 function machineId() {
   const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@pentatonic-ai/ai-agent-sdk",
-  "version": "0.10.5",
+  "version": "0.10.7",
   "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
   "type": "module",
   "main": "./dist/index.cjs",

package/packages/memory-engine-v2/compat/requirements.txt CHANGED

@@ -4,3 +4,9 @@ psycopg[binary,pool]==3.2.3
 httpx==0.27.2
 qdrant-client==1.12.1
 pydantic==2.9.2
+# BET 3 (hybrid retrieval): CPU-only BM25 sparse encoder for the named
+# 'lex' vector. 0.3.6 = the exact pin qdrant-client 1.12.1's own
+# [fastembed] extra uses (and it requires python <3.13 — the compat
+# image is python:3.12-slim). Only imported lazily when
+# SEARCH_HYBRID_ENABLED is on; flag-off behavior is unchanged.
+fastembed==0.3.6

package/packages/memory-engine-v2/compat/server.py CHANGED

@@ -102,6 +102,24 @@ SEARCH_INTENT_BOOST = os.environ.get("SEARCH_INTENT_BOOST", "1") not in ("0", "f
 # without a parseable timestamp sink to the bottom but aren't dropped.
 SEARCH_TEMPORAL_RERANK = os.environ.get("SEARCH_TEMPORAL_RERANK", "1") not in ("0", "false", "")
+# ── Hybrid lexical+dense retrieval (roadmap BET 3) ───────────────────
+# SEARCH_HYBRID_ENABLED gates EVERY hybrid behavior in one switch:
+#   - /store and /store-batch additionally write a NAMED sparse vector
+#     ("lex", BM25 term weights via fastembed, CPU-only) alongside the
+#     existing unnamed dense vector. The dense embedder + its vectors
+#     are never touched — additive only, zero dense re-embed.
+#   - startup runs an idempotent update_collection to add the sparse
+#     vector config ("lex": IDF modifier, on-disk index) when missing.
+#   - /search swaps the single dense search() for a server-side
+#     RRF-fused query_points(prefetch=[dense, lex]) — everything
+#     downstream (dedup → intent boost → MMR/temporal → quota →
+#     hydration) is untouched; the RRF score lands in r.score.
+# Default OFF (env unset/0/false): the request path is byte-identical
+# to pre-hybrid behavior and fastembed is never imported at all.
+SEARCH_HYBRID_ENABLED = os.environ.get("SEARCH_HYBRID_ENABLED", "") not in ("", "0", "false")
+SPARSE_VECTOR_NAME = "lex"
+SPARSE_MODEL_NAME = os.environ.get("SEARCH_SPARSE_MODEL", "Qdrant/bm25")
 TEMPORAL_INTENT_RE = re.compile(
     r"\b(when did|when was|last (?:time|met|saw|spoke|called)|"
     r"how long ago|first time (?:i|we) (?:met|saw|spoke)|recent(?:ly)?|"
@@ -116,6 +134,18 @@ FACTUAL_INTENT_RE = re.compile(
 )
 INTENT_BOOSTS: dict[str, dict[str, float]] = {
     # source_kind -> additive boost on cosine score
+    #
+    # ⚠️ HYBRID-RRF RECALIBRATION NEEDED (BET 3): these magnitudes were
+    # tuned against COSINE similarity scores (typical 0.7–0.85 range,
+    # where +0.06 flips a near-tie). When SEARCH_HYBRID_ENABLED is on,
+    # /search returns RRF fusion scores instead — 1/(k+rank) with
+    # Qdrant's k=60, i.e. ~0.016 at rank 1 decaying to ~0.006 at rank
+    # 100. On that scale a +0.06 additive boost is no longer a nudge:
+    # it catapults any matching source_kind above EVERY un-boosted
+    # result regardless of rank. Do not flip the hybrid flag to
+    # default-on until these are recalibrated against eval-harness
+    # numbers (see eval/recall_at_k.py); flag-off default protects
+    # prod until then.
     "temporal": {"event": 0.08, "doc": 0.04, "note": 0.02},
     "factual": {"doc": 0.06, "note": 0.03, "event": 0.03},
 }
@@ -184,6 +214,64 @@ def _apply_temporal_sort(
     return sorted(results, key=neg_ts)
+# ── Sparse (BM25) encoding — hybrid retrieval, BET 3 ─────────────────
+# fastembed's Qdrant/bm25 sparse encoder. CPU-only — no GPU contention
+# with the dense embed gateway. Lazily initialised so that (a) flag-off
+# deployments never import fastembed (it isn't even a hard dependency
+# of the request path) and (b) the model artifact download happens on
+# first use, not at process start.
+_sparse_encoder: Any | None = None
+def _get_sparse_encoder() -> Any:
+    global _sparse_encoder
+    if _sparse_encoder is None:
+        # Deferred import — module load must stay fastembed-free when
+        # SEARCH_HYBRID_ENABLED is off.
+        from fastembed import SparseTextEmbedding
+        _sparse_encoder = SparseTextEmbedding(model_name=SPARSE_MODEL_NAME)
+        log.info(f"sparse encoder initialised: {SPARSE_MODEL_NAME}")
+    return _sparse_encoder
+def _to_sparse_vector(emb: Any) -> qmodels.SparseVector:
+    """fastembed SparseEmbedding (numpy indices/values) → Qdrant model."""
+    return qmodels.SparseVector(
+        indices=[int(i) for i in emb.indices],
+        values=[float(v) for v in emb.values],
+    )
+async def _sparse_encode_documents(texts: list[str]) -> list[qmodels.SparseVector]:
+    """BM25-encode full document content for the named 'lex' vector.
+    Runs in a thread — fastembed is synchronous CPU work and must not
+    block the event loop under concurrent /store-batch load."""
+    enc = _get_sparse_encoder()
+    embs = await asyncio.to_thread(lambda: list(enc.embed(texts)))
+    return [_to_sparse_vector(e) for e in embs]
+async def _sparse_encode_query(text: str) -> qmodels.SparseVector:
+    """BM25-encode a query. `query_embed` (not `embed`) — BM25 weights
+    documents by term frequency/length but queries as bare term sets;
+    the IDF half lives server-side via Modifier.IDF on the collection."""
+    enc = _get_sparse_encoder()
+    embs = await asyncio.to_thread(lambda: list(enc.query_embed(text)))
+    return _to_sparse_vector(embs[0])
+def _dense_vector_of(candidate: Any) -> Any:
+    """Extract the dense vector from a scored point. With hybrid on,
+    Qdrant returns the full named-vector bag ({'': dense, 'lex':
+    sparse}); the dense vector rides the default '' slot. Flag-off
+    points return the bare list unchanged."""
+    v = getattr(candidate, "vector", None)
+    if isinstance(v, dict):
+        return v.get("")
+    return v
 def _mmr_select(
     candidates: list[Any], target: int, lambda_: float
 ) -> list[Any]:
@@ -199,10 +287,12 @@ def _mmr_select(
     if not candidates or target <= 0:
         return []
     # Bail to pure-relevance ordering if vectors weren't returned.
-    if any(getattr(c, "vector", None) is None for c in candidates):
+    # (_dense_vector_of unwraps the hybrid named-vector bag; flag-off
+    # bare-list vectors pass through unchanged.)
+    if any(_dense_vector_of(c) is None for c in candidates):
         return sorted(candidates, key=lambda r: r.score, reverse=True)[:target]
-    vecs = np.asarray([c.vector for c in candidates], dtype=np.float32)
+    vecs = np.asarray([_dense_vector_of(c) for c in candidates], dtype=np.float32)
     scores = np.asarray([c.score for c in candidates], dtype=np.float32)
     # Precompute pairwise similarity matrix; cheaper than per-step
     # dot products at our scale and lets us slice into it by index.
@@ -239,6 +329,47 @@ _qdrant: AsyncQdrantClient | None = None
 _http: httpx.AsyncClient | None = None
+def _sparse_vectors_config() -> dict[str, Any]:
+    """The 'lex' named-sparse-vector schema (BET 3).
+    Modifier.IDF — Qdrant computes/applies IDF server-side, so the
+    client-side BM25 encoding only needs term frequency × length
+    normalisation (which is exactly what fastembed's Qdrant/bm25
+    produces). on_disk index — the sparse index joins the dense
+    vectors on disk rather than competing for RAM; the 06-05 outage
+    was disk pressure, not RAM, and mmap/page-cache governs hot set
+    the same way the dense side is configured."""
+    return {
+        SPARSE_VECTOR_NAME: qmodels.SparseVectorParams(
+            modifier=qmodels.Modifier.IDF,
+            index=qmodels.SparseIndexParams(on_disk=True),
+        )
+    }
+async def _ensure_sparse_vector_config() -> bool:
+    """Idempotent collection migration: add the 'lex' sparse vector
+    config to the existing collection when missing. Called from
+    lifespan only when SEARCH_HYBRID_ENABLED — flag-off startups never
+    touch the collection config. Adding a sparse vector config is
+    additive metadata: existing points and the unnamed dense vector
+    are untouched (no re-embed, no rebuild). Returns True if the
+    config was added, False if already present."""
+    info = await _qdrant.get_collection(COLLECTION_NAME)
+    existing = getattr(info.config.params, "sparse_vectors", None) or {}
+    if SPARSE_VECTOR_NAME in existing:
+        return False
+    await _qdrant.update_collection(
+        collection_name=COLLECTION_NAME,
+        sparse_vectors_config=_sparse_vectors_config(),
+    )
+    log.info(
+        f"added sparse vector config '{SPARSE_VECTOR_NAME}' "
+        f"(modifier=idf, on_disk=true) to collection {COLLECTION_NAME}"
+    )
+    return True
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global _pool, _qdrant, _http
@@ -260,6 +391,13 @@ async def lifespan(app: FastAPI):
         collections = await _qdrant.get_collections()
         names = {c.name for c in collections.collections}
         if COLLECTION_NAME not in names:
+            create_kwargs: dict[str, Any] = {}
+            if SEARCH_HYBRID_ENABLED:
+                # Fresh collection with the flag on gets the 'lex'
+                # sparse config at creation time (BET 3); existing
+                # collections are migrated by
+                # _ensure_sparse_vector_config below.
+                create_kwargs["sparse_vectors_config"] = _sparse_vectors_config()
             await _qdrant.create_collection(
                 collection_name=COLLECTION_NAME,
                 vectors_config=qmodels.VectorParams(
@@ -275,6 +413,7 @@ async def lifespan(app: FastAPI):
                         always_ram=False,
                     )
                 ),
+                **create_kwargs,
             )
             log.info(f"created qdrant collection: {COLLECTION_NAME} dim={EMBED_DIM}")
             # Payload indexes for fast filtered search (this is the
@@ -286,6 +425,12 @@ async def lifespan(app: FastAPI):
                     field_schema=qmodels.PayloadSchemaType.KEYWORD,
                 )
             log.info("created qdrant payload indexes: arena, source_kind, clientId, userId")
+        if SEARCH_HYBRID_ENABLED:
+            # BET 3 migration — idempotent, additive-only; no-op when
+            # the 'lex' config is already present. Flag-off startups
+            # never reach this line, so the collection config is
+            # byte-identical to today until the flag is flipped.
+            await _ensure_sparse_vector_config()
     except Exception as e:
         log.error(f"qdrant init error: {e}")
         # Don't crash compat on Qdrant init failure — let liveness
@@ -553,6 +698,16 @@ async def store(req: StoreRequest):
     event_id = await _extract(arena, clientId, userId, source_kind, req.content, meta)
     embeddings = await _embed_batch([req.content])
+    # BET 3: BM25-encode the FULL content into the named 'lex' sparse
+    # vector. Encode failure degrades to dense-only (ingest must not
+    # fail on the lexical leg; the backfill script repairs gaps).
+    sparse_vec: Any | None = None
+    if SEARCH_HYBRID_ENABLED:
+        try:
+            sparse_vec = (await _sparse_encode_documents([req.content]))[0]
+        except Exception as e:
+            log.warning(f"sparse encode failed; storing dense-only (backfill repairs): {e}")
     vector_id = str(uuid.uuid4())
     # Write vector_provenance + Qdrant point in the same logical
     # operation. If Qdrant fails, the provenance row gets rolled back —
@@ -569,7 +724,15 @@ async def store(req: StoreRequest):
             points=[
                 qmodels.PointStruct(
                     id=vector_id,
-                    vector=embeddings[0],
+                    # Flag-off: bare dense list — byte-identical to
+                    # today. Flag-on: named-vector bag; the dense
+                    # vector keeps its unnamed ('') slot, 'lex' is
+                    # purely additive.
+                    vector=(
+                        embeddings[0]
+                        if sparse_vec is None
+                        else {"": embeddings[0], SPARSE_VECTOR_NAME: sparse_vec}
+                    ),
                     # Issue #345 (caps #342/#343/#344): Pip emits a rich
                     # metadata bag — timestamp, contact_email, channel,
                     # kind, direction, source, etc. Pre-fix the payload
@@ -621,6 +784,22 @@ async def store_batch(req: StoreBatchRequest):
     if len(embeddings) != len(texts):
         raise HTTPException(500, f"embed count mismatch: {len(embeddings)} vs {len(texts)}")
+    # BET 3: sparse-encode the FULL content batch for the named 'lex'
+    # vector. Best-effort — a sparse failure degrades the whole batch
+    # to dense-only rather than failing ingest (backfill repairs).
+    sparse_vecs: list[Any] | None = None
+    if SEARCH_HYBRID_ENABLED:
+        try:
+            sparse_vecs = await _sparse_encode_documents(texts)
+            if len(sparse_vecs) != len(texts):
+                log.warning(
+                    f"sparse encode count mismatch ({len(sparse_vecs)} vs {len(texts)}); storing dense-only"
+                )
+                sparse_vecs = None
+        except Exception as e:
+            log.warning(f"sparse encode failed; storing dense-only (backfill repairs): {e}")
+            sparse_vecs = None
     # Resolve per-record routing fields first so we can fan out the
     # extractor-sync calls in parallel. Each _extract is a network
     # round-trip; serialising them was the dominant cost in /store-batch
@@ -644,9 +823,9 @@ async def store_batch(req: StoreBatchRequest):
     ids: list[str] = []
     points: list[qmodels.PointStruct] = []
     provenance_rows: list[tuple] = []
-    for (arena, clientId, userId, source_kind, content, meta), vec, event_id in zip(
+    for idx, ((arena, clientId, userId, source_kind, content, meta), vec, event_id) in enumerate(zip(
         resolved, embeddings, event_ids
-    ):
+    )):
         vector_id = str(uuid.uuid4())
         provenance_rows.append((vector_id, event_id, "nv-embed-v2", EMBED_DIM))
         # See /store above — issue #345. Spread the caller's metadata
@@ -655,7 +834,13 @@ async def store_batch(req: StoreBatchRequest):
         # work with. Structural keys override on collision.
         points.append(qmodels.PointStruct(
             id=vector_id,
-            vector=vec,
+            # BET 3: flag-off keeps the bare dense list (byte-identical
+            # to today); flag-on adds the named 'lex' sparse vector.
+            vector=(
+                vec
+                if sparse_vecs is None
+                else {"": vec, SPARSE_VECTOR_NAME: sparse_vecs[idx]}
+            ),
             payload={
                 **(meta or {}),
                 "event_id": event_id,
@@ -896,18 +1081,73 @@ async def search(req: SearchRequest):
     # vector-payload bandwidth (4096 × float32 × overfetch) when
     # vectors won't be used.
     temporal_active = (intent == "temporal") and SEARCH_TEMPORAL_RERANK
-    raw_results = await _qdrant.search(
-        collection_name=COLLECTION_NAME,
-        query_vector=qvec,
-        query_filter=filter_,
-        limit=max(overfetch, target_limit),
-        score_threshold=req.min_score,
-        with_payload=True,
-        # Phase 3 (#343): MMR needs the actual vectors to score pairwise
-        # similarity. Only pull them when MMR is enabled AND we aren't
-        # about to skip MMR for a temporal re-rank.
-        with_vectors=SEARCH_MMR_ENABLED and not temporal_active,
-    )
+    fetch_limit = max(overfetch, target_limit)
+    # Phase 3 (#343): MMR needs the actual vectors to score pairwise
+    # similarity. Only pull them when MMR is enabled AND we aren't
+    # about to skip MMR for a temporal re-rank.
+    fetch_vectors = SEARCH_MMR_ENABLED and not temporal_active
+    # ── BET 3: hybrid lexical+dense retrieval ────────────────────────
+    # Flag on → encode the query with BM25 and replace the single dense
+    # search() with a server-side RRF fusion over two prefetch legs
+    # (dense on the unnamed '' vector, lexical on the named 'lex'
+    # sparse vector). Qdrant runs both legs inside one request, fuses
+    # by reciprocal rank (1/(k+rank), k=60), and the fused score lands
+    # in r.score — everything downstream (dedup → intent boost →
+    # MMR/temporal → quota → hydration) is untouched.
+    #
+    # ⚠️ SCORE-SCALE CAVEAT (recalibration required before default-on):
+    # RRF scores live on a ~0.006–0.033 scale, NOT the cosine 0.7–0.85
+    # scale the intent-boost magnitudes (+0.02…+0.08, see INTENT_BOOSTS)
+    # were tuned against. With hybrid on, those additive boosts dominate
+    # the fused ranking instead of nudging it. The flag-off default
+    # protects prod until eval-harness numbers (eval/recall_at_k.py)
+    # exist to recalibrate them. `min_score` is likewise a cosine-scale
+    # knob, so it is NOT applied to the fused path.
+    #
+    # A sparse-encode failure (e.g. fastembed missing/model fetch
+    # failed) logs and falls back to the legacy dense-only path —
+    # /search availability never depends on the lexical leg.
+    sparse_qvec: Any | None = None
+    if SEARCH_HYBRID_ENABLED:
+        try:
+            sparse_qvec = await _sparse_encode_query(req.query)
+        except Exception as e:
+            log.warning(f"sparse query encode failed; dense-only fallback: {e}")
+    if sparse_qvec is not None:
+        fused = await _qdrant.query_points(
+            collection_name=COLLECTION_NAME,
+            prefetch=[
+                qmodels.Prefetch(
+                    query=qvec,
+                    using="",  # the unnamed dense vector's internal name
+                    filter=filter_,
+                    limit=fetch_limit,
+                ),
+                qmodels.Prefetch(
+                    query=sparse_qvec,
+                    using=SPARSE_VECTOR_NAME,
+                    filter=filter_,
+                    limit=fetch_limit,
+                ),
+            ],
+            query=qmodels.FusionQuery(fusion=qmodels.Fusion.RRF),
+            limit=fetch_limit,
+            with_payload=True,
+            with_vectors=fetch_vectors,
+        )
+        raw_results = fused.points
+    else:
+        raw_results = await _qdrant.search(
+            collection_name=COLLECTION_NAME,
+            query_vector=qvec,
+            query_filter=filter_,
+            limit=fetch_limit,
+            score_threshold=req.min_score,
+            with_payload=True,
+            with_vectors=fetch_vectors,
+        )
     # (a) dedup by event_id — first occurrence wins (highest score).
     seen_eids: set[str] = set()

package/packages/memory-engine-v2/docker-compose.aws.yml CHANGED

@@ -19,6 +19,14 @@
 services:
   org-model:
+    # max_connections + shared_buffers must be passed via `-c` flags;
+    # the postgres:16-alpine image does NOT honor POSTGRES_MAX_CONNECTIONS
+    # or POSTGRES_SHARED_BUFFERS env vars (only POSTGRES_USER/PASSWORD/DB).
+    # 2026-05-19: bumped from compiled default 100 -> 200 after Pip's
+    # aborted-forget incident saturated the slots (4 stuck DELETEs +
+    # baseline pools). Shared_buffers raised to match the operator intent
+    # that was previously expressed in the unread env vars.
+    command: ["postgres", "-c", "max_connections=200", "-c", "shared_buffers=1GB"]
     environment:
       # Production tuning: bigger shared_buffers for the materialised
       # views, more connection slots for the extractor + compat pools.
@@ -45,8 +53,53 @@ services:
       PG_DSN: ${PME_V2_PG_DSN}
       LLM_ENDPOINT: ${PME_V2_LLM_ENDPOINT:-}
       LLM_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY:-}
+      # Default model id for the AWS self-hosted distiller (Qwen2.5-7B-Instruct
+      # via vLLM on i-0d658d1aa70b497a6, served as `qwen2.5-7b-instruct`).
+      # When PME_V2_LLM_ENDPOINT points back at the Lambda 30B gateway,
+      # override LLM_MODEL via env to that gateway's model id.
+      LLM_MODEL: ${LLM_MODEL:-qwen2.5-7b-instruct}
+      # Self-hosted distiller (Qwen3.6-27B-FP8 on L40S, served via the
+      # autoscaled fleet). Tuning vs the Lambda 30B fleet: smaller
+      # per-call chunks, higher concurrency, longer timeout.
+      #
+      # EVENTS_PER_LLM_CALL=3 (was 5) + LLM_MAX_TOKENS_PER_EVENT_JSON=900
+      # (was the 400 default): the guided-JSON max_tokens budget is
+      # SHARED across the chunk's events, so dense events (full email/doc
+      # bodies maxing 8 ent/6 fct/6 rel ≈ ~1.1k output tokens each)
+      # clustering in a 5-event chunk overran the old 2000-tok ceiling
+      # and truncated the JSON array tail — 15% of calls finished on
+      # `length` not `stop` (measured 2026-06-12). 3×900=2700 output +
+      # ~2100 prompt = ~4.8k, well inside the L40S's 8192 max-model-len
+      # (16384 OOMs the L40S), giving every event real headroom.
+      # Quality over throughput — the autoscaler adds boxes to recover
+      # the per-box throughput lost to smaller chunks.
+      EVENTS_PER_LLM_CALL: "3"
+      CONCURRENT_LLM_CALLS: "20"
+      LLM_MAX_TOKENS_PER_EVENT_JSON: "900"
+      LLM_TIMEOUT_SEC: "300"
       POLL_INTERVAL_SEC: "10"
-      CLAIM_TTL_SEC: "600"
+      CLAIM_TTL_SEC: "900"
+      POLL_INTERVAL_SEC_AFTER_EMPTY: "5"
+      # Skip-source list — never distil agent's-own-output, code ingest,
+      # orchestrator briefings, manual triage events into the graph.
+      # Source labels enumerated as they were observed leaking into prod
+      # over the weekend. New agent producers should be added here AND
+      # source_kind='agent' filtering should already drop them via worker.py.
+      DISTILL_SKIP_SOURCES: "pip-code-ingest,claude-code-plugin,openclaw-seesa,openclaw-plugin,openclaw-philip-mossop,openclaw-jamie,seesa,seesa-direct-curl-test,seesa-dedup-probe,orchestrator-web,briefing-morning,briefing-eod,triage-email,triage-manual"
+      # Trace logging — captures raw teacher I/O per distilled event into
+      # the distillation_traces table for student-model training data.
+      # Opt-in: defaults false here; set DISTILL_TRACE_ENABLED=true in
+      # SSM Parameter Store to flip on. See ai-events-sdk PR #74 for the
+      # worker-side logic + the migration that creates the table.
+      DISTILL_TRACE_ENABLED: ${DISTILL_TRACE_ENABLED:-false}
+      DISTILL_OUTPUT_MODE: ${DISTILL_OUTPUT_MODE:-kv}
+      DISTILL_GUIDED_PARAM_STYLE: ${DISTILL_GUIDED_PARAM_STYLE:-response_format}
+      # Chat-template switches forwarded verbatim on every completion
+      # (vLLM `chat_template_kwargs`). Required for thinking-capable
+      # teachers — Qwen3.x defaults enable_thinking=true, which burns
+      # the token budget on reasoning the distiller never reads. Set in
+      # SSM to '{"enable_thinking": false}' for the Qwen3.6 teacher.
+      DISTILL_CHAT_TEMPLATE_KWARGS: ${DISTILL_CHAT_TEMPLATE_KWARGS:-}
   compat:
     environment:
@@ -54,8 +107,15 @@ services:
       VECTOR_INDEX_URL: http://vector-index:6333
       EXTRACTOR_SYNC_URL: http://extractor-sync:8101
       NV_EMBED_URL: ${NV_EMBED_URL}
+      # Bulk embed lane (PR #76 ai-events-sdk) — separate box from the
+      # interactive lane so heavy backfills don't queue behind chat
+      # query embeds. Set in SSM to a different IP from NV_EMBED_URL.
+      NV_EMBED_URL_BULK: ${NV_EMBED_URL_BULK}
       NV_EMBED_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY}
       NV_EMBED_PROVIDER: pentatonic-gateway
+      SEARCH_HYBRID_ENABLED: ${SEARCH_HYBRID_ENABLED:-}
+      SEARCH_MMR_ENABLED: ${SEARCH_MMR_ENABLED:-1}
+      SEARCH_INTENT_BOOST: ${SEARCH_INTENT_BOOST:-1}
       EMBED_DIM: "4096"
   # Cloudflared tunnel — same pattern as v1. Optional; only start if
@@ -76,3 +136,4 @@ services:
     depends_on:
       compat:
         condition: service_healthy

package/packages/memory-engine-v2/docker-compose.yml CHANGED

@@ -74,7 +74,14 @@ services:
   # --------------------------------------------------------------------
   vector-index:
     <<: *engine-base
-    image: qdrant/qdrant:v1.12.4
+    # v1.18.2: minimum version whose API can ADD a named (sparse) vector
+    # to an existing collection (PUT /collections/{c}/vectors/{v}) —
+    # required by hybrid retrieval's 'lex' migration. Upgraded in prod
+    # 2026-06-11 by stepping minors 1.13.6→…→1.18.2 (the 1.12→1.18
+    # direct jump fails: segment.json "unknown variant `on_disk`").
+    # Do NOT lower this pin: 1.18-migrated storage cannot be read by
+    # older servers.
+    image: qdrant/qdrant:v1.18.2
     container_name: pme2-vector-index
     ports:
       - "127.0.0.1:${PME_V2_QDRANT_HTTP_PORT:-16333}:6333"