@pentatonic-ai/ai-agent-sdk 0.10.5 → 0.10.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/requirements.txt +6 -0
- package/packages/memory-engine-v2/compat/server.py +258 -18
- package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
- package/packages/memory-engine-v2/docker-compose.yml +8 -1
- package/packages/memory-engine-v2/eval/recall_at_k.py +242 -0
- package/packages/memory-engine-v2/eval/retrieval_golden.seed.json +69 -0
- package/packages/memory-engine-v2/extractor-async/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-async/extraction_schema.py +246 -0
- package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +455 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +391 -31
- package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
- package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
- package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
- package/packages/memory-engine-v2/resolution-queue-design.md +165 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +11 -2
- package/packages/memory-engine-v2/scripts/backfill_sparse_vectors.py +369 -0
- package/packages/memory-engine-v2/scripts/bakeoff_guided_vs_kv.py +607 -0
- package/packages/memory-engine-v2/scripts/entity_resolution_v2.py +1041 -0
- package/packages/memory-engine-v2/tests/test_entity_resolution_v2.py +507 -0
- package/packages/memory-engine-v2/tests/test_hybrid_retrieval.py +810 -0
package/dist/index.cjs
CHANGED
|
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
// src/telemetry.js
|
|
881
|
-
var VERSION = "0.10.5";
|
|
881
|
+
var VERSION = "0.10.7";
|
|
882
882
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
883
883
|
function machineId() {
|
|
884
884
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/dist/index.js
CHANGED
|
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
847
847
|
}
|
|
848
848
|
|
|
849
849
|
// src/telemetry.js
|
|
850
|
-
var VERSION = "0.10.5";
|
|
850
|
+
var VERSION = "0.10.7";
|
|
851
851
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
852
852
|
function machineId() {
|
|
853
853
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pentatonic-ai/ai-agent-sdk",
|
|
3
|
-
"version": "0.10.5",
|
|
3
|
+
"version": "0.10.7",
|
|
4
4
|
"description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
package/packages/memory-engine-v2/compat/requirements.txt
CHANGED
|
@@ -4,3 +4,9 @@ psycopg[binary,pool]==3.2.3
|
|
|
4
4
|
httpx==0.27.2
|
|
5
5
|
qdrant-client==1.12.1
|
|
6
6
|
pydantic==2.9.2
|
|
7
|
+
# BET 3 (hybrid retrieval): CPU-only BM25 sparse encoder for the named
|
|
8
|
+
# 'lex' vector. 0.3.6 = the exact pin qdrant-client 1.12.1's own
|
|
9
|
+
# [fastembed] extra uses (and it requires python <3.13 — the compat
|
|
10
|
+
# image is python:3.12-slim). Only imported lazily when
|
|
11
|
+
# SEARCH_HYBRID_ENABLED is on; flag-off behavior is unchanged.
|
|
12
|
+
fastembed==0.3.6
|
|
package/packages/memory-engine-v2/compat/server.py
CHANGED
|
@@ -102,6 +102,24 @@ SEARCH_INTENT_BOOST = os.environ.get("SEARCH_INTENT_BOOST", "1") not in ("0", "f
|
|
|
102
102
|
# without a parseable timestamp sink to the bottom but aren't dropped.
|
|
103
103
|
SEARCH_TEMPORAL_RERANK = os.environ.get("SEARCH_TEMPORAL_RERANK", "1") not in ("0", "false", "")
|
|
104
104
|
|
|
105
|
+
# ── Hybrid lexical+dense retrieval (roadmap BET 3) ───────────────────
|
|
106
|
+
# SEARCH_HYBRID_ENABLED gates EVERY hybrid behavior in one switch:
|
|
107
|
+
# - /store and /store-batch additionally write a NAMED sparse vector
|
|
108
|
+
# ("lex", BM25 term weights via fastembed, CPU-only) alongside the
|
|
109
|
+
# existing unnamed dense vector. The dense embedder + its vectors
|
|
110
|
+
# are never touched — additive only, zero dense re-embed.
|
|
111
|
+
# - startup runs an idempotent update_collection to add the sparse
|
|
112
|
+
# vector config ("lex": IDF modifier, on-disk index) when missing.
|
|
113
|
+
# - /search swaps the single dense search() for a server-side
|
|
114
|
+
# RRF-fused query_points(prefetch=[dense, lex]) — everything
|
|
115
|
+
# downstream (dedup → intent boost → MMR/temporal → quota →
|
|
116
|
+
# hydration) is untouched; the RRF score lands in r.score.
|
|
117
|
+
# Default OFF (env unset/0/false): the request path is byte-identical
|
|
118
|
+
# to pre-hybrid behavior and fastembed is never imported at all.
|
|
119
|
+
SEARCH_HYBRID_ENABLED = os.environ.get("SEARCH_HYBRID_ENABLED", "") not in ("", "0", "false")
|
|
120
|
+
SPARSE_VECTOR_NAME = "lex"
|
|
121
|
+
SPARSE_MODEL_NAME = os.environ.get("SEARCH_SPARSE_MODEL", "Qdrant/bm25")
|
|
122
|
+
|
|
105
123
|
TEMPORAL_INTENT_RE = re.compile(
|
|
106
124
|
r"\b(when did|when was|last (?:time|met|saw|spoke|called)|"
|
|
107
125
|
r"how long ago|first time (?:i|we) (?:met|saw|spoke)|recent(?:ly)?|"
|
|
@@ -116,6 +134,18 @@ FACTUAL_INTENT_RE = re.compile(
|
|
|
116
134
|
)
|
|
117
135
|
INTENT_BOOSTS: dict[str, dict[str, float]] = {
|
|
118
136
|
# source_kind -> additive boost on cosine score
|
|
137
|
+
#
|
|
138
|
+
# ⚠️ HYBRID-RRF RECALIBRATION NEEDED (BET 3): these magnitudes were
|
|
139
|
+
# tuned against COSINE similarity scores (typical 0.7–0.85 range,
|
|
140
|
+
# where +0.06 flips a near-tie). When SEARCH_HYBRID_ENABLED is on,
|
|
141
|
+
# /search returns RRF fusion scores instead — 1/(k+rank) with
|
|
142
|
+
# Qdrant's k=60, i.e. ~0.016 at rank 1 decaying to ~0.006 at rank
|
|
143
|
+
# 100. On that scale a +0.06 additive boost is no longer a nudge:
|
|
144
|
+
# it catapults any matching source_kind above EVERY un-boosted
|
|
145
|
+
# result regardless of rank. Do not flip the hybrid flag to
|
|
146
|
+
# default-on until these are recalibrated against eval-harness
|
|
147
|
+
# numbers (see eval/recall_at_k.py); flag-off default protects
|
|
148
|
+
# prod until then.
|
|
119
149
|
"temporal": {"event": 0.08, "doc": 0.04, "note": 0.02},
|
|
120
150
|
"factual": {"doc": 0.06, "note": 0.03, "event": 0.03},
|
|
121
151
|
}
|
|
@@ -184,6 +214,64 @@ def _apply_temporal_sort(
|
|
|
184
214
|
return sorted(results, key=neg_ts)
|
|
185
215
|
|
|
186
216
|
|
|
217
|
+
# ── Sparse (BM25) encoding — hybrid retrieval, BET 3 ─────────────────
|
|
218
|
+
# fastembed's Qdrant/bm25 sparse encoder. CPU-only — no GPU contention
|
|
219
|
+
# with the dense embed gateway. Lazily initialised so that (a) flag-off
|
|
220
|
+
# deployments never import fastembed (it isn't even a hard dependency
|
|
221
|
+
# of the request path) and (b) the model artifact download happens on
|
|
222
|
+
# first use, not at process start.
|
|
223
|
+
_sparse_encoder: Any | None = None
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _get_sparse_encoder() -> Any:
|
|
227
|
+
global _sparse_encoder
|
|
228
|
+
if _sparse_encoder is None:
|
|
229
|
+
# Deferred import — module load must stay fastembed-free when
|
|
230
|
+
# SEARCH_HYBRID_ENABLED is off.
|
|
231
|
+
from fastembed import SparseTextEmbedding
|
|
232
|
+
|
|
233
|
+
_sparse_encoder = SparseTextEmbedding(model_name=SPARSE_MODEL_NAME)
|
|
234
|
+
log.info(f"sparse encoder initialised: {SPARSE_MODEL_NAME}")
|
|
235
|
+
return _sparse_encoder
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _to_sparse_vector(emb: Any) -> qmodels.SparseVector:
|
|
239
|
+
"""fastembed SparseEmbedding (numpy indices/values) → Qdrant model."""
|
|
240
|
+
return qmodels.SparseVector(
|
|
241
|
+
indices=[int(i) for i in emb.indices],
|
|
242
|
+
values=[float(v) for v in emb.values],
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
async def _sparse_encode_documents(texts: list[str]) -> list[qmodels.SparseVector]:
|
|
247
|
+
"""BM25-encode full document content for the named 'lex' vector.
|
|
248
|
+
Runs in a thread — fastembed is synchronous CPU work and must not
|
|
249
|
+
block the event loop under concurrent /store-batch load."""
|
|
250
|
+
enc = _get_sparse_encoder()
|
|
251
|
+
embs = await asyncio.to_thread(lambda: list(enc.embed(texts)))
|
|
252
|
+
return [_to_sparse_vector(e) for e in embs]
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
async def _sparse_encode_query(text: str) -> qmodels.SparseVector:
|
|
256
|
+
"""BM25-encode a query. `query_embed` (not `embed`) — BM25 weights
|
|
257
|
+
documents by term frequency/length but queries as bare term sets;
|
|
258
|
+
the IDF half lives server-side via Modifier.IDF on the collection."""
|
|
259
|
+
enc = _get_sparse_encoder()
|
|
260
|
+
embs = await asyncio.to_thread(lambda: list(enc.query_embed(text)))
|
|
261
|
+
return _to_sparse_vector(embs[0])
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _dense_vector_of(candidate: Any) -> Any:
|
|
265
|
+
"""Extract the dense vector from a scored point. With hybrid on,
|
|
266
|
+
Qdrant returns the full named-vector bag ({'': dense, 'lex':
|
|
267
|
+
sparse}); the dense vector rides the default '' slot. Flag-off
|
|
268
|
+
points return the bare list unchanged."""
|
|
269
|
+
v = getattr(candidate, "vector", None)
|
|
270
|
+
if isinstance(v, dict):
|
|
271
|
+
return v.get("")
|
|
272
|
+
return v
|
|
273
|
+
|
|
274
|
+
|
|
187
275
|
def _mmr_select(
|
|
188
276
|
candidates: list[Any], target: int, lambda_: float
|
|
189
277
|
) -> list[Any]:
|
|
@@ -199,10 +287,12 @@ def _mmr_select(
|
|
|
199
287
|
if not candidates or target <= 0:
|
|
200
288
|
return []
|
|
201
289
|
# Bail to pure-relevance ordering if vectors weren't returned.
|
|
202
|
-
|
|
290
|
+
# (_dense_vector_of unwraps the hybrid named-vector bag; flag-off
|
|
291
|
+
# bare-list vectors pass through unchanged.)
|
|
292
|
+
if any(_dense_vector_of(c) is None for c in candidates):
|
|
203
293
|
return sorted(candidates, key=lambda r: r.score, reverse=True)[:target]
|
|
204
294
|
|
|
205
|
-
vecs = np.asarray([c.vector for c in candidates], dtype=np.float32)
|
|
295
|
+
vecs = np.asarray([_dense_vector_of(c) for c in candidates], dtype=np.float32)
|
|
206
296
|
scores = np.asarray([c.score for c in candidates], dtype=np.float32)
|
|
207
297
|
# Precompute pairwise similarity matrix; cheaper than per-step
|
|
208
298
|
# dot products at our scale and lets us slice into it by index.
|
|
@@ -239,6 +329,47 @@ _qdrant: AsyncQdrantClient | None = None
|
|
|
239
329
|
_http: httpx.AsyncClient | None = None
|
|
240
330
|
|
|
241
331
|
|
|
332
|
+
def _sparse_vectors_config() -> dict[str, Any]:
|
|
333
|
+
"""The 'lex' named-sparse-vector schema (BET 3).
|
|
334
|
+
|
|
335
|
+
Modifier.IDF — Qdrant computes/applies IDF server-side, so the
|
|
336
|
+
client-side BM25 encoding only needs term frequency × length
|
|
337
|
+
normalisation (which is exactly what fastembed's Qdrant/bm25
|
|
338
|
+
produces). on_disk index — the sparse index joins the dense
|
|
339
|
+
vectors on disk rather than competing for RAM; the 06-05 outage
|
|
340
|
+
was disk pressure, not RAM, and mmap/page-cache governs hot set
|
|
341
|
+
the same way the dense side is configured."""
|
|
342
|
+
return {
|
|
343
|
+
SPARSE_VECTOR_NAME: qmodels.SparseVectorParams(
|
|
344
|
+
modifier=qmodels.Modifier.IDF,
|
|
345
|
+
index=qmodels.SparseIndexParams(on_disk=True),
|
|
346
|
+
)
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
async def _ensure_sparse_vector_config() -> bool:
|
|
351
|
+
"""Idempotent collection migration: add the 'lex' sparse vector
|
|
352
|
+
config to the existing collection when missing. Called from
|
|
353
|
+
lifespan only when SEARCH_HYBRID_ENABLED — flag-off startups never
|
|
354
|
+
touch the collection config. Adding a sparse vector config is
|
|
355
|
+
additive metadata: existing points and the unnamed dense vector
|
|
356
|
+
are untouched (no re-embed, no rebuild). Returns True if the
|
|
357
|
+
config was added, False if already present."""
|
|
358
|
+
info = await _qdrant.get_collection(COLLECTION_NAME)
|
|
359
|
+
existing = getattr(info.config.params, "sparse_vectors", None) or {}
|
|
360
|
+
if SPARSE_VECTOR_NAME in existing:
|
|
361
|
+
return False
|
|
362
|
+
await _qdrant.update_collection(
|
|
363
|
+
collection_name=COLLECTION_NAME,
|
|
364
|
+
sparse_vectors_config=_sparse_vectors_config(),
|
|
365
|
+
)
|
|
366
|
+
log.info(
|
|
367
|
+
f"added sparse vector config '{SPARSE_VECTOR_NAME}' "
|
|
368
|
+
f"(modifier=idf, on_disk=true) to collection {COLLECTION_NAME}"
|
|
369
|
+
)
|
|
370
|
+
return True
|
|
371
|
+
|
|
372
|
+
|
|
242
373
|
@asynccontextmanager
|
|
243
374
|
async def lifespan(app: FastAPI):
|
|
244
375
|
global _pool, _qdrant, _http
|
|
@@ -260,6 +391,13 @@ async def lifespan(app: FastAPI):
|
|
|
260
391
|
collections = await _qdrant.get_collections()
|
|
261
392
|
names = {c.name for c in collections.collections}
|
|
262
393
|
if COLLECTION_NAME not in names:
|
|
394
|
+
create_kwargs: dict[str, Any] = {}
|
|
395
|
+
if SEARCH_HYBRID_ENABLED:
|
|
396
|
+
# Fresh collection with the flag on gets the 'lex'
|
|
397
|
+
# sparse config at creation time (BET 3); existing
|
|
398
|
+
# collections are migrated by
|
|
399
|
+
# _ensure_sparse_vector_config below.
|
|
400
|
+
create_kwargs["sparse_vectors_config"] = _sparse_vectors_config()
|
|
263
401
|
await _qdrant.create_collection(
|
|
264
402
|
collection_name=COLLECTION_NAME,
|
|
265
403
|
vectors_config=qmodels.VectorParams(
|
|
@@ -275,6 +413,7 @@ async def lifespan(app: FastAPI):
|
|
|
275
413
|
always_ram=False,
|
|
276
414
|
)
|
|
277
415
|
),
|
|
416
|
+
**create_kwargs,
|
|
278
417
|
)
|
|
279
418
|
log.info(f"created qdrant collection: {COLLECTION_NAME} dim={EMBED_DIM}")
|
|
280
419
|
# Payload indexes for fast filtered search (this is the
|
|
@@ -286,6 +425,12 @@ async def lifespan(app: FastAPI):
|
|
|
286
425
|
field_schema=qmodels.PayloadSchemaType.KEYWORD,
|
|
287
426
|
)
|
|
288
427
|
log.info("created qdrant payload indexes: arena, source_kind, clientId, userId")
|
|
428
|
+
if SEARCH_HYBRID_ENABLED:
|
|
429
|
+
# BET 3 migration — idempotent, additive-only; no-op when
|
|
430
|
+
# the 'lex' config is already present. Flag-off startups
|
|
431
|
+
# never reach this line, so the collection config is
|
|
432
|
+
# byte-identical to today until the flag is flipped.
|
|
433
|
+
await _ensure_sparse_vector_config()
|
|
289
434
|
except Exception as e:
|
|
290
435
|
log.error(f"qdrant init error: {e}")
|
|
291
436
|
# Don't crash compat on Qdrant init failure — let liveness
|
|
@@ -553,6 +698,16 @@ async def store(req: StoreRequest):
|
|
|
553
698
|
event_id = await _extract(arena, clientId, userId, source_kind, req.content, meta)
|
|
554
699
|
embeddings = await _embed_batch([req.content])
|
|
555
700
|
|
|
701
|
+
# BET 3: BM25-encode the FULL content into the named 'lex' sparse
|
|
702
|
+
# vector. Encode failure degrades to dense-only (ingest must not
|
|
703
|
+
# fail on the lexical leg; the backfill script repairs gaps).
|
|
704
|
+
sparse_vec: Any | None = None
|
|
705
|
+
if SEARCH_HYBRID_ENABLED:
|
|
706
|
+
try:
|
|
707
|
+
sparse_vec = (await _sparse_encode_documents([req.content]))[0]
|
|
708
|
+
except Exception as e:
|
|
709
|
+
log.warning(f"sparse encode failed; storing dense-only (backfill repairs): {e}")
|
|
710
|
+
|
|
556
711
|
vector_id = str(uuid.uuid4())
|
|
557
712
|
# Write vector_provenance + Qdrant point in the same logical
|
|
558
713
|
# operation. If Qdrant fails, the provenance row gets rolled back —
|
|
@@ -569,7 +724,15 @@ async def store(req: StoreRequest):
|
|
|
569
724
|
points=[
|
|
570
725
|
qmodels.PointStruct(
|
|
571
726
|
id=vector_id,
|
|
572
|
-
|
|
727
|
+
# Flag-off: bare dense list — byte-identical to
|
|
728
|
+
# today. Flag-on: named-vector bag; the dense
|
|
729
|
+
# vector keeps its unnamed ('') slot, 'lex' is
|
|
730
|
+
# purely additive.
|
|
731
|
+
vector=(
|
|
732
|
+
embeddings[0]
|
|
733
|
+
if sparse_vec is None
|
|
734
|
+
else {"": embeddings[0], SPARSE_VECTOR_NAME: sparse_vec}
|
|
735
|
+
),
|
|
573
736
|
# Issue #345 (caps #342/#343/#344): Pip emits a rich
|
|
574
737
|
# metadata bag — timestamp, contact_email, channel,
|
|
575
738
|
# kind, direction, source, etc. Pre-fix the payload
|
|
@@ -621,6 +784,22 @@ async def store_batch(req: StoreBatchRequest):
|
|
|
621
784
|
if len(embeddings) != len(texts):
|
|
622
785
|
raise HTTPException(500, f"embed count mismatch: {len(embeddings)} vs {len(texts)}")
|
|
623
786
|
|
|
787
|
+
# BET 3: sparse-encode the FULL content batch for the named 'lex'
|
|
788
|
+
# vector. Best-effort — a sparse failure degrades the whole batch
|
|
789
|
+
# to dense-only rather than failing ingest (backfill repairs).
|
|
790
|
+
sparse_vecs: list[Any] | None = None
|
|
791
|
+
if SEARCH_HYBRID_ENABLED:
|
|
792
|
+
try:
|
|
793
|
+
sparse_vecs = await _sparse_encode_documents(texts)
|
|
794
|
+
if len(sparse_vecs) != len(texts):
|
|
795
|
+
log.warning(
|
|
796
|
+
f"sparse encode count mismatch ({len(sparse_vecs)} vs {len(texts)}); storing dense-only"
|
|
797
|
+
)
|
|
798
|
+
sparse_vecs = None
|
|
799
|
+
except Exception as e:
|
|
800
|
+
log.warning(f"sparse encode failed; storing dense-only (backfill repairs): {e}")
|
|
801
|
+
sparse_vecs = None
|
|
802
|
+
|
|
624
803
|
# Resolve per-record routing fields first so we can fan out the
|
|
625
804
|
# extractor-sync calls in parallel. Each _extract is a network
|
|
626
805
|
# round-trip; serialising them was the dominant cost in /store-batch
|
|
@@ -644,9 +823,9 @@ async def store_batch(req: StoreBatchRequest):
|
|
|
644
823
|
ids: list[str] = []
|
|
645
824
|
points: list[qmodels.PointStruct] = []
|
|
646
825
|
provenance_rows: list[tuple] = []
|
|
647
|
-
for (arena, clientId, userId, source_kind, content, meta), vec, event_id in zip(
|
|
826
|
+
for idx, ((arena, clientId, userId, source_kind, content, meta), vec, event_id) in enumerate(zip(
|
|
648
827
|
resolved, embeddings, event_ids
|
|
649
|
-
):
|
|
828
|
+
)):
|
|
650
829
|
vector_id = str(uuid.uuid4())
|
|
651
830
|
provenance_rows.append((vector_id, event_id, "nv-embed-v2", EMBED_DIM))
|
|
652
831
|
# See /store above — issue #345. Spread the caller's metadata
|
|
@@ -655,7 +834,13 @@ async def store_batch(req: StoreBatchRequest):
|
|
|
655
834
|
# work with. Structural keys override on collision.
|
|
656
835
|
points.append(qmodels.PointStruct(
|
|
657
836
|
id=vector_id,
|
|
658
|
-
|
|
837
|
+
# BET 3: flag-off keeps the bare dense list (byte-identical
|
|
838
|
+
# to today); flag-on adds the named 'lex' sparse vector.
|
|
839
|
+
vector=(
|
|
840
|
+
vec
|
|
841
|
+
if sparse_vecs is None
|
|
842
|
+
else {"": vec, SPARSE_VECTOR_NAME: sparse_vecs[idx]}
|
|
843
|
+
),
|
|
659
844
|
payload={
|
|
660
845
|
**(meta or {}),
|
|
661
846
|
"event_id": event_id,
|
|
@@ -896,18 +1081,73 @@ async def search(req: SearchRequest):
|
|
|
896
1081
|
# vector-payload bandwidth (4096 × float32 × overfetch) when
|
|
897
1082
|
# vectors won't be used.
|
|
898
1083
|
temporal_active = (intent == "temporal") and SEARCH_TEMPORAL_RERANK
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
)
|
|
1084
|
+
fetch_limit = max(overfetch, target_limit)
|
|
1085
|
+
# Phase 3 (#343): MMR needs the actual vectors to score pairwise
|
|
1086
|
+
# similarity. Only pull them when MMR is enabled AND we aren't
|
|
1087
|
+
# about to skip MMR for a temporal re-rank.
|
|
1088
|
+
fetch_vectors = SEARCH_MMR_ENABLED and not temporal_active
|
|
1089
|
+
|
|
1090
|
+
# ── BET 3: hybrid lexical+dense retrieval ────────────────────────
|
|
1091
|
+
# Flag on → encode the query with BM25 and replace the single dense
|
|
1092
|
+
# search() with a server-side RRF fusion over two prefetch legs
|
|
1093
|
+
# (dense on the unnamed '' vector, lexical on the named 'lex'
|
|
1094
|
+
# sparse vector). Qdrant runs both legs inside one request, fuses
|
|
1095
|
+
# by reciprocal rank (1/(k+rank), k=60), and the fused score lands
|
|
1096
|
+
# in r.score — everything downstream (dedup → intent boost →
|
|
1097
|
+
# MMR/temporal → quota → hydration) is untouched.
|
|
1098
|
+
#
|
|
1099
|
+
# ⚠️ SCORE-SCALE CAVEAT (recalibration required before default-on):
|
|
1100
|
+
# RRF scores live on a ~0.006–0.033 scale, NOT the cosine 0.7–0.85
|
|
1101
|
+
# scale the intent-boost magnitudes (+0.02…+0.08, see INTENT_BOOSTS)
|
|
1102
|
+
# were tuned against. With hybrid on, those additive boosts dominate
|
|
1103
|
+
# the fused ranking instead of nudging it. The flag-off default
|
|
1104
|
+
# protects prod until eval-harness numbers (eval/recall_at_k.py)
|
|
1105
|
+
# exist to recalibrate them. `min_score` is likewise a cosine-scale
|
|
1106
|
+
# knob, so it is NOT applied to the fused path.
|
|
1107
|
+
#
|
|
1108
|
+
# A sparse-encode failure (e.g. fastembed missing/model fetch
|
|
1109
|
+
# failed) logs and falls back to the legacy dense-only path —
|
|
1110
|
+
# /search availability never depends on the lexical leg.
|
|
1111
|
+
sparse_qvec: Any | None = None
|
|
1112
|
+
if SEARCH_HYBRID_ENABLED:
|
|
1113
|
+
try:
|
|
1114
|
+
sparse_qvec = await _sparse_encode_query(req.query)
|
|
1115
|
+
except Exception as e:
|
|
1116
|
+
log.warning(f"sparse query encode failed; dense-only fallback: {e}")
|
|
1117
|
+
|
|
1118
|
+
if sparse_qvec is not None:
|
|
1119
|
+
fused = await _qdrant.query_points(
|
|
1120
|
+
collection_name=COLLECTION_NAME,
|
|
1121
|
+
prefetch=[
|
|
1122
|
+
qmodels.Prefetch(
|
|
1123
|
+
query=qvec,
|
|
1124
|
+
using="", # the unnamed dense vector's internal name
|
|
1125
|
+
filter=filter_,
|
|
1126
|
+
limit=fetch_limit,
|
|
1127
|
+
),
|
|
1128
|
+
qmodels.Prefetch(
|
|
1129
|
+
query=sparse_qvec,
|
|
1130
|
+
using=SPARSE_VECTOR_NAME,
|
|
1131
|
+
filter=filter_,
|
|
1132
|
+
limit=fetch_limit,
|
|
1133
|
+
),
|
|
1134
|
+
],
|
|
1135
|
+
query=qmodels.FusionQuery(fusion=qmodels.Fusion.RRF),
|
|
1136
|
+
limit=fetch_limit,
|
|
1137
|
+
with_payload=True,
|
|
1138
|
+
with_vectors=fetch_vectors,
|
|
1139
|
+
)
|
|
1140
|
+
raw_results = fused.points
|
|
1141
|
+
else:
|
|
1142
|
+
raw_results = await _qdrant.search(
|
|
1143
|
+
collection_name=COLLECTION_NAME,
|
|
1144
|
+
query_vector=qvec,
|
|
1145
|
+
query_filter=filter_,
|
|
1146
|
+
limit=fetch_limit,
|
|
1147
|
+
score_threshold=req.min_score,
|
|
1148
|
+
with_payload=True,
|
|
1149
|
+
with_vectors=fetch_vectors,
|
|
1150
|
+
)
|
|
911
1151
|
|
|
912
1152
|
# (a) dedup by event_id — first occurrence wins (highest score).
|
|
913
1153
|
seen_eids: set[str] = set()
|
|
package/packages/memory-engine-v2/docker-compose.aws.yml
CHANGED
|
@@ -19,6 +19,14 @@
|
|
|
19
19
|
|
|
20
20
|
services:
|
|
21
21
|
org-model:
|
|
22
|
+
# max_connections + shared_buffers must be passed via `-c` flags;
|
|
23
|
+
# the postgres:16-alpine image does NOT honor POSTGRES_MAX_CONNECTIONS
|
|
24
|
+
# or POSTGRES_SHARED_BUFFERS env vars (only POSTGRES_USER/PASSWORD/DB).
|
|
25
|
+
# 2026-05-19: bumped from compiled default 100 -> 200 after Pip's
|
|
26
|
+
# aborted-forget incident saturated the slots (4 stuck DELETEs +
|
|
27
|
+
# baseline pools). Shared_buffers raised to match the operator intent
|
|
28
|
+
# that was previously expressed in the unread env vars.
|
|
29
|
+
command: ["postgres", "-c", "max_connections=200", "-c", "shared_buffers=1GB"]
|
|
22
30
|
environment:
|
|
23
31
|
# Production tuning: bigger shared_buffers for the materialised
|
|
24
32
|
# views, more connection slots for the extractor + compat pools.
|
|
@@ -45,8 +53,53 @@ services:
|
|
|
45
53
|
PG_DSN: ${PME_V2_PG_DSN}
|
|
46
54
|
LLM_ENDPOINT: ${PME_V2_LLM_ENDPOINT:-}
|
|
47
55
|
LLM_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY:-}
|
|
56
|
+
# Default model id for the AWS self-hosted distiller (Qwen2.5-7B-Instruct
|
|
57
|
+
# via vLLM on i-0d658d1aa70b497a6, served as `qwen2.5-7b-instruct`).
|
|
58
|
+
# When PME_V2_LLM_ENDPOINT points back at the Lambda 30B gateway,
|
|
59
|
+
# override LLM_MODEL via env to that gateway's model id.
|
|
60
|
+
LLM_MODEL: ${LLM_MODEL:-qwen2.5-7b-instruct}
|
|
61
|
+
# Self-hosted distiller (Qwen3.6-27B-FP8 on L40S, served via the
|
|
62
|
+
# autoscaled fleet). Tuning vs the Lambda 30B fleet: smaller
|
|
63
|
+
# per-call chunks, higher concurrency, longer timeout.
|
|
64
|
+
#
|
|
65
|
+
# EVENTS_PER_LLM_CALL=3 (was 5) + LLM_MAX_TOKENS_PER_EVENT_JSON=900
|
|
66
|
+
# (was the 400 default): the guided-JSON max_tokens budget is
|
|
67
|
+
# SHARED across the chunk's events, so dense events (full email/doc
|
|
68
|
+
# bodies maxing 8 ent/6 fct/6 rel ≈ ~1.1k output tokens each)
|
|
69
|
+
# clustering in a 5-event chunk overran the old 2000-tok ceiling
|
|
70
|
+
# and truncated the JSON array tail — 15% of calls finished on
|
|
71
|
+
# `length` not `stop` (measured 2026-06-12). 3×900=2700 output +
|
|
72
|
+
# ~2100 prompt = ~4.8k, well inside the L40S's 8192 max-model-len
|
|
73
|
+
# (16384 OOMs the L40S), giving every event real headroom.
|
|
74
|
+
# Quality over throughput — the autoscaler adds boxes to recover
|
|
75
|
+
# the per-box throughput lost to smaller chunks.
|
|
76
|
+
EVENTS_PER_LLM_CALL: "3"
|
|
77
|
+
CONCURRENT_LLM_CALLS: "20"
|
|
78
|
+
LLM_MAX_TOKENS_PER_EVENT_JSON: "900"
|
|
79
|
+
LLM_TIMEOUT_SEC: "300"
|
|
48
80
|
POLL_INTERVAL_SEC: "10"
|
|
49
|
-
CLAIM_TTL_SEC: "
|
|
81
|
+
CLAIM_TTL_SEC: "900"
|
|
82
|
+
POLL_INTERVAL_SEC_AFTER_EMPTY: "5"
|
|
83
|
+
# Skip-source list — never distil agent's-own-output, code ingest,
|
|
84
|
+
# orchestrator briefings, manual triage events into the graph.
|
|
85
|
+
# Source labels enumerated as they were observed leaking into prod
|
|
86
|
+
# over the weekend. New agent producers should be added here AND
|
|
87
|
+
# source_kind='agent' filtering should already drop them via worker.py.
|
|
88
|
+
DISTILL_SKIP_SOURCES: "pip-code-ingest,claude-code-plugin,openclaw-seesa,openclaw-plugin,openclaw-philip-mossop,openclaw-jamie,seesa,seesa-direct-curl-test,seesa-dedup-probe,orchestrator-web,briefing-morning,briefing-eod,triage-email,triage-manual"
|
|
89
|
+
# Trace logging — captures raw teacher I/O per distilled event into
|
|
90
|
+
# the distillation_traces table for student-model training data.
|
|
91
|
+
# Opt-in: defaults false here; set DISTILL_TRACE_ENABLED=true in
|
|
92
|
+
# SSM Parameter Store to flip on. See ai-events-sdk PR #74 for the
|
|
93
|
+
# worker-side logic + the migration that creates the table.
|
|
94
|
+
DISTILL_TRACE_ENABLED: ${DISTILL_TRACE_ENABLED:-false}
|
|
95
|
+
DISTILL_OUTPUT_MODE: ${DISTILL_OUTPUT_MODE:-kv}
|
|
96
|
+
DISTILL_GUIDED_PARAM_STYLE: ${DISTILL_GUIDED_PARAM_STYLE:-response_format}
|
|
97
|
+
# Chat-template switches forwarded verbatim on every completion
|
|
98
|
+
# (vLLM `chat_template_kwargs`). Required for thinking-capable
|
|
99
|
+
# teachers — Qwen3.x defaults enable_thinking=true, which burns
|
|
100
|
+
# the token budget on reasoning the distiller never reads. Set in
|
|
101
|
+
# SSM to '{"enable_thinking": false}' for the Qwen3.6 teacher.
|
|
102
|
+
DISTILL_CHAT_TEMPLATE_KWARGS: ${DISTILL_CHAT_TEMPLATE_KWARGS:-}
|
|
50
103
|
|
|
51
104
|
compat:
|
|
52
105
|
environment:
|
|
@@ -54,8 +107,15 @@ services:
|
|
|
54
107
|
VECTOR_INDEX_URL: http://vector-index:6333
|
|
55
108
|
EXTRACTOR_SYNC_URL: http://extractor-sync:8101
|
|
56
109
|
NV_EMBED_URL: ${NV_EMBED_URL}
|
|
110
|
+
# Bulk embed lane (PR #76 ai-events-sdk) — separate box from the
|
|
111
|
+
# interactive lane so heavy backfills don't queue behind chat
|
|
112
|
+
# query embeds. Set in SSM to a different IP from NV_EMBED_URL.
|
|
113
|
+
NV_EMBED_URL_BULK: ${NV_EMBED_URL_BULK}
|
|
57
114
|
NV_EMBED_API_KEY: ${PENTATONIC_AI_GATEWAY_KEY}
|
|
58
115
|
NV_EMBED_PROVIDER: pentatonic-gateway
|
|
116
|
+
SEARCH_HYBRID_ENABLED: ${SEARCH_HYBRID_ENABLED:-}
|
|
117
|
+
SEARCH_MMR_ENABLED: ${SEARCH_MMR_ENABLED:-1}
|
|
118
|
+
SEARCH_INTENT_BOOST: ${SEARCH_INTENT_BOOST:-1}
|
|
59
119
|
EMBED_DIM: "4096"
|
|
60
120
|
|
|
61
121
|
# Cloudflared tunnel — same pattern as v1. Optional; only start if
|
|
@@ -76,3 +136,4 @@ services:
|
|
|
76
136
|
depends_on:
|
|
77
137
|
compat:
|
|
78
138
|
condition: service_healthy
|
|
139
|
+
|
|
package/packages/memory-engine-v2/docker-compose.yml
CHANGED
|
@@ -74,7 +74,14 @@ services:
|
|
|
74
74
|
# --------------------------------------------------------------------
|
|
75
75
|
vector-index:
|
|
76
76
|
<<: *engine-base
|
|
77
|
-
|
|
77
|
+
# v1.18.2: minimum version whose API can ADD a named (sparse) vector
|
|
78
|
+
# to an existing collection (PUT /collections/{c}/vectors/{v}) —
|
|
79
|
+
# required by hybrid retrieval's 'lex' migration. Upgraded in prod
|
|
80
|
+
# 2026-06-11 by stepping minors 1.13.6→…→1.18.2 (the 1.12→1.18
|
|
81
|
+
# direct jump fails: segment.json "unknown variant `on_disk`").
|
|
82
|
+
# Do NOT lower this pin: 1.18-migrated storage cannot be read by
|
|
83
|
+
# older servers.
|
|
84
|
+
image: qdrant/qdrant:v1.18.2
|
|
78
85
|
container_name: pme2-vector-index
|
|
79
86
|
ports:
|
|
80
87
|
- "127.0.0.1:${PME_V2_QDRANT_HTTP_PORT:-16333}:6333"
|