@pentatonic-ai/ai-agent-sdk 0.10.4 → 0.10.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/requirements.txt +6 -0
- package/packages/memory-engine-v2/compat/server.py +258 -18
- package/packages/memory-engine-v2/eval/recall_at_k.py +242 -0
- package/packages/memory-engine-v2/eval/retrieval_golden.seed.json +69 -0
- package/packages/memory-engine-v2/extractor-async/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-async/extraction_schema.py +246 -0
- package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +411 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +417 -31
- package/packages/memory-engine-v2/resolution-queue-design.md +165 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +11 -2
- package/packages/memory-engine-v2/scripts/backfill_sparse_vectors.py +369 -0
- package/packages/memory-engine-v2/scripts/bakeoff_guided_vs_kv.py +607 -0
- package/packages/memory-engine-v2/scripts/entity_resolution_v2.py +1041 -0
- package/packages/memory-engine-v2/tests/test_entity_resolution_v2.py +507 -0
- package/packages/memory-engine-v2/tests/test_hybrid_retrieval.py +810 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# Design: incremental write-time entity resolution (`resolution_queue`)
|
|
2
|
+
|
|
3
|
+
**Status:** design only — NO implementation in this PR.
|
|
4
|
+
**Companion:** `scripts/entity_resolution_v2.py` (the one-time/batch
|
|
5
|
+
tool this design makes incremental), `RFC-entity-reconciliation.md`
|
|
6
|
+
(v1: extract-time pairing + alias-aware upsert + backfill).
|
|
7
|
+
|
|
8
|
+
## Problem
|
|
9
|
+
|
|
10
|
+
`entity_resolution_v2.py` is batch tooling: an operator runs it
|
|
11
|
+
against one arena, eyeballs the tiered report, and (snapshot-gated)
|
|
12
|
+
applies. But fragmentation is continuous — every day of ingest can
|
|
13
|
+
mint new variant rows ("Johann_Boedecker" from a doc filename,
|
|
14
|
+
"Bödecker, Johann" from a vCard) that v1's exact-surface alias
|
|
15
|
+
resolution at upsert will not catch. Re-running the batch tool
|
|
16
|
+
forever is operationally wrong, and running blocking + embeddings +
|
|
17
|
+
LLM adjudication **on the hot write path is a non-starter**: the
|
|
18
|
+
upsert path is latency-sensitive (advisory-lock held), embeddings are
|
|
19
|
+
a network call, and adjudication is an LLM call.
|
|
20
|
+
|
|
21
|
+
## Design: mirror `distillation_queue`'s claim pattern
|
|
22
|
+
|
|
23
|
+
The engine already has exactly the right shape for "expensive
|
|
24
|
+
asynchronous post-processing of a row that was written cheaply":
|
|
25
|
+
`distillation_queue` (001_init.sql) — extractor-sync enqueues, the
|
|
26
|
+
async worker runs `claim → process → done/failed`, with `claim_expires_at`
|
|
27
|
+
so a crashed worker's items resurface.
|
|
28
|
+
|
|
29
|
+
`resolution_queue` mirrors it 1:1, keyed on entities instead of
|
|
30
|
+
events:
|
|
31
|
+
|
|
32
|
+
1. **Enqueue (hot path, cheap):** whenever an extractor inserts a NEW
|
|
33
|
+
entity row (not an alias-resolved update onto an existing id), it
|
|
34
|
+
also inserts one `resolution_queue` row inside the same
|
|
35
|
+
transaction. Cost: one INSERT. No embeddings, no LLM, no blocking
|
|
36
|
+
on the write path.
|
|
37
|
+
2. **Sweep (nightly, off-path):** a resolver worker claims pending
|
|
38
|
+
items in batches (same `FOR UPDATE SKIP LOCKED` claim the
|
|
39
|
+
distiller uses), and for each claimed entity runs the
|
|
40
|
+
`entity_resolution_v2` pipeline *scoped to that entity*:
|
|
41
|
+
blocking keys → candidate set within its arena → embedding
|
|
42
|
+
similarity → threshold bands → LLM adjudication of the ambiguous
|
|
43
|
+
band → merge via the existing v1 `apply_proposals` machinery
|
|
44
|
+
(per-proposal transaction, `entity_merges` audit +
|
|
45
|
+
`rollback_payload`).
|
|
46
|
+
3. **Embeddings cached:** the resolver persists each entity's bundle
|
|
47
|
+
embedding (and the bundle fingerprint it was computed from) so the
|
|
48
|
+
nightly sweep only re-embeds entities whose surface forms/facts
|
|
49
|
+
changed. This is what actually keeps the sweep cheap at 100k+
|
|
50
|
+
entity scale.
|
|
51
|
+
|
|
52
|
+
### Policy carried over from the batch tool
|
|
53
|
+
|
|
54
|
+
- Tier precedence: `co_occurrence > alias_overlap > embedding_llm >
|
|
55
|
+
heuristic`. The sweep only ever auto-applies the same things the
|
|
56
|
+
batch tool would auto-apply.
|
|
57
|
+
- `unsure` adjudications NEVER merge — they land in
|
|
58
|
+
`resolution_queue.status = 'review'` for a human, and stay claimable
|
|
59
|
+
by a future "review UI" rather than being retried into oblivion.
|
|
60
|
+
- Bare-first-name policy: single-token entities merge only with
|
|
61
|
+
exactly one block candidate AND adjudication = yes.
|
|
62
|
+
- Arena scoping: the worker processes one claimed row at a time and
|
|
63
|
+
every statement carries that row's `arena`. A claimed
|
|
64
|
+
`pentatonic-team` entity can never read or write `pip-agents` rows.
|
|
65
|
+
|
|
66
|
+
## Migration draft (DESIGN ONLY — not in `org-model/migrations/`)
|
|
67
|
+
|
|
68
|
+
```sql
|
|
69
|
+
-- DRAFT 004_resolution_queue.sql — do not apply without review.
|
|
70
|
+
|
|
71
|
+
-- 1. Queue, mirroring distillation_queue's claim pattern.
|
|
72
|
+
CREATE TABLE resolution_queue (
|
|
73
|
+
id BIGSERIAL PRIMARY KEY,
|
|
74
|
+
entity_id TEXT NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
|
|
75
|
+
arena TEXT NOT NULL, -- denormalised for scoped claims
|
|
76
|
+
enqueued_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
77
|
+
claimed_by TEXT,
|
|
78
|
+
claimed_at TIMESTAMPTZ,
|
|
79
|
+
claim_expires_at TIMESTAMPTZ, -- crashed workers' items resurface
|
|
80
|
+
status TEXT NOT NULL DEFAULT 'pending',
|
|
81
|
+
attempts INT NOT NULL DEFAULT 0,
|
|
82
|
+
last_error TEXT,
|
|
83
|
+
completed_at TIMESTAMPTZ,
|
|
84
|
+
-- 'review' = adjudication returned 'unsure' / bare-name policy hold;
|
|
85
|
+
-- terminal for the worker, surfaced to a human.
|
|
86
|
+
CONSTRAINT valid_status CHECK (
|
|
87
|
+
status IN ('pending', 'claimed', 'done', 'failed', 'review'))
|
|
88
|
+
);
|
|
89
|
+
|
|
90
|
+
CREATE INDEX idx_resolution_status ON resolution_queue(status);
|
|
91
|
+
CREATE INDEX idx_resolution_arena ON resolution_queue(arena, status);
|
|
92
|
+
CREATE INDEX idx_resolution_entity ON resolution_queue(entity_id);
|
|
93
|
+
CREATE INDEX idx_resolution_claim_expires ON resolution_queue(claim_expires_at)
|
|
94
|
+
WHERE status = 'claimed';
|
|
95
|
+
|
|
96
|
+
-- 2. Embedding cache so the sweep doesn't re-embed unchanged entities.
|
|
97
|
+
CREATE TABLE entity_embeddings (
|
|
98
|
+
entity_id TEXT PRIMARY KEY REFERENCES entities(id) ON DELETE CASCADE,
|
|
99
|
+
arena TEXT NOT NULL,
|
|
100
|
+
bundle_fingerprint TEXT NOT NULL, -- sha256 of the embedded bundle
|
|
101
|
+
embedding REAL[] NOT NULL, -- gateway dim (4096 today)
|
|
102
|
+
embedding_model TEXT NOT NULL,
|
|
103
|
+
embedded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
104
|
+
);
|
|
105
|
+
|
|
106
|
+
CREATE INDEX idx_entity_embeddings_arena ON entity_embeddings(arena);
|
|
107
|
+
|
|
108
|
+
-- 3. Admit the new merge signal in the audit table. The existing
|
|
109
|
+
-- CHECK only allows ('co_occurrence','alias_overlap','heuristic',
|
|
110
|
+
-- 'online_resolver'); entity_resolution_v2.py refuses to apply
|
|
111
|
+
-- embedding_llm-tier proposals until this lands.
|
|
112
|
+
ALTER TABLE entity_merges DROP CONSTRAINT entity_merges_merge_signal_check;
|
|
113
|
+
ALTER TABLE entity_merges ADD CONSTRAINT entity_merges_merge_signal_check
|
|
114
|
+
CHECK (merge_signal IN (
|
|
115
|
+
'co_occurrence', 'alias_overlap', 'heuristic',
|
|
116
|
+
'online_resolver', 'embedding_llm'));
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
> Note: the CHECK in `002_entity_merges_audit.sql` is unnamed in DDL;
|
|
120
|
+
> Postgres auto-names it `entity_merges_merge_signal_check`. Verify
|
|
121
|
+
> the live name via `pg_constraint` before drafting the final ALTER —
|
|
122
|
+
> the v2 script does this probe programmatically
|
|
123
|
+
> (`_entity_merges_check_allows`).
|
|
124
|
+
|
|
125
|
+
### Claim query (worker pseudocode)
|
|
126
|
+
|
|
127
|
+
```sql
|
|
128
|
+
WITH next AS (
|
|
129
|
+
SELECT id FROM resolution_queue
|
|
130
|
+
WHERE status = 'pending'
|
|
131
|
+
OR (status = 'claimed' AND claim_expires_at < NOW())
|
|
132
|
+
ORDER BY enqueued_at
|
|
133
|
+
LIMIT 50
|
|
134
|
+
FOR UPDATE SKIP LOCKED
|
|
135
|
+
)
|
|
136
|
+
UPDATE resolution_queue q SET
|
|
137
|
+
status = 'claimed', claimed_by = $worker, claimed_at = NOW(),
|
|
138
|
+
claim_expires_at = NOW() + INTERVAL '15 minutes',
|
|
139
|
+
attempts = attempts + 1
|
|
140
|
+
FROM next WHERE q.id = next.id
|
|
141
|
+
RETURNING q.id, q.entity_id, q.arena;
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Rollout sequencing
|
|
145
|
+
|
|
146
|
+
1. Land + run the batch tool (`entity_resolution_v2.py`) once per
|
|
147
|
+
arena, snapshot-gated — clears the standing backlog. (Per the
|
|
148
|
+
clean-rebuild runbook: AFTER the distiller-format decision settles,
|
|
149
|
+
so we don't merge rows the re-distillation would re-shape.)
|
|
150
|
+
2. Apply draft migration 004 (review first; the engine is shared
|
|
151
|
+
multi-tenant infra — pip-agents stays frozen, the migration adds
|
|
152
|
+
tables and a CHECK value, touches no rows).
|
|
153
|
+
3. Ship the enqueue (one INSERT in each extractor's entity-insert
|
|
154
|
+
path) — write-path cost is negligible.
|
|
155
|
+
4. Ship the nightly sweep worker (a sibling of extractor-async,
|
|
156
|
+
same claim loop), initially in report-only mode (`status='review'`
|
|
157
|
+
for everything) for a week; then enable auto-apply for the
|
|
158
|
+
embedding_llm tier once precision on the report holds at ~100%.
|
|
159
|
+
|
|
160
|
+
## Non-goals
|
|
161
|
+
|
|
162
|
+
- Real-time resolution at upsert (v1's alias-aware exact-surface
|
|
163
|
+
resolution already covers the common case there).
|
|
164
|
+
- Cross-arena resolution of any kind. Identity never spans arenas.
|
|
165
|
+
- Non-person entity types (same follow-up posture as the v1 RFC).
|
|
@@ -53,8 +53,13 @@ from collections import defaultdict
|
|
|
53
53
|
from dataclasses import dataclass, field
|
|
54
54
|
from datetime import datetime, timezone
|
|
55
55
|
|
|
56
|
-
|
|
57
|
-
import psycopg
|
|
56
|
+
try:
|
|
57
|
+
import psycopg
|
|
58
|
+
import psycopg.rows
|
|
59
|
+
except ImportError: # pragma: no cover — allows importing the merge
|
|
60
|
+
# machinery (entity_resolution_v2.py, unit tests) without the DB
|
|
61
|
+
# driver installed. main() still requires it to actually run.
|
|
62
|
+
psycopg = None # type: ignore[assignment]
|
|
58
63
|
|
|
59
64
|
|
|
60
65
|
# ----------------------------------------------------------------------
|
|
@@ -533,6 +538,10 @@ def main() -> int:
|
|
|
533
538
|
if not args.pg_dsn:
|
|
534
539
|
print("error: --pg-dsn (or $PG_DSN) required", file=sys.stderr)
|
|
535
540
|
return 2
|
|
541
|
+
if psycopg is None:
|
|
542
|
+
print("error: psycopg is required to run this script "
|
|
543
|
+
"(pip install 'psycopg[binary]')", file=sys.stderr)
|
|
544
|
+
return 2
|
|
536
545
|
|
|
537
546
|
merged_by = args.merged_by or f"backfill-{datetime.now(timezone.utc):%Y-%m}"
|
|
538
547
|
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Backfill: write the named 'lex' BM25 sparse vector onto existing
|
|
3
|
+
points in the Qdrant "evidence" collection (roadmap BET 3, hybrid
|
|
4
|
+
lexical+dense retrieval).
|
|
5
|
+
|
|
6
|
+
ADDITIVE ONLY. This script writes ONLY the named sparse vector via
|
|
7
|
+
`update_vectors` — the existing unnamed dense vector (Qwen3-Embedding-8B,
|
|
8
|
+
4096-d) is never read, re-embedded, or modified. Qdrant's
|
|
9
|
+
`update_vectors` updates exactly the vector names supplied and leaves
|
|
10
|
+
all other vectors on the point untouched.
|
|
11
|
+
|
|
12
|
+
Why a backfill at all: Qdrant point payloads only carry a 300-char
|
|
13
|
+
`content_preview` — BM25 term statistics over a truncated preview would
|
|
14
|
+
systematically miss tail terms, so the sparse vector MUST be computed
|
|
15
|
+
from the FULL event content, which lives in Postgres `events.content`
|
|
16
|
+
(joined via the `event_id` payload field).
|
|
17
|
+
|
|
18
|
+
Pipeline per batch:
|
|
19
|
+
|
|
20
|
+
1. scroll(collection, with_payload=["event_id"], with_vectors=False)
|
|
21
|
+
2. fetch full content: SELECT id, content FROM events WHERE id = ANY(...)
|
|
22
|
+
3. BM25-encode the full content (fastembed Qdrant/bm25, CPU)
|
|
23
|
+
4. update_vectors([PointVectors(id=point_id, vector={"lex": sparse})])
|
|
24
|
+
|
|
25
|
+
Dry-run by default — counts points, resolves content coverage on a
|
|
26
|
+
sample, prints an ETA, writes NO point data. Pass --apply to write.
|
|
27
|
+
|
|
28
|
+
Resumable: the last scroll offset (a Qdrant point id) is persisted to
|
|
29
|
+
a state file after every batch; re-running resumes from there.
|
|
30
|
+
Idempotent: re-encoding the same content produces the same sparse
|
|
31
|
+
vector, and update_vectors overwrites in place — re-running over
|
|
32
|
+
already-backfilled points is wasted work, not corruption.
|
|
33
|
+
|
|
34
|
+
Prerequisites (run when the operator — Phil H — has flipped nothing
|
|
35
|
+
yet, but the collection must already carry the 'lex' sparse vector
|
|
36
|
+
config; either start compat once with SEARCH_HYBRID_ENABLED=1 or run
|
|
37
|
+
this script with --ensure-config):
|
|
38
|
+
|
|
39
|
+
python3 backfill_sparse_vectors.py \
|
|
40
|
+
--qdrant-url http://127.0.0.1:16333 \
|
|
41
|
+
--pg-dsn postgresql://pme:...@127.0.0.1:15432/org_model \
|
|
42
|
+
[--apply] # write; default is dry-run
|
|
43
|
+
[--ensure-config] # add the 'lex' sparse config if missing (idempotent)
|
|
44
|
+
[--arena <arena>] # optional payload filter
|
|
45
|
+
[--client <clientId>] # optional payload filter
|
|
46
|
+
[--batch-size 256]
|
|
47
|
+
[--state-file .backfill_sparse_vectors.state.json]
|
|
48
|
+
[--reset-state] # ignore + overwrite any previous offset
|
|
49
|
+
[--max-points N] # stop after N points (smoke runs)
|
|
50
|
+
|
|
51
|
+
⚠️ DISK HEADROOM: the sparse index lands on the same Qdrant storage
|
|
52
|
+
volume as the dense vectors. The 2026-06-05 outage was the engine box
|
|
53
|
+
root disk filling up — confirm headroom (sparse 'lex' vectors for the
|
|
54
|
+
~620–745k-point evidence collection are small relative to the 4096-d
|
|
55
|
+
dense data, expect low single-digit GB including index, but CHECK
|
|
56
|
+
`df -h` on the box before --apply).
|
|
57
|
+
|
|
58
|
+
Exit codes: 0 success; 1 partial failure (some batches errored); 2 bad args.
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
from __future__ import annotations
|
|
62
|
+
|
|
63
|
+
import argparse
|
|
64
|
+
import json
|
|
65
|
+
import os
|
|
66
|
+
import sys
|
|
67
|
+
import time
|
|
68
|
+
|
|
69
|
+
SPARSE_VECTOR_NAME = "lex"
|
|
70
|
+
SPARSE_MODEL_NAME = os.environ.get("SEARCH_SPARSE_MODEL", "Qdrant/bm25")
|
|
71
|
+
DEFAULT_COLLECTION = "evidence"
|
|
72
|
+
|
|
73
|
+
# Conservative single-CPU-worker BM25 throughput planning figure for the
|
|
74
|
+
# dry-run ETA (fastembed Qdrant/bm25 tokenize+count is cheap; real rates
|
|
75
|
+
# are usually higher). Override with --eta-rate.
|
|
76
|
+
DEFAULT_ENCODE_RATE_PER_SEC = 400.0
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ----------------------------------------------------------------------
|
|
80
|
+
# Pure planning helpers — stdlib only, unit-tested without qdrant/pg.
|
|
81
|
+
# ----------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def batch_count(total_points: int, batch_size: int) -> int:
    """Return how many scroll/encode/update batches cover `total_points`.

    A non-positive point count or batch size yields 0; otherwise the result
    is `total_points / batch_size` rounded up to a whole batch.
    """
    if min(total_points, batch_size) <= 0:
        return 0
    # Ceiling division: pad the numerator so any remainder adds one batch.
    padded = total_points + batch_size - 1
    return padded // batch_size
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def eta_seconds(total_points: int, rate_per_sec: float) -> float:
    """Naive ETA in seconds: `total_points` divided by the encode rate.

    Only encoding is modelled (scroll + update are treated as I/O-cheap).
    Returns 0.0 when there is nothing to do or the rate is not positive.
    """
    has_work = total_points > 0 and rate_per_sec > 0
    return total_points / rate_per_sec if has_work else 0.0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def format_eta(seconds: float) -> str:
    """Render a duration in seconds as a compact human-readable string.

    Output forms: ``"<h>h<mm>m"`` once the duration reaches an hour,
    ``"<m>m<ss>s"`` once it reaches a minute, otherwise ``"<s>s"``.
    Fractional seconds are truncated.

    Negative durations are clamped to ``"0s"`` (divmod on a negative value
    would otherwise produce output such as ``"-1h59m"``). Values that cannot
    be converted to an integer — NaN or infinity — render as ``"unknown"``
    instead of raising, so a bad rate never crashes a progress line.
    """
    try:
        whole = int(seconds)
    except (OverflowError, ValueError):
        # int(inf) raises OverflowError, int(nan) raises ValueError.
        return "unknown"
    whole = max(whole, 0)
    hours, rem = divmod(whole, 3600)
    minutes, secs = divmod(rem, 60)
    if hours:
        return f"{hours}h{minutes:02d}m"
    if minutes:
        return f"{minutes}m{secs:02d}s"
    return f"{secs}s"
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def load_state(path: str) -> dict:
    """Load the resume state stored as JSON at `path`.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or valid JSON whose top level is not an object. A missing file is
    the normal first-run case and stays silent; every other failure is
    reported on stderr so the operator notices that a saved resume offset
    was discarded.
    """
    try:
        with open(path, encoding="utf-8") as f:
            state = json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"[backfill] warning: ignoring unreadable state file {path}: {exc}",
            file=sys.stderr,
        )
        return {}
    if not isinstance(state, dict):
        # Callers use state.get(...); anything but an object is unusable.
        print(
            f"[backfill] warning: ignoring state file {path}: "
            f"expected a JSON object, got {type(state).__name__}",
            file=sys.stderr,
        )
        return {}
    return state
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def save_state(path: str, state: dict) -> None:
    """Atomically persist `state` as indented JSON at `path`.

    The JSON is written to ``path + ".tmp"`` and then moved over `path` with
    `os.replace`, so a reader never sees a half-written state file. If
    serialisation or the replace fails, the temporary file is removed and
    the original exception is re-raised; any existing file at `path` is left
    as it was.
    """
    tmp = path + ".tmp"
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2)
        os.replace(tmp, path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error
        # is what the caller needs to see, so cleanup failures are ignored.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ----------------------------------------------------------------------
|
|
127
|
+
# Heavy-dependency sections (lazy imports so the planning helpers above
|
|
128
|
+
# stay importable in dependency-free test environments).
|
|
129
|
+
# ----------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _build_filter(arena: str | None, client: str | None):
    """Build the optional Qdrant payload filter for the scroll.

    Each truthy argument contributes one exact-match condition: `arena` on
    the "arena" payload key and `client` on the "clientId" payload key.
    Returns None when neither is given, otherwise a Filter requiring all
    supplied conditions.
    """
    from qdrant_client import models as qmodels

    wanted = (("arena", arena), ("clientId", client))
    conditions = [
        qmodels.FieldCondition(key=key, match=qmodels.MatchValue(value=value))
        for key, value in wanted
        if value
    ]
    if not conditions:
        return None
    return qmodels.Filter(must=conditions)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _ensure_config(qdrant, collection: str) -> bool:
    """Add the 'lex' sparse vector config to `collection` if it is absent.

    Mirrors compat's _ensure_sparse_vector_config: the call only updates
    collection-level sparse vector config and passes no point data.

    Returns True when the config was added, False when it already existed.
    """
    from qdrant_client import models as qmodels

    params = qdrant.get_collection(collection).config.params
    configured = getattr(params, "sparse_vectors", None) or {}
    if SPARSE_VECTOR_NAME not in configured:
        lex_params = qmodels.SparseVectorParams(
            modifier=qmodels.Modifier.IDF,
            index=qmodels.SparseIndexParams(on_disk=True),
        )
        qdrant.update_collection(
            collection_name=collection,
            sparse_vectors_config={SPARSE_VECTOR_NAME: lex_params},
        )
        return True
    return False
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _resume_offset(state: dict, args: argparse.Namespace):
    """Return the saved scroll offset if `state` belongs to this exact job.

    State is only trusted when it was written by an --apply run over the
    same collection / arena / client; anything else yields None so the scan
    starts from the beginning. State files without "arena"/"client" keys
    are treated as unfiltered runs.
    """
    if state.get("mode") != "apply":
        # A dry-run offset must never make an apply run skip points.
        return None
    expected = (
        ("collection", args.collection),
        ("arena", args.arena),
        ("client", args.client),
    )
    if any(state.get(key) != value for key, value in expected):
        return None
    offset = state.get("next_offset")
    if isinstance(offset, str) and offset.isdigit():
        # Offsets stored as strings for numeric point ids go back to int.
        # NOTE(review): assumes a UUID point id is never all digits.
        return int(offset)
    return offset


def _offset_for_state(offset):
    """Make a scroll offset JSON-serialisable without losing its type."""
    if offset is None or isinstance(offset, int):
        return offset
    return str(offset)


def run(args: argparse.Namespace) -> int:
    """Scan the collection and, with --apply, write the 'lex' sparse vector.

    Per batch: scroll point ids + `event_id` payload, fetch full content
    from Postgres `events`, BM25-encode it, and `update_vectors` with only
    the named sparse vector. Without --apply nothing is encoded or written;
    the scan only gathers counts.

    Resume state is read in both modes but is only honoured when it was
    written by an --apply run for the same collection/arena/client, and it
    is only persisted in --apply mode, so a dry run can never move the
    resume point of a real run.

    Returns 0 on success, 1 if any batch failed to encode/write, 2 on bad
    arguments or a missing sparse vector config.
    """
    from qdrant_client import QdrantClient
    from qdrant_client import models as qmodels

    # Fail fast on bad arguments, before --ensure-config can change anything.
    if args.apply and not args.pg_dsn:
        print("error: --pg-dsn (or $PG_DSN) required with --apply", file=sys.stderr)
        return 2

    qdrant = QdrantClient(url=args.qdrant_url, timeout=60)
    flt = _build_filter(args.arena, args.client)

    # Collection-level facts up front (counts include points outside the
    # filter; the filtered total is only known after a full scroll).
    info = qdrant.get_collection(args.collection)
    total_points = info.points_count or 0
    sparse_cfg = getattr(info.config.params, "sparse_vectors", None) or {}
    print(f"[backfill] collection={args.collection} points_count={total_points}")
    print(f"[backfill] sparse config present: {SPARSE_VECTOR_NAME in sparse_cfg}")

    if SPARSE_VECTOR_NAME not in sparse_cfg:
        if args.ensure_config:
            added = _ensure_config(qdrant, args.collection)
            print(f"[backfill] sparse config '{SPARSE_VECTOR_NAME}' added: {added}")
        elif args.apply:
            print(
                f"error: collection has no '{SPARSE_VECTOR_NAME}' sparse vector config. "
                "Start compat once with SEARCH_HYBRID_ENABLED=1 or pass --ensure-config.",
                file=sys.stderr,
            )
            return 2

    if not args.apply:
        eta = eta_seconds(total_points, args.eta_rate)
        print(
            f"[backfill] DRY-RUN: would scroll ~{total_points} points "
            f"in {batch_count(total_points, args.batch_size)} batches of {args.batch_size}; "
            f"ETA ~{format_eta(eta)} at {args.eta_rate:.0f} pts/s encode rate"
        )

    # Postgres is only needed when a DSN was supplied; a DSN-less dry run
    # must not require the driver to be installed.
    pg = None
    if args.pg_dsn:
        import psycopg
        import psycopg.rows

        pg = psycopg.connect(args.pg_dsn, row_factory=psycopg.rows.dict_row)

    try:
        encoder = None
        if args.apply:
            from fastembed import SparseTextEmbedding

            encoder = SparseTextEmbedding(model_name=SPARSE_MODEL_NAME)

        offset = None
        if not args.reset_state:
            offset = _resume_offset(load_state(args.state_file), args)
        if offset is not None:
            print(f"[backfill] resuming from offset {offset}")

        # Only --apply without --force needs to see the existing 'lex' vector.
        skip_existing = args.apply and not args.force

        scanned = 0
        written = 0
        skipped_existing = 0
        missing_content = 0
        errors = 0
        t0 = time.monotonic()

        while True:
            points, next_offset = qdrant.scroll(
                collection_name=args.collection,
                scroll_filter=flt,
                limit=args.batch_size,
                offset=offset,
                with_payload=["event_id"],
                # The (tiny) existing 'lex' vector is pulled so that
                # already-backfilled points are skipped on resume. The
                # dense vector is NEVER fetched.
                with_vectors=[SPARSE_VECTOR_NAME] if skip_existing else False,
            )
            if not points:
                break

            batch = []  # (point_id, event_id)
            for p in points:
                scanned += 1
                eid = (p.payload or {}).get("event_id")
                if not eid:
                    missing_content += 1
                    continue
                if skip_existing:
                    vecs = p.vector if isinstance(p.vector, dict) else {}
                    if vecs and SPARSE_VECTOR_NAME in vecs:
                        skipped_existing += 1
                        continue
                batch.append((p.id, eid))

            # Always bound, so an empty batch can never reuse or miss `todo`.
            todo = []  # (point_id, full content)
            if batch and pg is not None:
                event_ids = list({eid for _, eid in batch})
                with pg.cursor() as cur:
                    cur.execute(
                        "SELECT id, content FROM events WHERE id = ANY(%s)",
                        (event_ids,),
                    )
                    content_by_id = {r["id"]: r["content"] for r in cur.fetchall()}

                for pid, eid in batch:
                    content = content_by_id.get(eid)
                    if not content:
                        missing_content += 1
                        continue
                    todo.append((pid, content))

            if args.apply and todo:
                try:
                    embs = list(encoder.embed([c for _, c in todo]))
                    qdrant.update_vectors(
                        collection_name=args.collection,
                        points=[
                            qmodels.PointVectors(
                                id=pid,
                                vector={
                                    SPARSE_VECTOR_NAME: qmodels.SparseVector(
                                        indices=[int(i) for i in e.indices],
                                        values=[float(v) for v in e.values],
                                    )
                                },
                            )
                            for (pid, _), e in zip(todo, embs)
                        ],
                        wait=True,
                    )
                    written += len(todo)
                except Exception as e:
                    # Best-effort per batch: record the failure and keep
                    # going; the exit code reports it at the end.
                    errors += 1
                    print(f"[backfill] batch ERROR at offset {offset}: {e}", file=sys.stderr)

            offset = next_offset
            if args.apply:
                # Persisted only for real runs (see docstring).
                save_state(args.state_file, {
                    "next_offset": _offset_for_state(offset),
                    "scanned": scanned,
                    "written": written,
                    "mode": "apply",
                    "collection": args.collection,
                    "arena": args.arena,
                    "client": args.client,
                })

            if scanned and scanned % (args.batch_size * 20) == 0:
                rate = scanned / max(time.monotonic() - t0, 1e-6)
                remaining = max(total_points - scanned, 0)
                print(
                    f"[backfill] scanned={scanned} written={written} "
                    f"skipped={skipped_existing} missing_content={missing_content} "
                    f"rate={rate:.0f}/s eta={format_eta(remaining / max(rate, 1e-6))}"
                )

            if args.max_points and scanned >= args.max_points:
                print(f"[backfill] --max-points {args.max_points} reached; stopping")
                break
            if next_offset is None:
                break

        dur = time.monotonic() - t0
        print(
            f"[backfill] DONE mode={'apply' if args.apply else 'dry-run'} "
            f"scanned={scanned} written={written} skipped_existing={skipped_existing} "
            f"missing_content={missing_content} errors={errors} in {format_eta(dur)}"
        )
        if errors:
            # The saved offset has moved past failed batches, so a plain
            # resume would not revisit them.
            print(
                "[backfill] some batches failed; re-run with --reset-state to retry "
                "them (points that already carry the vector are skipped)",
                file=sys.stderr,
            )
        if not args.apply:
            print("[backfill] dry-run only; pass --apply to write the 'lex' vectors")
        return 1 if errors else 0
    finally:
        # Close the connection even when the scan raises.
        if pg is not None:
            pg.close()
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse `argv`.

    The module docstring is used verbatim as the --help description.
    Environment-backed defaults ($VECTOR_INDEX_URL, $PG_DSN) are read each
    time this function is called.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    add = parser.add_argument

    # Connections and target collection.
    add("--qdrant-url", default=os.environ.get("VECTOR_INDEX_URL", "http://127.0.0.1:16333"))
    add(
        "--pg-dsn",
        default=os.environ.get("PG_DSN"),
        help="postgres DSN for full event content; defaults to $PG_DSN",
    )
    add("--collection", default=DEFAULT_COLLECTION)

    # Write behaviour.
    add("--apply", action="store_true", help="write; default is dry-run")
    add(
        "--ensure-config",
        action="store_true",
        help="idempotently add the 'lex' sparse vector config if missing",
    )

    # Scope, batching and resume.
    add("--arena", default=None, help="optional arena payload filter")
    add("--client", default=None, help="optional clientId payload filter")
    add("--batch-size", type=int, default=256)
    add("--state-file", default=".backfill_sparse_vectors.state.json")
    add("--reset-state", action="store_true")
    add(
        "--force",
        action="store_true",
        help="re-write 'lex' even where it already exists",
    )
    add(
        "--max-points",
        type=int,
        default=0,
        help="stop after scanning N points (smoke runs)",
    )
    add(
        "--eta-rate",
        type=float,
        default=DEFAULT_ENCODE_RATE_PER_SEC,
        help="points/sec planning figure for the dry-run ETA",
    )
    return parser.parse_args(argv)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def main() -> int:
    """CLI entry point: parse and validate arguments, then run the backfill.

    Returns the process exit code: 2 for invalid arguments, otherwise
    whatever `run` returns.
    """
    args = parse_args()
    if args.batch_size <= 0:
        print("error: --batch-size must be > 0", file=sys.stderr)
        return 2
    if args.max_points < 0:
        # A negative value is truthy and every scanned count is >= it, so
        # the scan would stop after the first batch without explanation.
        print("error: --max-points must be >= 0", file=sys.stderr)
        return 2
    return run(args)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
if __name__ == "__main__":
|
|
369
|
+
sys.exit(main())
|