@pentatonic-ai/ai-agent-sdk 0.10.4 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
1
+ # Design: incremental write-time entity resolution (`resolution_queue`)
2
+
3
+ **Status:** design only — NO implementation in this PR.
4
+ **Companion:** `scripts/entity_resolution_v2.py` (the one-time/batch
5
+ tool this design makes incremental), `RFC-entity-reconciliation.md`
6
+ (v1: extract-time pairing + alias-aware upsert + backfill).
7
+
8
+ ## Problem
9
+
10
+ `entity_resolution_v2.py` is batch tooling: an operator runs it
11
+ against one arena, eyeballs the tiered report, and (snapshot-gated)
12
+ applies. But fragmentation is continuous — every day of ingest can
13
+ mint new variant rows ("Johann_Boedecker" from a doc filename,
14
+ "Bödecker, Johann" from a vCard) that v1's exact-surface alias
15
+ resolution at upsert will not catch. Re-running the batch tool
16
+ forever is operationally wrong, and running blocking + embeddings +
17
+ LLM adjudication **on the hot write path is a non-starter**: the
18
+ upsert path is latency-sensitive (advisory-lock held), embeddings are
19
+ a network call, and adjudication is an LLM call.
20
+
21
+ ## Design: mirror `distillation_queue`'s claim pattern
22
+
23
+ The engine already has exactly the right shape for "expensive
24
+ asynchronous post-processing of a row that was written cheaply":
25
+ `distillation_queue` (001_init.sql) — extractor-sync enqueues, the
26
+ async worker `claim → process → done/failed`, with `claim_expires_at`
27
+ so a crashed worker's items resurface.
28
+
29
+ `resolution_queue` mirrors it 1:1, keyed on entities instead of
30
+ events:
31
+
32
+ 1. **Enqueue (hot path, cheap):** whenever an extractor inserts a NEW
33
+ entity row (not an alias-resolved update onto an existing id), it
34
+ also inserts one `resolution_queue` row inside the same
35
+ transaction. Cost: one INSERT. No embeddings, no LLM, no blocking
36
+ on the write path.
37
+ 2. **Sweep (nightly, off-path):** a resolver worker claims pending
38
+ items in batches (same `FOR UPDATE SKIP LOCKED` claim the
39
+ distiller uses), and for each claimed entity runs the
40
+ `entity_resolution_v2` pipeline *scoped to that entity*:
41
+ blocking keys → candidate set within its arena → embedding
42
+ similarity → threshold bands → LLM adjudication of the ambiguous
43
+ band → merge via the existing v1 `apply_proposals` machinery
44
+ (per-proposal transaction, `entity_merges` audit +
45
+ `rollback_payload`).
46
+ 3. **Embeddings cached:** the resolver persists each entity's bundle
47
+ embedding (and the bundle fingerprint it was computed from) so the
48
+ nightly sweep only re-embeds entities whose surface forms/facts
49
+ changed. This is what actually keeps the sweep cheap at 100k+
50
+ entity scale.
51
+
52
+ ### Policy carried over from the batch tool
53
+
54
+ - Tier precedence: `co_occurrence > alias_overlap > embedding_llm >
55
+ heuristic`. The sweep only ever auto-applies the same things the
56
+ batch tool would auto-apply.
57
+ - `unsure` adjudications NEVER merge — they land in
58
+ `resolution_queue.status = 'review'` for a human, and stay claimable
59
+ by a future "review UI" rather than being retried into oblivion.
60
+ - Bare-first-name policy: single-token entities merge only with
61
+ exactly one block candidate AND adjudication = yes.
62
+ - Arena scoping: the worker processes one claimed row at a time and
63
+ every statement carries that row's `arena`. A claimed
64
+ `pentatonic-team` entity can never read or write `pip-agents` rows.
65
+
66
+ ## Migration draft (DESIGN ONLY — not in `org-model/migrations/`)
67
+
68
+ ```sql
69
+ -- DRAFT 004_resolution_queue.sql — do not apply without review.
70
+
71
+ -- 1. Queue, mirroring distillation_queue's claim pattern.
72
+ CREATE TABLE resolution_queue (
73
+ id BIGSERIAL PRIMARY KEY,
74
+ entity_id TEXT NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
75
+ arena TEXT NOT NULL, -- denormalised for scoped claims
76
+ enqueued_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
77
+ claimed_by TEXT,
78
+ claimed_at TIMESTAMPTZ,
79
+ claim_expires_at TIMESTAMPTZ, -- crashed workers' items resurface
80
+ status TEXT NOT NULL DEFAULT 'pending',
81
+ attempts INT NOT NULL DEFAULT 0,
82
+ last_error TEXT,
83
+ completed_at TIMESTAMPTZ,
84
+ -- 'review' = adjudication returned 'unsure' / bare-name policy hold;
85
+ -- terminal for the worker, surfaced to a human.
86
+ CONSTRAINT valid_status CHECK (
87
+ status IN ('pending', 'claimed', 'done', 'failed', 'review'))
88
+ );
89
+
90
+ CREATE INDEX idx_resolution_status ON resolution_queue(status);
91
+ CREATE INDEX idx_resolution_arena ON resolution_queue(arena, status);
92
+ CREATE INDEX idx_resolution_entity ON resolution_queue(entity_id);
93
+ CREATE INDEX idx_resolution_claim_expires ON resolution_queue(claim_expires_at)
94
+ WHERE status = 'claimed';
95
+
96
+ -- 2. Embedding cache so the sweep doesn't re-embed unchanged entities.
97
+ CREATE TABLE entity_embeddings (
98
+ entity_id TEXT PRIMARY KEY REFERENCES entities(id) ON DELETE CASCADE,
99
+ arena TEXT NOT NULL,
100
+ bundle_fingerprint TEXT NOT NULL, -- sha256 of the embedded bundle
101
+ embedding REAL[] NOT NULL, -- gateway dim (4096 today)
102
+ embedding_model TEXT NOT NULL,
103
+ embedded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
104
+ );
105
+
106
+ CREATE INDEX idx_entity_embeddings_arena ON entity_embeddings(arena);
107
+
108
+ -- 3. Admit the new merge signal in the audit table. The existing
109
+ -- CHECK only allows ('co_occurrence','alias_overlap','heuristic',
110
+ -- 'online_resolver'); entity_resolution_v2.py refuses to apply
111
+ -- embedding_llm-tier proposals until this lands.
112
+ ALTER TABLE entity_merges DROP CONSTRAINT entity_merges_merge_signal_check;
113
+ ALTER TABLE entity_merges ADD CONSTRAINT entity_merges_merge_signal_check
114
+ CHECK (merge_signal IN (
115
+ 'co_occurrence', 'alias_overlap', 'heuristic',
116
+ 'online_resolver', 'embedding_llm'));
117
+ ```
118
+
119
+ > Note: the CHECK in `002_entity_merges_audit.sql` is unnamed in DDL;
120
+ > Postgres auto-names it `entity_merges_merge_signal_check`. Verify
121
+ > the live name via `pg_constraint` before drafting the final ALTER —
122
+ > the v2 script does this probe programmatically
123
+ > (`_entity_merges_check_allows`).
124
+
125
+ ### Claim query (worker pseudocode)
126
+
127
+ ```sql
128
+ WITH next AS (
129
+ SELECT id FROM resolution_queue
130
+ WHERE status = 'pending'
131
+ OR (status = 'claimed' AND claim_expires_at < NOW())
132
+ ORDER BY enqueued_at
133
+ LIMIT 50
134
+ FOR UPDATE SKIP LOCKED
135
+ )
136
+ UPDATE resolution_queue q SET
137
+ status = 'claimed', claimed_by = $worker, claimed_at = NOW(),
138
+ claim_expires_at = NOW() + INTERVAL '15 minutes',
139
+ attempts = attempts + 1
140
+ FROM next WHERE q.id = next.id
141
+ RETURNING q.id, q.entity_id, q.arena;
142
+ ```
143
+
144
+ ## Rollout sequencing
145
+
146
+ 1. Land + run the batch tool (`entity_resolution_v2.py`) once per
147
+ arena, snapshot-gated — clears the standing backlog. (Per the
148
+ clean-rebuild runbook: AFTER the distiller-format decision settles,
149
+ so we don't merge rows the re-distillation would re-shape.)
150
+ 2. Apply draft migration 004 (review first; the engine is shared
151
+ multi-tenant infra — pip-agents stays frozen, the migration adds
152
+ tables and a CHECK value and modifies no rows — though re-adding the CHECK makes Postgres scan `entity_merges` once to validate it).
153
+ 3. Ship the enqueue (one INSERT in each extractor's entity-insert
154
+ path) — write-path cost is negligible.
155
+ 4. Ship the nightly sweep worker (a sibling of extractor-async,
156
+ same claim loop), initially in report-only mode (`status='review'`
157
+ for everything) for a week; then enable auto-apply for the
158
+ embedding_llm tier once precision on the report holds at ~100%.
159
+
160
+ ## Non-goals
161
+
162
+ - Real-time resolution at upsert (v1's alias-aware exact-surface
163
+ resolution already covers the common case there).
164
+ - Cross-arena resolution of any kind. Identity never spans arenas.
165
+ - Non-person entity types (same follow-up posture as the v1 RFC).
@@ -53,8 +53,13 @@ from collections import defaultdict
53
53
  from dataclasses import dataclass, field
54
54
  from datetime import datetime, timezone
55
55
 
56
- import psycopg
57
- import psycopg.rows
56
+ try:
57
+ import psycopg
58
+ import psycopg.rows
59
+ except ImportError: # pragma: no cover — allows importing the merge
60
+ # machinery (entity_resolution_v2.py, unit tests) without the DB
61
+ # driver installed. main() still requires it to actually run.
62
+ psycopg = None # type: ignore[assignment]
58
63
 
59
64
 
60
65
  # ----------------------------------------------------------------------
@@ -533,6 +538,10 @@ def main() -> int:
533
538
  if not args.pg_dsn:
534
539
  print("error: --pg-dsn (or $PG_DSN) required", file=sys.stderr)
535
540
  return 2
541
+ if psycopg is None:
542
+ print("error: psycopg is required to run this script "
543
+ "(pip install 'psycopg[binary]')", file=sys.stderr)
544
+ return 2
536
545
 
537
546
  merged_by = args.merged_by or f"backfill-{datetime.now(timezone.utc):%Y-%m}"
538
547
 
@@ -0,0 +1,369 @@
1
+ #!/usr/bin/env python3
2
+ """Backfill: write the named 'lex' BM25 sparse vector onto existing
3
+ points in the Qdrant "evidence" collection (roadmap BET 3, hybrid
4
+ lexical+dense retrieval).
5
+
6
+ ADDITIVE ONLY. This script writes ONLY the named sparse vector via
7
+ `update_vectors` — the existing unnamed dense vector (Qwen3-Embedding-8B,
8
+ 4096-d) is never read, re-embedded, or modified. Qdrant's
9
+ `update_vectors` updates exactly the vector names supplied and leaves
10
+ all other vectors on the point untouched.
11
+
12
+ Why a backfill at all: Qdrant point payloads only carry a 300-char
13
+ `content_preview` — BM25 term statistics over a truncated preview would
14
+ systematically miss tail terms, so the sparse vector MUST be computed
15
+ from the FULL event content, which lives in Postgres `events.content`
16
+ (joined via the `event_id` payload field).
17
+
18
+ Pipeline per batch:
19
+
20
+ 1. scroll(collection, with_payload=["event_id"], with_vectors=False)
21
+ 2. fetch full content: SELECT id, content FROM events WHERE id = ANY(...)
22
+ 3. BM25-encode the full content (fastembed Qdrant/bm25, CPU)
23
+ 4. update_vectors([PointVectors(id=point_id, vector={"lex": sparse})])
24
+
25
+ Dry-run by default — counts points, resolves content coverage on a
26
+ sample, prints an ETA, writes NOTHING. Pass --apply to write.
27
+
28
+ Resumable: the last scroll offset (a Qdrant point id) is persisted to
29
+ a state file after every batch; re-running resumes from there.
30
+ Idempotent: re-encoding the same content produces the same sparse
31
+ vector, and update_vectors overwrites in place — re-running over
32
+ already-backfilled points is wasted work, not corruption.
33
+
34
+ Prerequisites (run when the operator — Phil H — has flipped nothing
35
+ yet, but the collection must already carry the 'lex' sparse vector
36
+ config; either start compat once with SEARCH_HYBRID_ENABLED=1 or run
37
+ this script with --ensure-config):
38
+
39
+ python3 backfill_sparse_vectors.py \
40
+ --qdrant-url http://127.0.0.1:16333 \
41
+ --pg-dsn postgresql://pme:...@127.0.0.1:15432/org_model \
42
+ [--apply] # write; default is dry-run
43
+ [--ensure-config] # add the 'lex' sparse config if missing (idempotent)
44
+ [--arena <arena>] # optional payload filter
45
+ [--client <clientId>] # optional payload filter
46
+ [--batch-size 256]
47
+ [--state-file .backfill_sparse_vectors.state.json]
48
+ [--reset-state] # ignore + overwrite any previous offset
49
+ [--max-points N] # stop after N points (smoke runs)
50
+
51
+ ⚠️ DISK HEADROOM: the sparse index lands on the same Qdrant storage
52
+ volume as the dense vectors. The 2026-06-05 outage was the engine box
53
+ root disk filling up — confirm headroom (sparse 'lex' vectors for the
54
+ ~620–745k-point evidence collection are small relative to the 4096-d
55
+ dense data, expect low single-digit GB including index, but CHECK
56
+ `df -h` on the box before --apply).
57
+
58
+ Exit codes: 0 success; 1 partial failure (some batches errored); 2 bad args.
59
+ """
60
+
61
+ from __future__ import annotations
62
+
63
+ import argparse
64
+ import json
65
+ import os
66
+ import sys
67
+ import time
68
+
69
+ SPARSE_VECTOR_NAME = "lex"
70
+ SPARSE_MODEL_NAME = os.environ.get("SEARCH_SPARSE_MODEL", "Qdrant/bm25")
71
+ DEFAULT_COLLECTION = "evidence"
72
+
73
+ # Conservative single-CPU-worker BM25 throughput planning figure for the
74
+ # dry-run ETA (fastembed Qdrant/bm25 tokenize+count is cheap; real rates
75
+ # are usually higher). Override with --eta-rate.
76
+ DEFAULT_ENCODE_RATE_PER_SEC = 400.0
77
+
78
+
79
+ # ----------------------------------------------------------------------
80
+ # Pure planning helpers — stdlib only, unit-tested without qdrant/pg.
81
+ # ----------------------------------------------------------------------
82
+
83
+
84
def batch_count(total_points: int, batch_size: int) -> int:
    """Return how many batches of at most `batch_size` cover `total_points`.

    A non-positive point count or batch size yields 0 batches.
    """
    if batch_size <= 0 or total_points <= 0:
        return 0
    # Ceiling division: every full batch, plus one more for any remainder.
    full_batches, leftover = divmod(total_points, batch_size)
    return full_batches + 1 if leftover else full_batches
89
+
90
+
91
def eta_seconds(total_points: int, rate_per_sec: float) -> float:
    """Estimate the run time in seconds as `total_points / rate_per_sec`.

    Returns 0.0 when there are no points or the rate is not positive.
    """
    nothing_to_estimate = total_points <= 0 or rate_per_sec <= 0
    return 0.0 if nothing_to_estimate else total_points / rate_per_sec
96
+
97
+
98
def format_eta(seconds: float) -> str:
    """Render a duration in seconds as a short human-readable string.

    Formats: ``"<h>h<mm>m"`` when at least an hour, ``"<m>m<ss>s"`` when at
    least a minute, otherwise ``"<s>s"``. Fractional seconds are truncated.

    Negative durations are clamped to ``"0s"`` (divmod on a negative value
    would otherwise produce output such as ``"-1h59m"``), and non-finite
    values (NaN / infinity) yield ``"unknown"`` instead of raising.
    """
    try:
        total = int(seconds)
    except (ValueError, OverflowError):
        # int(nan) raises ValueError, int(inf) raises OverflowError.
        return "unknown"
    total = max(total, 0)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours}h{minutes:02d}m"
    if minutes:
        return f"{minutes}m{secs:02d}s"
    return f"{secs}s"
107
+
108
+
109
def load_state(path: str) -> dict:
    """Load the resume state written by `save_state`.

    Returns an empty dict when the file does not exist, cannot be read,
    is not valid JSON, or does not contain a JSON object. Callers use
    ``state.get(...)``, so a non-dict payload must never be returned.
    Anything other than "file not found" is reported on stderr, because
    discarding a state file silently restarts the scan from the beginning.
    """
    try:
        with open(path, encoding="utf-8") as f:
            state = json.load(f)
    except FileNotFoundError:
        # First run: no state yet, nothing to warn about.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"[backfill] warning: ignoring unreadable state file {path}: {exc}",
            file=sys.stderr,
        )
        return {}
    if not isinstance(state, dict):
        print(
            f"[backfill] warning: ignoring state file {path}: expected a JSON object",
            file=sys.stderr,
        )
        return {}
    return state
117
+
118
+
119
def save_state(path: str, state: dict) -> None:
    """Atomically persist `state` as JSON at `path`.

    The payload is serialised before any file is opened, so a
    non-serialisable state raises without touching disk. It is then
    written to ``path + ".tmp"``, flushed and fsynced, and moved over
    `path` with ``os.replace``, so readers never observe a partial file.
    If writing or replacing fails, the temp file is removed and the
    error is re-raised.
    """
    payload = json.dumps(state, indent=2)
    tmp = path + ".tmp"
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            f.write(payload)
            # Make the bytes durable before the rename publishes them.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except OSError:
        try:
            os.remove(tmp)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise
124
+
125
+
126
+ # ----------------------------------------------------------------------
127
+ # Heavy-dependency sections (lazy imports so the planning helpers above
128
+ # stay importable in dependency-free test environments).
129
+ # ----------------------------------------------------------------------
130
+
131
+
132
def _build_filter(arena: str | None, client: str | None):
    """Build the Qdrant payload filter for the optional arena/client options.

    Each truthy argument contributes one exact-match condition (`arena`
    on the "arena" payload key, `client` on "clientId"). Returns None
    when neither is given.
    """
    from qdrant_client import models as qmodels

    requested = (("arena", arena), ("clientId", client))
    conditions = [
        qmodels.FieldCondition(key=key, match=qmodels.MatchValue(value=value))
        for key, value in requested
        if value
    ]
    if not conditions:
        return None
    return qmodels.Filter(must=conditions)
141
+
142
+
143
def _ensure_config(qdrant, collection: str) -> bool:
    """Add the 'lex' sparse vector config to `collection` if it is missing.

    Mirrors compat's _ensure_sparse_vector_config. Reads the collection
    config and, when SPARSE_VECTOR_NAME is not configured yet, calls
    `update_collection` with an IDF-modified, on-disk sparse index.

    Returns True when the config was added, False when it already existed.
    """
    from qdrant_client import models as qmodels

    params = qdrant.get_collection(collection).config.params
    configured = getattr(params, "sparse_vectors", None) or {}
    if SPARSE_VECTOR_NAME not in configured:
        lex_params = qmodels.SparseVectorParams(
            modifier=qmodels.Modifier.IDF,
            index=qmodels.SparseIndexParams(on_disk=True),
        )
        qdrant.update_collection(
            collection_name=collection,
            sparse_vectors_config={SPARSE_VECTOR_NAME: lex_params},
        )
        return True
    return False
163
+
164
+
165
def _normalize_offset(offset):
    """Return a scroll offset in a form that survives a JSON round trip.

    Integer ids stay integers, and all-digit strings (written by older
    state files that stringified every offset) are converted back to int;
    anything else is kept as a string. None / "" mean "no offset".
    """
    if offset is None or offset == "":
        return None
    if isinstance(offset, int):
        return offset
    text = str(offset)
    return int(text) if text.isascii() and text.isdigit() else text


def _resume_offset(state, args: argparse.Namespace, mode: str):
    """Return the saved offset only if it came from an equivalent run.

    The offset is a position in the scroll, so reusing one recorded under
    a different mode, collection or payload filter would silently skip
    every point before it. In that case the offset is ignored.
    """
    if not isinstance(state, dict):
        return None
    offset = _normalize_offset(state.get("next_offset"))
    if offset is None:
        return None
    current = {
        "mode": mode,
        "collection": args.collection,
        "arena": args.arena or None,
        "client": args.client or None,
    }
    stale = sorted(k for k, v in current.items() if state.get(k) != v)
    if stale:
        print(
            f"[backfill] ignoring saved offset {offset}: state file is from a run "
            f"with a different {', '.join(stale)}; starting from the beginning"
        )
        return None
    print(f"[backfill] resuming from offset {offset}")
    return offset


def run(args: argparse.Namespace) -> int:
    """Scroll the collection and, with --apply, write the 'lex' sparse vectors.

    Steps: report collection facts, optionally add the sparse config
    (--ensure-config), print the dry-run plan, then scroll in batches of
    `args.batch_size`. For each batch the full event content is fetched
    from Postgres (when a DSN is given), and with --apply it is encoded
    and written via `update_vectors`. The scroll offset is saved to
    `args.state_file` after every batch.

    Returns 0 on success, 1 if any batch failed to encode/write, and 2
    for unusable arguments (missing sparse config or missing DSN with
    --apply).
    """
    from qdrant_client import QdrantClient
    from qdrant_client import models as qmodels

    qdrant = QdrantClient(url=args.qdrant_url, timeout=60)
    flt = _build_filter(args.arena, args.client)
    mode = "apply" if args.apply else "dry-run"

    # Collection-level facts up front (counts include points outside the
    # filter; the filtered total is only known after a full scroll).
    info = qdrant.get_collection(args.collection)
    total_points = info.points_count or 0
    sparse_cfg = getattr(info.config.params, "sparse_vectors", None) or {}
    print(f"[backfill] collection={args.collection} points_count={total_points}")
    print(f"[backfill] sparse config present: {SPARSE_VECTOR_NAME in sparse_cfg}")

    if SPARSE_VECTOR_NAME not in sparse_cfg:
        if args.ensure_config:
            added = _ensure_config(qdrant, args.collection)
            print(f"[backfill] sparse config '{SPARSE_VECTOR_NAME}' added: {added}")
        elif args.apply:
            print(
                f"error: collection has no '{SPARSE_VECTOR_NAME}' sparse vector config. "
                "Start compat once with SEARCH_HYBRID_ENABLED=1 or pass --ensure-config.",
                file=sys.stderr,
            )
            return 2

    if not args.apply:
        eta = eta_seconds(total_points, args.eta_rate)
        print(
            f"[backfill] DRY-RUN: would scroll ~{total_points} points "
            f"in {batch_count(total_points, args.batch_size)} batches of {args.batch_size}; "
            f"ETA ~{format_eta(eta)} at {args.eta_rate:.0f} pts/s encode rate"
        )

    if args.apply and not args.pg_dsn:
        print("error: --pg-dsn (or $PG_DSN) required with --apply", file=sys.stderr)
        return 2

    # Postgres is only needed when a DSN was supplied; a DSN-less dry-run
    # must not require the driver to be installed.
    pg = None
    if args.pg_dsn:
        import psycopg
        import psycopg.rows

        # autocommit: this connection only reads. Without it the first
        # SELECT would open a transaction that stays open for the whole run.
        pg = psycopg.connect(
            args.pg_dsn, autocommit=True, row_factory=psycopg.rows.dict_row
        )

    try:
        encoder = None
        if args.apply:
            from fastembed import SparseTextEmbedding

            encoder = SparseTextEmbedding(model_name=SPARSE_MODEL_NAME)

        state = {} if args.reset_state else load_state(args.state_file)
        offset = _resume_offset(state, args, mode)

        scanned = 0
        written = 0
        skipped_existing = 0
        missing_content = 0
        errors = 0
        skip_existing = args.apply and not args.force
        t0 = time.monotonic()

        while True:
            points, next_offset = qdrant.scroll(
                collection_name=args.collection,
                scroll_filter=flt,
                limit=args.batch_size,
                offset=offset,
                with_payload=["event_id"],
                # --apply additionally pulls the (tiny) existing 'lex'
                # vector so already-backfilled points are skipped on
                # resume. The dense vector is NEVER fetched.
                with_vectors=[SPARSE_VECTOR_NAME] if skip_existing else False,
            )
            if not points:
                break

            batch = []  # (point_id, event_id)
            for p in points:
                scanned += 1
                eid = (p.payload or {}).get("event_id")
                if not eid:
                    missing_content += 1
                    continue
                if skip_existing:
                    vecs = p.vector if isinstance(p.vector, dict) else {}
                    if vecs and SPARSE_VECTOR_NAME in vecs:
                        skipped_existing += 1
                        continue
                batch.append((p.id, eid))

            # Bound on every iteration so the write step below never sees
            # an unbound or stale list when this batch needs no lookup.
            todo = []  # (point_id, full content)
            if batch and pg is not None:
                event_ids = list({eid for _, eid in batch})
                with pg.cursor() as cur:
                    cur.execute(
                        "SELECT id, content FROM events WHERE id = ANY(%s)",
                        (event_ids,),
                    )
                    content_by_id = {r["id"]: r["content"] for r in cur.fetchall()}

                for pid, eid in batch:
                    content = content_by_id.get(eid)
                    if not content:
                        missing_content += 1
                        continue
                    todo.append((pid, content))

            if args.apply and todo:
                try:
                    embs = list(encoder.embed([c for _, c in todo]))
                    qdrant.update_vectors(
                        collection_name=args.collection,
                        points=[
                            qmodels.PointVectors(
                                id=pid,
                                vector={
                                    SPARSE_VECTOR_NAME: qmodels.SparseVector(
                                        indices=[int(i) for i in e.indices],
                                        values=[float(v) for v in e.values],
                                    )
                                },
                            )
                            for (pid, _), e in zip(todo, embs)
                        ],
                        wait=True,
                    )
                    written += len(todo)
                except Exception as e:
                    # Best-effort per batch: count the failure and keep going.
                    errors += 1
                    print(f"[backfill] batch ERROR at offset {offset}: {e}", file=sys.stderr)

            offset = next_offset
            save_state(args.state_file, {
                # Keep the id's JSON-native type: stringifying an integer
                # point id would make it unusable as a scroll offset on resume.
                "next_offset": _normalize_offset(offset),
                "scanned": scanned,
                "written": written,
                "mode": mode,
                "collection": args.collection,
                "arena": args.arena or None,
                "client": args.client or None,
            })

            if scanned and scanned % (args.batch_size * 20) == 0:
                rate = scanned / max(time.monotonic() - t0, 1e-6)
                remaining = max(total_points - scanned, 0)
                print(
                    f"[backfill] scanned={scanned} written={written} "
                    f"skipped={skipped_existing} missing_content={missing_content} "
                    f"rate={rate:.0f}/s eta={format_eta(remaining / max(rate, 1e-6))}"
                )

            if args.max_points and scanned >= args.max_points:
                print(f"[backfill] --max-points {args.max_points} reached; stopping")
                break
            if next_offset is None:
                break

        dur = time.monotonic() - t0
        print(
            f"[backfill] DONE mode={mode} "
            f"scanned={scanned} written={written} skipped_existing={skipped_existing} "
            f"missing_content={missing_content} errors={errors} in {format_eta(dur)}"
        )
        if not args.apply:
            print("[backfill] dry-run only; pass --apply to write the 'lex' vectors")
        if errors:
            # The saved offset has already moved past the failed batches.
            print(
                "[backfill] some batches failed; re-run with --reset-state to retry "
                "them (points that already carry 'lex' are skipped)",
                file=sys.stderr,
            )
        return 1 if errors else 0
    finally:
        # Release the connection on every exit path, including exceptions.
        if pg is not None:
            pg.close()
332
+
333
+
334
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the backfill command-line options.

    `argv` defaults to ``sys.argv[1:]`` (argparse's behaviour for None).
    The Qdrant URL and Postgres DSN fall back to the $VECTOR_INDEX_URL
    and $PG_DSN environment variables respectively.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    env = os.environ
    option = parser.add_argument

    # Connection targets.
    option("--qdrant-url", default=env.get("VECTOR_INDEX_URL", "http://127.0.0.1:16333"))
    option(
        "--pg-dsn",
        default=env.get("PG_DSN"),
        help="postgres DSN for full event content; defaults to $PG_DSN",
    )
    option("--collection", default=DEFAULT_COLLECTION)

    # What to do and which points to touch.
    option("--apply", action="store_true", help="write; default is dry-run")
    option(
        "--ensure-config",
        action="store_true",
        help="idempotently add the 'lex' sparse vector config if missing",
    )
    option("--arena", default=None, help="optional arena payload filter")
    option("--client", default=None, help="optional clientId payload filter")

    # Batching, resume state and planning knobs.
    option("--batch-size", type=int, default=256)
    option("--state-file", default=".backfill_sparse_vectors.state.json")
    option("--reset-state", action="store_true")
    option(
        "--force",
        action="store_true",
        help="re-write 'lex' even where it already exists",
    )
    option(
        "--max-points",
        type=int,
        default=0,
        help="stop after scanning N points (smoke runs)",
    )
    option(
        "--eta-rate",
        type=float,
        default=DEFAULT_ENCODE_RATE_PER_SEC,
        help="points/sec planning figure for the dry-run ETA",
    )
    return parser.parse_args(argv)
358
+
359
+
360
def main() -> int:
    """CLI entry point: parse arguments, validate them, and run the backfill.

    Returns the process exit code: 2 when --batch-size is not positive,
    otherwise whatever `run` returns.
    """
    args = parse_args()
    if args.batch_size > 0:
        return run(args)
    print("error: --batch-size must be > 0", file=sys.stderr)
    return 2
366
+
367
+
368
+ if __name__ == "__main__":
369
+ sys.exit(main())