@pentatonic-ai/ai-agent-sdk 0.10.19 → 0.10.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +122 -8
- package/packages/memory-engine-v2/compat/server.py +18 -4
- package/packages/memory-engine-v2/extractor-async/test_email_alias_guard.py +78 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +52 -0
- package/packages/memory-engine-v2/scripts/build_retrain_corpus.py +240 -0
- package/packages/memory-engine-v2/scripts/fusion_defrag.py +440 -0
- package/packages/memory-engine-v2/scripts/redistill.py +236 -0
package/dist/index.cjs
CHANGED
|
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
// src/telemetry.js
|
|
881
|
-
var VERSION = "0.10.19";
|
|
881
|
+
var VERSION = "0.10.20";
|
|
882
882
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
883
883
|
function machineId() {
|
|
884
884
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/dist/index.js
CHANGED
|
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
847
847
|
}
|
|
848
848
|
|
|
849
849
|
// src/telemetry.js
|
|
850
|
-
var VERSION = "0.10.19";
|
|
850
|
+
var VERSION = "0.10.20";
|
|
851
851
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
852
852
|
function machineId() {
|
|
853
853
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pentatonic-ai/ai-agent-sdk",
|
|
3
|
-
"version": "0.10.19",
|
|
3
|
+
"version": "0.10.20",
|
|
4
4
|
"description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -1,10 +1,18 @@
|
|
|
1
1
|
# RFC: the Fusion Drive — v2 memory self-healing (cross-run node fusion + decay)
|
|
2
2
|
|
|
3
3
|
> **Fusion Drive** = the continuous, arena-scoped background engine that keeps the v2
|
|
4
|
-
> memory graph self-healing
|
|
5
|
-
>
|
|
6
|
-
>
|
|
7
|
-
>
|
|
4
|
+
> memory graph self-healing. It triages every node into one of **three** outcomes:
|
|
5
|
+
> it *fuses* duplicate/near-duplicate nodes from different distillation runs into a single
|
|
6
|
+
> master node (horizontal convergence); it *re-distills* high-value extractions produced by
|
|
7
|
+
> a superseded teacher/prompt — regenerating them from the still-present source event through
|
|
8
|
+
> the current clean teacher (depth refresh); and it *decays* stale, low-value, and junk nodes
|
|
9
|
+
> out of existence (vertical aging). Named for the drive that does the fusing — the re-distill
|
|
10
|
+
> and decay passes ride the same engine.
|
|
11
|
+
>
|
|
12
|
+
> *(Revised 2026-06-22: added Part B′ — re-distillation — as the third triage verb, with the
|
|
13
|
+
> prompt-version-drift trigger. Motivated by the clean-prompt deploy (SDK 0.10.19, #126 +
|
|
14
|
+
> #129) which made "the current teacher is materially better than the one that produced most
|
|
15
|
+
> of the graph" concrete and measurable via `system_prompt_hash`.)*
|
|
8
16
|
|
|
9
17
|
**Status:** draft / spec — 2026-06-12
|
|
10
18
|
**Builds on:** `RFC-entity-reconciliation.md`, `scripts/entity_resolution_v2.py` (#82),
|
|
@@ -139,15 +147,101 @@ sparse backfill.
|
|
|
139
147
|
|
|
140
148
|
---
|
|
141
149
|
|
|
150
|
+
## Part B′ — Re-distillation: regenerate stale-prompt extractions from source
|
|
151
|
+
|
|
152
|
+
Fusion (A) needs a *correct counterpart* to converge toward; Decay (B) just *deletes*. But
|
|
153
|
+
the common case after a teacher/prompt upgrade is a **high-value node with no correct
|
|
154
|
+
counterpart yet** — the only extraction that exists is the stale-prompt one. Fusion has
|
|
155
|
+
nothing to fuse to; decay would throw away real information. The cure is the third verb: the
|
|
156
|
+
**source event still exists** (`events` table, 376k rows live), so regenerate the extraction
|
|
157
|
+
by re-running that event through the *current clean teacher*. Fusion converges horizontally,
|
|
158
|
+
decay ages vertically; re-distill refreshes **in depth**.
|
|
159
|
+
|
|
160
|
+
### B′1. Trigger — prompt-version drift, not raw age
|
|
161
|
+
The defect population is *exactly* the facts/entities whose provenance traces an **old
|
|
162
|
+
`system_prompt_hash`** — `bbdaba6b…` / `f1e0ff55…` / `ef0647c7…` (pre-clean), vs the clean
|
|
163
|
+
`6ccfe70f…` deployed with 0.10.19 (#126 modality/attribution + #129 email-discipline &
|
|
164
|
+
entity-separation). #118 propagated source onto facts, so provenance → the event's
|
|
165
|
+
`distillation_traces.system_prompt_hash` is queryable. **Age is a weak proxy; prompt-version
|
|
166
|
+
selects the defect set directly** — a months-old node the clean teacher would extract
|
|
167
|
+
identically needs nothing; a two-day-old node from the dirty prompt is a defect. Prioritize
|
|
168
|
+
by `salience` (B1) so high-value stale nodes go first.
|
|
169
|
+
|
|
170
|
+
### B′2. Triage routing — 3-way, by salience × prompt-version
|
|
171
|
+
Per assessed node/event:
|
|
172
|
+
|
|
173
|
+
| condition | outcome |
|
|
174
|
+
|---|---|
|
|
175
|
+
| stale prompt-hash **+** high salience **+** source event present | **re-distill** (this part) |
|
|
176
|
+
| has a correct newer-teacher counterpart in the arena | **fuse** (Part A) |
|
|
177
|
+
| low salience, junk-born (B2), no corroboration, never accessed | **decay** (Part B) |
|
|
178
|
+
|
|
179
|
+
### B′3. Mechanism — re-enqueue, don't mutate in place
|
|
180
|
+
Re-distill = re-insert the source `event_id` into `distillation_queue` (`status='pending'`,
|
|
181
|
+
`attempts=0`). The existing **extractor-async** worker claims it, runs the clean teacher,
|
|
182
|
+
writes the new extraction **and a fresh `6ccfe70f` trace**. No new pipeline — it reuses the
|
|
183
|
+
distiller, the combined-demand **autoscaler**, and the trace ledger. (Re-distill is a
|
|
184
|
+
*producer* of queue demand; the autoscaler's student-aware floor already keeps a teacher box
|
|
185
|
+
warm for it — see the deploy notes.)
|
|
186
|
+
|
|
187
|
+
### B′4. Supersedence — the load-bearing requirement
|
|
188
|
+
The store is **pure-accretion** (the whole motivation of this RFC). A naive re-enqueue makes
|
|
189
|
+
the clean extraction land **beside** the dirty one → it *worsens* fragmentation. So
|
|
190
|
+
re-distill MUST close the loop through Fusion's tombstone machinery — it is **sequenced into
|
|
191
|
+
the Fusion Drive, not bolted on**:
|
|
192
|
+
|
|
193
|
+
1. Each re-distill is recorded in a `redistill_runs` ledger with its triggering
|
|
194
|
+
`(event_id, old_prompt_hash)`.
|
|
195
|
+
2. When the clean extraction completes, **Fusion converges old ↔ new for that event** using
|
|
196
|
+
the teacher-version master signal (A2/A3): the new `6ccfe70f` extraction wins as master;
|
|
197
|
+
the old extraction's now-orphaned nodes (those whose **only** provenance was this event
|
|
198
|
+
under the old hash) are tombstoned/repointed via `entity_merges` / `fact_merges`.
|
|
199
|
+
3. Where an old node carries **other live provenance** (multi-event corroboration), only this
|
|
200
|
+
event's contribution is repointed — **never blind-delete a multi-source node** (the
|
|
201
|
+
over-merge failure mode: a hotel email wrongly attached to a person must not let one
|
|
202
|
+
event's repoint nuke an otherwise-corroborated node).
|
|
203
|
+
|
|
204
|
+
This dependency is hard: **re-distill is unsafe until Fusion's cross-run / teacher-version
|
|
205
|
+
master selection (E3) is live.** Until then a re-distill loop accretes. An interim cheaper
|
|
206
|
+
option (Open Q): explicit **event-scoped supersede** — delete only the facts/entities whose
|
|
207
|
+
provenance set is exactly `{this event}` under the old hash before re-enqueue — covers the
|
|
208
|
+
single-provenance majority without the full fusion adjudicator.
|
|
209
|
+
|
|
210
|
+
### B′5. Corpus-as-byproduct — one loop, three wins
|
|
211
|
+
Every re-distill emits a clean `6ccfe70f` `distillation_trace`. A prompt-version-drift
|
|
212
|
+
re-distill loop therefore **builds the student retrain corpus while it repairs the graph**
|
|
213
|
+
(`scripts/build_retrain_corpus.py` consumes those traces). It subsumes the one-shot full
|
|
214
|
+
re-distill: gradual, rate-limited, no nuke — graph repair **+** corpus **+** self-healing
|
|
215
|
+
from a single engine. This is the durable answer to "is the corpus building?": it is, as a
|
|
216
|
+
side effect of the gardener.
|
|
217
|
+
|
|
218
|
+
### B′6. Cadence + cost + safety
|
|
219
|
+
Rolling, rate-limited, autoscaler-aware, off-peak. Budget *N* events/hour against teacher
|
|
220
|
+
capacity; order by `salience × staleness`. **Never big-bang the full backlog** — gradual
|
|
221
|
+
migration is the point. Arena-scoped, dry-run → `--apply`, `redistill_runs` ledger for
|
|
222
|
+
observability and rollback. Same operational shape as fusion/decay/autoscaler.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
142
226
|
## Part C — Ordering & how they combine
|
|
143
227
|
|
|
144
|
-
Per arena, on schedule: **(1)
|
|
145
|
-
|
|
146
|
-
|
|
228
|
+
Per arena, on schedule: **(1) triage → re-distill the high-value stale-prompt set (async via
|
|
229
|
+
the queue) → (2) fusion → (3) decay.** Re-distill is enqueued first so that by the time
|
|
230
|
+
fusion runs, the clean counterpart exists for it to crown as master (else fusion has only
|
|
231
|
+
stale renderings to choose between). Fusion then absorbs each master's duplicates'
|
|
232
|
+
provenance/salience *before* decay judges it (else a real node split across two weak dupes
|
|
233
|
+
could wrongly decay out). Then decay ages + evicts the survivors.
|
|
234
|
+
|
|
235
|
+
*(Re-distill is asynchronous — it completes on the teacher's schedule — so in practice a
|
|
236
|
+
node re-distilled in this pass is fused/decayed in the **next** per-arena pass, once its
|
|
237
|
+
clean trace + extraction have landed. The ledger links the two.)*
|
|
147
238
|
|
|
148
239
|
**This is what finally cures immortal pollution:**
|
|
149
240
|
- 7B polluted node *with* a correct Qwen3.6 counterpart → **fused**, correct one as master,
|
|
150
241
|
polluted demoted to alias / tombstoned.
|
|
242
|
+
- stale-prompt node, *high-value*, *no* correct counterpart, source event present →
|
|
243
|
+
**re-distilled** through the clean teacher → new master extraction; old superseded via
|
|
244
|
+
fusion (B′4). The information is *recovered*, not lost.
|
|
151
245
|
- 7B pure-junk node with *no* correct counterpart (numeric-ID-person, ungrounded) → born-low
|
|
152
246
|
salience + no corroboration + never accessed → **decays out and is evicted**.
|
|
153
247
|
|
|
@@ -165,8 +259,15 @@ reset, but no longer the *only* path).
|
|
|
165
259
|
- `relationships`: `+ salience REAL`, `+ last_accessed` (already has `weight`,
|
|
166
260
|
`first/last_seen`).
|
|
167
261
|
- new `fact_merges` audit (mirror `entity_merges` incl. `rollback_payload`).
|
|
168
|
-
- new `fusion_runs` + `decay_runs` ledgers for observability.
|
|
262
|
+
- new `fusion_runs` + `decay_runs` + `redistill_runs` ledgers for observability. `redistill_runs`:
|
|
263
|
+
`(id, arena, event_id, old_prompt_hash, new_prompt_hash, salience_at_trigger, enqueued_at,
|
|
264
|
+
completed_at, fused_at, mode)` — links a re-distill to its triggering node and to the fusion
|
|
265
|
+
that superseded the old extraction.
|
|
169
266
|
- `/search` gains a `last_accessed = NOW()` bump on returned nodes (batched).
|
|
267
|
+
- re-distill trigger needs provenance → prompt-version: either denormalize `system_prompt_hash`
|
|
268
|
+
onto `facts`/`entities` at write time (cheap filter), or join through
|
|
269
|
+
`distillation_traces(event_id → system_prompt_hash)` on the provenance event ids (no schema
|
|
270
|
+
change, costlier query). Prefer the join until the trigger volume justifies denormalizing.
|
|
170
271
|
|
|
171
272
|
## Part E — Rollout (each flag-gated, arena-scoped, dry-run-first, audited)
|
|
172
273
|
|
|
@@ -176,6 +277,13 @@ reset, but no longer the *only* path).
|
|
|
176
277
|
3. **Fusion extension** — scored canonical selection (fix typo-crowning) + cross-run
|
|
177
278
|
detection + fact fusion, dry-run → apply.
|
|
178
279
|
4. **Online/continuous** — wire fusion+decay to run after distillation per arena.
|
|
280
|
+
5. **Re-distill loop (Part B′)** — dry-run triage first (count stale-prompt nodes by
|
|
281
|
+
`system_prompt_hash` × salience bucket to size the work), then a **bounded `--apply` slice**
|
|
282
|
+
on one curated arena (re-enqueue + verify clean trace + verify fusion supersedes the old
|
|
283
|
+
extraction), then wire continuous. **Gated on step 3** (Fusion cross-run / teacher-version
|
|
284
|
+
master selection): until that's live, re-distill must use the interim **event-scoped
|
|
285
|
+
supersede** (B′4) or it accretes. Ships as `scripts/redistill.py` (dry-run default,
|
|
286
|
+
`--apply` gate, arena-scoped, `redistill_runs` ledger).
|
|
179
287
|
|
|
180
288
|
## Open questions
|
|
181
289
|
- Half-life constants per category — needs a calibration pass against real arenas.
|
|
@@ -183,3 +291,9 @@ reset, but no longer the *only* path).
|
|
|
183
291
|
- Directory authority source for canonical anchoring — HubSpot contacts? a curated table?
|
|
184
292
|
- Interaction with the (still-open) source_id supersede mode — fusion partly subsumes it,
|
|
185
293
|
but explicit supersede is cheaper for known-mutable sources.
|
|
294
|
+
- **Re-distill supersedence before full fusion is live** — is event-scoped supersede (delete
|
|
295
|
+
only nodes whose provenance set is exactly `{this event}` under the old hash) a safe enough
|
|
296
|
+
interim, or do we hard-gate the loop on E3? Single-provenance nodes are the majority, but
|
|
297
|
+
the multi-provenance tail is where the over-merge risk concentrates.
|
|
298
|
+
- **Re-distill prioritization** — pure `salience × staleness`, or weight toward the entities
|
|
299
|
+
behind known user-visible confabulations (Vickers/Boedecker) first?
|
|
@@ -896,6 +896,8 @@ class GraphQueryRequest(BaseModel):
|
|
|
896
896
|
entity_type: str | None = None
|
|
897
897
|
name: str | None = None # canonical_name (ILIKE)
|
|
898
898
|
subject: str | None = None # entity name OR canonical_name (facts.subject_entity)
|
|
899
|
+
subject_entity_id: str | None = None # EXACT facts.subject_entity_id — strict, no name bleed
|
|
900
|
+
object_entity_id: str | None = None # EXACT facts.object_entity_id
|
|
899
901
|
predicate: str | None = None
|
|
900
902
|
category: str | None = None # facts.category
|
|
901
903
|
from_name: str | None = None # relationships.from_entity.canonical_name
|
|
@@ -944,9 +946,14 @@ async def list_entities(req: GraphQueryRequest):
|
|
|
944
946
|
|
|
945
947
|
@app.post("/facts")
|
|
946
948
|
async def list_facts(req: GraphQueryRequest):
|
|
947
|
-
"""Filter facts by arena + optional category/predicate +
|
|
948
|
-
|
|
949
|
-
subject_entity_id
|
|
949
|
+
"""Filter facts by arena + optional category/predicate + subject.
|
|
950
|
+
|
|
951
|
+
PREFER `subject_entity_id` (exact id match) over `subject` (name ILIKE):
|
|
952
|
+
name matching bleeds one person's facts into another's answer when names
|
|
953
|
+
collide or fragment (the Will Vickers ⟵ Will Spencer confabulation — a
|
|
954
|
+
query resolved to one entity must NOT pull a same/similar-named entity's
|
|
955
|
+
facts). The name path is kept for back-compat callers that haven't resolved
|
|
956
|
+
an id yet, but entity-id is the strict, bleed-free path."""
|
|
950
957
|
arenas = _resolve_arenas(req)
|
|
951
958
|
conditions = ["f.arena = ANY(%s)"]
|
|
952
959
|
params: list[Any] = [arenas]
|
|
@@ -956,7 +963,14 @@ async def list_facts(req: GraphQueryRequest):
|
|
|
956
963
|
if req.predicate:
|
|
957
964
|
conditions.append("f.predicate ILIKE %s")
|
|
958
965
|
params.append(f"%{req.predicate}%")
|
|
959
|
-
if req.subject:
|
|
966
|
+
if req.subject_entity_id:
|
|
967
|
+
conditions.append("f.subject_entity_id = %s")
|
|
968
|
+
params.append(req.subject_entity_id)
|
|
969
|
+
if req.object_entity_id:
|
|
970
|
+
conditions.append("f.object_entity_id = %s")
|
|
971
|
+
params.append(req.object_entity_id)
|
|
972
|
+
# Name path: only when no exact id was given (back-compat / unresolved callers).
|
|
973
|
+
if req.subject and not req.subject_entity_id:
|
|
960
974
|
conditions.append("EXISTS (SELECT 1 FROM entities e WHERE e.id = f.subject_entity_id AND (e.canonical_name ILIKE %s OR %s = ANY(e.aliases)))")
|
|
961
975
|
params.extend([f"%{req.subject}%", req.subject])
|
|
962
976
|
sql = f"""
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Unit tests for the email-alias guard (_email_plausibly_belongs).
|
|
2
|
+
|
|
3
|
+
Pins the live pollution case (the "Johann Boedecker" node, 2026-06-22): keep the
|
|
4
|
+
person's own addresses; drop the bystander emails (a hotel, newsletters, unrelated
|
|
5
|
+
gmails) the LLM stapled on from co-occurring documents.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import importlib.util
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
_THIS = Path(__file__).resolve().parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load(name="extractor_async_worker_aliasguard"):
    """Import the sibling ``worker.py`` by file path and return the module.

    The module is registered in ``sys.modules`` under *name* before it is
    executed, and that entry is removed again if execution fails so a
    half-initialised module is never left behind.

    Raises:
        ImportError: if no import spec/loader can be built for ``worker.py``,
            or if executing the module raises ImportError itself (e.g. a
            missing dependency). Any other exception raised while executing
            the module propagates unchanged.
    """
    path = _THIS / "worker.py"
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        # Surface this as ImportError (not AttributeError on None) so callers
        # that treat ImportError as "deps unavailable" keep working.
        raise ImportError(f"cannot build an import spec for {path}")
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a partially executed module importable by name.
        sys.modules.pop(name, None)
        raise
    return mod
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
try:
    worker = _load()
except ImportError as e:
    # worker.py could not be imported: skip the whole module instead of
    # failing collection.
    pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)


def belongs(n, e):
    """Shorthand for ``worker._email_plausibly_belongs(name, email)``."""
    return worker._email_plausibly_belongs(n, e)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ── KEEP: the person's own addresses ─────────────────────────────────────
|
|
36
|
+
@pytest.mark.parametrize(
    "email",
    [
        "johann@pentatonic.com",
        "johann.boedecker@pentatonic.com",
        "boedeckerjohann@gmail.com",
        "JOHANN@pentatonic.com",  # case-insensitive
        "jb@pentatonic.com",  # initials
        "j.boedecker@pentatonic.com",  # surname token
    ],
)
def test_keeps_owner_emails(email):
    """Every listed address must be kept for the person "Johann Boedecker"."""
    verdict = belongs("Johann Boedecker", email)
    assert verdict is True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ── DROP: the actual bystander emails found on the live Johann node ──────
|
|
49
|
+
@pytest.mark.parametrize(
    "email",
    [
        "reservations.nyc@acehotel.com",
        "marketingadmin@sustainablebrands.com",
        "martinvasquez87@gmail.com",
        "schwaabd@yahoo.de",
        "cvanderlip@redish.com",
        "leechihshan33@gmail.com",
    ],
)
def test_drops_bystander_emails(email):
    """Every listed address must be rejected for the person "Johann Boedecker"."""
    verdict = belongs("Johann Boedecker", email)
    assert verdict is False
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ── edges ────────────────────────────────────────────────────────────────
|
|
62
|
+
def test_initials_either_order():
    """Initials are accepted in reversed order as well ("bj" for J. B.)."""
    reversed_initials = "bj@pentatonic.com"  # reversed initials
    assert belongs("Johann Boedecker", reversed_initials) is True
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_no_usable_name_does_not_overfilter():
    """A name with nothing to check against must keep the email, not strip it."""
    probe = "anything@x.com"
    # empty name → no tokens at all
    assert belongs("", probe) is True
    # single letter < 2 → no tokens
    assert belongs("J", probe) is True
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_surname_only_person_keeps_surname_email():
    """A single-token name keeps an email containing it and drops one that doesn't."""
    person = "Vickers"
    kept = belongs(person, "will.vickers@vickers-oil.com")
    dropped = belongs(person, "reservations.nyc@acehotel.com")
    assert kept is True
    assert dropped is False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_guard_flag_default_on():
    """The EMAIL_ALIAS_GUARD flag on the loaded worker module must be True."""
    flag = worker.EMAIL_ALIAS_GUARD
    assert flag is True
|
|
@@ -1253,6 +1253,43 @@ def org_node_id_key(entity_type: str, name: str, stamped_domain: str | None) ->
|
|
|
1253
1253
|
return name
|
|
1254
1254
|
|
|
1255
1255
|
|
|
1256
|
+
# --------------------------------------------------------------------
|
|
1257
|
+
# Email-alias guard — stop bystander emails polluting a person
|
|
1258
|
+
# --------------------------------------------------------------------
|
|
1259
|
+
# The async LLM pass sometimes emits a PERSON entity whose `email` is a BYSTANDER
|
|
1260
|
+
# address co-occurring in the same doc/thread (a hotel booking, a newsletter, an
|
|
1261
|
+
# unrelated gmail). _parse_guided_json promotes it into the entity's aliases and
|
|
1262
|
+
# upsert_entities then stores + RESOLVES on it — folding strangers' identities
|
|
1263
|
+
# (and their facts) onto the person. Measured live (pentatonic-team): a "Johann
|
|
1264
|
+
# Boedecker" node carrying reservations.nyc@acehotel.com + unrelated gmails, all
|
|
1265
|
+
# from STUDENT-distilled `doc` events. This guard keeps an email alias on a person
|
|
1266
|
+
# only when its local-part plausibly relates to the person's name; clear bystanders
|
|
1267
|
+
# are dropped BEFORE resolution/storage. Conservative: dropping a genuine but
|
|
1268
|
+
# non-name-matching alias is a mild loss; keeping a bystander is a confabulation
|
|
1269
|
+
# source. Flag-revertible (EMAIL_ALIAS_GUARD=false). Persons only — org domain
|
|
1270
|
+
# stamping is untouched.
|
|
1271
|
+
EMAIL_ALIAS_GUARD = _envflag("EMAIL_ALIAS_GUARD", "true")
|
|
1272
|
+
_ALIAS_NONALPHA = re.compile(r"[^a-z]")
|
|
1273
|
+
_ALIAS_SPLIT = re.compile(r"[^a-z]+")
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
def _email_plausibly_belongs(person_name: str, email: str) -> bool:
    """Decide whether `email` may stay as an alias of `person_name`.

    Returns True (keep) when any of these hold, otherwise False (drop):
      * there is nothing to compare — the name yields no token of 2+ letters,
        or the email's local-part contains no letters;
      * a name token occurs as a substring of the local-part's letters;
      * the local-part's letters equal the person's initials, in either order.

    Comparison is case-insensitive and only looks at a-z letters of the part
    before the first "@".
    """
    lowered_name = person_name.lower()
    local_part = email.split("@", 1)[0].lower()
    letters_only = _ALIAS_NONALPHA.sub("", local_part)
    tokens = {tok for tok in _ALIAS_SPLIT.split(lowered_name) if len(tok) >= 2}

    # nothing to check against — don't over-filter
    if not (tokens and letters_only):
        return True

    # johann@…, johann.boedecker@…, boedeckerjohann@…
    for tok in tokens:
        if tok in letters_only:
            return True

    # jb@… / bj@… for "Johann Boedecker"
    initials = "".join(
        word[0] for word in lowered_name.split() if word[:1].isalpha()
    )
    if len(initials) < 2:
        return False
    return letters_only == initials or letters_only == initials[::-1]
|
|
1291
|
+
|
|
1292
|
+
|
|
1256
1293
|
def upsert_entities(
|
|
1257
1294
|
conn: psycopg.Connection,
|
|
1258
1295
|
arena: str,
|
|
@@ -1345,6 +1382,21 @@ def upsert_entities(
|
|
|
1345
1382
|
continue
|
|
1346
1383
|
aliases = [a for a in (e.get("aliases") or []) if a]
|
|
1347
1384
|
|
|
1385
|
+
# Email-alias guard (persons only): drop bystander emails the LLM
|
|
1386
|
+
# stapled on from a co-occurring doc/thread, BEFORE they reach
|
|
1387
|
+
# resolution or storage. See _email_plausibly_belongs.
|
|
1388
|
+
if EMAIL_ALIAS_GUARD and etype == "person" and aliases:
|
|
1389
|
+
kept = []
|
|
1390
|
+
for a in aliases:
|
|
1391
|
+
if "@" in a and " " not in a and not _email_plausibly_belongs(name, a):
|
|
1392
|
+
log.info(
|
|
1393
|
+
f"alias-guard: dropped bystander email {a!r} from "
|
|
1394
|
+
f"person {name!r} (arena={arena})"
|
|
1395
|
+
)
|
|
1396
|
+
continue
|
|
1397
|
+
kept.append(a)
|
|
1398
|
+
aliases = kept
|
|
1399
|
+
|
|
1348
1400
|
# Hard-key stamps for THIS entity, merged onto the node's attributes
|
|
1349
1401
|
# and (for domain) into the resolution aliases. Adding domain to
|
|
1350
1402
|
# aliases before forms are computed is deliberate — that's what makes
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Build a student retrain corpus from CLEAN teacher gold.
|
|
3
|
+
|
|
4
|
+
The student (NuExtract-2.0-4B FT) was originally trained on teacher traces
|
|
5
|
+
produced under the *old* distiller prompts (bbdaba / f1e0ff), which had no
|
|
6
|
+
email-discipline or modality rules — so the student learned to (a) promote
|
|
7
|
+
bystander emails into a person's aliases (the Johann/hotel over-merge), (b)
|
|
8
|
+
collapse future/invited roles to established `state` facts, (c) conflate
|
|
9
|
+
"X & Y" into one entity, and (d) mint email-named / generic-infra entities.
|
|
10
|
+
|
|
11
|
+
This builder draws ONLY from traces produced under the clean prompt (#126
|
|
12
|
+
modality/attribution + #129 email-discipline & entity-separation), whose
|
|
13
|
+
`system_prompt_hash` is the deployed clean hash. Building from old-prompt
|
|
14
|
+
traces would just re-teach the defects, so that is explicitly NOT the default
|
|
15
|
+
(you must pass --allow-dirty-hash to override, and you shouldn't).
|
|
16
|
+
|
|
17
|
+
A defect filter runs as a SECOND line of defence: even clean-prompt output is
|
|
18
|
+
screened for the known defect signatures and dropped if any survive. Every
|
|
19
|
+
drop is counted by reason so the corpus's cleanliness is auditable (not a
|
|
20
|
+
black box — see the printed report).
|
|
21
|
+
|
|
22
|
+
INPUT — an NDJSON stream of trace rows, one object per line:
|
|
23
|
+
{"event_id": "...", "user_prompt": "...", "raw_response": "...",
|
|
24
|
+
"system_prompt_hash": "..."}
|
|
25
|
+
Produce it from the engine box's org_model DB with row_to_json (the escaping
|
|
26
|
+
that bit us before — \\copy double-escapes, $-quoting gets eaten by the shell —
|
|
27
|
+
is avoided by -At + row_to_json):
|
|
28
|
+
|
|
29
|
+
sudo docker exec -i pme2-org-model psql -U pme -d org_model -At -c \\
|
|
30
|
+
"SELECT row_to_json(t) FROM (
|
|
31
|
+
SELECT event_id, user_prompt, raw_response, system_prompt_hash
|
|
32
|
+
FROM distillation_traces
|
|
33
|
+
WHERE system_prompt_hash = '6ccfe70f1286a131'
|
|
34
|
+
) t" > traces.ndjson
|
|
35
|
+
|
|
36
|
+
OUTPUT — {"input": <per-event block>, "output": <extraction JSON string>}
|
|
37
|
+
JSONL(.gz), the exact shape train_lora.py's load() consumes (it keeps rows
|
|
38
|
+
where both `input` and `output` are truthy, then trains user=input ->
|
|
39
|
+
assistant=output via the NuExtract chat template; no system prompt in the
|
|
40
|
+
pair). The corpus is PER-EVENT while a trace is a 3-event chunk, so each
|
|
41
|
+
trace's user_prompt is split on the `[event K]` markers and matched to
|
|
42
|
+
raw_response[index == K].
|
|
43
|
+
|
|
44
|
+
Usage:
|
|
45
|
+
python build_retrain_corpus.py --traces traces.ndjson --out retrain_clean.jsonl.gz
|
|
46
|
+
zcat traces.ndjson.gz | python build_retrain_corpus.py --traces - --out c.jsonl.gz
|
|
47
|
+
"""
|
|
48
|
+
from __future__ import annotations
|
|
49
|
+
|
|
50
|
+
import argparse
|
|
51
|
+
import gzip
|
|
52
|
+
import hashlib
|
|
53
|
+
import json
|
|
54
|
+
import re
|
|
55
|
+
import sys
|
|
56
|
+
from collections import Counter
|
|
57
|
+
|
|
58
|
+
# The clean prompt deployed as SDK 0.10.19 (#126 + #129). Verify against the
|
|
59
|
+
# running extractor-async (worker.SYSTEM_PROMPT_HASH) before a real corpus cut —
|
|
60
|
+
# a prompt edit advances this and old-hash traces must not silently leak in.
|
|
61
|
+
CLEAN_PROMPT_HASH = "6ccfe70f1286a131"
|
|
62
|
+
|
|
63
|
+
# Generic infra / environment tokens that must never be standalone entities
|
|
64
|
+
# (mirrors the #129 DISTINCT ENTITIES rule — kept in sync by hand).
|
|
65
|
+
INFRA_TOKENS = {
|
|
66
|
+
"prod", "production", "staging", "stage", "uat", "qa", "dev", "test",
|
|
67
|
+
"warehouse", "datalake", "data lake", "cluster", "backend", "frontend",
|
|
68
|
+
"the system", "the platform", "the api", "the database", "the server",
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
EVENT_BLOCK_RE = re.compile(r"(?=^\[event \d+\])", re.MULTILINE)
|
|
72
|
+
EVENT_IDX_RE = re.compile(r"^\[event (\d+)\]")
|
|
73
|
+
EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _email_plausibly_belongs(person_name: str, email: str) -> bool:
|
|
77
|
+
"""Keep an email on a person only if it plausibly is theirs: a name token
|
|
78
|
+
appears in the local-part, or the initials match. Same heuristic as the
|
|
79
|
+
write-side guard (#128) so corpus filtering and runtime agree."""
|
|
80
|
+
local = email.split("@", 1)[0].lower()
|
|
81
|
+
local_alnum = re.sub(r"[^a-z0-9]", "", local)
|
|
82
|
+
tokens = [t for t in re.split(r"\s+", person_name.lower()) if t]
|
|
83
|
+
if not tokens:
|
|
84
|
+
return False
|
|
85
|
+
for t in tokens:
|
|
86
|
+
t_alnum = re.sub(r"[^a-z0-9]", "", t)
|
|
87
|
+
if len(t_alnum) >= 3 and t_alnum in local_alnum:
|
|
88
|
+
return True
|
|
89
|
+
initials = "".join(t[0] for t in tokens if t)
|
|
90
|
+
if len(initials) >= 2 and initials in local_alnum:
|
|
91
|
+
return True
|
|
92
|
+
return False
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _entity_defect(ent: dict) -> str | None:
|
|
96
|
+
"""Return a drop-reason if this entity carries a known defect, else None."""
|
|
97
|
+
name = (ent.get("name") or "").strip()
|
|
98
|
+
etype = (ent.get("type") or "").lower()
|
|
99
|
+
if not name:
|
|
100
|
+
return "empty_entity_name"
|
|
101
|
+
if EMAIL_RE.match(name):
|
|
102
|
+
return "email_as_entity"
|
|
103
|
+
if name.lower() in INFRA_TOKENS:
|
|
104
|
+
return "generic_infra_entity"
|
|
105
|
+
# Conflation: "Acme & Globex" / "Alice and Bob" smuggled into one node.
|
|
106
|
+
if re.search(r"\s&\s", name) or re.search(r"\b and \b", name.lower()):
|
|
107
|
+
return "conflated_entity"
|
|
108
|
+
if etype == "person":
|
|
109
|
+
for a in ent.get("aliases") or []:
|
|
110
|
+
if isinstance(a, str) and "@" in a and " " not in a \
|
|
111
|
+
and not _email_plausibly_belongs(name, a):
|
|
112
|
+
return "bystander_email_alias"
|
|
113
|
+
return None
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _output_is_clean(obj: dict) -> str | None:
|
|
117
|
+
"""Screen one per-event extraction object; return a drop-reason or None."""
|
|
118
|
+
if not isinstance(obj, dict):
|
|
119
|
+
return "output_not_object"
|
|
120
|
+
for ent in obj.get("entities") or []:
|
|
121
|
+
r = _entity_defect(ent)
|
|
122
|
+
if r:
|
|
123
|
+
return r
|
|
124
|
+
return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def split_events(user_prompt: str) -> dict[int, str]:
    """Map each `[event K]` index in a chunk prompt to that event's block text.

    The prompt is cut at every line that starts with an `[event K]` marker.
    Each piece is right-stripped; pieces that do not begin with a marker
    (such as any text before the first event) are ignored. If an index occurs
    more than once, the last block with that index is the one returned.
    """
    by_index: dict[int, str] = {}
    for piece in EVENT_BLOCK_RE.split(user_prompt):
        text = piece.rstrip()
        header = EVENT_IDX_RE.match(text)
        if header is None:
            continue
        by_index[int(header.group(1))] = text
    return by_index
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def main() -> int:
    """Build a gzipped JSONL retrain corpus from NDJSON trace rows.

    Pipeline, per trace row read from ``--traces`` (``-`` means stdin):

    1. Skip blank lines; drop rows that are not a JSON object.
    2. Unless ``--allow-dirty-hash``, keep only rows whose
       ``system_prompt_hash`` equals ``--hash``.
    3. Parse ``raw_response`` into per-event objects (a single object, a list,
       or ``{"events": [...]}``).
    4. Pair each object with its event block from ``user_prompt`` (by
       ``index``, or by position when there is exactly one of each).
    5. Drop duplicates (SHA-1 of the block text) and objects flagged by
       ``_output_is_clean``.

    Kept examples are written to ``--out`` as ``{"input", "output"}`` lines. A
    stats report is printed to stdout and optionally written to ``--report``.

    Returns:
        Process exit status; always 0 (an empty corpus only warns on stderr).
    """
    import contextlib

    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--traces", required=True,
                    help="NDJSON trace rows, or '-' for stdin")
    ap.add_argument("--out", required=True, help="output .jsonl.gz")
    ap.add_argument("--hash", default=CLEAN_PROMPT_HASH,
                    help=f"keep only this system_prompt_hash (default {CLEAN_PROMPT_HASH})")
    ap.add_argument("--allow-dirty-hash", action="store_true",
                    help="do NOT filter by hash — DANGER: re-teaches old-prompt defects")
    ap.add_argument("--report", help="optional path for a JSON stats report")
    args = ap.parse_args()

    stats = Counter()
    seen: set[str] = set()
    examples: list[dict] = []

    # stdin must stay open for the caller; a real file is closed even if a row
    # raises midway (the original left the handle open on any exception).
    source = (contextlib.nullcontext(sys.stdin) if args.traces == "-"
              else open(args.traces, encoding="utf-8"))
    with source as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            stats["trace_rows"] += 1
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                stats["drop_trace_unparseable"] += 1
                continue
            # Valid JSON that is not an object (list, number, string) has no
            # fields to read; count it instead of crashing on ``row.get``.
            if not isinstance(row, dict):
                stats["drop_trace_not_object"] += 1
                continue

            if not args.allow_dirty_hash and row.get("system_prompt_hash") != args.hash:
                stats["drop_wrong_hash"] += 1
                continue

            raw = row.get("raw_response") or ""
            try:
                parsed = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                # TypeError: raw_response was stored as a non-string value.
                stats["drop_response_unparseable"] += 1
                continue
            # raw_response is either a single per-event object (current trace
            # format — one row per event) or, for legacy chunked traces, a JSON
            # array / {"events": [...]} of per-event objects.
            if isinstance(parsed, dict):
                objs = parsed.get("events") if isinstance(parsed.get("events"), list) else [parsed]
            elif isinstance(parsed, list):
                objs = parsed
            else:
                stats["drop_response_shape"] += 1
                continue

            blocks = split_events(row.get("user_prompt") or "")
            for obj in objs:
                if not isinstance(obj, dict):
                    stats["drop_obj_not_object"] += 1
                    continue
                idx = obj.get("index")
                try:
                    block = blocks.get(idx) if idx is not None else None
                except TypeError:
                    # Unhashable index (list/dict) emitted by the model.
                    block = None
                # Single-event trace: one block, one object — match by position
                # even if the stored index doesn't line up with the marker.
                if block is None and len(objs) == 1 and len(blocks) == 1:
                    block = next(iter(blocks.values()))
                if not block:
                    stats["drop_no_matching_block"] += 1
                    continue

                key = hashlib.sha1(block.encode("utf-8")).hexdigest()
                if key in seen:
                    stats["drop_dup"] += 1
                    continue

                reason = _output_is_clean(obj)
                if reason:
                    stats[f"drop_{reason}"] += 1
                    continue

                # Mark as seen only once kept, so a later clean extraction of
                # the same block can still be harvested after a defective one.
                seen.add(key)
                examples.append({"input": block, "output": json.dumps(obj, ensure_ascii=False)})
                stats["kept"] += 1

    with gzip.open(args.out, "wt", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    report = {"out": args.out, "hash": (None if args.allow_dirty_hash else args.hash),
              "stats": dict(sorted(stats.items()))}
    print(json.dumps(report, indent=2))
    if args.report:
        with open(args.report, "w", encoding="utf-8") as rf:
            json.dump(report, rf, indent=2)

    if stats["kept"] == 0:
        print("\nWARNING: 0 examples kept. If you targeted the clean hash, the "
              "clean-prompt teacher has not produced enough gold yet — let it "
              "accumulate (or run a teacher-only re-distill of a curated event "
              "slice through the clean prompt), then re-run.", file=sys.stderr)
    return 0
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    sys.exit(main())
|
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fusion de-fragmentation — cluster same-surname PERSON fragments and propose
|
|
3
|
+
merges (RFC-decay-and-fusion A2/A3). DRY-RUN by default.
|
|
4
|
+
|
|
5
|
+
The deterministic upsert resolver (worker.py) converges same-form / shared-alias
|
|
6
|
+
entities, but it CANNOT safely merge surface-form/nickname variants of one real
|
|
7
|
+
person (e.g. "Will Vickers" / "William Vickers" / "William F. Vickers" / bare
|
|
8
|
+
"Vickers") — they have different normalized names and no shared alias, so they
|
|
9
|
+
fragment (209 "Vickers" nodes observed). Merging them is Fusion's job, and it is
|
|
10
|
+
DESTRUCTIVE (repoints facts/relationships, tombstones losers), so the over-merge
|
|
11
|
+
failure mode (folding two DIFFERENT people, or a person into an org) must be
|
|
12
|
+
designed out. This tool is conservative + dry-run-first; --apply is double-gated.
|
|
13
|
+
|
|
14
|
+
CLUSTERING (anti-over-merge by construction):
|
|
15
|
+
- PERSON entities only; never crosses entity_type (so "Vickers Oils" the org is
|
|
16
|
+
never pulled in).
|
|
17
|
+
- Same surname token (the --surname scope).
|
|
18
|
+
- First-name compatibility for the NON-surname tokens: equal, or one an initial
|
|
19
|
+
of the other (W ↔ William), or one a prefix of the other (Will ⊂ William).
|
|
20
|
+
Two DISTINCT full first names (Will vs Jane) are INCOMPATIBLE → never merged.
|
|
21
|
+
- Union-find over compatible NON-bare names → each cluster = one real person.
|
|
22
|
+
- Bare "<surname>" nodes (no first name) are folded in ONLY when there is
|
|
23
|
+
exactly ONE non-bare cluster for the surname (unambiguous); otherwise they
|
|
24
|
+
are left for human review (never used to bridge two distinct people).
|
|
25
|
+
|
|
26
|
+
CANONICAL (A3 scored master, replaces richest-row-wins):
|
|
27
|
+
+ has email (attributes.email or an email alias) strongest identity signal
|
|
28
|
+
+ full name (>=2 name tokens) a real rendering, not a stub
|
|
29
|
+
+ corroboration (provenance event count) grounded in more events
|
|
30
|
+
+ fact count the node that holds the picture
|
|
31
|
+
- bare single-token name penalize stub
|
|
32
|
+
- ID-like (digit ratio > 0.5) penalize 7B numeric-id junk
|
|
33
|
+
|
|
34
|
+
OUTPUT: per cluster — master, losers, why, and the repoint impact (facts +
|
|
35
|
+
relationships that would move onto the master). No DB writes in dry-run
|
|
36
|
+
(the session is forced read-only). --apply executes an inline copy of the reviewed
|
|
37
|
+
fusion_drive merge executor + entity_merges audit, and requires --i-have-a-snapshot.
|
|
38
|
+
|
|
39
|
+
Usage:
|
|
40
|
+
python fusion_defrag.py --arena 'pentatonic-team%' --surname vickers
|
|
41
|
+
python fusion_defrag.py --arena 'pentatonic-team%' --surname vickers --json out.json
|
|
42
|
+
"""
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import argparse
|
|
46
|
+
import json
|
|
47
|
+
import re
|
|
48
|
+
import sys
|
|
49
|
+
import uuid
|
|
50
|
+
from collections import defaultdict
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _connect(dsn: str):
    """Open a psycopg connection to ``dsn`` whose cursors return rows as dicts.

    psycopg is imported lazily here, so importing this module does not
    require the driver.
    """
    import psycopg
    from psycopg.rows import dict_row

    return psycopg.connect(dsn, row_factory=dict_row)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Token separator for names: any run of characters outside a-z / 0-9
# (applied to lower-cased text by name_tokens).
_TOKEN_RE = re.compile(r"[^a-z0-9]+")
|
|
60
|
+
# Whole-string shape check for an email-like alias: exactly one '@', no
# whitespace, and at least one '.' in the part after the '@'.
_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def name_tokens(name: str) -> list[str]:
    """Lower-case ``name`` and split it on ``_TOKEN_RE``, dropping empty pieces.

    A falsy ``name`` (``None`` or ``""``) yields an empty list.
    """
    lowered = (name or "").lower()
    return list(filter(None, _TOKEN_RE.split(lowered)))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def digit_ratio(name: str) -> float:
    """Return the share of non-whitespace characters in ``name`` that are digits.

    Whitespace is removed before counting; an empty or falsy name gives 0.0.
    """
    compact = re.sub(r"\s+", "", name or "")
    if not compact:
        return 0.0
    digit_count = len([ch for ch in compact if ch.isdigit()])
    return digit_count / len(compact)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Honorifics/titles are NOT first names — strip them so "Herr Johann Boedecker"
|
|
73
|
+
# matches "Johann Boedecker", and a title-only "Herr Boedecker" reduces to a bare
|
|
74
|
+
# surname (held for review, not merged on the title).
|
|
75
|
+
_TITLES = {"herr", "frau", "fr", "dr", "prof", "mr", "mrs", "ms", "miss",
|
|
76
|
+
"sir", "dame", "mx", "mme", "mlle", "hr"}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def first_name_tokens(name: str, surname: str) -> list[str]:
    """Return the tokens of ``name`` without the surname and without honorifics.

    Only the first occurrence of ``surname`` is removed; every token listed in
    ``_TITLES`` is removed. An empty result means the name is a bare surname
    (possibly with a title).
    """
    tokens = name_tokens(name)
    if surname in tokens:
        at = tokens.index(surname)
        tokens = tokens[:at] + tokens[at + 1:]
    return [tok for tok in tokens if tok not in _TITLES]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def first_names_compatible(a: list[str], b: list[str]) -> bool:
    """Decide whether two first-name token lists could name the same person.

    Only the leading token of each list is compared. The pair is compatible
    when the tokens are equal, or when one is a prefix of the other and that
    prefix is either a single-character initial (w / william) or at least
    three characters long (will / william). Two-character prefixes are
    rejected as junk. An empty list (bare surname) is never compatible here —
    the caller handles bare names separately.
    """
    if not (a and b):
        return False  # bare names never auto-bridge via this predicate

    def usable_prefix(short: str, full: str) -> bool:
        # An initial, or a nickname-length stem, that the other name starts with.
        return (len(short) == 1 or len(short) >= 3) and full.startswith(short)

    lead_a, lead_b = a[0], b[0]
    if lead_a == lead_b:
        return True
    return usable_prefix(lead_a, lead_b) or usable_prefix(lead_b, lead_a)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class UnionFind:
    """Minimal disjoint-set over hashable ids.

    ``p`` maps every id to its parent; a root is its own parent.
    """

    def __init__(self, ids):
        # Every id starts as the root of its own singleton set.
        self.p = {}
        for node in ids:
            self.p[node] = node

    def find(self, i):
        """Return the root of ``i``'s set, halving the path on the way up."""
        parents = self.p
        node = i
        while True:
            parent = parents[node]
            if parent == node:
                return node
            # Re-hang the node on its grandparent, then continue from there.
            grandparent = parents[parent]
            parents[node] = grandparent
            node = grandparent

    def union(self, a, b):
        """Merge the sets of ``a`` and ``b``; ``b``'s root becomes the root."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a == root_b:
            return
        self.p[root_a] = root_b
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def has_email(ent: dict) -> bool:
    """Tell whether an entity row carries an email identity signal.

    True when ``attributes`` is a dict with a truthy ``email`` value, or when
    any string alias matches ``_EMAIL_RE``.
    """
    attributes = ent.get("attributes") or {}
    if isinstance(attributes, dict) and attributes.get("email"):
        return True
    for alias in ent.get("aliases") or []:
        if isinstance(alias, str) and _EMAIL_RE.match(alias):
            return True
    return False
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def master_score(ent: dict, surname: str) -> float:
    """Score an entity row as a candidate master for its cluster (higher wins).

    Bonuses: +3.0 for an email signal, +2.0 for a name of two or more tokens,
    +0.2 per provenance event (capped at 20 events), +0.05 per fact (capped at
    40 facts). Penalties: -3.0 for a bare surname (no first-name tokens),
    -5.0 when more than half of the name's characters are digits.
    """
    canonical = ent["canonical_name"]
    given = first_name_tokens(canonical, surname)
    provenance_n = len(ent.get("provenance_event_ids") or [])

    total = 0.0
    if has_email(ent):
        total += 3.0
    if len(name_tokens(canonical)) >= 2:
        total += 2.0
    total += min(provenance_n, 20) * 0.2
    total += min(ent.get("fact_n", 0), 40) * 0.05
    if not given:  # bare surname
        total -= 3.0
    if digit_ratio(canonical) > 0.5:
        total -= 5.0
    return total
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _union(*lists):
|
|
153
|
+
seen = {}
|
|
154
|
+
for lst in lists:
|
|
155
|
+
for x in lst or []:
|
|
156
|
+
seen.setdefault(x, None)
|
|
157
|
+
return list(seen.keys())
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def apply_cluster(cur, conn, arena: str, master: dict, losers: list[dict]) -> dict:
    """Fold `losers` into `master` within ONE transaction.

    Inline of the reviewed fusion_drive _execute_entity_plan +
    build_entity_merge_plan: repoint facts (subject/object) and relationships
    (endpoints, summing weight on the post-repoint (from,to,type) collision),
    accrete aliases+provenance onto the master, write one entity_merges audit
    row per loser (rollback_payload = full loser row), then delete the losers.
    Re-validates that the losers and the master still exist first.

    Args:
        cur: open cursor whose rows are dicts (columns are read by name).
        conn: the cursor's connection; only ``conn.transaction()`` is used.
        arena: exact arena of the cluster; scopes the fact/relationship reads.
        master: full entity row that survives the merge.
        losers: full entity rows to fold into the master and delete.

    Returns:
        ``{"applied": False, "reason": ...}`` when a loser or the master has
        disappeared since the proposal was built; otherwise
        ``{"applied": True, ...}`` with the repoint/collision/tombstone counts.
    """
    master_id = master["id"]
    loser_ids = [l["id"] for l in losers]
    lset = set(loser_ids)
    # Load edges/facts touching the losers AND THE MASTER. Including the master is
    # load-bearing for relationships: a loser edge repointed onto the master can
    # collide with an edge the master ALREADY has (or another loser's) on the
    # UNIQUE(arena,from,to,type) key — if we don't see the master's existing edges
    # in collision detection, the repoint UPDATE hits a duplicate-key violation
    # (caught the hard way on the first Vickers apply; the txn rolled back clean).
    # Facts have no such unique key, so master facts are loaded but never repointed
    # (repoint decisions key on the loser set only).
    targets = loser_ids + [master_id]
    cur.execute(
        "SELECT id, subject_entity_id, object_entity_id FROM facts "
        "WHERE arena = %s AND (subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))",
        (arena, targets, targets),
    )
    facts = cur.fetchall()
    cur.execute(
        "SELECT id, from_entity_id, to_entity_id, relationship_type, weight, "
        "provenance_event_ids FROM relationships "
        "WHERE arena = %s AND (from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))",
        (arena, targets, targets),
    )
    rels = cur.fetchall()

    aliases = _union(master.get("aliases") or [],
                     [l["canonical_name"] for l in losers],
                     *[l.get("aliases") or [] for l in losers])
    # The master's own canonical name is not an alias of itself.
    aliases = [a for a in aliases if a != master["canonical_name"]]
    provenance = _union(master.get("provenance_event_ids") or [],
                        *[l.get("provenance_event_ids") or [] for l in losers])
    fact_subj = [f["id"] for f in facts if f["subject_entity_id"] in lset]
    fact_obj = [f["id"] for f in facts if f["object_entity_id"] in lset]

    def rk(r):
        # The (from, to, type) key this edge will have AFTER repointing.
        frm = master_id if r["from_entity_id"] in lset else r["from_entity_id"]
        to = master_id if r["to_entity_id"] in lset else r["to_entity_id"]
        return (frm, to, r["relationship_type"])

    # Plan the relationship work. The first edge seen for a post-repoint key is
    # kept; every later edge with the same key is dropped and its weight and
    # provenance are folded into a RUNNING total for the kept edge. Keeping a
    # running total matters when three or more edges collide on one key:
    # summing each drop against the keep's original weight would let the last
    # UPDATE overwrite the earlier ones and lose weight/provenance.
    by_key: dict = {}
    folded: dict = {}          # keep id -> {"weight": ..., "provenance": [...]}
    rel_repoints: list = []
    rel_drops: list = []
    for r in rels:
        key = rk(r)
        keep = by_key.get(key)
        if keep is None:
            by_key[key] = r
            if r["from_entity_id"] in lset or r["to_entity_id"] in lset:
                rel_repoints.append(r["id"])
            continue
        acc = folded.setdefault(keep["id"], {
            "weight": keep.get("weight") or 1.0,
            "provenance": list(keep.get("provenance_event_ids") or []),
        })
        acc["weight"] = acc["weight"] + (r.get("weight") or 1.0)
        acc["provenance"] = _union(acc["provenance"], r.get("provenance_event_ids") or [])
        rel_drops.append(r["id"])

    with conn.transaction():
        cur.execute("SELECT id FROM entities WHERE id = ANY(%s)", (targets,))
        live = {r["id"] for r in cur.fetchall()}
        if not lset <= live:
            return {"applied": False, "reason": "stale: some losers already gone"}
        # Repointing onto a master that no longer exists would leave facts and
        # edges pointing at nothing (or fail on a foreign key) — refuse instead.
        if master_id not in live:
            return {"applied": False, "reason": "stale: master already gone"}
        cur.execute("UPDATE entities SET aliases=%s, provenance_event_ids=%s, last_seen=NOW() "
                    "WHERE id=%s", (aliases, provenance, master_id))
        for fid in fact_subj:
            cur.execute("UPDATE facts SET subject_entity_id=%s WHERE id=%s", (master_id, fid))
        for fid in fact_obj:
            cur.execute("UPDATE facts SET object_entity_id=%s WHERE id=%s", (master_id, fid))
        # DELETE colliding edges BEFORE repointing — else repointing a "keep" edge
        # onto the master collides with the not-yet-deleted "drop" on the UNIQUE
        # (arena,from,to,type) key. Carry each drop's weight+provenance onto its keep.
        for keep_id, acc in folded.items():
            cur.execute("UPDATE relationships SET weight=%s, provenance_event_ids=%s WHERE id=%s",
                        (round(acc["weight"], 4), acc["provenance"], keep_id))
        for drop_id in rel_drops:
            cur.execute("DELETE FROM relationships WHERE id=%s", (drop_id,))
        for rid in rel_repoints:
            cur.execute(
                "UPDATE relationships SET "
                "from_entity_id = CASE WHEN from_entity_id = ANY(%s) THEN %s ELSE from_entity_id END, "
                "to_entity_id = CASE WHEN to_entity_id = ANY(%s) THEN %s ELSE to_entity_id END "
                "WHERE id=%s",
                (loser_ids, master_id, loser_ids, master_id, rid))
        for l in losers:
            # NOTE: facts_repointed / relationships_repointed are the cluster-wide
            # totals, recorded on every loser's audit row.
            cur.execute(
                "INSERT INTO entity_merges (id, arena, canonical_id, deprecated_id, "
                "deprecated_canonical_name, deprecated_aliases, merge_signal, "
                "facts_repointed, relationships_repointed, merged_by, rollback_payload) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s::jsonb)",
                ("em_" + uuid.uuid4().hex[:20], arena, master_id, l["id"],
                 l["canonical_name"], l.get("aliases") or [], "heuristic",
                 len(fact_subj) + len(fact_obj), len(rel_repoints), "fusion-defrag",
                 json.dumps(l, default=str)))
        cur.execute("DELETE FROM entities WHERE id = ANY(%s)", (loser_ids,))
    return {"applied": True, "facts_repointed": len(fact_subj) + len(fact_obj),
            "rels_repointed": len(rel_repoints), "rel_collisions": len(rel_drops),
            "tombstoned": len(loser_ids)}
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _cluster_conflicts(members: list[dict], surname: str) -> list[list[str]]:
    """Return the canonical-name pairs in one cluster whose first names conflict."""
    named = []
    for e in members:
        given = first_name_tokens(e["canonical_name"], surname)
        if given:  # a bare surname has no first name to conflict with
            named.append((e["canonical_name"], given))
    return [[name_a, name_b]
            for i, (name_a, given_a) in enumerate(named)
            for name_b, given_b in named[i + 1:]
            if not first_names_compatible(given_a, given_b)]


def main() -> int:
    """CLI entry point: propose (default) or apply same-surname PERSON merges.

    Steps: select person entities whose canonical name carries ``--surname`` as
    a token, cluster the non-bare names with union-find over
    ``first_names_compatible``, fold bare-surname nodes into the cluster only
    when exactly one unambiguous cluster exists, pick a master per cluster via
    ``master_score``, and report the repoint impact. With ``--apply`` (and
    ``--i-have-a-snapshot``) each proposal is executed by ``apply_cluster``.

    Clusters whose members are not pairwise compatible are held for review and
    never proposed: union-find is transitive, so an initial or short stem
    ("W") would otherwise bridge two distinct first names ("William",
    "Walter") into a single destructive merge.

    Returns:
        0 on success (including "nothing to merge"); 2 when ``--apply`` lacks
        ``--i-have-a-snapshot`` or no DSN is available.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--arena", required=True, help="arena LIKE filter (REQUIRED)")
    ap.add_argument("--surname", required=True, help="surname token to scope (e.g. vickers)")
    ap.add_argument("--pg-dsn", default="", help="Postgres DSN (or PG_DSN env)")
    ap.add_argument("--json", help="write the proposal as JSON")
    ap.add_argument("--apply", action="store_true",
                    help="EXECUTE the merges (default: dry-run). Requires --i-have-a-snapshot.")
    ap.add_argument("--i-have-a-snapshot", action="store_true",
                    help="operator asserts a DB/row snapshot exists for rollback (required with --apply)")
    args = ap.parse_args()
    if args.apply and not args.i_have_a_snapshot:
        print("REFUSED: --apply requires --i-have-a-snapshot (take a snapshot first; "
              "merges repoint facts/rels + tombstone nodes — entity_merges holds rollback "
              "payloads but a row/DB snapshot is the real safety net).", file=sys.stderr)
        return 2

    import os
    dsn = args.pg_dsn or os.environ.get("PG_DSN", "")
    if not dsn:
        print("FATAL: no --pg-dsn / PG_DSN", file=sys.stderr)
        return 2
    surname = args.surname.lower()

    proposals: list[dict] = []
    held: list[dict] = []
    bare_note = ""

    with _connect(dsn) as conn:
        if not args.apply:
            # dry-run safety: make the transaction itself read-only. The SET
            # below only changes the default for transactions started LATER,
            # and the driver opens one before the first statement runs, so on
            # its own it would leave the whole dry-run transaction writable.
            conn.read_only = True
        with conn.cursor() as cur:
            if not args.apply:
                cur.execute("SET default_transaction_read_only = on")  # dry-run safety
            cur.execute("SET max_parallel_workers_per_gather = 0")
            # Person fragments whose CANONICAL NAME carries the surname as a real
            # token. Critically NOT alias-scoped: an @vickers-oil.com email domain
            # in aliases would otherwise drag in unrelated employees (Paul Vann,
            # Matt Tooze) who merely WORK at a Vickers company — the over-merge the
            # first dry-run caught. Email-named stubs (canonical_name has '@') are
            # also excluded: tokenizing an email is not a safe surname signal (they
            # converge via the alias-resolution path instead).
            cur.execute(
                """
                SELECT e.id, e.arena, e.canonical_name, e.aliases, e.provenance_event_ids,
                       e.attributes, e.last_seen,
                       (SELECT count(*) FROM facts f
                         WHERE f.provenance_event_ids && e.provenance_event_ids
                           AND (f.subject_entity_id = e.id OR f.object_entity_id = e.id)) AS fact_n
                FROM entities e
                WHERE e.arena LIKE %s AND e.entity_type = 'person'
                  AND position('@' in e.canonical_name) = 0
                  AND lower(e.canonical_name) ~ %s
                """,
                (args.arena, rf"(^|[^a-z]){surname}([^a-z]|$)"),
            )
            ents = cur.fetchall()
            # Confirm the surname is a real NAME token (regex is a coarse guard).
            ents = [e for e in ents if surname in name_tokens(e["canonical_name"])]
            print(f"[defrag] arena={args.arena} surname={surname!r}: "
                  f"{len(ents)} person fragments")
            if not ents:
                return 0

            # Tokenize each name once instead of inside the O(n^2) pair loop.
            given = {e["id"]: first_name_tokens(e["canonical_name"], surname) for e in ents}
            non_bare = [e for e in ents if given[e["id"]]]
            bare = [e for e in ents if not given[e["id"]]]

            # Union-find over compatible non-bare names.
            uf = UnionFind([e["id"] for e in non_bare])
            for i, a in enumerate(non_bare):
                for b in non_bare[i + 1:]:
                    # Never union across exact arenas — entity id = hash(arena|
                    # type|name), so cross-arena same-name nodes are genuinely
                    # different scoped entities; merging them would be wrong.
                    if a["arena"] != b["arena"]:
                        continue
                    if first_names_compatible(given[a["id"]], given[b["id"]]):
                        uf.union(a["id"], b["id"])
            clusters: dict[str, list[dict]] = defaultdict(list)
            for e in non_bare:
                clusters[uf.find(e["id"])].append(e)

            # Transitivity check, done BEFORE bare folding: a cluster is only
            # safe when every pair of members is directly compatible.
            conflicts = {cid: _cluster_conflicts(members, surname)
                         for cid, members in clusters.items()}

            # Bare surnames: fold in ONLY if exactly one non-bare cluster exists
            # and that cluster is itself unambiguous.
            if bare:
                if len(clusters) == 1:
                    only = next(iter(clusters))
                    if conflicts[only]:
                        bare_note = (f"{len(bare)} bare-'{surname}' node(s) LEFT FOR REVIEW "
                                     f"(the only name-cluster mixes conflicting first names)")
                    else:
                        cl_arena = clusters[only][0]["arena"]
                        same = [b for b in bare if b["arena"] == cl_arena]
                        clusters[only].extend(same)
                        bare_note = f"{len(same)} bare-'{surname}' node(s) folded into the single cluster"
                else:
                    bare_note = (f"{len(bare)} bare-'{surname}' node(s) LEFT FOR REVIEW "
                                 f"({len(clusters)} distinct name-clusters — ambiguous which person)")

            for cid, members in clusters.items():
                if len(members) < 2:
                    continue
                if conflicts[cid]:
                    # Two different people bridged by an initial/prefix node:
                    # never propose (and never apply) this merge.
                    held.append({"arena": members[0]["arena"],
                                 "names": [e["canonical_name"] for e in members],
                                 "conflicts": conflicts[cid]})
                    continue
                master = max(members, key=lambda e: (master_score(e, surname),
                                                     len(e.get("provenance_event_ids") or []),
                                                     len(e["canonical_name"])))
                losers = [e for e in members if e["id"] != master["id"]]
                loser_ids = [l["id"] for l in losers]
                # Repoint impact (read-only counts).
                cur.execute(
                    "SELECT count(*) AS n FROM facts WHERE arena LIKE %s AND "
                    "(subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))",
                    (args.arena, loser_ids, loser_ids),
                )
                facts_repointed = cur.fetchone()["n"]
                cur.execute(
                    "SELECT count(*) AS n FROM relationships WHERE arena LIKE %s AND "
                    "(from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))",
                    (args.arena, loser_ids, loser_ids),
                )
                rels_repointed = cur.fetchone()["n"]
                proposals.append({
                    "arena": master["arena"],
                    "master_row": master,          # full row for apply
                    "loser_rows": losers,
                    "master": {"id": master["id"], "name": master["canonical_name"],
                               "facts": master.get("fact_n", 0),
                               "prov": len(master.get("provenance_event_ids") or []),
                               "email": has_email(master),
                               "score": round(master_score(master, surname), 2)},
                    "losers": [{"id": l["id"], "name": l["canonical_name"],
                                "facts": l.get("fact_n", 0),
                                "prov": len(l.get("provenance_event_ids") or []),
                                "email": has_email(l)} for l in losers],
                    "facts_repointed": facts_repointed,
                    "rels_repointed": rels_repointed,
                })

            if args.apply and proposals:
                conn.rollback()  # end the read-only probe txn cleanly before writes
                print(f"\n[defrag] APPLYING {len(proposals)} cluster(s) — arena-scoped, transactional…")
                for p in proposals:
                    p["apply_result"] = apply_cluster(cur, conn, p["arena"],
                                                      p["master_row"], p["loser_rows"])
                    print(f"  master={p['master']['name']!r} ({p['arena']}): {p['apply_result']}")

    # ---- report ----
    # Runs after the connection block has exited, so a failure while printing
    # or writing the JSON file cannot roll back merges already applied.
    mode = "APPLIED" if args.apply else "PROPOSED (dry-run, no writes)"
    print(f"\n=== {mode} MERGES — surname '{surname}' ===")
    if bare_note:
        print(f"  note: {bare_note}")
    for h in held:
        print(f"  HELD FOR REVIEW ({h['arena']}): {h['names']} — "
              f"conflicting first names bridged by an initial/prefix: {h['conflicts']}")
    if not proposals:
        print("  (no multi-node clusters — nothing to merge)")
    tot_dep = tot_f = tot_r = 0
    for i, p in enumerate(proposals, 1):
        m = p["master"]
        print(f"\n[{i}] MASTER ← {m['name']!r}  ({m['id'][:10]}…)  "
              f"score={m['score']} facts={m['facts']} prov={m['prov']} email={m['email']}")
        for l in p["losers"]:
            print(f"      merge: {l['name']!r}  ({l['id'][:10]}…)  "
                  f"facts={l['facts']} prov={l['prov']} email={l['email']}")
        print(f"      → would repoint {p['facts_repointed']} facts, "
              f"{p['rels_repointed']} relationships onto the master; "
              f"{len(p['losers'])} node(s) tombstoned")
        tot_dep += len(p["losers"])
        tot_f += p["facts_repointed"]
        tot_r += p["rels_repointed"]
    tail = ("APPLIED — rollback via entity_merges (merged_by='fusion-defrag') + snapshot."
            if args.apply else "DRY-RUN — nothing written.")
    print(f"\nTOTAL: {len(proposals)} cluster(s), {tot_dep} nodes tombstoned, "
          f"{tot_f} facts + {tot_r} relationships repointed. {tail}")
    if args.json:
        with open(args.json, "w", encoding="utf-8") as f:
            json.dump({"surname": surname, "arena": args.arena, "applied": args.apply,
                       "bare_note": bare_note,
                       "held_for_review": held,
                       "proposals": [{k: v for k, v in p.items()
                                      if k not in ("master_row", "loser_rows")}
                                     for p in proposals]}, f, indent=2)
    return 0
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    sys.exit(main())
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Prompt-version-drift re-distillation — RFC-decay-and-fusion Part B′.
|
|
3
|
+
|
|
4
|
+
The third triage verb of the Fusion Drive: regenerate high-value extractions
|
|
5
|
+
that were produced by a *superseded* teacher/prompt by re-running their source
|
|
6
|
+
event through the *current clean* teacher. Fusion converges horizontally, decay
|
|
7
|
+
ages vertically; this refreshes in depth.
|
|
8
|
+
|
|
9
|
+
TRIGGER (not raw age): an event is stale when `event_distillations` shows it was
|
|
10
|
+
distilled only under a dirty `system_prompt_hash` (pre-clean: bbdaba/f1e0ff/
|
|
11
|
+
ef0647…) and never under the clean one (`6ccfe70f…`, SDK 0.10.19 = #126 + #129).
|
|
12
|
+
|
|
13
|
+
MECHANISM: re-insert the event into `distillation_queue` (status=pending). The
|
|
14
|
+
existing extractor-async worker + autoscaler do the rest, writing a fresh clean
|
|
15
|
+
extraction AND a clean trace (which `build_retrain_corpus.py` then harvests — one
|
|
16
|
+
loop repairs the graph and grows the retrain corpus).
|
|
17
|
+
|
|
18
|
+
SUPERSEDENCE (the load-bearing requirement — Part B′4): the store is
|
|
19
|
+
pure-accretion, so a naive re-enqueue makes the clean extraction land *beside*
|
|
20
|
+
the dirty one and WORSENS fragmentation. The durable fix routes supersede
|
|
21
|
+
through Fusion's tombstone machinery (RFC A2/A3), which is not yet live. This
|
|
22
|
+
tool ships the **interim event-scoped supersede** (Part B′4 Open Q): with
|
|
23
|
+
--supersede-facts it deletes only facts whose provenance set is exactly
|
|
24
|
+
{this event} under the old hash — the single-provenance majority, where no
|
|
25
|
+
corroboration is lost — after dumping a rollback payload. Multi-provenance facts
|
|
26
|
+
and ALL entities are left untouched (entity-level fragmentation is fusion/decay's
|
|
27
|
+
job; deleting an entity cascades to relationships and NULLs other facts' FKs).
|
|
28
|
+
|
|
29
|
+
SAFETY: dry-run by default; --apply gated; --arena REQUIRED (no global runs);
|
|
30
|
+
--limit caps the batch; every destructive op is dumped to a rollback JSONL first;
|
|
31
|
+
a run ledger is written per invocation. Idempotent: skips events that already
|
|
32
|
+
have a pending/claimed queue row or an existing clean-hash distillation.
|
|
33
|
+
|
|
34
|
+
Usage:
|
|
35
|
+
# size the work (no writes):
|
|
36
|
+
python redistill.py --arena 'pentatonic-team%' --dry-run
|
|
37
|
+
# re-enqueue a bounded slice (non-destructive — accretes until fusion runs):
|
|
38
|
+
python redistill.py --arena 'pentatonic-team%' --limit 25 --apply
|
|
39
|
+
# re-enqueue AND interim-supersede single-provenance stale facts:
|
|
40
|
+
python redistill.py --arena 'pentatonic-team%' --limit 25 --apply --supersede-facts
|
|
41
|
+
"""
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import argparse
|
|
45
|
+
import json
|
|
46
|
+
import os
|
|
47
|
+
import sys
|
|
48
|
+
from datetime import datetime, timezone
|
|
49
|
+
|
|
50
|
+
CLEAN_PROMPT_HASH = "6ccfe70f1286a131" # SDK 0.10.19 (#126 + #129); verify vs worker.SYSTEM_PROMPT_HASH
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _connect(dsn: str):
    """Open a psycopg connection to ``dsn`` whose cursors return rows as dicts.

    psycopg is imported lazily here, so importing this module does not
    require the driver.
    """
    import psycopg
    from psycopg.rows import dict_row

    return psycopg.connect(dsn, row_factory=dict_row)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def triage(cur, arena: str, clean_hash: str, dirty_hashes: list[str] | None, limit: int,
           order_by_recency: bool = False):
    """Stale events: distilled under a dirty prompt, not yet re-distilled clean.

    IDEMPOTENCY SIGNAL = `distillation_traces.system_prompt_hash`, NOT
    `event_distillations`. The teacher ALWAYS writes a trace (worker.py
    `_insert_trace`, producer=='teacher'), but `event_distillations` is written
    ONLY when CASCADE_ENABLED (worker.py line ~2325). In teacher-only mode a
    re-distilled event gets a clean *trace* but its event_distillations row stays
    stamped at the old student hash — so keying the clean-check off the ledger
    would re-select the same events forever. Traces are the reliable per-event
    prompt-version record here.

    Dirty evidence = a dirty-hash trace OR a dirty-hash event_distillations row
    (covers student-distilled events, which have a ledger row but no trace).
    "Dirty" means "any hash other than ``clean_hash``" by default; when
    ``dirty_hashes`` is given it means "one of ``dirty_hashes``" for BOTH the
    trace and the ledger check, so a targeted run never picks up events that
    were distilled under a hash the caller did not ask for.

    Args:
        cur: DB-API style cursor (``execute`` / ``fetchall``).
        arena: SQL ``LIKE`` pattern matched against ``events.arena``.
        clean_hash: prompt hash that marks an event as already re-distilled.
        dirty_hashes: optional explicit list of hashes to target; ``None`` or
            empty means "anything that is not ``clean_hash``".
        limit: maximum number of stale events to return.
        order_by_recency: when true, pick the most recently received stale
            events first (adds ``ORDER BY e.received_at DESC``).

    Returns:
        ``cur.fetchall()`` for the query: one row per stale event with
        ``event_id``, ``fact_n``, ``solo_fact_n`` and ``in_flight``, ordered by
        ``fact_n`` descending.
    """
    if dirty_hashes:
        dirty_pred = "= ANY(%s)"
        dirty_param = dirty_hashes
    else:
        dirty_pred = "<> %s"
        dirty_param = clean_hash
    # Parameter order must match the placeholders below:
    # arena LIKE, clean-trace check, dirty trace, dirty ledger, LIMIT.
    params = [arena, clean_hash, dirty_param, dirty_param, limit]
    # ORDER BY received_at forces a full scan+sort of the whole arena (~270k rows
    # for pentatonic-team) every call — pathological for bulk/loop work. Default
    # OFF so the planner early-terminates once LIMIT stale rows are found (most
    # arena rows ARE stale, so it scans ~LIMIT rows). Recency is only a salience
    # proxy anyway, moot until the salience column lands (RFC Part D). Opt in with
    # --order-by-recency for a one-off "freshest first" pass.
    order_clause = "ORDER BY e.received_at DESC" if order_by_recency else ""
    cur.execute(
        f"""
        WITH stale AS (
            SELECT e.id AS event_id, e.received_at AS recency
            FROM events e
            WHERE e.arena LIKE %s
              AND NOT EXISTS (      -- not yet re-distilled clean
                  SELECT 1 FROM distillation_traces t
                  WHERE t.event_id = e.id AND t.system_prompt_hash = %s)
              AND NOT EXISTS (      -- not already queued (so loops/batches advance)
                  SELECT 1 FROM distillation_queue q
                  WHERE q.event_id = e.id AND q.status IN ('pending','claimed'))
              AND (
                  EXISTS (SELECT 1 FROM distillation_traces t2      -- dirty teacher trace
                          WHERE t2.event_id = e.id
                            AND t2.system_prompt_hash {dirty_pred})
                  OR EXISTS (SELECT 1 FROM event_distillations d    -- or dirty student/teacher ledger
                             WHERE d.event_id = e.id
                               AND d.system_prompt_hash IS NOT NULL
                               AND d.system_prompt_hash {dirty_pred})
              )
            {order_clause}
            LIMIT %s
        )
        SELECT s.event_id,
               -- `@> ARRAY[id]` (containment) uses idx_facts_provenance (GIN);
               -- `id = ANY(col)` does NOT and seq-scans facts per row (2 min+ at
               -- limit 1000). Must stay @> for the bulk/loop path to be viable.
               (SELECT count(*) FROM facts f
                 WHERE f.provenance_event_ids @> ARRAY[s.event_id]) AS fact_n,
               (SELECT count(*) FROM facts f
                 WHERE f.provenance_event_ids @> ARRAY[s.event_id]
                   AND cardinality(f.provenance_event_ids) = 1) AS solo_fact_n,
               EXISTS (SELECT 1 FROM distillation_queue q
                        WHERE q.event_id = s.event_id
                          AND q.status IN ('pending','claimed')) AS in_flight
        FROM stale s
        ORDER BY fact_n DESC
        """,
        params,
    )
    return cur.fetchall()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def dump_and_delete_solo_facts(cur, event_id: str, rollback_fh) -> int:
    """Delete facts whose provenance is exactly {event_id} and dump each one.

    The delete and the read-back are a single ``DELETE ... RETURNING *``
    statement, so the rows written to the rollback file are exactly the rows
    removed. A separate SELECT followed by a DELETE with the same predicate can
    delete a row that started matching in between without ever dumping it.

    This function does not commit. The dump is written and flushed here, so it
    has left Python's buffer before the caller commits the delete.

    Args:
        cur: cursor whose rows are mappings (they are passed to ``_jsonable``).
        event_id: the event whose single-provenance facts are removed.
        rollback_fh: writable text file object receiving one JSON line per
            deleted fact (``{"op": "delete_fact", "event_id": ..., "row": ...}``).

    Returns:
        Number of facts deleted (and dumped).
    """
    cur.execute(
        "DELETE FROM facts "
        "WHERE provenance_event_ids @> ARRAY[%s] "
        "AND cardinality(provenance_event_ids) = 1 "
        "RETURNING *",
        (event_id,),
    )
    removed = cur.fetchall()
    if removed:
        rollback_fh.writelines(
            json.dumps({"op": "delete_fact", "event_id": event_id,
                        "row": _jsonable(row)}, ensure_ascii=False) + "\n"
            for row in removed
        )
        # Push the payload out of the Python buffer before the caller commits.
        rollback_fh.flush()
    return len(removed)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _jsonable(row: dict) -> dict:
    """Return a copy of ``row`` whose values ``json.dumps`` can serialize.

    Conversions, applied recursively through dicts, lists and tuples:

    * ``datetime`` / ``date`` / ``time`` -> ``isoformat()`` string
    * ``uuid.UUID`` / ``decimal.Decimal`` -> ``str(value)`` (lossless)

    Every other value is returned unchanged. An unsupported type therefore
    still makes ``json.dumps`` raise, instead of being silently stringified
    into a rollback payload that cannot be restored faithfully.

    Args:
        row: mapping of column name to value.

    Returns:
        A new dict with the same keys; ``row`` itself is not modified.
    """
    import uuid
    from datetime import date, time
    from decimal import Decimal

    def convert(value):
        # datetime is a subclass of date, so it is covered by the date check.
        if isinstance(value, (date, time)):
            return value.isoformat()
        if isinstance(value, (uuid.UUID, Decimal)):
            return str(value)
        if isinstance(value, dict):
            return {key: convert(item) for key, item in value.items()}
        if isinstance(value, list):
            return [convert(item) for item in value]
        if isinstance(value, tuple):
            return tuple(convert(item) for item in value)
        return value

    return {column: convert(value) for column, value in row.items()}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def main() -> int:
    """CLI entry point: triage stale events and, with --apply, re-enqueue them.

    Flow:
      1. Parse arguments; a Postgres DSN (--pg-dsn or $PG_DSN) is required.
      2. Run ``triage`` and print a JSON summary of the candidates.
      3. Without --apply (the default, or with the explicit no-op --dry-run)
         stop here: nothing is written.
      4. With --apply, for each actionable event: optionally delete its
         single-provenance facts (dumped to the rollback JSONL), insert a
         'pending' ``distillation_queue`` row, and append a ledger line. The
         transaction is committed once, after both files have been closed.

    Returns:
        Process exit code: 0 on success, 2 when no DSN was supplied.
        (argparse itself exits with status 2 on invalid arguments.)
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--arena", required=True, help="arena LIKE filter (REQUIRED — scope safety)")
    ap.add_argument("--clean-hash", default=CLEAN_PROMPT_HASH)
    ap.add_argument("--dirty-hashes", help="comma-separated hashes to target (default: any != clean)")
    ap.add_argument("--limit", type=int, default=50, help="max events to re-enqueue (safety cap)")
    # --dry-run appears in the documented usage, so accept it as an explicit
    # no-op; combining it with --apply is rejected rather than guessed at.
    mode = ap.add_mutually_exclusive_group()
    mode.add_argument("--apply", action="store_true", help="actually write (default: dry-run)")
    mode.add_argument("--dry-run", action="store_true",
                      help="explicit no-op: dry-run is already the default (cannot combine with --apply)")
    ap.add_argument("--supersede-facts", action="store_true",
                    help="also delete single-provenance stale facts (interim supersede, Part B′4)")
    ap.add_argument("--order-by-recency", action="store_true",
                    help="freshest stale events first (forces a full arena scan+sort — slow; off by default)")
    ap.add_argument("--rollback-dir", default=".", help="dir for rollback + ledger JSONL")
    ap.add_argument("--pg-dsn", default=os.environ.get("PG_DSN", ""), help="Postgres DSN")
    args = ap.parse_args()

    if not args.pg_dsn:
        print("FATAL: no --pg-dsn / PG_DSN", file=sys.stderr)
        return 2
    dirty = None
    if args.dirty_hashes:
        # Drop empty entries ("a,,b", trailing comma) so '' is never a target hash.
        dirty = [h.strip() for h in args.dirty_hashes.split(",") if h.strip()]
        if not dirty:
            ap.error("--dirty-hashes was given but contains no hashes")
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    with _connect(args.pg_dsn) as conn:
        with conn.cursor() as cur:
            # The org-model postgres container runs with the Docker default 64MB
            # /dev/shm; a parallel-gather over the facts GIN exhausts the dynamic
            # shared-memory segment ("DiskFull ... shared memory"). Disable
            # parallel workers for this session — temp spills go to the (ample)
            # data disk instead of shm.
            cur.execute("SET max_parallel_workers_per_gather = 0")
            cands = triage(cur, args.arena, args.clean_hash, dirty, args.limit,
                           order_by_recency=args.order_by_recency)

            total = len(cands)
            in_flight = sum(1 for c in cands if c["in_flight"])
            actionable = [c for c in cands if not c["in_flight"]]
            solo = sum(c["solo_fact_n"] for c in actionable)
            multi = sum(c["fact_n"] - c["solo_fact_n"] for c in actionable)
            print(json.dumps({
                "mode": "apply" if args.apply else "dry-run",
                "arena": args.arena, "clean_hash": args.clean_hash,
                "candidates": total, "in_flight_skipped": in_flight,
                "actionable": len(actionable),
                "stale_facts_solo_provenance": solo,
                "stale_facts_multi_provenance_LEFT_ALONE": multi,
                "supersede_facts": args.supersede_facts,
            }, indent=2))

            if not args.apply:
                print("\n[dry-run] no writes. Re-run with --apply to re-enqueue"
                      + (" + supersede solo facts." if args.supersede_facts else "."))
                return 0

            if args.rollback_dir:
                # Create the output directory up front so a missing path does
                # not abort the run at open().
                os.makedirs(args.rollback_dir, exist_ok=True)
            rb_path = os.path.join(args.rollback_dir, f"redistill_rollback_{stamp}.jsonl")
            ledger_path = os.path.join(args.rollback_dir, f"redistill_runs_{stamp}.jsonl")
            enq = deleted = 0
            with open(rb_path, "w", encoding="utf-8") as rb, \
                    open(ledger_path, "w", encoding="utf-8") as led:
                for c in actionable:
                    eid = c["event_id"]
                    superseded = 0
                    if args.supersede_facts:
                        superseded = dump_and_delete_solo_facts(cur, eid, rb)
                        deleted += superseded
                    cur.execute(
                        "INSERT INTO distillation_queue (event_id, status, attempts) "
                        "VALUES (%s, 'pending', 0)", (eid,))
                    enq += 1
                    led.write(json.dumps({
                        "event_id": eid, "arena": args.arena,
                        "old_hash": "dirty", "target_hash": args.clean_hash,
                        # Record what was actually deleted, not the triage-time
                        # estimate, so the ledger agrees with the rollback file.
                        "solo_facts_superseded": superseded,
                        "enqueued_at": stamp,
                    }, ensure_ascii=False) + "\n")
            # Commit only after both files are closed (and therefore flushed).
            conn.commit()

            print(json.dumps({
                "applied": True, "re_enqueued": enq, "facts_superseded": deleted,
                "rollback": rb_path, "ledger": ledger_path,
            }, indent=2))
            if deleted:
                print(f"\nROLLBACK: facts are in {rb_path} — re-INSERT them to undo.")
            return 0
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())
|