@pentatonic-ai/ai-agent-sdk 0.10.19 → 0.10.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +122 -8
- package/packages/memory-engine-v2/compat/server.py +55 -10
- package/packages/memory-engine-v2/extractor-async/test_email_alias_guard.py +78 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +52 -0
- package/packages/memory-engine-v2/scripts/build_retrain_corpus.py +240 -0
- package/packages/memory-engine-v2/scripts/fusion_defrag.py +440 -0
- package/packages/memory-engine-v2/scripts/redistill.py +236 -0
package/dist/index.cjs
CHANGED
|
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
// src/telemetry.js
|
|
881
|
-
var VERSION = "0.10.19";
|
|
881
|
+
var VERSION = "0.10.21";
|
|
882
882
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
883
883
|
function machineId() {
|
|
884
884
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/dist/index.js
CHANGED
|
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
847
847
|
}
|
|
848
848
|
|
|
849
849
|
// src/telemetry.js
|
|
850
|
-
var VERSION = "0.10.19";
|
|
850
|
+
var VERSION = "0.10.21";
|
|
851
851
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
852
852
|
function machineId() {
|
|
853
853
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pentatonic-ai/ai-agent-sdk",
|
|
3
|
-
"version": "0.10.19",
|
|
3
|
+
"version": "0.10.21",
|
|
4
4
|
"description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
package/packages/memory-engine-v2/RFC-decay-and-fusion.md
CHANGED
|
@@ -1,10 +1,18 @@
|
|
|
1
1
|
# RFC: the Fusion Drive — v2 memory self-healing (cross-run node fusion + decay)
|
|
2
2
|
|
|
3
3
|
> **Fusion Drive** = the continuous, arena-scoped background engine that keeps the v2
|
|
4
|
-
> memory graph self-healing
|
|
5
|
-
>
|
|
6
|
-
>
|
|
7
|
-
>
|
|
4
|
+
> memory graph self-healing. It triages every node into one of **three** outcomes:
|
|
5
|
+
> it *fuses* duplicate/near-duplicate nodes from different distillation runs into a single
|
|
6
|
+
> master node (horizontal convergence); it *re-distills* high-value extractions produced by
|
|
7
|
+
> a superseded teacher/prompt — regenerating them from the still-present source event through
|
|
8
|
+
> the current clean teacher (depth refresh); and it *decays* stale, low-value, and junk nodes
|
|
9
|
+
> out of existence (vertical aging). Named for the drive that does the fusing — the re-distill
|
|
10
|
+
> and decay passes ride the same engine.
|
|
11
|
+
>
|
|
12
|
+
> *(Revised 2026-06-22: added Part B′ — re-distillation — as the third triage verb, with the
|
|
13
|
+
> prompt-version-drift trigger. Motivated by the clean-prompt deploy (SDK 0.10.19, #126 +
|
|
14
|
+
> #129) which made "the current teacher is materially better than the one that produced most
|
|
15
|
+
> of the graph" concrete and measurable via `system_prompt_hash`.)*
|
|
8
16
|
|
|
9
17
|
**Status:** draft / spec — 2026-06-12
|
|
10
18
|
**Builds on:** `RFC-entity-reconciliation.md`, `scripts/entity_resolution_v2.py` (#82),
|
|
@@ -139,15 +147,101 @@ sparse backfill.
|
|
|
139
147
|
|
|
140
148
|
---
|
|
141
149
|
|
|
150
|
+
## Part B′ — Re-distillation: regenerate stale-prompt extractions from source
|
|
151
|
+
|
|
152
|
+
Fusion (A) needs a *correct counterpart* to converge toward; Decay (B) just *deletes*. But
|
|
153
|
+
the common case after a teacher/prompt upgrade is a **high-value node with no correct
|
|
154
|
+
counterpart yet** — the only extraction that exists is the stale-prompt one. Fusion has
|
|
155
|
+
nothing to fuse to; decay would throw away real information. The cure is the third verb: the
|
|
156
|
+
**source event still exists** (`events` table, 376k rows live), so regenerate the extraction
|
|
157
|
+
by re-running that event through the *current clean teacher*. Fusion converges horizontally,
|
|
158
|
+
decay ages vertically; re-distill refreshes **in depth**.
|
|
159
|
+
|
|
160
|
+
### B′1. Trigger — prompt-version drift, not raw age
|
|
161
|
+
The defect population is *exactly* the facts/entities whose provenance traces an **old
|
|
162
|
+
`system_prompt_hash`** — `bbdaba6b…` / `f1e0ff55…` / `ef0647c7…` (pre-clean), vs the clean
|
|
163
|
+
`6ccfe70f…` deployed with 0.10.19 (#126 modality/attribution + #129 email-discipline &
|
|
164
|
+
entity-separation). #118 propagated source onto facts, so provenance → the event's
|
|
165
|
+
`distillation_traces.system_prompt_hash` is queryable. **Age is a weak proxy; prompt-version
|
|
166
|
+
selects the defect set directly** — a months-old node the clean teacher would extract
|
|
167
|
+
identically needs nothing; a two-day-old node from the dirty prompt is a defect. Prioritize
|
|
168
|
+
by `salience` (B1) so high-value stale nodes go first.
|
|
169
|
+
|
|
170
|
+
### B′2. Triage routing — 3-way, by salience × prompt-version
|
|
171
|
+
Per assessed node/event:
|
|
172
|
+
|
|
173
|
+
| condition | outcome |
|
|
174
|
+
|---|---|
|
|
175
|
+
| stale prompt-hash **+** high salience **+** source event present | **re-distill** (this part) |
|
|
176
|
+
| has a correct newer-teacher counterpart in the arena | **fuse** (Part A) |
|
|
177
|
+
| low salience, junk-born (B2), no corroboration, never accessed | **decay** (Part B) |
|
|
178
|
+
|
|
179
|
+
### B′3. Mechanism — re-enqueue, don't mutate in place
|
|
180
|
+
Re-distill = re-insert the source `event_id` into `distillation_queue` (`status='pending'`,
|
|
181
|
+
`attempts=0`). The existing **extractor-async** worker claims it, runs the clean teacher,
|
|
182
|
+
writes the new extraction **and a fresh `6ccfe70f` trace**. No new pipeline — it reuses the
|
|
183
|
+
distiller, the combined-demand **autoscaler**, and the trace ledger. (Re-distill is a
|
|
184
|
+
*producer* of queue demand; the autoscaler's student-aware floor already keeps a teacher box
|
|
185
|
+
warm for it — see the deploy notes.)
|
|
186
|
+
|
|
187
|
+
### B′4. Supersedence — the load-bearing requirement
|
|
188
|
+
The store is **pure-accretion** (the whole motivation of this RFC). A naive re-enqueue makes
|
|
189
|
+
the clean extraction land **beside** the dirty one → it *worsens* fragmentation. So
|
|
190
|
+
re-distill MUST close the loop through Fusion's tombstone machinery — it is **sequenced into
|
|
191
|
+
the Fusion Drive, not bolted on**:
|
|
192
|
+
|
|
193
|
+
1. Each re-distill is recorded in a `redistill_runs` ledger with its triggering
|
|
194
|
+
`(event_id, old_prompt_hash)`.
|
|
195
|
+
2. When the clean extraction completes, **Fusion converges old ↔ new for that event** using
|
|
196
|
+
the teacher-version master signal (A2/A3): the new `6ccfe70f` extraction wins as master;
|
|
197
|
+
the old extraction's now-orphaned nodes (those whose **only** provenance was this event
|
|
198
|
+
under the old hash) are tombstoned/repointed via `entity_merges` / `fact_merges`.
|
|
199
|
+
3. Where an old node carries **other live provenance** (multi-event corroboration), only this
|
|
200
|
+
event's contribution is repointed — **never blind-delete a multi-source node** (the
|
|
201
|
+
over-merge failure mode: a hotel email wrongly attached to a person must not let one
|
|
202
|
+
event's repoint nuke an otherwise-corroborated node).
|
|
203
|
+
|
|
204
|
+
This dependency is hard: **re-distill is unsafe until Fusion's cross-run / teacher-version
|
|
205
|
+
master selection (E3) is live.** Until then a re-distill loop accretes. An interim cheaper
|
|
206
|
+
option (Open Q): explicit **event-scoped supersede** — delete only the facts/entities whose
|
|
207
|
+
provenance set is exactly `{this event}` under the old hash before re-enqueue — covers the
|
|
208
|
+
single-provenance majority without the full fusion adjudicator.
|
|
209
|
+
|
|
210
|
+
### B′5. Corpus-as-byproduct — one loop, three wins
|
|
211
|
+
Every re-distill emits a clean `6ccfe70f` `distillation_trace`. A prompt-version-drift
|
|
212
|
+
re-distill loop therefore **builds the student retrain corpus while it repairs the graph**
|
|
213
|
+
(`scripts/build_retrain_corpus.py` consumes those traces). It subsumes the one-shot full
|
|
214
|
+
re-distill: gradual, rate-limited, no nuke — graph repair **+** corpus **+** self-healing
|
|
215
|
+
from a single engine. This is the durable answer to "is the corpus building?": it is, as a
|
|
216
|
+
side effect of the gardener.
|
|
217
|
+
|
|
218
|
+
### B′6. Cadence + cost + safety
|
|
219
|
+
Rolling, rate-limited, autoscaler-aware, off-peak. Budget *N* events/hour against teacher
|
|
220
|
+
capacity; order by `salience × staleness`. **Never big-bang the full backlog** — gradual
|
|
221
|
+
migration is the point. Arena-scoped, dry-run → `--apply`, `redistill_runs` ledger for
|
|
222
|
+
observability and rollback. Same operational shape as fusion/decay/autoscaler.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
142
226
|
## Part C — Ordering & how they combine
|
|
143
227
|
|
|
144
|
-
Per arena, on schedule: **(1)
|
|
145
|
-
|
|
146
|
-
|
|
228
|
+
Per arena, on schedule: **(1) triage → re-distill the high-value stale-prompt set (async via
|
|
229
|
+
the queue) → (2) fusion → (3) decay.** Re-distill is enqueued first so that by the time
|
|
230
|
+
fusion runs, the clean counterpart exists for it to crown as master (else fusion has only
|
|
231
|
+
stale renderings to choose between). Fusion then absorbs each master's duplicates'
|
|
232
|
+
provenance/salience *before* decay judges it (else a real node split across two weak dupes
|
|
233
|
+
could wrongly decay out). Then decay ages + evicts the survivors.
|
|
234
|
+
|
|
235
|
+
*(Re-distill is asynchronous — it completes on the teacher's schedule — so in practice a
|
|
236
|
+
node re-distilled in this pass is fused/decayed in the **next** per-arena pass, once its
|
|
237
|
+
clean trace + extraction have landed. The ledger links the two.)*
|
|
147
238
|
|
|
148
239
|
**This is what finally cures immortal pollution:**
|
|
149
240
|
- 7B polluted node *with* a correct Qwen3.6 counterpart → **fused**, correct one as master,
|
|
150
241
|
polluted demoted to alias / tombstoned.
|
|
242
|
+
- stale-prompt node, *high-value*, *no* correct counterpart, source event present →
|
|
243
|
+
**re-distilled** through the clean teacher → new master extraction; old superseded via
|
|
244
|
+
fusion (B′4). The information is *recovered*, not lost.
|
|
151
245
|
- 7B pure-junk node with *no* correct counterpart (numeric-ID-person, ungrounded) → born-low
|
|
152
246
|
salience + no corroboration + never accessed → **decays out and is evicted**.
|
|
153
247
|
|
|
@@ -165,8 +259,15 @@ reset, but no longer the *only* path).
|
|
|
165
259
|
- `relationships`: `+ salience REAL`, `+ last_accessed` (already has `weight`,
|
|
166
260
|
`first/last_seen`).
|
|
167
261
|
- new `fact_merges` audit (mirror `entity_merges` incl. `rollback_payload`).
|
|
168
|
-
- new `fusion_runs` + `decay_runs` ledgers for observability.
|
|
262
|
+
- new `fusion_runs` + `decay_runs` + `redistill_runs` ledgers for observability. `redistill_runs`:
|
|
263
|
+
`(id, arena, event_id, old_prompt_hash, new_prompt_hash, salience_at_trigger, enqueued_at,
|
|
264
|
+
completed_at, fused_at, mode)` — links a re-distill to its triggering node and to the fusion
|
|
265
|
+
that superseded the old extraction.
|
|
169
266
|
- `/search` gains a `last_accessed = NOW()` bump on returned nodes (batched).
|
|
267
|
+
- re-distill trigger needs provenance → prompt-version: either denormalize `system_prompt_hash`
|
|
268
|
+
onto `facts`/`entities` at write time (cheap filter), or join through
|
|
269
|
+
`distillation_traces(event_id → system_prompt_hash)` on the provenance event ids (no schema
|
|
270
|
+
change, costlier query). Prefer the join until the trigger volume justifies denormalizing.
|
|
170
271
|
|
|
171
272
|
## Part E — Rollout (each flag-gated, arena-scoped, dry-run-first, audited)
|
|
172
273
|
|
|
@@ -176,6 +277,13 @@ reset, but no longer the *only* path).
|
|
|
176
277
|
3. **Fusion extension** — scored canonical selection (fix typo-crowning) + cross-run
|
|
177
278
|
detection + fact fusion, dry-run → apply.
|
|
178
279
|
4. **Online/continuous** — wire fusion+decay to run after distillation per arena.
|
|
280
|
+
5. **Re-distill loop (Part B′)** — dry-run triage first (count stale-prompt nodes by
|
|
281
|
+
`system_prompt_hash` × salience bucket to size the work), then a **bounded `--apply` slice**
|
|
282
|
+
on one curated arena (re-enqueue + verify clean trace + verify fusion supersedes the old
|
|
283
|
+
extraction), then wire continuous. **Gated on step 3** (Fusion cross-run / teacher-version
|
|
284
|
+
master selection): until that's live, re-distill must use the interim **event-scoped
|
|
285
|
+
supersede** (B′4) or it accretes. Ships as `scripts/redistill.py` (dry-run default,
|
|
286
|
+
`--apply` gate, arena-scoped, `redistill_runs` ledger).
|
|
179
287
|
|
|
180
288
|
## Open questions
|
|
181
289
|
- Half-life constants per category — needs a calibration pass against real arenas.
|
|
@@ -183,3 +291,9 @@ reset, but no longer the *only* path).
|
|
|
183
291
|
- Directory authority source for canonical anchoring — HubSpot contacts? a curated table?
|
|
184
292
|
- Interaction with the (still-open) source_id supersede mode — fusion partly subsumes it,
|
|
185
293
|
but explicit supersede is cheaper for known-mutable sources.
|
|
294
|
+
- **Re-distill supersedence before full fusion is live** — is event-scoped supersede (delete
|
|
295
|
+
only nodes whose provenance set is exactly `{this event}` under the old hash) a safe enough
|
|
296
|
+
interim, or do we hard-gate the loop on E3? Single-provenance nodes are the majority, but
|
|
297
|
+
the multi-provenance tail is where the over-merge risk concentrates.
|
|
298
|
+
- **Re-distill prioritization** — pure `salience × staleness`, or weight toward the entities
|
|
299
|
+
behind known user-visible confabulations (Vickers/Boedecker) first?
|
package/packages/memory-engine-v2/compat/server.py
CHANGED
|
@@ -896,6 +896,8 @@ class GraphQueryRequest(BaseModel):
|
|
|
896
896
|
entity_type: str | None = None
|
|
897
897
|
name: str | None = None # canonical_name (ILIKE)
|
|
898
898
|
subject: str | None = None # entity name OR canonical_name (facts.subject_entity)
|
|
899
|
+
subject_entity_id: str | None = None # EXACT facts.subject_entity_id — strict, no name bleed
|
|
900
|
+
object_entity_id: str | None = None # EXACT facts.object_entity_id
|
|
899
901
|
predicate: str | None = None
|
|
900
902
|
category: str | None = None # facts.category
|
|
901
903
|
from_name: str | None = None # relationships.from_entity.canonical_name
|
|
@@ -911,6 +913,34 @@ def _resolve_arenas(req: GraphQueryRequest) -> list[str]:
|
|
|
911
913
|
return arenas
|
|
912
914
|
|
|
913
915
|
|
|
916
|
+
# Decay access signal (RFC-decay-and-fusion Part B1): the Fusion Drive decay pass
|
|
917
|
+
# ages salience by the most recent of (last_accessed, last_seen/asserted_at), so
|
|
918
|
+
# without an access bump a frequently-retrieved memory still decays and can be
|
|
919
|
+
# evicted. Bump last_accessed on the nodes a read returns so retrieval keeps them
|
|
920
|
+
# alive. THROTTLED to once / _ACCESS_BUMP_THROTTLE per node (the UPDATE no-ops for
|
|
921
|
+
# nodes touched recently) to bound write amplification on these hot read paths,
|
|
922
|
+
# and BEST-EFFORT — a bump failure must never fail the read. `table` is always a
|
|
923
|
+
# trusted literal (never user input).
|
|
924
|
+
_ACCESS_BUMP_THROTTLE = "6 hours"
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
async def _bump_last_accessed(conn, cur, table: str, ids: list[str]) -> None:
|
|
928
|
+
ids = [i for i in ids if i]
|
|
929
|
+
if not ids:
|
|
930
|
+
return
|
|
931
|
+
try:
|
|
932
|
+
await cur.execute(
|
|
933
|
+
f"UPDATE {table} SET last_accessed = NOW() "
|
|
934
|
+
f"WHERE id = ANY(%s) AND (last_accessed IS NULL "
|
|
935
|
+
f"OR last_accessed < NOW() - interval '{_ACCESS_BUMP_THROTTLE}')",
|
|
936
|
+
(ids,),
|
|
937
|
+
)
|
|
938
|
+
await conn.commit()
|
|
939
|
+
except Exception as e: # noqa: BLE001 — never let the access bump break a read
|
|
940
|
+
await conn.rollback()
|
|
941
|
+
log.warning("last_accessed bump failed on %s: %s", table, e)
|
|
942
|
+
|
|
943
|
+
|
|
914
944
|
@app.post("/entities")
|
|
915
945
|
async def list_entities(req: GraphQueryRequest):
|
|
916
946
|
"""Filter entities by arena + optional type + optional name pattern.
|
|
@@ -928,10 +958,10 @@ async def list_entities(req: GraphQueryRequest):
|
|
|
928
958
|
params.extend([pattern, pattern])
|
|
929
959
|
sql = f"""
|
|
930
960
|
SELECT id, arena, entity_type, canonical_name, aliases,
|
|
931
|
-
provenance_event_ids, attributes, last_seen
|
|
961
|
+
provenance_event_ids, attributes, salience, last_seen
|
|
932
962
|
FROM entities
|
|
933
963
|
WHERE {' AND '.join(conditions)}
|
|
934
|
-
ORDER BY last_seen DESC
|
|
964
|
+
ORDER BY salience DESC, last_seen DESC
|
|
935
965
|
LIMIT %s
|
|
936
966
|
"""
|
|
937
967
|
params.append(req.limit)
|
|
@@ -939,14 +969,20 @@ async def list_entities(req: GraphQueryRequest):
|
|
|
939
969
|
async with conn.cursor() as cur:
|
|
940
970
|
await cur.execute(sql, params)
|
|
941
971
|
rows = await cur.fetchall()
|
|
972
|
+
await _bump_last_accessed(conn, cur, "entities", [r["id"] for r in rows])
|
|
942
973
|
return {"results": [dict(r) for r in rows]}
|
|
943
974
|
|
|
944
975
|
|
|
945
976
|
@app.post("/facts")
|
|
946
977
|
async def list_facts(req: GraphQueryRequest):
|
|
947
|
-
"""Filter facts by arena + optional category/predicate +
|
|
948
|
-
|
|
949
|
-
subject_entity_id
|
|
978
|
+
"""Filter facts by arena + optional category/predicate + subject.
|
|
979
|
+
|
|
980
|
+
PREFER `subject_entity_id` (exact id match) over `subject` (name ILIKE):
|
|
981
|
+
name matching bleeds one person's facts into another's answer when names
|
|
982
|
+
collide or fragment (the Will Vickers ⟵ Will Spencer confabulation — a
|
|
983
|
+
query resolved to one entity must NOT pull a same/similar-named entity's
|
|
984
|
+
facts). The name path is kept for back-compat callers that haven't resolved
|
|
985
|
+
an id yet, but entity-id is the strict, bleed-free path."""
|
|
950
986
|
arenas = _resolve_arenas(req)
|
|
951
987
|
conditions = ["f.arena = ANY(%s)"]
|
|
952
988
|
params: list[Any] = [arenas]
|
|
@@ -956,17 +992,24 @@ async def list_facts(req: GraphQueryRequest):
|
|
|
956
992
|
if req.predicate:
|
|
957
993
|
conditions.append("f.predicate ILIKE %s")
|
|
958
994
|
params.append(f"%{req.predicate}%")
|
|
959
|
-
if req.subject:
|
|
995
|
+
if req.subject_entity_id:
|
|
996
|
+
conditions.append("f.subject_entity_id = %s")
|
|
997
|
+
params.append(req.subject_entity_id)
|
|
998
|
+
if req.object_entity_id:
|
|
999
|
+
conditions.append("f.object_entity_id = %s")
|
|
1000
|
+
params.append(req.object_entity_id)
|
|
1001
|
+
# Name path: only when no exact id was given (back-compat / unresolved callers).
|
|
1002
|
+
if req.subject and not req.subject_entity_id:
|
|
960
1003
|
conditions.append("EXISTS (SELECT 1 FROM entities e WHERE e.id = f.subject_entity_id AND (e.canonical_name ILIKE %s OR %s = ANY(e.aliases)))")
|
|
961
1004
|
params.extend([f"%{req.subject}%", req.subject])
|
|
962
1005
|
sql = f"""
|
|
963
1006
|
SELECT f.id, f.arena, f.category, f.predicate, f.statement,
|
|
964
1007
|
f.subject_entity_id, f.object_entity_id,
|
|
965
1008
|
f.confidence, f.stage, f.asserted_at,
|
|
966
|
-
f.provenance_event_ids
|
|
1009
|
+
f.salience, f.provenance_event_ids
|
|
967
1010
|
FROM facts f
|
|
968
1011
|
WHERE {' AND '.join(conditions)}
|
|
969
|
-
ORDER BY f.asserted_at DESC
|
|
1012
|
+
ORDER BY f.salience DESC, f.asserted_at DESC
|
|
970
1013
|
LIMIT %s
|
|
971
1014
|
"""
|
|
972
1015
|
params.append(req.limit)
|
|
@@ -974,6 +1017,7 @@ async def list_facts(req: GraphQueryRequest):
|
|
|
974
1017
|
async with conn.cursor() as cur:
|
|
975
1018
|
await cur.execute(sql, params)
|
|
976
1019
|
rows = await cur.fetchall()
|
|
1020
|
+
await _bump_last_accessed(conn, cur, "facts", [r["id"] for r in rows])
|
|
977
1021
|
return {"results": [dict(r) for r in rows]}
|
|
978
1022
|
|
|
979
1023
|
|
|
@@ -999,13 +1043,13 @@ async def list_relationships(req: GraphQueryRequest):
|
|
|
999
1043
|
r.from_entity_id, r.to_entity_id,
|
|
1000
1044
|
ef.canonical_name AS from_name,
|
|
1001
1045
|
et.canonical_name AS to_name,
|
|
1002
|
-
r.first_seen, r.last_seen,
|
|
1046
|
+
r.first_seen, r.last_seen, r.salience,
|
|
1003
1047
|
r.provenance_event_ids
|
|
1004
1048
|
FROM relationships r
|
|
1005
1049
|
JOIN entities ef ON ef.id = r.from_entity_id
|
|
1006
1050
|
JOIN entities et ON et.id = r.to_entity_id
|
|
1007
1051
|
WHERE {' AND '.join(conditions)}
|
|
1008
|
-
ORDER BY r.last_seen DESC
|
|
1052
|
+
ORDER BY r.salience DESC, r.last_seen DESC
|
|
1009
1053
|
LIMIT %s
|
|
1010
1054
|
"""
|
|
1011
1055
|
params.append(req.limit)
|
|
@@ -1013,6 +1057,7 @@ async def list_relationships(req: GraphQueryRequest):
|
|
|
1013
1057
|
async with conn.cursor() as cur:
|
|
1014
1058
|
await cur.execute(sql, params)
|
|
1015
1059
|
rows = await cur.fetchall()
|
|
1060
|
+
await _bump_last_accessed(conn, cur, "relationships", [r["id"] for r in rows])
|
|
1016
1061
|
return {"results": [dict(r) for r in rows]}
|
|
1017
1062
|
|
|
1018
1063
|
|
package/packages/memory-engine-v2/extractor-async/test_email_alias_guard.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Unit tests for the email-alias guard (_email_plausibly_belongs).
|
|
2
|
+
|
|
3
|
+
Pins the live pollution case (the "Johann Boedecker" node, 2026-06-22): keep the
|
|
4
|
+
person's own addresses; drop the bystander emails (a hotel, newsletters, unrelated
|
|
5
|
+
gmails) the LLM stapled on from co-occurring documents.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import importlib.util
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
_THIS = Path(__file__).resolve().parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load(name="extractor_async_worker_aliasguard"):
|
|
20
|
+
spec = importlib.util.spec_from_file_location(name, _THIS / "worker.py")
|
|
21
|
+
mod = importlib.util.module_from_spec(spec)
|
|
22
|
+
sys.modules[name] = mod
|
|
23
|
+
spec.loader.exec_module(mod)
|
|
24
|
+
return mod
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
worker = _load()
|
|
29
|
+
except ImportError as e:
|
|
30
|
+
pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)
|
|
31
|
+
|
|
32
|
+
belongs = lambda n, e: worker._email_plausibly_belongs(n, e)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ── KEEP: the person's own addresses ─────────────────────────────────────
|
|
36
|
+
@pytest.mark.parametrize("email", [
|
|
37
|
+
"johann@pentatonic.com",
|
|
38
|
+
"johann.boedecker@pentatonic.com",
|
|
39
|
+
"boedeckerjohann@gmail.com",
|
|
40
|
+
"JOHANN@pentatonic.com", # case-insensitive
|
|
41
|
+
"jb@pentatonic.com", # initials
|
|
42
|
+
"j.boedecker@pentatonic.com", # surname token
|
|
43
|
+
])
|
|
44
|
+
def test_keeps_owner_emails(email):
|
|
45
|
+
assert belongs("Johann Boedecker", email) is True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ── DROP: the actual bystander emails found on the live Johann node ──────
|
|
49
|
+
@pytest.mark.parametrize("email", [
|
|
50
|
+
"reservations.nyc@acehotel.com",
|
|
51
|
+
"marketingadmin@sustainablebrands.com",
|
|
52
|
+
"martinvasquez87@gmail.com",
|
|
53
|
+
"schwaabd@yahoo.de",
|
|
54
|
+
"cvanderlip@redish.com",
|
|
55
|
+
"leechihshan33@gmail.com",
|
|
56
|
+
])
|
|
57
|
+
def test_drops_bystander_emails(email):
|
|
58
|
+
assert belongs("Johann Boedecker", email) is False
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ── edges ────────────────────────────────────────────────────────────────
|
|
62
|
+
def test_initials_either_order():
|
|
63
|
+
assert belongs("Johann Boedecker", "bj@pentatonic.com") is True # reversed initials
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_no_usable_name_does_not_overfilter():
|
|
67
|
+
# a bare/empty name has nothing to check against → keep (don't strip)
|
|
68
|
+
assert belongs("", "anything@x.com") is True
|
|
69
|
+
assert belongs("J", "anything@x.com") is True # single letter < 2 → no tokens
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_surname_only_person_keeps_surname_email():
|
|
73
|
+
assert belongs("Vickers", "will.vickers@vickers-oil.com") is True
|
|
74
|
+
assert belongs("Vickers", "reservations.nyc@acehotel.com") is False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_guard_flag_default_on():
|
|
78
|
+
assert worker.EMAIL_ALIAS_GUARD is True
|
package/packages/memory-engine-v2/extractor-async/worker.py
CHANGED
|
@@ -1253,6 +1253,43 @@ def org_node_id_key(entity_type: str, name: str, stamped_domain: str | None) ->
|
|
|
1253
1253
|
return name
|
|
1254
1254
|
|
|
1255
1255
|
|
|
1256
|
+
# --------------------------------------------------------------------
|
|
1257
|
+
# Email-alias guard — stop bystander emails polluting a person
|
|
1258
|
+
# --------------------------------------------------------------------
|
|
1259
|
+
# The async LLM pass sometimes emits a PERSON entity whose `email` is a BYSTANDER
|
|
1260
|
+
# address co-occurring in the same doc/thread (a hotel booking, a newsletter, an
|
|
1261
|
+
# unrelated gmail). _parse_guided_json promotes it into the entity's aliases and
|
|
1262
|
+
# upsert_entities then stores + RESOLVES on it — folding strangers' identities
|
|
1263
|
+
# (and their facts) onto the person. Measured live (pentatonic-team): a "Johann
|
|
1264
|
+
# Boedecker" node carrying reservations.nyc@acehotel.com + unrelated gmails, all
|
|
1265
|
+
# from STUDENT-distilled `doc` events. This guard keeps an email alias on a person
|
|
1266
|
+
# only when its local-part plausibly relates to the person's name; clear bystanders
|
|
1267
|
+
# are dropped BEFORE resolution/storage. Conservative: dropping a genuine but
|
|
1268
|
+
# non-name-matching alias is a mild loss; keeping a bystander is a confabulation
|
|
1269
|
+
# source. Flag-revertible (EMAIL_ALIAS_GUARD=false). Persons only — org domain
|
|
1270
|
+
# stamping is untouched.
|
|
1271
|
+
EMAIL_ALIAS_GUARD = _envflag("EMAIL_ALIAS_GUARD", "true")
|
|
1272
|
+
_ALIAS_NONALPHA = re.compile(r"[^a-z]")
|
|
1273
|
+
_ALIAS_SPLIT = re.compile(r"[^a-z]+")
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
def _email_plausibly_belongs(person_name: str, email: str) -> bool:
|
|
1277
|
+
"""True ⇒ keep this email as an alias of `person_name`; False ⇒ drop (clear
|
|
1278
|
+
bystander). Match = a name token appears in the local-part, OR the local-part
|
|
1279
|
+
is the person's initials. Pure + deterministic."""
|
|
1280
|
+
local = email.split("@", 1)[0].lower()
|
|
1281
|
+
local_letters = _ALIAS_NONALPHA.sub("", local)
|
|
1282
|
+
name_tokens = {t for t in _ALIAS_SPLIT.split(person_name.lower()) if len(t) >= 2}
|
|
1283
|
+
if not name_tokens or not local_letters:
|
|
1284
|
+
return True # nothing to check against — don't over-filter
|
|
1285
|
+
if any(nt in local_letters for nt in name_tokens):
|
|
1286
|
+
return True # johann@…, johann.boedecker@…, boedeckerjohann@…
|
|
1287
|
+
initials = "".join(t[0] for t in person_name.lower().split() if t[:1].isalpha())
|
|
1288
|
+
if len(initials) >= 2 and local_letters in (initials, initials[::-1]):
|
|
1289
|
+
return True # jb@… / bj@… for "Johann Boedecker"
|
|
1290
|
+
return False
|
|
1291
|
+
|
|
1292
|
+
|
|
1256
1293
|
def upsert_entities(
|
|
1257
1294
|
conn: psycopg.Connection,
|
|
1258
1295
|
arena: str,
|
|
@@ -1345,6 +1382,21 @@ def upsert_entities(
|
|
|
1345
1382
|
continue
|
|
1346
1383
|
aliases = [a for a in (e.get("aliases") or []) if a]
|
|
1347
1384
|
|
|
1385
|
+
# Email-alias guard (persons only): drop bystander emails the LLM
|
|
1386
|
+
# stapled on from a co-occurring doc/thread, BEFORE they reach
|
|
1387
|
+
# resolution or storage. See _email_plausibly_belongs.
|
|
1388
|
+
if EMAIL_ALIAS_GUARD and etype == "person" and aliases:
|
|
1389
|
+
kept = []
|
|
1390
|
+
for a in aliases:
|
|
1391
|
+
if "@" in a and " " not in a and not _email_plausibly_belongs(name, a):
|
|
1392
|
+
log.info(
|
|
1393
|
+
f"alias-guard: dropped bystander email {a!r} from "
|
|
1394
|
+
f"person {name!r} (arena={arena})"
|
|
1395
|
+
)
|
|
1396
|
+
continue
|
|
1397
|
+
kept.append(a)
|
|
1398
|
+
aliases = kept
|
|
1399
|
+
|
|
1348
1400
|
# Hard-key stamps for THIS entity, merged onto the node's attributes
|
|
1349
1401
|
# and (for domain) into the resolution aliases. Adding domain to
|
|
1350
1402
|
# aliases before forms are computed is deliberate — that's what makes
|